|
// -----// IR Dump After AssignTargetDevicesPass (iree-hal-assign-target-devices) //----- // |
|
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "znver4", cpu_features = "+mmx,+popcnt,+sse,+sse2,+sse3,+ssse3,+sse4.1,+sse4.2,+avx,+avx2,+sse4a,+fma,+avx512f,+bmi,+bmi2,+aes,+pclmul,+avx512vl,+avx512bw,+avx512dq,+avx512cd,+avx512vbmi,+avx512ifma,+avx512vpopcntdq,+avx512vbmi2,+gfni,+vpclmulqdq,+avx512vnni,+avx512bitalg,+avx512bf16,+adx,+clflushopt,+clwb,+clzero,+cx16,+cx8,+f16c,+fsgsbase,+crc32,+invpcid,+rdpru,+sahf,+lzcnt,+movbe,+mwaitx,+x87,+pku,+prfchw,+rdpid,+rdrnd,+rdseed,+sha,+shstk,+vaes,+wbnoinvd,+xsave,+xsavec,+xsaveopt,+xsaves,+fxsr,+evex512", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "all"}> |
|
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#executable_target_embedded_elf_x86_64_]}> |
|
module attributes {hal.device.targets = [#device_target_llvm_cpu]} { |
|
func.func @matmul_dynamic(%arg0: tensor<?x?xf32>, %arg1: tensor<?x?xf32>, %arg2: tensor<?x?xf32>) -> tensor<?x?xf32> { |
|
%0 = linalg.matmul ins(%arg0, %arg1 : tensor<?x?xf32>, tensor<?x?xf32>) outs(%arg2 : tensor<?x?xf32>) -> tensor<?x?xf32> |
|
return %0 : tensor<?x?xf32> |
|
} |
|
} |
|
|
|
|
|
// -----// IR Dump After AutoInputConversionPipeline (iree-auto-input-conversion) //----- // |
|
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "znver4", cpu_features = "+mmx,+popcnt,+sse,+sse2,+sse3,+ssse3,+sse4.1,+sse4.2,+avx,+avx2,+sse4a,+fma,+avx512f,+bmi,+bmi2,+aes,+pclmul,+avx512vl,+avx512bw,+avx512dq,+avx512cd,+avx512vbmi,+avx512ifma,+avx512vpopcntdq,+avx512vbmi2,+gfni,+vpclmulqdq,+avx512vnni,+avx512bitalg,+avx512bf16,+adx,+clflushopt,+clwb,+clzero,+cx16,+cx8,+f16c,+fsgsbase,+crc32,+invpcid,+rdpru,+sahf,+lzcnt,+movbe,+mwaitx,+x87,+pku,+prfchw,+rdpid,+rdrnd,+rdseed,+sha,+shstk,+vaes,+wbnoinvd,+xsave,+xsavec,+xsaveopt,+xsaves,+fxsr,+evex512", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "all"}> |
|
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#executable_target_embedded_elf_x86_64_]}> |
|
module attributes {hal.device.targets = [#device_target_llvm_cpu]} { |
|
func.func @matmul_dynamic(%arg0: tensor<?x?xf32>, %arg1: tensor<?x?xf32>, %arg2: tensor<?x?xf32>) -> tensor<?x?xf32> { |
|
%0 = linalg.matmul ins(%arg0, %arg1 : tensor<?x?xf32>, tensor<?x?xf32>) outs(%arg2 : tensor<?x?xf32>) -> tensor<?x?xf32> |
|
return %0 : tensor<?x?xf32> |
|
} |
|
} |
|
|
|
|
|
// -----// IR Dump After IREEImportPublic (iree-import-public) //----- // |
|
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "znver4", cpu_features = "+mmx,+popcnt,+sse,+sse2,+sse3,+ssse3,+sse4.1,+sse4.2,+avx,+avx2,+sse4a,+fma,+avx512f,+bmi,+bmi2,+aes,+pclmul,+avx512vl,+avx512bw,+avx512dq,+avx512cd,+avx512vbmi,+avx512ifma,+avx512vpopcntdq,+avx512vbmi2,+gfni,+vpclmulqdq,+avx512vnni,+avx512bitalg,+avx512bf16,+adx,+clflushopt,+clwb,+clzero,+cx16,+cx8,+f16c,+fsgsbase,+crc32,+invpcid,+rdpru,+sahf,+lzcnt,+movbe,+mwaitx,+x87,+pku,+prfchw,+rdpid,+rdrnd,+rdseed,+sha,+shstk,+vaes,+wbnoinvd,+xsave,+xsavec,+xsaveopt,+xsaves,+fxsr,+evex512", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "all"}> |
|
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#executable_target_embedded_elf_x86_64_]}> |
|
module attributes {hal.device.targets = [#device_target_llvm_cpu]} { |
|
func.func @matmul_dynamic(%arg0: tensor<?x?xf32>, %arg1: tensor<?x?xf32>, %arg2: tensor<?x?xf32>) -> tensor<?x?xf32> { |
|
%0 = linalg.matmul ins(%arg0, %arg1 : tensor<?x?xf32>, tensor<?x?xf32>) outs(%arg2 : tensor<?x?xf32>) -> tensor<?x?xf32> |
|
return %0 : tensor<?x?xf32> |
|
} |
|
} |
|
|
|
|
|
// -----// IR Dump After ImportMLProgram (iree-import-ml-program) //----- // |
|
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "znver4", cpu_features = "+mmx,+popcnt,+sse,+sse2,+sse3,+ssse3,+sse4.1,+sse4.2,+avx,+avx2,+sse4a,+fma,+avx512f,+bmi,+bmi2,+aes,+pclmul,+avx512vl,+avx512bw,+avx512dq,+avx512cd,+avx512vbmi,+avx512ifma,+avx512vpopcntdq,+avx512vbmi2,+gfni,+vpclmulqdq,+avx512vnni,+avx512bitalg,+avx512bf16,+adx,+clflushopt,+clwb,+clzero,+cx16,+cx8,+f16c,+fsgsbase,+crc32,+invpcid,+rdpru,+sahf,+lzcnt,+movbe,+mwaitx,+x87,+pku,+prfchw,+rdpid,+rdrnd,+rdseed,+sha,+shstk,+vaes,+wbnoinvd,+xsave,+xsavec,+xsaveopt,+xsaves,+fxsr,+evex512", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "all"}> |
|
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#executable_target_embedded_elf_x86_64_]}> |
|
module attributes {hal.device.targets = [#device_target_llvm_cpu]} { |
|
func.func @matmul_dynamic(%arg0: tensor<?x?xf32>, %arg1: tensor<?x?xf32>, %arg2: tensor<?x?xf32>) -> tensor<?x?xf32> { |
|
%0 = linalg.matmul ins(%arg0, %arg1 : tensor<?x?xf32>, tensor<?x?xf32>) outs(%arg2 : tensor<?x?xf32>) -> tensor<?x?xf32> |
|
return %0 : tensor<?x?xf32> |
|
} |
|
} |
|
|
|
|
|
// -----// IR Dump After SanitizeModuleNames (iree-sanitize-module-names) //----- // |
|
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "znver4", cpu_features = "+mmx,+popcnt,+sse,+sse2,+sse3,+ssse3,+sse4.1,+sse4.2,+avx,+avx2,+sse4a,+fma,+avx512f,+bmi,+bmi2,+aes,+pclmul,+avx512vl,+avx512bw,+avx512dq,+avx512cd,+avx512vbmi,+avx512ifma,+avx512vpopcntdq,+avx512vbmi2,+gfni,+vpclmulqdq,+avx512vnni,+avx512bitalg,+avx512bf16,+adx,+clflushopt,+clwb,+clzero,+cx16,+cx8,+f16c,+fsgsbase,+crc32,+invpcid,+rdpru,+sahf,+lzcnt,+movbe,+mwaitx,+x87,+pku,+prfchw,+rdpid,+rdrnd,+rdseed,+sha,+shstk,+vaes,+wbnoinvd,+xsave,+xsavec,+xsaveopt,+xsaves,+fxsr,+evex512", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "all"}> |
|
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#executable_target_embedded_elf_x86_64_]}> |
|
module attributes {hal.device.targets = [#device_target_llvm_cpu]} { |
|
func.func @matmul_dynamic(%arg0: tensor<?x?xf32>, %arg1: tensor<?x?xf32>, %arg2: tensor<?x?xf32>) -> tensor<?x?xf32> { |
|
%0 = linalg.matmul ins(%arg0, %arg1 : tensor<?x?xf32>, tensor<?x?xf32>) outs(%arg2 : tensor<?x?xf32>) -> tensor<?x?xf32> |
|
return %0 : tensor<?x?xf32> |
|
} |
|
} |
|
|
|
|
|
// -----// IR Dump After ConvertMeshToFlow (iree-convert-mesh-to-flow) //----- // |
|
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "znver4", cpu_features = "+mmx,+popcnt,+sse,+sse2,+sse3,+ssse3,+sse4.1,+sse4.2,+avx,+avx2,+sse4a,+fma,+avx512f,+bmi,+bmi2,+aes,+pclmul,+avx512vl,+avx512bw,+avx512dq,+avx512cd,+avx512vbmi,+avx512ifma,+avx512vpopcntdq,+avx512vbmi2,+gfni,+vpclmulqdq,+avx512vnni,+avx512bitalg,+avx512bf16,+adx,+clflushopt,+clwb,+clzero,+cx16,+cx8,+f16c,+fsgsbase,+crc32,+invpcid,+rdpru,+sahf,+lzcnt,+movbe,+mwaitx,+x87,+pku,+prfchw,+rdpid,+rdrnd,+rdseed,+sha,+shstk,+vaes,+wbnoinvd,+xsave,+xsavec,+xsaveopt,+xsaves,+fxsr,+evex512", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "all"}> |
|
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#executable_target_embedded_elf_x86_64_]}> |
|
module attributes {hal.device.targets = [#device_target_llvm_cpu]} { |
|
func.func @matmul_dynamic(%arg0: tensor<?x?xf32>, %arg1: tensor<?x?xf32>, %arg2: tensor<?x?xf32>) -> tensor<?x?xf32> { |
|
%0 = linalg.matmul ins(%arg0, %arg1 : tensor<?x?xf32>, tensor<?x?xf32>) outs(%arg2 : tensor<?x?xf32>) -> tensor<?x?xf32> |
|
return %0 : tensor<?x?xf32> |
|
} |
|
} |
|
|
|
|
|
// -----// IR Dump After mlir::iree_compiler::IREE::ABI::ConvertStreamableOpsPass (iree-abi-convert-streamable-ops) //----- // |
|
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "znver4", cpu_features = "+mmx,+popcnt,+sse,+sse2,+sse3,+ssse3,+sse4.1,+sse4.2,+avx,+avx2,+sse4a,+fma,+avx512f,+bmi,+bmi2,+aes,+pclmul,+avx512vl,+avx512bw,+avx512dq,+avx512cd,+avx512vbmi,+avx512ifma,+avx512vpopcntdq,+avx512vbmi2,+gfni,+vpclmulqdq,+avx512vnni,+avx512bitalg,+avx512bf16,+adx,+clflushopt,+clwb,+clzero,+cx16,+cx8,+f16c,+fsgsbase,+crc32,+invpcid,+rdpru,+sahf,+lzcnt,+movbe,+mwaitx,+x87,+pku,+prfchw,+rdpid,+rdrnd,+rdseed,+sha,+shstk,+vaes,+wbnoinvd,+xsave,+xsavec,+xsaveopt,+xsaves,+fxsr,+evex512", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "all"}> |
|
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#executable_target_embedded_elf_x86_64_]}> |
|
module attributes {hal.device.targets = [#device_target_llvm_cpu]} { |
|
func.func @matmul_dynamic(%arg0: tensor<?x?xf32>, %arg1: tensor<?x?xf32>, %arg2: tensor<?x?xf32>) -> tensor<?x?xf32> { |
|
%0 = linalg.matmul ins(%arg0, %arg1 : tensor<?x?xf32>, tensor<?x?xf32>) outs(%arg2 : tensor<?x?xf32>) -> tensor<?x?xf32> |
|
return %0 : tensor<?x?xf32> |
|
} |
|
} |
|
|
|
|
|
// -----// IR Dump After mlir::iree_compiler::IREE::ABI::WrapEntryPointsPass (iree-abi-wrap-entry-points) //----- // |
|
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "znver4", cpu_features = "+mmx,+popcnt,+sse,+sse2,+sse3,+ssse3,+sse4.1,+sse4.2,+avx,+avx2,+sse4a,+fma,+avx512f,+bmi,+bmi2,+aes,+pclmul,+avx512vl,+avx512bw,+avx512dq,+avx512cd,+avx512vbmi,+avx512ifma,+avx512vpopcntdq,+avx512vbmi2,+gfni,+vpclmulqdq,+avx512vnni,+avx512bitalg,+avx512bf16,+adx,+clflushopt,+clwb,+clzero,+cx16,+cx8,+f16c,+fsgsbase,+crc32,+invpcid,+rdpru,+sahf,+lzcnt,+movbe,+mwaitx,+x87,+pku,+prfchw,+rdpid,+rdrnd,+rdseed,+sha,+shstk,+vaes,+wbnoinvd,+xsave,+xsavec,+xsaveopt,+xsaves,+fxsr,+evex512", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "all"}> |
|
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#executable_target_embedded_elf_x86_64_]}> |
|
module attributes {hal.device.targets = [#device_target_llvm_cpu]} { |
|
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} { |
|
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index |
|
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index |
|
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xf32>{%0, %1} |
|
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index |
|
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index |
|
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xf32>{%3, %4} |
|
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index |
|
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index |
|
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xf32>{%6, %7} |
|
%9 = call @_matmul_dynamic(%2, %5, %8) : (tensor<?x?xf32>, tensor<?x?xf32>, tensor<?x?xf32>) -> tensor<?x?xf32> |
|
%c0 = arith.constant 0 : index |
|
%dim = tensor.dim %9, %c0 : tensor<?x?xf32> |
|
%c1 = arith.constant 1 : index |
|
%dim_0 = tensor.dim %9, %c1 : tensor<?x?xf32> |
|
%10 = hal.tensor.export %9 "output0" : tensor<?x?xf32>{%dim, %dim_0} -> !hal.buffer_view |
|
return %10 : !hal.buffer_view |
|
} |
|
func.func private @_matmul_dynamic(%arg0: tensor<?x?xf32>, %arg1: tensor<?x?xf32>, %arg2: tensor<?x?xf32>) -> tensor<?x?xf32> { |
|
%0 = linalg.matmul ins(%arg0, %arg1 : tensor<?x?xf32>, tensor<?x?xf32>) outs(%arg2 : tensor<?x?xf32>) -> tensor<?x?xf32> |
|
return %0 : tensor<?x?xf32> |
|
} |
|
} |
|
|
|
|
|
// -----// IR Dump After Canonicalizer (canonicalize) //----- // |
|
func.func private @_matmul_dynamic(%arg0: tensor<?x?xf32>, %arg1: tensor<?x?xf32>, %arg2: tensor<?x?xf32>) -> tensor<?x?xf32> { |
|
%0 = linalg.matmul ins(%arg0, %arg1 : tensor<?x?xf32>, tensor<?x?xf32>) outs(%arg2 : tensor<?x?xf32>) -> tensor<?x?xf32> |
|
return %0 : tensor<?x?xf32> |
|
} |
|
|
|
// -----// IR Dump After Canonicalizer (canonicalize) //----- // |
|
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} { |
|
%c1 = arith.constant 1 : index |
|
%c0 = arith.constant 0 : index |
|
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index |
|
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index |
|
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xf32>{%0, %1} |
|
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index |
|
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index |
|
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xf32>{%3, %4} |
|
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index |
|
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index |
|
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xf32>{%6, %7} |
|
%9 = call @_matmul_dynamic(%2, %5, %8) : (tensor<?x?xf32>, tensor<?x?xf32>, tensor<?x?xf32>) -> tensor<?x?xf32> |
|
%dim = tensor.dim %9, %c0 : tensor<?x?xf32> |
|
%dim_0 = tensor.dim %9, %c1 : tensor<?x?xf32> |
|
%10 = hal.tensor.export %9 "output0" : tensor<?x?xf32>{%dim, %dim_0} -> !hal.buffer_view |
|
return %10 : !hal.buffer_view |
|
} |
|
|
|
// -----// IR Dump After Canonicalizer (canonicalize) //----- // |
|
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} { |
|
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index |
|
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index |
|
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xf32>{%0, %1} |
|
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index |
|
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index |
|
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xf32>{%3, %4} |
|
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index |
|
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index |
|
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xf32>{%6, %7} |
|
%9 = linalg.matmul ins(%2, %5 : tensor<?x?xf32>, tensor<?x?xf32>) outs(%8 : tensor<?x?xf32>) -> tensor<?x?xf32> |
|
%10 = hal.tensor.export %9 "output0" : tensor<?x?xf32>{%6, %7} -> !hal.buffer_view |
|
return %10 : !hal.buffer_view |
|
} |
|
|
|
// -----// IR Dump After Inliner (inline) //----- // |
|
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "znver4", cpu_features = "+mmx,+popcnt,+sse,+sse2,+sse3,+ssse3,+sse4.1,+sse4.2,+avx,+avx2,+sse4a,+fma,+avx512f,+bmi,+bmi2,+aes,+pclmul,+avx512vl,+avx512bw,+avx512dq,+avx512cd,+avx512vbmi,+avx512ifma,+avx512vpopcntdq,+avx512vbmi2,+gfni,+vpclmulqdq,+avx512vnni,+avx512bitalg,+avx512bf16,+adx,+clflushopt,+clwb,+clzero,+cx16,+cx8,+f16c,+fsgsbase,+crc32,+invpcid,+rdpru,+sahf,+lzcnt,+movbe,+mwaitx,+x87,+pku,+prfchw,+rdpid,+rdrnd,+rdseed,+sha,+shstk,+vaes,+wbnoinvd,+xsave,+xsavec,+xsaveopt,+xsaves,+fxsr,+evex512", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "all"}> |
|
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#executable_target_embedded_elf_x86_64_]}> |
|
module attributes {hal.device.targets = [#device_target_llvm_cpu]} { |
|
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} { |
|
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index |
|
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index |
|
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xf32>{%0, %1} |
|
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index |
|
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index |
|
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xf32>{%3, %4} |
|
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index |
|
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index |
|
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xf32>{%6, %7} |
|
%9 = linalg.matmul ins(%2, %5 : tensor<?x?xf32>, tensor<?x?xf32>) outs(%8 : tensor<?x?xf32>) -> tensor<?x?xf32> |
|
%10 = hal.tensor.export %9 "output0" : tensor<?x?xf32>{%6, %7} -> !hal.buffer_view |
|
return %10 : !hal.buffer_view |
|
} |
|
} |
|
|
|
|
|
// -----// IR Dump After Canonicalizer (canonicalize) //----- // |
|
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} { |
|
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index |
|
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index |
|
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xf32>{%0, %1} |
|
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index |
|
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index |
|
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xf32>{%3, %4} |
|
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index |
|
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index |
|
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xf32>{%6, %7} |
|
%9 = linalg.matmul ins(%2, %5 : tensor<?x?xf32>, tensor<?x?xf32>) outs(%8 : tensor<?x?xf32>) -> tensor<?x?xf32> |
|
%10 = hal.tensor.export %9 "output0" : tensor<?x?xf32>{%6, %7} -> !hal.buffer_view |
|
return %10 : !hal.buffer_view |
|
} |
|
|
|
// -----// IR Dump After CSE (cse) //----- // |
|
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} { |
|
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index |
|
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index |
|
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xf32>{%0, %1} |
|
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index |
|
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index |
|
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xf32>{%3, %4} |
|
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index |
|
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index |
|
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xf32>{%6, %7} |
|
%9 = linalg.matmul ins(%2, %5 : tensor<?x?xf32>, tensor<?x?xf32>) outs(%8 : tensor<?x?xf32>) -> tensor<?x?xf32> |
|
%10 = hal.tensor.export %9 "output0" : tensor<?x?xf32>{%6, %7} -> !hal.buffer_view |
|
return %10 : !hal.buffer_view |
|
} |
|
|
|
// -----// IR Dump After SymbolDCE (symbol-dce) //----- // |
|
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "znver4", cpu_features = "+mmx,+popcnt,+sse,+sse2,+sse3,+ssse3,+sse4.1,+sse4.2,+avx,+avx2,+sse4a,+fma,+avx512f,+bmi,+bmi2,+aes,+pclmul,+avx512vl,+avx512bw,+avx512dq,+avx512cd,+avx512vbmi,+avx512ifma,+avx512vpopcntdq,+avx512vbmi2,+gfni,+vpclmulqdq,+avx512vnni,+avx512bitalg,+avx512bf16,+adx,+clflushopt,+clwb,+clzero,+cx16,+cx8,+f16c,+fsgsbase,+crc32,+invpcid,+rdpru,+sahf,+lzcnt,+movbe,+mwaitx,+x87,+pku,+prfchw,+rdpid,+rdrnd,+rdseed,+sha,+shstk,+vaes,+wbnoinvd,+xsave,+xsavec,+xsaveopt,+xsaves,+fxsr,+evex512", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "all"}> |
|
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#executable_target_embedded_elf_x86_64_]}> |
|
module attributes {hal.device.targets = [#device_target_llvm_cpu]} { |
|
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} { |
|
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index |
|
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index |
|
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xf32>{%0, %1} |
|
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index |
|
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index |
|
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xf32>{%3, %4} |
|
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index |
|
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index |
|
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xf32>{%6, %7} |
|
%9 = linalg.matmul ins(%2, %5 : tensor<?x?xf32>, tensor<?x?xf32>) outs(%8 : tensor<?x?xf32>) -> tensor<?x?xf32> |
|
%10 = hal.tensor.export %9 "output0" : tensor<?x?xf32>{%6, %7} -> !hal.buffer_view |
|
return %10 : !hal.buffer_view |
|
} |
|
} |
|
|
|
|
|
// -----// IR Dump After DemoteF64ToF32 (iree-util-demote-f64-to-f32) //----- // |
|
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "znver4", cpu_features = "+mmx,+popcnt,+sse,+sse2,+sse3,+ssse3,+sse4.1,+sse4.2,+avx,+avx2,+sse4a,+fma,+avx512f,+bmi,+bmi2,+aes,+pclmul,+avx512vl,+avx512bw,+avx512dq,+avx512cd,+avx512vbmi,+avx512ifma,+avx512vpopcntdq,+avx512vbmi2,+gfni,+vpclmulqdq,+avx512vnni,+avx512bitalg,+avx512bf16,+adx,+clflushopt,+clwb,+clzero,+cx16,+cx8,+f16c,+fsgsbase,+crc32,+invpcid,+rdpru,+sahf,+lzcnt,+movbe,+mwaitx,+x87,+pku,+prfchw,+rdpid,+rdrnd,+rdseed,+sha,+shstk,+vaes,+wbnoinvd,+xsave,+xsavec,+xsaveopt,+xsaves,+fxsr,+evex512", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "all"}> |
|
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#executable_target_embedded_elf_x86_64_]}> |
|
module attributes {hal.device.targets = [#device_target_llvm_cpu]} { |
|
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} { |
|
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index |
|
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index |
|
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xf32>{%0, %1} |
|
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index |
|
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index |
|
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xf32>{%3, %4} |
|
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index |
|
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index |
|
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xf32>{%6, %7} |
|
%9 = linalg.matmul ins(%2, %5 : tensor<?x?xf32>, tensor<?x?xf32>) outs(%8 : tensor<?x?xf32>) -> tensor<?x?xf32> |
|
%10 = hal.tensor.export %9 "output0" : tensor<?x?xf32>{%6, %7} -> !hal.buffer_view |
|
return %10 : !hal.buffer_view |
|
} |
|
} |
|
|
|
|
|
// -----// IR Dump After RemoveZeroExtentTensors (iree-global-opt-remove-zero-extent-tensors) //----- // |
|
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} { |
|
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index |
|
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index |
|
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xf32>{%0, %1} |
|
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index |
|
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index |
|
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xf32>{%3, %4} |
|
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index |
|
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index |
|
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xf32>{%6, %7} |
|
%9 = linalg.matmul ins(%2, %5 : tensor<?x?xf32>, tensor<?x?xf32>) outs(%8 : tensor<?x?xf32>) -> tensor<?x?xf32> |
|
%10 = hal.tensor.export %9 "output0" : tensor<?x?xf32>{%6, %7} -> !hal.buffer_view |
|
return %10 : !hal.buffer_view |
|
} |
|
|
|
// -----// IR Dump After DetachElementwiseFromNamedOps (iree-global-opt-detach-elementwise-from-named-ops) //----- // |
|
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} { |
|
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index |
|
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index |
|
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xf32>{%0, %1} |
|
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index |
|
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index |
|
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xf32>{%3, %4} |
|
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index |
|
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index |
|
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xf32>{%6, %7} |
|
%9 = linalg.matmul ins(%2, %5 : tensor<?x?xf32>, tensor<?x?xf32>) outs(%8 : tensor<?x?xf32>) -> tensor<?x?xf32> |
|
%10 = hal.tensor.export %9 "output0" : tensor<?x?xf32>{%6, %7} -> !hal.buffer_view |
|
return %10 : !hal.buffer_view |
|
} |
|
|
|
// -----// IR Dump After LinalgNamedOpConversion (linalg-named-op-conversion) //----- // |
|
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} { |
|
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index |
|
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index |
|
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xf32>{%0, %1} |
|
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index |
|
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index |
|
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xf32>{%3, %4} |
|
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index |
|
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index |
|
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xf32>{%6, %7} |
|
%9 = linalg.matmul ins(%2, %5 : tensor<?x?xf32>, tensor<?x?xf32>) outs(%8 : tensor<?x?xf32>) -> tensor<?x?xf32> |
|
%10 = hal.tensor.export %9 "output0" : tensor<?x?xf32>{%6, %7} -> !hal.buffer_view |
|
return %10 : !hal.buffer_view |
|
} |
|
|
|
// -----// IR Dump After Convert1X1FilterConv2DToMatmul (iree-global-opt-convert-1x1-filter-conv2d-to-matmul) //----- // |
|
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} { |
|
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index |
|
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index |
|
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xf32>{%0, %1} |
|
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index |
|
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index |
|
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xf32>{%3, %4} |
|
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index |
|
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index |
|
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xf32>{%6, %7} |
|
%9 = linalg.matmul ins(%2, %5 : tensor<?x?xf32>, tensor<?x?xf32>) outs(%8 : tensor<?x?xf32>) -> tensor<?x?xf32> |
|
%10 = hal.tensor.export %9 "output0" : tensor<?x?xf32>{%6, %7} -> !hal.buffer_view |
|
return %10 : !hal.buffer_view |
|
} |
|
|
|
// -----// IR Dump After EraseUnusedLinalgOperands (iree-global-opt-erase-unused-linalg-operands) //----- // |
|
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "znver4", cpu_features = "+mmx,+popcnt,+sse,+sse2,+sse3,+ssse3,+sse4.1,+sse4.2,+avx,+avx2,+sse4a,+fma,+avx512f,+bmi,+bmi2,+aes,+pclmul,+avx512vl,+avx512bw,+avx512dq,+avx512cd,+avx512vbmi,+avx512ifma,+avx512vpopcntdq,+avx512vbmi2,+gfni,+vpclmulqdq,+avx512vnni,+avx512bitalg,+avx512bf16,+adx,+clflushopt,+clwb,+clzero,+cx16,+cx8,+f16c,+fsgsbase,+crc32,+invpcid,+rdpru,+sahf,+lzcnt,+movbe,+mwaitx,+x87,+pku,+prfchw,+rdpid,+rdrnd,+rdseed,+sha,+shstk,+vaes,+wbnoinvd,+xsave,+xsavec,+xsaveopt,+xsaves,+fxsr,+evex512", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "all"}> |
|
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#executable_target_embedded_elf_x86_64_]}> |
|
module attributes {hal.device.targets = [#device_target_llvm_cpu]} { |
|
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} { |
|
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index |
|
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index |
|
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xf32>{%0, %1} |
|
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index |
|
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index |
|
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xf32>{%3, %4} |
|
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index |
|
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index |
|
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xf32>{%6, %7} |
|
%9 = linalg.matmul ins(%2, %5 : tensor<?x?xf32>, tensor<?x?xf32>) outs(%8 : tensor<?x?xf32>) -> tensor<?x?xf32> |
|
%10 = hal.tensor.export %9 "output0" : tensor<?x?xf32>{%6, %7} -> !hal.buffer_view |
|
return %10 : !hal.buffer_view |
|
} |
|
} |
|
|
|
|
|
// -----// IR Dump After ExpandTensorShapes (iree-global-opt-expand-tensor-shapes) //----- // |
|
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "znver4", cpu_features = "+mmx,+popcnt,+sse,+sse2,+sse3,+ssse3,+sse4.1,+sse4.2,+avx,+avx2,+sse4a,+fma,+avx512f,+bmi,+bmi2,+aes,+pclmul,+avx512vl,+avx512bw,+avx512dq,+avx512cd,+avx512vbmi,+avx512ifma,+avx512vpopcntdq,+avx512vbmi2,+gfni,+vpclmulqdq,+avx512vnni,+avx512bitalg,+avx512bf16,+adx,+clflushopt,+clwb,+clzero,+cx16,+cx8,+f16c,+fsgsbase,+crc32,+invpcid,+rdpru,+sahf,+lzcnt,+movbe,+mwaitx,+x87,+pku,+prfchw,+rdpid,+rdrnd,+rdseed,+sha,+shstk,+vaes,+wbnoinvd,+xsave,+xsavec,+xsaveopt,+xsaves,+fxsr,+evex512", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "all"}> |
|
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#executable_target_embedded_elf_x86_64_]}> |
|
module attributes {hal.device.targets = [#device_target_llvm_cpu]} { |
|
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} { |
|
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index |
|
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index |
|
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xf32>{%0, %1} |
|
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index |
|
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index |
|
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xf32>{%3, %4} |
|
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index |
|
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index |
|
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xf32>{%6, %7} |
|
%9 = linalg.matmul ins(%2, %5 : tensor<?x?xf32>, tensor<?x?xf32>) outs(%8 : tensor<?x?xf32>) -> tensor<?x?xf32> |
|
%10 = hal.tensor.export %9 "output0" : tensor<?x?xf32>{%6, %7} -> !hal.buffer_view |
|
return %10 : !hal.buffer_view |
|
} |
|
} |
|
|
|
|
|
// -----// IR Dump After ConvertElementwiseToLinalg (convert-elementwise-to-linalg) //----- // |
|
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} { |
|
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index |
|
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index |
|
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xf32>{%0, %1} |
|
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index |
|
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index |
|
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xf32>{%3, %4} |
|
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index |
|
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index |
|
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xf32>{%6, %7} |
|
%9 = linalg.matmul ins(%2, %5 : tensor<?x?xf32>, tensor<?x?xf32>) outs(%8 : tensor<?x?xf32>) -> tensor<?x?xf32> |
|
%10 = hal.tensor.export %9 "output0" : tensor<?x?xf32>{%6, %7} -> !hal.buffer_view |
|
return %10 : !hal.buffer_view |
|
} |
|
|
|
// -----// IR Dump After RaiseSpecialOps (iree-global-opt-raise-special-ops) //----- // |
|
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} { |
|
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index |
|
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index |
|
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xf32>{%0, %1} |
|
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index |
|
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index |
|
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xf32>{%3, %4} |
|
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index |
|
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index |
|
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xf32>{%6, %7} |
|
%9 = linalg.matmul ins(%2, %5 : tensor<?x?xf32>, tensor<?x?xf32>) outs(%8 : tensor<?x?xf32>) -> tensor<?x?xf32> |
|
%10 = hal.tensor.export %9 "output0" : tensor<?x?xf32>{%6, %7} -> !hal.buffer_view |
|
return %10 : !hal.buffer_view |
|
} |
|
|
|
// -----// IR Dump After DecomposeConcat (iree-global-opt-decompose-concat) //----- // |
|
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} { |
|
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index |
|
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index |
|
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xf32>{%0, %1} |
|
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index |
|
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index |
|
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xf32>{%3, %4} |
|
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index |
|
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index |
|
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xf32>{%6, %7} |
|
%9 = linalg.matmul ins(%2, %5 : tensor<?x?xf32>, tensor<?x?xf32>) outs(%8 : tensor<?x?xf32>) -> tensor<?x?xf32> |
|
%10 = hal.tensor.export %9 "output0" : tensor<?x?xf32>{%6, %7} -> !hal.buffer_view |
|
return %10 : !hal.buffer_view |
|
} |
|
|
|
// -----// IR Dump After GeneralizeLinalgNamedOps (iree-global-opt-generalize-linalg-named-ops) //----- // |
|
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} { |
|
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index |
|
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index |
|
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xf32>{%0, %1} |
|
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index |
|
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index |
|
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xf32>{%3, %4} |
|
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index |
|
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index |
|
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xf32>{%6, %7} |
|
%9 = linalg.matmul ins(%2, %5 : tensor<?x?xf32>, tensor<?x?xf32>) outs(%8 : tensor<?x?xf32>) -> tensor<?x?xf32> |
|
%10 = hal.tensor.export %9 "output0" : tensor<?x?xf32>{%6, %7} -> !hal.buffer_view |
|
return %10 : !hal.buffer_view |
|
} |
|
|
|
// -----// IR Dump After FoldUnitExtentDims (iree-flow-fold-unit-extent-dims) //----- // |
|
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} { |
|
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index |
|
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index |
|
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xf32>{%0, %1} |
|
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index |
|
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index |
|
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xf32>{%3, %4} |
|
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index |
|
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index |
|
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xf32>{%6, %7} |
|
%9 = linalg.matmul ins(%2, %5 : tensor<?x?xf32>, tensor<?x?xf32>) outs(%8 : tensor<?x?xf32>) -> tensor<?x?xf32> |
|
%10 = hal.tensor.export %9 "output0" : tensor<?x?xf32>{%6, %7} -> !hal.buffer_view |
|
return %10 : !hal.buffer_view |
|
} |
|
|
|
// -----// IR Dump After FuseDequantizationMatmul (iree-global-opt-fuse-dequantization-matmul) //----- // |
|
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} { |
|
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index |
|
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index |
|
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xf32>{%0, %1} |
|
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index |
|
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index |
|
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xf32>{%3, %4} |
|
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index |
|
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index |
|
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xf32>{%6, %7} |
|
%9 = linalg.matmul ins(%2, %5 : tensor<?x?xf32>, tensor<?x?xf32>) outs(%8 : tensor<?x?xf32>) -> tensor<?x?xf32> |
|
%10 = hal.tensor.export %9 "output0" : tensor<?x?xf32>{%6, %7} -> !hal.buffer_view |
|
return %10 : !hal.buffer_view |
|
} |
|
|
|
// -----// IR Dump After Canonicalizer (canonicalize) //----- // |
|
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} { |
|
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index |
|
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index |
|
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xf32>{%0, %1} |
|
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index |
|
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index |
|
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xf32>{%3, %4} |
|
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index |
|
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index |
|
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xf32>{%6, %7} |
|
%9 = linalg.matmul ins(%2, %5 : tensor<?x?xf32>, tensor<?x?xf32>) outs(%8 : tensor<?x?xf32>) -> tensor<?x?xf32> |
|
%10 = hal.tensor.export %9 "output0" : tensor<?x?xf32>{%6, %7} -> !hal.buffer_view |
|
return %10 : !hal.buffer_view |
|
} |
|
|
|
// -----// IR Dump After CSE (cse) //----- // |
|
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} { |
|
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index |
|
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index |
|
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xf32>{%0, %1} |
|
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index |
|
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index |
|
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xf32>{%3, %4} |
|
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index |
|
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index |
|
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xf32>{%6, %7} |
|
%9 = linalg.matmul ins(%2, %5 : tensor<?x?xf32>, tensor<?x?xf32>) outs(%8 : tensor<?x?xf32>) -> tensor<?x?xf32> |
|
%10 = hal.tensor.export %9 "output0" : tensor<?x?xf32>{%6, %7} -> !hal.buffer_view |
|
return %10 : !hal.buffer_view |
|
} |
|
|
|
// -----// IR Dump After Canonicalizer (canonicalize) //----- // |
|
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} { |
|
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index |
|
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index |
|
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xf32>{%0, %1} |
|
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index |
|
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index |
|
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xf32>{%3, %4} |
|
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index |
|
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index |
|
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xf32>{%6, %7} |
|
%9 = linalg.matmul ins(%2, %5 : tensor<?x?xf32>, tensor<?x?xf32>) outs(%8 : tensor<?x?xf32>) -> tensor<?x?xf32> |
|
%10 = hal.tensor.export %9 "output0" : tensor<?x?xf32>{%6, %7} -> !hal.buffer_view |
|
return %10 : !hal.buffer_view |
|
} |
|
|
|
// -----// IR Dump After CSE (cse) //----- // |
|
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} { |
|
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index |
|
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index |
|
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xf32>{%0, %1} |
|
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index |
|
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index |
|
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xf32>{%3, %4} |
|
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index |
|
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index |
|
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xf32>{%6, %7} |
|
%9 = linalg.matmul ins(%2, %5 : tensor<?x?xf32>, tensor<?x?xf32>) outs(%8 : tensor<?x?xf32>) -> tensor<?x?xf32> |
|
%10 = hal.tensor.export %9 "output0" : tensor<?x?xf32>{%6, %7} -> !hal.buffer_view |
|
return %10 : !hal.buffer_view |
|
} |
|
|
|
// -----// IR Dump After SetEncoding (iree-global-opt-set-encoding) //----- // |
|
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "znver4", cpu_features = "+mmx,+popcnt,+sse,+sse2,+sse3,+ssse3,+sse4.1,+sse4.2,+avx,+avx2,+sse4a,+fma,+avx512f,+bmi,+bmi2,+aes,+pclmul,+avx512vl,+avx512bw,+avx512dq,+avx512cd,+avx512vbmi,+avx512ifma,+avx512vpopcntdq,+avx512vbmi2,+gfni,+vpclmulqdq,+avx512vnni,+avx512bitalg,+avx512bf16,+adx,+clflushopt,+clwb,+clzero,+cx16,+cx8,+f16c,+fsgsbase,+crc32,+invpcid,+rdpru,+sahf,+lzcnt,+movbe,+mwaitx,+x87,+pku,+prfchw,+rdpid,+rdrnd,+rdseed,+sha,+shstk,+vaes,+wbnoinvd,+xsave,+xsavec,+xsaveopt,+xsaves,+fxsr,+evex512", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "all"}> |
|
#map = affine_map<(d0, d1, d2) -> (d0, d2)> |
|
#map1 = affine_map<(d0, d1, d2) -> (d2, d1)> |
|
#map2 = affine_map<(d0, d1, d2) -> (d0, d1)> |
|
#map3 = affine_map<()[s0, s1] -> (-s1 + (s1 ceildiv s0) * s0)> |
|
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#executable_target_embedded_elf_x86_64_]}> |
|
module attributes {hal.device.targets = [#device_target_llvm_cpu]} { |
|
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} { |
|
%cst = arith.constant 0.000000e+00 : f32 |
|
%c1 = arith.constant 1 : index |
|
%c0 = arith.constant 0 : index |
|
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index |
|
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index |
|
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xf32>{%0, %1} |
|
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index |
|
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index |
|
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xf32>{%3, %4} |
|
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index |
|
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index |
|
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xf32>{%6, %7} |
|
%9:2 = iree_linalg_ext.upper_bound_tile_size tensor<?x?xf32, #iree_linalg_ext.encoding<role = LHS, element_types = [f32, f32, f32], user_indexing_maps = [#map, #map1, #map2]>> -> index, index |
|
%dim = tensor.dim %2, %c0 : tensor<?x?xf32> |
|
%10 = affine.apply #map3()[%9#0, %dim] |
|
%dim_0 = tensor.dim %2, %c1 : tensor<?x?xf32> |
|
%11 = affine.apply #map3()[%9#1, %dim_0] |
|
%padded = tensor.pad %2 low[0, 0] high[%10, %11] { |
|
^bb0(%arg3: index, %arg4: index): |
|
tensor.yield %cst : f32 |
|
} : tensor<?x?xf32> to tensor<?x?xf32> |
|
%12 = iree_linalg_ext.set_encoding %padded : tensor<?x?xf32> -> tensor<?x?xf32, #iree_linalg_ext.encoding<role = LHS, element_types = [f32, f32, f32], user_indexing_maps = [#map, #map1, #map2]>> |
|
%13:2 = iree_linalg_ext.upper_bound_tile_size tensor<?x?xf32, #iree_linalg_ext.encoding<role = RHS, element_types = [f32, f32, f32], user_indexing_maps = [#map, #map1, #map2]>> -> index, index |
|
%dim_1 = tensor.dim %5, %c0 : tensor<?x?xf32> |
|
%14 = affine.apply #map3()[%13#0, %dim_1] |
|
%dim_2 = tensor.dim %5, %c1 : tensor<?x?xf32> |
|
%15 = affine.apply #map3()[%13#1, %dim_2] |
|
%padded_3 = tensor.pad %5 low[0, 0] high[%14, %15] { |
|
^bb0(%arg3: index, %arg4: index): |
|
tensor.yield %cst : f32 |
|
} : tensor<?x?xf32> to tensor<?x?xf32> |
|
%16 = iree_linalg_ext.set_encoding %padded_3 : tensor<?x?xf32> -> tensor<?x?xf32, #iree_linalg_ext.encoding<role = RHS, element_types = [f32, f32, f32], user_indexing_maps = [#map, #map1, #map2]>> |
|
%17:2 = iree_linalg_ext.upper_bound_tile_size tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32], user_indexing_maps = [#map, #map1, #map2]>> -> index, index |
|
%dim_4 = tensor.dim %8, %c0 : tensor<?x?xf32> |
|
%18 = affine.apply #map3()[%17#0, %dim_4] |
|
%dim_5 = tensor.dim %8, %c1 : tensor<?x?xf32> |
|
%19 = affine.apply #map3()[%17#1, %dim_5] |
|
%padded_6 = tensor.pad %8 low[0, 0] high[%18, %19] { |
|
^bb0(%arg3: index, %arg4: index): |
|
tensor.yield %cst : f32 |
|
} : tensor<?x?xf32> to tensor<?x?xf32> |
|
%20 = iree_linalg_ext.set_encoding %padded_6 : tensor<?x?xf32> -> tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32], user_indexing_maps = [#map, #map1, #map2]>> |
|
%21 = linalg.matmul ins(%12, %16 : tensor<?x?xf32, #iree_linalg_ext.encoding<role = LHS, element_types = [f32, f32, f32], user_indexing_maps = [#map, #map1, #map2]>>, tensor<?x?xf32, #iree_linalg_ext.encoding<role = RHS, element_types = [f32, f32, f32], user_indexing_maps = [#map, #map1, #map2]>>) outs(%20 : tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32], user_indexing_maps = [#map, #map1, #map2]>>) -> tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32], user_indexing_maps = [#map, #map1, #map2]>> |
|
%dim_7 = tensor.dim %8, %c0 : tensor<?x?xf32> |
|
%dim_8 = tensor.dim %8, %c1 : tensor<?x?xf32> |
|
%22 = iree_linalg_ext.unset_encoding %21 : tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32], user_indexing_maps = [#map, #map1, #map2]>> -> tensor<?x?xf32> |
|
%extracted_slice = tensor.extract_slice %22[0, 0] [%dim_7, %dim_8] [1, 1] : tensor<?x?xf32> to tensor<?x?xf32> |
|
%23 = hal.tensor.export %extracted_slice "output0" : tensor<?x?xf32>{%6, %7} -> !hal.buffer_view |
|
return %23 : !hal.buffer_view |
|
} |
|
} |
|
|
|
|
|
// -----// IR Dump After CPUMaterializeUpperBoundTileSize (iree-codegen-cpu-materialize-upper-bound-tile-size) //----- // |
|
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} { |
|
%c16 = arith.constant 16 : index |
|
%cst = arith.constant 0.000000e+00 : f32 |
|
%c1 = arith.constant 1 : index |
|
%c0 = arith.constant 0 : index |
|
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index |
|
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index |
|
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xf32>{%0, %1} |
|
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index |
|
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index |
|
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xf32>{%3, %4} |
|
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index |
|
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index |
|
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xf32>{%6, %7} |
|
%dim = tensor.dim %2, %c0 : tensor<?x?xf32> |
|
%9 = affine.apply affine_map<()[s0, s1] -> (-s1 + (s1 ceildiv s0) * s0)>()[%c16, %dim] |
|
%dim_0 = tensor.dim %2, %c1 : tensor<?x?xf32> |
|
%10 = affine.apply affine_map<()[s0, s1] -> (-s1 + (s1 ceildiv s0) * s0)>()[%c1, %dim_0] |
|
%padded = tensor.pad %2 low[0, 0] high[%9, %10] { |
|
^bb0(%arg3: index, %arg4: index): |
|
tensor.yield %cst : f32 |
|
} : tensor<?x?xf32> to tensor<?x?xf32> |
|
%11 = iree_linalg_ext.set_encoding %padded : tensor<?x?xf32> -> tensor<?x?xf32, #iree_linalg_ext.encoding<role = LHS, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>> |
|
%dim_1 = tensor.dim %5, %c0 : tensor<?x?xf32> |
|
%12 = affine.apply affine_map<()[s0, s1] -> (-s1 + (s1 ceildiv s0) * s0)>()[%c1, %dim_1] |
|
%dim_2 = tensor.dim %5, %c1 : tensor<?x?xf32> |
|
%13 = affine.apply affine_map<()[s0, s1] -> (-s1 + (s1 ceildiv s0) * s0)>()[%c16, %dim_2] |
|
%padded_3 = tensor.pad %5 low[0, 0] high[%12, %13] { |
|
^bb0(%arg3: index, %arg4: index): |
|
tensor.yield %cst : f32 |
|
} : tensor<?x?xf32> to tensor<?x?xf32> |
|
%14 = iree_linalg_ext.set_encoding %padded_3 : tensor<?x?xf32> -> tensor<?x?xf32, #iree_linalg_ext.encoding<role = RHS, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>> |
|
%dim_4 = tensor.dim %8, %c0 : tensor<?x?xf32> |
|
%15 = affine.apply affine_map<()[s0, s1] -> (-s1 + (s1 ceildiv s0) * s0)>()[%c16, %dim_4] |
|
%dim_5 = tensor.dim %8, %c1 : tensor<?x?xf32> |
|
%16 = affine.apply affine_map<()[s0, s1] -> (-s1 + (s1 ceildiv s0) * s0)>()[%c16, %dim_5] |
|
%padded_6 = tensor.pad %8 low[0, 0] high[%15, %16] { |
|
^bb0(%arg3: index, %arg4: index): |
|
tensor.yield %cst : f32 |
|
} : tensor<?x?xf32> to tensor<?x?xf32> |
|
%17 = iree_linalg_ext.set_encoding %padded_6 : tensor<?x?xf32> -> tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>> |
|
%18 = linalg.matmul ins(%11, %14 : tensor<?x?xf32, #iree_linalg_ext.encoding<role = LHS, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>, tensor<?x?xf32, #iree_linalg_ext.encoding<role = RHS, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>) outs(%17 : tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>) -> tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>> |
|
%dim_7 = tensor.dim %8, %c0 : tensor<?x?xf32> |
|
%dim_8 = tensor.dim %8, %c1 : tensor<?x?xf32> |
|
%19 = iree_linalg_ext.unset_encoding %18 : tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>> -> tensor<?x?xf32> |
|
%extracted_slice = tensor.extract_slice %19[0, 0] [%dim_7, %dim_8] [1, 1] : tensor<?x?xf32> to tensor<?x?xf32> |
|
%20 = hal.tensor.export %extracted_slice "output0" : tensor<?x?xf32>{%6, %7} -> !hal.buffer_view |
|
return %20 : !hal.buffer_view |
|
} |
|
|
|
// -----// IR Dump After CPUMaterializeEncoding (iree-codegen-cpu-materialize-encoding) //----- // |
|
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} { |
|
%cst = arith.constant 0.000000e+00 : f32 |
|
%c1 = arith.constant 1 : index |
|
%c0 = arith.constant 0 : index |
|
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index |
|
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index |
|
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xf32>{%0, %1} |
|
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index |
|
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index |
|
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xf32>{%3, %4} |
|
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index |
|
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index |
|
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xf32>{%6, %7} |
|
%dim = tensor.dim %2, %c0 : tensor<?x?xf32> |
|
%dim_0 = tensor.dim %2, %c1 : tensor<?x?xf32> |
|
%9 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%dim] |
|
%10 = tensor.empty(%9, %dim_0) : tensor<?x?x16x1xf32> |
|
%pack = tensor.pack %2 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 1] into %10 : tensor<?x?xf32> -> tensor<?x?x16x1xf32> |
|
%dim_1 = tensor.dim %5, %c0 : tensor<?x?xf32> |
|
%dim_2 = tensor.dim %5, %c1 : tensor<?x?xf32> |
|
%11 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%dim_2] |
|
%12 = tensor.empty(%11, %dim_1) : tensor<?x?x16x1xf32> |
|
%pack_3 = tensor.pack %5 padding_value(%cst : f32) outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [16, 1] into %12 : tensor<?x?xf32> -> tensor<?x?x16x1xf32> |
|
%dim_4 = tensor.dim %8, %c0 : tensor<?x?xf32> |
|
%dim_5 = tensor.dim %8, %c1 : tensor<?x?xf32> |
|
%13 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%dim_4] |
|
%14 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%dim_5] |
|
%15 = tensor.empty(%13, %14) : tensor<?x?x16x16xf32> |
|
%pack_6 = tensor.pack %8 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %15 : tensor<?x?xf32> -> tensor<?x?x16x16xf32> |
|
%16 = linalg.mmt4d ins(%pack, %pack_3 : tensor<?x?x16x1xf32>, tensor<?x?x16x1xf32>) outs(%pack_6 : tensor<?x?x16x16xf32>) -> tensor<?x?x16x16xf32> |
|
%dim_7 = tensor.dim %8, %c0 : tensor<?x?xf32> |
|
%dim_8 = tensor.dim %8, %c1 : tensor<?x?xf32> |
|
%17 = tensor.empty(%dim_7, %dim_8) : tensor<?x?xf32> |
|
%unpack = tensor.unpack %16 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %17 : tensor<?x?x16x16xf32> -> tensor<?x?xf32> |
|
%18 = hal.tensor.export %unpack "output0" : tensor<?x?xf32>{%6, %7} -> !hal.buffer_view |
|
return %18 : !hal.buffer_view |
|
} |
|
|
|
// -----// IR Dump After MaterializeHomogeneousEncodings (iree-global-opt-materialize-homogeneous-encodings) //----- // |
|
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "znver4", cpu_features = "+mmx,+popcnt,+sse,+sse2,+sse3,+ssse3,+sse4.1,+sse4.2,+avx,+avx2,+sse4a,+fma,+avx512f,+bmi,+bmi2,+aes,+pclmul,+avx512vl,+avx512bw,+avx512dq,+avx512cd,+avx512vbmi,+avx512ifma,+avx512vpopcntdq,+avx512vbmi2,+gfni,+vpclmulqdq,+avx512vnni,+avx512bitalg,+avx512bf16,+adx,+clflushopt,+clwb,+clzero,+cx16,+cx8,+f16c,+fsgsbase,+crc32,+invpcid,+rdpru,+sahf,+lzcnt,+movbe,+mwaitx,+x87,+pku,+prfchw,+rdpid,+rdrnd,+rdseed,+sha,+shstk,+vaes,+wbnoinvd,+xsave,+xsavec,+xsaveopt,+xsaves,+fxsr,+evex512", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "all"}> |
|
#map = affine_map<()[s0] -> (s0 ceildiv 16)> |
|
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#executable_target_embedded_elf_x86_64_]}> |
|
module attributes {hal.device.targets = [#device_target_llvm_cpu]} { |
|
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} { |
|
%cst = arith.constant 0.000000e+00 : f32 |
|
%c1 = arith.constant 1 : index |
|
%c0 = arith.constant 0 : index |
|
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index |
|
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index |
|
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xf32>{%0, %1} |
|
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index |
|
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index |
|
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xf32>{%3, %4} |
|
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index |
|
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index |
|
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xf32>{%6, %7} |
|
%dim = tensor.dim %2, %c0 : tensor<?x?xf32> |
|
%dim_0 = tensor.dim %2, %c1 : tensor<?x?xf32> |
|
%9 = affine.apply #map()[%dim] |
|
%10 = tensor.empty(%9, %dim_0) : tensor<?x?x16x1xf32> |
|
%pack = tensor.pack %2 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 1] into %10 : tensor<?x?xf32> -> tensor<?x?x16x1xf32> |
|
%dim_1 = tensor.dim %5, %c0 : tensor<?x?xf32> |
|
%dim_2 = tensor.dim %5, %c1 : tensor<?x?xf32> |
|
%11 = affine.apply #map()[%dim_2] |
|
%12 = tensor.empty(%11, %dim_1) : tensor<?x?x16x1xf32> |
|
%pack_3 = tensor.pack %5 padding_value(%cst : f32) outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [16, 1] into %12 : tensor<?x?xf32> -> tensor<?x?x16x1xf32> |
|
%dim_4 = tensor.dim %8, %c0 : tensor<?x?xf32> |
|
%dim_5 = tensor.dim %8, %c1 : tensor<?x?xf32> |
|
%13 = affine.apply #map()[%dim_4] |
|
%14 = affine.apply #map()[%dim_5] |
|
%15 = tensor.empty(%13, %14) : tensor<?x?x16x16xf32> |
|
%pack_6 = tensor.pack %8 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %15 : tensor<?x?xf32> -> tensor<?x?x16x16xf32> |
|
%16 = linalg.mmt4d ins(%pack, %pack_3 : tensor<?x?x16x1xf32>, tensor<?x?x16x1xf32>) outs(%pack_6 : tensor<?x?x16x16xf32>) -> tensor<?x?x16x16xf32> |
|
%dim_7 = tensor.dim %8, %c0 : tensor<?x?xf32> |
|
%dim_8 = tensor.dim %8, %c1 : tensor<?x?xf32> |
|
%17 = tensor.empty(%dim_7, %dim_8) : tensor<?x?xf32> |
|
%unpack = tensor.unpack %16 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %17 : tensor<?x?x16x16xf32> -> tensor<?x?xf32> |
|
%18 = hal.tensor.export %unpack "output0" : tensor<?x?xf32>{%6, %7} -> !hal.buffer_view |
|
return %18 : !hal.buffer_view |
|
} |
|
} |
|
|
|
|
|
// -----// IR Dump After Canonicalizer (canonicalize) //----- // |
|
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "znver4", cpu_features = "+mmx,+popcnt,+sse,+sse2,+sse3,+ssse3,+sse4.1,+sse4.2,+avx,+avx2,+sse4a,+fma,+avx512f,+bmi,+bmi2,+aes,+pclmul,+avx512vl,+avx512bw,+avx512dq,+avx512cd,+avx512vbmi,+avx512ifma,+avx512vpopcntdq,+avx512vbmi2,+gfni,+vpclmulqdq,+avx512vnni,+avx512bitalg,+avx512bf16,+adx,+clflushopt,+clwb,+clzero,+cx16,+cx8,+f16c,+fsgsbase,+crc32,+invpcid,+rdpru,+sahf,+lzcnt,+movbe,+mwaitx,+x87,+pku,+prfchw,+rdpid,+rdrnd,+rdseed,+sha,+shstk,+vaes,+wbnoinvd,+xsave,+xsavec,+xsaveopt,+xsaves,+fxsr,+evex512", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "all"}> |
|
#map = affine_map<()[s0] -> (s0 ceildiv 16)> |
|
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#executable_target_embedded_elf_x86_64_]}> |
|
module attributes {hal.device.targets = [#device_target_llvm_cpu]} { |
|
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} { |
|
%cst = arith.constant 0.000000e+00 : f32 |
|
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index |
|
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index |
|
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xf32>{%0, %1} |
|
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index |
|
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index |
|
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xf32>{%3, %4} |
|
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index |
|
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index |
|
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xf32>{%6, %7} |
|
%9 = affine.apply #map()[%0] |
|
%10 = tensor.empty(%9, %1) : tensor<?x?x16x1xf32> |
|
%pack = tensor.pack %2 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 1] into %10 : tensor<?x?xf32> -> tensor<?x?x16x1xf32> |
|
%11 = affine.apply #map()[%4] |
|
%12 = tensor.empty(%11, %3) : tensor<?x?x16x1xf32> |
|
%pack_0 = tensor.pack %5 padding_value(%cst : f32) outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [16, 1] into %12 : tensor<?x?xf32> -> tensor<?x?x16x1xf32> |
|
%13 = affine.apply #map()[%6] |
|
%14 = affine.apply #map()[%7] |
|
%15 = tensor.empty(%13, %14) : tensor<?x?x16x16xf32> |
|
%pack_1 = tensor.pack %8 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %15 : tensor<?x?xf32> -> tensor<?x?x16x16xf32> |
|
%16 = linalg.mmt4d ins(%pack, %pack_0 : tensor<?x?x16x1xf32>, tensor<?x?x16x1xf32>) outs(%pack_1 : tensor<?x?x16x16xf32>) -> tensor<?x?x16x16xf32> |
|
%17 = tensor.empty(%6, %7) : tensor<?x?xf32> |
|
%unpack = tensor.unpack %16 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %17 : tensor<?x?x16x16xf32> -> tensor<?x?xf32> |
|
%18 = hal.tensor.export %unpack "output0" : tensor<?x?xf32>{%6, %7} -> !hal.buffer_view |
|
return %18 : !hal.buffer_view |
|
} |
|
} |
|
|
|
|
|
// -----// IR Dump After CSE (cse) //----- // |
|
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "znver4", cpu_features = "+mmx,+popcnt,+sse,+sse2,+sse3,+ssse3,+sse4.1,+sse4.2,+avx,+avx2,+sse4a,+fma,+avx512f,+bmi,+bmi2,+aes,+pclmul,+avx512vl,+avx512bw,+avx512dq,+avx512cd,+avx512vbmi,+avx512ifma,+avx512vpopcntdq,+avx512vbmi2,+gfni,+vpclmulqdq,+avx512vnni,+avx512bitalg,+avx512bf16,+adx,+clflushopt,+clwb,+clzero,+cx16,+cx8,+f16c,+fsgsbase,+crc32,+invpcid,+rdpru,+sahf,+lzcnt,+movbe,+mwaitx,+x87,+pku,+prfchw,+rdpid,+rdrnd,+rdseed,+sha,+shstk,+vaes,+wbnoinvd,+xsave,+xsavec,+xsaveopt,+xsaves,+fxsr,+evex512", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "all"}> |
|
#map = affine_map<()[s0] -> (s0 ceildiv 16)> |
|
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#executable_target_embedded_elf_x86_64_]}> |
|
module attributes {hal.device.targets = [#device_target_llvm_cpu]} { |
|
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} { |
|
%cst = arith.constant 0.000000e+00 : f32 |
|
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index |
|
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index |
|
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xf32>{%0, %1} |
|
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index |
|
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index |
|
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xf32>{%3, %4} |
|
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index |
|
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index |
|
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xf32>{%6, %7} |
|
%9 = affine.apply #map()[%0] |
|
%10 = tensor.empty(%9, %1) : tensor<?x?x16x1xf32> |
|
%pack = tensor.pack %2 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 1] into %10 : tensor<?x?xf32> -> tensor<?x?x16x1xf32> |
|
%11 = affine.apply #map()[%4] |
|
%12 = tensor.empty(%11, %3) : tensor<?x?x16x1xf32> |
|
%pack_0 = tensor.pack %5 padding_value(%cst : f32) outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [16, 1] into %12 : tensor<?x?xf32> -> tensor<?x?x16x1xf32> |
|
%13 = affine.apply #map()[%6] |
|
%14 = affine.apply #map()[%7] |
|
%15 = tensor.empty(%13, %14) : tensor<?x?x16x16xf32> |
|
%pack_1 = tensor.pack %8 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %15 : tensor<?x?xf32> -> tensor<?x?x16x16xf32> |
|
%16 = linalg.mmt4d ins(%pack, %pack_0 : tensor<?x?x16x1xf32>, tensor<?x?x16x1xf32>) outs(%pack_1 : tensor<?x?x16x16xf32>) -> tensor<?x?x16x16xf32> |
|
%17 = tensor.empty(%6, %7) : tensor<?x?xf32> |
|
%unpack = tensor.unpack %16 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %17 : tensor<?x?x16x16xf32> -> tensor<?x?xf32> |
|
%18 = hal.tensor.export %unpack "output0" : tensor<?x?xf32>{%6, %7} -> !hal.buffer_view |
|
return %18 : !hal.buffer_view |
|
} |
|
} |
|
|
|
|
|
// -----// IR Dump After SimplifyPackUnpack (iree-global-opt-simplify-pack-unpack) //----- // |
|
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "znver4", cpu_features = "+mmx,+popcnt,+sse,+sse2,+sse3,+ssse3,+sse4.1,+sse4.2,+avx,+avx2,+sse4a,+fma,+avx512f,+bmi,+bmi2,+aes,+pclmul,+avx512vl,+avx512bw,+avx512dq,+avx512cd,+avx512vbmi,+avx512ifma,+avx512vpopcntdq,+avx512vbmi2,+gfni,+vpclmulqdq,+avx512vnni,+avx512bitalg,+avx512bf16,+adx,+clflushopt,+clwb,+clzero,+cx16,+cx8,+f16c,+fsgsbase,+crc32,+invpcid,+rdpru,+sahf,+lzcnt,+movbe,+mwaitx,+x87,+pku,+prfchw,+rdpid,+rdrnd,+rdseed,+sha,+shstk,+vaes,+wbnoinvd,+xsave,+xsavec,+xsaveopt,+xsaves,+fxsr,+evex512", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "all"}> |
|
#map = affine_map<()[s0] -> (s0 ceildiv 16)> |
|
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#executable_target_embedded_elf_x86_64_]}> |
|
module attributes {hal.device.targets = [#device_target_llvm_cpu]} { |
|
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} { |
|
%cst = arith.constant 0.000000e+00 : f32 |
|
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index |
|
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index |
|
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xf32>{%0, %1} |
|
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index |
|
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index |
|
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xf32>{%3, %4} |
|
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index |
|
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index |
|
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xf32>{%6, %7} |
|
%9 = affine.apply #map()[%0] |
|
%10 = tensor.empty(%9, %1) : tensor<?x?x16x1xf32> |
|
%pack = tensor.pack %2 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 1] into %10 : tensor<?x?xf32> -> tensor<?x?x16x1xf32> |
|
%11 = affine.apply #map()[%4] |
|
%12 = tensor.empty(%11, %3) : tensor<?x?x16x1xf32> |
|
%pack_0 = tensor.pack %5 padding_value(%cst : f32) outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [16, 1] into %12 : tensor<?x?xf32> -> tensor<?x?x16x1xf32> |
|
%13 = affine.apply #map()[%6] |
|
%14 = affine.apply #map()[%7] |
|
%15 = tensor.empty(%13, %14) : tensor<?x?x16x16xf32> |
|
%pack_1 = tensor.pack %8 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %15 : tensor<?x?xf32> -> tensor<?x?x16x16xf32> |
|
%16 = linalg.mmt4d ins(%pack, %pack_0 : tensor<?x?x16x1xf32>, tensor<?x?x16x1xf32>) outs(%pack_1 : tensor<?x?x16x16xf32>) -> tensor<?x?x16x16xf32> |
|
%17 = tensor.empty(%6, %7) : tensor<?x?xf32> |
|
%unpack = tensor.unpack %16 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %17 : tensor<?x?x16x16xf32> -> tensor<?x?xf32> |
|
%18 = hal.tensor.export %unpack "output0" : tensor<?x?xf32>{%6, %7} -> !hal.buffer_view |
|
return %18 : !hal.buffer_view |
|
} |
|
} |
|
|
|
|
|
// -----// IR Dump After GeneralizeLinalgNamedOps (iree-global-opt-generalize-linalg-named-ops) //----- // |
|
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} { |
|
%cst = arith.constant 0.000000e+00 : f32 |
|
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index |
|
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index |
|
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xf32>{%0, %1} |
|
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index |
|
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index |
|
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xf32>{%3, %4} |
|
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index |
|
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index |
|
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xf32>{%6, %7} |
|
%9 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%0] |
|
%10 = tensor.empty(%9, %1) : tensor<?x?x16x1xf32> |
|
%pack = tensor.pack %2 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 1] into %10 : tensor<?x?xf32> -> tensor<?x?x16x1xf32> |
|
%11 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%4] |
|
%12 = tensor.empty(%11, %3) : tensor<?x?x16x1xf32> |
|
%pack_0 = tensor.pack %5 padding_value(%cst : f32) outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [16, 1] into %12 : tensor<?x?xf32> -> tensor<?x?x16x1xf32> |
|
%13 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%6] |
|
%14 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%7] |
|
%15 = tensor.empty(%13, %14) : tensor<?x?x16x16xf32> |
|
%pack_1 = tensor.pack %8 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %15 : tensor<?x?xf32> -> tensor<?x?x16x16xf32> |
|
%16 = linalg.mmt4d ins(%pack, %pack_0 : tensor<?x?x16x1xf32>, tensor<?x?x16x1xf32>) outs(%pack_1 : tensor<?x?x16x16xf32>) -> tensor<?x?x16x16xf32> |
|
%17 = tensor.empty(%6, %7) : tensor<?x?xf32> |
|
%unpack = tensor.unpack %16 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %17 : tensor<?x?x16x16xf32> -> tensor<?x?xf32> |
|
%18 = hal.tensor.export %unpack "output0" : tensor<?x?xf32>{%6, %7} -> !hal.buffer_view |
|
return %18 : !hal.buffer_view |
|
} |
|
|
|
// -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- // |
|
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} { |
|
%cst = arith.constant 0.000000e+00 : f32 |
|
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index |
|
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index |
|
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xf32>{%0, %1} |
|
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index |
|
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index |
|
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xf32>{%3, %4} |
|
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index |
|
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index |
|
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xf32>{%6, %7} |
|
%9 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%0] |
|
%10 = tensor.empty(%9, %1) : tensor<?x?x16x1xf32> |
|
%pack = tensor.pack %2 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 1] into %10 : tensor<?x?xf32> -> tensor<?x?x16x1xf32> |
|
%11 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%4] |
|
%12 = tensor.empty(%11, %3) : tensor<?x?x16x1xf32> |
|
%pack_0 = tensor.pack %5 padding_value(%cst : f32) outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [16, 1] into %12 : tensor<?x?xf32> -> tensor<?x?x16x1xf32> |
|
%13 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%6] |
|
%14 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%7] |
|
%15 = tensor.empty(%13, %14) : tensor<?x?x16x16xf32> |
|
%pack_1 = tensor.pack %8 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %15 : tensor<?x?xf32> -> tensor<?x?x16x16xf32> |
|
%16 = linalg.mmt4d ins(%pack, %pack_0 : tensor<?x?x16x1xf32>, tensor<?x?x16x1xf32>) outs(%pack_1 : tensor<?x?x16x16xf32>) -> tensor<?x?x16x16xf32> |
|
%17 = tensor.empty(%6, %7) : tensor<?x?xf32> |
|
%unpack = tensor.unpack %16 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %17 : tensor<?x?x16x16xf32> -> tensor<?x?xf32> |
|
%18 = hal.tensor.export %unpack "output0" : tensor<?x?xf32>{%6, %7} -> !hal.buffer_view |
|
return %18 : !hal.buffer_view |
|
} |
|
|
|
// -----// IR Dump After ApplyPatterns (iree-util-apply-patterns) //----- // |
|
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "znver4", cpu_features = "+mmx,+popcnt,+sse,+sse2,+sse3,+ssse3,+sse4.1,+sse4.2,+avx,+avx2,+sse4a,+fma,+avx512f,+bmi,+bmi2,+aes,+pclmul,+avx512vl,+avx512bw,+avx512dq,+avx512cd,+avx512vbmi,+avx512ifma,+avx512vpopcntdq,+avx512vbmi2,+gfni,+vpclmulqdq,+avx512vnni,+avx512bitalg,+avx512bf16,+adx,+clflushopt,+clwb,+clzero,+cx16,+cx8,+f16c,+fsgsbase,+crc32,+invpcid,+rdpru,+sahf,+lzcnt,+movbe,+mwaitx,+x87,+pku,+prfchw,+rdpid,+rdrnd,+rdseed,+sha,+shstk,+vaes,+wbnoinvd,+xsave,+xsavec,+xsaveopt,+xsaves,+fxsr,+evex512", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "all"}> |
|
#map = affine_map<()[s0] -> (s0 ceildiv 16)> |
|
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#executable_target_embedded_elf_x86_64_]}> |
|
module attributes {hal.device.targets = [#device_target_llvm_cpu], iree.fixedpoint.iteration = 0 : index} { |
|
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} { |
|
%cst = arith.constant 0.000000e+00 : f32 |
|
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index |
|
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index |
|
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xf32>{%0, %1} |
|
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index |
|
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index |
|
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xf32>{%3, %4} |
|
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index |
|
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index |
|
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xf32>{%6, %7} |
|
%9 = affine.apply #map()[%0] |
|
%10 = tensor.empty(%9, %1) : tensor<?x?x16x1xf32> |
|
%pack = tensor.pack %2 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 1] into %10 : tensor<?x?xf32> -> tensor<?x?x16x1xf32> |
|
%11 = affine.apply #map()[%4] |
|
%12 = tensor.empty(%11, %3) : tensor<?x?x16x1xf32> |
|
%pack_0 = tensor.pack %5 padding_value(%cst : f32) outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [16, 1] into %12 : tensor<?x?xf32> -> tensor<?x?x16x1xf32> |
|
%13 = affine.apply #map()[%6] |
|
%14 = affine.apply #map()[%7] |
|
%15 = tensor.empty(%13, %14) : tensor<?x?x16x16xf32> |
|
%pack_1 = tensor.pack %8 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %15 : tensor<?x?xf32> -> tensor<?x?x16x16xf32> |
|
%16 = linalg.mmt4d ins(%pack, %pack_0 : tensor<?x?x16x1xf32>, tensor<?x?x16x1xf32>) outs(%pack_1 : tensor<?x?x16x16xf32>) -> tensor<?x?x16x16xf32> |
|
%17 = tensor.empty(%6, %7) : tensor<?x?xf32> |
|
%unpack = tensor.unpack %16 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %17 : tensor<?x?x16x16xf32> -> tensor<?x?xf32> |
|
%18 = hal.tensor.export %unpack "output0" : tensor<?x?xf32>{%6, %7} -> !hal.buffer_view |
|
return %18 : !hal.buffer_view |
|
} |
|
} |
|
|
|
|
|
// -----// IR Dump After FoldGlobals (iree-util-fold-globals) //----- // |
|
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "znver4", cpu_features = "+mmx,+popcnt,+sse,+sse2,+sse3,+ssse3,+sse4.1,+sse4.2,+avx,+avx2,+sse4a,+fma,+avx512f,+bmi,+bmi2,+aes,+pclmul,+avx512vl,+avx512bw,+avx512dq,+avx512cd,+avx512vbmi,+avx512ifma,+avx512vpopcntdq,+avx512vbmi2,+gfni,+vpclmulqdq,+avx512vnni,+avx512bitalg,+avx512bf16,+adx,+clflushopt,+clwb,+clzero,+cx16,+cx8,+f16c,+fsgsbase,+crc32,+invpcid,+rdpru,+sahf,+lzcnt,+movbe,+mwaitx,+x87,+pku,+prfchw,+rdpid,+rdrnd,+rdseed,+sha,+shstk,+vaes,+wbnoinvd,+xsave,+xsavec,+xsaveopt,+xsaves,+fxsr,+evex512", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "all"}> |
|
#map = affine_map<()[s0] -> (s0 ceildiv 16)> |
|
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#executable_target_embedded_elf_x86_64_]}> |
|
module attributes {hal.device.targets = [#device_target_llvm_cpu], iree.fixedpoint.iteration = 0 : index} { |
|
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} { |
|
%cst = arith.constant 0.000000e+00 : f32 |
|
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index |
|
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index |
|
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xf32>{%0, %1} |
|
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index |
|
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index |
|
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xf32>{%3, %4} |
|
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index |
|
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index |
|
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xf32>{%6, %7} |
|
%9 = affine.apply #map()[%0] |
|
%10 = tensor.empty(%9, %1) : tensor<?x?x16x1xf32> |
|
%pack = tensor.pack %2 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 1] into %10 : tensor<?x?xf32> -> tensor<?x?x16x1xf32> |
|
%11 = affine.apply #map()[%4] |
|
%12 = tensor.empty(%11, %3) : tensor<?x?x16x1xf32> |
|
%pack_0 = tensor.pack %5 padding_value(%cst : f32) outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [16, 1] into %12 : tensor<?x?xf32> -> tensor<?x?x16x1xf32> |
|
%13 = affine.apply #map()[%6] |
|
%14 = affine.apply #map()[%7] |
|
%15 = tensor.empty(%13, %14) : tensor<?x?x16x16xf32> |
|
%pack_1 = tensor.pack %8 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %15 : tensor<?x?xf32> -> tensor<?x?x16x16xf32> |
|
%16 = linalg.mmt4d ins(%pack, %pack_0 : tensor<?x?x16x1xf32>, tensor<?x?x16x1xf32>) outs(%pack_1 : tensor<?x?x16x16xf32>) -> tensor<?x?x16x16xf32> |
|
%17 = tensor.empty(%6, %7) : tensor<?x?xf32> |
|
%unpack = tensor.unpack %16 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %17 : tensor<?x?x16x16xf32> -> tensor<?x?xf32> |
|
%18 = hal.tensor.export %unpack "output0" : tensor<?x?xf32>{%6, %7} -> !hal.buffer_view |
|
return %18 : !hal.buffer_view |
|
} |
|
} |
|
|
|
|
|
// -----// IR Dump After IPO (iree-util-ipo) //----- // |
|
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "znver4", cpu_features = "+mmx,+popcnt,+sse,+sse2,+sse3,+ssse3,+sse4.1,+sse4.2,+avx,+avx2,+sse4a,+fma,+avx512f,+bmi,+bmi2,+aes,+pclmul,+avx512vl,+avx512bw,+avx512dq,+avx512cd,+avx512vbmi,+avx512ifma,+avx512vpopcntdq,+avx512vbmi2,+gfni,+vpclmulqdq,+avx512vnni,+avx512bitalg,+avx512bf16,+adx,+clflushopt,+clwb,+clzero,+cx16,+cx8,+f16c,+fsgsbase,+crc32,+invpcid,+rdpru,+sahf,+lzcnt,+movbe,+mwaitx,+x87,+pku,+prfchw,+rdpid,+rdrnd,+rdseed,+sha,+shstk,+vaes,+wbnoinvd,+xsave,+xsavec,+xsaveopt,+xsaves,+fxsr,+evex512", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "all"}> |
|
#map = affine_map<()[s0] -> (s0 ceildiv 16)> |
|
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#executable_target_embedded_elf_x86_64_]}> |
|
module attributes {hal.device.targets = [#device_target_llvm_cpu], iree.fixedpoint.iteration = 0 : index} { |
|
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} { |
|
%cst = arith.constant 0.000000e+00 : f32 |
|
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index |
|
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index |
|
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xf32>{%0, %1} |
|
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index |
|
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index |
|
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xf32>{%3, %4} |
|
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index |
|
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index |
|
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xf32>{%6, %7} |
|
%9 = affine.apply #map()[%0] |
|
%10 = tensor.empty(%9, %1) : tensor<?x?x16x1xf32> |
|
%pack = tensor.pack %2 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 1] into %10 : tensor<?x?xf32> -> tensor<?x?x16x1xf32> |
|
%11 = affine.apply #map()[%4] |
|
%12 = tensor.empty(%11, %3) : tensor<?x?x16x1xf32> |
|
%pack_0 = tensor.pack %5 padding_value(%cst : f32) outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [16, 1] into %12 : tensor<?x?xf32> -> tensor<?x?x16x1xf32> |
|
%13 = affine.apply #map()[%6] |
|
%14 = affine.apply #map()[%7] |
|
%15 = tensor.empty(%13, %14) : tensor<?x?x16x16xf32> |
|
%pack_1 = tensor.pack %8 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %15 : tensor<?x?xf32> -> tensor<?x?x16x16xf32> |
|
%16 = linalg.mmt4d ins(%pack, %pack_0 : tensor<?x?x16x1xf32>, tensor<?x?x16x1xf32>) outs(%pack_1 : tensor<?x?x16x16xf32>) -> tensor<?x?x16x16xf32> |
|
%17 = tensor.empty(%6, %7) : tensor<?x?xf32> |
|
%unpack = tensor.unpack %16 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %17 : tensor<?x?x16x16xf32> -> tensor<?x?xf32> |
|
%18 = hal.tensor.export %unpack "output0" : tensor<?x?xf32>{%6, %7} -> !hal.buffer_view |
|
return %18 : !hal.buffer_view |
|
} |
|
} |
|
|
|
|
|
// -----// IR Dump After Canonicalizer (canonicalize) //----- // |
|
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "znver4", cpu_features = "+mmx,+popcnt,+sse,+sse2,+sse3,+ssse3,+sse4.1,+sse4.2,+avx,+avx2,+sse4a,+fma,+avx512f,+bmi,+bmi2,+aes,+pclmul,+avx512vl,+avx512bw,+avx512dq,+avx512cd,+avx512vbmi,+avx512ifma,+avx512vpopcntdq,+avx512vbmi2,+gfni,+vpclmulqdq,+avx512vnni,+avx512bitalg,+avx512bf16,+adx,+clflushopt,+clwb,+clzero,+cx16,+cx8,+f16c,+fsgsbase,+crc32,+invpcid,+rdpru,+sahf,+lzcnt,+movbe,+mwaitx,+x87,+pku,+prfchw,+rdpid,+rdrnd,+rdseed,+sha,+shstk,+vaes,+wbnoinvd,+xsave,+xsavec,+xsaveopt,+xsaves,+fxsr,+evex512", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "all"}> |
|
#map = affine_map<()[s0] -> (s0 ceildiv 16)> |
|
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#executable_target_embedded_elf_x86_64_]}> |
|
module attributes {hal.device.targets = [#device_target_llvm_cpu], iree.fixedpoint.iteration = 0 : index} { |
|
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} { |
|
%cst = arith.constant 0.000000e+00 : f32 |
|
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index |
|
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index |
|
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xf32>{%0, %1} |
|
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index |
|
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index |
|
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xf32>{%3, %4} |
|
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index |
|
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index |
|
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xf32>{%6, %7} |
|
%9 = affine.apply #map()[%0] |
|
%10 = tensor.empty(%9, %1) : tensor<?x?x16x1xf32> |
|
%pack = tensor.pack %2 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 1] into %10 : tensor<?x?xf32> -> tensor<?x?x16x1xf32> |
|
%11 = affine.apply #map()[%4] |
|
%12 = tensor.empty(%11, %3) : tensor<?x?x16x1xf32> |
|
%pack_0 = tensor.pack %5 padding_value(%cst : f32) outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [16, 1] into %12 : tensor<?x?xf32> -> tensor<?x?x16x1xf32> |
|
%13 = affine.apply #map()[%6] |
|
%14 = affine.apply #map()[%7] |
|
%15 = tensor.empty(%13, %14) : tensor<?x?x16x16xf32> |
|
%pack_1 = tensor.pack %8 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %15 : tensor<?x?xf32> -> tensor<?x?x16x16xf32> |
|
%16 = linalg.mmt4d ins(%pack, %pack_0 : tensor<?x?x16x1xf32>, tensor<?x?x16x1xf32>) outs(%pack_1 : tensor<?x?x16x16xf32>) -> tensor<?x?x16x16xf32> |
|
%17 = tensor.empty(%6, %7) : tensor<?x?xf32> |
|
%unpack = tensor.unpack %16 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %17 : tensor<?x?x16x16xf32> -> tensor<?x?xf32> |
|
%18 = hal.tensor.export %unpack "output0" : tensor<?x?xf32>{%6, %7} -> !hal.buffer_view |
|
return %18 : !hal.buffer_view |
|
} |
|
} |
|
|
|
|
|
// -----// IR Dump After CSE (cse) //----- // |
|
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "znver4", cpu_features = "+mmx,+popcnt,+sse,+sse2,+sse3,+ssse3,+sse4.1,+sse4.2,+avx,+avx2,+sse4a,+fma,+avx512f,+bmi,+bmi2,+aes,+pclmul,+avx512vl,+avx512bw,+avx512dq,+avx512cd,+avx512vbmi,+avx512ifma,+avx512vpopcntdq,+avx512vbmi2,+gfni,+vpclmulqdq,+avx512vnni,+avx512bitalg,+avx512bf16,+adx,+clflushopt,+clwb,+clzero,+cx16,+cx8,+f16c,+fsgsbase,+crc32,+invpcid,+rdpru,+sahf,+lzcnt,+movbe,+mwaitx,+x87,+pku,+prfchw,+rdpid,+rdrnd,+rdseed,+sha,+shstk,+vaes,+wbnoinvd,+xsave,+xsavec,+xsaveopt,+xsaves,+fxsr,+evex512", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "all"}> |
|
#map = affine_map<()[s0] -> (s0 ceildiv 16)> |
|
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#executable_target_embedded_elf_x86_64_]}> |
|
module attributes {hal.device.targets = [#device_target_llvm_cpu], iree.fixedpoint.iteration = 0 : index} { |
|
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} { |
|
%cst = arith.constant 0.000000e+00 : f32 |
|
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index |
|
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index |
|
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xf32>{%0, %1} |
|
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index |
|
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index |
|
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xf32>{%3, %4} |
|
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index |
|
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index |
|
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xf32>{%6, %7} |
|
%9 = affine.apply #map()[%0] |
|
%10 = tensor.empty(%9, %1) : tensor<?x?x16x1xf32> |
|
%pack = tensor.pack %2 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 1] into %10 : tensor<?x?xf32> -> tensor<?x?x16x1xf32> |
|
%11 = affine.apply #map()[%4] |
|
%12 = tensor.empty(%11, %3) : tensor<?x?x16x1xf32> |
|
%pack_0 = tensor.pack %5 padding_value(%cst : f32) outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [16, 1] into %12 : tensor<?x?xf32> -> tensor<?x?x16x1xf32> |
|
%13 = affine.apply #map()[%6] |
|
%14 = affine.apply #map()[%7] |
|
%15 = tensor.empty(%13, %14) : tensor<?x?x16x16xf32> |
|
%pack_1 = tensor.pack %8 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %15 : tensor<?x?xf32> -> tensor<?x?x16x16xf32> |
|
%16 = linalg.mmt4d ins(%pack, %pack_0 : tensor<?x?x16x1xf32>, tensor<?x?x16x1xf32>) outs(%pack_1 : tensor<?x?x16x16xf32>) -> tensor<?x?x16x16xf32> |
|
%17 = tensor.empty(%6, %7) : tensor<?x?xf32> |
|
%unpack = tensor.unpack %16 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %17 : tensor<?x?x16x16xf32> -> tensor<?x?xf32> |
|
%18 = hal.tensor.export %unpack "output0" : tensor<?x?xf32>{%6, %7} -> !hal.buffer_view |
|
return %18 : !hal.buffer_view |
|
} |
|
} |
|
|
|
|
|
// -----// IR Dump After HoistIntoGlobals (iree-util-hoist-into-globals) //----- // |
|
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "znver4", cpu_features = "+mmx,+popcnt,+sse,+sse2,+sse3,+ssse3,+sse4.1,+sse4.2,+avx,+avx2,+sse4a,+fma,+avx512f,+bmi,+bmi2,+aes,+pclmul,+avx512vl,+avx512bw,+avx512dq,+avx512cd,+avx512vbmi,+avx512ifma,+avx512vpopcntdq,+avx512vbmi2,+gfni,+vpclmulqdq,+avx512vnni,+avx512bitalg,+avx512bf16,+adx,+clflushopt,+clwb,+clzero,+cx16,+cx8,+f16c,+fsgsbase,+crc32,+invpcid,+rdpru,+sahf,+lzcnt,+movbe,+mwaitx,+x87,+pku,+prfchw,+rdpid,+rdrnd,+rdseed,+sha,+shstk,+vaes,+wbnoinvd,+xsave,+xsavec,+xsaveopt,+xsaves,+fxsr,+evex512", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "all"}> |
|
#map = affine_map<()[s0] -> (s0 ceildiv 16)> |
|
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#executable_target_embedded_elf_x86_64_]}> |
|
module attributes {hal.device.targets = [#device_target_llvm_cpu], iree.fixedpoint.iteration = 0 : index} { |
|
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} { |
|
%cst = arith.constant 0.000000e+00 : f32 |
|
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index |
|
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index |
|
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xf32>{%0, %1} |
|
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index |
|
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index |
|
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xf32>{%3, %4} |
|
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index |
|
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index |
|
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xf32>{%6, %7} |
|
%9 = affine.apply #map()[%0] |
|
%10 = tensor.empty(%9, %1) : tensor<?x?x16x1xf32> |
|
%pack = tensor.pack %2 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 1] into %10 : tensor<?x?xf32> -> tensor<?x?x16x1xf32> |
|
%11 = affine.apply #map()[%4] |
|
%12 = tensor.empty(%11, %3) : tensor<?x?x16x1xf32> |
|
%pack_0 = tensor.pack %5 padding_value(%cst : f32) outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [16, 1] into %12 : tensor<?x?xf32> -> tensor<?x?x16x1xf32> |
|
%13 = affine.apply #map()[%6] |
|
%14 = affine.apply #map()[%7] |
|
%15 = tensor.empty(%13, %14) : tensor<?x?x16x16xf32> |
|
%pack_1 = tensor.pack %8 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %15 : tensor<?x?xf32> -> tensor<?x?x16x16xf32> |
|
%16 = linalg.mmt4d ins(%pack, %pack_0 : tensor<?x?x16x1xf32>, tensor<?x?x16x1xf32>) outs(%pack_1 : tensor<?x?x16x16xf32>) -> tensor<?x?x16x16xf32> |
|
%17 = tensor.empty(%6, %7) : tensor<?x?xf32> |
|
%unpack = tensor.unpack %16 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %17 : tensor<?x?x16x16xf32> -> tensor<?x?xf32> |
|
%18 = hal.tensor.export %unpack "output0" : tensor<?x?xf32>{%6, %7} -> !hal.buffer_view |
|
return %18 : !hal.buffer_view |
|
} |
|
} |
|
|
|
|
|
// -----// IR Dump After JitGlobals (iree-consteval-jit-globals) //----- // |
|
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "znver4", cpu_features = "+mmx,+popcnt,+sse,+sse2,+sse3,+ssse3,+sse4.1,+sse4.2,+avx,+avx2,+sse4a,+fma,+avx512f,+bmi,+bmi2,+aes,+pclmul,+avx512vl,+avx512bw,+avx512dq,+avx512cd,+avx512vbmi,+avx512ifma,+avx512vpopcntdq,+avx512vbmi2,+gfni,+vpclmulqdq,+avx512vnni,+avx512bitalg,+avx512bf16,+adx,+clflushopt,+clwb,+clzero,+cx16,+cx8,+f16c,+fsgsbase,+crc32,+invpcid,+rdpru,+sahf,+lzcnt,+movbe,+mwaitx,+x87,+pku,+prfchw,+rdpid,+rdrnd,+rdseed,+sha,+shstk,+vaes,+wbnoinvd,+xsave,+xsavec,+xsaveopt,+xsaves,+fxsr,+evex512", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "all"}> |
|
#map = affine_map<()[s0] -> (s0 ceildiv 16)> |
|
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#executable_target_embedded_elf_x86_64_]}> |
|
module attributes {hal.device.targets = [#device_target_llvm_cpu], iree.fixedpoint.iteration = 0 : index} { |
|
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} { |
|
%cst = arith.constant 0.000000e+00 : f32 |
|
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index |
|
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index |
|
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xf32>{%0, %1} |
|
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index |
|
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index |
|
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xf32>{%3, %4} |
|
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index |
|
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index |
|
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xf32>{%6, %7} |
|
%9 = affine.apply #map()[%0] |
|
%10 = tensor.empty(%9, %1) : tensor<?x?x16x1xf32> |
|
%pack = tensor.pack %2 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 1] into %10 : tensor<?x?xf32> -> tensor<?x?x16x1xf32> |
|
%11 = affine.apply #map()[%4] |
|
%12 = tensor.empty(%11, %3) : tensor<?x?x16x1xf32> |
|
%pack_0 = tensor.pack %5 padding_value(%cst : f32) outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [16, 1] into %12 : tensor<?x?xf32> -> tensor<?x?x16x1xf32> |
|
%13 = affine.apply #map()[%6] |
|
%14 = affine.apply #map()[%7] |
|
%15 = tensor.empty(%13, %14) : tensor<?x?x16x16xf32> |
|
%pack_1 = tensor.pack %8 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %15 : tensor<?x?xf32> -> tensor<?x?x16x16xf32> |
|
%16 = linalg.mmt4d ins(%pack, %pack_0 : tensor<?x?x16x1xf32>, tensor<?x?x16x1xf32>) outs(%pack_1 : tensor<?x?x16x16xf32>) -> tensor<?x?x16x16xf32> |
|
%17 = tensor.empty(%6, %7) : tensor<?x?xf32> |
|
%unpack = tensor.unpack %16 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %17 : tensor<?x?x16x16xf32> -> tensor<?x?xf32> |
|
%18 = hal.tensor.export %unpack "output0" : tensor<?x?xf32>{%6, %7} -> !hal.buffer_view |
|
return %18 : !hal.buffer_view |
|
} |
|
} |
|
|
|
|
|
// -----// IR Dump After Canonicalizer (canonicalize) //----- // |
|
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} { |
|
%cst = arith.constant 0.000000e+00 : f32 |
|
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index |
|
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index |
|
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xf32>{%0, %1} |
|
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index |
|
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index |
|
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xf32>{%3, %4} |
|
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index |
|
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index |
|
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xf32>{%6, %7} |
|
%9 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%0] |
|
%10 = tensor.empty(%9, %1) : tensor<?x?x16x1xf32> |
|
%pack = tensor.pack %2 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 1] into %10 : tensor<?x?xf32> -> tensor<?x?x16x1xf32> |
|
%11 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%4] |
|
%12 = tensor.empty(%11, %3) : tensor<?x?x16x1xf32> |
|
%pack_0 = tensor.pack %5 padding_value(%cst : f32) outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [16, 1] into %12 : tensor<?x?xf32> -> tensor<?x?x16x1xf32> |
|
%13 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%6] |
|
%14 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%7] |
|
%15 = tensor.empty(%13, %14) : tensor<?x?x16x16xf32> |
|
%pack_1 = tensor.pack %8 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %15 : tensor<?x?xf32> -> tensor<?x?x16x16xf32> |
|
%16 = linalg.mmt4d ins(%pack, %pack_0 : tensor<?x?x16x1xf32>, tensor<?x?x16x1xf32>) outs(%pack_1 : tensor<?x?x16x16xf32>) -> tensor<?x?x16x16xf32> |
|
%17 = tensor.empty(%6, %7) : tensor<?x?xf32> |
|
%unpack = tensor.unpack %16 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %17 : tensor<?x?x16x16xf32> -> tensor<?x?xf32> |
|
%18 = hal.tensor.export %unpack "output0" : tensor<?x?xf32>{%6, %7} -> !hal.buffer_view |
|
return %18 : !hal.buffer_view |
|
} |
|
|
|
// -----// IR Dump After CSE (cse) //----- // |
|
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} { |
|
%cst = arith.constant 0.000000e+00 : f32 |
|
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index |
|
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index |
|
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xf32>{%0, %1} |
|
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index |
|
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index |
|
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xf32>{%3, %4} |
|
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index |
|
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index |
|
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xf32>{%6, %7} |
|
%9 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%0] |
|
%10 = tensor.empty(%9, %1) : tensor<?x?x16x1xf32> |
|
%pack = tensor.pack %2 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 1] into %10 : tensor<?x?xf32> -> tensor<?x?x16x1xf32> |
|
%11 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%4] |
|
%12 = tensor.empty(%11, %3) : tensor<?x?x16x1xf32> |
|
%pack_0 = tensor.pack %5 padding_value(%cst : f32) outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [16, 1] into %12 : tensor<?x?xf32> -> tensor<?x?x16x1xf32> |
|
%13 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%6] |
|
%14 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%7] |
|
%15 = tensor.empty(%13, %14) : tensor<?x?x16x16xf32> |
|
%pack_1 = tensor.pack %8 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %15 : tensor<?x?xf32> -> tensor<?x?x16x16xf32> |
|
%16 = linalg.mmt4d ins(%pack, %pack_0 : tensor<?x?x16x1xf32>, tensor<?x?x16x1xf32>) outs(%pack_1 : tensor<?x?x16x16xf32>) -> tensor<?x?x16x16xf32> |
|
%17 = tensor.empty(%6, %7) : tensor<?x?xf32> |
|
%unpack = tensor.unpack %16 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %17 : tensor<?x?x16x16xf32> -> tensor<?x?xf32> |
|
%18 = hal.tensor.export %unpack "output0" : tensor<?x?xf32>{%6, %7} -> !hal.buffer_view |
|
return %18 : !hal.buffer_view |
|
} |
|
|
|
// -----// IR Dump After FixedPointIterator (iree-util-fixed-point-iterator) //----- // |
|
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "znver4", cpu_features = "+mmx,+popcnt,+sse,+sse2,+sse3,+ssse3,+sse4.1,+sse4.2,+avx,+avx2,+sse4a,+fma,+avx512f,+bmi,+bmi2,+aes,+pclmul,+avx512vl,+avx512bw,+avx512dq,+avx512cd,+avx512vbmi,+avx512ifma,+avx512vpopcntdq,+avx512vbmi2,+gfni,+vpclmulqdq,+avx512vnni,+avx512bitalg,+avx512bf16,+adx,+clflushopt,+clwb,+clzero,+cx16,+cx8,+f16c,+fsgsbase,+crc32,+invpcid,+rdpru,+sahf,+lzcnt,+movbe,+mwaitx,+x87,+pku,+prfchw,+rdpid,+rdrnd,+rdseed,+sha,+shstk,+vaes,+wbnoinvd,+xsave,+xsavec,+xsaveopt,+xsaves,+fxsr,+evex512", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "all"}> |
|
#map = affine_map<()[s0] -> (s0 ceildiv 16)> |
|
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#executable_target_embedded_elf_x86_64_]}> |
|
module attributes {hal.device.targets = [#device_target_llvm_cpu]} { |
|
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} { |
|
%cst = arith.constant 0.000000e+00 : f32 |
|
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index |
|
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index |
|
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xf32>{%0, %1} |
|
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index |
|
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index |
|
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xf32>{%3, %4} |
|
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index |
|
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index |
|
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xf32>{%6, %7} |
|
%9 = affine.apply #map()[%0] |
|
%10 = tensor.empty(%9, %1) : tensor<?x?x16x1xf32> |
|
%pack = tensor.pack %2 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 1] into %10 : tensor<?x?xf32> -> tensor<?x?x16x1xf32> |
|
%11 = affine.apply #map()[%4] |
|
%12 = tensor.empty(%11, %3) : tensor<?x?x16x1xf32> |
|
%pack_0 = tensor.pack %5 padding_value(%cst : f32) outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [16, 1] into %12 : tensor<?x?xf32> -> tensor<?x?x16x1xf32> |
|
%13 = affine.apply #map()[%6] |
|
%14 = affine.apply #map()[%7] |
|
%15 = tensor.empty(%13, %14) : tensor<?x?x16x16xf32> |
|
%pack_1 = tensor.pack %8 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %15 : tensor<?x?xf32> -> tensor<?x?x16x16xf32> |
|
%16 = linalg.mmt4d ins(%pack, %pack_0 : tensor<?x?x16x1xf32>, tensor<?x?x16x1xf32>) outs(%pack_1 : tensor<?x?x16x16xf32>) -> tensor<?x?x16x16xf32> |
|
%17 = tensor.empty(%6, %7) : tensor<?x?xf32> |
|
%unpack = tensor.unpack %16 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %17 : tensor<?x?x16x16xf32> -> tensor<?x?xf32> |
|
%18 = hal.tensor.export %unpack "output0" : tensor<?x?xf32>{%6, %7} -> !hal.buffer_view |
|
return %18 : !hal.buffer_view |
|
} |
|
} |
|
|
|
|
|
// -----// IR Dump After RaiseSpecialOps (iree-global-opt-raise-special-ops) //----- // |
|
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} { |
|
%cst = arith.constant 0.000000e+00 : f32 |
|
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index |
|
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index |
|
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xf32>{%0, %1} |
|
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index |
|
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index |
|
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xf32>{%3, %4} |
|
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index |
|
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index |
|
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xf32>{%6, %7} |
|
%9 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%0] |
|
%10 = tensor.empty(%9, %1) : tensor<?x?x16x1xf32> |
|
%pack = tensor.pack %2 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 1] into %10 : tensor<?x?xf32> -> tensor<?x?x16x1xf32> |
|
%11 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%4] |
|
%12 = tensor.empty(%11, %3) : tensor<?x?x16x1xf32> |
|
%pack_0 = tensor.pack %5 padding_value(%cst : f32) outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [16, 1] into %12 : tensor<?x?xf32> -> tensor<?x?x16x1xf32> |
|
%13 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%6] |
|
%14 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%7] |
|
%15 = tensor.empty(%13, %14) : tensor<?x?x16x16xf32> |
|
%pack_1 = tensor.pack %8 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %15 : tensor<?x?xf32> -> tensor<?x?x16x16xf32> |
|
%16 = linalg.mmt4d ins(%pack, %pack_0 : tensor<?x?x16x1xf32>, tensor<?x?x16x1xf32>) outs(%pack_1 : tensor<?x?x16x16xf32>) -> tensor<?x?x16x16xf32> |
|
%17 = tensor.empty(%6, %7) : tensor<?x?xf32> |
|
%unpack = tensor.unpack %16 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %17 : tensor<?x?x16x16xf32> -> tensor<?x?xf32> |
|
%18 = hal.tensor.export %unpack "output0" : tensor<?x?xf32>{%6, %7} -> !hal.buffer_view |
|
return %18 : !hal.buffer_view |
|
} |
|
|
|
// -----// IR Dump After VerifyInputLegality (iree-verify-input-legality) //----- // |
|
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "znver4", cpu_features = "+mmx,+popcnt,+sse,+sse2,+sse3,+ssse3,+sse4.1,+sse4.2,+avx,+avx2,+sse4a,+fma,+avx512f,+bmi,+bmi2,+aes,+pclmul,+avx512vl,+avx512bw,+avx512dq,+avx512cd,+avx512vbmi,+avx512ifma,+avx512vpopcntdq,+avx512vbmi2,+gfni,+vpclmulqdq,+avx512vnni,+avx512bitalg,+avx512bf16,+adx,+clflushopt,+clwb,+clzero,+cx16,+cx8,+f16c,+fsgsbase,+crc32,+invpcid,+rdpru,+sahf,+lzcnt,+movbe,+mwaitx,+x87,+pku,+prfchw,+rdpid,+rdrnd,+rdseed,+sha,+shstk,+vaes,+wbnoinvd,+xsave,+xsavec,+xsaveopt,+xsaves,+fxsr,+evex512", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "all"}> |
|
#map = affine_map<()[s0] -> (s0 ceildiv 16)> |
|
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#executable_target_embedded_elf_x86_64_]}> |
|
module attributes {hal.device.targets = [#device_target_llvm_cpu]} { |
|
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} { |
|
%cst = arith.constant 0.000000e+00 : f32 |
|
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index |
|
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index |
|
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xf32>{%0, %1} |
|
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index |
|
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index |
|
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xf32>{%3, %4} |
|
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index |
|
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index |
|
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xf32>{%6, %7} |
|
%9 = affine.apply #map()[%0] |
|
%10 = tensor.empty(%9, %1) : tensor<?x?x16x1xf32> |
|
%pack = tensor.pack %2 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 1] into %10 : tensor<?x?xf32> -> tensor<?x?x16x1xf32> |
|
%11 = affine.apply #map()[%4] |
|
%12 = tensor.empty(%11, %3) : tensor<?x?x16x1xf32> |
|
%pack_0 = tensor.pack %5 padding_value(%cst : f32) outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [16, 1] into %12 : tensor<?x?xf32> -> tensor<?x?x16x1xf32> |
|
%13 = affine.apply #map()[%6] |
|
%14 = affine.apply #map()[%7] |
|
%15 = tensor.empty(%13, %14) : tensor<?x?x16x16xf32> |
|
%pack_1 = tensor.pack %8 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %15 : tensor<?x?xf32> -> tensor<?x?x16x16xf32> |
|
%16 = linalg.mmt4d ins(%pack, %pack_0 : tensor<?x?x16x1xf32>, tensor<?x?x16x1xf32>) outs(%pack_1 : tensor<?x?x16x16xf32>) -> tensor<?x?x16x16xf32> |
|
%17 = tensor.empty(%6, %7) : tensor<?x?xf32> |
|
%unpack = tensor.unpack %16 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %17 : tensor<?x?x16x16xf32> -> tensor<?x?xf32> |
|
%18 = hal.tensor.export %unpack "output0" : tensor<?x?xf32>{%6, %7} -> !hal.buffer_view |
|
return %18 : !hal.buffer_view |
|
} |
|
} |
|
|
|
|
|
// -----// IR Dump After TensorPadToTensorInsertSlice (iree-flow-tensor-pad-to-tensor-insert-slice) //----- // |
|
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "znver4", cpu_features = "+mmx,+popcnt,+sse,+sse2,+sse3,+ssse3,+sse4.1,+sse4.2,+avx,+avx2,+sse4a,+fma,+avx512f,+bmi,+bmi2,+aes,+pclmul,+avx512vl,+avx512bw,+avx512dq,+avx512cd,+avx512vbmi,+avx512ifma,+avx512vpopcntdq,+avx512vbmi2,+gfni,+vpclmulqdq,+avx512vnni,+avx512bitalg,+avx512bf16,+adx,+clflushopt,+clwb,+clzero,+cx16,+cx8,+f16c,+fsgsbase,+crc32,+invpcid,+rdpru,+sahf,+lzcnt,+movbe,+mwaitx,+x87,+pku,+prfchw,+rdpid,+rdrnd,+rdseed,+sha,+shstk,+vaes,+wbnoinvd,+xsave,+xsavec,+xsaveopt,+xsaves,+fxsr,+evex512", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "all"}> |
|
#map = affine_map<()[s0] -> (s0 ceildiv 16)> |
|
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#executable_target_embedded_elf_x86_64_]}> |
|
module attributes {hal.device.targets = [#device_target_llvm_cpu]} { |
|
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} { |
|
%cst = arith.constant 0.000000e+00 : f32 |
|
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index |
|
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index |
|
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xf32>{%0, %1} |
|
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index |
|
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index |
|
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xf32>{%3, %4} |
|
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index |
|
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index |
|
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xf32>{%6, %7} |
|
%9 = affine.apply #map()[%0] |
|
%10 = tensor.empty(%9, %1) : tensor<?x?x16x1xf32> |
|
%pack = tensor.pack %2 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 1] into %10 : tensor<?x?xf32> -> tensor<?x?x16x1xf32> |
|
%11 = affine.apply #map()[%4] |
|
%12 = tensor.empty(%11, %3) : tensor<?x?x16x1xf32> |
|
%pack_0 = tensor.pack %5 padding_value(%cst : f32) outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [16, 1] into %12 : tensor<?x?xf32> -> tensor<?x?x16x1xf32> |
|
%13 = affine.apply #map()[%6] |
|
%14 = affine.apply #map()[%7] |
|
%15 = tensor.empty(%13, %14) : tensor<?x?x16x16xf32> |
|
%pack_1 = tensor.pack %8 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %15 : tensor<?x?xf32> -> tensor<?x?x16x16xf32> |
|
%16 = linalg.mmt4d ins(%pack, %pack_0 : tensor<?x?x16x1xf32>, tensor<?x?x16x1xf32>) outs(%pack_1 : tensor<?x?x16x16xf32>) -> tensor<?x?x16x16xf32> |
|
%17 = tensor.empty(%6, %7) : tensor<?x?xf32> |
|
%unpack = tensor.unpack %16 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %17 : tensor<?x?x16x16xf32> -> tensor<?x?xf32> |
|
%18 = hal.tensor.export %unpack "output0" : tensor<?x?xf32>{%6, %7} -> !hal.buffer_view |
|
return %18 : !hal.buffer_view |
|
} |
|
} |
|
|
|
|
|
// -----// IR Dump After InterchangeGenericOps (iree-flow-interchange-generic-ops) //----- // |
|
// Snapshot of @matmul_dynamic printed after the InterchangeGenericOps pass. |

// Imports three !hal.buffer_view arguments as tensor<?x?xf32>, packs them into tiles, runs linalg.mmt4d on the packed tensors, then unpacks and exports the result. |

func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} { |

// f32 zero; used as the padding_value of every tensor.pack below. |

%cst = arith.constant 0.000000e+00 : f32 |

// Read both dims of each buffer view and import it as a tensor<?x?xf32> carrying those dynamic sizes. |

%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index |

%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index |

%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xf32>{%0, %1} |

%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index |

%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index |

%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xf32>{%3, %4} |

%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index |

%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index |

%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xf32>{%6, %7} |

// Pack "input0": dim 0 is tiled by 16 (outer size %9 = ceildiv(%0, 16)), dim 1 by 1. |

%9 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%0] |

%10 = tensor.empty(%9, %1) : tensor<?x?x16x1xf32> |

%pack = tensor.pack %2 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 1] into %10 : tensor<?x?xf32> -> tensor<?x?x16x1xf32> |

// Pack "input1" with its dims swapped (outer_dims_perm = [1, 0]): dim 1 is tiled by 16 (outer size %11 = ceildiv(%4, 16)), dim 0 by 1. |

%11 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%4] |

%12 = tensor.empty(%11, %3) : tensor<?x?x16x1xf32> |

%pack_0 = tensor.pack %5 padding_value(%cst : f32) outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [16, 1] into %12 : tensor<?x?xf32> -> tensor<?x?x16x1xf32> |

// Pack "input2" into 16x16 tiles; both outer sizes are ceildiv(dim, 16). |

%13 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%6] |

%14 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%7] |

%15 = tensor.empty(%13, %14) : tensor<?x?x16x16xf32> |

%pack_1 = tensor.pack %8 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %15 : tensor<?x?xf32> -> tensor<?x?x16x16xf32> |

// Multiply the two packed inputs; the packed "input2" is passed as the outs operand. |

%16 = linalg.mmt4d ins(%pack, %pack_0 : tensor<?x?x16x1xf32>, tensor<?x?x16x1xf32>) outs(%pack_1 : tensor<?x?x16x16xf32>) -> tensor<?x?x16x16xf32> |

// Unpack the 16x16-tiled result to a %6 x %7 tensor and export it as "output0". |

%17 = tensor.empty(%6, %7) : tensor<?x?xf32> |

%unpack = tensor.unpack %16 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %17 : tensor<?x?x16x16xf32> -> tensor<?x?xf32> |

%18 = hal.tensor.export %unpack "output0" : tensor<?x?xf32>{%6, %7} -> !hal.buffer_view |

return %18 : !hal.buffer_view |

} |
|
|
|
// -----// IR Dump After ResolveShapedTypeResultDims (resolve-shaped-type-result-dims) //----- // |
|
// Snapshot of @matmul_dynamic printed after the ResolveShapedTypeResultDims pass. |

// Imports three !hal.buffer_view arguments as tensor<?x?xf32>, packs them into tiles, runs linalg.mmt4d on the packed tensors, then unpacks and exports the result. |

func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} { |

// f32 zero; used as the padding_value of every tensor.pack below. |

%cst = arith.constant 0.000000e+00 : f32 |

// Read both dims of each buffer view and import it as a tensor<?x?xf32> carrying those dynamic sizes. |

%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index |

%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index |

%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xf32>{%0, %1} |

%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index |

%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index |

%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xf32>{%3, %4} |

%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index |

%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index |

%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xf32>{%6, %7} |

// Pack "input0": dim 0 is tiled by 16 (outer size %9 = ceildiv(%0, 16)), dim 1 by 1. |

%9 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%0] |

%10 = tensor.empty(%9, %1) : tensor<?x?x16x1xf32> |

%pack = tensor.pack %2 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 1] into %10 : tensor<?x?xf32> -> tensor<?x?x16x1xf32> |

// Pack "input1" with its dims swapped (outer_dims_perm = [1, 0]): dim 1 is tiled by 16 (outer size %11 = ceildiv(%4, 16)), dim 0 by 1. |

%11 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%4] |

%12 = tensor.empty(%11, %3) : tensor<?x?x16x1xf32> |

%pack_0 = tensor.pack %5 padding_value(%cst : f32) outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [16, 1] into %12 : tensor<?x?xf32> -> tensor<?x?x16x1xf32> |

// Pack "input2" into 16x16 tiles; both outer sizes are ceildiv(dim, 16). |

%13 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%6] |

%14 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%7] |

%15 = tensor.empty(%13, %14) : tensor<?x?x16x16xf32> |

%pack_1 = tensor.pack %8 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %15 : tensor<?x?xf32> -> tensor<?x?x16x16xf32> |

// Multiply the two packed inputs; the packed "input2" is passed as the outs operand. |

%16 = linalg.mmt4d ins(%pack, %pack_0 : tensor<?x?x16x1xf32>, tensor<?x?x16x1xf32>) outs(%pack_1 : tensor<?x?x16x16xf32>) -> tensor<?x?x16x16xf32> |

// Unpack the 16x16-tiled result to a %6 x %7 tensor and export it as "output0". |

%17 = tensor.empty(%6, %7) : tensor<?x?xf32> |

%unpack = tensor.unpack %16 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %17 : tensor<?x?x16x16xf32> -> tensor<?x?xf32> |

%18 = hal.tensor.export %unpack "output0" : tensor<?x?xf32>{%6, %7} -> !hal.buffer_view |

return %18 : !hal.buffer_view |

} |
|
|
|
// -----// IR Dump After Canonicalizer (canonicalize) //----- // |
|
// Snapshot of @matmul_dynamic printed after the Canonicalizer pass. |

// Imports three !hal.buffer_view arguments as tensor<?x?xf32>, packs them into tiles, runs linalg.mmt4d on the packed tensors, then unpacks and exports the result. |

func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} { |

// f32 zero; used as the padding_value of every tensor.pack below. |

%cst = arith.constant 0.000000e+00 : f32 |

// Read both dims of each buffer view and import it as a tensor<?x?xf32> carrying those dynamic sizes. |

%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index |

%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index |

%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xf32>{%0, %1} |

%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index |

%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index |

%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xf32>{%3, %4} |

%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index |

%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index |

%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xf32>{%6, %7} |

// Pack "input0": dim 0 is tiled by 16 (outer size %9 = ceildiv(%0, 16)), dim 1 by 1. |

%9 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%0] |

%10 = tensor.empty(%9, %1) : tensor<?x?x16x1xf32> |

%pack = tensor.pack %2 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 1] into %10 : tensor<?x?xf32> -> tensor<?x?x16x1xf32> |

// Pack "input1" with its dims swapped (outer_dims_perm = [1, 0]): dim 1 is tiled by 16 (outer size %11 = ceildiv(%4, 16)), dim 0 by 1. |

%11 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%4] |

%12 = tensor.empty(%11, %3) : tensor<?x?x16x1xf32> |

%pack_0 = tensor.pack %5 padding_value(%cst : f32) outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [16, 1] into %12 : tensor<?x?xf32> -> tensor<?x?x16x1xf32> |

// Pack "input2" into 16x16 tiles; both outer sizes are ceildiv(dim, 16). |

%13 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%6] |

%14 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%7] |

%15 = tensor.empty(%13, %14) : tensor<?x?x16x16xf32> |

%pack_1 = tensor.pack %8 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %15 : tensor<?x?xf32> -> tensor<?x?x16x16xf32> |

// Multiply the two packed inputs; the packed "input2" is passed as the outs operand. |

%16 = linalg.mmt4d ins(%pack, %pack_0 : tensor<?x?x16x1xf32>, tensor<?x?x16x1xf32>) outs(%pack_1 : tensor<?x?x16x16xf32>) -> tensor<?x?x16x16xf32> |

// Unpack the 16x16-tiled result to a %6 x %7 tensor and export it as "output0". |

%17 = tensor.empty(%6, %7) : tensor<?x?xf32> |

%unpack = tensor.unpack %16 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %17 : tensor<?x?x16x16xf32> -> tensor<?x?xf32> |

%18 = hal.tensor.export %unpack "output0" : tensor<?x?xf32>{%6, %7} -> !hal.buffer_view |

return %18 : !hal.buffer_view |

} |
|
|
|
// -----// IR Dump After CSE (cse) //----- // |
|
// Snapshot of @matmul_dynamic printed after the CSE pass. |

// Imports three !hal.buffer_view arguments as tensor<?x?xf32>, packs them into tiles, runs linalg.mmt4d on the packed tensors, then unpacks and exports the result. |

func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} { |

// f32 zero; used as the padding_value of every tensor.pack below. |

%cst = arith.constant 0.000000e+00 : f32 |

// Read both dims of each buffer view and import it as a tensor<?x?xf32> carrying those dynamic sizes. |

%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index |

%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index |

%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xf32>{%0, %1} |

%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index |

%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index |

%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xf32>{%3, %4} |

%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index |

%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index |

%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xf32>{%6, %7} |

// Pack "input0": dim 0 is tiled by 16 (outer size %9 = ceildiv(%0, 16)), dim 1 by 1. |

%9 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%0] |

%10 = tensor.empty(%9, %1) : tensor<?x?x16x1xf32> |

%pack = tensor.pack %2 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 1] into %10 : tensor<?x?xf32> -> tensor<?x?x16x1xf32> |

// Pack "input1" with its dims swapped (outer_dims_perm = [1, 0]): dim 1 is tiled by 16 (outer size %11 = ceildiv(%4, 16)), dim 0 by 1. |

%11 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%4] |

%12 = tensor.empty(%11, %3) : tensor<?x?x16x1xf32> |

%pack_0 = tensor.pack %5 padding_value(%cst : f32) outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [16, 1] into %12 : tensor<?x?xf32> -> tensor<?x?x16x1xf32> |

// Pack "input2" into 16x16 tiles; both outer sizes are ceildiv(dim, 16). |

%13 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%6] |

%14 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%7] |

%15 = tensor.empty(%13, %14) : tensor<?x?x16x16xf32> |

%pack_1 = tensor.pack %8 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %15 : tensor<?x?xf32> -> tensor<?x?x16x16xf32> |

// Multiply the two packed inputs; the packed "input2" is passed as the outs operand. |

%16 = linalg.mmt4d ins(%pack, %pack_0 : tensor<?x?x16x1xf32>, tensor<?x?x16x1xf32>) outs(%pack_1 : tensor<?x?x16x16xf32>) -> tensor<?x?x16x16xf32> |

// Unpack the 16x16-tiled result to a %6 x %7 tensor and export it as "output0". |

%17 = tensor.empty(%6, %7) : tensor<?x?xf32> |

%unpack = tensor.unpack %16 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %17 : tensor<?x?x16x16xf32> -> tensor<?x?xf32> |

%18 = hal.tensor.export %unpack "output0" : tensor<?x?xf32>{%6, %7} -> !hal.buffer_view |

return %18 : !hal.buffer_view |

} |
|
|
|
// -----// IR Dump After FusionOfTensorOps (iree-flow-fusion-of-tensor-ops) //----- // |
|
// Snapshot of @matmul_dynamic printed after the FusionOfTensorOps pass. |

// Imports three !hal.buffer_view arguments as tensor<?x?xf32>, packs them into tiles, runs linalg.mmt4d on the packed tensors, then unpacks and exports the result. |

func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} { |

// f32 zero; used as the padding_value of every tensor.pack below. |

%cst = arith.constant 0.000000e+00 : f32 |

// Read both dims of each buffer view and import it as a tensor<?x?xf32> carrying those dynamic sizes. |

%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index |

%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index |

%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xf32>{%0, %1} |

%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index |

%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index |

%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xf32>{%3, %4} |

%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index |

%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index |

%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xf32>{%6, %7} |

// Pack "input0": dim 0 is tiled by 16 (outer size %9 = ceildiv(%0, 16)), dim 1 by 1. |

%9 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%0] |

%10 = tensor.empty(%9, %1) : tensor<?x?x16x1xf32> |

%pack = tensor.pack %2 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 1] into %10 : tensor<?x?xf32> -> tensor<?x?x16x1xf32> |

// Pack "input1" with its dims swapped (outer_dims_perm = [1, 0]): dim 1 is tiled by 16 (outer size %11 = ceildiv(%4, 16)), dim 0 by 1. |

%11 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%4] |

%12 = tensor.empty(%11, %3) : tensor<?x?x16x1xf32> |

%pack_0 = tensor.pack %5 padding_value(%cst : f32) outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [16, 1] into %12 : tensor<?x?xf32> -> tensor<?x?x16x1xf32> |

// Pack "input2" into 16x16 tiles; both outer sizes are ceildiv(dim, 16). |

%13 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%6] |

%14 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%7] |

%15 = tensor.empty(%13, %14) : tensor<?x?x16x16xf32> |

%pack_1 = tensor.pack %8 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %15 : tensor<?x?xf32> -> tensor<?x?x16x16xf32> |

// Multiply the two packed inputs; the packed "input2" is passed as the outs operand. |

%16 = linalg.mmt4d ins(%pack, %pack_0 : tensor<?x?x16x1xf32>, tensor<?x?x16x1xf32>) outs(%pack_1 : tensor<?x?x16x16xf32>) -> tensor<?x?x16x16xf32> |

// Unpack the 16x16-tiled result to a %6 x %7 tensor and export it as "output0". |

%17 = tensor.empty(%6, %7) : tensor<?x?xf32> |

%unpack = tensor.unpack %16 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %17 : tensor<?x?x16x16xf32> -> tensor<?x?xf32> |

%18 = hal.tensor.export %unpack "output0" : tensor<?x?xf32>{%6, %7} -> !hal.buffer_view |

return %18 : !hal.buffer_view |

} |
|
|
|
// -----// IR Dump After Canonicalizer (canonicalize) //----- // |
|
// Snapshot of @matmul_dynamic printed after the Canonicalizer pass. |

// Imports three !hal.buffer_view arguments as tensor<?x?xf32>, packs them into tiles, runs linalg.mmt4d on the packed tensors, then unpacks and exports the result. |

func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} { |

// f32 zero; used as the padding_value of every tensor.pack below. |

%cst = arith.constant 0.000000e+00 : f32 |

// Read both dims of each buffer view and import it as a tensor<?x?xf32> carrying those dynamic sizes. |

%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index |

%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index |

%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xf32>{%0, %1} |

%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index |

%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index |

%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xf32>{%3, %4} |

%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index |

%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index |

%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xf32>{%6, %7} |

// Pack "input0": dim 0 is tiled by 16 (outer size %9 = ceildiv(%0, 16)), dim 1 by 1. |

%9 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%0] |

%10 = tensor.empty(%9, %1) : tensor<?x?x16x1xf32> |

%pack = tensor.pack %2 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 1] into %10 : tensor<?x?xf32> -> tensor<?x?x16x1xf32> |

// Pack "input1" with its dims swapped (outer_dims_perm = [1, 0]): dim 1 is tiled by 16 (outer size %11 = ceildiv(%4, 16)), dim 0 by 1. |

%11 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%4] |

%12 = tensor.empty(%11, %3) : tensor<?x?x16x1xf32> |

%pack_0 = tensor.pack %5 padding_value(%cst : f32) outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [16, 1] into %12 : tensor<?x?xf32> -> tensor<?x?x16x1xf32> |

// Pack "input2" into 16x16 tiles; both outer sizes are ceildiv(dim, 16). |

%13 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%6] |

%14 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%7] |

%15 = tensor.empty(%13, %14) : tensor<?x?x16x16xf32> |

%pack_1 = tensor.pack %8 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %15 : tensor<?x?xf32> -> tensor<?x?x16x16xf32> |

// Multiply the two packed inputs; the packed "input2" is passed as the outs operand. |

%16 = linalg.mmt4d ins(%pack, %pack_0 : tensor<?x?x16x1xf32>, tensor<?x?x16x1xf32>) outs(%pack_1 : tensor<?x?x16x16xf32>) -> tensor<?x?x16x16xf32> |

// Unpack the 16x16-tiled result to a %6 x %7 tensor and export it as "output0". |

%17 = tensor.empty(%6, %7) : tensor<?x?xf32> |

%unpack = tensor.unpack %16 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %17 : tensor<?x?x16x16xf32> -> tensor<?x?xf32> |

%18 = hal.tensor.export %unpack "output0" : tensor<?x?xf32>{%6, %7} -> !hal.buffer_view |

return %18 : !hal.buffer_view |

} |
|
|
|
// -----// IR Dump After CSE (cse) //----- // |
|
// Snapshot of @matmul_dynamic printed after the CSE pass. |

// Imports three !hal.buffer_view arguments as tensor<?x?xf32>, packs them into tiles, runs linalg.mmt4d on the packed tensors, then unpacks and exports the result. |

func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} { |

// f32 zero; used as the padding_value of every tensor.pack below. |

%cst = arith.constant 0.000000e+00 : f32 |

// Read both dims of each buffer view and import it as a tensor<?x?xf32> carrying those dynamic sizes. |

%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index |

%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index |

%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xf32>{%0, %1} |

%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index |

%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index |

%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xf32>{%3, %4} |

%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index |

%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index |

%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xf32>{%6, %7} |

// Pack "input0": dim 0 is tiled by 16 (outer size %9 = ceildiv(%0, 16)), dim 1 by 1. |

%9 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%0] |

%10 = tensor.empty(%9, %1) : tensor<?x?x16x1xf32> |

%pack = tensor.pack %2 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 1] into %10 : tensor<?x?xf32> -> tensor<?x?x16x1xf32> |

// Pack "input1" with its dims swapped (outer_dims_perm = [1, 0]): dim 1 is tiled by 16 (outer size %11 = ceildiv(%4, 16)), dim 0 by 1. |

%11 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%4] |

%12 = tensor.empty(%11, %3) : tensor<?x?x16x1xf32> |

%pack_0 = tensor.pack %5 padding_value(%cst : f32) outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [16, 1] into %12 : tensor<?x?xf32> -> tensor<?x?x16x1xf32> |

// Pack "input2" into 16x16 tiles; both outer sizes are ceildiv(dim, 16). |

%13 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%6] |

%14 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%7] |

%15 = tensor.empty(%13, %14) : tensor<?x?x16x16xf32> |

%pack_1 = tensor.pack %8 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %15 : tensor<?x?xf32> -> tensor<?x?x16x16xf32> |

// Multiply the two packed inputs; the packed "input2" is passed as the outs operand. |

%16 = linalg.mmt4d ins(%pack, %pack_0 : tensor<?x?x16x1xf32>, tensor<?x?x16x1xf32>) outs(%pack_1 : tensor<?x?x16x16xf32>) -> tensor<?x?x16x16xf32> |

// Unpack the 16x16-tiled result to a %6 x %7 tensor and export it as "output0". |

%17 = tensor.empty(%6, %7) : tensor<?x?xf32> |

%unpack = tensor.unpack %16 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %17 : tensor<?x?x16x16xf32> -> tensor<?x?xf32> |

%18 = hal.tensor.export %unpack "output0" : tensor<?x?xf32>{%6, %7} -> !hal.buffer_view |

return %18 : !hal.buffer_view |

} |
|
|
|
// -----// IR Dump After SplitReduction (iree-flow-split-reduction-ops) //----- // |
|
// Snapshot of @matmul_dynamic printed after the SplitReduction pass. |

// Imports three !hal.buffer_view arguments as tensor<?x?xf32>, packs them into tiles, runs linalg.mmt4d on the packed tensors, then unpacks and exports the result. |

func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} { |

// f32 zero; used as the padding_value of every tensor.pack below. |

%cst = arith.constant 0.000000e+00 : f32 |

// Read both dims of each buffer view and import it as a tensor<?x?xf32> carrying those dynamic sizes. |

%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index |

%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index |

%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xf32>{%0, %1} |

%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index |

%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index |

%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xf32>{%3, %4} |

%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index |

%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index |

%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xf32>{%6, %7} |

// Pack "input0": dim 0 is tiled by 16 (outer size %9 = ceildiv(%0, 16)), dim 1 by 1. |

%9 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%0] |

%10 = tensor.empty(%9, %1) : tensor<?x?x16x1xf32> |

%pack = tensor.pack %2 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 1] into %10 : tensor<?x?xf32> -> tensor<?x?x16x1xf32> |

// Pack "input1" with its dims swapped (outer_dims_perm = [1, 0]): dim 1 is tiled by 16 (outer size %11 = ceildiv(%4, 16)), dim 0 by 1. |

%11 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%4] |

%12 = tensor.empty(%11, %3) : tensor<?x?x16x1xf32> |

%pack_0 = tensor.pack %5 padding_value(%cst : f32) outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [16, 1] into %12 : tensor<?x?xf32> -> tensor<?x?x16x1xf32> |

// Pack "input2" into 16x16 tiles; both outer sizes are ceildiv(dim, 16). |

%13 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%6] |

%14 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%7] |

%15 = tensor.empty(%13, %14) : tensor<?x?x16x16xf32> |

%pack_1 = tensor.pack %8 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %15 : tensor<?x?xf32> -> tensor<?x?x16x16xf32> |

// Multiply the two packed inputs; the packed "input2" is passed as the outs operand. |

%16 = linalg.mmt4d ins(%pack, %pack_0 : tensor<?x?x16x1xf32>, tensor<?x?x16x1xf32>) outs(%pack_1 : tensor<?x?x16x16xf32>) -> tensor<?x?x16x16xf32> |

// Unpack the 16x16-tiled result to a %6 x %7 tensor and export it as "output0". |

%17 = tensor.empty(%6, %7) : tensor<?x?xf32> |

%unpack = tensor.unpack %16 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %17 : tensor<?x?x16x16xf32> -> tensor<?x?xf32> |

%18 = hal.tensor.export %unpack "output0" : tensor<?x?xf32>{%6, %7} -> !hal.buffer_view |

return %18 : !hal.buffer_view |

} |
|
|
|
// -----// IR Dump After InterchangeGenericOps (iree-flow-interchange-generic-ops) //----- // |
|
// Snapshot of @matmul_dynamic printed after the InterchangeGenericOps pass. |

// Imports three !hal.buffer_view arguments as tensor<?x?xf32>, packs them into tiles, runs linalg.mmt4d on the packed tensors, then unpacks and exports the result. |

func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} { |

// f32 zero; used as the padding_value of every tensor.pack below. |

%cst = arith.constant 0.000000e+00 : f32 |

// Read both dims of each buffer view and import it as a tensor<?x?xf32> carrying those dynamic sizes. |

%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index |

%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index |

%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xf32>{%0, %1} |

%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index |

%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index |

%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xf32>{%3, %4} |

%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index |

%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index |

%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xf32>{%6, %7} |

// Pack "input0": dim 0 is tiled by 16 (outer size %9 = ceildiv(%0, 16)), dim 1 by 1. |

%9 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%0] |

%10 = tensor.empty(%9, %1) : tensor<?x?x16x1xf32> |

%pack = tensor.pack %2 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 1] into %10 : tensor<?x?xf32> -> tensor<?x?x16x1xf32> |

// Pack "input1" with its dims swapped (outer_dims_perm = [1, 0]): dim 1 is tiled by 16 (outer size %11 = ceildiv(%4, 16)), dim 0 by 1. |

%11 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%4] |

%12 = tensor.empty(%11, %3) : tensor<?x?x16x1xf32> |

%pack_0 = tensor.pack %5 padding_value(%cst : f32) outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [16, 1] into %12 : tensor<?x?xf32> -> tensor<?x?x16x1xf32> |

// Pack "input2" into 16x16 tiles; both outer sizes are ceildiv(dim, 16). |

%13 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%6] |

%14 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%7] |

%15 = tensor.empty(%13, %14) : tensor<?x?x16x16xf32> |

%pack_1 = tensor.pack %8 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %15 : tensor<?x?xf32> -> tensor<?x?x16x16xf32> |

// Multiply the two packed inputs; the packed "input2" is passed as the outs operand. |

%16 = linalg.mmt4d ins(%pack, %pack_0 : tensor<?x?x16x1xf32>, tensor<?x?x16x1xf32>) outs(%pack_1 : tensor<?x?x16x16xf32>) -> tensor<?x?x16x16xf32> |

// Unpack the 16x16-tiled result to a %6 x %7 tensor and export it as "output0". |

%17 = tensor.empty(%6, %7) : tensor<?x?xf32> |

%unpack = tensor.unpack %16 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %17 : tensor<?x?x16x16xf32> -> tensor<?x?xf32> |

%18 = hal.tensor.export %unpack "output0" : tensor<?x?xf32>{%6, %7} -> !hal.buffer_view |

return %18 : !hal.buffer_view |

} |
|
|
|
// -----// IR Dump After FormScalarDispatches (iree-flow-form-scalar-dispatches) //----- // |
|
// Snapshot of @matmul_dynamic printed after the FormScalarDispatches pass. |

// Imports three !hal.buffer_view arguments as tensor<?x?xf32>, packs them into tiles, runs linalg.mmt4d on the packed tensors, then unpacks and exports the result. |

func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} { |

// f32 zero; used as the padding_value of every tensor.pack below. |

%cst = arith.constant 0.000000e+00 : f32 |

// Read both dims of each buffer view and import it as a tensor<?x?xf32> carrying those dynamic sizes. |

%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index |

%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index |

%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xf32>{%0, %1} |

%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index |

%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index |

%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xf32>{%3, %4} |

%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index |

%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index |

%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xf32>{%6, %7} |

// Pack "input0": dim 0 is tiled by 16 (outer size %9 = ceildiv(%0, 16)), dim 1 by 1. |

%9 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%0] |

%10 = tensor.empty(%9, %1) : tensor<?x?x16x1xf32> |

%pack = tensor.pack %2 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 1] into %10 : tensor<?x?xf32> -> tensor<?x?x16x1xf32> |

// Pack "input1" with its dims swapped (outer_dims_perm = [1, 0]): dim 1 is tiled by 16 (outer size %11 = ceildiv(%4, 16)), dim 0 by 1. |

%11 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%4] |

%12 = tensor.empty(%11, %3) : tensor<?x?x16x1xf32> |

%pack_0 = tensor.pack %5 padding_value(%cst : f32) outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [16, 1] into %12 : tensor<?x?xf32> -> tensor<?x?x16x1xf32> |

// Pack "input2" into 16x16 tiles; both outer sizes are ceildiv(dim, 16). |

%13 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%6] |

%14 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%7] |

%15 = tensor.empty(%13, %14) : tensor<?x?x16x16xf32> |

%pack_1 = tensor.pack %8 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %15 : tensor<?x?xf32> -> tensor<?x?x16x16xf32> |

// Multiply the two packed inputs; the packed "input2" is passed as the outs operand. |

%16 = linalg.mmt4d ins(%pack, %pack_0 : tensor<?x?x16x1xf32>, tensor<?x?x16x1xf32>) outs(%pack_1 : tensor<?x?x16x16xf32>) -> tensor<?x?x16x16xf32> |

// Unpack the 16x16-tiled result to a %6 x %7 tensor and export it as "output0". |

%17 = tensor.empty(%6, %7) : tensor<?x?xf32> |

%unpack = tensor.unpack %16 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %17 : tensor<?x?x16x16xf32> -> tensor<?x?xf32> |

%18 = hal.tensor.export %unpack "output0" : tensor<?x?xf32>{%6, %7} -> !hal.buffer_view |

return %18 : !hal.buffer_view |

} |
|
|
|
// -----// IR Dump After FormDispatchRegions (iree-flow-form-dispatch-regions) //----- // |
|
// Snapshot of @matmul_dynamic printed after the FormDispatchRegions pass. |

// Each of the three tensor.pack ops, the linalg.mmt4d and the tensor.unpack is wrapped in its own flow.dispatch.region (five regions in total). |

// NOTE(review): the index constants %c0, %c1 and their suffixed copies (%c0_0 ... %c1_7) are not referenced by any op in this function. |

func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} { |

// f32 zero; used as the padding_value of every tensor.pack below. |

%cst = arith.constant 0.000000e+00 : f32 |

// Read both dims of each buffer view and import it as a tensor<?x?xf32> carrying those dynamic sizes. |

%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index |

%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index |

%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xf32>{%0, %1} |

%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index |

%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index |

%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xf32>{%3, %4} |

%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index |

%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index |

%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xf32>{%6, %7} |

// Region 1: pack "input0" (dim 0 tiled by 16, dim 1 by 1); the region uses %2, %cst and %10 defined outside it. |

%9 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%0] |

%10 = tensor.empty(%9, %1) : tensor<?x?x16x1xf32> |

%c0 = arith.constant 0 : index |

%c1 = arith.constant 1 : index |

%11 = flow.dispatch.region -> (tensor<?x?x16x1xf32>{%9, %1}) { |

%pack = tensor.pack %2 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 1] into %10 : tensor<?x?xf32> -> tensor<?x?x16x1xf32> |

flow.return %pack : tensor<?x?x16x1xf32> |

} |

// Region 2: pack "input1" with its dims swapped (outer_dims_perm = [1, 0]); dim 1 is tiled by 16, dim 0 by 1. |

%12 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%4] |

%13 = tensor.empty(%12, %3) : tensor<?x?x16x1xf32> |

%c0_0 = arith.constant 0 : index |

%c1_1 = arith.constant 1 : index |

%14 = flow.dispatch.region -> (tensor<?x?x16x1xf32>{%12, %3}) { |

%pack = tensor.pack %5 padding_value(%cst : f32) outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [16, 1] into %13 : tensor<?x?xf32> -> tensor<?x?x16x1xf32> |

flow.return %pack : tensor<?x?x16x1xf32> |

} |

// Region 3: pack "input2" into 16x16 tiles; both outer sizes are ceildiv(dim, 16). |

%15 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%6] |

%16 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%7] |

%17 = tensor.empty(%15, %16) : tensor<?x?x16x16xf32> |

%c0_2 = arith.constant 0 : index |

%c1_3 = arith.constant 1 : index |

%c0_4 = arith.constant 0 : index |

%c1_5 = arith.constant 1 : index |

%18 = flow.dispatch.region -> (tensor<?x?x16x16xf32>{%15, %16}) { |

%pack = tensor.pack %8 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %17 : tensor<?x?xf32> -> tensor<?x?x16x16xf32> |

flow.return %pack : tensor<?x?x16x16xf32> |

} |

// Region 4: linalg.mmt4d on the results of regions 1 and 2, with the result of region 3 as the outs operand. |

%19 = flow.dispatch.region -> (tensor<?x?x16x16xf32>{%15, %16}) { |

%23 = linalg.mmt4d ins(%11, %14 : tensor<?x?x16x1xf32>, tensor<?x?x16x1xf32>) outs(%18 : tensor<?x?x16x16xf32>) -> tensor<?x?x16x16xf32> |

flow.return %23 : tensor<?x?x16x16xf32> |

} |

// Region 5: unpack the 16x16-tiled result of region 4 into a %6 x %7 tensor. |

%20 = tensor.empty(%6, %7) : tensor<?x?xf32> |

%c0_6 = arith.constant 0 : index |

%c1_7 = arith.constant 1 : index |

%21 = flow.dispatch.region -> (tensor<?x?xf32>{%6, %7}) { |

%unpack = tensor.unpack %19 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %20 : tensor<?x?x16x16xf32> -> tensor<?x?xf32> |

flow.return %unpack : tensor<?x?xf32> |

} |

// Export the unpacked tensor as "output0" and return it. |

%22 = hal.tensor.export %21 "output0" : tensor<?x?xf32>{%6, %7} -> !hal.buffer_view |

return %22 : !hal.buffer_view |

} |
|
|
|
// -----// IR Dump After CloneProducersIntoDispatchRegions (iree-flow-clone-producers-into-dispatch-regions) //----- // |
|
// Snapshot of @matmul_dynamic printed after the CloneProducersIntoDispatchRegions pass. |

// The pack and unpack regions now define the constant / affine.apply / tensor.empty values they consume inside the region body. |

// NOTE(review): the outer %cst, %10, %13, %17 and %20 and all %c0*/%c1* index constants are still present but are not referenced by any op in this function. |

func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} { |

%cst = arith.constant 0.000000e+00 : f32 |

// Read both dims of each buffer view and import it as a tensor<?x?xf32> carrying those dynamic sizes. |

%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index |

%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index |

%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xf32>{%0, %1} |

%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index |

%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index |

%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xf32>{%3, %4} |

%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index |

%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index |

%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xf32>{%6, %7} |

// Region 1: pack "input0" (dim 0 tiled by 16, dim 1 by 1). %9 is still used for the region's result dims. |

%9 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%0] |

%10 = tensor.empty(%9, %1) : tensor<?x?x16x1xf32> |

%c0 = arith.constant 0 : index |

%c1 = arith.constant 1 : index |

%11 = flow.dispatch.region -> (tensor<?x?x16x1xf32>{%9, %1}) { |

// Region-local padding value, tile count and destination tensor. |

%cst_8 = arith.constant 0.000000e+00 : f32 |

%23 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%0] |

%24 = tensor.empty(%23, %1) : tensor<?x?x16x1xf32> |

%pack = tensor.pack %2 padding_value(%cst_8 : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 1] into %24 : tensor<?x?xf32> -> tensor<?x?x16x1xf32> |

flow.return %pack : tensor<?x?x16x1xf32> |

} |

// Region 2: pack "input1" with its dims swapped (outer_dims_perm = [1, 0]); dim 1 is tiled by 16, dim 0 by 1. %12 is still used for the result dims. |

%12 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%4] |

%13 = tensor.empty(%12, %3) : tensor<?x?x16x1xf32> |

%c0_0 = arith.constant 0 : index |

%c1_1 = arith.constant 1 : index |

%14 = flow.dispatch.region -> (tensor<?x?x16x1xf32>{%12, %3}) { |

// Region-local padding value, tile count and destination tensor. |

%cst_8 = arith.constant 0.000000e+00 : f32 |

%23 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%4] |

%24 = tensor.empty(%23, %3) : tensor<?x?x16x1xf32> |

%pack = tensor.pack %5 padding_value(%cst_8 : f32) outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [16, 1] into %24 : tensor<?x?xf32> -> tensor<?x?x16x1xf32> |

flow.return %pack : tensor<?x?x16x1xf32> |

} |

// Region 3: pack "input2" into 16x16 tiles. %15 and %16 are still used for the result dims of regions 3 and 4. |

%15 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%6] |

%16 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%7] |

%17 = tensor.empty(%15, %16) : tensor<?x?x16x16xf32> |

%c0_2 = arith.constant 0 : index |

%c1_3 = arith.constant 1 : index |

%c0_4 = arith.constant 0 : index |

%c1_5 = arith.constant 1 : index |

%18 = flow.dispatch.region -> (tensor<?x?x16x16xf32>{%15, %16}) { |

%cst_8 = arith.constant 0.000000e+00 : f32 |

// Defined in the opposite order from the outer copies: %23 is the dim-1 tile count (from %7), %24 the dim-0 tile count (from %6). |

%23 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%7] |

%24 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%6] |

%25 = tensor.empty(%24, %23) : tensor<?x?x16x16xf32> |

%pack = tensor.pack %8 padding_value(%cst_8 : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %25 : tensor<?x?xf32> -> tensor<?x?x16x16xf32> |

flow.return %pack : tensor<?x?x16x16xf32> |

} |

// Region 4: linalg.mmt4d on the results of regions 1 and 2, with the result of region 3 as the outs operand. |

%19 = flow.dispatch.region -> (tensor<?x?x16x16xf32>{%15, %16}) { |

%23 = linalg.mmt4d ins(%11, %14 : tensor<?x?x16x1xf32>, tensor<?x?x16x1xf32>) outs(%18 : tensor<?x?x16x16xf32>) -> tensor<?x?x16x16xf32> |

flow.return %23 : tensor<?x?x16x16xf32> |

} |

// Region 5: unpack the 16x16-tiled result of region 4 into a region-local %6 x %7 tensor. |

%20 = tensor.empty(%6, %7) : tensor<?x?xf32> |

%c0_6 = arith.constant 0 : index |

%c1_7 = arith.constant 1 : index |

%21 = flow.dispatch.region -> (tensor<?x?xf32>{%6, %7}) { |

%23 = tensor.empty(%6, %7) : tensor<?x?xf32> |

%unpack = tensor.unpack %19 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %23 : tensor<?x?x16x16xf32> -> tensor<?x?xf32> |

flow.return %unpack : tensor<?x?xf32> |

} |

// Export the unpacked tensor as "output0" and return it. |

%22 = hal.tensor.export %21 "output0" : tensor<?x?xf32>{%6, %7} -> !hal.buffer_view |

return %22 : !hal.buffer_view |

} |
|
|
|
// -----// IR Dump After CollapseDimensions (iree-flow-collapse-dimensions) //----- // |
|
// @matmul_dynamic as printed after the CollapseDimensions pass. |
// Takes three !hal.buffer_view arguments and returns one !hal.buffer_view. |
// Overall flow visible in this body: |
//   1. Read dims 0/1 of each buffer view and import each as tensor<?x?xf32> |
//      ("input0", "input1", "input2"). |
//   2. Five flow.dispatch.region ops: three tensor.pack regions (%11, %14, %18), |
//      one linalg.mmt4d region (%19) and one tensor.unpack region (%21). |
//   3. Export the unpacked tensor as "output0" and return it. |
// NOTE(review): %cst, %10, %13, %17, %20 and all of the %c0*/%c1* index |
// constants have no uses anywhere in this function as printed; each region |
// recreates the values it needs locally. |

func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} { |

%cst = arith.constant 0.000000e+00 : f32 |

%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index |

%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index |

%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xf32>{%0, %1} |

%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index |

%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index |

%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xf32>{%3, %4} |

%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index |

%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index |

%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xf32>{%6, %7} |

%9 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%0] |

%10 = tensor.empty(%9, %1) : tensor<?x?x16x1xf32> |

%c0 = arith.constant 0 : index |

%c1 = arith.constant 1 : index |

// Region %11: pack input0 (%2) into 16x1 inner tiles with 0.0 padding; |
// result outer dims are (%0 ceildiv 16, %1). |

%11 = flow.dispatch.region -> (tensor<?x?x16x1xf32>{%9, %1}) { |

%cst_8 = arith.constant 0.000000e+00 : f32 |

%23 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%0] |

%24 = tensor.empty(%23, %1) : tensor<?x?x16x1xf32> |

%pack = tensor.pack %2 padding_value(%cst_8 : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 1] into %24 : tensor<?x?xf32> -> tensor<?x?x16x1xf32> |

flow.return %pack : tensor<?x?x16x1xf32> |

} |

%12 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%4] |

%13 = tensor.empty(%12, %3) : tensor<?x?x16x1xf32> |

%c0_0 = arith.constant 0 : index |

%c1_1 = arith.constant 1 : index |

// Region %14: pack input1 (%5) with outer_dims_perm/inner_dims_pos = [1, 0], |
// so the 16-wide tile is taken along source dim 1; result outer dims are |
// (%4 ceildiv 16, %3). |

%14 = flow.dispatch.region -> (tensor<?x?x16x1xf32>{%12, %3}) { |

%cst_8 = arith.constant 0.000000e+00 : f32 |

%23 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%4] |

%24 = tensor.empty(%23, %3) : tensor<?x?x16x1xf32> |

%pack = tensor.pack %5 padding_value(%cst_8 : f32) outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [16, 1] into %24 : tensor<?x?xf32> -> tensor<?x?x16x1xf32> |

flow.return %pack : tensor<?x?x16x1xf32> |

} |

%15 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%6] |

%16 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%7] |

%17 = tensor.empty(%15, %16) : tensor<?x?x16x16xf32> |

%c0_2 = arith.constant 0 : index |

%c1_3 = arith.constant 1 : index |

%c0_4 = arith.constant 0 : index |

%c1_5 = arith.constant 1 : index |

// Region %18: pack input2 (%8) into 16x16 inner tiles with 0.0 padding; |
// result outer dims are (%6 ceildiv 16, %7 ceildiv 16). |

%18 = flow.dispatch.region -> (tensor<?x?x16x16xf32>{%15, %16}) { |

%cst_8 = arith.constant 0.000000e+00 : f32 |

%23 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%7] |

%24 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%6] |

%25 = tensor.empty(%24, %23) : tensor<?x?x16x16xf32> |

%pack = tensor.pack %8 padding_value(%cst_8 : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %25 : tensor<?x?xf32> -> tensor<?x?x16x16xf32> |

flow.return %pack : tensor<?x?x16x16xf32> |

} |

// Region %19: linalg.mmt4d over the two packed inputs (%11, %14), using the |
// packed input2 (%18) as the outs operand. |

%19 = flow.dispatch.region -> (tensor<?x?x16x16xf32>{%15, %16}) { |

%23 = linalg.mmt4d ins(%11, %14 : tensor<?x?x16x1xf32>, tensor<?x?x16x1xf32>) outs(%18 : tensor<?x?x16x16xf32>) -> tensor<?x?x16x16xf32> |

flow.return %23 : tensor<?x?x16x16xf32> |

} |

%20 = tensor.empty(%6, %7) : tensor<?x?xf32> |

%c0_6 = arith.constant 0 : index |

%c1_7 = arith.constant 1 : index |

// Region %21: unpack the 16x16-tiled mmt4d result (%19) back to a |
// tensor<?x?xf32> sized (%6, %7), i.e. the dims read from %arg2. |

%21 = flow.dispatch.region -> (tensor<?x?xf32>{%6, %7}) { |

%23 = tensor.empty(%6, %7) : tensor<?x?xf32> |

%unpack = tensor.unpack %19 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %23 : tensor<?x?x16x16xf32> -> tensor<?x?xf32> |

flow.return %unpack : tensor<?x?xf32> |

} |

%22 = hal.tensor.export %21 "output0" : tensor<?x?xf32>{%6, %7} -> !hal.buffer_view |

return %22 : !hal.buffer_view |

} |
|
|
|
// -----// IR Dump After FormDispatchWorkgroups (iree-flow-form-dispatch-workgroups) //----- // |
|
// @matmul_dynamic as printed after the FormDispatchWorkgroups pass. |
// The body imports the three buffer views as tensor<?x?xf32>, then issues five |
// flow.dispatch.workgroups ops (%10, %12, %15, %16, %17) and exports %17 as |
// "output0". In every dispatch visible here: |
//   - tensors enter the body as !flow.dispatch.tensor block arguments and are |
//     accessed with flow.dispatch.tensor.load / flow.dispatch.tensor.store; |
//   - index operands are re-read through flow.dispatch.workload.ordinal; |
//   - the trailing count(...) region forwards its arguments to |
//     flow.dispatch.workgroup_count_from_slice and returns %x, %y, %z. |

func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} { |

%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index |

%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index |

%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xf32>{%0, %1} |

%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index |

%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index |

%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xf32>{%3, %4} |

%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index |

%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index |

%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xf32>{%6, %7} |

%9 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%0] |

// Dispatch %10: pack input0 (%2) into 16x1 tiles. Workload is [%0, %9, %1]. |

%10 = flow.dispatch.workgroups[%0, %9, %1](%2, %0, %9, %1) : (tensor<?x?xf32>{%0, %1}, index, index, index) -> tensor<?x?x16x1xf32>{%9, %1} = |

(%arg3: !flow.dispatch.tensor<readonly:tensor<?x?xf32>>, %arg4: index, %arg5: index, %arg6: index, %arg7: !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>) { |

%19 = flow.dispatch.workload.ordinal %arg4, 0 : index |

%20 = flow.dispatch.workload.ordinal %arg5, 1 : index |

%21 = flow.dispatch.workload.ordinal %arg6, 2 : index |

%cst = arith.constant 0.000000e+00 : f32 |

%22 = flow.dispatch.tensor.load %arg3, offsets = [0, 0], sizes = [%19, %21], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%19, %21} -> tensor<?x?xf32> |

%23 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%19] |

%24 = tensor.empty(%23, %21) : tensor<?x?x16x1xf32> |

%pack = tensor.pack %22 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 1] into %24 : tensor<?x?xf32> -> tensor<?x?x16x1xf32> |

flow.dispatch.tensor.store %pack, %arg7, offsets = [0, 0, 0, 0], sizes = [%20, %21, 16, 1], strides = [1, 1, 1, 1] : tensor<?x?x16x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%20, %21} |

flow.return |

} count(%arg3: index, %arg4: index, %arg5: index) -> (index, index, index) { |

%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg3, %arg4, %arg5 |

flow.return %x, %y, %z : index, index, index |

} |

%11 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%4] |

// Dispatch %12: pack input1 (%5) with the [1, 0] permutation. Workload is |
// [%4, %11, %3], so inside the body ordinal 0 is source dim 1 and ordinal 2 is |
// source dim 0; the load below therefore uses sizes [%21, %19]. |

%12 = flow.dispatch.workgroups[%4, %11, %3](%5, %4, %11, %3) : (tensor<?x?xf32>{%3, %4}, index, index, index) -> tensor<?x?x16x1xf32>{%11, %3} = |

(%arg3: !flow.dispatch.tensor<readonly:tensor<?x?xf32>>, %arg4: index, %arg5: index, %arg6: index, %arg7: !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>) { |

%19 = flow.dispatch.workload.ordinal %arg4, 0 : index |

%20 = flow.dispatch.workload.ordinal %arg5, 1 : index |

%21 = flow.dispatch.workload.ordinal %arg6, 2 : index |

%cst = arith.constant 0.000000e+00 : f32 |

%22 = flow.dispatch.tensor.load %arg3, offsets = [0, 0], sizes = [%21, %19], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%21, %19} -> tensor<?x?xf32> |

%23 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%19] |

%24 = tensor.empty(%23, %21) : tensor<?x?x16x1xf32> |

%pack = tensor.pack %22 padding_value(%cst : f32) outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [16, 1] into %24 : tensor<?x?xf32> -> tensor<?x?x16x1xf32> |

flow.dispatch.tensor.store %pack, %arg7, offsets = [0, 0, 0, 0], sizes = [%20, %21, 16, 1], strides = [1, 1, 1, 1] : tensor<?x?x16x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%20, %21} |

flow.return |

} count(%arg3: index, %arg4: index, %arg5: index) -> (index, index, index) { |

%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg3, %arg4, %arg5 |

flow.return %x, %y, %z : index, index, index |

} |

%13 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%6] |

%14 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%7] |

// Dispatch %15: pack input2 (%8) into 16x16 tiles. Workload is |
// [%6, %7, %13, %14]. |

%15 = flow.dispatch.workgroups[%6, %7, %13, %14](%8, %6, %7, %13, %14) : (tensor<?x?xf32>{%6, %7}, index, index, index, index) -> tensor<?x?x16x16xf32>{%13, %14} = |

(%arg3: !flow.dispatch.tensor<readonly:tensor<?x?xf32>>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>) { |

%19 = flow.dispatch.workload.ordinal %arg4, 0 : index |

%20 = flow.dispatch.workload.ordinal %arg5, 1 : index |

%21 = flow.dispatch.workload.ordinal %arg6, 2 : index |

%22 = flow.dispatch.workload.ordinal %arg7, 3 : index |

%cst = arith.constant 0.000000e+00 : f32 |

%23 = flow.dispatch.tensor.load %arg3, offsets = [0, 0], sizes = [%19, %20], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%19, %20} -> tensor<?x?xf32> |

%24 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%20] |

%25 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%19] |

%26 = tensor.empty(%25, %24) : tensor<?x?x16x16xf32> |

%pack = tensor.pack %23 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %26 : tensor<?x?xf32> -> tensor<?x?x16x16xf32> |

flow.dispatch.tensor.store %pack, %arg8, offsets = [0, 0, 0, 0], sizes = [%21, %22, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%21, %22} |

flow.return |

} count(%arg3: index, %arg4: index, %arg5: index, %arg6: index) -> (index, index, index) { |

%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg3, %arg4, %arg5, %arg6 |

flow.return %x, %y, %z : index, index, index |

} |

// Dispatch %16: linalg.mmt4d on the packed operands %10 and %12, with %15 as |
// the accumulator. The result is written as `%15{%13, %14}` rather than as a |
// fresh tensor type, and %arg5 is a readwrite dispatch tensor that is both |
// loaded (as outs) and stored to below. |
// NOTE(review): this presumably means the result reuses operand %15's storage; |
// confirm against the Flow dialect documentation. |

%16 = flow.dispatch.workgroups[%9, %1, %11, %3, %13, %14](%10, %12, %15, %9, %1, %11, %3, %13, %14) : (tensor<?x?x16x1xf32>{%9, %1}, tensor<?x?x16x1xf32>{%11, %3}, tensor<?x?x16x16xf32>{%13, %14}, index, index, index, index, index, index) -> %15{%13, %14} = |

(%arg3: !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>, %arg4: !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>, %arg5: !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>, %arg6: index, %arg7: index, %arg8: index, %arg9: index, %arg10: index, %arg11: index) { |

%19 = flow.dispatch.workload.ordinal %arg6, 0 : index |

%20 = flow.dispatch.workload.ordinal %arg7, 1 : index |

%21 = flow.dispatch.workload.ordinal %arg8, 2 : index |

%22 = flow.dispatch.workload.ordinal %arg9, 3 : index |

%23 = flow.dispatch.workload.ordinal %arg10, 4 : index |

%24 = flow.dispatch.workload.ordinal %arg11, 5 : index |

%25 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0], sizes = [%19, %20, 16, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%19, %20} -> tensor<?x?x16x1xf32> |

%26 = flow.dispatch.tensor.load %arg4, offsets = [0, 0, 0, 0], sizes = [%21, %22, 16, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%21, %22} -> tensor<?x?x16x1xf32> |

%27 = flow.dispatch.tensor.load %arg5, offsets = [0, 0, 0, 0], sizes = [%23, %24, 16, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%23, %24} -> tensor<?x?x16x16xf32> |

%28 = linalg.mmt4d ins(%25, %26 : tensor<?x?x16x1xf32>, tensor<?x?x16x1xf32>) outs(%27 : tensor<?x?x16x16xf32>) -> tensor<?x?x16x16xf32> |

flow.dispatch.tensor.store %28, %arg5, offsets = [0, 0, 0, 0], sizes = [%23, %24, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%23, %24} |

flow.return |

} count(%arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index) -> (index, index, index) { |

%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg3, %arg4, %arg5, %arg6, %arg7, %arg8 |

flow.return %x, %y, %z : index, index, index |

} |

// Dispatch %17: unpack the 16x16-tiled result %16 into a tensor<?x?xf32> of |
// size (%6, %7). Workload is [%13, %14, %6, %7]. |

%17 = flow.dispatch.workgroups[%13, %14, %6, %7](%16, %13, %14, %6, %7) : (tensor<?x?x16x16xf32>{%13, %14}, index, index, index, index) -> tensor<?x?xf32>{%6, %7} = |

(%arg3: !flow.dispatch.tensor<readonly:tensor<?x?x16x16xf32>>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>) { |

%19 = flow.dispatch.workload.ordinal %arg4, 0 : index |

%20 = flow.dispatch.workload.ordinal %arg5, 1 : index |

%21 = flow.dispatch.workload.ordinal %arg6, 2 : index |

%22 = flow.dispatch.workload.ordinal %arg7, 3 : index |

%23 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0], sizes = [%19, %20, 16, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x16xf32>>{%19, %20} -> tensor<?x?x16x16xf32> |

%24 = tensor.empty(%21, %22) : tensor<?x?xf32> |

%unpack = tensor.unpack %23 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %24 : tensor<?x?x16x16xf32> -> tensor<?x?xf32> |

flow.dispatch.tensor.store %unpack, %arg8, offsets = [0, 0], sizes = [%21, %22], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%21, %22} |

flow.return |

} count(%arg3: index, %arg4: index, %arg5: index, %arg6: index) -> (index, index, index) { |

%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg3, %arg4, %arg5, %arg6 |

flow.return %x, %y, %z : index, index, index |

} |

%18 = hal.tensor.export %17 "output0" : tensor<?x?xf32>{%6, %7} -> !hal.buffer_view |

return %18 : !hal.buffer_view |

} |
|
|
|
// -----// IR Dump After CaptureDispatchDynamicDims (iree-flow-capture-dispatch-dynamic-dims) //----- // |
|
// @matmul_dynamic as printed after the CaptureDispatchDynamicDims pass. |
// Same five flow.dispatch.workgroups ops as before (pack input0, pack input1, |
// pack input2, mmt4d, unpack). What is new in this dump: each dispatch body |
// now begins with flow.dispatch.tie_shape ops that attach dynamic dims to every |
// !flow.dispatch.tensor block argument, and the loads/stores operate on the |
// tie_shape results instead of the raw block arguments. |
// At this stage the tie_shape dims are the raw index block arguments (%argN), |
// while the loads/stores still use the flow.dispatch.workload.ordinal results. |

func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} { |

%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index |

%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index |

%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xf32>{%0, %1} |

%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index |

%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index |

%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xf32>{%3, %4} |

%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index |

%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index |

%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xf32>{%6, %7} |

%9 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%0] |

// Dispatch %10: pack input0 (%2) into 16x1 tiles. |

%10 = flow.dispatch.workgroups[%0, %9, %1](%2, %0, %9, %1) : (tensor<?x?xf32>{%0, %1}, index, index, index) -> tensor<?x?x16x1xf32>{%9, %1} = |

(%arg3: !flow.dispatch.tensor<readonly:tensor<?x?xf32>>, %arg4: index, %arg5: index, %arg6: index, %arg7: !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>) { |

%19 = flow.dispatch.tie_shape %arg3 : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%arg4, %arg6} |

%20 = flow.dispatch.tie_shape %arg7 : !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%arg5, %arg6} |

%21 = flow.dispatch.workload.ordinal %arg4, 0 : index |

%22 = flow.dispatch.workload.ordinal %arg5, 1 : index |

%23 = flow.dispatch.workload.ordinal %arg6, 2 : index |

%cst = arith.constant 0.000000e+00 : f32 |

%24 = flow.dispatch.tensor.load %19, offsets = [0, 0], sizes = [%21, %23], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%21, %23} -> tensor<?x?xf32> |

%25 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%21] |

%26 = tensor.empty(%25, %23) : tensor<?x?x16x1xf32> |

%pack = tensor.pack %24 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 1] into %26 : tensor<?x?xf32> -> tensor<?x?x16x1xf32> |

flow.dispatch.tensor.store %pack, %20, offsets = [0, 0, 0, 0], sizes = [%22, %23, 16, 1], strides = [1, 1, 1, 1] : tensor<?x?x16x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%22, %23} |

flow.return |

} count(%arg3: index, %arg4: index, %arg5: index) -> (index, index, index) { |

%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg3, %arg4, %arg5 |

flow.return %x, %y, %z : index, index, index |

} |

%11 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%4] |

// Dispatch %12: pack input1 (%5) with the [1, 0] permutation. The input |
// tie_shape uses {%arg6, %arg4}, matching the swapped order in which the |
// source dims are passed as workload ([%4, %11, %3]). |

%12 = flow.dispatch.workgroups[%4, %11, %3](%5, %4, %11, %3) : (tensor<?x?xf32>{%3, %4}, index, index, index) -> tensor<?x?x16x1xf32>{%11, %3} = |

(%arg3: !flow.dispatch.tensor<readonly:tensor<?x?xf32>>, %arg4: index, %arg5: index, %arg6: index, %arg7: !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>) { |

%19 = flow.dispatch.tie_shape %arg3 : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%arg6, %arg4} |

%20 = flow.dispatch.tie_shape %arg7 : !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%arg5, %arg6} |

%21 = flow.dispatch.workload.ordinal %arg4, 0 : index |

%22 = flow.dispatch.workload.ordinal %arg5, 1 : index |

%23 = flow.dispatch.workload.ordinal %arg6, 2 : index |

%cst = arith.constant 0.000000e+00 : f32 |

%24 = flow.dispatch.tensor.load %19, offsets = [0, 0], sizes = [%23, %21], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%23, %21} -> tensor<?x?xf32> |

%25 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%21] |

%26 = tensor.empty(%25, %23) : tensor<?x?x16x1xf32> |

%pack = tensor.pack %24 padding_value(%cst : f32) outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [16, 1] into %26 : tensor<?x?xf32> -> tensor<?x?x16x1xf32> |

flow.dispatch.tensor.store %pack, %20, offsets = [0, 0, 0, 0], sizes = [%22, %23, 16, 1], strides = [1, 1, 1, 1] : tensor<?x?x16x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%22, %23} |

flow.return |

} count(%arg3: index, %arg4: index, %arg5: index) -> (index, index, index) { |

%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg3, %arg4, %arg5 |

flow.return %x, %y, %z : index, index, index |

} |

%13 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%6] |

%14 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%7] |

// Dispatch %15: pack input2 (%8) into 16x16 tiles. |

%15 = flow.dispatch.workgroups[%6, %7, %13, %14](%8, %6, %7, %13, %14) : (tensor<?x?xf32>{%6, %7}, index, index, index, index) -> tensor<?x?x16x16xf32>{%13, %14} = |

(%arg3: !flow.dispatch.tensor<readonly:tensor<?x?xf32>>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>) { |

%19 = flow.dispatch.tie_shape %arg3 : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%arg4, %arg5} |

%20 = flow.dispatch.tie_shape %arg8 : !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%arg6, %arg7} |

%21 = flow.dispatch.workload.ordinal %arg4, 0 : index |

%22 = flow.dispatch.workload.ordinal %arg5, 1 : index |

%23 = flow.dispatch.workload.ordinal %arg6, 2 : index |

%24 = flow.dispatch.workload.ordinal %arg7, 3 : index |

%cst = arith.constant 0.000000e+00 : f32 |

%25 = flow.dispatch.tensor.load %19, offsets = [0, 0], sizes = [%21, %22], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%21, %22} -> tensor<?x?xf32> |

%26 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%22] |

%27 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%21] |

%28 = tensor.empty(%27, %26) : tensor<?x?x16x16xf32> |

%pack = tensor.pack %25 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %28 : tensor<?x?xf32> -> tensor<?x?x16x16xf32> |

flow.dispatch.tensor.store %pack, %20, offsets = [0, 0, 0, 0], sizes = [%23, %24, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%23, %24} |

flow.return |

} count(%arg3: index, %arg4: index, %arg5: index, %arg6: index) -> (index, index, index) { |

%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg3, %arg4, %arg5, %arg6 |

flow.return %x, %y, %z : index, index, index |

} |

// Dispatch %16: linalg.mmt4d on %10 and %12 with %15 as the accumulator; the |
// result is written as `%15{%13, %14}` and %arg5 is a readwrite dispatch |
// tensor that is loaded as outs and then stored to. |

%16 = flow.dispatch.workgroups[%9, %1, %11, %3, %13, %14](%10, %12, %15, %9, %1, %11, %3, %13, %14) : (tensor<?x?x16x1xf32>{%9, %1}, tensor<?x?x16x1xf32>{%11, %3}, tensor<?x?x16x16xf32>{%13, %14}, index, index, index, index, index, index) -> %15{%13, %14} = |

(%arg3: !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>, %arg4: !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>, %arg5: !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>, %arg6: index, %arg7: index, %arg8: index, %arg9: index, %arg10: index, %arg11: index) { |

%19 = flow.dispatch.tie_shape %arg3 : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%arg6, %arg7} |

%20 = flow.dispatch.tie_shape %arg4 : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%arg8, %arg9} |

%21 = flow.dispatch.tie_shape %arg5 : !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%arg10, %arg11} |

%22 = flow.dispatch.workload.ordinal %arg6, 0 : index |

%23 = flow.dispatch.workload.ordinal %arg7, 1 : index |

%24 = flow.dispatch.workload.ordinal %arg8, 2 : index |

%25 = flow.dispatch.workload.ordinal %arg9, 3 : index |

%26 = flow.dispatch.workload.ordinal %arg10, 4 : index |

%27 = flow.dispatch.workload.ordinal %arg11, 5 : index |

%28 = flow.dispatch.tensor.load %19, offsets = [0, 0, 0, 0], sizes = [%22, %23, 16, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%22, %23} -> tensor<?x?x16x1xf32> |

%29 = flow.dispatch.tensor.load %20, offsets = [0, 0, 0, 0], sizes = [%24, %25, 16, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%24, %25} -> tensor<?x?x16x1xf32> |

%30 = flow.dispatch.tensor.load %21, offsets = [0, 0, 0, 0], sizes = [%26, %27, 16, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%26, %27} -> tensor<?x?x16x16xf32> |

%31 = linalg.mmt4d ins(%28, %29 : tensor<?x?x16x1xf32>, tensor<?x?x16x1xf32>) outs(%30 : tensor<?x?x16x16xf32>) -> tensor<?x?x16x16xf32> |

flow.dispatch.tensor.store %31, %21, offsets = [0, 0, 0, 0], sizes = [%26, %27, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%26, %27} |

flow.return |

} count(%arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index) -> (index, index, index) { |

%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg3, %arg4, %arg5, %arg6, %arg7, %arg8 |

flow.return %x, %y, %z : index, index, index |

} |

// Dispatch %17: unpack the 16x16-tiled result %16 into tensor<?x?xf32>{%6, %7}. |

%17 = flow.dispatch.workgroups[%13, %14, %6, %7](%16, %13, %14, %6, %7) : (tensor<?x?x16x16xf32>{%13, %14}, index, index, index, index) -> tensor<?x?xf32>{%6, %7} = |

(%arg3: !flow.dispatch.tensor<readonly:tensor<?x?x16x16xf32>>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>) { |

%19 = flow.dispatch.tie_shape %arg3 : !flow.dispatch.tensor<readonly:tensor<?x?x16x16xf32>>{%arg4, %arg5} |

%20 = flow.dispatch.tie_shape %arg8 : !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%arg6, %arg7} |

%21 = flow.dispatch.workload.ordinal %arg4, 0 : index |

%22 = flow.dispatch.workload.ordinal %arg5, 1 : index |

%23 = flow.dispatch.workload.ordinal %arg6, 2 : index |

%24 = flow.dispatch.workload.ordinal %arg7, 3 : index |

%25 = flow.dispatch.tensor.load %19, offsets = [0, 0, 0, 0], sizes = [%21, %22, 16, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x16xf32>>{%21, %22} -> tensor<?x?x16x16xf32> |

%26 = tensor.empty(%23, %24) : tensor<?x?xf32> |

%unpack = tensor.unpack %25 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %26 : tensor<?x?x16x16xf32> -> tensor<?x?xf32> |

flow.dispatch.tensor.store %unpack, %20, offsets = [0, 0], sizes = [%23, %24], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%23, %24} |

flow.return |

} count(%arg3: index, %arg4: index, %arg5: index, %arg6: index) -> (index, index, index) { |

%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg3, %arg4, %arg5, %arg6 |

flow.return %x, %y, %z : index, index, index |

} |

%18 = hal.tensor.export %17 "output0" : tensor<?x?xf32>{%6, %7} -> !hal.buffer_view |

return %18 : !hal.buffer_view |

} |
|
|
|
// -----// IR Dump After Canonicalizer (canonicalize) //----- // |
|
// @matmul_dynamic as printed after the Canonicalizer pass. |
// The five flow.dispatch.workgroups ops (pack input0, pack input1, pack input2, |
// mmt4d, unpack) are unchanged in what they compute. Inside each dispatch body |
// the op order as printed here is: the f32 zero constant (pack dispatches |
// only), then the flow.dispatch.workload.ordinal ops, then the |
// flow.dispatch.tie_shape ops, then load / compute / store. |
// The tie_shape ops in this dump take their dims from the workload.ordinal |
// results, so the tie_shape dims and the load/store dims are the same SSA |
// values. |

func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} { |

%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index |

%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index |

%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xf32>{%0, %1} |

%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index |

%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index |

%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xf32>{%3, %4} |

%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index |

%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index |

%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xf32>{%6, %7} |

%9 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%0] |

// Dispatch %10: pack input0 (%2) into 16x1 tiles with 0.0 padding. |

%10 = flow.dispatch.workgroups[%0, %9, %1](%2, %0, %9, %1) : (tensor<?x?xf32>{%0, %1}, index, index, index) -> tensor<?x?x16x1xf32>{%9, %1} = |

(%arg3: !flow.dispatch.tensor<readonly:tensor<?x?xf32>>, %arg4: index, %arg5: index, %arg6: index, %arg7: !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>) { |

%cst = arith.constant 0.000000e+00 : f32 |

%19 = flow.dispatch.workload.ordinal %arg4, 0 : index |

%20 = flow.dispatch.workload.ordinal %arg5, 1 : index |

%21 = flow.dispatch.workload.ordinal %arg6, 2 : index |

%22 = flow.dispatch.tie_shape %arg3 : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%19, %21} |

%23 = flow.dispatch.tie_shape %arg7 : !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%20, %21} |

%24 = flow.dispatch.tensor.load %22, offsets = [0, 0], sizes = [%19, %21], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%19, %21} -> tensor<?x?xf32> |

%25 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%19] |

%26 = tensor.empty(%25, %21) : tensor<?x?x16x1xf32> |

%pack = tensor.pack %24 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 1] into %26 : tensor<?x?xf32> -> tensor<?x?x16x1xf32> |

flow.dispatch.tensor.store %pack, %23, offsets = [0, 0, 0, 0], sizes = [%20, %21, 16, 1], strides = [1, 1, 1, 1] : tensor<?x?x16x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%20, %21} |

flow.return |

} count(%arg3: index, %arg4: index, %arg5: index) -> (index, index, index) { |

%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg3, %arg4, %arg5 |

flow.return %x, %y, %z : index, index, index |

} |

%11 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%4] |

// Dispatch %12: pack input1 (%5) with the [1, 0] permutation; the input is |
// tied and loaded with dims {%21, %19} because the workload passes source |
// dim 1 (%4) as ordinal 0 and source dim 0 (%3) as ordinal 2. |

%12 = flow.dispatch.workgroups[%4, %11, %3](%5, %4, %11, %3) : (tensor<?x?xf32>{%3, %4}, index, index, index) -> tensor<?x?x16x1xf32>{%11, %3} = |

(%arg3: !flow.dispatch.tensor<readonly:tensor<?x?xf32>>, %arg4: index, %arg5: index, %arg6: index, %arg7: !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>) { |

%cst = arith.constant 0.000000e+00 : f32 |

%19 = flow.dispatch.workload.ordinal %arg4, 0 : index |

%20 = flow.dispatch.workload.ordinal %arg5, 1 : index |

%21 = flow.dispatch.workload.ordinal %arg6, 2 : index |

%22 = flow.dispatch.tie_shape %arg3 : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%21, %19} |

%23 = flow.dispatch.tie_shape %arg7 : !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%20, %21} |

%24 = flow.dispatch.tensor.load %22, offsets = [0, 0], sizes = [%21, %19], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%21, %19} -> tensor<?x?xf32> |

%25 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%19] |

%26 = tensor.empty(%25, %21) : tensor<?x?x16x1xf32> |

%pack = tensor.pack %24 padding_value(%cst : f32) outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [16, 1] into %26 : tensor<?x?xf32> -> tensor<?x?x16x1xf32> |

flow.dispatch.tensor.store %pack, %23, offsets = [0, 0, 0, 0], sizes = [%20, %21, 16, 1], strides = [1, 1, 1, 1] : tensor<?x?x16x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%20, %21} |

flow.return |

} count(%arg3: index, %arg4: index, %arg5: index) -> (index, index, index) { |

%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg3, %arg4, %arg5 |

flow.return %x, %y, %z : index, index, index |

} |

%13 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%6] |

%14 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%7] |

// Dispatch %15: pack input2 (%8) into 16x16 tiles with 0.0 padding. |

%15 = flow.dispatch.workgroups[%6, %7, %13, %14](%8, %6, %7, %13, %14) : (tensor<?x?xf32>{%6, %7}, index, index, index, index) -> tensor<?x?x16x16xf32>{%13, %14} = |

(%arg3: !flow.dispatch.tensor<readonly:tensor<?x?xf32>>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>) { |

%cst = arith.constant 0.000000e+00 : f32 |

%19 = flow.dispatch.workload.ordinal %arg4, 0 : index |

%20 = flow.dispatch.workload.ordinal %arg5, 1 : index |

%21 = flow.dispatch.workload.ordinal %arg6, 2 : index |

%22 = flow.dispatch.workload.ordinal %arg7, 3 : index |

%23 = flow.dispatch.tie_shape %arg3 : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%19, %20} |

%24 = flow.dispatch.tie_shape %arg8 : !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%21, %22} |

%25 = flow.dispatch.tensor.load %23, offsets = [0, 0], sizes = [%19, %20], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%19, %20} -> tensor<?x?xf32> |

%26 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%20] |

%27 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%19] |

%28 = tensor.empty(%27, %26) : tensor<?x?x16x16xf32> |

%pack = tensor.pack %25 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %28 : tensor<?x?xf32> -> tensor<?x?x16x16xf32> |

flow.dispatch.tensor.store %pack, %24, offsets = [0, 0, 0, 0], sizes = [%21, %22, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%21, %22} |

flow.return |

} count(%arg3: index, %arg4: index, %arg5: index, %arg6: index) -> (index, index, index) { |

%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg3, %arg4, %arg5, %arg6 |

flow.return %x, %y, %z : index, index, index |

} |

// Dispatch %16: linalg.mmt4d on %10 and %12 with %15 as the accumulator; the |
// result is written as `%15{%13, %14}` and the readwrite tensor %arg5 is loaded |
// as outs and then overwritten with the mmt4d result. |

%16 = flow.dispatch.workgroups[%9, %1, %11, %3, %13, %14](%10, %12, %15, %9, %1, %11, %3, %13, %14) : (tensor<?x?x16x1xf32>{%9, %1}, tensor<?x?x16x1xf32>{%11, %3}, tensor<?x?x16x16xf32>{%13, %14}, index, index, index, index, index, index) -> %15{%13, %14} = |

(%arg3: !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>, %arg4: !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>, %arg5: !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>, %arg6: index, %arg7: index, %arg8: index, %arg9: index, %arg10: index, %arg11: index) { |

%19 = flow.dispatch.workload.ordinal %arg6, 0 : index |

%20 = flow.dispatch.workload.ordinal %arg7, 1 : index |

%21 = flow.dispatch.workload.ordinal %arg8, 2 : index |

%22 = flow.dispatch.workload.ordinal %arg9, 3 : index |

%23 = flow.dispatch.workload.ordinal %arg10, 4 : index |

%24 = flow.dispatch.workload.ordinal %arg11, 5 : index |

%25 = flow.dispatch.tie_shape %arg3 : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%19, %20} |

%26 = flow.dispatch.tie_shape %arg4 : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%21, %22} |

%27 = flow.dispatch.tie_shape %arg5 : !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%23, %24} |

%28 = flow.dispatch.tensor.load %25, offsets = [0, 0, 0, 0], sizes = [%19, %20, 16, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%19, %20} -> tensor<?x?x16x1xf32> |

%29 = flow.dispatch.tensor.load %26, offsets = [0, 0, 0, 0], sizes = [%21, %22, 16, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%21, %22} -> tensor<?x?x16x1xf32> |

%30 = flow.dispatch.tensor.load %27, offsets = [0, 0, 0, 0], sizes = [%23, %24, 16, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%23, %24} -> tensor<?x?x16x16xf32> |

%31 = linalg.mmt4d ins(%28, %29 : tensor<?x?x16x1xf32>, tensor<?x?x16x1xf32>) outs(%30 : tensor<?x?x16x16xf32>) -> tensor<?x?x16x16xf32> |

flow.dispatch.tensor.store %31, %27, offsets = [0, 0, 0, 0], sizes = [%23, %24, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%23, %24} |

flow.return |

} count(%arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index) -> (index, index, index) { |

%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg3, %arg4, %arg5, %arg6, %arg7, %arg8 |

flow.return %x, %y, %z : index, index, index |

} |

// Dispatch %17: unpack the 16x16-tiled result %16 into tensor<?x?xf32>{%6, %7}. |

%17 = flow.dispatch.workgroups[%13, %14, %6, %7](%16, %13, %14, %6, %7) : (tensor<?x?x16x16xf32>{%13, %14}, index, index, index, index) -> tensor<?x?xf32>{%6, %7} = |

(%arg3: !flow.dispatch.tensor<readonly:tensor<?x?x16x16xf32>>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>) { |

%19 = flow.dispatch.workload.ordinal %arg4, 0 : index |

%20 = flow.dispatch.workload.ordinal %arg5, 1 : index |

%21 = flow.dispatch.workload.ordinal %arg6, 2 : index |

%22 = flow.dispatch.workload.ordinal %arg7, 3 : index |

%23 = flow.dispatch.tie_shape %arg3 : !flow.dispatch.tensor<readonly:tensor<?x?x16x16xf32>>{%19, %20} |

%24 = flow.dispatch.tie_shape %arg8 : !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%21, %22} |

%25 = flow.dispatch.tensor.load %23, offsets = [0, 0, 0, 0], sizes = [%19, %20, 16, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x16xf32>>{%19, %20} -> tensor<?x?x16x16xf32> |

%26 = tensor.empty(%21, %22) : tensor<?x?xf32> |

%unpack = tensor.unpack %25 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %26 : tensor<?x?x16x16xf32> -> tensor<?x?xf32> |

flow.dispatch.tensor.store %unpack, %24, offsets = [0, 0], sizes = [%21, %22], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%21, %22} |

flow.return |

} count(%arg3: index, %arg4: index, %arg5: index, %arg6: index) -> (index, index, index) { |

%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg3, %arg4, %arg5, %arg6 |

flow.return %x, %y, %z : index, index, index |

} |

%18 = hal.tensor.export %17 "output0" : tensor<?x?xf32>{%6, %7} -> !hal.buffer_view |

return %18 : !hal.buffer_view |

} |
|
|
|
// -----// IR Dump After CSE (cse) //----- // |
|
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} { |
|
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index |
|
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index |
|
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xf32>{%0, %1} |
|
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index |
|
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index |
|
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xf32>{%3, %4} |
|
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index |
|
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index |
|
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xf32>{%6, %7} |
|
%9 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%0] |
|
%10 = flow.dispatch.workgroups[%0, %9, %1](%2, %0, %9, %1) : (tensor<?x?xf32>{%0, %1}, index, index, index) -> tensor<?x?x16x1xf32>{%9, %1} = |
|
(%arg3: !flow.dispatch.tensor<readonly:tensor<?x?xf32>>, %arg4: index, %arg5: index, %arg6: index, %arg7: !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>) { |
|
%cst = arith.constant 0.000000e+00 : f32 |
|
%19 = flow.dispatch.workload.ordinal %arg4, 0 : index |
|
%20 = flow.dispatch.workload.ordinal %arg5, 1 : index |
|
%21 = flow.dispatch.workload.ordinal %arg6, 2 : index |
|
%22 = flow.dispatch.tie_shape %arg3 : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%19, %21} |
|
%23 = flow.dispatch.tie_shape %arg7 : !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%20, %21} |
|
%24 = flow.dispatch.tensor.load %22, offsets = [0, 0], sizes = [%19, %21], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%19, %21} -> tensor<?x?xf32> |
|
%25 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%19] |
|
%26 = tensor.empty(%25, %21) : tensor<?x?x16x1xf32> |
|
%pack = tensor.pack %24 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 1] into %26 : tensor<?x?xf32> -> tensor<?x?x16x1xf32> |
|
flow.dispatch.tensor.store %pack, %23, offsets = [0, 0, 0, 0], sizes = [%20, %21, 16, 1], strides = [1, 1, 1, 1] : tensor<?x?x16x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%20, %21} |
|
flow.return |
|
} count(%arg3: index, %arg4: index, %arg5: index) -> (index, index, index) { |
|
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg3, %arg4, %arg5 |
|
flow.return %x, %y, %z : index, index, index |
|
} |
|
%11 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%4] |
|
%12 = flow.dispatch.workgroups[%4, %11, %3](%5, %4, %11, %3) : (tensor<?x?xf32>{%3, %4}, index, index, index) -> tensor<?x?x16x1xf32>{%11, %3} = |
|
(%arg3: !flow.dispatch.tensor<readonly:tensor<?x?xf32>>, %arg4: index, %arg5: index, %arg6: index, %arg7: !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>) { |
|
%cst = arith.constant 0.000000e+00 : f32 |
|
%19 = flow.dispatch.workload.ordinal %arg4, 0 : index |
|
%20 = flow.dispatch.workload.ordinal %arg5, 1 : index |
|
%21 = flow.dispatch.workload.ordinal %arg6, 2 : index |
|
%22 = flow.dispatch.tie_shape %arg3 : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%21, %19} |
|
%23 = flow.dispatch.tie_shape %arg7 : !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%20, %21} |
|
%24 = flow.dispatch.tensor.load %22, offsets = [0, 0], sizes = [%21, %19], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%21, %19} -> tensor<?x?xf32> |
|
%25 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%19] |
|
%26 = tensor.empty(%25, %21) : tensor<?x?x16x1xf32> |
|
%pack = tensor.pack %24 padding_value(%cst : f32) outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [16, 1] into %26 : tensor<?x?xf32> -> tensor<?x?x16x1xf32> |
|
flow.dispatch.tensor.store %pack, %23, offsets = [0, 0, 0, 0], sizes = [%20, %21, 16, 1], strides = [1, 1, 1, 1] : tensor<?x?x16x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%20, %21} |
|
flow.return |
|
} count(%arg3: index, %arg4: index, %arg5: index) -> (index, index, index) { |
|
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg3, %arg4, %arg5 |
|
flow.return %x, %y, %z : index, index, index |
|
} |
|
%13 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%6] |
|
%14 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%7] |
|
%15 = flow.dispatch.workgroups[%6, %7, %13, %14](%8, %6, %7, %13, %14) : (tensor<?x?xf32>{%6, %7}, index, index, index, index) -> tensor<?x?x16x16xf32>{%13, %14} = |
|
(%arg3: !flow.dispatch.tensor<readonly:tensor<?x?xf32>>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>) { |
|
%cst = arith.constant 0.000000e+00 : f32 |
|
%19 = flow.dispatch.workload.ordinal %arg4, 0 : index |
|
%20 = flow.dispatch.workload.ordinal %arg5, 1 : index |
|
%21 = flow.dispatch.workload.ordinal %arg6, 2 : index |
|
%22 = flow.dispatch.workload.ordinal %arg7, 3 : index |
|
%23 = flow.dispatch.tie_shape %arg3 : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%19, %20} |
|
%24 = flow.dispatch.tie_shape %arg8 : !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%21, %22} |
|
%25 = flow.dispatch.tensor.load %23, offsets = [0, 0], sizes = [%19, %20], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%19, %20} -> tensor<?x?xf32> |
|
%26 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%20] |
|
%27 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%19] |
|
%28 = tensor.empty(%27, %26) : tensor<?x?x16x16xf32> |
|
%pack = tensor.pack %25 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %28 : tensor<?x?xf32> -> tensor<?x?x16x16xf32> |
|
flow.dispatch.tensor.store %pack, %24, offsets = [0, 0, 0, 0], sizes = [%21, %22, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%21, %22} |
|
flow.return |
|
} count(%arg3: index, %arg4: index, %arg5: index, %arg6: index) -> (index, index, index) { |
|
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg3, %arg4, %arg5, %arg6 |
|
flow.return %x, %y, %z : index, index, index |
|
} |
|
%16 = flow.dispatch.workgroups[%9, %1, %11, %3, %13, %14](%10, %12, %15, %9, %1, %11, %3, %13, %14) : (tensor<?x?x16x1xf32>{%9, %1}, tensor<?x?x16x1xf32>{%11, %3}, tensor<?x?x16x16xf32>{%13, %14}, index, index, index, index, index, index) -> %15{%13, %14} = |
|
(%arg3: !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>, %arg4: !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>, %arg5: !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>, %arg6: index, %arg7: index, %arg8: index, %arg9: index, %arg10: index, %arg11: index) { |
|
%19 = flow.dispatch.workload.ordinal %arg6, 0 : index |
|
%20 = flow.dispatch.workload.ordinal %arg7, 1 : index |
|
%21 = flow.dispatch.workload.ordinal %arg8, 2 : index |
|
%22 = flow.dispatch.workload.ordinal %arg9, 3 : index |
|
%23 = flow.dispatch.workload.ordinal %arg10, 4 : index |
|
%24 = flow.dispatch.workload.ordinal %arg11, 5 : index |
|
%25 = flow.dispatch.tie_shape %arg3 : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%19, %20} |
|
%26 = flow.dispatch.tie_shape %arg4 : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%21, %22} |
|
%27 = flow.dispatch.tie_shape %arg5 : !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%23, %24} |
|
%28 = flow.dispatch.tensor.load %25, offsets = [0, 0, 0, 0], sizes = [%19, %20, 16, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%19, %20} -> tensor<?x?x16x1xf32> |
|
%29 = flow.dispatch.tensor.load %26, offsets = [0, 0, 0, 0], sizes = [%21, %22, 16, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%21, %22} -> tensor<?x?x16x1xf32> |
|
%30 = flow.dispatch.tensor.load %27, offsets = [0, 0, 0, 0], sizes = [%23, %24, 16, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%23, %24} -> tensor<?x?x16x16xf32> |
|
%31 = linalg.mmt4d ins(%28, %29 : tensor<?x?x16x1xf32>, tensor<?x?x16x1xf32>) outs(%30 : tensor<?x?x16x16xf32>) -> tensor<?x?x16x16xf32> |
|
flow.dispatch.tensor.store %31, %27, offsets = [0, 0, 0, 0], sizes = [%23, %24, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%23, %24} |
|
flow.return |
|
} count(%arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index) -> (index, index, index) { |
|
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg3, %arg4, %arg5, %arg6, %arg7, %arg8 |
|
flow.return %x, %y, %z : index, index, index |
|
} |
|
%17 = flow.dispatch.workgroups[%13, %14, %6, %7](%16, %13, %14, %6, %7) : (tensor<?x?x16x16xf32>{%13, %14}, index, index, index, index) -> tensor<?x?xf32>{%6, %7} = |
|
(%arg3: !flow.dispatch.tensor<readonly:tensor<?x?x16x16xf32>>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>) { |
|
%19 = flow.dispatch.workload.ordinal %arg4, 0 : index |
|
%20 = flow.dispatch.workload.ordinal %arg5, 1 : index |
|
%21 = flow.dispatch.workload.ordinal %arg6, 2 : index |
|
%22 = flow.dispatch.workload.ordinal %arg7, 3 : index |
|
%23 = flow.dispatch.tie_shape %arg3 : !flow.dispatch.tensor<readonly:tensor<?x?x16x16xf32>>{%19, %20} |
|
%24 = flow.dispatch.tie_shape %arg8 : !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%21, %22} |
|
%25 = flow.dispatch.tensor.load %23, offsets = [0, 0, 0, 0], sizes = [%19, %20, 16, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x16xf32>>{%19, %20} -> tensor<?x?x16x16xf32> |
|
%26 = tensor.empty(%21, %22) : tensor<?x?xf32> |
|
%unpack = tensor.unpack %25 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %26 : tensor<?x?x16x16xf32> -> tensor<?x?xf32> |
|
flow.dispatch.tensor.store %unpack, %24, offsets = [0, 0], sizes = [%21, %22], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%21, %22} |
|
flow.return |
|
} count(%arg3: index, %arg4: index, %arg5: index, %arg6: index) -> (index, index, index) { |
|
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg3, %arg4, %arg5, %arg6 |
|
flow.return %x, %y, %z : index, index, index |
|
} |
|
%18 = hal.tensor.export %17 "output0" : tensor<?x?xf32>{%6, %7} -> !hal.buffer_view |
|
return %18 : !hal.buffer_view |
|
} |
|
|
|
// -----// IR Dump After InitializeEmptyTensors (iree-flow-initialize-empty-tensors) //----- // |
|
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} { |
|
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index |
|
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index |
|
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xf32>{%0, %1} |
|
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index |
|
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index |
|
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xf32>{%3, %4} |
|
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index |
|
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index |
|
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xf32>{%6, %7} |
|
%9 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%0] |
|
%10 = flow.dispatch.workgroups[%0, %9, %1](%2, %0, %9, %1) : (tensor<?x?xf32>{%0, %1}, index, index, index) -> tensor<?x?x16x1xf32>{%9, %1} = |
|
(%arg3: !flow.dispatch.tensor<readonly:tensor<?x?xf32>>, %arg4: index, %arg5: index, %arg6: index, %arg7: !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>) { |
|
%cst = arith.constant 0.000000e+00 : f32 |
|
%19 = flow.dispatch.workload.ordinal %arg4, 0 : index |
|
%20 = flow.dispatch.workload.ordinal %arg5, 1 : index |
|
%21 = flow.dispatch.workload.ordinal %arg6, 2 : index |
|
%22 = flow.dispatch.tie_shape %arg3 : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%19, %21} |
|
%23 = flow.dispatch.tie_shape %arg7 : !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%20, %21} |
|
%24 = flow.dispatch.tensor.load %22, offsets = [0, 0], sizes = [%19, %21], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%19, %21} -> tensor<?x?xf32> |
|
%25 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%19] |
|
%26 = tensor.empty(%25, %21) : tensor<?x?x16x1xf32> |
|
%pack = tensor.pack %24 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 1] into %26 : tensor<?x?xf32> -> tensor<?x?x16x1xf32> |
|
flow.dispatch.tensor.store %pack, %23, offsets = [0, 0, 0, 0], sizes = [%20, %21, 16, 1], strides = [1, 1, 1, 1] : tensor<?x?x16x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%20, %21} |
|
flow.return |
|
} count(%arg3: index, %arg4: index, %arg5: index) -> (index, index, index) { |
|
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg3, %arg4, %arg5 |
|
flow.return %x, %y, %z : index, index, index |
|
} |
|
%11 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%4] |
|
%12 = flow.dispatch.workgroups[%4, %11, %3](%5, %4, %11, %3) : (tensor<?x?xf32>{%3, %4}, index, index, index) -> tensor<?x?x16x1xf32>{%11, %3} = |
|
(%arg3: !flow.dispatch.tensor<readonly:tensor<?x?xf32>>, %arg4: index, %arg5: index, %arg6: index, %arg7: !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>) { |
|
%cst = arith.constant 0.000000e+00 : f32 |
|
%19 = flow.dispatch.workload.ordinal %arg4, 0 : index |
|
%20 = flow.dispatch.workload.ordinal %arg5, 1 : index |
|
%21 = flow.dispatch.workload.ordinal %arg6, 2 : index |
|
%22 = flow.dispatch.tie_shape %arg3 : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%21, %19} |
|
%23 = flow.dispatch.tie_shape %arg7 : !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%20, %21} |
|
%24 = flow.dispatch.tensor.load %22, offsets = [0, 0], sizes = [%21, %19], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%21, %19} -> tensor<?x?xf32> |
|
%25 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%19] |
|
%26 = tensor.empty(%25, %21) : tensor<?x?x16x1xf32> |
|
%pack = tensor.pack %24 padding_value(%cst : f32) outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [16, 1] into %26 : tensor<?x?xf32> -> tensor<?x?x16x1xf32> |
|
flow.dispatch.tensor.store %pack, %23, offsets = [0, 0, 0, 0], sizes = [%20, %21, 16, 1], strides = [1, 1, 1, 1] : tensor<?x?x16x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%20, %21} |
|
flow.return |
|
} count(%arg3: index, %arg4: index, %arg5: index) -> (index, index, index) { |
|
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg3, %arg4, %arg5 |
|
flow.return %x, %y, %z : index, index, index |
|
} |
|
%13 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%6] |
|
%14 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%7] |
|
%15 = flow.dispatch.workgroups[%6, %7, %13, %14](%8, %6, %7, %13, %14) : (tensor<?x?xf32>{%6, %7}, index, index, index, index) -> tensor<?x?x16x16xf32>{%13, %14} = |
|
(%arg3: !flow.dispatch.tensor<readonly:tensor<?x?xf32>>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>) { |
|
%cst = arith.constant 0.000000e+00 : f32 |
|
%19 = flow.dispatch.workload.ordinal %arg4, 0 : index |
|
%20 = flow.dispatch.workload.ordinal %arg5, 1 : index |
|
%21 = flow.dispatch.workload.ordinal %arg6, 2 : index |
|
%22 = flow.dispatch.workload.ordinal %arg7, 3 : index |
|
%23 = flow.dispatch.tie_shape %arg3 : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%19, %20} |
|
%24 = flow.dispatch.tie_shape %arg8 : !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%21, %22} |
|
%25 = flow.dispatch.tensor.load %23, offsets = [0, 0], sizes = [%19, %20], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%19, %20} -> tensor<?x?xf32> |
|
%26 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%20] |
|
%27 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%19] |
|
%28 = tensor.empty(%27, %26) : tensor<?x?x16x16xf32> |
|
%pack = tensor.pack %25 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %28 : tensor<?x?xf32> -> tensor<?x?x16x16xf32> |
|
flow.dispatch.tensor.store %pack, %24, offsets = [0, 0, 0, 0], sizes = [%21, %22, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%21, %22} |
|
flow.return |
|
} count(%arg3: index, %arg4: index, %arg5: index, %arg6: index) -> (index, index, index) { |
|
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg3, %arg4, %arg5, %arg6 |
|
flow.return %x, %y, %z : index, index, index |
|
} |
|
%16 = flow.dispatch.workgroups[%9, %1, %11, %3, %13, %14](%10, %12, %15, %9, %1, %11, %3, %13, %14) : (tensor<?x?x16x1xf32>{%9, %1}, tensor<?x?x16x1xf32>{%11, %3}, tensor<?x?x16x16xf32>{%13, %14}, index, index, index, index, index, index) -> %15{%13, %14} = |
|
(%arg3: !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>, %arg4: !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>, %arg5: !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>, %arg6: index, %arg7: index, %arg8: index, %arg9: index, %arg10: index, %arg11: index) { |
|
%19 = flow.dispatch.workload.ordinal %arg6, 0 : index |
|
%20 = flow.dispatch.workload.ordinal %arg7, 1 : index |
|
%21 = flow.dispatch.workload.ordinal %arg8, 2 : index |
|
%22 = flow.dispatch.workload.ordinal %arg9, 3 : index |
|
%23 = flow.dispatch.workload.ordinal %arg10, 4 : index |
|
%24 = flow.dispatch.workload.ordinal %arg11, 5 : index |
|
%25 = flow.dispatch.tie_shape %arg3 : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%19, %20} |
|
%26 = flow.dispatch.tie_shape %arg4 : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%21, %22} |
|
%27 = flow.dispatch.tie_shape %arg5 : !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%23, %24} |
|
%28 = flow.dispatch.tensor.load %25, offsets = [0, 0, 0, 0], sizes = [%19, %20, 16, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%19, %20} -> tensor<?x?x16x1xf32> |
|
%29 = flow.dispatch.tensor.load %26, offsets = [0, 0, 0, 0], sizes = [%21, %22, 16, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%21, %22} -> tensor<?x?x16x1xf32> |
|
%30 = flow.dispatch.tensor.load %27, offsets = [0, 0, 0, 0], sizes = [%23, %24, 16, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%23, %24} -> tensor<?x?x16x16xf32> |
|
%31 = linalg.mmt4d ins(%28, %29 : tensor<?x?x16x1xf32>, tensor<?x?x16x1xf32>) outs(%30 : tensor<?x?x16x16xf32>) -> tensor<?x?x16x16xf32> |
|
flow.dispatch.tensor.store %31, %27, offsets = [0, 0, 0, 0], sizes = [%23, %24, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%23, %24} |
|
flow.return |
|
} count(%arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index) -> (index, index, index) { |
|
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg3, %arg4, %arg5, %arg6, %arg7, %arg8 |
|
flow.return %x, %y, %z : index, index, index |
|
} |
|
%17 = flow.dispatch.workgroups[%13, %14, %6, %7](%16, %13, %14, %6, %7) : (tensor<?x?x16x16xf32>{%13, %14}, index, index, index, index) -> tensor<?x?xf32>{%6, %7} = |
|
(%arg3: !flow.dispatch.tensor<readonly:tensor<?x?x16x16xf32>>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>) { |
|
%19 = flow.dispatch.workload.ordinal %arg4, 0 : index |
|
%20 = flow.dispatch.workload.ordinal %arg5, 1 : index |
|
%21 = flow.dispatch.workload.ordinal %arg6, 2 : index |
|
%22 = flow.dispatch.workload.ordinal %arg7, 3 : index |
|
%23 = flow.dispatch.tie_shape %arg3 : !flow.dispatch.tensor<readonly:tensor<?x?x16x16xf32>>{%19, %20} |
|
%24 = flow.dispatch.tie_shape %arg8 : !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%21, %22} |
|
%25 = flow.dispatch.tensor.load %23, offsets = [0, 0, 0, 0], sizes = [%19, %20, 16, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x16xf32>>{%19, %20} -> tensor<?x?x16x16xf32> |
|
%26 = tensor.empty(%21, %22) : tensor<?x?xf32> |
|
%unpack = tensor.unpack %25 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %26 : tensor<?x?x16x16xf32> -> tensor<?x?xf32> |
|
flow.dispatch.tensor.store %unpack, %24, offsets = [0, 0], sizes = [%21, %22], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%21, %22} |
|
flow.return |
|
} count(%arg3: index, %arg4: index, %arg5: index, %arg6: index) -> (index, index, index) { |
|
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg3, %arg4, %arg5, %arg6 |
|
flow.return %x, %y, %z : index, index, index |
|
} |
|
%18 = hal.tensor.export %17 "output0" : tensor<?x?xf32>{%6, %7} -> !hal.buffer_view |
|
return %18 : !hal.buffer_view |
|
} |
|
|
|
// -----// IR Dump After OutlineDispatchExterns (iree-flow-outline-dispatch-externs) //----- // |
|
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "znver4", cpu_features = "+mmx,+popcnt,+sse,+sse2,+sse3,+ssse3,+sse4.1,+sse4.2,+avx,+avx2,+sse4a,+fma,+avx512f,+bmi,+bmi2,+aes,+pclmul,+avx512vl,+avx512bw,+avx512dq,+avx512cd,+avx512vbmi,+avx512ifma,+avx512vpopcntdq,+avx512vbmi2,+gfni,+vpclmulqdq,+avx512vnni,+avx512bitalg,+avx512bf16,+adx,+clflushopt,+clwb,+clzero,+cx16,+cx8,+f16c,+fsgsbase,+crc32,+invpcid,+rdpru,+sahf,+lzcnt,+movbe,+mwaitx,+x87,+pku,+prfchw,+rdpid,+rdrnd,+rdseed,+sha,+shstk,+vaes,+wbnoinvd,+xsave,+xsavec,+xsaveopt,+xsaves,+fxsr,+evex512", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "all"}> |
|
#map = affine_map<()[s0] -> (s0 ceildiv 16)> |
|
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#executable_target_embedded_elf_x86_64_]}> |
|
module attributes {hal.device.targets = [#device_target_llvm_cpu]} { |
|
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} { |
|
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index |
|
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index |
|
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xf32>{%0, %1} |
|
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index |
|
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index |
|
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xf32>{%3, %4} |
|
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index |
|
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index |
|
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xf32>{%6, %7} |
|
%9 = affine.apply #map()[%0] |
|
%10 = flow.dispatch.workgroups[%0, %9, %1](%2, %0, %9, %1) : (tensor<?x?xf32>{%0, %1}, index, index, index) -> tensor<?x?x16x1xf32>{%9, %1} = |
|
(%arg3: !flow.dispatch.tensor<readonly:tensor<?x?xf32>>, %arg4: index, %arg5: index, %arg6: index, %arg7: !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>) { |
|
%cst = arith.constant 0.000000e+00 : f32 |
|
%19 = flow.dispatch.workload.ordinal %arg4, 0 : index |
|
%20 = flow.dispatch.workload.ordinal %arg5, 1 : index |
|
%21 = flow.dispatch.workload.ordinal %arg6, 2 : index |
|
%22 = flow.dispatch.tie_shape %arg3 : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%19, %21} |
|
%23 = flow.dispatch.tie_shape %arg7 : !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%20, %21} |
|
%24 = flow.dispatch.tensor.load %22, offsets = [0, 0], sizes = [%19, %21], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%19, %21} -> tensor<?x?xf32> |
|
%25 = affine.apply #map()[%19] |
|
%26 = tensor.empty(%25, %21) : tensor<?x?x16x1xf32> |
|
%pack = tensor.pack %24 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 1] into %26 : tensor<?x?xf32> -> tensor<?x?x16x1xf32> |
|
flow.dispatch.tensor.store %pack, %23, offsets = [0, 0, 0, 0], sizes = [%20, %21, 16, 1], strides = [1, 1, 1, 1] : tensor<?x?x16x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%20, %21} |
|
flow.return |
|
} count(%arg3: index, %arg4: index, %arg5: index) -> (index, index, index) { |
|
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg3, %arg4, %arg5 |
|
flow.return %x, %y, %z : index, index, index |
|
} |
|
%11 = affine.apply #map()[%4] |
|
%12 = flow.dispatch.workgroups[%4, %11, %3](%5, %4, %11, %3) : (tensor<?x?xf32>{%3, %4}, index, index, index) -> tensor<?x?x16x1xf32>{%11, %3} = |
|
(%arg3: !flow.dispatch.tensor<readonly:tensor<?x?xf32>>, %arg4: index, %arg5: index, %arg6: index, %arg7: !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>) { |
|
%cst = arith.constant 0.000000e+00 : f32 |
|
%19 = flow.dispatch.workload.ordinal %arg4, 0 : index |
|
%20 = flow.dispatch.workload.ordinal %arg5, 1 : index |
|
%21 = flow.dispatch.workload.ordinal %arg6, 2 : index |
|
%22 = flow.dispatch.tie_shape %arg3 : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%21, %19} |
|
%23 = flow.dispatch.tie_shape %arg7 : !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%20, %21} |
|
%24 = flow.dispatch.tensor.load %22, offsets = [0, 0], sizes = [%21, %19], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%21, %19} -> tensor<?x?xf32> |
|
%25 = affine.apply #map()[%19] |
|
%26 = tensor.empty(%25, %21) : tensor<?x?x16x1xf32> |
|
%pack = tensor.pack %24 padding_value(%cst : f32) outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [16, 1] into %26 : tensor<?x?xf32> -> tensor<?x?x16x1xf32> |
|
flow.dispatch.tensor.store %pack, %23, offsets = [0, 0, 0, 0], sizes = [%20, %21, 16, 1], strides = [1, 1, 1, 1] : tensor<?x?x16x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%20, %21} |
|
flow.return |
|
} count(%arg3: index, %arg4: index, %arg5: index) -> (index, index, index) { |
|
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg3, %arg4, %arg5 |
|
flow.return %x, %y, %z : index, index, index |
|
} |
|
%13 = affine.apply #map()[%6] |
|
%14 = affine.apply #map()[%7] |
|
%15 = flow.dispatch.workgroups[%6, %7, %13, %14](%8, %6, %7, %13, %14) : (tensor<?x?xf32>{%6, %7}, index, index, index, index) -> tensor<?x?x16x16xf32>{%13, %14} = |
|
(%arg3: !flow.dispatch.tensor<readonly:tensor<?x?xf32>>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>) { |
|
%cst = arith.constant 0.000000e+00 : f32 |
|
%19 = flow.dispatch.workload.ordinal %arg4, 0 : index |
|
%20 = flow.dispatch.workload.ordinal %arg5, 1 : index |
|
%21 = flow.dispatch.workload.ordinal %arg6, 2 : index |
|
%22 = flow.dispatch.workload.ordinal %arg7, 3 : index |
|
%23 = flow.dispatch.tie_shape %arg3 : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%19, %20} |
|
%24 = flow.dispatch.tie_shape %arg8 : !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%21, %22} |
|
%25 = flow.dispatch.tensor.load %23, offsets = [0, 0], sizes = [%19, %20], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%19, %20} -> tensor<?x?xf32> |
|
%26 = affine.apply #map()[%20] |
|
%27 = affine.apply #map()[%19] |
|
%28 = tensor.empty(%27, %26) : tensor<?x?x16x16xf32> |
|
%pack = tensor.pack %25 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %28 : tensor<?x?xf32> -> tensor<?x?x16x16xf32> |
|
flow.dispatch.tensor.store %pack, %24, offsets = [0, 0, 0, 0], sizes = [%21, %22, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%21, %22} |
|
flow.return |
|
} count(%arg3: index, %arg4: index, %arg5: index, %arg6: index) -> (index, index, index) { |
|
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg3, %arg4, %arg5, %arg6 |
|
flow.return %x, %y, %z : index, index, index |
|
} |
|
%16 = flow.dispatch.workgroups[%9, %1, %11, %3, %13, %14](%10, %12, %15, %9, %1, %11, %3, %13, %14) : (tensor<?x?x16x1xf32>{%9, %1}, tensor<?x?x16x1xf32>{%11, %3}, tensor<?x?x16x16xf32>{%13, %14}, index, index, index, index, index, index) -> %15{%13, %14} = |
|
(%arg3: !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>, %arg4: !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>, %arg5: !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>, %arg6: index, %arg7: index, %arg8: index, %arg9: index, %arg10: index, %arg11: index) { |
|
%19 = flow.dispatch.workload.ordinal %arg6, 0 : index |
|
%20 = flow.dispatch.workload.ordinal %arg7, 1 : index |
|
%21 = flow.dispatch.workload.ordinal %arg8, 2 : index |
|
%22 = flow.dispatch.workload.ordinal %arg9, 3 : index |
|
%23 = flow.dispatch.workload.ordinal %arg10, 4 : index |
|
%24 = flow.dispatch.workload.ordinal %arg11, 5 : index |
|
%25 = flow.dispatch.tie_shape %arg3 : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%19, %20} |
|
%26 = flow.dispatch.tie_shape %arg4 : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%21, %22} |
|
%27 = flow.dispatch.tie_shape %arg5 : !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%23, %24} |
|
%28 = flow.dispatch.tensor.load %25, offsets = [0, 0, 0, 0], sizes = [%19, %20, 16, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%19, %20} -> tensor<?x?x16x1xf32> |
|
%29 = flow.dispatch.tensor.load %26, offsets = [0, 0, 0, 0], sizes = [%21, %22, 16, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%21, %22} -> tensor<?x?x16x1xf32> |
|
%30 = flow.dispatch.tensor.load %27, offsets = [0, 0, 0, 0], sizes = [%23, %24, 16, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%23, %24} -> tensor<?x?x16x16xf32> |
|
%31 = linalg.mmt4d ins(%28, %29 : tensor<?x?x16x1xf32>, tensor<?x?x16x1xf32>) outs(%30 : tensor<?x?x16x16xf32>) -> tensor<?x?x16x16xf32> |
|
flow.dispatch.tensor.store %31, %27, offsets = [0, 0, 0, 0], sizes = [%23, %24, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%23, %24} |
|
flow.return |
|
} count(%arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index) -> (index, index, index) { |
|
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg3, %arg4, %arg5, %arg6, %arg7, %arg8 |
|
flow.return %x, %y, %z : index, index, index |
|
} |
|
%17 = flow.dispatch.workgroups[%13, %14, %6, %7](%16, %13, %14, %6, %7) : (tensor<?x?x16x16xf32>{%13, %14}, index, index, index, index) -> tensor<?x?xf32>{%6, %7} = |
|
(%arg3: !flow.dispatch.tensor<readonly:tensor<?x?x16x16xf32>>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>) { |
|
%19 = flow.dispatch.workload.ordinal %arg4, 0 : index |
|
%20 = flow.dispatch.workload.ordinal %arg5, 1 : index |
|
%21 = flow.dispatch.workload.ordinal %arg6, 2 : index |
|
%22 = flow.dispatch.workload.ordinal %arg7, 3 : index |
|
%23 = flow.dispatch.tie_shape %arg3 : !flow.dispatch.tensor<readonly:tensor<?x?x16x16xf32>>{%19, %20} |
|
%24 = flow.dispatch.tie_shape %arg8 : !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%21, %22} |
|
%25 = flow.dispatch.tensor.load %23, offsets = [0, 0, 0, 0], sizes = [%19, %20, 16, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x16xf32>>{%19, %20} -> tensor<?x?x16x16xf32> |
|
%26 = tensor.empty(%21, %22) : tensor<?x?xf32> |
|
%unpack = tensor.unpack %25 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %26 : tensor<?x?x16x16xf32> -> tensor<?x?xf32> |
|
flow.dispatch.tensor.store %unpack, %24, offsets = [0, 0], sizes = [%21, %22], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%21, %22} |
|
flow.return |
|
} count(%arg3: index, %arg4: index, %arg5: index, %arg6: index) -> (index, index, index) { |
|
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg3, %arg4, %arg5, %arg6 |
|
flow.return %x, %y, %z : index, index, index |
|
} |
|
%18 = hal.tensor.export %17 "output0" : tensor<?x?xf32>{%6, %7} -> !hal.buffer_view |
|
return %18 : !hal.buffer_view |
|
} |
|
} |
|
|
|
|
|
// -----// IR Dump After OutlineDispatchRegions (iree-flow-outline-dispatch-regions) //----- // |
|
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "znver4", cpu_features = "+mmx,+popcnt,+sse,+sse2,+sse3,+ssse3,+sse4.1,+sse4.2,+avx,+avx2,+sse4a,+fma,+avx512f,+bmi,+bmi2,+aes,+pclmul,+avx512vl,+avx512bw,+avx512dq,+avx512cd,+avx512vbmi,+avx512ifma,+avx512vpopcntdq,+avx512vbmi2,+gfni,+vpclmulqdq,+avx512vnni,+avx512bitalg,+avx512bf16,+adx,+clflushopt,+clwb,+clzero,+cx16,+cx8,+f16c,+fsgsbase,+crc32,+invpcid,+rdpru,+sahf,+lzcnt,+movbe,+mwaitx,+x87,+pku,+prfchw,+rdpid,+rdrnd,+rdseed,+sha,+shstk,+vaes,+wbnoinvd,+xsave,+xsavec,+xsaveopt,+xsaves,+fxsr,+evex512", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "all"}> |
|
#map = affine_map<()[s0] -> (s0 ceildiv 16)> |
|
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#executable_target_embedded_elf_x86_64_]}> |
|
module attributes {hal.device.targets = [#device_target_llvm_cpu]} { |
|
// Executable for dispatch 0: packs a dynamic 2-D f32 tensor into ?x?x16x1 tiles. |
flow.executable private @matmul_dynamic_dispatch_0 { |
|
// Export entry: the workgroup count (%x, %y, %z) is derived from the three workload values. |
flow.executable.export public @matmul_dynamic_dispatch_0 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) { |
|
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2 |
|
flow.return %x, %y, %z : index, index, index |
|
} |
|
builtin.module { |
|
// Entry point: %arg0 is the read-only source, %arg1..%arg3 are the workload values, %arg4 is the write-only packed result. |
func.func @matmul_dynamic_dispatch_0(%arg0: !flow.dispatch.tensor<readonly:tensor<?x?xf32>>, %arg1: index, %arg2: index, %arg3: index, %arg4: !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>) { |
|
// Zero constant used as the pack padding value. |
%cst = arith.constant 0.000000e+00 : f32 |
|
// Tag the index arguments with their workload ordinals (0, 1, 2). |
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index |
|
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index |
|
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index |
|
// Attach dynamic dims: the source is %0 x %2, the packed result has outer dims %1 x %2. |
%3 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %2} |
|
%4 = flow.dispatch.tie_shape %arg4 : !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2} |
|
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [%0, %2], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %2} -> tensor<?x?xf32> |
|
// Outer tile count along source dim 0: #map is (s0 ceildiv 16), applied to %0. |
%6 = affine.apply #map()[%0] |
|
%7 = tensor.empty(%6, %2) : tensor<?x?x16x1xf32> |
|
// Pack with 16x1 inner tiles, using %cst as the padding value. |
%pack = tensor.pack %5 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 1] into %7 : tensor<?x?xf32> -> tensor<?x?x16x1xf32> |
|
// Write the whole packed tensor; the store sizes use %1 where tensor.empty used %6 for the same dimension. |
flow.dispatch.tensor.store %pack, %4, offsets = [0, 0, 0, 0], sizes = [%1, %2, 16, 1], strides = [1, 1, 1, 1] : tensor<?x?x16x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2} |
|
return |
|
} |
|
} |
|
} |
|
// Executable for dispatch 1: packs a dynamic 2-D f32 tensor into ?x?x16x1 tiles with permuted dims. |
flow.executable private @matmul_dynamic_dispatch_1 { |
|
// Export entry: the workgroup count (%x, %y, %z) is derived from the three workload values. |
flow.executable.export public @matmul_dynamic_dispatch_1 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) { |
|
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2 |
|
flow.return %x, %y, %z : index, index, index |
|
} |
|
builtin.module { |
|
// Entry point: %arg0 is the read-only source, %arg1..%arg3 are the workload values, %arg4 is the write-only packed result. |
func.func @matmul_dynamic_dispatch_1(%arg0: !flow.dispatch.tensor<readonly:tensor<?x?xf32>>, %arg1: index, %arg2: index, %arg3: index, %arg4: !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>) { |
|
// Zero constant used as the pack padding value. |
%cst = arith.constant 0.000000e+00 : f32 |
|
// Tag the index arguments with their workload ordinals (0, 1, 2). |
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index |
|
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index |
|
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index |
|
// Attach dynamic dims: the source is %2 x %0, the packed result has outer dims %1 x %2. |
%3 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %0} |
|
%4 = flow.dispatch.tie_shape %arg4 : !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2} |
|
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [%2, %0], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %0} -> tensor<?x?xf32> |
|
// Outer tile count along source dim 1: #map is (s0 ceildiv 16), applied to %0. |
%6 = affine.apply #map()[%0] |
|
%7 = tensor.empty(%6, %2) : tensor<?x?x16x1xf32> |
|
// Pack with inner_dims_pos = [1, 0]: source dim 1 gets the 16-wide tile, dim 0 the 1-wide tile; outer dims are swapped ([1, 0]). |
%pack = tensor.pack %5 padding_value(%cst : f32) outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [16, 1] into %7 : tensor<?x?xf32> -> tensor<?x?x16x1xf32> |
|
// Write the whole packed tensor; the store sizes use %1 where tensor.empty used %6 for the same dimension. |
flow.dispatch.tensor.store %pack, %4, offsets = [0, 0, 0, 0], sizes = [%1, %2, 16, 1], strides = [1, 1, 1, 1] : tensor<?x?x16x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2} |
|
return |
|
} |
|
} |
|
} |
|
// Executable for dispatch 2: packs a dynamic 2-D f32 tensor into ?x?x16x16 tiles. |
flow.executable private @matmul_dynamic_dispatch_2 { |
|
// Export entry: the workgroup count (%x, %y, %z) is derived from the four workload values. |
flow.executable.export public @matmul_dynamic_dispatch_2 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) { |
|
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3 |
|
flow.return %x, %y, %z : index, index, index |
|
} |
|
builtin.module { |
|
// Entry point: %arg0 is the read-only source, %arg1..%arg4 are the workload values, %arg5 is the write-only packed result. |
func.func @matmul_dynamic_dispatch_2(%arg0: !flow.dispatch.tensor<readonly:tensor<?x?xf32>>, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>) { |
|
// Zero constant used as the pack padding value. |
%cst = arith.constant 0.000000e+00 : f32 |
|
// Tag the index arguments with their workload ordinals (0..3). |
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index |
|
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index |
|
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index |
|
%3 = flow.dispatch.workload.ordinal %arg4, 3 : index |
|
// Attach dynamic dims: the source is %0 x %1, the packed result has outer dims %2 x %3. |
%4 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1} |
|
%5 = flow.dispatch.tie_shape %arg5 : !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%2, %3} |
|
%6 = flow.dispatch.tensor.load %4, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1} -> tensor<?x?xf32> |
|
// Outer tile counts: #map is (s0 ceildiv 16); %7 is for source dim 1 (%1), %8 for source dim 0 (%0). |
%7 = affine.apply #map()[%1] |
|
%8 = affine.apply #map()[%0] |
|
%9 = tensor.empty(%8, %7) : tensor<?x?x16x16xf32> |
|
// Pack with 16x16 inner tiles, using %cst as the padding value. |
%pack = tensor.pack %6 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %9 : tensor<?x?xf32> -> tensor<?x?x16x16xf32> |
|
// Write the whole packed tensor; the store sizes use %2 and %3 where tensor.empty used %8 and %7. |
flow.dispatch.tensor.store %pack, %5, offsets = [0, 0, 0, 0], sizes = [%2, %3, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%2, %3} |
|
return |
|
} |
|
} |
|
} |
|
// Executable for dispatch 3: linalg.mmt4d over two ?x?x16x1 operands with a ?x?x16x16 outs tensor. |
flow.executable private @matmul_dynamic_dispatch_3 { |
|
// Export entry: the workgroup count (%x, %y, %z) is derived from the six workload values. |
flow.executable.export public @matmul_dynamic_dispatch_3 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index) -> (index, index, index) { |
|
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3, %arg4, %arg5 |
|
flow.return %x, %y, %z : index, index, index |
|
} |
|
builtin.module { |
|
// Entry point: %arg0 and %arg1 are read-only operands, %arg2 is a read-write tensor, %arg3..%arg8 are the workload values. |
func.func @matmul_dynamic_dispatch_3(%arg0: !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>, %arg1: !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>, %arg2: !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index) { |
|
// Tag the index arguments with their workload ordinals (0..5). |
%0 = flow.dispatch.workload.ordinal %arg3, 0 : index |
|
%1 = flow.dispatch.workload.ordinal %arg4, 1 : index |
|
%2 = flow.dispatch.workload.ordinal %arg5, 2 : index |
|
%3 = flow.dispatch.workload.ordinal %arg6, 3 : index |
|
%4 = flow.dispatch.workload.ordinal %arg7, 4 : index |
|
%5 = flow.dispatch.workload.ordinal %arg8, 5 : index |
|
// Attach dynamic outer dims: %arg0 is %0 x %1, %arg1 is %2 x %3, %arg2 is %4 x %5. |
%6 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%0, %1} |
|
%7 = flow.dispatch.tie_shape %arg1 : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%2, %3} |
|
%8 = flow.dispatch.tie_shape %arg2 : !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5} |
|
// Load both operands and the current contents of the read-write tensor in full. |
%9 = flow.dispatch.tensor.load %6, offsets = [0, 0, 0, 0], sizes = [%0, %1, 16, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%0, %1} -> tensor<?x?x16x1xf32> |
|
%10 = flow.dispatch.tensor.load %7, offsets = [0, 0, 0, 0], sizes = [%2, %3, 16, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%2, %3} -> tensor<?x?x16x1xf32> |
|
%11 = flow.dispatch.tensor.load %8, offsets = [0, 0, 0, 0], sizes = [%4, %5, 16, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5} -> tensor<?x?x16x16xf32> |
|
// The loaded read-write contents (%11) are the outs operand of the mmt4d. |
%12 = linalg.mmt4d ins(%9, %10 : tensor<?x?x16x1xf32>, tensor<?x?x16x1xf32>) outs(%11 : tensor<?x?x16x16xf32>) -> tensor<?x?x16x16xf32> |
|
// Store the result back to the same read-write tensor (%8) that %11 was loaded from. |
flow.dispatch.tensor.store %12, %8, offsets = [0, 0, 0, 0], sizes = [%4, %5, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5} |
|
return |
|
} |
|
} |
|
} |
|
// Executable for dispatch 4: unpacks a ?x?x16x16 tiled f32 tensor into a dynamic 2-D tensor. |
flow.executable private @matmul_dynamic_dispatch_4 { |
|
// Export entry: the workgroup count (%x, %y, %z) is derived from the four workload values. |
flow.executable.export public @matmul_dynamic_dispatch_4 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) { |
|
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3 |
|
flow.return %x, %y, %z : index, index, index |
|
} |
|
builtin.module { |
|
// Entry point: %arg0 is the read-only tiled source, %arg1..%arg4 are the workload values, %arg5 is the write-only result. |
func.func @matmul_dynamic_dispatch_4(%arg0: !flow.dispatch.tensor<readonly:tensor<?x?x16x16xf32>>, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>) { |
|
// Tag the index arguments with their workload ordinals (0..3). |
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index |
|
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index |
|
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index |
|
%3 = flow.dispatch.workload.ordinal %arg4, 3 : index |
|
// Attach dynamic dims: the tiled source has outer dims %0 x %1, the result is %2 x %3. |
%4 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x?x16x16xf32>>{%0, %1} |
|
%5 = flow.dispatch.tie_shape %arg5 : !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%2, %3} |
|
%6 = flow.dispatch.tensor.load %4, offsets = [0, 0, 0, 0], sizes = [%0, %1, 16, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x16xf32>>{%0, %1} -> tensor<?x?x16x16xf32> |
|
// Destination for the unpack, sized %2 x %3. |
%7 = tensor.empty(%2, %3) : tensor<?x?xf32> |
|
// Unpack the 16x16 inner tiles into the 2-D destination. |
%unpack = tensor.unpack %6 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %7 : tensor<?x?x16x16xf32> -> tensor<?x?xf32> |
|
flow.dispatch.tensor.store %unpack, %5, offsets = [0, 0], sizes = [%2, %3], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%2, %3} |
|
return |
|
} |
|
} |
|
} |
|
// Entry function (marked iree.abi.stub): imports three buffer views as tensor<?x?xf32>, runs dispatches 0-4, and exports the result. |
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} { |
|
// Query both dims of each buffer view, then import it as a dynamically shaped tensor. |
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index |
|
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index |
|
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xf32>{%0, %1} |
|
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index |
|
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index |
|
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xf32>{%3, %4} |
|
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index |
|
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index |
|
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xf32>{%6, %7} |
|
// %9 = ceildiv(dim 0 of input0, 16); dispatch 0 turns input0 into a ?x?x16x1 tensor with outer dims %9 x %1. |
%9 = affine.apply #map()[%0] |
|
%10 = flow.dispatch @matmul_dynamic_dispatch_0::@matmul_dynamic_dispatch_0[%0, %9, %1](%2, %0, %9, %1) : (tensor<?x?xf32>{%0, %1}, index, index, index) -> tensor<?x?x16x1xf32>{%9, %1} |
|
// %11 = ceildiv(dim 1 of input1, 16); dispatch 1 turns input1 into a ?x?x16x1 tensor with outer dims %11 x %3. |
%11 = affine.apply #map()[%4] |
|
%12 = flow.dispatch @matmul_dynamic_dispatch_1::@matmul_dynamic_dispatch_1[%4, %11, %3](%5, %4, %11, %3) : (tensor<?x?xf32>{%3, %4}, index, index, index) -> tensor<?x?x16x1xf32>{%11, %3} |
|
// %13, %14 = ceildiv of input2's dims by 16; dispatch 2 turns input2 into a ?x?x16x16 tensor with outer dims %13 x %14. |
%13 = affine.apply #map()[%6] |
|
%14 = affine.apply #map()[%7] |
|
%15 = flow.dispatch @matmul_dynamic_dispatch_2::@matmul_dynamic_dispatch_2[%6, %7, %13, %14](%8, %6, %7, %13, %14) : (tensor<?x?xf32>{%6, %7}, index, index, index, index) -> tensor<?x?x16x16xf32>{%13, %14} |
|
// Dispatch 3 takes the three packed tensors; its result is written as %15{%13, %14}, i.e. it names operand %15 rather than a new tensor type. |
%16 = flow.dispatch @matmul_dynamic_dispatch_3::@matmul_dynamic_dispatch_3[%9, %1, %11, %3, %13, %14](%10, %12, %15, %9, %1, %11, %3, %13, %14) : (tensor<?x?x16x1xf32>{%9, %1}, tensor<?x?x16x1xf32>{%11, %3}, tensor<?x?x16x16xf32>{%13, %14}, index, index, index, index, index, index) -> %15{%13, %14} |
|
// Dispatch 4 produces a tensor<?x?xf32> with input2's dims (%6 x %7). |
%17 = flow.dispatch @matmul_dynamic_dispatch_4::@matmul_dynamic_dispatch_4[%13, %14, %6, %7](%16, %13, %14, %6, %7) : (tensor<?x?x16x16xf32>{%13, %14}, index, index, index, index) -> tensor<?x?xf32>{%6, %7} |
|
// Export the result as the buffer view named "output0". |
%18 = hal.tensor.export %17 "output0" : tensor<?x?xf32>{%6, %7} -> !hal.buffer_view |
|
return %18 : !hal.buffer_view |
|
} |
|
} |
|
|
|
|
|
// -----// IR Dump After AnnotateDispatches (iree-flow-annotate-dispatches) //----- // |
|
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "znver4", cpu_features = "+mmx,+popcnt,+sse,+sse2,+sse3,+ssse3,+sse4.1,+sse4.2,+avx,+avx2,+sse4a,+fma,+avx512f,+bmi,+bmi2,+aes,+pclmul,+avx512vl,+avx512bw,+avx512dq,+avx512cd,+avx512vbmi,+avx512ifma,+avx512vpopcntdq,+avx512vbmi2,+gfni,+vpclmulqdq,+avx512vnni,+avx512bitalg,+avx512bf16,+adx,+clflushopt,+clwb,+clzero,+cx16,+cx8,+f16c,+fsgsbase,+crc32,+invpcid,+rdpru,+sahf,+lzcnt,+movbe,+mwaitx,+x87,+pku,+prfchw,+rdpid,+rdrnd,+rdseed,+sha,+shstk,+vaes,+wbnoinvd,+xsave,+xsavec,+xsaveopt,+xsaves,+fxsr,+evex512", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "all"}> |
|
#map = affine_map<()[s0] -> (s0 ceildiv 16)> |
|
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#executable_target_embedded_elf_x86_64_]}> |
|
module attributes {hal.device.targets = [#device_target_llvm_cpu]} { |
|
// Executable for dispatch 0: packs a dynamic 2-D f32 tensor into ?x?x16x1 tiles. |
flow.executable private @matmul_dynamic_dispatch_0 { |
|
// Export entry (named with a _pack_f32 suffix): the workgroup count is derived from the three workload values. |
flow.executable.export public @matmul_dynamic_dispatch_0_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) { |
|
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2 |
|
flow.return %x, %y, %z : index, index, index |
|
} |
|
builtin.module { |
|
// Entry point: %arg0 is the read-only source, %arg1..%arg3 are the workload values, %arg4 is the write-only packed result. |
func.func @matmul_dynamic_dispatch_0_pack_f32(%arg0: !flow.dispatch.tensor<readonly:tensor<?x?xf32>>, %arg1: index, %arg2: index, %arg3: index, %arg4: !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>) { |
|
// Zero constant used as the pack padding value. |
%cst = arith.constant 0.000000e+00 : f32 |
|
// Tag the index arguments with their workload ordinals (0, 1, 2). |
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index |
|
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index |
|
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index |
|
// Attach dynamic dims: the source is %0 x %2, the packed result has outer dims %1 x %2. |
%3 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %2} |
|
%4 = flow.dispatch.tie_shape %arg4 : !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2} |
|
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [%0, %2], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %2} -> tensor<?x?xf32> |
|
// Outer tile count along source dim 0: #map is (s0 ceildiv 16), applied to %0. |
%6 = affine.apply #map()[%0] |
|
%7 = tensor.empty(%6, %2) : tensor<?x?x16x1xf32> |
|
// Pack with 16x1 inner tiles, using %cst as the padding value. |
%pack = tensor.pack %5 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 1] into %7 : tensor<?x?xf32> -> tensor<?x?x16x1xf32> |
|
// Write the whole packed tensor; the store sizes use %1 where tensor.empty used %6 for the same dimension. |
flow.dispatch.tensor.store %pack, %4, offsets = [0, 0, 0, 0], sizes = [%1, %2, 16, 1], strides = [1, 1, 1, 1] : tensor<?x?x16x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2} |
|
return |
|
} |
|
} |
|
} |
|
// Executable for dispatch 1: packs a dynamic 2-D f32 tensor into ?x?x16x1 tiles with permuted dims. |
flow.executable private @matmul_dynamic_dispatch_1 { |
|
// Export entry (named with a _pack_f32 suffix): the workgroup count is derived from the three workload values. |
flow.executable.export public @matmul_dynamic_dispatch_1_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) { |
|
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2 |
|
flow.return %x, %y, %z : index, index, index |
|
} |
|
builtin.module { |
|
// Entry point: %arg0 is the read-only source, %arg1..%arg3 are the workload values, %arg4 is the write-only packed result. |
func.func @matmul_dynamic_dispatch_1_pack_f32(%arg0: !flow.dispatch.tensor<readonly:tensor<?x?xf32>>, %arg1: index, %arg2: index, %arg3: index, %arg4: !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>) { |
|
// Zero constant used as the pack padding value. |
%cst = arith.constant 0.000000e+00 : f32 |
|
// Tag the index arguments with their workload ordinals (0, 1, 2). |
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index |
|
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index |
|
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index |
|
// Attach dynamic dims: the source is %2 x %0, the packed result has outer dims %1 x %2. |
%3 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %0} |
|
%4 = flow.dispatch.tie_shape %arg4 : !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2} |
|
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [%2, %0], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %0} -> tensor<?x?xf32> |
|
// Outer tile count along source dim 1: #map is (s0 ceildiv 16), applied to %0. |
%6 = affine.apply #map()[%0] |
|
%7 = tensor.empty(%6, %2) : tensor<?x?x16x1xf32> |
|
// Pack with inner_dims_pos = [1, 0]: source dim 1 gets the 16-wide tile, dim 0 the 1-wide tile; outer dims are swapped ([1, 0]). |
%pack = tensor.pack %5 padding_value(%cst : f32) outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [16, 1] into %7 : tensor<?x?xf32> -> tensor<?x?x16x1xf32> |
|
// Write the whole packed tensor; the store sizes use %1 where tensor.empty used %6 for the same dimension. |
flow.dispatch.tensor.store %pack, %4, offsets = [0, 0, 0, 0], sizes = [%1, %2, 16, 1], strides = [1, 1, 1, 1] : tensor<?x?x16x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2} |
|
return |
|
} |
|
} |
|
} |
|
// Executable for dispatch 2: packs a dynamic 2-D f32 tensor into ?x?x16x16 tiles. |
flow.executable private @matmul_dynamic_dispatch_2 { |
|
// Export entry (named with a _pack_f32 suffix): the workgroup count is derived from the four workload values. |
flow.executable.export public @matmul_dynamic_dispatch_2_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) { |
|
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3 |
|
flow.return %x, %y, %z : index, index, index |
|
} |
|
builtin.module { |
|
// Entry point: %arg0 is the read-only source, %arg1..%arg4 are the workload values, %arg5 is the write-only packed result. |
func.func @matmul_dynamic_dispatch_2_pack_f32(%arg0: !flow.dispatch.tensor<readonly:tensor<?x?xf32>>, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>) { |
|
// Zero constant used as the pack padding value. |
%cst = arith.constant 0.000000e+00 : f32 |
|
// Tag the index arguments with their workload ordinals (0..3). |
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index |
|
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index |
|
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index |
|
%3 = flow.dispatch.workload.ordinal %arg4, 3 : index |
|
// Attach dynamic dims: the source is %0 x %1, the packed result has outer dims %2 x %3. |
%4 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1} |
|
%5 = flow.dispatch.tie_shape %arg5 : !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%2, %3} |
|
%6 = flow.dispatch.tensor.load %4, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1} -> tensor<?x?xf32> |
|
// Outer tile counts: #map is (s0 ceildiv 16); %7 is for source dim 1 (%1), %8 for source dim 0 (%0). |
%7 = affine.apply #map()[%1] |
|
%8 = affine.apply #map()[%0] |
|
%9 = tensor.empty(%8, %7) : tensor<?x?x16x16xf32> |
|
// Pack with 16x16 inner tiles, using %cst as the padding value. |
%pack = tensor.pack %6 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %9 : tensor<?x?xf32> -> tensor<?x?x16x16xf32> |
|
// Write the whole packed tensor; the store sizes use %2 and %3 where tensor.empty used %8 and %7. |
flow.dispatch.tensor.store %pack, %5, offsets = [0, 0, 0, 0], sizes = [%2, %3, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%2, %3} |
|
return |
|
} |
|
} |
|
} |
|
// Executable for dispatch 3: linalg.mmt4d over two ?x?x16x1 operands with a ?x?x16x16 outs tensor. |
flow.executable private @matmul_dynamic_dispatch_3 { |
|
// Export entry (named with a _mmt4d_DxDxDx16x16x1_f32 suffix): the workgroup count is derived from the six workload values. |
flow.executable.export public @matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index) -> (index, index, index) { |
|
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3, %arg4, %arg5 |
|
flow.return %x, %y, %z : index, index, index |
|
} |
|
builtin.module { |
|
// Entry point: %arg0 and %arg1 are read-only operands, %arg2 is a read-write tensor, %arg3..%arg8 are the workload values. |
func.func @matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32(%arg0: !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>, %arg1: !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>, %arg2: !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index) { |
|
// Tag the index arguments with their workload ordinals (0..5). |
%0 = flow.dispatch.workload.ordinal %arg3, 0 : index |
|
%1 = flow.dispatch.workload.ordinal %arg4, 1 : index |
|
%2 = flow.dispatch.workload.ordinal %arg5, 2 : index |
|
%3 = flow.dispatch.workload.ordinal %arg6, 3 : index |
|
%4 = flow.dispatch.workload.ordinal %arg7, 4 : index |
|
%5 = flow.dispatch.workload.ordinal %arg8, 5 : index |
|
// Attach dynamic outer dims: %arg0 is %0 x %1, %arg1 is %2 x %3, %arg2 is %4 x %5. |
%6 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%0, %1} |
|
%7 = flow.dispatch.tie_shape %arg1 : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%2, %3} |
|
%8 = flow.dispatch.tie_shape %arg2 : !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5} |
|
// Load both operands and the current contents of the read-write tensor in full. |
%9 = flow.dispatch.tensor.load %6, offsets = [0, 0, 0, 0], sizes = [%0, %1, 16, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%0, %1} -> tensor<?x?x16x1xf32> |
|
%10 = flow.dispatch.tensor.load %7, offsets = [0, 0, 0, 0], sizes = [%2, %3, 16, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%2, %3} -> tensor<?x?x16x1xf32> |
|
%11 = flow.dispatch.tensor.load %8, offsets = [0, 0, 0, 0], sizes = [%4, %5, 16, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5} -> tensor<?x?x16x16xf32> |
|
// The loaded read-write contents (%11) are the outs operand of the mmt4d. |
%12 = linalg.mmt4d ins(%9, %10 : tensor<?x?x16x1xf32>, tensor<?x?x16x1xf32>) outs(%11 : tensor<?x?x16x16xf32>) -> tensor<?x?x16x16xf32> |
|
// Store the result back to the same read-write tensor (%8) that %11 was loaded from. |
flow.dispatch.tensor.store %12, %8, offsets = [0, 0, 0, 0], sizes = [%4, %5, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5} |
|
return |
|
} |
|
} |
|
} |
|
// Executable for dispatch 4: unpacks a ?x?x16x16 tiled f32 tensor into a dynamic 2-D tensor. |
flow.executable private @matmul_dynamic_dispatch_4 { |
|
// Export entry (named with an _unpack_f32 suffix): the workgroup count is derived from the four workload values. |
flow.executable.export public @matmul_dynamic_dispatch_4_unpack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) { |
|
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3 |
|
flow.return %x, %y, %z : index, index, index |
|
} |
|
builtin.module { |
|
// Entry point: %arg0 is the read-only tiled source, %arg1..%arg4 are the workload values, %arg5 is the write-only result. |
func.func @matmul_dynamic_dispatch_4_unpack_f32(%arg0: !flow.dispatch.tensor<readonly:tensor<?x?x16x16xf32>>, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>) { |
|
// Tag the index arguments with their workload ordinals (0..3). |
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index |
|
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index |
|
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index |
|
%3 = flow.dispatch.workload.ordinal %arg4, 3 : index |
|
// Attach dynamic dims: the tiled source has outer dims %0 x %1, the result is %2 x %3. |
%4 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x?x16x16xf32>>{%0, %1} |
|
%5 = flow.dispatch.tie_shape %arg5 : !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%2, %3} |
|
%6 = flow.dispatch.tensor.load %4, offsets = [0, 0, 0, 0], sizes = [%0, %1, 16, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x16xf32>>{%0, %1} -> tensor<?x?x16x16xf32> |
|
// Destination for the unpack, sized %2 x %3. |
%7 = tensor.empty(%2, %3) : tensor<?x?xf32> |
|
// Unpack the 16x16 inner tiles into the 2-D destination. |
%unpack = tensor.unpack %6 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %7 : tensor<?x?x16x16xf32> -> tensor<?x?xf32> |
|
flow.dispatch.tensor.store %unpack, %5, offsets = [0, 0], sizes = [%2, %3], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%2, %3} |
|
return |
|
} |
|
} |
|
} |
|
// Entry function (marked iree.abi.stub): imports three buffer views as tensor<?x?xf32>, runs dispatches 0-4, and exports the result. |
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} { |
|
// Query both dims of each buffer view, then import it as a dynamically shaped tensor. |
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index |
|
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index |
|
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xf32>{%0, %1} |
|
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index |
|
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index |
|
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xf32>{%3, %4} |
|
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index |
|
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index |
|
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xf32>{%6, %7} |
|
// %9 = ceildiv(dim 0 of input0, 16); the _pack_f32 entry of dispatch 0 turns input0 into a ?x?x16x1 tensor with outer dims %9 x %1. |
%9 = affine.apply #map()[%0] |
|
%10 = flow.dispatch @matmul_dynamic_dispatch_0::@matmul_dynamic_dispatch_0_pack_f32[%0, %9, %1](%2, %0, %9, %1) : (tensor<?x?xf32>{%0, %1}, index, index, index) -> tensor<?x?x16x1xf32>{%9, %1} |
|
// %11 = ceildiv(dim 1 of input1, 16); dispatch 1 turns input1 into a ?x?x16x1 tensor with outer dims %11 x %3. |
%11 = affine.apply #map()[%4] |
|
%12 = flow.dispatch @matmul_dynamic_dispatch_1::@matmul_dynamic_dispatch_1_pack_f32[%4, %11, %3](%5, %4, %11, %3) : (tensor<?x?xf32>{%3, %4}, index, index, index) -> tensor<?x?x16x1xf32>{%11, %3} |
|
// %13, %14 = ceildiv of input2's dims by 16; dispatch 2 turns input2 into a ?x?x16x16 tensor with outer dims %13 x %14. |
%13 = affine.apply #map()[%6] |
|
%14 = affine.apply #map()[%7] |
|
%15 = flow.dispatch @matmul_dynamic_dispatch_2::@matmul_dynamic_dispatch_2_pack_f32[%6, %7, %13, %14](%8, %6, %7, %13, %14) : (tensor<?x?xf32>{%6, %7}, index, index, index, index) -> tensor<?x?x16x16xf32>{%13, %14} |
|
// Dispatch 3 takes the three packed tensors; its result is written as %15{%13, %14}, i.e. it names operand %15 rather than a new tensor type. |
%16 = flow.dispatch @matmul_dynamic_dispatch_3::@matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32[%9, %1, %11, %3, %13, %14](%10, %12, %15, %9, %1, %11, %3, %13, %14) : (tensor<?x?x16x1xf32>{%9, %1}, tensor<?x?x16x1xf32>{%11, %3}, tensor<?x?x16x16xf32>{%13, %14}, index, index, index, index, index, index) -> %15{%13, %14} |
|
// Dispatch 4 produces a tensor<?x?xf32> with input2's dims (%6 x %7). |
%17 = flow.dispatch @matmul_dynamic_dispatch_4::@matmul_dynamic_dispatch_4_unpack_f32[%13, %14, %6, %7](%16, %13, %14, %6, %7) : (tensor<?x?x16x16xf32>{%13, %14}, index, index, index, index) -> tensor<?x?xf32>{%6, %7} |
|
// Export the result as the buffer view named "output0". |
%18 = hal.tensor.export %17 "output0" : tensor<?x?xf32>{%6, %7} -> !hal.buffer_view |
|
return %18 : !hal.buffer_view |
|
} |
|
} |
|
|
|
|
|
// -----// IR Dump After StripDebugOps (iree-util-strip-debug-ops) //----- // |
|
// Executable for dispatch 0: packs a dynamic 2-D f32 tensor into ?x?x16x1 tiles. |
flow.executable private @matmul_dynamic_dispatch_0 { |
|
// Export entry: the workgroup count (%x, %y, %z) is derived from the three workload values. |
flow.executable.export public @matmul_dynamic_dispatch_0_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) { |
|
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2 |
|
flow.return %x, %y, %z : index, index, index |
|
} |
|
builtin.module { |
|
// Entry point: %arg0 is the read-only source, %arg1..%arg3 are the workload values, %arg4 is the write-only packed result. |
func.func @matmul_dynamic_dispatch_0_pack_f32(%arg0: !flow.dispatch.tensor<readonly:tensor<?x?xf32>>, %arg1: index, %arg2: index, %arg3: index, %arg4: !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>) { |
|
// Zero constant used as the pack padding value. |
%cst = arith.constant 0.000000e+00 : f32 |
|
// Tag the index arguments with their workload ordinals (0, 1, 2). |
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index |
|
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index |
|
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index |
|
// Attach dynamic dims: the source is %0 x %2, the packed result has outer dims %1 x %2. |
%3 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %2} |
|
%4 = flow.dispatch.tie_shape %arg4 : !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2} |
|
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [%0, %2], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %2} -> tensor<?x?xf32> |
|
// Outer tile count along source dim 0: ceildiv(%0, 16), with the affine map written inline. |
%6 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%0] |
|
%7 = tensor.empty(%6, %2) : tensor<?x?x16x1xf32> |
|
// Pack with 16x1 inner tiles, using %cst as the padding value. |
%pack = tensor.pack %5 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 1] into %7 : tensor<?x?xf32> -> tensor<?x?x16x1xf32> |
|
// Write the whole packed tensor; the store sizes use %1 where tensor.empty used %6 for the same dimension. |
flow.dispatch.tensor.store %pack, %4, offsets = [0, 0, 0, 0], sizes = [%1, %2, 16, 1], strides = [1, 1, 1, 1] : tensor<?x?x16x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2} |
|
return |
|
} |
|
} |
|
} |
|
|
|
// -----// IR Dump After StripDebugOps (iree-util-strip-debug-ops) //----- // |
|
// Executable for dispatch 1: packs a dynamic 2-D f32 tensor into ?x?x16x1 tiles with permuted dims. |
flow.executable private @matmul_dynamic_dispatch_1 { |
|
// Export entry: the workgroup count (%x, %y, %z) is derived from the three workload values. |
flow.executable.export public @matmul_dynamic_dispatch_1_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) { |
|
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2 |
|
flow.return %x, %y, %z : index, index, index |
|
} |
|
builtin.module { |
|
// Entry point: %arg0 is the read-only source, %arg1..%arg3 are the workload values, %arg4 is the write-only packed result. |
func.func @matmul_dynamic_dispatch_1_pack_f32(%arg0: !flow.dispatch.tensor<readonly:tensor<?x?xf32>>, %arg1: index, %arg2: index, %arg3: index, %arg4: !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>) { |
|
// Zero constant used as the pack padding value. |
%cst = arith.constant 0.000000e+00 : f32 |
|
// Tag the index arguments with their workload ordinals (0, 1, 2). |
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index |
|
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index |
|
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index |
|
// Attach dynamic dims: the source is %2 x %0, the packed result has outer dims %1 x %2. |
%3 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %0} |
|
%4 = flow.dispatch.tie_shape %arg4 : !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2} |
|
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [%2, %0], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %0} -> tensor<?x?xf32> |
|
// Outer tile count along source dim 1: ceildiv(%0, 16), with the affine map written inline. |
%6 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%0] |
|
%7 = tensor.empty(%6, %2) : tensor<?x?x16x1xf32> |
|
// Pack with inner_dims_pos = [1, 0]: source dim 1 gets the 16-wide tile, dim 0 the 1-wide tile; outer dims are swapped ([1, 0]). |
%pack = tensor.pack %5 padding_value(%cst : f32) outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [16, 1] into %7 : tensor<?x?xf32> -> tensor<?x?x16x1xf32> |
|
// Write the whole packed tensor; the store sizes use %1 where tensor.empty used %6 for the same dimension. |
flow.dispatch.tensor.store %pack, %4, offsets = [0, 0, 0, 0], sizes = [%1, %2, 16, 1], strides = [1, 1, 1, 1] : tensor<?x?x16x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2} |
|
return |
|
} |
|
} |
|
} |
|
|
|
// -----// IR Dump After StripDebugOps (iree-util-strip-debug-ops) //----- // |
|
// Executable for dispatch 2: packs a dynamic 2-D f32 tensor into ?x?x16x16 tiles. |
flow.executable private @matmul_dynamic_dispatch_2 { |
|
// Export entry: the workgroup count (%x, %y, %z) is derived from the four workload values. |
flow.executable.export public @matmul_dynamic_dispatch_2_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) { |
|
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3 |
|
flow.return %x, %y, %z : index, index, index |
|
} |
|
builtin.module { |
|
// Entry point: %arg0 is the read-only source, %arg1..%arg4 are the workload values, %arg5 is the write-only packed result. |
func.func @matmul_dynamic_dispatch_2_pack_f32(%arg0: !flow.dispatch.tensor<readonly:tensor<?x?xf32>>, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>) { |
|
// Zero constant used as the pack padding value. |
%cst = arith.constant 0.000000e+00 : f32 |
|
// Tag the index arguments with their workload ordinals (0..3). |
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index |
|
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index |
|
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index |
|
%3 = flow.dispatch.workload.ordinal %arg4, 3 : index |
|
// Attach dynamic dims: the source is %0 x %1, the packed result has outer dims %2 x %3. |
%4 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1} |
|
%5 = flow.dispatch.tie_shape %arg5 : !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%2, %3} |
|
%6 = flow.dispatch.tensor.load %4, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1} -> tensor<?x?xf32> |
|
// Outer tile counts via an inline ceildiv-by-16 map: %7 is for source dim 1 (%1), %8 for source dim 0 (%0). |
%7 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%1] |
|
%8 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%0] |
|
%9 = tensor.empty(%8, %7) : tensor<?x?x16x16xf32> |
|
// Pack with 16x16 inner tiles, using %cst as the padding value. |
%pack = tensor.pack %6 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %9 : tensor<?x?xf32> -> tensor<?x?x16x16xf32> |
|
// Write the whole packed tensor; the store sizes use %2 and %3 where tensor.empty used %8 and %7. |
flow.dispatch.tensor.store %pack, %5, offsets = [0, 0, 0, 0], sizes = [%2, %3, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%2, %3} |
|
return |
|
} |
|
} |
|
} |
|
|
|
// -----// IR Dump After StripDebugOps (iree-util-strip-debug-ops) //----- // |
|
// Executable for dispatch 3: linalg.mmt4d over two ?x?x16x1 operands with a ?x?x16x16 outs tensor. |
flow.executable private @matmul_dynamic_dispatch_3 { |
|
// Export entry: the workgroup count (%x, %y, %z) is derived from the six workload values. |
flow.executable.export public @matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index) -> (index, index, index) { |
|
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3, %arg4, %arg5 |
|
flow.return %x, %y, %z : index, index, index |
|
} |
|
builtin.module { |
|
// Entry point: %arg0 and %arg1 are read-only operands, %arg2 is a read-write tensor, %arg3..%arg8 are the workload values. |
func.func @matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32(%arg0: !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>, %arg1: !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>, %arg2: !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index) { |
|
// Tag the index arguments with their workload ordinals (0..5). |
%0 = flow.dispatch.workload.ordinal %arg3, 0 : index |
|
%1 = flow.dispatch.workload.ordinal %arg4, 1 : index |
|
%2 = flow.dispatch.workload.ordinal %arg5, 2 : index |
|
%3 = flow.dispatch.workload.ordinal %arg6, 3 : index |
|
%4 = flow.dispatch.workload.ordinal %arg7, 4 : index |
|
%5 = flow.dispatch.workload.ordinal %arg8, 5 : index |
|
// Attach dynamic outer dims: %arg0 is %0 x %1, %arg1 is %2 x %3, %arg2 is %4 x %5. |
%6 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%0, %1} |
|
%7 = flow.dispatch.tie_shape %arg1 : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%2, %3} |
|
%8 = flow.dispatch.tie_shape %arg2 : !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5} |
|
// Load both operands and the current contents of the read-write tensor in full. |
%9 = flow.dispatch.tensor.load %6, offsets = [0, 0, 0, 0], sizes = [%0, %1, 16, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%0, %1} -> tensor<?x?x16x1xf32> |
|
%10 = flow.dispatch.tensor.load %7, offsets = [0, 0, 0, 0], sizes = [%2, %3, 16, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%2, %3} -> tensor<?x?x16x1xf32> |
|
%11 = flow.dispatch.tensor.load %8, offsets = [0, 0, 0, 0], sizes = [%4, %5, 16, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5} -> tensor<?x?x16x16xf32> |
|
// The loaded read-write contents (%11) are the outs operand of the mmt4d. |
%12 = linalg.mmt4d ins(%9, %10 : tensor<?x?x16x1xf32>, tensor<?x?x16x1xf32>) outs(%11 : tensor<?x?x16x16xf32>) -> tensor<?x?x16x16xf32> |
|
// Store the result back to the same read-write tensor (%8) that %11 was loaded from. |
flow.dispatch.tensor.store %12, %8, offsets = [0, 0, 0, 0], sizes = [%4, %5, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5} |
|
return |
|
} |
|
} |
|
} |
|
|
|
// -----// IR Dump After StripDebugOps (iree-util-strip-debug-ops) //----- // |
|
// Executable for dispatch 4: unpacks a ?x?x16x16 tiled f32 tensor back into a 2-D ?x? tensor. |
flow.executable private @matmul_dynamic_dispatch_4 { |

// Export region: turns the four index workload values into x/y/z workgroup counts via workgroup_count_from_slice. |
flow.executable.export public @matmul_dynamic_dispatch_4_unpack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) { |

%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3 |

flow.return %x, %y, %z : index, index, index |

} |

builtin.module { |

// Entry point. %arg0 is the read-only tiled source binding, %arg1..%arg4 are workload values, %arg5 is the write-only 2-D destination binding. |
func.func @matmul_dynamic_dispatch_4_unpack_f32(%arg0: !flow.dispatch.tensor<readonly:tensor<?x?x16x16xf32>>, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>) { |

// Workload ordinals 0..3; below, %0/%1 size the tiled source and %2/%3 size the unpacked destination. |
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index |

%1 = flow.dispatch.workload.ordinal %arg2, 1 : index |

%2 = flow.dispatch.workload.ordinal %arg3, 2 : index |

%3 = flow.dispatch.workload.ordinal %arg4, 3 : index |

// Attach the dynamic dimensions to the source ({%0, %1}) and destination ({%2, %3}) bindings. |
%4 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x?x16x16xf32>>{%0, %1} |

%5 = flow.dispatch.tie_shape %arg5 : !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%2, %3} |

// Load the whole tiled source (zero offsets, unit strides). |
%6 = flow.dispatch.tensor.load %4, offsets = [0, 0, 0, 0], sizes = [%0, %1, 16, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x16xf32>>{%0, %1} -> tensor<?x?x16x16xf32> |

// Destination tensor for the unpack, sized %2 x %3. |
%7 = tensor.empty(%2, %3) : tensor<?x?xf32> |

// Undo the 16x16 tiling on dims 0 and 1, writing into %7. |
%unpack = tensor.unpack %6 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %7 : tensor<?x?x16x16xf32> -> tensor<?x?xf32> |

// Write the full unpacked tensor to the destination binding. |
flow.dispatch.tensor.store %unpack, %5, offsets = [0, 0], sizes = [%2, %3], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%2, %3} |

return |

} |

} |

} |
|
|
|
// -----// IR Dump After Canonicalizer (canonicalize) //----- // |
|
// ABI entry point: takes three !hal.buffer_view arguments and returns one; the reflection attribute declares them as tensor<?x?xf32> inputs and output. |
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} { |

// Query both dimensions of %arg0 and import it as tensor<?x?xf32> with dims {%0, %1}. |
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index |

%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index |

%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xf32>{%0, %1} |

// Same for %arg1, with dims {%3, %4}. |
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index |

%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index |

%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xf32>{%3, %4} |

// Same for %arg2, with dims {%6, %7}. |
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index |

%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index |

%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xf32>{%6, %7} |

// Outer tile count for input0 dim 0 (%0 ceildiv 16), then pack dispatch 0 producing ?x?x16x1 with dims {%9, %1}. |
%9 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%0] |

%10 = flow.dispatch @matmul_dynamic_dispatch_0::@matmul_dynamic_dispatch_0_pack_f32[%0, %9, %1](%2, %0, %9, %1) : (tensor<?x?xf32>{%0, %1}, index, index, index) -> tensor<?x?x16x1xf32>{%9, %1} |

// Outer tile count for input1 dim 1 (%4 ceildiv 16), then pack dispatch 1 producing ?x?x16x1 with dims {%11, %3}. |
%11 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%4] |

%12 = flow.dispatch @matmul_dynamic_dispatch_1::@matmul_dynamic_dispatch_1_pack_f32[%4, %11, %3](%5, %4, %11, %3) : (tensor<?x?xf32>{%3, %4}, index, index, index) -> tensor<?x?x16x1xf32>{%11, %3} |

// Outer tile counts for both dims of input2, then pack dispatch 2 producing ?x?x16x16 with dims {%13, %14}. |
%13 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%6] |

%14 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%7] |

%15 = flow.dispatch @matmul_dynamic_dispatch_2::@matmul_dynamic_dispatch_2_pack_f32[%6, %7, %13, %14](%8, %6, %7, %13, %14) : (tensor<?x?xf32>{%6, %7}, index, index, index, index) -> tensor<?x?x16x16xf32>{%13, %14} |

// mmt4d dispatch over the three packed tensors; its result is spelled %15{%13, %14}, i.e. it names operand %15 instead of a fresh tensor type. |
%16 = flow.dispatch @matmul_dynamic_dispatch_3::@matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32[%9, %1, %11, %3, %13, %14](%10, %12, %15, %9, %1, %11, %3, %13, %14) : (tensor<?x?x16x1xf32>{%9, %1}, tensor<?x?x16x1xf32>{%11, %3}, tensor<?x?x16x16xf32>{%13, %14}, index, index, index, index, index, index) -> %15{%13, %14} |

// Unpack dispatch back to tensor<?x?xf32> with input2's dims {%6, %7}. |
%17 = flow.dispatch @matmul_dynamic_dispatch_4::@matmul_dynamic_dispatch_4_unpack_f32[%13, %14, %6, %7](%16, %13, %14, %6, %7) : (tensor<?x?x16x16xf32>{%13, %14}, index, index, index, index) -> tensor<?x?xf32>{%6, %7} |

// Export the result as the "output0" buffer view and return it. |
%18 = hal.tensor.export %17 "output0" : tensor<?x?xf32>{%6, %7} -> !hal.buffer_view |

return %18 : !hal.buffer_view |

} |
|
|
|
// -----// IR Dump After DeduplicateExecutables (iree-flow-deduplicate-executables) //----- // |
|
// Executable target alias: "llvm-cpu" backend, "embedded-elf-x86_64" format, cpu = "znver4", native_vector_size = 64, ukernels = "all". |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "znver4", cpu_features = "+mmx,+popcnt,+sse,+sse2,+sse3,+ssse3,+sse4.1,+sse4.2,+avx,+avx2,+sse4a,+fma,+avx512f,+bmi,+bmi2,+aes,+pclmul,+avx512vl,+avx512bw,+avx512dq,+avx512cd,+avx512vbmi,+avx512ifma,+avx512vpopcntdq,+avx512vbmi2,+gfni,+vpclmulqdq,+avx512vnni,+avx512bitalg,+avx512bf16,+adx,+clflushopt,+clwb,+clzero,+cx16,+cx8,+f16c,+fsgsbase,+crc32,+invpcid,+rdpru,+sahf,+lzcnt,+movbe,+mwaitx,+x87,+pku,+prfchw,+rdpid,+rdrnd,+rdseed,+sha,+shstk,+vaes,+wbnoinvd,+xsave,+xsavec,+xsaveopt,+xsaves,+fxsr,+evex512", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "all"}> |

// Shared affine map: one symbol s0 mapped to s0 ceildiv 16; used through #map by the affine.apply ops in this module. |
#map = affine_map<()[s0] -> (s0 ceildiv 16)> |

// Device target alias listing the executable target above; referenced by the module's hal.device.targets attribute. |
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#executable_target_embedded_elf_x86_64_]}> |
|
module attributes {hal.device.targets = [#device_target_llvm_cpu]} { |
|
// Executable for dispatch 0: packs a dynamically sized 2-D f32 tensor into ?x?x16x1 tiles. |
flow.executable private @matmul_dynamic_dispatch_0 { |

// Export region: turns the three index workload values into x/y/z workgroup counts via workgroup_count_from_slice. |
flow.executable.export public @matmul_dynamic_dispatch_0_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) { |

%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2 |

flow.return %x, %y, %z : index, index, index |

} |

builtin.module { |

// Entry point. %arg0 is the read-only source binding, %arg1..%arg3 are workload values, %arg4 is the write-only packed destination binding. |
func.func @matmul_dynamic_dispatch_0_pack_f32(%arg0: !flow.dispatch.tensor<readonly:tensor<?x?xf32>>, %arg1: index, %arg2: index, %arg3: index, %arg4: !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>) { |

// Zero constant used as the pack padding value. |
%cst = arith.constant 0.000000e+00 : f32 |

// Workload ordinals 0..2; below, %0/%2 size the source and %1/%2 size the packed destination. |
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index |

%1 = flow.dispatch.workload.ordinal %arg2, 1 : index |

%2 = flow.dispatch.workload.ordinal %arg3, 2 : index |

// Attach the dynamic dimensions to the source ({%0, %2}) and destination ({%1, %2}) bindings. |
%3 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %2} |

%4 = flow.dispatch.tie_shape %arg4 : !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2} |

// Load the whole source tensor (zero offsets, unit strides). |
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [%0, %2], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %2} -> tensor<?x?xf32> |

// Outer tile count along source dim 0: #map applied to %0 (ceildiv 16 per the alias at the top of this dump). |
%6 = affine.apply #map()[%0] |

// Destination tensor for the pack, with outer dims %6 x %2. |
%7 = tensor.empty(%6, %2) : tensor<?x?x16x1xf32> |

// Tile dim 0 by 16 and dim 1 by 1, with %cst as the padding value. |
%pack = tensor.pack %5 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 1] into %7 : tensor<?x?xf32> -> tensor<?x?x16x1xf32> |

// Write the full packed tensor to the destination binding. |
flow.dispatch.tensor.store %pack, %4, offsets = [0, 0, 0, 0], sizes = [%1, %2, 16, 1], strides = [1, 1, 1, 1] : tensor<?x?x16x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2} |

return |

} |

} |

} |
|
// Executable for dispatch 1: packs a dynamically sized 2-D f32 tensor into ?x?x16x1 tiles with the outer dimensions swapped. |
flow.executable private @matmul_dynamic_dispatch_1 { |

// Export region: turns the three index workload values into x/y/z workgroup counts via workgroup_count_from_slice. |
flow.executable.export public @matmul_dynamic_dispatch_1_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) { |

%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2 |

flow.return %x, %y, %z : index, index, index |

} |

builtin.module { |

// Entry point. %arg0 is the read-only source binding, %arg1..%arg3 are workload values, %arg4 is the write-only packed destination binding. |
func.func @matmul_dynamic_dispatch_1_pack_f32(%arg0: !flow.dispatch.tensor<readonly:tensor<?x?xf32>>, %arg1: index, %arg2: index, %arg3: index, %arg4: !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>) { |

// Zero constant used as the pack padding value. |
%cst = arith.constant 0.000000e+00 : f32 |

// Workload ordinals 0..2; below, the source is sized {%2, %0} and the packed destination {%1, %2}. |
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index |

%1 = flow.dispatch.workload.ordinal %arg2, 1 : index |

%2 = flow.dispatch.workload.ordinal %arg3, 2 : index |

// Attach the dynamic dimensions to the source ({%2, %0}) and destination ({%1, %2}) bindings. |
%3 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %0} |

%4 = flow.dispatch.tie_shape %arg4 : !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2} |

// Load the whole source tensor (zero offsets, unit strides). |
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [%2, %0], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %0} -> tensor<?x?xf32> |

// Outer tile count along source dim 1: #map applied to %0 (ceildiv 16 per the alias at the top of this dump). |
%6 = affine.apply #map()[%0] |

// Destination tensor for the pack, with outer dims %6 x %2. |
%7 = tensor.empty(%6, %2) : tensor<?x?x16x1xf32> |

// inner_dims_pos = [1, 0]: source dim 1 is tiled by 16 and source dim 0 by 1; outer_dims_perm = [1, 0] swaps the outer dims. %cst is the padding value. |
%pack = tensor.pack %5 padding_value(%cst : f32) outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [16, 1] into %7 : tensor<?x?xf32> -> tensor<?x?x16x1xf32> |

// Write the full packed tensor to the destination binding. |
flow.dispatch.tensor.store %pack, %4, offsets = [0, 0, 0, 0], sizes = [%1, %2, 16, 1], strides = [1, 1, 1, 1] : tensor<?x?x16x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2} |

return |

} |

} |

} |
|
// Executable for dispatch 2: packs a dynamically sized 2-D f32 tensor into ?x?x16x16 tiles. |
flow.executable private @matmul_dynamic_dispatch_2 { |

// Export region: turns the four index workload values into x/y/z workgroup counts via workgroup_count_from_slice. |
flow.executable.export public @matmul_dynamic_dispatch_2_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) { |

%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3 |

flow.return %x, %y, %z : index, index, index |

} |

builtin.module { |

// Entry point. %arg0 is the read-only source binding, %arg1..%arg4 are workload values, %arg5 is the write-only packed destination binding. |
func.func @matmul_dynamic_dispatch_2_pack_f32(%arg0: !flow.dispatch.tensor<readonly:tensor<?x?xf32>>, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>) { |

// Zero constant used as the pack padding value. |
%cst = arith.constant 0.000000e+00 : f32 |

// Workload ordinals 0..3; below, %0/%1 size the source and %2/%3 size the packed destination. |
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index |

%1 = flow.dispatch.workload.ordinal %arg2, 1 : index |

%2 = flow.dispatch.workload.ordinal %arg3, 2 : index |

%3 = flow.dispatch.workload.ordinal %arg4, 3 : index |

// Attach the dynamic dimensions to the source ({%0, %1}) and destination ({%2, %3}) bindings. |
%4 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1} |

%5 = flow.dispatch.tie_shape %arg5 : !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%2, %3} |

// Load the whole source tensor (zero offsets, unit strides). |
%6 = flow.dispatch.tensor.load %4, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1} -> tensor<?x?xf32> |

// Outer tile counts: %7 from source dim 1 (%1) and %8 from source dim 0 (%0), both through #map (ceildiv 16 per the alias at the top of this dump). |
%7 = affine.apply #map()[%1] |

%8 = affine.apply #map()[%0] |

// Destination tensor for the pack, with outer dims %8 x %7. |
%9 = tensor.empty(%8, %7) : tensor<?x?x16x16xf32> |

// Tile both dims by 16, with %cst as the padding value. |
%pack = tensor.pack %6 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %9 : tensor<?x?xf32> -> tensor<?x?x16x16xf32> |

// Write the full packed tensor to the destination binding. |
flow.dispatch.tensor.store %pack, %5, offsets = [0, 0, 0, 0], sizes = [%2, %3, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%2, %3} |

return |

} |

} |

} |
|
// Executable for dispatch 3: runs linalg.mmt4d on two packed ?x?x16x1 inputs with a ?x?x16x16 readwrite tensor as the outs operand. |
flow.executable private @matmul_dynamic_dispatch_3 { |

// Export region: turns the six index workload values into x/y/z workgroup counts via workgroup_count_from_slice. |
flow.executable.export public @matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index) -> (index, index, index) { |

%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3, %arg4, %arg5 |

flow.return %x, %y, %z : index, index, index |

} |

builtin.module { |

// Entry point. %arg0 and %arg1 are read-only packed inputs, %arg2 is the readwrite packed binding, %arg3..%arg8 are workload values. |
func.func @matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32(%arg0: !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>, %arg1: !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>, %arg2: !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index) { |

// Workload ordinals 0..5; below, each consecutive pair supplies the two dynamic dims of %arg0, %arg1 and %arg2 respectively. |
%0 = flow.dispatch.workload.ordinal %arg3, 0 : index |

%1 = flow.dispatch.workload.ordinal %arg4, 1 : index |

%2 = flow.dispatch.workload.ordinal %arg5, 2 : index |

%3 = flow.dispatch.workload.ordinal %arg6, 3 : index |

%4 = flow.dispatch.workload.ordinal %arg7, 4 : index |

%5 = flow.dispatch.workload.ordinal %arg8, 5 : index |

// Attach the dynamic dimensions to the three bindings. |
%6 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%0, %1} |

%7 = flow.dispatch.tie_shape %arg1 : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%2, %3} |

%8 = flow.dispatch.tie_shape %arg2 : !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5} |

// Load all three tensors in full (zero offsets, unit strides); %11 is read from the readwrite binding. |
%9 = flow.dispatch.tensor.load %6, offsets = [0, 0, 0, 0], sizes = [%0, %1, 16, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%0, %1} -> tensor<?x?x16x1xf32> |

%10 = flow.dispatch.tensor.load %7, offsets = [0, 0, 0, 0], sizes = [%2, %3, 16, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%2, %3} -> tensor<?x?x16x1xf32> |

%11 = flow.dispatch.tensor.load %8, offsets = [0, 0, 0, 0], sizes = [%4, %5, 16, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5} -> tensor<?x?x16x16xf32> |

// mmt4d with %9 and %10 as inputs and %11 as the outs operand. |
%12 = linalg.mmt4d ins(%9, %10 : tensor<?x?x16x1xf32>, tensor<?x?x16x1xf32>) outs(%11 : tensor<?x?x16x16xf32>) -> tensor<?x?x16x16xf32> |

// Store the result back to the same readwrite binding that %11 was loaded from. |
flow.dispatch.tensor.store %12, %8, offsets = [0, 0, 0, 0], sizes = [%4, %5, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5} |

return |

} |

} |

} |
|
// Executable for dispatch 4: unpacks a ?x?x16x16 tiled f32 tensor back into a 2-D ?x? tensor. |
flow.executable private @matmul_dynamic_dispatch_4 { |

// Export region: turns the four index workload values into x/y/z workgroup counts via workgroup_count_from_slice. |
flow.executable.export public @matmul_dynamic_dispatch_4_unpack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) { |

%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3 |

flow.return %x, %y, %z : index, index, index |

} |

builtin.module { |

// Entry point. %arg0 is the read-only tiled source binding, %arg1..%arg4 are workload values, %arg5 is the write-only 2-D destination binding. |
func.func @matmul_dynamic_dispatch_4_unpack_f32(%arg0: !flow.dispatch.tensor<readonly:tensor<?x?x16x16xf32>>, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>) { |

// Workload ordinals 0..3; below, %0/%1 size the tiled source and %2/%3 size the unpacked destination. |
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index |

%1 = flow.dispatch.workload.ordinal %arg2, 1 : index |

%2 = flow.dispatch.workload.ordinal %arg3, 2 : index |

%3 = flow.dispatch.workload.ordinal %arg4, 3 : index |

// Attach the dynamic dimensions to the source ({%0, %1}) and destination ({%2, %3}) bindings. |
%4 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x?x16x16xf32>>{%0, %1} |

%5 = flow.dispatch.tie_shape %arg5 : !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%2, %3} |

// Load the whole tiled source (zero offsets, unit strides). |
%6 = flow.dispatch.tensor.load %4, offsets = [0, 0, 0, 0], sizes = [%0, %1, 16, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x16xf32>>{%0, %1} -> tensor<?x?x16x16xf32> |

// Destination tensor for the unpack, sized %2 x %3. |
%7 = tensor.empty(%2, %3) : tensor<?x?xf32> |

// Undo the 16x16 tiling on dims 0 and 1, writing into %7. |
%unpack = tensor.unpack %6 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %7 : tensor<?x?x16x16xf32> -> tensor<?x?xf32> |

// Write the full unpacked tensor to the destination binding. |
flow.dispatch.tensor.store %unpack, %5, offsets = [0, 0], sizes = [%2, %3], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%2, %3} |

return |

} |

} |

} |
|
// ABI entry point: takes three !hal.buffer_view arguments and returns one; the reflection attribute declares them as tensor<?x?xf32> inputs and output. |
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} { |

// Query both dimensions of %arg0 and import it as tensor<?x?xf32> with dims {%0, %1}. |
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index |

%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index |

%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xf32>{%0, %1} |

// Same for %arg1, with dims {%3, %4}. |
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index |

%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index |

%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xf32>{%3, %4} |

// Same for %arg2, with dims {%6, %7}. |
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index |

%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index |

%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xf32>{%6, %7} |

// Outer tile count for input0 dim 0 (#map applied to %0), then pack dispatch 0 producing ?x?x16x1 with dims {%9, %1}. |
%9 = affine.apply #map()[%0] |

%10 = flow.dispatch @matmul_dynamic_dispatch_0::@matmul_dynamic_dispatch_0_pack_f32[%0, %9, %1](%2, %0, %9, %1) : (tensor<?x?xf32>{%0, %1}, index, index, index) -> tensor<?x?x16x1xf32>{%9, %1} |

// Outer tile count for input1 dim 1 (#map applied to %4), then pack dispatch 1 producing ?x?x16x1 with dims {%11, %3}. |
%11 = affine.apply #map()[%4] |

%12 = flow.dispatch @matmul_dynamic_dispatch_1::@matmul_dynamic_dispatch_1_pack_f32[%4, %11, %3](%5, %4, %11, %3) : (tensor<?x?xf32>{%3, %4}, index, index, index) -> tensor<?x?x16x1xf32>{%11, %3} |

// Outer tile counts for both dims of input2, then pack dispatch 2 producing ?x?x16x16 with dims {%13, %14}. |
%13 = affine.apply #map()[%6] |

%14 = affine.apply #map()[%7] |

%15 = flow.dispatch @matmul_dynamic_dispatch_2::@matmul_dynamic_dispatch_2_pack_f32[%6, %7, %13, %14](%8, %6, %7, %13, %14) : (tensor<?x?xf32>{%6, %7}, index, index, index, index) -> tensor<?x?x16x16xf32>{%13, %14} |

// mmt4d dispatch over the three packed tensors; its result is spelled %15{%13, %14}, i.e. it names operand %15 instead of a fresh tensor type. |
%16 = flow.dispatch @matmul_dynamic_dispatch_3::@matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32[%9, %1, %11, %3, %13, %14](%10, %12, %15, %9, %1, %11, %3, %13, %14) : (tensor<?x?x16x1xf32>{%9, %1}, tensor<?x?x16x1xf32>{%11, %3}, tensor<?x?x16x16xf32>{%13, %14}, index, index, index, index, index, index) -> %15{%13, %14} |

// Unpack dispatch back to tensor<?x?xf32> with input2's dims {%6, %7}. |
%17 = flow.dispatch @matmul_dynamic_dispatch_4::@matmul_dynamic_dispatch_4_unpack_f32[%13, %14, %6, %7](%16, %13, %14, %6, %7) : (tensor<?x?x16x16xf32>{%13, %14}, index, index, index, index) -> tensor<?x?xf32>{%6, %7} |

// Export the result as the "output0" buffer view and return it. |
%18 = hal.tensor.export %17 "output0" : tensor<?x?xf32>{%6, %7} -> !hal.buffer_view |

return %18 : !hal.buffer_view |

} |
|
} |
|
|
|
|
|
// -----// IR Dump After Canonicalizer (canonicalize) //----- // |
|
// Executable for dispatch 0: packs a dynamically sized 2-D f32 tensor into ?x?x16x1 tiles. |
flow.executable private @matmul_dynamic_dispatch_0 { |

// Export region: turns the three index workload values into x/y/z workgroup counts via workgroup_count_from_slice. |
flow.executable.export public @matmul_dynamic_dispatch_0_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) { |

%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2 |

flow.return %x, %y, %z : index, index, index |

} |

builtin.module { |

// Entry point. %arg0 is the read-only source binding, %arg1..%arg3 are workload values, %arg4 is the write-only packed destination binding. |
func.func @matmul_dynamic_dispatch_0_pack_f32(%arg0: !flow.dispatch.tensor<readonly:tensor<?x?xf32>>, %arg1: index, %arg2: index, %arg3: index, %arg4: !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>) { |

// Zero constant used as the pack padding value. |
%cst = arith.constant 0.000000e+00 : f32 |

// Workload ordinals 0..2; below, %0/%2 size the source and %1/%2 size the packed destination. |
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index |

%1 = flow.dispatch.workload.ordinal %arg2, 1 : index |

%2 = flow.dispatch.workload.ordinal %arg3, 2 : index |

// Attach the dynamic dimensions to the source ({%0, %2}) and destination ({%1, %2}) bindings. |
%3 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %2} |

%4 = flow.dispatch.tie_shape %arg4 : !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2} |

// Load the whole source tensor (zero offsets, unit strides). |
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [%0, %2], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %2} -> tensor<?x?xf32> |

// Outer tile count along source dim 0: %0 ceildiv 16. |
%6 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%0] |

// Destination tensor for the pack, with outer dims %6 x %2. |
%7 = tensor.empty(%6, %2) : tensor<?x?x16x1xf32> |

// Tile dim 0 by 16 and dim 1 by 1, with %cst as the padding value. |
%pack = tensor.pack %5 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 1] into %7 : tensor<?x?xf32> -> tensor<?x?x16x1xf32> |

// Write the full packed tensor to the destination binding. |
flow.dispatch.tensor.store %pack, %4, offsets = [0, 0, 0, 0], sizes = [%1, %2, 16, 1], strides = [1, 1, 1, 1] : tensor<?x?x16x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2} |

return |

} |

} |

} |
|
|
|
// -----// IR Dump After CSE (cse) //----- // |
|
// Executable for dispatch 0: packs a dynamically sized 2-D f32 tensor into ?x?x16x1 tiles. |
flow.executable private @matmul_dynamic_dispatch_0 { |

// Export region: turns the three index workload values into x/y/z workgroup counts via workgroup_count_from_slice. |
flow.executable.export public @matmul_dynamic_dispatch_0_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) { |

%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2 |

flow.return %x, %y, %z : index, index, index |

} |

builtin.module { |

// Entry point. %arg0 is the read-only source binding, %arg1..%arg3 are workload values, %arg4 is the write-only packed destination binding. |
func.func @matmul_dynamic_dispatch_0_pack_f32(%arg0: !flow.dispatch.tensor<readonly:tensor<?x?xf32>>, %arg1: index, %arg2: index, %arg3: index, %arg4: !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>) { |

// Zero constant used as the pack padding value. |
%cst = arith.constant 0.000000e+00 : f32 |

// Workload ordinals 0..2; below, %0/%2 size the source and %1/%2 size the packed destination. |
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index |

%1 = flow.dispatch.workload.ordinal %arg2, 1 : index |

%2 = flow.dispatch.workload.ordinal %arg3, 2 : index |

// Attach the dynamic dimensions to the source ({%0, %2}) and destination ({%1, %2}) bindings. |
%3 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %2} |

%4 = flow.dispatch.tie_shape %arg4 : !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2} |

// Load the whole source tensor (zero offsets, unit strides). |
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [%0, %2], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %2} -> tensor<?x?xf32> |

// Outer tile count along source dim 0: %0 ceildiv 16. |
%6 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%0] |

// Destination tensor for the pack, with outer dims %6 x %2. |
%7 = tensor.empty(%6, %2) : tensor<?x?x16x1xf32> |

// Tile dim 0 by 16 and dim 1 by 1, with %cst as the padding value. |
%pack = tensor.pack %5 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 1] into %7 : tensor<?x?xf32> -> tensor<?x?x16x1xf32> |

// Write the full packed tensor to the destination binding. |
flow.dispatch.tensor.store %pack, %4, offsets = [0, 0, 0, 0], sizes = [%1, %2, 16, 1], strides = [1, 1, 1, 1] : tensor<?x?x16x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2} |

return |

} |

} |

} |
|
|
|
// -----// IR Dump After Canonicalizer (canonicalize) //----- // |
|
// Executable for dispatch 1: packs a dynamically sized 2-D f32 tensor into ?x?x16x1 tiles with the outer dimensions swapped. |
flow.executable private @matmul_dynamic_dispatch_1 { |

// Export region: turns the three index workload values into x/y/z workgroup counts via workgroup_count_from_slice. |
flow.executable.export public @matmul_dynamic_dispatch_1_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) { |

%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2 |

flow.return %x, %y, %z : index, index, index |

} |

builtin.module { |

// Entry point. %arg0 is the read-only source binding, %arg1..%arg3 are workload values, %arg4 is the write-only packed destination binding. |
func.func @matmul_dynamic_dispatch_1_pack_f32(%arg0: !flow.dispatch.tensor<readonly:tensor<?x?xf32>>, %arg1: index, %arg2: index, %arg3: index, %arg4: !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>) { |

// Zero constant used as the pack padding value. |
%cst = arith.constant 0.000000e+00 : f32 |

// Workload ordinals 0..2; below, the source is sized {%2, %0} and the packed destination {%1, %2}. |
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index |

%1 = flow.dispatch.workload.ordinal %arg2, 1 : index |

%2 = flow.dispatch.workload.ordinal %arg3, 2 : index |

// Attach the dynamic dimensions to the source ({%2, %0}) and destination ({%1, %2}) bindings. |
%3 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %0} |

%4 = flow.dispatch.tie_shape %arg4 : !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2} |

// Load the whole source tensor (zero offsets, unit strides). |
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [%2, %0], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %0} -> tensor<?x?xf32> |

// Outer tile count along source dim 1: %0 ceildiv 16. |
%6 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%0] |

// Destination tensor for the pack, with outer dims %6 x %2. |
%7 = tensor.empty(%6, %2) : tensor<?x?x16x1xf32> |

// inner_dims_pos = [1, 0]: source dim 1 is tiled by 16 and source dim 0 by 1; outer_dims_perm = [1, 0] swaps the outer dims. %cst is the padding value. |
%pack = tensor.pack %5 padding_value(%cst : f32) outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [16, 1] into %7 : tensor<?x?xf32> -> tensor<?x?x16x1xf32> |

// Write the full packed tensor to the destination binding. |
flow.dispatch.tensor.store %pack, %4, offsets = [0, 0, 0, 0], sizes = [%1, %2, 16, 1], strides = [1, 1, 1, 1] : tensor<?x?x16x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2} |

return |

} |

} |

} |
|
|
|
// -----// IR Dump After CSE (cse) //----- // |
|
// Executable for dispatch 1: packs a dynamically sized 2-D f32 tensor into ?x?x16x1 tiles with the outer dimensions swapped. |
flow.executable private @matmul_dynamic_dispatch_1 { |

// Export region: turns the three index workload values into x/y/z workgroup counts via workgroup_count_from_slice. |
flow.executable.export public @matmul_dynamic_dispatch_1_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) { |

%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2 |

flow.return %x, %y, %z : index, index, index |

} |

builtin.module { |

// Entry point. %arg0 is the read-only source binding, %arg1..%arg3 are workload values, %arg4 is the write-only packed destination binding. |
func.func @matmul_dynamic_dispatch_1_pack_f32(%arg0: !flow.dispatch.tensor<readonly:tensor<?x?xf32>>, %arg1: index, %arg2: index, %arg3: index, %arg4: !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>) { |

// Zero constant used as the pack padding value. |
%cst = arith.constant 0.000000e+00 : f32 |

// Workload ordinals 0..2; below, the source is sized {%2, %0} and the packed destination {%1, %2}. |
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index |

%1 = flow.dispatch.workload.ordinal %arg2, 1 : index |

%2 = flow.dispatch.workload.ordinal %arg3, 2 : index |

// Attach the dynamic dimensions to the source ({%2, %0}) and destination ({%1, %2}) bindings. |
%3 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %0} |

%4 = flow.dispatch.tie_shape %arg4 : !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2} |

// Load the whole source tensor (zero offsets, unit strides). |
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [%2, %0], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %0} -> tensor<?x?xf32> |

// Outer tile count along source dim 1: %0 ceildiv 16. |
%6 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%0] |

// Destination tensor for the pack, with outer dims %6 x %2. |
%7 = tensor.empty(%6, %2) : tensor<?x?x16x1xf32> |

// inner_dims_pos = [1, 0]: source dim 1 is tiled by 16 and source dim 0 by 1; outer_dims_perm = [1, 0] swaps the outer dims. %cst is the padding value. |
%pack = tensor.pack %5 padding_value(%cst : f32) outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [16, 1] into %7 : tensor<?x?xf32> -> tensor<?x?x16x1xf32> |

// Write the full packed tensor to the destination binding. |
flow.dispatch.tensor.store %pack, %4, offsets = [0, 0, 0, 0], sizes = [%1, %2, 16, 1], strides = [1, 1, 1, 1] : tensor<?x?x16x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2} |

return |

} |

} |

} |
|
|
|
// -----// IR Dump After Canonicalizer (canonicalize) //----- // |
|
// Executable for dispatch 2: packs a dynamically sized 2-D f32 tensor into ?x?x16x16 tiles. |
flow.executable private @matmul_dynamic_dispatch_2 { |

// Export region: turns the four index workload values into x/y/z workgroup counts via workgroup_count_from_slice. |
flow.executable.export public @matmul_dynamic_dispatch_2_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) { |

%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3 |

flow.return %x, %y, %z : index, index, index |

} |

builtin.module { |

// Entry point. %arg0 is the read-only source binding, %arg1..%arg4 are workload values, %arg5 is the write-only packed destination binding. |
func.func @matmul_dynamic_dispatch_2_pack_f32(%arg0: !flow.dispatch.tensor<readonly:tensor<?x?xf32>>, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>) { |

// Zero constant used as the pack padding value. |
%cst = arith.constant 0.000000e+00 : f32 |

// Workload ordinals 0..3; below, %0/%1 size the source and %2/%3 size the packed destination. |
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index |

%1 = flow.dispatch.workload.ordinal %arg2, 1 : index |

%2 = flow.dispatch.workload.ordinal %arg3, 2 : index |

%3 = flow.dispatch.workload.ordinal %arg4, 3 : index |

// Attach the dynamic dimensions to the source ({%0, %1}) and destination ({%2, %3}) bindings. |
%4 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1} |

%5 = flow.dispatch.tie_shape %arg5 : !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%2, %3} |

// Load the whole source tensor (zero offsets, unit strides). |
%6 = flow.dispatch.tensor.load %4, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1} -> tensor<?x?xf32> |

// Outer tile counts: %7 = %1 ceildiv 16 (source dim 1) and %8 = %0 ceildiv 16 (source dim 0). |
%7 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%1] |

%8 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%0] |

// Destination tensor for the pack, with outer dims %8 x %7. |
%9 = tensor.empty(%8, %7) : tensor<?x?x16x16xf32> |

// Tile both dims by 16, with %cst as the padding value. |
%pack = tensor.pack %6 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %9 : tensor<?x?xf32> -> tensor<?x?x16x16xf32> |

// Write the full packed tensor to the destination binding. |
flow.dispatch.tensor.store %pack, %5, offsets = [0, 0, 0, 0], sizes = [%2, %3, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%2, %3} |

return |

} |

} |

} |
|
|
|
// -----// IR Dump After CSE (cse) //----- // |
|
// Executable for dispatch 2: packs a dynamically sized 2-D f32 tensor into ?x?x16x16 tiles. |
flow.executable private @matmul_dynamic_dispatch_2 { |

// Export region: turns the four index workload values into x/y/z workgroup counts via workgroup_count_from_slice. |
flow.executable.export public @matmul_dynamic_dispatch_2_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) { |

%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3 |

flow.return %x, %y, %z : index, index, index |

} |

builtin.module { |

// Entry point. %arg0 is the read-only source binding, %arg1..%arg4 are workload values, %arg5 is the write-only packed destination binding. |
func.func @matmul_dynamic_dispatch_2_pack_f32(%arg0: !flow.dispatch.tensor<readonly:tensor<?x?xf32>>, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>) { |

// Zero constant used as the pack padding value. |
%cst = arith.constant 0.000000e+00 : f32 |

// Workload ordinals 0..3; below, %0/%1 size the source and %2/%3 size the packed destination. |
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index |

%1 = flow.dispatch.workload.ordinal %arg2, 1 : index |

%2 = flow.dispatch.workload.ordinal %arg3, 2 : index |

%3 = flow.dispatch.workload.ordinal %arg4, 3 : index |

// Attach the dynamic dimensions to the source ({%0, %1}) and destination ({%2, %3}) bindings. |
%4 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1} |

%5 = flow.dispatch.tie_shape %arg5 : !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%2, %3} |

// Load the whole source tensor (zero offsets, unit strides). |
%6 = flow.dispatch.tensor.load %4, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1} -> tensor<?x?xf32> |

// Outer tile counts: %7 = %1 ceildiv 16 (source dim 1) and %8 = %0 ceildiv 16 (source dim 0). |
%7 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%1] |

%8 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%0] |

// Destination tensor for the pack, with outer dims %8 x %7. |
%9 = tensor.empty(%8, %7) : tensor<?x?x16x16xf32> |

// Tile both dims by 16, with %cst as the padding value. |
%pack = tensor.pack %6 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %9 : tensor<?x?xf32> -> tensor<?x?x16x16xf32> |

// Write the full packed tensor to the destination binding. |
flow.dispatch.tensor.store %pack, %5, offsets = [0, 0, 0, 0], sizes = [%2, %3, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%2, %3} |

return |

} |

} |

} |
|
|
|
// -----// IR Dump After Canonicalizer (canonicalize) //----- // |
|
// Executable for dispatch 3: runs linalg.mmt4d on two packed ?x?x16x1 inputs with a ?x?x16x16 readwrite tensor as the outs operand. |
flow.executable private @matmul_dynamic_dispatch_3 { |

// Export region: turns the six index workload values into x/y/z workgroup counts via workgroup_count_from_slice. |
flow.executable.export public @matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index) -> (index, index, index) { |

%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3, %arg4, %arg5 |

flow.return %x, %y, %z : index, index, index |

} |

builtin.module { |

// Entry point. %arg0 and %arg1 are read-only packed inputs, %arg2 is the readwrite packed binding, %arg3..%arg8 are workload values. |
func.func @matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32(%arg0: !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>, %arg1: !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>, %arg2: !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index) { |

// Workload ordinals 0..5; below, each consecutive pair supplies the two dynamic dims of %arg0, %arg1 and %arg2 respectively. |
%0 = flow.dispatch.workload.ordinal %arg3, 0 : index |

%1 = flow.dispatch.workload.ordinal %arg4, 1 : index |

%2 = flow.dispatch.workload.ordinal %arg5, 2 : index |

%3 = flow.dispatch.workload.ordinal %arg6, 3 : index |

%4 = flow.dispatch.workload.ordinal %arg7, 4 : index |

%5 = flow.dispatch.workload.ordinal %arg8, 5 : index |

// Attach the dynamic dimensions to the three bindings. |
%6 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%0, %1} |

%7 = flow.dispatch.tie_shape %arg1 : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%2, %3} |

%8 = flow.dispatch.tie_shape %arg2 : !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5} |

// Load all three tensors in full (zero offsets, unit strides); %11 is read from the readwrite binding. |
%9 = flow.dispatch.tensor.load %6, offsets = [0, 0, 0, 0], sizes = [%0, %1, 16, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%0, %1} -> tensor<?x?x16x1xf32> |

%10 = flow.dispatch.tensor.load %7, offsets = [0, 0, 0, 0], sizes = [%2, %3, 16, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%2, %3} -> tensor<?x?x16x1xf32> |

%11 = flow.dispatch.tensor.load %8, offsets = [0, 0, 0, 0], sizes = [%4, %5, 16, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5} -> tensor<?x?x16x16xf32> |

// mmt4d with %9 and %10 as inputs and %11 as the outs operand. |
%12 = linalg.mmt4d ins(%9, %10 : tensor<?x?x16x1xf32>, tensor<?x?x16x1xf32>) outs(%11 : tensor<?x?x16x16xf32>) -> tensor<?x?x16x16xf32> |

// Store the result back to the same readwrite binding that %11 was loaded from. |
flow.dispatch.tensor.store %12, %8, offsets = [0, 0, 0, 0], sizes = [%4, %5, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5} |

return |

} |

} |

} |
|
|
|
// -----// IR Dump After CSE (cse) //----- // |
|
// Executable for dispatch 3: runs linalg.mmt4d on two packed ?x?x16x1 inputs with a ?x?x16x16 readwrite tensor as the outs operand. |
flow.executable private @matmul_dynamic_dispatch_3 { |

// Export region: turns the six index workload values into x/y/z workgroup counts via workgroup_count_from_slice. |
flow.executable.export public @matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index) -> (index, index, index) { |

%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3, %arg4, %arg5 |

flow.return %x, %y, %z : index, index, index |

} |

builtin.module { |

// Entry point. %arg0 and %arg1 are read-only packed inputs, %arg2 is the readwrite packed binding, %arg3..%arg8 are workload values. |
func.func @matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32(%arg0: !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>, %arg1: !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>, %arg2: !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index) { |

// Workload ordinals 0..5; below, each consecutive pair supplies the two dynamic dims of %arg0, %arg1 and %arg2 respectively. |
%0 = flow.dispatch.workload.ordinal %arg3, 0 : index |

%1 = flow.dispatch.workload.ordinal %arg4, 1 : index |

%2 = flow.dispatch.workload.ordinal %arg5, 2 : index |

%3 = flow.dispatch.workload.ordinal %arg6, 3 : index |

%4 = flow.dispatch.workload.ordinal %arg7, 4 : index |

%5 = flow.dispatch.workload.ordinal %arg8, 5 : index |

// Attach the dynamic dimensions to the three bindings. |
%6 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%0, %1} |

%7 = flow.dispatch.tie_shape %arg1 : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%2, %3} |

%8 = flow.dispatch.tie_shape %arg2 : !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5} |

// Load all three tensors in full (zero offsets, unit strides); %11 is read from the readwrite binding. |
%9 = flow.dispatch.tensor.load %6, offsets = [0, 0, 0, 0], sizes = [%0, %1, 16, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%0, %1} -> tensor<?x?x16x1xf32> |

%10 = flow.dispatch.tensor.load %7, offsets = [0, 0, 0, 0], sizes = [%2, %3, 16, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%2, %3} -> tensor<?x?x16x1xf32> |

%11 = flow.dispatch.tensor.load %8, offsets = [0, 0, 0, 0], sizes = [%4, %5, 16, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5} -> tensor<?x?x16x16xf32> |

// mmt4d with %9 and %10 as inputs and %11 as the outs operand. |
%12 = linalg.mmt4d ins(%9, %10 : tensor<?x?x16x1xf32>, tensor<?x?x16x1xf32>) outs(%11 : tensor<?x?x16x16xf32>) -> tensor<?x?x16x16xf32> |

// Store the result back to the same readwrite binding that %11 was loaded from. |
flow.dispatch.tensor.store %12, %8, offsets = [0, 0, 0, 0], sizes = [%4, %5, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5} |

return |

} |

} |

} |
|
|
|
// -----// IR Dump After Canonicalizer (canonicalize) //----- // |
|
// Executable holding the unpack dispatch: turns a tiled tensor<?x?x16x16xf32> back into a plain tensor<?x?xf32>. |
flow.executable private @matmul_dynamic_dispatch_4 { |

// Public export: the (x, y, z) workgroup count is produced by workgroup_count_from_slice from the four index workload values. |
flow.executable.export public @matmul_dynamic_dispatch_4_unpack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) { |

%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3 |

flow.return %x, %y, %z : index, index, index |

} |

builtin.module { |

// Dispatch body. %arg0 is the read-only packed input, %arg5 the write-only unpacked output; %arg1..%arg4 are the workload values. |
func.func @matmul_dynamic_dispatch_4_unpack_f32(%arg0: !flow.dispatch.tensor<readonly:tensor<?x?x16x16xf32>>, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>) { |

// Bind the four workload values to ordinals 0-3. |
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index |

%1 = flow.dispatch.workload.ordinal %arg2, 1 : index |

%2 = flow.dispatch.workload.ordinal %arg3, 2 : index |

%3 = flow.dispatch.workload.ordinal %arg4, 3 : index |

// Attach dynamic dims: ordinals 0/1 size the packed input's outer dims, ordinals 2/3 size the 2-D output. |
%4 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x?x16x16xf32>>{%0, %1} |

%5 = flow.dispatch.tie_shape %arg5 : !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%2, %3} |

// Load the entire packed input (zero offsets, unit strides). |
%6 = flow.dispatch.tensor.load %4, offsets = [0, 0, 0, 0], sizes = [%0, %1, 16, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x16xf32>>{%0, %1} -> tensor<?x?x16x16xf32> |

// Destination tensor for the unpack, sized by the output dims. |
%7 = tensor.empty(%2, %3) : tensor<?x?xf32> |

// Undo the 16x16 inner tiling on dims 0 and 1. |
%unpack = tensor.unpack %6 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %7 : tensor<?x?x16x16xf32> -> tensor<?x?xf32> |

// Write the full unpacked result to the output binding. |
flow.dispatch.tensor.store %unpack, %5, offsets = [0, 0], sizes = [%2, %3], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%2, %3} |

return |

} |

} |

} |
|
|
|
// -----// IR Dump After CSE (cse) //----- // |
|
// Executable holding the unpack dispatch: turns a tiled tensor<?x?x16x16xf32> back into a plain tensor<?x?xf32>. |
flow.executable private @matmul_dynamic_dispatch_4 { |

// Public export: the (x, y, z) workgroup count is produced by workgroup_count_from_slice from the four index workload values. |
flow.executable.export public @matmul_dynamic_dispatch_4_unpack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) { |

%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3 |

flow.return %x, %y, %z : index, index, index |

} |

builtin.module { |

// Dispatch body. %arg0 is the read-only packed input, %arg5 the write-only unpacked output; %arg1..%arg4 are the workload values. |
func.func @matmul_dynamic_dispatch_4_unpack_f32(%arg0: !flow.dispatch.tensor<readonly:tensor<?x?x16x16xf32>>, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>) { |

// Bind the four workload values to ordinals 0-3. |
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index |

%1 = flow.dispatch.workload.ordinal %arg2, 1 : index |

%2 = flow.dispatch.workload.ordinal %arg3, 2 : index |

%3 = flow.dispatch.workload.ordinal %arg4, 3 : index |

// Attach dynamic dims: ordinals 0/1 size the packed input's outer dims, ordinals 2/3 size the 2-D output. |
%4 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x?x16x16xf32>>{%0, %1} |

%5 = flow.dispatch.tie_shape %arg5 : !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%2, %3} |

// Load the entire packed input (zero offsets, unit strides). |
%6 = flow.dispatch.tensor.load %4, offsets = [0, 0, 0, 0], sizes = [%0, %1, 16, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x16xf32>>{%0, %1} -> tensor<?x?x16x16xf32> |

// Destination tensor for the unpack, sized by the output dims. |
%7 = tensor.empty(%2, %3) : tensor<?x?xf32> |

// Undo the 16x16 inner tiling on dims 0 and 1. |
%unpack = tensor.unpack %6 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %7 : tensor<?x?x16x16xf32> -> tensor<?x?xf32> |

// Write the full unpacked result to the output binding. |
flow.dispatch.tensor.store %unpack, %5, offsets = [0, 0], sizes = [%2, %3], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%2, %3} |

return |

} |

} |

} |
|
|
|
// -----// IR Dump After CleanupTensorShapes (iree-flow-cleanup-tensor-shapes) //----- // |
|
// Entry point: imports three buffer views as dynamic 2-D f32 tensors, chains five dispatches (three pack, one mmt4d, one unpack) and exports the result as a buffer view. |
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} { |

// Query both dims of input0 and import it as tensor<?x?xf32>. |
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index |

%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index |

%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xf32>{%0, %1} |

// Query both dims of input1 and import it as tensor<?x?xf32>. |
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index |

%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index |

%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xf32>{%3, %4} |

// Query both dims of input2 and import it as tensor<?x?xf32>. |
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index |

%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index |

%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xf32>{%6, %7} |

// %9 = ceildiv(dim 0 of input0, 16): the first dynamic dim of the packed input0. |
%9 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%0] |

// Pack dispatch for input0: produces tensor<?x?x16x1xf32> with dynamic dims {%9, %1}. |
%10 = flow.dispatch @matmul_dynamic_dispatch_0::@matmul_dynamic_dispatch_0_pack_f32[%0, %9, %1](%2, %0, %9, %1) : (tensor<?x?xf32>{%0, %1}, index, index, index) -> tensor<?x?x16x1xf32>{%9, %1} |

// %11 = ceildiv(dim 1 of input1, 16). |
%11 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%4] |

// Pack dispatch for input1: produces tensor<?x?x16x1xf32> with dynamic dims {%11, %3}, i.e. the tiled dim 1 is listed first. |
%12 = flow.dispatch @matmul_dynamic_dispatch_1::@matmul_dynamic_dispatch_1_pack_f32[%4, %11, %3](%5, %4, %11, %3) : (tensor<?x?xf32>{%3, %4}, index, index, index) -> tensor<?x?x16x1xf32>{%11, %3} |

// ceildiv by 16 of both dims of input2. |
%13 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%6] |

%14 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%7] |

// Pack dispatch for input2: produces tensor<?x?x16x16xf32> with dynamic dims {%13, %14}. |
%15 = flow.dispatch @matmul_dynamic_dispatch_2::@matmul_dynamic_dispatch_2_pack_f32[%6, %7, %13, %14](%8, %6, %7, %13, %14) : (tensor<?x?xf32>{%6, %7}, index, index, index, index) -> tensor<?x?x16x16xf32>{%13, %14} |

// mmt4d dispatch over the three packed tensors; the result is written as `-> %15{...}`, i.e. it is tied to operand %15 instead of naming a fresh tensor type. |
%16 = flow.dispatch @matmul_dynamic_dispatch_3::@matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32[%9, %1, %11, %3, %13, %14](%10, %12, %15, %9, %1, %11, %3, %13, %14) : (tensor<?x?x16x1xf32>{%9, %1}, tensor<?x?x16x1xf32>{%11, %3}, tensor<?x?x16x16xf32>{%13, %14}, index, index, index, index, index, index) -> %15{%13, %14} |

// Unpack dispatch: converts the tiled result back to tensor<?x?xf32> with input2's dims {%6, %7}. |
%17 = flow.dispatch @matmul_dynamic_dispatch_4::@matmul_dynamic_dispatch_4_unpack_f32[%13, %14, %6, %7](%16, %13, %14, %6, %7) : (tensor<?x?x16x16xf32>{%13, %14}, index, index, index, index) -> tensor<?x?xf32>{%6, %7} |

// Export the result as the "output0" buffer view and return it. |
%18 = hal.tensor.export %17 "output0" : tensor<?x?xf32>{%6, %7} -> !hal.buffer_view |

return %18 : !hal.buffer_view |

} |
|
|
|
// -----// IR Dump After Canonicalizer (canonicalize) //----- // |
|
// Entry point: imports three buffer views as dynamic 2-D f32 tensors, chains five dispatches (three pack, one mmt4d, one unpack) and exports the result as a buffer view. |
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} { |

// Query both dims of input0 and import it as tensor<?x?xf32>. |
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index |

%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index |

%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xf32>{%0, %1} |

// Query both dims of input1 and import it as tensor<?x?xf32>. |
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index |

%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index |

%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xf32>{%3, %4} |

// Query both dims of input2 and import it as tensor<?x?xf32>. |
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index |

%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index |

%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xf32>{%6, %7} |

// %9 = ceildiv(dim 0 of input0, 16): the first dynamic dim of the packed input0. |
%9 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%0] |

// Pack dispatch for input0: produces tensor<?x?x16x1xf32> with dynamic dims {%9, %1}. |
%10 = flow.dispatch @matmul_dynamic_dispatch_0::@matmul_dynamic_dispatch_0_pack_f32[%0, %9, %1](%2, %0, %9, %1) : (tensor<?x?xf32>{%0, %1}, index, index, index) -> tensor<?x?x16x1xf32>{%9, %1} |

// %11 = ceildiv(dim 1 of input1, 16). |
%11 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%4] |

// Pack dispatch for input1: produces tensor<?x?x16x1xf32> with dynamic dims {%11, %3}, i.e. the tiled dim 1 is listed first. |
%12 = flow.dispatch @matmul_dynamic_dispatch_1::@matmul_dynamic_dispatch_1_pack_f32[%4, %11, %3](%5, %4, %11, %3) : (tensor<?x?xf32>{%3, %4}, index, index, index) -> tensor<?x?x16x1xf32>{%11, %3} |

// ceildiv by 16 of both dims of input2. |
%13 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%6] |

%14 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%7] |

// Pack dispatch for input2: produces tensor<?x?x16x16xf32> with dynamic dims {%13, %14}. |
%15 = flow.dispatch @matmul_dynamic_dispatch_2::@matmul_dynamic_dispatch_2_pack_f32[%6, %7, %13, %14](%8, %6, %7, %13, %14) : (tensor<?x?xf32>{%6, %7}, index, index, index, index) -> tensor<?x?x16x16xf32>{%13, %14} |

// mmt4d dispatch over the three packed tensors; the result is written as `-> %15{...}`, i.e. it is tied to operand %15 instead of naming a fresh tensor type. |
%16 = flow.dispatch @matmul_dynamic_dispatch_3::@matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32[%9, %1, %11, %3, %13, %14](%10, %12, %15, %9, %1, %11, %3, %13, %14) : (tensor<?x?x16x1xf32>{%9, %1}, tensor<?x?x16x1xf32>{%11, %3}, tensor<?x?x16x16xf32>{%13, %14}, index, index, index, index, index, index) -> %15{%13, %14} |

// Unpack dispatch: converts the tiled result back to tensor<?x?xf32> with input2's dims {%6, %7}. |
%17 = flow.dispatch @matmul_dynamic_dispatch_4::@matmul_dynamic_dispatch_4_unpack_f32[%13, %14, %6, %7](%16, %13, %14, %6, %7) : (tensor<?x?x16x16xf32>{%13, %14}, index, index, index, index) -> tensor<?x?xf32>{%6, %7} |

// Export the result as the "output0" buffer view and return it. |
%18 = hal.tensor.export %17 "output0" : tensor<?x?xf32>{%6, %7} -> !hal.buffer_view |

return %18 : !hal.buffer_view |

} |
|
|
|
// -----// IR Dump After CSE (cse) //----- // |
|
// Entry point: imports three buffer views as dynamic 2-D f32 tensors, chains five dispatches (three pack, one mmt4d, one unpack) and exports the result as a buffer view. |
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} { |

// Query both dims of input0 and import it as tensor<?x?xf32>. |
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index |

%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index |

%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xf32>{%0, %1} |

// Query both dims of input1 and import it as tensor<?x?xf32>. |
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index |

%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index |

%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xf32>{%3, %4} |

// Query both dims of input2 and import it as tensor<?x?xf32>. |
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index |

%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index |

%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xf32>{%6, %7} |

// %9 = ceildiv(dim 0 of input0, 16): the first dynamic dim of the packed input0. |
%9 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%0] |

// Pack dispatch for input0: produces tensor<?x?x16x1xf32> with dynamic dims {%9, %1}. |
%10 = flow.dispatch @matmul_dynamic_dispatch_0::@matmul_dynamic_dispatch_0_pack_f32[%0, %9, %1](%2, %0, %9, %1) : (tensor<?x?xf32>{%0, %1}, index, index, index) -> tensor<?x?x16x1xf32>{%9, %1} |

// %11 = ceildiv(dim 1 of input1, 16). |
%11 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%4] |

// Pack dispatch for input1: produces tensor<?x?x16x1xf32> with dynamic dims {%11, %3}, i.e. the tiled dim 1 is listed first. |
%12 = flow.dispatch @matmul_dynamic_dispatch_1::@matmul_dynamic_dispatch_1_pack_f32[%4, %11, %3](%5, %4, %11, %3) : (tensor<?x?xf32>{%3, %4}, index, index, index) -> tensor<?x?x16x1xf32>{%11, %3} |

// ceildiv by 16 of both dims of input2. |
%13 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%6] |

%14 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%7] |

// Pack dispatch for input2: produces tensor<?x?x16x16xf32> with dynamic dims {%13, %14}. |
%15 = flow.dispatch @matmul_dynamic_dispatch_2::@matmul_dynamic_dispatch_2_pack_f32[%6, %7, %13, %14](%8, %6, %7, %13, %14) : (tensor<?x?xf32>{%6, %7}, index, index, index, index) -> tensor<?x?x16x16xf32>{%13, %14} |

// mmt4d dispatch over the three packed tensors; the result is written as `-> %15{...}`, i.e. it is tied to operand %15 instead of naming a fresh tensor type. |
%16 = flow.dispatch @matmul_dynamic_dispatch_3::@matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32[%9, %1, %11, %3, %13, %14](%10, %12, %15, %9, %1, %11, %3, %13, %14) : (tensor<?x?x16x1xf32>{%9, %1}, tensor<?x?x16x1xf32>{%11, %3}, tensor<?x?x16x16xf32>{%13, %14}, index, index, index, index, index, index) -> %15{%13, %14} |

// Unpack dispatch: converts the tiled result back to tensor<?x?xf32> with input2's dims {%6, %7}. |
%17 = flow.dispatch @matmul_dynamic_dispatch_4::@matmul_dynamic_dispatch_4_unpack_f32[%13, %14, %6, %7](%16, %13, %14, %6, %7) : (tensor<?x?x16x16xf32>{%13, %14}, index, index, index, index) -> tensor<?x?xf32>{%6, %7} |

// Export the result as the "output0" buffer view and return it. |
%18 = hal.tensor.export %17 "output0" : tensor<?x?xf32>{%6, %7} -> !hal.buffer_view |

return %18 : !hal.buffer_view |

} |
|
|
|
// -----// IR Dump After SymbolDCE (symbol-dce) //----- // |
|
// Executable target alias: "llvm-cpu" backend, "embedded-elf-x86_64" format, cpu = "znver4", native_vector_size = 64, ukernels = "all". |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "znver4", cpu_features = "+mmx,+popcnt,+sse,+sse2,+sse3,+ssse3,+sse4.1,+sse4.2,+avx,+avx2,+sse4a,+fma,+avx512f,+bmi,+bmi2,+aes,+pclmul,+avx512vl,+avx512bw,+avx512dq,+avx512cd,+avx512vbmi,+avx512ifma,+avx512vpopcntdq,+avx512vbmi2,+gfni,+vpclmulqdq,+avx512vnni,+avx512bitalg,+avx512bf16,+adx,+clflushopt,+clwb,+clzero,+cx16,+cx8,+f16c,+fsgsbase,+crc32,+invpcid,+rdpru,+sahf,+lzcnt,+movbe,+mwaitx,+x87,+pku,+prfchw,+rdpid,+rdrnd,+rdseed,+sha,+shstk,+vaes,+wbnoinvd,+xsave,+xsavec,+xsaveopt,+xsaves,+fxsr,+evex512", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "all"}> |

// Shared affine map: ceil-division of one symbol by 16; applied below through `affine.apply #map`. |
#map = affine_map<()[s0] -> (s0 ceildiv 16)> |

// Device target alias whose executable_targets list contains the executable target defined above. |
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#executable_target_embedded_elf_x86_64_]}> |
|
module attributes {hal.device.targets = [#device_target_llvm_cpu]} { |
|
// Executable holding a pack dispatch: tiles a tensor<?x?xf32> into tensor<?x?x16x1xf32>. |
flow.executable private @matmul_dynamic_dispatch_0 { |

// Public export: the (x, y, z) workgroup count is produced by workgroup_count_from_slice from the three index workload values. |
flow.executable.export public @matmul_dynamic_dispatch_0_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) { |

%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2 |

flow.return %x, %y, %z : index, index, index |

} |

builtin.module { |

// Dispatch body. %arg0 is the read-only source, %arg4 the write-only packed destination; %arg1..%arg3 are the workload values. |
func.func @matmul_dynamic_dispatch_0_pack_f32(%arg0: !flow.dispatch.tensor<readonly:tensor<?x?xf32>>, %arg1: index, %arg2: index, %arg3: index, %arg4: !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>) { |

// Zero constant used as the pack padding value. |
%cst = arith.constant 0.000000e+00 : f32 |

// Bind the three workload values to ordinals 0-2. |
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index |

%1 = flow.dispatch.workload.ordinal %arg2, 1 : index |

%2 = flow.dispatch.workload.ordinal %arg3, 2 : index |

// Attach dynamic dims: the source is %0 x %2, the packed destination has outer dims %1 x %2. |
%3 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %2} |

%4 = flow.dispatch.tie_shape %arg4 : !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2} |

// Load the entire source tensor (zero offsets, unit strides). |
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [%0, %2], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %2} -> tensor<?x?xf32> |

// ceildiv(%0, 16) through #map: the first outer dim of the pack destination, computed locally. |
%6 = affine.apply #map()[%0] |

// Pack destination sized as %6 x %2 x 16 x 1. |
%7 = tensor.empty(%6, %2) : tensor<?x?x16x1xf32> |

// Tile dim 0 by 16 and dim 1 by 1 with no outer permutation; %cst is the padding value. |
%pack = tensor.pack %5 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 1] into %7 : tensor<?x?xf32> -> tensor<?x?x16x1xf32> |

// Store the full packed tensor; the store sizes use workload values %1 and %2 for the outer dims. |
flow.dispatch.tensor.store %pack, %4, offsets = [0, 0, 0, 0], sizes = [%1, %2, 16, 1], strides = [1, 1, 1, 1] : tensor<?x?x16x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2} |

return |

} |

} |

} |
|
// Executable holding a pack dispatch with permuted dims: tiles a tensor<?x?xf32> into tensor<?x?x16x1xf32>. |
flow.executable private @matmul_dynamic_dispatch_1 { |

// Public export: the (x, y, z) workgroup count is produced by workgroup_count_from_slice from the three index workload values. |
flow.executable.export public @matmul_dynamic_dispatch_1_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) { |

%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2 |

flow.return %x, %y, %z : index, index, index |

} |

builtin.module { |

// Dispatch body. %arg0 is the read-only source, %arg4 the write-only packed destination; %arg1..%arg3 are the workload values. |
func.func @matmul_dynamic_dispatch_1_pack_f32(%arg0: !flow.dispatch.tensor<readonly:tensor<?x?xf32>>, %arg1: index, %arg2: index, %arg3: index, %arg4: !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>) { |

// Zero constant used as the pack padding value. |
%cst = arith.constant 0.000000e+00 : f32 |

// Bind the three workload values to ordinals 0-2. |
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index |

%1 = flow.dispatch.workload.ordinal %arg2, 1 : index |

%2 = flow.dispatch.workload.ordinal %arg3, 2 : index |

// Attach dynamic dims: the source is %2 x %0 (ordinal 2 first), the packed destination has outer dims %1 x %2. |
%3 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %0} |

%4 = flow.dispatch.tie_shape %arg4 : !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2} |

// Load the entire source tensor (zero offsets, unit strides). |
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [%2, %0], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %0} -> tensor<?x?xf32> |

// ceildiv(%0, 16) through #map; %0 is the source's dim 1 here. |
%6 = affine.apply #map()[%0] |

// Pack destination sized as %6 x %2 x 16 x 1. |
%7 = tensor.empty(%6, %2) : tensor<?x?x16x1xf32> |

// inner_dims_pos = [1, 0] tiles source dim 1 by 16 and dim 0 by 1; outer_dims_perm = [1, 0] swaps the outer dims; %cst is the padding value. |
%pack = tensor.pack %5 padding_value(%cst : f32) outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [16, 1] into %7 : tensor<?x?xf32> -> tensor<?x?x16x1xf32> |

// Store the full packed tensor; the store sizes use workload values %1 and %2 for the outer dims. |
flow.dispatch.tensor.store %pack, %4, offsets = [0, 0, 0, 0], sizes = [%1, %2, 16, 1], strides = [1, 1, 1, 1] : tensor<?x?x16x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2} |

return |

} |

} |

} |
|
// Executable holding a pack dispatch: tiles a tensor<?x?xf32> into tensor<?x?x16x16xf32>. |
flow.executable private @matmul_dynamic_dispatch_2 { |

// Public export: the (x, y, z) workgroup count is produced by workgroup_count_from_slice from the four index workload values. |
flow.executable.export public @matmul_dynamic_dispatch_2_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) { |

%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3 |

flow.return %x, %y, %z : index, index, index |

} |

builtin.module { |

// Dispatch body. %arg0 is the read-only source, %arg5 the write-only packed destination; %arg1..%arg4 are the workload values. |
func.func @matmul_dynamic_dispatch_2_pack_f32(%arg0: !flow.dispatch.tensor<readonly:tensor<?x?xf32>>, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>) { |

// Zero constant used as the pack padding value. |
%cst = arith.constant 0.000000e+00 : f32 |

// Bind the four workload values to ordinals 0-3. |
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index |

%1 = flow.dispatch.workload.ordinal %arg2, 1 : index |

%2 = flow.dispatch.workload.ordinal %arg3, 2 : index |

%3 = flow.dispatch.workload.ordinal %arg4, 3 : index |

// Attach dynamic dims: the source is %0 x %1, the packed destination has outer dims %2 x %3. |
%4 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1} |

%5 = flow.dispatch.tie_shape %arg5 : !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%2, %3} |

// Load the entire source tensor (zero offsets, unit strides). |
%6 = flow.dispatch.tensor.load %4, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1} -> tensor<?x?xf32> |

// ceildiv by 16 of each source dim through #map: %7 from dim 1, %8 from dim 0. |
%7 = affine.apply #map()[%1] |

%8 = affine.apply #map()[%0] |

// Pack destination sized as %8 x %7 x 16 x 16. |
%9 = tensor.empty(%8, %7) : tensor<?x?x16x16xf32> |

// Tile both dims by 16 with no outer permutation; %cst is the padding value. |
%pack = tensor.pack %6 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %9 : tensor<?x?xf32> -> tensor<?x?x16x16xf32> |

// Store the full packed tensor; the store sizes use workload values %2 and %3 for the outer dims. |
flow.dispatch.tensor.store %pack, %5, offsets = [0, 0, 0, 0], sizes = [%2, %3, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%2, %3} |

return |

} |

} |

} |
|
// Executable holding the linalg.mmt4d dispatch over two ?x?x16x1 inputs and a ?x?x16x16 read-write accumulator. |
flow.executable private @matmul_dynamic_dispatch_3 { |

// Public export: the (x, y, z) workgroup count is produced by workgroup_count_from_slice from the six index workload values. |
flow.executable.export public @matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index) -> (index, index, index) { |

%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3, %arg4, %arg5 |

flow.return %x, %y, %z : index, index, index |

} |

builtin.module { |

// Dispatch body. %arg0 and %arg1 are read-only inputs, %arg2 is the read-write accumulator; %arg3..%arg8 are the workload values. |
func.func @matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32(%arg0: !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>, %arg1: !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>, %arg2: !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index) { |

// Bind the six workload values to ordinals 0-5. |
%0 = flow.dispatch.workload.ordinal %arg3, 0 : index |

%1 = flow.dispatch.workload.ordinal %arg4, 1 : index |

%2 = flow.dispatch.workload.ordinal %arg5, 2 : index |

%3 = flow.dispatch.workload.ordinal %arg6, 3 : index |

%4 = flow.dispatch.workload.ordinal %arg7, 4 : index |

%5 = flow.dispatch.workload.ordinal %arg8, 5 : index |

// Attach dynamic outer dims: ordinals 0/1 to %arg0, 2/3 to %arg1, 4/5 to the accumulator %arg2. |
%6 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%0, %1} |

%7 = flow.dispatch.tie_shape %arg1 : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%2, %3} |

%8 = flow.dispatch.tie_shape %arg2 : !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5} |

// Load all three tensors in full (zero offsets, unit strides). |
%9 = flow.dispatch.tensor.load %6, offsets = [0, 0, 0, 0], sizes = [%0, %1, 16, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%0, %1} -> tensor<?x?x16x1xf32> |

%10 = flow.dispatch.tensor.load %7, offsets = [0, 0, 0, 0], sizes = [%2, %3, 16, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%2, %3} -> tensor<?x?x16x1xf32> |

%11 = flow.dispatch.tensor.load %8, offsets = [0, 0, 0, 0], sizes = [%4, %5, 16, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5} -> tensor<?x?x16x16xf32> |

// mmt4d with %9 and %10 as inputs and the loaded accumulator %11 as the `outs` operand. |
%12 = linalg.mmt4d ins(%9, %10 : tensor<?x?x16x1xf32>, tensor<?x?x16x1xf32>) outs(%11 : tensor<?x?x16x16xf32>) -> tensor<?x?x16x16xf32> |

// Store the result back to %8, the same binding the accumulator was loaded from. |
flow.dispatch.tensor.store %12, %8, offsets = [0, 0, 0, 0], sizes = [%4, %5, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5} |

return |

} |

} |

} |
|
// Executable holding the unpack dispatch: turns a tiled tensor<?x?x16x16xf32> back into a plain tensor<?x?xf32>. |
flow.executable private @matmul_dynamic_dispatch_4 { |

// Public export: the (x, y, z) workgroup count is produced by workgroup_count_from_slice from the four index workload values. |
flow.executable.export public @matmul_dynamic_dispatch_4_unpack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) { |

%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3 |

flow.return %x, %y, %z : index, index, index |

} |

builtin.module { |

// Dispatch body. %arg0 is the read-only packed input, %arg5 the write-only unpacked output; %arg1..%arg4 are the workload values. |
func.func @matmul_dynamic_dispatch_4_unpack_f32(%arg0: !flow.dispatch.tensor<readonly:tensor<?x?x16x16xf32>>, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>) { |

// Bind the four workload values to ordinals 0-3. |
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index |

%1 = flow.dispatch.workload.ordinal %arg2, 1 : index |

%2 = flow.dispatch.workload.ordinal %arg3, 2 : index |

%3 = flow.dispatch.workload.ordinal %arg4, 3 : index |

// Attach dynamic dims: ordinals 0/1 size the packed input's outer dims, ordinals 2/3 size the 2-D output. |
%4 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x?x16x16xf32>>{%0, %1} |

%5 = flow.dispatch.tie_shape %arg5 : !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%2, %3} |

// Load the entire packed input (zero offsets, unit strides). |
%6 = flow.dispatch.tensor.load %4, offsets = [0, 0, 0, 0], sizes = [%0, %1, 16, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x16xf32>>{%0, %1} -> tensor<?x?x16x16xf32> |

// Destination tensor for the unpack, sized by the output dims. |
%7 = tensor.empty(%2, %3) : tensor<?x?xf32> |

// Undo the 16x16 inner tiling on dims 0 and 1. |
%unpack = tensor.unpack %6 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %7 : tensor<?x?x16x16xf32> -> tensor<?x?xf32> |

// Write the full unpacked result to the output binding. |
flow.dispatch.tensor.store %unpack, %5, offsets = [0, 0], sizes = [%2, %3], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%2, %3} |

return |

} |

} |

} |
|
// Entry point: imports three buffer views as dynamic 2-D f32 tensors, chains five dispatches (three pack, one mmt4d, one unpack) and exports the result as a buffer view. |
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} { |

// Query both dims of input0 and import it as tensor<?x?xf32>. |
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index |

%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index |

%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xf32>{%0, %1} |

// Query both dims of input1 and import it as tensor<?x?xf32>. |
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index |

%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index |

%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xf32>{%3, %4} |

// Query both dims of input2 and import it as tensor<?x?xf32>. |
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index |

%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index |

%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xf32>{%6, %7} |

// %9 = #map applied to dim 0 of input0: the first dynamic dim of the packed input0. |
%9 = affine.apply #map()[%0] |

// Pack dispatch for input0: produces tensor<?x?x16x1xf32> with dynamic dims {%9, %1}. |
%10 = flow.dispatch @matmul_dynamic_dispatch_0::@matmul_dynamic_dispatch_0_pack_f32[%0, %9, %1](%2, %0, %9, %1) : (tensor<?x?xf32>{%0, %1}, index, index, index) -> tensor<?x?x16x1xf32>{%9, %1} |

// %11 = #map applied to dim 1 of input1. |
%11 = affine.apply #map()[%4] |

// Pack dispatch for input1: produces tensor<?x?x16x1xf32> with dynamic dims {%11, %3}, i.e. the tiled dim 1 is listed first. |
%12 = flow.dispatch @matmul_dynamic_dispatch_1::@matmul_dynamic_dispatch_1_pack_f32[%4, %11, %3](%5, %4, %11, %3) : (tensor<?x?xf32>{%3, %4}, index, index, index) -> tensor<?x?x16x1xf32>{%11, %3} |

// #map applied to both dims of input2. |
%13 = affine.apply #map()[%6] |

%14 = affine.apply #map()[%7] |

// Pack dispatch for input2: produces tensor<?x?x16x16xf32> with dynamic dims {%13, %14}. |
%15 = flow.dispatch @matmul_dynamic_dispatch_2::@matmul_dynamic_dispatch_2_pack_f32[%6, %7, %13, %14](%8, %6, %7, %13, %14) : (tensor<?x?xf32>{%6, %7}, index, index, index, index) -> tensor<?x?x16x16xf32>{%13, %14} |

// mmt4d dispatch over the three packed tensors; the result is written as `-> %15{...}`, i.e. it is tied to operand %15 instead of naming a fresh tensor type. |
%16 = flow.dispatch @matmul_dynamic_dispatch_3::@matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32[%9, %1, %11, %3, %13, %14](%10, %12, %15, %9, %1, %11, %3, %13, %14) : (tensor<?x?x16x1xf32>{%9, %1}, tensor<?x?x16x1xf32>{%11, %3}, tensor<?x?x16x16xf32>{%13, %14}, index, index, index, index, index, index) -> %15{%13, %14} |

// Unpack dispatch: converts the tiled result back to tensor<?x?xf32> with input2's dims {%6, %7}. |
%17 = flow.dispatch @matmul_dynamic_dispatch_4::@matmul_dynamic_dispatch_4_unpack_f32[%13, %14, %6, %7](%16, %13, %14, %6, %7) : (tensor<?x?x16x16xf32>{%13, %14}, index, index, index, index) -> tensor<?x?xf32>{%6, %7} |

// Export the result as the "output0" buffer view and return it. |
%18 = hal.tensor.export %17 "output0" : tensor<?x?xf32>{%6, %7} -> !hal.buffer_view |

return %18 : !hal.buffer_view |

} |
|
} |
|
|
|
|
|
// -----// IR Dump After VerifyInputPass (iree-stream-verify-input) //----- // |
|
// Executable target alias: "llvm-cpu" backend, "embedded-elf-x86_64" format, cpu = "znver4", native_vector_size = 64, ukernels = "all". |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "znver4", cpu_features = "+mmx,+popcnt,+sse,+sse2,+sse3,+ssse3,+sse4.1,+sse4.2,+avx,+avx2,+sse4a,+fma,+avx512f,+bmi,+bmi2,+aes,+pclmul,+avx512vl,+avx512bw,+avx512dq,+avx512cd,+avx512vbmi,+avx512ifma,+avx512vpopcntdq,+avx512vbmi2,+gfni,+vpclmulqdq,+avx512vnni,+avx512bitalg,+avx512bf16,+adx,+clflushopt,+clwb,+clzero,+cx16,+cx8,+f16c,+fsgsbase,+crc32,+invpcid,+rdpru,+sahf,+lzcnt,+movbe,+mwaitx,+x87,+pku,+prfchw,+rdpid,+rdrnd,+rdseed,+sha,+shstk,+vaes,+wbnoinvd,+xsave,+xsavec,+xsaveopt,+xsaves,+fxsr,+evex512", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "all"}> |

// Shared affine map: ceil-division of one symbol by 16; applied below through `affine.apply #map`. |
#map = affine_map<()[s0] -> (s0 ceildiv 16)> |

// Device target alias whose executable_targets list contains the executable target defined above. |
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#executable_target_embedded_elf_x86_64_]}> |
|
module attributes {hal.device.targets = [#device_target_llvm_cpu]} { |
|
// Executable holding a pack dispatch: tiles a tensor<?x?xf32> into tensor<?x?x16x1xf32>. |
flow.executable private @matmul_dynamic_dispatch_0 { |

// Public export: the (x, y, z) workgroup count is produced by workgroup_count_from_slice from the three index workload values. |
flow.executable.export public @matmul_dynamic_dispatch_0_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) { |

%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2 |

flow.return %x, %y, %z : index, index, index |

} |

builtin.module { |

// Dispatch body. %arg0 is the read-only source, %arg4 the write-only packed destination; %arg1..%arg3 are the workload values. |
func.func @matmul_dynamic_dispatch_0_pack_f32(%arg0: !flow.dispatch.tensor<readonly:tensor<?x?xf32>>, %arg1: index, %arg2: index, %arg3: index, %arg4: !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>) { |

// Zero constant used as the pack padding value. |
%cst = arith.constant 0.000000e+00 : f32 |

// Bind the three workload values to ordinals 0-2. |
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index |

%1 = flow.dispatch.workload.ordinal %arg2, 1 : index |

%2 = flow.dispatch.workload.ordinal %arg3, 2 : index |

// Attach dynamic dims: the source is %0 x %2, the packed destination has outer dims %1 x %2. |
%3 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %2} |

%4 = flow.dispatch.tie_shape %arg4 : !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2} |

// Load the entire source tensor (zero offsets, unit strides). |
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [%0, %2], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %2} -> tensor<?x?xf32> |

// ceildiv(%0, 16) through #map: the first outer dim of the pack destination, computed locally. |
%6 = affine.apply #map()[%0] |

// Pack destination sized as %6 x %2 x 16 x 1. |
%7 = tensor.empty(%6, %2) : tensor<?x?x16x1xf32> |

// Tile dim 0 by 16 and dim 1 by 1 with no outer permutation; %cst is the padding value. |
%pack = tensor.pack %5 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 1] into %7 : tensor<?x?xf32> -> tensor<?x?x16x1xf32> |

// Store the full packed tensor; the store sizes use workload values %1 and %2 for the outer dims. |
flow.dispatch.tensor.store %pack, %4, offsets = [0, 0, 0, 0], sizes = [%1, %2, 16, 1], strides = [1, 1, 1, 1] : tensor<?x?x16x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2} |

return |

} |

} |

} |
|
// Executable holding a pack dispatch with permuted dims: tiles a tensor<?x?xf32> into tensor<?x?x16x1xf32>. |
flow.executable private @matmul_dynamic_dispatch_1 { |

// Public export: the (x, y, z) workgroup count is produced by workgroup_count_from_slice from the three index workload values. |
flow.executable.export public @matmul_dynamic_dispatch_1_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) { |

%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2 |

flow.return %x, %y, %z : index, index, index |

} |

builtin.module { |

// Dispatch body. %arg0 is the read-only source, %arg4 the write-only packed destination; %arg1..%arg3 are the workload values. |
func.func @matmul_dynamic_dispatch_1_pack_f32(%arg0: !flow.dispatch.tensor<readonly:tensor<?x?xf32>>, %arg1: index, %arg2: index, %arg3: index, %arg4: !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>) { |

// Zero constant used as the pack padding value. |
%cst = arith.constant 0.000000e+00 : f32 |

// Bind the three workload values to ordinals 0-2. |
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index |

%1 = flow.dispatch.workload.ordinal %arg2, 1 : index |

%2 = flow.dispatch.workload.ordinal %arg3, 2 : index |

// Attach dynamic dims: the source is %2 x %0 (ordinal 2 first), the packed destination has outer dims %1 x %2. |
%3 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %0} |

%4 = flow.dispatch.tie_shape %arg4 : !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2} |

// Load the entire source tensor (zero offsets, unit strides). |
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [%2, %0], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %0} -> tensor<?x?xf32> |

// ceildiv(%0, 16) through #map; %0 is the source's dim 1 here. |
%6 = affine.apply #map()[%0] |

// Pack destination sized as %6 x %2 x 16 x 1. |
%7 = tensor.empty(%6, %2) : tensor<?x?x16x1xf32> |

// inner_dims_pos = [1, 0] tiles source dim 1 by 16 and dim 0 by 1; outer_dims_perm = [1, 0] swaps the outer dims; %cst is the padding value. |
%pack = tensor.pack %5 padding_value(%cst : f32) outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [16, 1] into %7 : tensor<?x?xf32> -> tensor<?x?x16x1xf32> |

// Store the full packed tensor; the store sizes use workload values %1 and %2 for the outer dims. |
flow.dispatch.tensor.store %pack, %4, offsets = [0, 0, 0, 0], sizes = [%1, %2, 16, 1], strides = [1, 1, 1, 1] : tensor<?x?x16x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2} |

return |

} |

} |

} |
|
// Executable holding a pack dispatch: tiles a tensor<?x?xf32> into tensor<?x?x16x16xf32>. |
flow.executable private @matmul_dynamic_dispatch_2 { |

// Public export: the (x, y, z) workgroup count is produced by workgroup_count_from_slice from the four index workload values. |
flow.executable.export public @matmul_dynamic_dispatch_2_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) { |

%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3 |

flow.return %x, %y, %z : index, index, index |

} |

builtin.module { |

// Dispatch body. %arg0 is the read-only source, %arg5 the write-only packed destination; %arg1..%arg4 are the workload values. |
func.func @matmul_dynamic_dispatch_2_pack_f32(%arg0: !flow.dispatch.tensor<readonly:tensor<?x?xf32>>, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>) { |

// Zero constant used as the pack padding value. |
%cst = arith.constant 0.000000e+00 : f32 |

// Bind the four workload values to ordinals 0-3. |
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index |

%1 = flow.dispatch.workload.ordinal %arg2, 1 : index |

%2 = flow.dispatch.workload.ordinal %arg3, 2 : index |

%3 = flow.dispatch.workload.ordinal %arg4, 3 : index |

// Attach dynamic dims: the source is %0 x %1, the packed destination has outer dims %2 x %3. |
%4 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1} |

%5 = flow.dispatch.tie_shape %arg5 : !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%2, %3} |

// Load the entire source tensor (zero offsets, unit strides). |
%6 = flow.dispatch.tensor.load %4, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1} -> tensor<?x?xf32> |

// ceildiv by 16 of each source dim through #map: %7 from dim 1, %8 from dim 0. |
%7 = affine.apply #map()[%1] |

%8 = affine.apply #map()[%0] |

// Pack destination sized as %8 x %7 x 16 x 16. |
%9 = tensor.empty(%8, %7) : tensor<?x?x16x16xf32> |

// Tile both dims by 16 with no outer permutation; %cst is the padding value. |
%pack = tensor.pack %6 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %9 : tensor<?x?xf32> -> tensor<?x?x16x16xf32> |

// Store the full packed tensor; the store sizes use workload values %2 and %3 for the outer dims. |
flow.dispatch.tensor.store %pack, %5, offsets = [0, 0, 0, 0], sizes = [%2, %3, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%2, %3} |

return |

} |

} |

} |
|
// Executable holding the linalg.mmt4d dispatch over two ?x?x16x1 inputs and a ?x?x16x16 read-write accumulator. |
flow.executable private @matmul_dynamic_dispatch_3 { |

// Public export: the (x, y, z) workgroup count is produced by workgroup_count_from_slice from the six index workload values. |
flow.executable.export public @matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index) -> (index, index, index) { |

%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3, %arg4, %arg5 |

flow.return %x, %y, %z : index, index, index |

} |

builtin.module { |

// Dispatch body. %arg0 and %arg1 are read-only inputs, %arg2 is the read-write accumulator; %arg3..%arg8 are the workload values. |
func.func @matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32(%arg0: !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>, %arg1: !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>, %arg2: !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index) { |

// Bind the six workload values to ordinals 0-5. |
%0 = flow.dispatch.workload.ordinal %arg3, 0 : index |

%1 = flow.dispatch.workload.ordinal %arg4, 1 : index |

%2 = flow.dispatch.workload.ordinal %arg5, 2 : index |

%3 = flow.dispatch.workload.ordinal %arg6, 3 : index |

%4 = flow.dispatch.workload.ordinal %arg7, 4 : index |

%5 = flow.dispatch.workload.ordinal %arg8, 5 : index |

// Attach dynamic outer dims: ordinals 0/1 to %arg0, 2/3 to %arg1, 4/5 to the accumulator %arg2. |
%6 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%0, %1} |

%7 = flow.dispatch.tie_shape %arg1 : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%2, %3} |

%8 = flow.dispatch.tie_shape %arg2 : !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5} |

// Load all three tensors in full (zero offsets, unit strides). |
%9 = flow.dispatch.tensor.load %6, offsets = [0, 0, 0, 0], sizes = [%0, %1, 16, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%0, %1} -> tensor<?x?x16x1xf32> |

%10 = flow.dispatch.tensor.load %7, offsets = [0, 0, 0, 0], sizes = [%2, %3, 16, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%2, %3} -> tensor<?x?x16x1xf32> |

%11 = flow.dispatch.tensor.load %8, offsets = [0, 0, 0, 0], sizes = [%4, %5, 16, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5} -> tensor<?x?x16x16xf32> |

// mmt4d with %9 and %10 as inputs and the loaded accumulator %11 as the `outs` operand. |
%12 = linalg.mmt4d ins(%9, %10 : tensor<?x?x16x1xf32>, tensor<?x?x16x1xf32>) outs(%11 : tensor<?x?x16x16xf32>) -> tensor<?x?x16x16xf32> |

// Store the result back to %8, the same binding the accumulator was loaded from. |
flow.dispatch.tensor.store %12, %8, offsets = [0, 0, 0, 0], sizes = [%4, %5, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5} |

return |

} |

} |

} |
|
flow.executable private @matmul_dynamic_dispatch_4 { |
|
flow.executable.export public @matmul_dynamic_dispatch_4_unpack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) { |
|
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3 |
|
flow.return %x, %y, %z : index, index, index |
|
} |
|
builtin.module { |
|
func.func @matmul_dynamic_dispatch_4_unpack_f32(%arg0: !flow.dispatch.tensor<readonly:tensor<?x?x16x16xf32>>, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>) { |
|
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index |
|
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index |
|
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index |
|
%3 = flow.dispatch.workload.ordinal %arg4, 3 : index |
|
%4 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x?x16x16xf32>>{%0, %1} |
|
%5 = flow.dispatch.tie_shape %arg5 : !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%2, %3} |
|
%6 = flow.dispatch.tensor.load %4, offsets = [0, 0, 0, 0], sizes = [%0, %1, 16, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x16xf32>>{%0, %1} -> tensor<?x?x16x16xf32> |
|
%7 = tensor.empty(%2, %3) : tensor<?x?xf32> |
|
%unpack = tensor.unpack %6 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %7 : tensor<?x?x16x16xf32> -> tensor<?x?xf32> |
|
flow.dispatch.tensor.store %unpack, %5, offsets = [0, 0], sizes = [%2, %3], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%2, %3} |
|
return |
|
} |
|
} |
|
} |
|
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} { |
|
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index |
|
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index |
|
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xf32>{%0, %1} |
|
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index |
|
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index |
|
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xf32>{%3, %4} |
|
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index |
|
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index |
|
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xf32>{%6, %7} |
|
%9 = affine.apply #map()[%0] |
|
%10 = flow.dispatch @matmul_dynamic_dispatch_0::@matmul_dynamic_dispatch_0_pack_f32[%0, %9, %1](%2, %0, %9, %1) : (tensor<?x?xf32>{%0, %1}, index, index, index) -> tensor<?x?x16x1xf32>{%9, %1} |
|
%11 = affine.apply #map()[%4] |
|
%12 = flow.dispatch @matmul_dynamic_dispatch_1::@matmul_dynamic_dispatch_1_pack_f32[%4, %11, %3](%5, %4, %11, %3) : (tensor<?x?xf32>{%3, %4}, index, index, index) -> tensor<?x?x16x1xf32>{%11, %3} |
|
%13 = affine.apply #map()[%6] |
|
%14 = affine.apply #map()[%7] |
|
%15 = flow.dispatch @matmul_dynamic_dispatch_2::@matmul_dynamic_dispatch_2_pack_f32[%6, %7, %13, %14](%8, %6, %7, %13, %14) : (tensor<?x?xf32>{%6, %7}, index, index, index, index) -> tensor<?x?x16x16xf32>{%13, %14} |
|
%16 = flow.dispatch @matmul_dynamic_dispatch_3::@matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32[%9, %1, %11, %3, %13, %14](%10, %12, %15, %9, %1, %11, %3, %13, %14) : (tensor<?x?x16x1xf32>{%9, %1}, tensor<?x?x16x1xf32>{%11, %3}, tensor<?x?x16x16xf32>{%13, %14}, index, index, index, index, index, index) -> %15{%13, %14} |
|
%17 = flow.dispatch @matmul_dynamic_dispatch_4::@matmul_dynamic_dispatch_4_unpack_f32[%13, %14, %6, %7](%16, %13, %14, %6, %7) : (tensor<?x?x16x16xf32>{%13, %14}, index, index, index, index) -> tensor<?x?xf32>{%6, %7} |
|
%18 = hal.tensor.export %17 "output0" : tensor<?x?xf32>{%6, %7} -> !hal.buffer_view |
|
return %18 : !hal.buffer_view |
|
} |
|
} |
|
|
|
|
|
// -----// IR Dump After Canonicalizer (canonicalize) //----- // |
|
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} { |
|
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index |
|
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index |
|
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xf32>{%0, %1} |
|
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index |
|
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index |
|
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xf32>{%3, %4} |
|
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index |
|
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index |
|
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xf32>{%6, %7} |
|
%9 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%0] |
|
%10 = flow.dispatch @matmul_dynamic_dispatch_0::@matmul_dynamic_dispatch_0_pack_f32[%0, %9, %1](%2, %0, %9, %1) : (tensor<?x?xf32>{%0, %1}, index, index, index) -> tensor<?x?x16x1xf32>{%9, %1} |
|
%11 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%4] |
|
%12 = flow.dispatch @matmul_dynamic_dispatch_1::@matmul_dynamic_dispatch_1_pack_f32[%4, %11, %3](%5, %4, %11, %3) : (tensor<?x?xf32>{%3, %4}, index, index, index) -> tensor<?x?x16x1xf32>{%11, %3} |
|
%13 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%6] |
|
%14 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%7] |
|
%15 = flow.dispatch @matmul_dynamic_dispatch_2::@matmul_dynamic_dispatch_2_pack_f32[%6, %7, %13, %14](%8, %6, %7, %13, %14) : (tensor<?x?xf32>{%6, %7}, index, index, index, index) -> tensor<?x?x16x16xf32>{%13, %14} |
|
%16 = flow.dispatch @matmul_dynamic_dispatch_3::@matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32[%9, %1, %11, %3, %13, %14](%10, %12, %15, %9, %1, %11, %3, %13, %14) : (tensor<?x?x16x1xf32>{%9, %1}, tensor<?x?x16x1xf32>{%11, %3}, tensor<?x?x16x16xf32>{%13, %14}, index, index, index, index, index, index) -> %15{%13, %14} |
|
%17 = flow.dispatch @matmul_dynamic_dispatch_4::@matmul_dynamic_dispatch_4_unpack_f32[%13, %14, %6, %7](%16, %13, %14, %6, %7) : (tensor<?x?x16x16xf32>{%13, %14}, index, index, index, index) -> tensor<?x?xf32>{%6, %7} |
|
%18 = hal.tensor.export %17 "output0" : tensor<?x?xf32>{%6, %7} -> !hal.buffer_view |
|
return %18 : !hal.buffer_view |
|
} |
|
|
|
// -----// IR Dump After CSE (cse) //----- // |
|
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} { |
|
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index |
|
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index |
|
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xf32>{%0, %1} |
|
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index |
|
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index |
|
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xf32>{%3, %4} |
|
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index |
|
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index |
|
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xf32>{%6, %7} |
|
%9 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%0] |
|
%10 = flow.dispatch @matmul_dynamic_dispatch_0::@matmul_dynamic_dispatch_0_pack_f32[%0, %9, %1](%2, %0, %9, %1) : (tensor<?x?xf32>{%0, %1}, index, index, index) -> tensor<?x?x16x1xf32>{%9, %1} |
|
%11 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%4] |
|
%12 = flow.dispatch @matmul_dynamic_dispatch_1::@matmul_dynamic_dispatch_1_pack_f32[%4, %11, %3](%5, %4, %11, %3) : (tensor<?x?xf32>{%3, %4}, index, index, index) -> tensor<?x?x16x1xf32>{%11, %3} |
|
%13 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%6] |
|
%14 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%7] |
|
%15 = flow.dispatch @matmul_dynamic_dispatch_2::@matmul_dynamic_dispatch_2_pack_f32[%6, %7, %13, %14](%8, %6, %7, %13, %14) : (tensor<?x?xf32>{%6, %7}, index, index, index, index) -> tensor<?x?x16x16xf32>{%13, %14} |
|
%16 = flow.dispatch @matmul_dynamic_dispatch_3::@matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32[%9, %1, %11, %3, %13, %14](%10, %12, %15, %9, %1, %11, %3, %13, %14) : (tensor<?x?x16x1xf32>{%9, %1}, tensor<?x?x16x1xf32>{%11, %3}, tensor<?x?x16x16xf32>{%13, %14}, index, index, index, index, index, index) -> %15{%13, %14} |
|
%17 = flow.dispatch @matmul_dynamic_dispatch_4::@matmul_dynamic_dispatch_4_unpack_f32[%13, %14, %6, %7](%16, %13, %14, %6, %7) : (tensor<?x?x16x16xf32>{%13, %14}, index, index, index, index) -> tensor<?x?xf32>{%6, %7} |
|
%18 = hal.tensor.export %17 "output0" : tensor<?x?xf32>{%6, %7} -> !hal.buffer_view |
|
return %18 : !hal.buffer_view |
|
} |
|
|
|
// -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- // |
|
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} { |
|
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index |
|
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index |
|
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xf32>{%0, %1} |
|
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index |
|
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index |
|
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xf32>{%3, %4} |
|
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index |
|
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index |
|
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xf32>{%6, %7} |
|
%9 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%0] |
|
%10 = flow.dispatch @matmul_dynamic_dispatch_0::@matmul_dynamic_dispatch_0_pack_f32[%0, %9, %1](%2, %0, %9, %1) : (tensor<?x?xf32>{%0, %1}, index, index, index) -> tensor<?x?x16x1xf32>{%9, %1} |
|
%11 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%4] |
|
%12 = flow.dispatch @matmul_dynamic_dispatch_1::@matmul_dynamic_dispatch_1_pack_f32[%4, %11, %3](%5, %4, %11, %3) : (tensor<?x?xf32>{%3, %4}, index, index, index) -> tensor<?x?x16x1xf32>{%11, %3} |
|
%13 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%6] |
|
%14 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%7] |
|
%15 = flow.dispatch @matmul_dynamic_dispatch_2::@matmul_dynamic_dispatch_2_pack_f32[%6, %7, %13, %14](%8, %6, %7, %13, %14) : (tensor<?x?xf32>{%6, %7}, index, index, index, index) -> tensor<?x?x16x16xf32>{%13, %14} |
|
%16 = flow.dispatch @matmul_dynamic_dispatch_3::@matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32[%9, %1, %11, %3, %13, %14](%10, %12, %15, %9, %1, %11, %3, %13, %14) : (tensor<?x?x16x1xf32>{%9, %1}, tensor<?x?x16x1xf32>{%11, %3}, tensor<?x?x16x16xf32>{%13, %14}, index, index, index, index, index, index) -> %15{%13, %14} |
|
%17 = flow.dispatch @matmul_dynamic_dispatch_4::@matmul_dynamic_dispatch_4_unpack_f32[%13, %14, %6, %7](%16, %13, %14, %6, %7) : (tensor<?x?x16x16xf32>{%13, %14}, index, index, index, index) -> tensor<?x?xf32>{%6, %7} |
|
%18 = hal.tensor.export %17 "output0" : tensor<?x?xf32>{%6, %7} -> !hal.buffer_view |
|
return %18 : !hal.buffer_view |
|
} |
|
|
|
// -----// IR Dump After ApplyPatterns (iree-util-apply-patterns) //----- // |
|
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "znver4", cpu_features = "+mmx,+popcnt,+sse,+sse2,+sse3,+ssse3,+sse4.1,+sse4.2,+avx,+avx2,+sse4a,+fma,+avx512f,+bmi,+bmi2,+aes,+pclmul,+avx512vl,+avx512bw,+avx512dq,+avx512cd,+avx512vbmi,+avx512ifma,+avx512vpopcntdq,+avx512vbmi2,+gfni,+vpclmulqdq,+avx512vnni,+avx512bitalg,+avx512bf16,+adx,+clflushopt,+clwb,+clzero,+cx16,+cx8,+f16c,+fsgsbase,+crc32,+invpcid,+rdpru,+sahf,+lzcnt,+movbe,+mwaitx,+x87,+pku,+prfchw,+rdpid,+rdrnd,+rdseed,+sha,+shstk,+vaes,+wbnoinvd,+xsave,+xsavec,+xsaveopt,+xsaves,+fxsr,+evex512", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "all"}> |
|
#map = affine_map<()[s0] -> (s0 ceildiv 16)> |
|
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#executable_target_embedded_elf_x86_64_]}> |
|
module attributes {hal.device.targets = [#device_target_llvm_cpu]} { |
|
flow.executable private @matmul_dynamic_dispatch_0 { |
|
flow.executable.export public @matmul_dynamic_dispatch_0_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) { |
|
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2 |
|
flow.return %x, %y, %z : index, index, index |
|
} |
|
builtin.module { |
|
func.func @matmul_dynamic_dispatch_0_pack_f32(%arg0: !flow.dispatch.tensor<readonly:tensor<?x?xf32>>, %arg1: index, %arg2: index, %arg3: index, %arg4: !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>) { |
|
%cst = arith.constant 0.000000e+00 : f32 |
|
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index |
|
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index |
|
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index |
|
%3 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %2} |
|
%4 = flow.dispatch.tie_shape %arg4 : !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2} |
|
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [%0, %2], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %2} -> tensor<?x?xf32> |
|
%6 = affine.apply #map()[%0] |
|
%7 = tensor.empty(%6, %2) : tensor<?x?x16x1xf32> |
|
%pack = tensor.pack %5 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 1] into %7 : tensor<?x?xf32> -> tensor<?x?x16x1xf32> |
|
flow.dispatch.tensor.store %pack, %4, offsets = [0, 0, 0, 0], sizes = [%1, %2, 16, 1], strides = [1, 1, 1, 1] : tensor<?x?x16x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2} |
|
return |
|
} |
|
} |
|
} |
|
flow.executable private @matmul_dynamic_dispatch_1 { |
|
flow.executable.export public @matmul_dynamic_dispatch_1_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) { |
|
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2 |
|
flow.return %x, %y, %z : index, index, index |
|
} |
|
builtin.module { |
|
func.func @matmul_dynamic_dispatch_1_pack_f32(%arg0: !flow.dispatch.tensor<readonly:tensor<?x?xf32>>, %arg1: index, %arg2: index, %arg3: index, %arg4: !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>) { |
|
%cst = arith.constant 0.000000e+00 : f32 |
|
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index |
|
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index |
|
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index |
|
%3 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %0} |
|
%4 = flow.dispatch.tie_shape %arg4 : !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2} |
|
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [%2, %0], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %0} -> tensor<?x?xf32> |
|
%6 = affine.apply #map()[%0] |
|
%7 = tensor.empty(%6, %2) : tensor<?x?x16x1xf32> |
|
%pack = tensor.pack %5 padding_value(%cst : f32) outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [16, 1] into %7 : tensor<?x?xf32> -> tensor<?x?x16x1xf32> |
|
flow.dispatch.tensor.store %pack, %4, offsets = [0, 0, 0, 0], sizes = [%1, %2, 16, 1], strides = [1, 1, 1, 1] : tensor<?x?x16x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2} |
|
return |
|
} |
|
} |
|
} |
|
flow.executable private @matmul_dynamic_dispatch_2 { |
|
flow.executable.export public @matmul_dynamic_dispatch_2_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) { |
|
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3 |
|
flow.return %x, %y, %z : index, index, index |
|
} |
|
builtin.module { |
|
func.func @matmul_dynamic_dispatch_2_pack_f32(%arg0: !flow.dispatch.tensor<readonly:tensor<?x?xf32>>, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>) { |
|
%cst = arith.constant 0.000000e+00 : f32 |
|
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index |
|
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index |
|
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index |
|
%3 = flow.dispatch.workload.ordinal %arg4, 3 : index |
|
%4 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1} |
|
%5 = flow.dispatch.tie_shape %arg5 : !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%2, %3} |
|
%6 = flow.dispatch.tensor.load %4, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1} -> tensor<?x?xf32> |
|
%7 = affine.apply #map()[%1] |
|
%8 = affine.apply #map()[%0] |
|
%9 = tensor.empty(%8, %7) : tensor<?x?x16x16xf32> |
|
%pack = tensor.pack %6 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %9 : tensor<?x?xf32> -> tensor<?x?x16x16xf32> |
|
flow.dispatch.tensor.store %pack, %5, offsets = [0, 0, 0, 0], sizes = [%2, %3, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%2, %3} |
|
return |
|
} |
|
} |
|
} |
|
flow.executable private @matmul_dynamic_dispatch_3 { |
|
flow.executable.export public @matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index) -> (index, index, index) { |
|
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3, %arg4, %arg5 |
|
flow.return %x, %y, %z : index, index, index |
|
} |
|
builtin.module { |
|
func.func @matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32(%arg0: !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>, %arg1: !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>, %arg2: !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index) { |
|
%0 = flow.dispatch.workload.ordinal %arg3, 0 : index |
|
%1 = flow.dispatch.workload.ordinal %arg4, 1 : index |
|
%2 = flow.dispatch.workload.ordinal %arg5, 2 : index |
|
%3 = flow.dispatch.workload.ordinal %arg6, 3 : index |
|
%4 = flow.dispatch.workload.ordinal %arg7, 4 : index |
|
%5 = flow.dispatch.workload.ordinal %arg8, 5 : index |
|
%6 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%0, %1} |
|
%7 = flow.dispatch.tie_shape %arg1 : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%2, %3} |
|
%8 = flow.dispatch.tie_shape %arg2 : !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5} |
|
%9 = flow.dispatch.tensor.load %6, offsets = [0, 0, 0, 0], sizes = [%0, %1, 16, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%0, %1} -> tensor<?x?x16x1xf32> |
|
%10 = flow.dispatch.tensor.load %7, offsets = [0, 0, 0, 0], sizes = [%2, %3, 16, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%2, %3} -> tensor<?x?x16x1xf32> |
|
%11 = flow.dispatch.tensor.load %8, offsets = [0, 0, 0, 0], sizes = [%4, %5, 16, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5} -> tensor<?x?x16x16xf32> |
|
%12 = linalg.mmt4d ins(%9, %10 : tensor<?x?x16x1xf32>, tensor<?x?x16x1xf32>) outs(%11 : tensor<?x?x16x16xf32>) -> tensor<?x?x16x16xf32> |
|
flow.dispatch.tensor.store %12, %8, offsets = [0, 0, 0, 0], sizes = [%4, %5, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5} |
|
return |
|
} |
|
} |
|
} |
|
flow.executable private @matmul_dynamic_dispatch_4 { |
|
flow.executable.export public @matmul_dynamic_dispatch_4_unpack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) { |
|
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3 |
|
flow.return %x, %y, %z : index, index, index |
|
} |
|
builtin.module { |
|
func.func @matmul_dynamic_dispatch_4_unpack_f32(%arg0: !flow.dispatch.tensor<readonly:tensor<?x?x16x16xf32>>, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>) { |
|
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index |
|
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index |
|
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index |
|
%3 = flow.dispatch.workload.ordinal %arg4, 3 : index |
|
%4 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x?x16x16xf32>>{%0, %1} |
|
%5 = flow.dispatch.tie_shape %arg5 : !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%2, %3} |
|
%6 = flow.dispatch.tensor.load %4, offsets = [0, 0, 0, 0], sizes = [%0, %1, 16, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x16xf32>>{%0, %1} -> tensor<?x?x16x16xf32> |
|
%7 = tensor.empty(%2, %3) : tensor<?x?xf32> |
|
%unpack = tensor.unpack %6 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %7 : tensor<?x?x16x16xf32> -> tensor<?x?xf32> |
|
flow.dispatch.tensor.store %unpack, %5, offsets = [0, 0], sizes = [%2, %3], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%2, %3} |
|
return |
|
} |
|
} |
|
} |
|
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} { |
|
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index |
|
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index |
|
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xf32>{%0, %1} |
|
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index |
|
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index |
|
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xf32>{%3, %4} |
|
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index |
|
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index |
|
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xf32>{%6, %7} |
|
%9 = affine.apply #map()[%0] |
|
%10 = flow.dispatch @matmul_dynamic_dispatch_0::@matmul_dynamic_dispatch_0_pack_f32[%0, %9, %1](%2, %0, %9, %1) : (tensor<?x?xf32>{%0, %1}, index, index, index) -> tensor<?x?x16x1xf32>{%9, %1} |
|
%11 = affine.apply #map()[%4] |
|
%12 = flow.dispatch @matmul_dynamic_dispatch_1::@matmul_dynamic_dispatch_1_pack_f32[%4, %11, %3](%5, %4, %11, %3) : (tensor<?x?xf32>{%3, %4}, index, index, index) -> tensor<?x?x16x1xf32>{%11, %3} |
|
%13 = affine.apply #map()[%6] |
|
%14 = affine.apply #map()[%7] |
|
%15 = flow.dispatch @matmul_dynamic_dispatch_2::@matmul_dynamic_dispatch_2_pack_f32[%6, %7, %13, %14](%8, %6, %7, %13, %14) : (tensor<?x?xf32>{%6, %7}, index, index, index, index) -> tensor<?x?x16x16xf32>{%13, %14} |
|
%16 = flow.dispatch @matmul_dynamic_dispatch_3::@matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32[%9, %1, %11, %3, %13, %14](%10, %12, %15, %9, %1, %11, %3, %13, %14) : (tensor<?x?x16x1xf32>{%9, %1}, tensor<?x?x16x1xf32>{%11, %3}, tensor<?x?x16x16xf32>{%13, %14}, index, index, index, index, index, index) -> %15{%13, %14} |
|
%17 = flow.dispatch @matmul_dynamic_dispatch_4::@matmul_dynamic_dispatch_4_unpack_f32[%13, %14, %6, %7](%16, %13, %14, %6, %7) : (tensor<?x?x16x16xf32>{%13, %14}, index, index, index, index) -> tensor<?x?xf32>{%6, %7} |
|
%18 = hal.tensor.export %17 "output0" : tensor<?x?xf32>{%6, %7} -> !hal.buffer_view |
|
return %18 : !hal.buffer_view |
|
} |
|
} |
|
|
|
|
|
// -----// IR Dump After FoldGlobals (iree-util-fold-globals) //----- // |
|
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "znver4", cpu_features = "+mmx,+popcnt,+sse,+sse2,+sse3,+ssse3,+sse4.1,+sse4.2,+avx,+avx2,+sse4a,+fma,+avx512f,+bmi,+bmi2,+aes,+pclmul,+avx512vl,+avx512bw,+avx512dq,+avx512cd,+avx512vbmi,+avx512ifma,+avx512vpopcntdq,+avx512vbmi2,+gfni,+vpclmulqdq,+avx512vnni,+avx512bitalg,+avx512bf16,+adx,+clflushopt,+clwb,+clzero,+cx16,+cx8,+f16c,+fsgsbase,+crc32,+invpcid,+rdpru,+sahf,+lzcnt,+movbe,+mwaitx,+x87,+pku,+prfchw,+rdpid,+rdrnd,+rdseed,+sha,+shstk,+vaes,+wbnoinvd,+xsave,+xsavec,+xsaveopt,+xsaves,+fxsr,+evex512", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "all"}> |
|
#map = affine_map<()[s0] -> (s0 ceildiv 16)> |
|
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#executable_target_embedded_elf_x86_64_]}> |
|
module attributes {hal.device.targets = [#device_target_llvm_cpu]} { |
|
flow.executable private @matmul_dynamic_dispatch_0 { |
|
flow.executable.export public @matmul_dynamic_dispatch_0_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) { |
|
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2 |
|
flow.return %x, %y, %z : index, index, index |
|
} |
|
builtin.module { |
|
func.func @matmul_dynamic_dispatch_0_pack_f32(%arg0: !flow.dispatch.tensor<readonly:tensor<?x?xf32>>, %arg1: index, %arg2: index, %arg3: index, %arg4: !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>) { |
|
%cst = arith.constant 0.000000e+00 : f32 |
|
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index |
|
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index |
|
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index |
|
%3 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %2} |
|
%4 = flow.dispatch.tie_shape %arg4 : !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2} |
|
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [%0, %2], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %2} -> tensor<?x?xf32> |
|
%6 = affine.apply #map()[%0] |
|
%7 = tensor.empty(%6, %2) : tensor<?x?x16x1xf32> |
|
%pack = tensor.pack %5 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 1] into %7 : tensor<?x?xf32> -> tensor<?x?x16x1xf32> |
|
flow.dispatch.tensor.store %pack, %4, offsets = [0, 0, 0, 0], sizes = [%1, %2, 16, 1], strides = [1, 1, 1, 1] : tensor<?x?x16x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2} |
|
return |
|
} |
|
} |
|
} |
|
flow.executable private @matmul_dynamic_dispatch_1 { |
|
flow.executable.export public @matmul_dynamic_dispatch_1_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) { |
|
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2 |
|
flow.return %x, %y, %z : index, index, index |
|
} |
|
builtin.module { |
|
func.func @matmul_dynamic_dispatch_1_pack_f32(%arg0: !flow.dispatch.tensor<readonly:tensor<?x?xf32>>, %arg1: index, %arg2: index, %arg3: index, %arg4: !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>) { |
|
%cst = arith.constant 0.000000e+00 : f32 |
|
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index |
|
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index |
|
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index |
|
%3 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %0} |
|
%4 = flow.dispatch.tie_shape %arg4 : !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2} |
|
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [%2, %0], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %0} -> tensor<?x?xf32> |
|
%6 = affine.apply #map()[%0] |
|
%7 = tensor.empty(%6, %2) : tensor<?x?x16x1xf32> |
|
%pack = tensor.pack %5 padding_value(%cst : f32) outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [16, 1] into %7 : tensor<?x?xf32> -> tensor<?x?x16x1xf32> |
|
flow.dispatch.tensor.store %pack, %4, offsets = [0, 0, 0, 0], sizes = [%1, %2, 16, 1], strides = [1, 1, 1, 1] : tensor<?x?x16x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2} |
|
return |
|
} |
|
} |
|
} |
|
flow.executable private @matmul_dynamic_dispatch_2 { |
|
flow.executable.export public @matmul_dynamic_dispatch_2_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) { |
|
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3 |
|
flow.return %x, %y, %z : index, index, index |
|
} |
|
builtin.module { |
|
func.func @matmul_dynamic_dispatch_2_pack_f32(%arg0: !flow.dispatch.tensor<readonly:tensor<?x?xf32>>, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>) { |
|
%cst = arith.constant 0.000000e+00 : f32 |
|
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index |
|
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index |
|
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index |
|
%3 = flow.dispatch.workload.ordinal %arg4, 3 : index |
|
%4 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1} |
|
%5 = flow.dispatch.tie_shape %arg5 : !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%2, %3} |
|
%6 = flow.dispatch.tensor.load %4, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1} -> tensor<?x?xf32> |
|
%7 = affine.apply #map()[%1] |
|
%8 = affine.apply #map()[%0] |
|
%9 = tensor.empty(%8, %7) : tensor<?x?x16x16xf32> |
|
%pack = tensor.pack %6 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %9 : tensor<?x?xf32> -> tensor<?x?x16x16xf32> |
|
flow.dispatch.tensor.store %pack, %5, offsets = [0, 0, 0, 0], sizes = [%2, %3, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%2, %3} |
|
return |
|
} |
|
} |
|
} |
|
// Executable for dispatch 3: linalg.mmt4d on ?x?x16x1 inputs with a
// ?x?x16x16 read-write outs operand.
flow.executable private @matmul_dynamic_dispatch_3 { |
|
// Export entry: the region returns the (%x, %y, %z) values that
// flow.dispatch.workgroup_count_from_slice produces from the six workload operands.
flow.executable.export public @matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index) -> (index, index, index) { |
|
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3, %arg4, %arg5 |
|
flow.return %x, %y, %z : index, index, index |
|
} |
|
builtin.module { |
|
// Dispatch body. %arg0 and %arg1 are read-only inputs, %arg2 is read-write
// (loaded as the outs operand and stored back to). %arg3..%arg8 are bound to
// workload ordinals 0..5 and supply the dynamic outer dims, two per tensor.
func.func @matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32(%arg0: !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>, %arg1: !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>, %arg2: !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index) { |
|
%0 = flow.dispatch.workload.ordinal %arg3, 0 : index |
|
%1 = flow.dispatch.workload.ordinal %arg4, 1 : index |
|
%2 = flow.dispatch.workload.ordinal %arg5, 2 : index |
|
%3 = flow.dispatch.workload.ordinal %arg6, 3 : index |
|
%4 = flow.dispatch.workload.ordinal %arg7, 4 : index |
|
%5 = flow.dispatch.workload.ordinal %arg8, 5 : index |
|
// Shape ties: %arg0 -> {%0, %1}, %arg1 -> {%2, %3}, %arg2 -> {%4, %5}.
%6 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%0, %1} |
|
%7 = flow.dispatch.tie_shape %arg1 : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%2, %3} |
|
%8 = flow.dispatch.tie_shape %arg2 : !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5} |
|
// Full-extent loads (zero offsets, unit strides) of all three tensors.
%9 = flow.dispatch.tensor.load %6, offsets = [0, 0, 0, 0], sizes = [%0, %1, 16, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%0, %1} -> tensor<?x?x16x1xf32> |
|
%10 = flow.dispatch.tensor.load %7, offsets = [0, 0, 0, 0], sizes = [%2, %3, 16, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%2, %3} -> tensor<?x?x16x1xf32> |
|
%11 = flow.dispatch.tensor.load %8, offsets = [0, 0, 0, 0], sizes = [%4, %5, 16, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5} -> tensor<?x?x16x16xf32> |
|
// The current contents of %arg2 (%11) are the outs operand of the mmt4d.
%12 = linalg.mmt4d ins(%9, %10 : tensor<?x?x16x1xf32>, tensor<?x?x16x1xf32>) outs(%11 : tensor<?x?x16x16xf32>) -> tensor<?x?x16x16xf32> |
|
// Result is written back to the same binding (%8, i.e. %arg2).
flow.dispatch.tensor.store %12, %8, offsets = [0, 0, 0, 0], sizes = [%4, %5, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5} |
|
return |
|
} |
|
} |
|
} |
|
// Executable for dispatch 4: unpacks a ?x?x16x16 f32 tensor back to ?x?.
flow.executable private @matmul_dynamic_dispatch_4 { |
|
// Export entry: the region returns the (%x, %y, %z) values that
// flow.dispatch.workgroup_count_from_slice produces from the four workload operands.
flow.executable.export public @matmul_dynamic_dispatch_4_unpack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) { |
|
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3 |
|
flow.return %x, %y, %z : index, index, index |
|
} |
|
builtin.module { |
|
// Dispatch body. %arg0 is the read-only packed source, %arg5 the write-only
// destination. %arg1..%arg4 are bound to workload ordinals 0..3: ordinals 0/1
// are the packed outer dims, ordinals 2/3 the dims of the unpacked result.
func.func @matmul_dynamic_dispatch_4_unpack_f32(%arg0: !flow.dispatch.tensor<readonly:tensor<?x?x16x16xf32>>, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>) { |
|
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index |
|
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index |
|
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index |
|
%3 = flow.dispatch.workload.ordinal %arg4, 3 : index |
|
%4 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x?x16x16xf32>>{%0, %1} |
|
%5 = flow.dispatch.tie_shape %arg5 : !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%2, %3} |
|
%6 = flow.dispatch.tensor.load %4, offsets = [0, 0, 0, 0], sizes = [%0, %1, 16, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x16xf32>>{%0, %1} -> tensor<?x?x16x16xf32> |
|
// Destination of the unpack, sized directly from ordinals 2/3.
%7 = tensor.empty(%2, %3) : tensor<?x?xf32> |
|
// Inverse layout of the 16x16 pack: same perm, dim positions and tile sizes.
%unpack = tensor.unpack %6 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %7 : tensor<?x?x16x16xf32> -> tensor<?x?xf32> |
|
flow.dispatch.tensor.store %unpack, %5, offsets = [0, 0], sizes = [%2, %3], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%2, %3} |
|
return |
|
} |
|
} |
|
} |
|
// Entry point (carries iree.abi.stub). Imports three buffer views as ?x? f32
// tensors, packs each one with its own dispatch, runs the mmt4d dispatch,
// unpacks the result and exports it as "output0".
// No shape-compatibility check between the three inputs is visible in this function.
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} { |
|
// input0: dims %0 x %1.
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index |
|
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index |
|
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xf32>{%0, %1} |
|
// input1: dims %3 x %4.
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index |
|
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index |
|
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xf32>{%3, %4} |
|
// input2: dims %6 x %7.
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index |
|
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index |
|
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xf32>{%6, %7} |
|
// NOTE(review): #map is declared outside this excerpt - presumably
// `s0 ceildiv 16`, as in the later dumps of this module; confirm.
%9 = affine.apply #map()[%0] |
|
// Pack input0 -> ?x?x16x1 with outer dims {%9, %1}.
%10 = flow.dispatch @matmul_dynamic_dispatch_0::@matmul_dynamic_dispatch_0_pack_f32[%0, %9, %1](%2, %0, %9, %1) : (tensor<?x?xf32>{%0, %1}, index, index, index) -> tensor<?x?x16x1xf32>{%9, %1} |
|
%11 = affine.apply #map()[%4] |
|
// Pack input1 -> ?x?x16x1 with outer dims {%11, %3}: the #map-derived dim
// comes from input1's dim 1 (%4) and is placed first in the result.
%12 = flow.dispatch @matmul_dynamic_dispatch_1::@matmul_dynamic_dispatch_1_pack_f32[%4, %11, %3](%5, %4, %11, %3) : (tensor<?x?xf32>{%3, %4}, index, index, index) -> tensor<?x?x16x1xf32>{%11, %3} |
|
%13 = affine.apply #map()[%6] |
|
%14 = affine.apply #map()[%7] |
|
// Pack input2 -> ?x?x16x16 with outer dims {%13, %14}.
%15 = flow.dispatch @matmul_dynamic_dispatch_2::@matmul_dynamic_dispatch_2_pack_f32[%6, %7, %13, %14](%8, %6, %7, %13, %14) : (tensor<?x?xf32>{%6, %7}, index, index, index, index) -> tensor<?x?x16x16xf32>{%13, %14} |
|
// mmt4d over the packed tensors. The result is written as `%15{%13, %14}`,
// i.e. it is tied to operand %15 (the packed input2) rather than a fresh tensor.
%16 = flow.dispatch @matmul_dynamic_dispatch_3::@matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32[%9, %1, %11, %3, %13, %14](%10, %12, %15, %9, %1, %11, %3, %13, %14) : (tensor<?x?x16x1xf32>{%9, %1}, tensor<?x?x16x1xf32>{%11, %3}, tensor<?x?x16x16xf32>{%13, %14}, index, index, index, index, index, index) -> %15{%13, %14} |
|
// Unpack back to input2's original dims {%6, %7}.
%17 = flow.dispatch @matmul_dynamic_dispatch_4::@matmul_dynamic_dispatch_4_unpack_f32[%13, %14, %6, %7](%16, %13, %14, %6, %7) : (tensor<?x?x16x16xf32>{%13, %14}, index, index, index, index) -> tensor<?x?xf32>{%6, %7} |
|
%18 = hal.tensor.export %17 "output0" : tensor<?x?xf32>{%6, %7} -> !hal.buffer_view |
|
return %18 : !hal.buffer_view |
|
} |
|
} |
|
|
|
|
|
// -----// IR Dump After FuseGlobals (iree-util-fuse-globals) //----- // |
|
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "znver4", cpu_features = "+mmx,+popcnt,+sse,+sse2,+sse3,+ssse3,+sse4.1,+sse4.2,+avx,+avx2,+sse4a,+fma,+avx512f,+bmi,+bmi2,+aes,+pclmul,+avx512vl,+avx512bw,+avx512dq,+avx512cd,+avx512vbmi,+avx512ifma,+avx512vpopcntdq,+avx512vbmi2,+gfni,+vpclmulqdq,+avx512vnni,+avx512bitalg,+avx512bf16,+adx,+clflushopt,+clwb,+clzero,+cx16,+cx8,+f16c,+fsgsbase,+crc32,+invpcid,+rdpru,+sahf,+lzcnt,+movbe,+mwaitx,+x87,+pku,+prfchw,+rdpid,+rdrnd,+rdseed,+sha,+shstk,+vaes,+wbnoinvd,+xsave,+xsavec,+xsaveopt,+xsaves,+fxsr,+evex512", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "all"}> |
|
#map = affine_map<()[s0] -> (s0 ceildiv 16)> |
|
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#executable_target_embedded_elf_x86_64_]}> |
|
module attributes {hal.device.targets = [#device_target_llvm_cpu]} { |
|
// Executable for dispatch 0: packs a ?x? f32 source into ?x?x16x1 tiles.
flow.executable private @matmul_dynamic_dispatch_0 { |
|
// Export entry: the region returns the (%x, %y, %z) values that
// flow.dispatch.workgroup_count_from_slice produces from the three workload operands.
flow.executable.export public @matmul_dynamic_dispatch_0_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) { |
|
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2 |
|
flow.return %x, %y, %z : index, index, index |
|
} |
|
builtin.module { |
|
// Dispatch body. %arg0 is the read-only source, %arg4 the write-only
// destination; %arg1..%arg3 are bound to workload ordinals 0..2.
func.func @matmul_dynamic_dispatch_0_pack_f32(%arg0: !flow.dispatch.tensor<readonly:tensor<?x?xf32>>, %arg1: index, %arg2: index, %arg3: index, %arg4: !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>) { |
|
// Value handed to tensor.pack as padding_value below.
%cst = arith.constant 0.000000e+00 : f32 |
|
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index |
|
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index |
|
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index |
|
// Source is tied to shape {%0, %2}; destination to outer dims {%1, %2}.
%3 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %2} |
|
%4 = flow.dispatch.tie_shape %arg4 : !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2} |
|
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [%0, %2], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %2} -> tensor<?x?xf32> |
|
// #map is `s0 ceildiv 16` (declared above the module), so %6 = %0 ceildiv 16.
// It is recomputed here rather than reusing ordinal 1 (%1).
%6 = affine.apply #map()[%0] |
|
%7 = tensor.empty(%6, %2) : tensor<?x?x16x1xf32> |
|
// Dim 0 is tiled by 16 and dim 1 by 1; outer dims keep their order.
%pack = tensor.pack %5 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 1] into %7 : tensor<?x?xf32> -> tensor<?x?x16x1xf32> |
|
// The store uses the destination dims %1/%2 supplied by the caller.
flow.dispatch.tensor.store %pack, %4, offsets = [0, 0, 0, 0], sizes = [%1, %2, 16, 1], strides = [1, 1, 1, 1] : tensor<?x?x16x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2} |
|
return |
|
} |
|
} |
|
} |
|
// Executable for dispatch 1: packs a ?x? f32 source into ?x?x16x1 tiles with
// the two source dims swapped in the packed layout.
flow.executable private @matmul_dynamic_dispatch_1 { |
|
// Export entry: the region returns the (%x, %y, %z) values that
// flow.dispatch.workgroup_count_from_slice produces from the three workload operands.
flow.executable.export public @matmul_dynamic_dispatch_1_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) { |
|
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2 |
|
flow.return %x, %y, %z : index, index, index |
|
} |
|
builtin.module { |
|
// Dispatch body. %arg0 is the read-only source, %arg4 the write-only
// destination; %arg1..%arg3 are bound to workload ordinals 0..2.
func.func @matmul_dynamic_dispatch_1_pack_f32(%arg0: !flow.dispatch.tensor<readonly:tensor<?x?xf32>>, %arg1: index, %arg2: index, %arg3: index, %arg4: !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>) { |
|
// Value handed to tensor.pack as padding_value below.
%cst = arith.constant 0.000000e+00 : f32 |
|
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index |
|
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index |
|
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index |
|
// Note the order: the source is tied to {%2, %0} (ordinal 2 is dim 0,
// ordinal 0 is dim 1); the destination outer dims are {%1, %2}.
%3 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %0} |
|
%4 = flow.dispatch.tie_shape %arg4 : !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2} |
|
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [%2, %0], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %0} -> tensor<?x?xf32> |
|
// #map is `s0 ceildiv 16` (declared above the module), so %6 = %0 ceildiv 16,
// where %0 is the size of source dim 1.
%6 = affine.apply #map()[%0] |
|
%7 = tensor.empty(%6, %2) : tensor<?x?x16x1xf32> |
|
// inner_dims_pos = [1, 0]: source dim 1 is tiled by 16, source dim 0 by 1.
// outer_dims_perm = [1, 0]: the tiled dim 1 becomes outer dim 0 of the result.
%pack = tensor.pack %5 padding_value(%cst : f32) outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [16, 1] into %7 : tensor<?x?xf32> -> tensor<?x?x16x1xf32> |
|
// The store uses the destination dims %1/%2 supplied by the caller.
flow.dispatch.tensor.store %pack, %4, offsets = [0, 0, 0, 0], sizes = [%1, %2, 16, 1], strides = [1, 1, 1, 1] : tensor<?x?x16x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2} |
|
return |
|
} |
|
} |
|
} |
|
// Executable for dispatch 2: packs a ?x? f32 source into ?x?x16x16 tiles.
flow.executable private @matmul_dynamic_dispatch_2 { |
|
// Export entry: the region returns the (%x, %y, %z) values that
// flow.dispatch.workgroup_count_from_slice produces from the four workload operands.
flow.executable.export public @matmul_dynamic_dispatch_2_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) { |
|
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3 |
|
flow.return %x, %y, %z : index, index, index |
|
} |
|
builtin.module { |
|
// Dispatch body. %arg0 is the read-only source, %arg5 the write-only
// destination. %arg1..%arg4 are bound to workload ordinals 0..3: ordinals 0/1
// are the source dims, ordinals 2/3 the outer dims of the destination.
func.func @matmul_dynamic_dispatch_2_pack_f32(%arg0: !flow.dispatch.tensor<readonly:tensor<?x?xf32>>, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>) { |
|
// Value handed to tensor.pack as padding_value below.
%cst = arith.constant 0.000000e+00 : f32 |
|
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index |
|
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index |
|
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index |
|
%3 = flow.dispatch.workload.ordinal %arg4, 3 : index |
|
// Source is tied to shape {%0, %1}; destination to outer dims {%2, %3}.
%4 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1} |
|
%5 = flow.dispatch.tie_shape %arg5 : !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%2, %3} |
|
%6 = flow.dispatch.tensor.load %4, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1} -> tensor<?x?xf32> |
|
// #map is `s0 ceildiv 16` (declared above the module). The outer dims of the
// pack destination are recomputed from the source dims (%1 first, then %0)
// instead of reusing %2/%3.
%7 = affine.apply #map()[%1] |
|
%8 = affine.apply #map()[%0] |
|
// Operands are given in dim order: (#map(%0), #map(%1)).
%9 = tensor.empty(%8, %7) : tensor<?x?x16x16xf32> |
|
// Both source dims are tiled by 16; outer dims keep their order.
%pack = tensor.pack %6 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %9 : tensor<?x?xf32> -> tensor<?x?x16x16xf32> |
|
// The store uses the destination dims %2/%3 supplied by the caller.
flow.dispatch.tensor.store %pack, %5, offsets = [0, 0, 0, 0], sizes = [%2, %3, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%2, %3} |
|
return |
|
} |
|
} |
|
} |
|
// Executable for dispatch 3: linalg.mmt4d on ?x?x16x1 inputs with a
// ?x?x16x16 read-write outs operand.
flow.executable private @matmul_dynamic_dispatch_3 { |
|
// Export entry: the region returns the (%x, %y, %z) values that
// flow.dispatch.workgroup_count_from_slice produces from the six workload operands.
flow.executable.export public @matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index) -> (index, index, index) { |
|
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3, %arg4, %arg5 |
|
flow.return %x, %y, %z : index, index, index |
|
} |
|
builtin.module { |
|
// Dispatch body. %arg0 and %arg1 are read-only inputs, %arg2 is read-write
// (loaded as the outs operand and stored back to). %arg3..%arg8 are bound to
// workload ordinals 0..5 and supply the dynamic outer dims, two per tensor.
func.func @matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32(%arg0: !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>, %arg1: !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>, %arg2: !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index) { |
|
%0 = flow.dispatch.workload.ordinal %arg3, 0 : index |
|
%1 = flow.dispatch.workload.ordinal %arg4, 1 : index |
|
%2 = flow.dispatch.workload.ordinal %arg5, 2 : index |
|
%3 = flow.dispatch.workload.ordinal %arg6, 3 : index |
|
%4 = flow.dispatch.workload.ordinal %arg7, 4 : index |
|
%5 = flow.dispatch.workload.ordinal %arg8, 5 : index |
|
// Shape ties: %arg0 -> {%0, %1}, %arg1 -> {%2, %3}, %arg2 -> {%4, %5}.
%6 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%0, %1} |
|
%7 = flow.dispatch.tie_shape %arg1 : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%2, %3} |
|
%8 = flow.dispatch.tie_shape %arg2 : !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5} |
|
// Full-extent loads (zero offsets, unit strides) of all three tensors.
%9 = flow.dispatch.tensor.load %6, offsets = [0, 0, 0, 0], sizes = [%0, %1, 16, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%0, %1} -> tensor<?x?x16x1xf32> |
|
%10 = flow.dispatch.tensor.load %7, offsets = [0, 0, 0, 0], sizes = [%2, %3, 16, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%2, %3} -> tensor<?x?x16x1xf32> |
|
%11 = flow.dispatch.tensor.load %8, offsets = [0, 0, 0, 0], sizes = [%4, %5, 16, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5} -> tensor<?x?x16x16xf32> |
|
// The current contents of %arg2 (%11) are the outs operand of the mmt4d.
%12 = linalg.mmt4d ins(%9, %10 : tensor<?x?x16x1xf32>, tensor<?x?x16x1xf32>) outs(%11 : tensor<?x?x16x16xf32>) -> tensor<?x?x16x16xf32> |
|
// Result is written back to the same binding (%8, i.e. %arg2).
flow.dispatch.tensor.store %12, %8, offsets = [0, 0, 0, 0], sizes = [%4, %5, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5} |
|
return |
|
} |
|
} |
|
} |
|
// Executable for dispatch 4: unpacks a ?x?x16x16 f32 tensor back to ?x?.
flow.executable private @matmul_dynamic_dispatch_4 { |
|
// Export entry: the region returns the (%x, %y, %z) values that
// flow.dispatch.workgroup_count_from_slice produces from the four workload operands.
flow.executable.export public @matmul_dynamic_dispatch_4_unpack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) { |
|
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3 |
|
flow.return %x, %y, %z : index, index, index |
|
} |
|
builtin.module { |
|
// Dispatch body. %arg0 is the read-only packed source, %arg5 the write-only
// destination. %arg1..%arg4 are bound to workload ordinals 0..3: ordinals 0/1
// are the packed outer dims, ordinals 2/3 the dims of the unpacked result.
func.func @matmul_dynamic_dispatch_4_unpack_f32(%arg0: !flow.dispatch.tensor<readonly:tensor<?x?x16x16xf32>>, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>) { |
|
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index |
|
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index |
|
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index |
|
%3 = flow.dispatch.workload.ordinal %arg4, 3 : index |
|
%4 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x?x16x16xf32>>{%0, %1} |
|
%5 = flow.dispatch.tie_shape %arg5 : !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%2, %3} |
|
%6 = flow.dispatch.tensor.load %4, offsets = [0, 0, 0, 0], sizes = [%0, %1, 16, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x16xf32>>{%0, %1} -> tensor<?x?x16x16xf32> |
|
// Destination of the unpack, sized directly from ordinals 2/3.
%7 = tensor.empty(%2, %3) : tensor<?x?xf32> |
|
// Inverse layout of the 16x16 pack: same perm, dim positions and tile sizes.
%unpack = tensor.unpack %6 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %7 : tensor<?x?x16x16xf32> -> tensor<?x?xf32> |
|
flow.dispatch.tensor.store %unpack, %5, offsets = [0, 0], sizes = [%2, %3], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%2, %3} |
|
return |
|
} |
|
} |
|
} |
|
// Entry point (carries iree.abi.stub). Imports three buffer views as ?x? f32
// tensors, packs each one with its own dispatch, runs the mmt4d dispatch,
// unpacks the result and exports it as "output0".
// No shape-compatibility check between the three inputs is visible in this function.
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} { |
|
// input0: dims %0 x %1.
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index |
|
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index |
|
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xf32>{%0, %1} |
|
// input1: dims %3 x %4.
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index |
|
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index |
|
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xf32>{%3, %4} |
|
// input2: dims %6 x %7.
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index |
|
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index |
|
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xf32>{%6, %7} |
|
// #map is `s0 ceildiv 16` (declared above the module): %9 = %0 ceildiv 16.
%9 = affine.apply #map()[%0] |
|
// Pack input0 -> ?x?x16x1 with outer dims {%9, %1}.
%10 = flow.dispatch @matmul_dynamic_dispatch_0::@matmul_dynamic_dispatch_0_pack_f32[%0, %9, %1](%2, %0, %9, %1) : (tensor<?x?xf32>{%0, %1}, index, index, index) -> tensor<?x?x16x1xf32>{%9, %1} |
|
%11 = affine.apply #map()[%4] |
|
// Pack input1 -> ?x?x16x1 with outer dims {%11, %3}: the ceildiv-16 dim
// comes from input1's dim 1 (%4) and is placed first in the result.
%12 = flow.dispatch @matmul_dynamic_dispatch_1::@matmul_dynamic_dispatch_1_pack_f32[%4, %11, %3](%5, %4, %11, %3) : (tensor<?x?xf32>{%3, %4}, index, index, index) -> tensor<?x?x16x1xf32>{%11, %3} |
|
%13 = affine.apply #map()[%6] |
|
%14 = affine.apply #map()[%7] |
|
// Pack input2 -> ?x?x16x16 with outer dims {%13, %14}.
%15 = flow.dispatch @matmul_dynamic_dispatch_2::@matmul_dynamic_dispatch_2_pack_f32[%6, %7, %13, %14](%8, %6, %7, %13, %14) : (tensor<?x?xf32>{%6, %7}, index, index, index, index) -> tensor<?x?x16x16xf32>{%13, %14} |
|
// mmt4d over the packed tensors. The result is written as `%15{%13, %14}`,
// i.e. it is tied to operand %15 (the packed input2) rather than a fresh tensor.
%16 = flow.dispatch @matmul_dynamic_dispatch_3::@matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32[%9, %1, %11, %3, %13, %14](%10, %12, %15, %9, %1, %11, %3, %13, %14) : (tensor<?x?x16x1xf32>{%9, %1}, tensor<?x?x16x1xf32>{%11, %3}, tensor<?x?x16x16xf32>{%13, %14}, index, index, index, index, index, index) -> %15{%13, %14} |
|
// Unpack back to input2's original dims {%6, %7}.
%17 = flow.dispatch @matmul_dynamic_dispatch_4::@matmul_dynamic_dispatch_4_unpack_f32[%13, %14, %6, %7](%16, %13, %14, %6, %7) : (tensor<?x?x16x16xf32>{%13, %14}, index, index, index, index) -> tensor<?x?xf32>{%6, %7} |
|
%18 = hal.tensor.export %17 "output0" : tensor<?x?xf32>{%6, %7} -> !hal.buffer_view |
|
return %18 : !hal.buffer_view |
|
} |
|
} |
|
|
|
|
|
// -----// IR Dump After IPO (iree-util-ipo) //----- // |
|
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "znver4", cpu_features = "+mmx,+popcnt,+sse,+sse2,+sse3,+ssse3,+sse4.1,+sse4.2,+avx,+avx2,+sse4a,+fma,+avx512f,+bmi,+bmi2,+aes,+pclmul,+avx512vl,+avx512bw,+avx512dq,+avx512cd,+avx512vbmi,+avx512ifma,+avx512vpopcntdq,+avx512vbmi2,+gfni,+vpclmulqdq,+avx512vnni,+avx512bitalg,+avx512bf16,+adx,+clflushopt,+clwb,+clzero,+cx16,+cx8,+f16c,+fsgsbase,+crc32,+invpcid,+rdpru,+sahf,+lzcnt,+movbe,+mwaitx,+x87,+pku,+prfchw,+rdpid,+rdrnd,+rdseed,+sha,+shstk,+vaes,+wbnoinvd,+xsave,+xsavec,+xsaveopt,+xsaves,+fxsr,+evex512", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "all"}> |
|
#map = affine_map<()[s0] -> (s0 ceildiv 16)> |
|
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#executable_target_embedded_elf_x86_64_]}> |
|
module attributes {hal.device.targets = [#device_target_llvm_cpu]} { |
|
// Executable for dispatch 0: packs a ?x? f32 source into ?x?x16x1 tiles.
flow.executable private @matmul_dynamic_dispatch_0 { |
|
// Export entry: the region returns the (%x, %y, %z) values that
// flow.dispatch.workgroup_count_from_slice produces from the three workload operands.
flow.executable.export public @matmul_dynamic_dispatch_0_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) { |
|
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2 |
|
flow.return %x, %y, %z : index, index, index |
|
} |
|
builtin.module { |
|
// Dispatch body. %arg0 is the read-only source, %arg4 the write-only
// destination; %arg1..%arg3 are bound to workload ordinals 0..2.
func.func @matmul_dynamic_dispatch_0_pack_f32(%arg0: !flow.dispatch.tensor<readonly:tensor<?x?xf32>>, %arg1: index, %arg2: index, %arg3: index, %arg4: !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>) { |
|
// Value handed to tensor.pack as padding_value below.
%cst = arith.constant 0.000000e+00 : f32 |
|
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index |
|
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index |
|
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index |
|
// Source is tied to shape {%0, %2}; destination to outer dims {%1, %2}.
%3 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %2} |
|
%4 = flow.dispatch.tie_shape %arg4 : !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2} |
|
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [%0, %2], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %2} -> tensor<?x?xf32> |
|
// #map is `s0 ceildiv 16` (declared above the module), so %6 = %0 ceildiv 16.
// It is recomputed here rather than reusing ordinal 1 (%1).
%6 = affine.apply #map()[%0] |
|
%7 = tensor.empty(%6, %2) : tensor<?x?x16x1xf32> |
|
// Dim 0 is tiled by 16 and dim 1 by 1; outer dims keep their order.
%pack = tensor.pack %5 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 1] into %7 : tensor<?x?xf32> -> tensor<?x?x16x1xf32> |
|
// The store uses the destination dims %1/%2 supplied by the caller.
flow.dispatch.tensor.store %pack, %4, offsets = [0, 0, 0, 0], sizes = [%1, %2, 16, 1], strides = [1, 1, 1, 1] : tensor<?x?x16x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2} |
|
return |
|
} |
|
} |
|
} |
|
// Executable for dispatch 1: packs a ?x? f32 source into ?x?x16x1 tiles with
// the two source dims swapped in the packed layout.
flow.executable private @matmul_dynamic_dispatch_1 { |
|
// Export entry: the region returns the (%x, %y, %z) values that
// flow.dispatch.workgroup_count_from_slice produces from the three workload operands.
flow.executable.export public @matmul_dynamic_dispatch_1_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) { |
|
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2 |
|
flow.return %x, %y, %z : index, index, index |
|
} |
|
builtin.module { |
|
// Dispatch body. %arg0 is the read-only source, %arg4 the write-only
// destination; %arg1..%arg3 are bound to workload ordinals 0..2.
func.func @matmul_dynamic_dispatch_1_pack_f32(%arg0: !flow.dispatch.tensor<readonly:tensor<?x?xf32>>, %arg1: index, %arg2: index, %arg3: index, %arg4: !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>) { |
|
// Value handed to tensor.pack as padding_value below.
%cst = arith.constant 0.000000e+00 : f32 |
|
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index |
|
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index |
|
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index |
|
// Note the order: the source is tied to {%2, %0} (ordinal 2 is dim 0,
// ordinal 0 is dim 1); the destination outer dims are {%1, %2}.
%3 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %0} |
|
%4 = flow.dispatch.tie_shape %arg4 : !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2} |
|
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [%2, %0], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %0} -> tensor<?x?xf32> |
|
// #map is `s0 ceildiv 16` (declared above the module), so %6 = %0 ceildiv 16,
// where %0 is the size of source dim 1.
%6 = affine.apply #map()[%0] |
|
%7 = tensor.empty(%6, %2) : tensor<?x?x16x1xf32> |
|
// inner_dims_pos = [1, 0]: source dim 1 is tiled by 16, source dim 0 by 1.
// outer_dims_perm = [1, 0]: the tiled dim 1 becomes outer dim 0 of the result.
%pack = tensor.pack %5 padding_value(%cst : f32) outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [16, 1] into %7 : tensor<?x?xf32> -> tensor<?x?x16x1xf32> |
|
// The store uses the destination dims %1/%2 supplied by the caller.
flow.dispatch.tensor.store %pack, %4, offsets = [0, 0, 0, 0], sizes = [%1, %2, 16, 1], strides = [1, 1, 1, 1] : tensor<?x?x16x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2} |
|
return |
|
} |
|
} |
|
} |
|
// Executable for dispatch 2: packs a ?x? f32 source into ?x?x16x16 tiles.
flow.executable private @matmul_dynamic_dispatch_2 { |
|
// Export entry: the region returns the (%x, %y, %z) values that
// flow.dispatch.workgroup_count_from_slice produces from the four workload operands.
flow.executable.export public @matmul_dynamic_dispatch_2_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) { |
|
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3 |
|
flow.return %x, %y, %z : index, index, index |
|
} |
|
builtin.module { |
|
// Dispatch body. %arg0 is the read-only source, %arg5 the write-only
// destination. %arg1..%arg4 are bound to workload ordinals 0..3: ordinals 0/1
// are the source dims, ordinals 2/3 the outer dims of the destination.
func.func @matmul_dynamic_dispatch_2_pack_f32(%arg0: !flow.dispatch.tensor<readonly:tensor<?x?xf32>>, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>) { |
|
// Value handed to tensor.pack as padding_value below.
%cst = arith.constant 0.000000e+00 : f32 |
|
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index |
|
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index |
|
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index |
|
%3 = flow.dispatch.workload.ordinal %arg4, 3 : index |
|
// Source is tied to shape {%0, %1}; destination to outer dims {%2, %3}.
%4 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1} |
|
%5 = flow.dispatch.tie_shape %arg5 : !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%2, %3} |
|
%6 = flow.dispatch.tensor.load %4, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1} -> tensor<?x?xf32> |
|
// #map is `s0 ceildiv 16` (declared above the module). The outer dims of the
// pack destination are recomputed from the source dims (%1 first, then %0)
// instead of reusing %2/%3.
%7 = affine.apply #map()[%1] |
|
%8 = affine.apply #map()[%0] |
|
// Operands are given in dim order: (#map(%0), #map(%1)).
%9 = tensor.empty(%8, %7) : tensor<?x?x16x16xf32> |
|
// Both source dims are tiled by 16; outer dims keep their order.
%pack = tensor.pack %6 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %9 : tensor<?x?xf32> -> tensor<?x?x16x16xf32> |
|
// The store uses the destination dims %2/%3 supplied by the caller.
flow.dispatch.tensor.store %pack, %5, offsets = [0, 0, 0, 0], sizes = [%2, %3, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%2, %3} |
|
return |
|
} |
|
} |
|
} |
|
// Executable for dispatch 3: linalg.mmt4d on ?x?x16x1 inputs with a
// ?x?x16x16 read-write outs operand.
flow.executable private @matmul_dynamic_dispatch_3 { |
|
// Export entry: the region returns the (%x, %y, %z) values that
// flow.dispatch.workgroup_count_from_slice produces from the six workload operands.
flow.executable.export public @matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index) -> (index, index, index) { |
|
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3, %arg4, %arg5 |
|
flow.return %x, %y, %z : index, index, index |
|
} |
|
builtin.module { |
|
// Dispatch body. %arg0 and %arg1 are read-only inputs, %arg2 is read-write
// (loaded as the outs operand and stored back to). %arg3..%arg8 are bound to
// workload ordinals 0..5 and supply the dynamic outer dims, two per tensor.
func.func @matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32(%arg0: !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>, %arg1: !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>, %arg2: !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index) { |
|
%0 = flow.dispatch.workload.ordinal %arg3, 0 : index |
|
%1 = flow.dispatch.workload.ordinal %arg4, 1 : index |
|
%2 = flow.dispatch.workload.ordinal %arg5, 2 : index |
|
%3 = flow.dispatch.workload.ordinal %arg6, 3 : index |
|
%4 = flow.dispatch.workload.ordinal %arg7, 4 : index |
|
%5 = flow.dispatch.workload.ordinal %arg8, 5 : index |
|
// Shape ties: %arg0 -> {%0, %1}, %arg1 -> {%2, %3}, %arg2 -> {%4, %5}.
%6 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%0, %1} |
|
%7 = flow.dispatch.tie_shape %arg1 : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%2, %3} |
|
%8 = flow.dispatch.tie_shape %arg2 : !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5} |
|
// Full-extent loads (zero offsets, unit strides) of all three tensors.
%9 = flow.dispatch.tensor.load %6, offsets = [0, 0, 0, 0], sizes = [%0, %1, 16, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%0, %1} -> tensor<?x?x16x1xf32> |
|
%10 = flow.dispatch.tensor.load %7, offsets = [0, 0, 0, 0], sizes = [%2, %3, 16, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%2, %3} -> tensor<?x?x16x1xf32> |
|
%11 = flow.dispatch.tensor.load %8, offsets = [0, 0, 0, 0], sizes = [%4, %5, 16, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5} -> tensor<?x?x16x16xf32> |
|
// The current contents of %arg2 (%11) are the outs operand of the mmt4d.
%12 = linalg.mmt4d ins(%9, %10 : tensor<?x?x16x1xf32>, tensor<?x?x16x1xf32>) outs(%11 : tensor<?x?x16x16xf32>) -> tensor<?x?x16x16xf32> |
|
// Result is written back to the same binding (%8, i.e. %arg2).
flow.dispatch.tensor.store %12, %8, offsets = [0, 0, 0, 0], sizes = [%4, %5, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5} |
|
return |
|
} |
|
} |
|
} |
|
// Executable for dispatch 4: unpacks a ?x?x16x16 f32 tensor back to ?x?.
flow.executable private @matmul_dynamic_dispatch_4 { |
|
// Export entry: the region returns the (%x, %y, %z) values that
// flow.dispatch.workgroup_count_from_slice produces from the four workload operands.
flow.executable.export public @matmul_dynamic_dispatch_4_unpack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) { |
|
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3 |
|
flow.return %x, %y, %z : index, index, index |
|
} |
|
builtin.module { |
|
// Dispatch body. %arg0 is the read-only packed source, %arg5 the write-only
// destination. %arg1..%arg4 are bound to workload ordinals 0..3: ordinals 0/1
// are the packed outer dims, ordinals 2/3 the dims of the unpacked result.
func.func @matmul_dynamic_dispatch_4_unpack_f32(%arg0: !flow.dispatch.tensor<readonly:tensor<?x?x16x16xf32>>, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>) { |
|
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index |
|
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index |
|
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index |
|
%3 = flow.dispatch.workload.ordinal %arg4, 3 : index |
|
%4 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x?x16x16xf32>>{%0, %1} |
|
%5 = flow.dispatch.tie_shape %arg5 : !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%2, %3} |
|
%6 = flow.dispatch.tensor.load %4, offsets = [0, 0, 0, 0], sizes = [%0, %1, 16, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x16xf32>>{%0, %1} -> tensor<?x?x16x16xf32> |
|
// Destination of the unpack, sized directly from ordinals 2/3.
%7 = tensor.empty(%2, %3) : tensor<?x?xf32> |
|
// Inverse layout of the 16x16 pack: same perm, dim positions and tile sizes.
%unpack = tensor.unpack %6 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %7 : tensor<?x?x16x16xf32> -> tensor<?x?xf32> |
|
flow.dispatch.tensor.store %unpack, %5, offsets = [0, 0], sizes = [%2, %3], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%2, %3} |
|
return |
|
} |
|
} |
|
} |
|
// Entry point (carries iree.abi.stub). Imports three buffer views as ?x? f32
// tensors, packs each one with its own dispatch, runs the mmt4d dispatch,
// unpacks the result and exports it as "output0".
// No shape-compatibility check between the three inputs is visible in this function.
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} { |
|
// input0: dims %0 x %1.
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index |
|
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index |
|
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xf32>{%0, %1} |
|
// input1: dims %3 x %4.
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index |
|
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index |
|
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xf32>{%3, %4} |
|
// input2: dims %6 x %7.
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index |
|
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index |
|
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xf32>{%6, %7} |
|
// #map is `s0 ceildiv 16` (declared above the module): %9 = %0 ceildiv 16.
%9 = affine.apply #map()[%0] |
|
// Pack input0 -> ?x?x16x1 with outer dims {%9, %1}.
%10 = flow.dispatch @matmul_dynamic_dispatch_0::@matmul_dynamic_dispatch_0_pack_f32[%0, %9, %1](%2, %0, %9, %1) : (tensor<?x?xf32>{%0, %1}, index, index, index) -> tensor<?x?x16x1xf32>{%9, %1} |
|
%11 = affine.apply #map()[%4] |
|
// Pack input1 -> ?x?x16x1 with outer dims {%11, %3}: the ceildiv-16 dim
// comes from input1's dim 1 (%4) and is placed first in the result.
%12 = flow.dispatch @matmul_dynamic_dispatch_1::@matmul_dynamic_dispatch_1_pack_f32[%4, %11, %3](%5, %4, %11, %3) : (tensor<?x?xf32>{%3, %4}, index, index, index) -> tensor<?x?x16x1xf32>{%11, %3} |
|
%13 = affine.apply #map()[%6] |
|
%14 = affine.apply #map()[%7] |
|
// Pack input2 -> ?x?x16x16 with outer dims {%13, %14}.
%15 = flow.dispatch @matmul_dynamic_dispatch_2::@matmul_dynamic_dispatch_2_pack_f32[%6, %7, %13, %14](%8, %6, %7, %13, %14) : (tensor<?x?xf32>{%6, %7}, index, index, index, index) -> tensor<?x?x16x16xf32>{%13, %14} |
|
// mmt4d over the packed tensors. The result is written as `%15{%13, %14}`,
// i.e. it is tied to operand %15 (the packed input2) rather than a fresh tensor.
%16 = flow.dispatch @matmul_dynamic_dispatch_3::@matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32[%9, %1, %11, %3, %13, %14](%10, %12, %15, %9, %1, %11, %3, %13, %14) : (tensor<?x?x16x1xf32>{%9, %1}, tensor<?x?x16x1xf32>{%11, %3}, tensor<?x?x16x16xf32>{%13, %14}, index, index, index, index, index, index) -> %15{%13, %14} |
|
// Unpack back to input2's original dims {%6, %7}.
%17 = flow.dispatch @matmul_dynamic_dispatch_4::@matmul_dynamic_dispatch_4_unpack_f32[%13, %14, %6, %7](%16, %13, %14, %6, %7) : (tensor<?x?x16x16xf32>{%13, %14}, index, index, index, index) -> tensor<?x?xf32>{%6, %7} |
|
%18 = hal.tensor.export %17 "output0" : tensor<?x?xf32>{%6, %7} -> !hal.buffer_view |
|
return %18 : !hal.buffer_view |
|
} |
|
} |
|
|
|
|
|
// -----// IR Dump After OutlineConstants (iree-util-outline-constants) //----- // |
|
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "znver4", cpu_features = "+mmx,+popcnt,+sse,+sse2,+sse3,+ssse3,+sse4.1,+sse4.2,+avx,+avx2,+sse4a,+fma,+avx512f,+bmi,+bmi2,+aes,+pclmul,+avx512vl,+avx512bw,+avx512dq,+avx512cd,+avx512vbmi,+avx512ifma,+avx512vpopcntdq,+avx512vbmi2,+gfni,+vpclmulqdq,+avx512vnni,+avx512bitalg,+avx512bf16,+adx,+clflushopt,+clwb,+clzero,+cx16,+cx8,+f16c,+fsgsbase,+crc32,+invpcid,+rdpru,+sahf,+lzcnt,+movbe,+mwaitx,+x87,+pku,+prfchw,+rdpid,+rdrnd,+rdseed,+sha,+shstk,+vaes,+wbnoinvd,+xsave,+xsavec,+xsaveopt,+xsaves,+fxsr,+evex512", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "all"}> |
|
#map = affine_map<()[s0] -> (s0 ceildiv 16)> |
|
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#executable_target_embedded_elf_x86_64_]}> |
|
module attributes {hal.device.targets = [#device_target_llvm_cpu]} { |
|
flow.executable private @matmul_dynamic_dispatch_0 { |
|
flow.executable.export public @matmul_dynamic_dispatch_0_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) { |
|
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2 |
|
flow.return %x, %y, %z : index, index, index |
|
} |
|
builtin.module { |
|
func.func @matmul_dynamic_dispatch_0_pack_f32(%arg0: !flow.dispatch.tensor<readonly:tensor<?x?xf32>>, %arg1: index, %arg2: index, %arg3: index, %arg4: !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>) { |
|
%cst = arith.constant 0.000000e+00 : f32 |
|
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index |
|
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index |
|
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index |
|
%3 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %2} |
|
%4 = flow.dispatch.tie_shape %arg4 : !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2} |
|
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [%0, %2], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %2} -> tensor<?x?xf32> |
|
%6 = affine.apply #map()[%0] |
|
%7 = tensor.empty(%6, %2) : tensor<?x?x16x1xf32> |
|
%pack = tensor.pack %5 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 1] into %7 : tensor<?x?xf32> -> tensor<?x?x16x1xf32> |
|
flow.dispatch.tensor.store %pack, %4, offsets = [0, 0, 0, 0], sizes = [%1, %2, 16, 1], strides = [1, 1, 1, 1] : tensor<?x?x16x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2} |
|
return |
|
} |
|
} |
|
} |
|
flow.executable private @matmul_dynamic_dispatch_1 { |
|
flow.executable.export public @matmul_dynamic_dispatch_1_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) { |
|
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2 |
|
flow.return %x, %y, %z : index, index, index |
|
} |
|
builtin.module { |
|
func.func @matmul_dynamic_dispatch_1_pack_f32(%arg0: !flow.dispatch.tensor<readonly:tensor<?x?xf32>>, %arg1: index, %arg2: index, %arg3: index, %arg4: !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>) { |
|
%cst = arith.constant 0.000000e+00 : f32 |
|
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index |
|
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index |
|
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index |
|
%3 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %0} |
|
%4 = flow.dispatch.tie_shape %arg4 : !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2} |
|
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [%2, %0], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %0} -> tensor<?x?xf32> |
|
%6 = affine.apply #map()[%0] |
|
%7 = tensor.empty(%6, %2) : tensor<?x?x16x1xf32> |
|
%pack = tensor.pack %5 padding_value(%cst : f32) outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [16, 1] into %7 : tensor<?x?xf32> -> tensor<?x?x16x1xf32> |
|
flow.dispatch.tensor.store %pack, %4, offsets = [0, 0, 0, 0], sizes = [%1, %2, 16, 1], strides = [1, 1, 1, 1] : tensor<?x?x16x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2} |
|
return |
|
} |
|
} |
|
} |
|
flow.executable private @matmul_dynamic_dispatch_2 { |
|
flow.executable.export public @matmul_dynamic_dispatch_2_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) { |
|
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3 |
|
flow.return %x, %y, %z : index, index, index |
|
} |
|
builtin.module { |
|
func.func @matmul_dynamic_dispatch_2_pack_f32(%arg0: !flow.dispatch.tensor<readonly:tensor<?x?xf32>>, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>) { |
|
%cst = arith.constant 0.000000e+00 : f32 |
|
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index |
|
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index |
|
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index |
|
%3 = flow.dispatch.workload.ordinal %arg4, 3 : index |
|
%4 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1} |
|
%5 = flow.dispatch.tie_shape %arg5 : !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%2, %3} |
|
%6 = flow.dispatch.tensor.load %4, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1} -> tensor<?x?xf32> |
|
%7 = affine.apply #map()[%1] |
|
%8 = affine.apply #map()[%0] |
|
%9 = tensor.empty(%8, %7) : tensor<?x?x16x16xf32> |
|
%pack = tensor.pack %6 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %9 : tensor<?x?xf32> -> tensor<?x?x16x16xf32> |
|
flow.dispatch.tensor.store %pack, %5, offsets = [0, 0, 0, 0], sizes = [%2, %3, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%2, %3} |
|
return |
|
} |
|
} |
|
} |
|
flow.executable private @matmul_dynamic_dispatch_3 { |
|
flow.executable.export public @matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index) -> (index, index, index) { |
|
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3, %arg4, %arg5 |
|
flow.return %x, %y, %z : index, index, index |
|
} |
|
builtin.module { |
|
func.func @matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32(%arg0: !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>, %arg1: !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>, %arg2: !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index) { |
|
%0 = flow.dispatch.workload.ordinal %arg3, 0 : index |
|
%1 = flow.dispatch.workload.ordinal %arg4, 1 : index |
|
%2 = flow.dispatch.workload.ordinal %arg5, 2 : index |
|
%3 = flow.dispatch.workload.ordinal %arg6, 3 : index |
|
%4 = flow.dispatch.workload.ordinal %arg7, 4 : index |
|
%5 = flow.dispatch.workload.ordinal %arg8, 5 : index |
|
%6 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%0, %1} |
|
%7 = flow.dispatch.tie_shape %arg1 : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%2, %3} |
|
%8 = flow.dispatch.tie_shape %arg2 : !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5} |
|
%9 = flow.dispatch.tensor.load %6, offsets = [0, 0, 0, 0], sizes = [%0, %1, 16, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%0, %1} -> tensor<?x?x16x1xf32> |
|
%10 = flow.dispatch.tensor.load %7, offsets = [0, 0, 0, 0], sizes = [%2, %3, 16, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%2, %3} -> tensor<?x?x16x1xf32> |
|
%11 = flow.dispatch.tensor.load %8, offsets = [0, 0, 0, 0], sizes = [%4, %5, 16, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5} -> tensor<?x?x16x16xf32> |
|
%12 = linalg.mmt4d ins(%9, %10 : tensor<?x?x16x1xf32>, tensor<?x?x16x1xf32>) outs(%11 : tensor<?x?x16x16xf32>) -> tensor<?x?x16x16xf32> |
|
flow.dispatch.tensor.store %12, %8, offsets = [0, 0, 0, 0], sizes = [%4, %5, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5} |
|
return |
|
} |
|
} |
|
} |
|
flow.executable private @matmul_dynamic_dispatch_4 { |
|
flow.executable.export public @matmul_dynamic_dispatch_4_unpack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) { |
|
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3 |
|
flow.return %x, %y, %z : index, index, index |
|
} |
|
builtin.module { |
|
func.func @matmul_dynamic_dispatch_4_unpack_f32(%arg0: !flow.dispatch.tensor<readonly:tensor<?x?x16x16xf32>>, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>) { |
|
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index |
|
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index |
|
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index |
|
%3 = flow.dispatch.workload.ordinal %arg4, 3 : index |
|
%4 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x?x16x16xf32>>{%0, %1} |
|
%5 = flow.dispatch.tie_shape %arg5 : !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%2, %3} |
|
%6 = flow.dispatch.tensor.load %4, offsets = [0, 0, 0, 0], sizes = [%0, %1, 16, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x16xf32>>{%0, %1} -> tensor<?x?x16x16xf32> |
|
%7 = tensor.empty(%2, %3) : tensor<?x?xf32> |
|
%unpack = tensor.unpack %6 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %7 : tensor<?x?x16x16xf32> -> tensor<?x?xf32> |
|
flow.dispatch.tensor.store %unpack, %5, offsets = [0, 0], sizes = [%2, %3], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%2, %3} |
|
return |
|
} |
|
} |
|
} |
|
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} { |
|
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index |
|
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index |
|
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xf32>{%0, %1} |
|
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index |
|
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index |
|
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xf32>{%3, %4} |
|
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index |
|
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index |
|
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xf32>{%6, %7} |
|
%9 = affine.apply #map()[%0] |
|
%10 = flow.dispatch @matmul_dynamic_dispatch_0::@matmul_dynamic_dispatch_0_pack_f32[%0, %9, %1](%2, %0, %9, %1) : (tensor<?x?xf32>{%0, %1}, index, index, index) -> tensor<?x?x16x1xf32>{%9, %1} |
|
%11 = affine.apply #map()[%4] |
|
%12 = flow.dispatch @matmul_dynamic_dispatch_1::@matmul_dynamic_dispatch_1_pack_f32[%4, %11, %3](%5, %4, %11, %3) : (tensor<?x?xf32>{%3, %4}, index, index, index) -> tensor<?x?x16x1xf32>{%11, %3} |
|
%13 = affine.apply #map()[%6] |
|
%14 = affine.apply #map()[%7] |
|
%15 = flow.dispatch @matmul_dynamic_dispatch_2::@matmul_dynamic_dispatch_2_pack_f32[%6, %7, %13, %14](%8, %6, %7, %13, %14) : (tensor<?x?xf32>{%6, %7}, index, index, index, index) -> tensor<?x?x16x16xf32>{%13, %14} |
|
%16 = flow.dispatch @matmul_dynamic_dispatch_3::@matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32[%9, %1, %11, %3, %13, %14](%10, %12, %15, %9, %1, %11, %3, %13, %14) : (tensor<?x?x16x1xf32>{%9, %1}, tensor<?x?x16x1xf32>{%11, %3}, tensor<?x?x16x16xf32>{%13, %14}, index, index, index, index, index, index) -> %15{%13, %14} |
|
%17 = flow.dispatch @matmul_dynamic_dispatch_4::@matmul_dynamic_dispatch_4_unpack_f32[%13, %14, %6, %7](%16, %13, %14, %6, %7) : (tensor<?x?x16x16xf32>{%13, %14}, index, index, index, index) -> tensor<?x?xf32>{%6, %7} |
|
%18 = hal.tensor.export %17 "output0" : tensor<?x?xf32>{%6, %7} -> !hal.buffer_view |
|
return %18 : !hal.buffer_view |
|
} |
|
} |
|
|
|
|
|
// -----// IR Dump After Canonicalizer (canonicalize) //----- // |
|
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} { |
|
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index |
|
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index |
|
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xf32>{%0, %1} |
|
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index |
|
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index |
|
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xf32>{%3, %4} |
|
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index |
|
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index |
|
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xf32>{%6, %7} |
|
%9 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%0] |
|
%10 = flow.dispatch @matmul_dynamic_dispatch_0::@matmul_dynamic_dispatch_0_pack_f32[%0, %9, %1](%2, %0, %9, %1) : (tensor<?x?xf32>{%0, %1}, index, index, index) -> tensor<?x?x16x1xf32>{%9, %1} |
|
%11 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%4] |
|
%12 = flow.dispatch @matmul_dynamic_dispatch_1::@matmul_dynamic_dispatch_1_pack_f32[%4, %11, %3](%5, %4, %11, %3) : (tensor<?x?xf32>{%3, %4}, index, index, index) -> tensor<?x?x16x1xf32>{%11, %3} |
|
%13 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%6] |
|
%14 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%7] |
|
%15 = flow.dispatch @matmul_dynamic_dispatch_2::@matmul_dynamic_dispatch_2_pack_f32[%6, %7, %13, %14](%8, %6, %7, %13, %14) : (tensor<?x?xf32>{%6, %7}, index, index, index, index) -> tensor<?x?x16x16xf32>{%13, %14} |
|
%16 = flow.dispatch @matmul_dynamic_dispatch_3::@matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32[%9, %1, %11, %3, %13, %14](%10, %12, %15, %9, %1, %11, %3, %13, %14) : (tensor<?x?x16x1xf32>{%9, %1}, tensor<?x?x16x1xf32>{%11, %3}, tensor<?x?x16x16xf32>{%13, %14}, index, index, index, index, index, index) -> %15{%13, %14} |
|
%17 = flow.dispatch @matmul_dynamic_dispatch_4::@matmul_dynamic_dispatch_4_unpack_f32[%13, %14, %6, %7](%16, %13, %14, %6, %7) : (tensor<?x?x16x16xf32>{%13, %14}, index, index, index, index) -> tensor<?x?xf32>{%6, %7} |
|
%18 = hal.tensor.export %17 "output0" : tensor<?x?xf32>{%6, %7} -> !hal.buffer_view |
|
return %18 : !hal.buffer_view |
|
} |
|
|
|
// -----// IR Dump After CSE (cse) //----- // |
|
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} { |
|
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index |
|
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index |
|
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xf32>{%0, %1} |
|
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index |
|
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index |
|
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xf32>{%3, %4} |
|
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index |
|
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index |
|
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xf32>{%6, %7} |
|
%9 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%0] |
|
%10 = flow.dispatch @matmul_dynamic_dispatch_0::@matmul_dynamic_dispatch_0_pack_f32[%0, %9, %1](%2, %0, %9, %1) : (tensor<?x?xf32>{%0, %1}, index, index, index) -> tensor<?x?x16x1xf32>{%9, %1} |
|
%11 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%4] |
|
%12 = flow.dispatch @matmul_dynamic_dispatch_1::@matmul_dynamic_dispatch_1_pack_f32[%4, %11, %3](%5, %4, %11, %3) : (tensor<?x?xf32>{%3, %4}, index, index, index) -> tensor<?x?x16x1xf32>{%11, %3} |
|
%13 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%6] |
|
%14 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%7] |
|
%15 = flow.dispatch @matmul_dynamic_dispatch_2::@matmul_dynamic_dispatch_2_pack_f32[%6, %7, %13, %14](%8, %6, %7, %13, %14) : (tensor<?x?xf32>{%6, %7}, index, index, index, index) -> tensor<?x?x16x16xf32>{%13, %14} |
|
%16 = flow.dispatch @matmul_dynamic_dispatch_3::@matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32[%9, %1, %11, %3, %13, %14](%10, %12, %15, %9, %1, %11, %3, %13, %14) : (tensor<?x?x16x1xf32>{%9, %1}, tensor<?x?x16x1xf32>{%11, %3}, tensor<?x?x16x16xf32>{%13, %14}, index, index, index, index, index, index) -> %15{%13, %14} |
|
%17 = flow.dispatch @matmul_dynamic_dispatch_4::@matmul_dynamic_dispatch_4_unpack_f32[%13, %14, %6, %7](%16, %13, %14, %6, %7) : (tensor<?x?x16x16xf32>{%13, %14}, index, index, index, index) -> tensor<?x?xf32>{%6, %7} |
|
%18 = hal.tensor.export %17 "output0" : tensor<?x?xf32>{%6, %7} -> !hal.buffer_view |
|
return %18 : !hal.buffer_view |
|
} |
|
|
|
// -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- // |
|
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} { |
|
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index |
|
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index |
|
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xf32>{%0, %1} |
|
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index |
|
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index |
|
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xf32>{%3, %4} |
|
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index |
|
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index |
|
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xf32>{%6, %7} |
|
%9 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%0] |
|
%10 = flow.dispatch @matmul_dynamic_dispatch_0::@matmul_dynamic_dispatch_0_pack_f32[%0, %9, %1](%2, %0, %9, %1) : (tensor<?x?xf32>{%0, %1}, index, index, index) -> tensor<?x?x16x1xf32>{%9, %1} |
|
%11 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%4] |
|
%12 = flow.dispatch @matmul_dynamic_dispatch_1::@matmul_dynamic_dispatch_1_pack_f32[%4, %11, %3](%5, %4, %11, %3) : (tensor<?x?xf32>{%3, %4}, index, index, index) -> tensor<?x?x16x1xf32>{%11, %3} |
|
%13 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%6] |
|
%14 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%7] |
|
%15 = flow.dispatch @matmul_dynamic_dispatch_2::@matmul_dynamic_dispatch_2_pack_f32[%6, %7, %13, %14](%8, %6, %7, %13, %14) : (tensor<?x?xf32>{%6, %7}, index, index, index, index) -> tensor<?x?x16x16xf32>{%13, %14} |
|
%16 = flow.dispatch @matmul_dynamic_dispatch_3::@matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32[%9, %1, %11, %3, %13, %14](%10, %12, %15, %9, %1, %11, %3, %13, %14) : (tensor<?x?x16x1xf32>{%9, %1}, tensor<?x?x16x1xf32>{%11, %3}, tensor<?x?x16x16xf32>{%13, %14}, index, index, index, index, index, index) -> %15{%13, %14} |
|
%17 = flow.dispatch @matmul_dynamic_dispatch_4::@matmul_dynamic_dispatch_4_unpack_f32[%13, %14, %6, %7](%16, %13, %14, %6, %7) : (tensor<?x?x16x16xf32>{%13, %14}, index, index, index, index) -> tensor<?x?xf32>{%6, %7} |
|
%18 = hal.tensor.export %17 "output0" : tensor<?x?xf32>{%6, %7} -> !hal.buffer_view |
|
return %18 : !hal.buffer_view |
|
} |
|
|
|
// -----// IR Dump After ApplyPatterns (iree-util-apply-patterns) //----- // |
|
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "znver4", cpu_features = "+mmx,+popcnt,+sse,+sse2,+sse3,+ssse3,+sse4.1,+sse4.2,+avx,+avx2,+sse4a,+fma,+avx512f,+bmi,+bmi2,+aes,+pclmul,+avx512vl,+avx512bw,+avx512dq,+avx512cd,+avx512vbmi,+avx512ifma,+avx512vpopcntdq,+avx512vbmi2,+gfni,+vpclmulqdq,+avx512vnni,+avx512bitalg,+avx512bf16,+adx,+clflushopt,+clwb,+clzero,+cx16,+cx8,+f16c,+fsgsbase,+crc32,+invpcid,+rdpru,+sahf,+lzcnt,+movbe,+mwaitx,+x87,+pku,+prfchw,+rdpid,+rdrnd,+rdseed,+sha,+shstk,+vaes,+wbnoinvd,+xsave,+xsavec,+xsaveopt,+xsaves,+fxsr,+evex512", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "all"}> |
|
#map = affine_map<()[s0] -> (s0 ceildiv 16)> |
|
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#executable_target_embedded_elf_x86_64_]}> |
|
module attributes {hal.device.targets = [#device_target_llvm_cpu]} { |
|
flow.executable private @matmul_dynamic_dispatch_0 { |
|
flow.executable.export public @matmul_dynamic_dispatch_0_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) { |
|
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2 |
|
flow.return %x, %y, %z : index, index, index |
|
} |
|
builtin.module { |
|
func.func @matmul_dynamic_dispatch_0_pack_f32(%arg0: !flow.dispatch.tensor<readonly:tensor<?x?xf32>>, %arg1: index, %arg2: index, %arg3: index, %arg4: !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>) { |
|
%cst = arith.constant 0.000000e+00 : f32 |
|
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index |
|
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index |
|
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index |
|
%3 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %2} |
|
%4 = flow.dispatch.tie_shape %arg4 : !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2} |
|
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [%0, %2], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %2} -> tensor<?x?xf32> |
|
%6 = affine.apply #map()[%0] |
|
%7 = tensor.empty(%6, %2) : tensor<?x?x16x1xf32> |
|
%pack = tensor.pack %5 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 1] into %7 : tensor<?x?xf32> -> tensor<?x?x16x1xf32> |
|
flow.dispatch.tensor.store %pack, %4, offsets = [0, 0, 0, 0], sizes = [%1, %2, 16, 1], strides = [1, 1, 1, 1] : tensor<?x?x16x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2} |
|
return |
|
} |
|
} |
|
} |
|
flow.executable private @matmul_dynamic_dispatch_1 { |
|
flow.executable.export public @matmul_dynamic_dispatch_1_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) { |
|
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2 |
|
flow.return %x, %y, %z : index, index, index |
|
} |
|
builtin.module { |
|
func.func @matmul_dynamic_dispatch_1_pack_f32(%arg0: !flow.dispatch.tensor<readonly:tensor<?x?xf32>>, %arg1: index, %arg2: index, %arg3: index, %arg4: !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>) { |
|
%cst = arith.constant 0.000000e+00 : f32 |
|
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index |
|
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index |
|
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index |
|
%3 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %0} |
|
%4 = flow.dispatch.tie_shape %arg4 : !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2} |
|
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [%2, %0], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %0} -> tensor<?x?xf32> |
|
%6 = affine.apply #map()[%0] |
|
%7 = tensor.empty(%6, %2) : tensor<?x?x16x1xf32> |
|
%pack = tensor.pack %5 padding_value(%cst : f32) outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [16, 1] into %7 : tensor<?x?xf32> -> tensor<?x?x16x1xf32> |
|
flow.dispatch.tensor.store %pack, %4, offsets = [0, 0, 0, 0], sizes = [%1, %2, 16, 1], strides = [1, 1, 1, 1] : tensor<?x?x16x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2} |
|
return |
|
} |
|
} |
|
} |
|
flow.executable private @matmul_dynamic_dispatch_2 { |
|
flow.executable.export public @matmul_dynamic_dispatch_2_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) { |
|
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3 |
|
flow.return %x, %y, %z : index, index, index |
|
} |
|
builtin.module { |
|
func.func @matmul_dynamic_dispatch_2_pack_f32(%arg0: !flow.dispatch.tensor<readonly:tensor<?x?xf32>>, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>) { |
|
%cst = arith.constant 0.000000e+00 : f32 |
|
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index |
|
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index |
|
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index |
|
%3 = flow.dispatch.workload.ordinal %arg4, 3 : index |
|
%4 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1} |
|
%5 = flow.dispatch.tie_shape %arg5 : !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%2, %3} |
|
%6 = flow.dispatch.tensor.load %4, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1} -> tensor<?x?xf32> |
|
%7 = affine.apply #map()[%1] |
|
%8 = affine.apply #map()[%0] |
|
%9 = tensor.empty(%8, %7) : tensor<?x?x16x16xf32> |
|
%pack = tensor.pack %6 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %9 : tensor<?x?xf32> -> tensor<?x?x16x16xf32> |
|
flow.dispatch.tensor.store %pack, %5, offsets = [0, 0, 0, 0], sizes = [%2, %3, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%2, %3} |
|
return |
|
} |
|
} |
|
} |
|
flow.executable private @matmul_dynamic_dispatch_3 { |
|
flow.executable.export public @matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index) -> (index, index, index) { |
|
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3, %arg4, %arg5 |
|
flow.return %x, %y, %z : index, index, index |
|
} |
|
builtin.module { |
|
func.func @matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32(%arg0: !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>, %arg1: !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>, %arg2: !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index) { |
|
%0 = flow.dispatch.workload.ordinal %arg3, 0 : index |
|
%1 = flow.dispatch.workload.ordinal %arg4, 1 : index |
|
%2 = flow.dispatch.workload.ordinal %arg5, 2 : index |
|
%3 = flow.dispatch.workload.ordinal %arg6, 3 : index |
|
%4 = flow.dispatch.workload.ordinal %arg7, 4 : index |
|
%5 = flow.dispatch.workload.ordinal %arg8, 5 : index |
|
%6 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%0, %1} |
|
%7 = flow.dispatch.tie_shape %arg1 : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%2, %3} |
|
%8 = flow.dispatch.tie_shape %arg2 : !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5} |
|
%9 = flow.dispatch.tensor.load %6, offsets = [0, 0, 0, 0], sizes = [%0, %1, 16, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%0, %1} -> tensor<?x?x16x1xf32> |
|
%10 = flow.dispatch.tensor.load %7, offsets = [0, 0, 0, 0], sizes = [%2, %3, 16, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%2, %3} -> tensor<?x?x16x1xf32> |
|
%11 = flow.dispatch.tensor.load %8, offsets = [0, 0, 0, 0], sizes = [%4, %5, 16, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5} -> tensor<?x?x16x16xf32> |
|
%12 = linalg.mmt4d ins(%9, %10 : tensor<?x?x16x1xf32>, tensor<?x?x16x1xf32>) outs(%11 : tensor<?x?x16x16xf32>) -> tensor<?x?x16x16xf32> |
|
flow.dispatch.tensor.store %12, %8, offsets = [0, 0, 0, 0], sizes = [%4, %5, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5} |
|
return |
|
} |
|
} |
|
} |
|
flow.executable private @matmul_dynamic_dispatch_4 { |
|
flow.executable.export public @matmul_dynamic_dispatch_4_unpack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) { |
|
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3 |
|
flow.return %x, %y, %z : index, index, index |
|
} |
|
builtin.module { |
|
func.func @matmul_dynamic_dispatch_4_unpack_f32(%arg0: !flow.dispatch.tensor<readonly:tensor<?x?x16x16xf32>>, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>) { |
|
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index |
|
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index |
|
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index |
|
%3 = flow.dispatch.workload.ordinal %arg4, 3 : index |
|
%4 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x?x16x16xf32>>{%0, %1} |
|
%5 = flow.dispatch.tie_shape %arg5 : !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%2, %3} |
|
%6 = flow.dispatch.tensor.load %4, offsets = [0, 0, 0, 0], sizes = [%0, %1, 16, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x16xf32>>{%0, %1} -> tensor<?x?x16x16xf32> |
|
%7 = tensor.empty(%2, %3) : tensor<?x?xf32> |
|
%unpack = tensor.unpack %6 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %7 : tensor<?x?x16x16xf32> -> tensor<?x?xf32> |
|
flow.dispatch.tensor.store %unpack, %5, offsets = [0, 0], sizes = [%2, %3], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%2, %3} |
|
return |
|
} |
|
} |
|
} |
|
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} { |
|
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index |
|
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index |
|
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xf32>{%0, %1} |
|
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index |
|
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index |
|
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xf32>{%3, %4} |
|
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index |
|
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index |
|
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xf32>{%6, %7} |
|
%9 = affine.apply #map()[%0] |
|
%10 = flow.dispatch @matmul_dynamic_dispatch_0::@matmul_dynamic_dispatch_0_pack_f32[%0, %9, %1](%2, %0, %9, %1) : (tensor<?x?xf32>{%0, %1}, index, index, index) -> tensor<?x?x16x1xf32>{%9, %1} |
|
%11 = affine.apply #map()[%4] |
|
%12 = flow.dispatch @matmul_dynamic_dispatch_1::@matmul_dynamic_dispatch_1_pack_f32[%4, %11, %3](%5, %4, %11, %3) : (tensor<?x?xf32>{%3, %4}, index, index, index) -> tensor<?x?x16x1xf32>{%11, %3} |
|
%13 = affine.apply #map()[%6] |
|
%14 = affine.apply #map()[%7] |
|
%15 = flow.dispatch @matmul_dynamic_dispatch_2::@matmul_dynamic_dispatch_2_pack_f32[%6, %7, %13, %14](%8, %6, %7, %13, %14) : (tensor<?x?xf32>{%6, %7}, index, index, index, index) -> tensor<?x?x16x16xf32>{%13, %14} |
|
%16 = flow.dispatch @matmul_dynamic_dispatch_3::@matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32[%9, %1, %11, %3, %13, %14](%10, %12, %15, %9, %1, %11, %3, %13, %14) : (tensor<?x?x16x1xf32>{%9, %1}, tensor<?x?x16x1xf32>{%11, %3}, tensor<?x?x16x16xf32>{%13, %14}, index, index, index, index, index, index) -> %15{%13, %14} |
|
%17 = flow.dispatch @matmul_dynamic_dispatch_4::@matmul_dynamic_dispatch_4_unpack_f32[%13, %14, %6, %7](%16, %13, %14, %6, %7) : (tensor<?x?x16x16xf32>{%13, %14}, index, index, index, index) -> tensor<?x?xf32>{%6, %7} |
|
%18 = hal.tensor.export %17 "output0" : tensor<?x?xf32>{%6, %7} -> !hal.buffer_view |
|
return %18 : !hal.buffer_view |
|
} |
|
} |
|
|
|
|
|
// -----// IR Dump After FoldGlobals (iree-util-fold-globals) //----- // |
|
// Attribute alias for the executable target: "llvm-cpu" backend, "embedded-elf-x86_64" format, cpu = "znver4", native_vector_size = 64, ukernels = "all". |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "znver4", cpu_features = "+mmx,+popcnt,+sse,+sse2,+sse3,+ssse3,+sse4.1,+sse4.2,+avx,+avx2,+sse4a,+fma,+avx512f,+bmi,+bmi2,+aes,+pclmul,+avx512vl,+avx512bw,+avx512dq,+avx512cd,+avx512vbmi,+avx512ifma,+avx512vpopcntdq,+avx512vbmi2,+gfni,+vpclmulqdq,+avx512vnni,+avx512bitalg,+avx512bf16,+adx,+clflushopt,+clwb,+clzero,+cx16,+cx8,+f16c,+fsgsbase,+crc32,+invpcid,+rdpru,+sahf,+lzcnt,+movbe,+mwaitx,+x87,+pku,+prfchw,+rdpid,+rdrnd,+rdseed,+sha,+shstk,+vaes,+wbnoinvd,+xsave,+xsavec,+xsaveopt,+xsaves,+fxsr,+evex512", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "all"}> |

// Affine map computing ceildiv(s0, 16); the IR below applies it to dynamic dimensions to obtain counts of 16-element tiles. |
#map = affine_map<()[s0] -> (s0 ceildiv 16)> |

// Device target alias whose only executable target is the alias defined above. |
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#executable_target_embedded_elf_x86_64_]}> |
|
module attributes {hal.device.targets = [#device_target_llvm_cpu]} { |
|
// Executable for dispatch 0: packs a dynamically shaped 2-D f32 tensor into 16x1 inner tiles. |
flow.executable private @matmul_dynamic_dispatch_0 { |

// Export: the workgroup count is computed from the three index workload values. |
flow.executable.export public @matmul_dynamic_dispatch_0_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) { |

%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2 |

flow.return %x, %y, %z : index, index, index |

} |

builtin.module { |

// %arg0 is the read-only source, %arg1..%arg3 are the index workload values, %arg4 is the write-only packed result. |
func.func @matmul_dynamic_dispatch_0_pack_f32(%arg0: !flow.dispatch.tensor<readonly:tensor<?x?xf32>>, %arg1: index, %arg2: index, %arg3: index, %arg4: !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>) { |

// Padding value handed to tensor.pack below. |
%cst = arith.constant 0.000000e+00 : f32 |

// Bind the index arguments to workload ordinals 0, 1 and 2. |
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index |

%1 = flow.dispatch.workload.ordinal %arg2, 1 : index |

%2 = flow.dispatch.workload.ordinal %arg3, 2 : index |

// The source has dynamic shape {%0, %2}; the packed result has dynamic outer dims {%1, %2}. |
%3 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %2} |

%4 = flow.dispatch.tie_shape %arg4 : !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2} |

// Load the whole source tensor. |
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [%0, %2], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %2} -> tensor<?x?xf32> |

// #map is (s0 ceildiv 16), so %6 is the number of 16-element tiles needed to cover dim 0 (size %0). |
%6 = affine.apply #map()[%0] |

%7 = tensor.empty(%6, %2) : tensor<?x?x16x1xf32> |

// Tile source dim 0 by 16 and dim 1 by 1, using %cst as the padding value. |
%pack = tensor.pack %5 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 1] into %7 : tensor<?x?xf32> -> tensor<?x?x16x1xf32> |

// The store uses the workload-provided outer size %1, while the pack destination was sized with the locally computed %6. |
flow.dispatch.tensor.store %pack, %4, offsets = [0, 0, 0, 0], sizes = [%1, %2, 16, 1], strides = [1, 1, 1, 1] : tensor<?x?x16x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2} |

return |

} |

} |

} |
|
// Executable for dispatch 1: packs a dynamically shaped 2-D f32 tensor into 16x1 inner tiles with the outer dims transposed. |
flow.executable private @matmul_dynamic_dispatch_1 { |

// Export: the workgroup count is computed from the three index workload values. |
flow.executable.export public @matmul_dynamic_dispatch_1_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) { |

%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2 |

flow.return %x, %y, %z : index, index, index |

} |

builtin.module { |

// %arg0 is the read-only source, %arg1..%arg3 are the index workload values, %arg4 is the write-only packed result. |
func.func @matmul_dynamic_dispatch_1_pack_f32(%arg0: !flow.dispatch.tensor<readonly:tensor<?x?xf32>>, %arg1: index, %arg2: index, %arg3: index, %arg4: !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>) { |

// Padding value handed to tensor.pack below. |
%cst = arith.constant 0.000000e+00 : f32 |

// Bind the index arguments to workload ordinals 0, 1 and 2. |
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index |

%1 = flow.dispatch.workload.ordinal %arg2, 1 : index |

%2 = flow.dispatch.workload.ordinal %arg3, 2 : index |

// The source has dynamic shape {%2, %0} (note the order); the packed result has dynamic outer dims {%1, %2}. |
%3 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %0} |

%4 = flow.dispatch.tie_shape %arg4 : !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2} |

// Load the whole source tensor. |
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [%2, %0], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %0} -> tensor<?x?xf32> |

// #map is (s0 ceildiv 16), so %6 is the number of 16-element tiles needed to cover source dim 1 (size %0). |
%6 = affine.apply #map()[%0] |

%7 = tensor.empty(%6, %2) : tensor<?x?x16x1xf32> |

// inner_dims_pos = [1, 0]: tile source dim 1 by 16 and dim 0 by 1; outer_dims_perm = [1, 0] swaps the outer dims. %cst is the padding value. |
%pack = tensor.pack %5 padding_value(%cst : f32) outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [16, 1] into %7 : tensor<?x?xf32> -> tensor<?x?x16x1xf32> |

// The store uses the workload-provided outer size %1, while the pack destination was sized with the locally computed %6. |
flow.dispatch.tensor.store %pack, %4, offsets = [0, 0, 0, 0], sizes = [%1, %2, 16, 1], strides = [1, 1, 1, 1] : tensor<?x?x16x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2} |

return |

} |

} |

} |
|
// Executable for dispatch 2: packs a dynamically shaped 2-D f32 tensor into 16x16 inner tiles. |
flow.executable private @matmul_dynamic_dispatch_2 { |

// Export: the workgroup count is computed from the four index workload values. |
flow.executable.export public @matmul_dynamic_dispatch_2_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) { |

%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3 |

flow.return %x, %y, %z : index, index, index |

} |

builtin.module { |

// %arg0 is the read-only source, %arg1..%arg4 are the index workload values, %arg5 is the write-only packed result. |
func.func @matmul_dynamic_dispatch_2_pack_f32(%arg0: !flow.dispatch.tensor<readonly:tensor<?x?xf32>>, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>) { |

// Padding value handed to tensor.pack below. |
%cst = arith.constant 0.000000e+00 : f32 |

// Bind the index arguments to workload ordinals 0 through 3. |
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index |

%1 = flow.dispatch.workload.ordinal %arg2, 1 : index |

%2 = flow.dispatch.workload.ordinal %arg3, 2 : index |

%3 = flow.dispatch.workload.ordinal %arg4, 3 : index |

// The source has dynamic shape {%0, %1}; the packed result has dynamic outer dims {%2, %3}. |
%4 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1} |

%5 = flow.dispatch.tie_shape %arg5 : !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%2, %3} |

// Load the whole source tensor. |
%6 = flow.dispatch.tensor.load %4, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1} -> tensor<?x?xf32> |

// #map is (s0 ceildiv 16): %7 and %8 are the 16-element tile counts for dim 1 (size %1) and dim 0 (size %0). |
%7 = affine.apply #map()[%1] |

%8 = affine.apply #map()[%0] |

%9 = tensor.empty(%8, %7) : tensor<?x?x16x16xf32> |

// Tile both source dims by 16, using %cst as the padding value. |
%pack = tensor.pack %6 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %9 : tensor<?x?xf32> -> tensor<?x?x16x16xf32> |

// The store uses the workload-provided outer sizes %2 and %3, while the pack destination was sized with the locally computed %8 and %7. |
flow.dispatch.tensor.store %pack, %5, offsets = [0, 0, 0, 0], sizes = [%2, %3, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%2, %3} |

return |

} |

} |

} |
|
// Executable for dispatch 3: linalg.mmt4d over two packed 16x1-tiled operands, accumulating into a 16x16-tiled read-write tensor. |
flow.executable private @matmul_dynamic_dispatch_3 { |

// Export: the workgroup count is computed from the six index workload values. |
flow.executable.export public @matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index) -> (index, index, index) { |

%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3, %arg4, %arg5 |

flow.return %x, %y, %z : index, index, index |

} |

builtin.module { |

// %arg0 and %arg1 are the read-only packed inputs, %arg2 is the read-write accumulator, %arg3..%arg8 are the index workload values. |
func.func @matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32(%arg0: !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>, %arg1: !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>, %arg2: !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index) { |

// Bind the index arguments to workload ordinals 0 through 5. |
%0 = flow.dispatch.workload.ordinal %arg3, 0 : index |

%1 = flow.dispatch.workload.ordinal %arg4, 1 : index |

%2 = flow.dispatch.workload.ordinal %arg5, 2 : index |

%3 = flow.dispatch.workload.ordinal %arg6, 3 : index |

%4 = flow.dispatch.workload.ordinal %arg7, 4 : index |

%5 = flow.dispatch.workload.ordinal %arg8, 5 : index |

// Dynamic outer dims: first input {%0, %1}, second input {%2, %3}, accumulator {%4, %5}. |
%6 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%0, %1} |

%7 = flow.dispatch.tie_shape %arg1 : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%2, %3} |

%8 = flow.dispatch.tie_shape %arg2 : !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5} |

// Load all three tensors in full. |
%9 = flow.dispatch.tensor.load %6, offsets = [0, 0, 0, 0], sizes = [%0, %1, 16, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%0, %1} -> tensor<?x?x16x1xf32> |

%10 = flow.dispatch.tensor.load %7, offsets = [0, 0, 0, 0], sizes = [%2, %3, 16, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%2, %3} -> tensor<?x?x16x1xf32> |

%11 = flow.dispatch.tensor.load %8, offsets = [0, 0, 0, 0], sizes = [%4, %5, 16, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5} -> tensor<?x?x16x16xf32> |

// The accumulator loaded from %8 is the mmt4d `outs` operand. |
%12 = linalg.mmt4d ins(%9, %10 : tensor<?x?x16x1xf32>, tensor<?x?x16x1xf32>) outs(%11 : tensor<?x?x16x16xf32>) -> tensor<?x?x16x16xf32> |

// The result is written back to the same read-write binding (%8) the accumulator was loaded from. |
flow.dispatch.tensor.store %12, %8, offsets = [0, 0, 0, 0], sizes = [%4, %5, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5} |

return |

} |

} |

} |
|
// Executable for dispatch 4: unpacks a 16x16-tiled f32 tensor back into a dynamically shaped 2-D tensor. |
flow.executable private @matmul_dynamic_dispatch_4 { |

// Export: the workgroup count is computed from the four index workload values. |
flow.executable.export public @matmul_dynamic_dispatch_4_unpack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) { |

%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3 |

flow.return %x, %y, %z : index, index, index |

} |

builtin.module { |

// %arg0 is the read-only packed source, %arg1..%arg4 are the index workload values, %arg5 is the write-only unpacked result. |
func.func @matmul_dynamic_dispatch_4_unpack_f32(%arg0: !flow.dispatch.tensor<readonly:tensor<?x?x16x16xf32>>, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>) { |

// Bind the index arguments to workload ordinals 0 through 3. |
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index |

%1 = flow.dispatch.workload.ordinal %arg2, 1 : index |

%2 = flow.dispatch.workload.ordinal %arg3, 2 : index |

%3 = flow.dispatch.workload.ordinal %arg4, 3 : index |

// The packed source has dynamic outer dims {%0, %1}; the unpacked result has dynamic shape {%2, %3}. |
%4 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x?x16x16xf32>>{%0, %1} |

%5 = flow.dispatch.tie_shape %arg5 : !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%2, %3} |

// Load the whole packed tensor. |
%6 = flow.dispatch.tensor.load %4, offsets = [0, 0, 0, 0], sizes = [%0, %1, 16, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x16xf32>>{%0, %1} -> tensor<?x?x16x16xf32> |

// Destination for the unpack, sized from workload ordinals 2 and 3. |
%7 = tensor.empty(%2, %3) : tensor<?x?xf32> |

// Undo the 16x16 tiling on both dims. |
%unpack = tensor.unpack %6 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %7 : tensor<?x?x16x16xf32> -> tensor<?x?xf32> |

flow.dispatch.tensor.store %unpack, %5, offsets = [0, 0], sizes = [%2, %3], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%2, %3} |

return |

} |

} |

} |
|
// ABI entry point: takes three buffer views (input0, input1, input2), runs dispatches 0-4 (pack, pack, pack, mmt4d, unpack) and returns one buffer view (output0). |
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} { |

// Query both dynamic dims of input0 and import it as tensor<?x?xf32>{%0, %1}. |
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index |

%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index |

%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xf32>{%0, %1} |

// Same for input1 -> tensor<?x?xf32>{%3, %4}. |
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index |

%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index |

%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xf32>{%3, %4} |

// Same for input2 -> tensor<?x?xf32>{%6, %7}. |
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index |

%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index |

%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xf32>{%6, %7} |

// %9 = ceildiv(dim 0 of input0, 16) via #map. |
%9 = affine.apply #map()[%0] |

// Pack input0 into tensor<?x?x16x1xf32>{%9, %1}; workload is [%0, %9, %1]. |
%10 = flow.dispatch @matmul_dynamic_dispatch_0::@matmul_dynamic_dispatch_0_pack_f32[%0, %9, %1](%2, %0, %9, %1) : (tensor<?x?xf32>{%0, %1}, index, index, index) -> tensor<?x?x16x1xf32>{%9, %1} |

// %11 = ceildiv(dim 1 of input1, 16) via #map. |
%11 = affine.apply #map()[%4] |

// Pack input1 into tensor<?x?x16x1xf32>{%11, %3}; workload is [%4, %11, %3]. |
%12 = flow.dispatch @matmul_dynamic_dispatch_1::@matmul_dynamic_dispatch_1_pack_f32[%4, %11, %3](%5, %4, %11, %3) : (tensor<?x?xf32>{%3, %4}, index, index, index) -> tensor<?x?x16x1xf32>{%11, %3} |

// %13 and %14 = ceildiv of input2's dim 0 and dim 1 by 16 via #map. |
%13 = affine.apply #map()[%6] |

%14 = affine.apply #map()[%7] |

// Pack input2 into tensor<?x?x16x16xf32>{%13, %14}. |
%15 = flow.dispatch @matmul_dynamic_dispatch_2::@matmul_dynamic_dispatch_2_pack_f32[%6, %7, %13, %14](%8, %6, %7, %13, %14) : (tensor<?x?xf32>{%6, %7}, index, index, index, index) -> tensor<?x?x16x16xf32>{%13, %14} |

// mmt4d over the packed operands; the result type `%15{%13, %14}` ties the result to operand %15 (the packed input2). |
%16 = flow.dispatch @matmul_dynamic_dispatch_3::@matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32[%9, %1, %11, %3, %13, %14](%10, %12, %15, %9, %1, %11, %3, %13, %14) : (tensor<?x?x16x1xf32>{%9, %1}, tensor<?x?x16x1xf32>{%11, %3}, tensor<?x?x16x16xf32>{%13, %14}, index, index, index, index, index, index) -> %15{%13, %14} |

// Unpack the tiled result back to tensor<?x?xf32>{%6, %7}, i.e. the dims of input2. |
%17 = flow.dispatch @matmul_dynamic_dispatch_4::@matmul_dynamic_dispatch_4_unpack_f32[%13, %14, %6, %7](%16, %13, %14, %6, %7) : (tensor<?x?x16x16xf32>{%13, %14}, index, index, index, index) -> tensor<?x?xf32>{%6, %7} |

// Export the unpacked tensor as the "output0" buffer view. |
%18 = hal.tensor.export %17 "output0" : tensor<?x?xf32>{%6, %7} -> !hal.buffer_view |

return %18 : !hal.buffer_view |

} |
|
} |
|
|
|
|
|
// -----// IR Dump After FuseGlobals (iree-util-fuse-globals) //----- // |
|
// Attribute alias for the executable target: "llvm-cpu" backend, "embedded-elf-x86_64" format, cpu = "znver4", native_vector_size = 64, ukernels = "all". |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "znver4", cpu_features = "+mmx,+popcnt,+sse,+sse2,+sse3,+ssse3,+sse4.1,+sse4.2,+avx,+avx2,+sse4a,+fma,+avx512f,+bmi,+bmi2,+aes,+pclmul,+avx512vl,+avx512bw,+avx512dq,+avx512cd,+avx512vbmi,+avx512ifma,+avx512vpopcntdq,+avx512vbmi2,+gfni,+vpclmulqdq,+avx512vnni,+avx512bitalg,+avx512bf16,+adx,+clflushopt,+clwb,+clzero,+cx16,+cx8,+f16c,+fsgsbase,+crc32,+invpcid,+rdpru,+sahf,+lzcnt,+movbe,+mwaitx,+x87,+pku,+prfchw,+rdpid,+rdrnd,+rdseed,+sha,+shstk,+vaes,+wbnoinvd,+xsave,+xsavec,+xsaveopt,+xsaves,+fxsr,+evex512", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "all"}> |

// Affine map computing ceildiv(s0, 16); the IR below applies it to dynamic dimensions to obtain counts of 16-element tiles. |
#map = affine_map<()[s0] -> (s0 ceildiv 16)> |

// Device target alias whose only executable target is the alias defined above. |
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#executable_target_embedded_elf_x86_64_]}> |
|
module attributes {hal.device.targets = [#device_target_llvm_cpu]} { |
|
// Executable for dispatch 0: packs a dynamically shaped 2-D f32 tensor into 16x1 inner tiles. |
flow.executable private @matmul_dynamic_dispatch_0 { |

// Export: the workgroup count is computed from the three index workload values. |
flow.executable.export public @matmul_dynamic_dispatch_0_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) { |

%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2 |

flow.return %x, %y, %z : index, index, index |

} |

builtin.module { |

// %arg0 is the read-only source, %arg1..%arg3 are the index workload values, %arg4 is the write-only packed result. |
func.func @matmul_dynamic_dispatch_0_pack_f32(%arg0: !flow.dispatch.tensor<readonly:tensor<?x?xf32>>, %arg1: index, %arg2: index, %arg3: index, %arg4: !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>) { |

// Padding value handed to tensor.pack below. |
%cst = arith.constant 0.000000e+00 : f32 |

// Bind the index arguments to workload ordinals 0, 1 and 2. |
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index |

%1 = flow.dispatch.workload.ordinal %arg2, 1 : index |

%2 = flow.dispatch.workload.ordinal %arg3, 2 : index |

// The source has dynamic shape {%0, %2}; the packed result has dynamic outer dims {%1, %2}. |
%3 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %2} |

%4 = flow.dispatch.tie_shape %arg4 : !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2} |

// Load the whole source tensor. |
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [%0, %2], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %2} -> tensor<?x?xf32> |

// #map is (s0 ceildiv 16), so %6 is the number of 16-element tiles needed to cover dim 0 (size %0). |
%6 = affine.apply #map()[%0] |

%7 = tensor.empty(%6, %2) : tensor<?x?x16x1xf32> |

// Tile source dim 0 by 16 and dim 1 by 1, using %cst as the padding value. |
%pack = tensor.pack %5 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 1] into %7 : tensor<?x?xf32> -> tensor<?x?x16x1xf32> |

// The store uses the workload-provided outer size %1, while the pack destination was sized with the locally computed %6. |
flow.dispatch.tensor.store %pack, %4, offsets = [0, 0, 0, 0], sizes = [%1, %2, 16, 1], strides = [1, 1, 1, 1] : tensor<?x?x16x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2} |

return |

} |

} |

} |
|
// Executable for dispatch 1: packs a dynamically shaped 2-D f32 tensor into 16x1 inner tiles with the outer dims transposed. |
flow.executable private @matmul_dynamic_dispatch_1 { |

// Export: the workgroup count is computed from the three index workload values. |
flow.executable.export public @matmul_dynamic_dispatch_1_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) { |

%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2 |

flow.return %x, %y, %z : index, index, index |

} |

builtin.module { |

// %arg0 is the read-only source, %arg1..%arg3 are the index workload values, %arg4 is the write-only packed result. |
func.func @matmul_dynamic_dispatch_1_pack_f32(%arg0: !flow.dispatch.tensor<readonly:tensor<?x?xf32>>, %arg1: index, %arg2: index, %arg3: index, %arg4: !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>) { |

// Padding value handed to tensor.pack below. |
%cst = arith.constant 0.000000e+00 : f32 |

// Bind the index arguments to workload ordinals 0, 1 and 2. |
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index |

%1 = flow.dispatch.workload.ordinal %arg2, 1 : index |

%2 = flow.dispatch.workload.ordinal %arg3, 2 : index |

// The source has dynamic shape {%2, %0} (note the order); the packed result has dynamic outer dims {%1, %2}. |
%3 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %0} |

%4 = flow.dispatch.tie_shape %arg4 : !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2} |

// Load the whole source tensor. |
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [%2, %0], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %0} -> tensor<?x?xf32> |

// #map is (s0 ceildiv 16), so %6 is the number of 16-element tiles needed to cover source dim 1 (size %0). |
%6 = affine.apply #map()[%0] |

%7 = tensor.empty(%6, %2) : tensor<?x?x16x1xf32> |

// inner_dims_pos = [1, 0]: tile source dim 1 by 16 and dim 0 by 1; outer_dims_perm = [1, 0] swaps the outer dims. %cst is the padding value. |
%pack = tensor.pack %5 padding_value(%cst : f32) outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [16, 1] into %7 : tensor<?x?xf32> -> tensor<?x?x16x1xf32> |

// The store uses the workload-provided outer size %1, while the pack destination was sized with the locally computed %6. |
flow.dispatch.tensor.store %pack, %4, offsets = [0, 0, 0, 0], sizes = [%1, %2, 16, 1], strides = [1, 1, 1, 1] : tensor<?x?x16x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2} |

return |

} |

} |

} |
|
// Executable for dispatch 2: packs a dynamically shaped 2-D f32 tensor into 16x16 inner tiles. |
flow.executable private @matmul_dynamic_dispatch_2 { |

// Export: the workgroup count is computed from the four index workload values. |
flow.executable.export public @matmul_dynamic_dispatch_2_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) { |

%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3 |

flow.return %x, %y, %z : index, index, index |

} |

builtin.module { |

// %arg0 is the read-only source, %arg1..%arg4 are the index workload values, %arg5 is the write-only packed result. |
func.func @matmul_dynamic_dispatch_2_pack_f32(%arg0: !flow.dispatch.tensor<readonly:tensor<?x?xf32>>, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>) { |

// Padding value handed to tensor.pack below. |
%cst = arith.constant 0.000000e+00 : f32 |

// Bind the index arguments to workload ordinals 0 through 3. |
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index |

%1 = flow.dispatch.workload.ordinal %arg2, 1 : index |

%2 = flow.dispatch.workload.ordinal %arg3, 2 : index |

%3 = flow.dispatch.workload.ordinal %arg4, 3 : index |

// The source has dynamic shape {%0, %1}; the packed result has dynamic outer dims {%2, %3}. |
%4 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1} |

%5 = flow.dispatch.tie_shape %arg5 : !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%2, %3} |

// Load the whole source tensor. |
%6 = flow.dispatch.tensor.load %4, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1} -> tensor<?x?xf32> |

// #map is (s0 ceildiv 16): %7 and %8 are the 16-element tile counts for dim 1 (size %1) and dim 0 (size %0). |
%7 = affine.apply #map()[%1] |

%8 = affine.apply #map()[%0] |

%9 = tensor.empty(%8, %7) : tensor<?x?x16x16xf32> |

// Tile both source dims by 16, using %cst as the padding value. |
%pack = tensor.pack %6 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %9 : tensor<?x?xf32> -> tensor<?x?x16x16xf32> |

// The store uses the workload-provided outer sizes %2 and %3, while the pack destination was sized with the locally computed %8 and %7. |
flow.dispatch.tensor.store %pack, %5, offsets = [0, 0, 0, 0], sizes = [%2, %3, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%2, %3} |

return |

} |

} |

} |
|
// Executable for dispatch 3: linalg.mmt4d over two packed 16x1-tiled operands, accumulating into a 16x16-tiled read-write tensor. |
flow.executable private @matmul_dynamic_dispatch_3 { |

// Export: the workgroup count is computed from the six index workload values. |
flow.executable.export public @matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index) -> (index, index, index) { |

%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3, %arg4, %arg5 |

flow.return %x, %y, %z : index, index, index |

} |

builtin.module { |

// %arg0 and %arg1 are the read-only packed inputs, %arg2 is the read-write accumulator, %arg3..%arg8 are the index workload values. |
func.func @matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32(%arg0: !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>, %arg1: !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>, %arg2: !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index) { |

// Bind the index arguments to workload ordinals 0 through 5. |
%0 = flow.dispatch.workload.ordinal %arg3, 0 : index |

%1 = flow.dispatch.workload.ordinal %arg4, 1 : index |

%2 = flow.dispatch.workload.ordinal %arg5, 2 : index |

%3 = flow.dispatch.workload.ordinal %arg6, 3 : index |

%4 = flow.dispatch.workload.ordinal %arg7, 4 : index |

%5 = flow.dispatch.workload.ordinal %arg8, 5 : index |

// Dynamic outer dims: first input {%0, %1}, second input {%2, %3}, accumulator {%4, %5}. |
%6 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%0, %1} |

%7 = flow.dispatch.tie_shape %arg1 : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%2, %3} |

%8 = flow.dispatch.tie_shape %arg2 : !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5} |

// Load all three tensors in full. |
%9 = flow.dispatch.tensor.load %6, offsets = [0, 0, 0, 0], sizes = [%0, %1, 16, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%0, %1} -> tensor<?x?x16x1xf32> |

%10 = flow.dispatch.tensor.load %7, offsets = [0, 0, 0, 0], sizes = [%2, %3, 16, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%2, %3} -> tensor<?x?x16x1xf32> |

%11 = flow.dispatch.tensor.load %8, offsets = [0, 0, 0, 0], sizes = [%4, %5, 16, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5} -> tensor<?x?x16x16xf32> |

// The accumulator loaded from %8 is the mmt4d `outs` operand. |
%12 = linalg.mmt4d ins(%9, %10 : tensor<?x?x16x1xf32>, tensor<?x?x16x1xf32>) outs(%11 : tensor<?x?x16x16xf32>) -> tensor<?x?x16x16xf32> |

// The result is written back to the same read-write binding (%8) the accumulator was loaded from. |
flow.dispatch.tensor.store %12, %8, offsets = [0, 0, 0, 0], sizes = [%4, %5, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5} |

return |

} |

} |

} |
|
// Executable for dispatch 4: unpacks a 16x16-tiled f32 tensor back into a dynamically shaped 2-D tensor. |
flow.executable private @matmul_dynamic_dispatch_4 { |

// Export: the workgroup count is computed from the four index workload values. |
flow.executable.export public @matmul_dynamic_dispatch_4_unpack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) { |

%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3 |

flow.return %x, %y, %z : index, index, index |

} |

builtin.module { |

// %arg0 is the read-only packed source, %arg1..%arg4 are the index workload values, %arg5 is the write-only unpacked result. |
func.func @matmul_dynamic_dispatch_4_unpack_f32(%arg0: !flow.dispatch.tensor<readonly:tensor<?x?x16x16xf32>>, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>) { |

// Bind the index arguments to workload ordinals 0 through 3. |
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index |

%1 = flow.dispatch.workload.ordinal %arg2, 1 : index |

%2 = flow.dispatch.workload.ordinal %arg3, 2 : index |

%3 = flow.dispatch.workload.ordinal %arg4, 3 : index |

// The packed source has dynamic outer dims {%0, %1}; the unpacked result has dynamic shape {%2, %3}. |
%4 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x?x16x16xf32>>{%0, %1} |

%5 = flow.dispatch.tie_shape %arg5 : !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%2, %3} |

// Load the whole packed tensor. |
%6 = flow.dispatch.tensor.load %4, offsets = [0, 0, 0, 0], sizes = [%0, %1, 16, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x16xf32>>{%0, %1} -> tensor<?x?x16x16xf32> |

// Destination for the unpack, sized from workload ordinals 2 and 3. |
%7 = tensor.empty(%2, %3) : tensor<?x?xf32> |

// Undo the 16x16 tiling on both dims. |
%unpack = tensor.unpack %6 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %7 : tensor<?x?x16x16xf32> -> tensor<?x?xf32> |

flow.dispatch.tensor.store %unpack, %5, offsets = [0, 0], sizes = [%2, %3], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%2, %3} |

return |

} |

} |

} |
|
// ABI entry point: takes three buffer views (input0, input1, input2), runs dispatches 0-4 (pack, pack, pack, mmt4d, unpack) and returns one buffer view (output0). |
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} { |

// Query both dynamic dims of input0 and import it as tensor<?x?xf32>{%0, %1}. |
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index |

%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index |

%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xf32>{%0, %1} |

// Same for input1 -> tensor<?x?xf32>{%3, %4}. |
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index |

%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index |

%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xf32>{%3, %4} |

// Same for input2 -> tensor<?x?xf32>{%6, %7}. |
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index |

%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index |

%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xf32>{%6, %7} |

// %9 = ceildiv(dim 0 of input0, 16) via #map. |
%9 = affine.apply #map()[%0] |

// Pack input0 into tensor<?x?x16x1xf32>{%9, %1}; workload is [%0, %9, %1]. |
%10 = flow.dispatch @matmul_dynamic_dispatch_0::@matmul_dynamic_dispatch_0_pack_f32[%0, %9, %1](%2, %0, %9, %1) : (tensor<?x?xf32>{%0, %1}, index, index, index) -> tensor<?x?x16x1xf32>{%9, %1} |

// %11 = ceildiv(dim 1 of input1, 16) via #map. |
%11 = affine.apply #map()[%4] |

// Pack input1 into tensor<?x?x16x1xf32>{%11, %3}; workload is [%4, %11, %3]. |
%12 = flow.dispatch @matmul_dynamic_dispatch_1::@matmul_dynamic_dispatch_1_pack_f32[%4, %11, %3](%5, %4, %11, %3) : (tensor<?x?xf32>{%3, %4}, index, index, index) -> tensor<?x?x16x1xf32>{%11, %3} |

// %13 and %14 = ceildiv of input2's dim 0 and dim 1 by 16 via #map. |
%13 = affine.apply #map()[%6] |

%14 = affine.apply #map()[%7] |

// Pack input2 into tensor<?x?x16x16xf32>{%13, %14}. |
%15 = flow.dispatch @matmul_dynamic_dispatch_2::@matmul_dynamic_dispatch_2_pack_f32[%6, %7, %13, %14](%8, %6, %7, %13, %14) : (tensor<?x?xf32>{%6, %7}, index, index, index, index) -> tensor<?x?x16x16xf32>{%13, %14} |

// mmt4d over the packed operands; the result type `%15{%13, %14}` ties the result to operand %15 (the packed input2). |
%16 = flow.dispatch @matmul_dynamic_dispatch_3::@matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32[%9, %1, %11, %3, %13, %14](%10, %12, %15, %9, %1, %11, %3, %13, %14) : (tensor<?x?x16x1xf32>{%9, %1}, tensor<?x?x16x1xf32>{%11, %3}, tensor<?x?x16x16xf32>{%13, %14}, index, index, index, index, index, index) -> %15{%13, %14} |

// Unpack the tiled result back to tensor<?x?xf32>{%6, %7}, i.e. the dims of input2. |
%17 = flow.dispatch @matmul_dynamic_dispatch_4::@matmul_dynamic_dispatch_4_unpack_f32[%13, %14, %6, %7](%16, %13, %14, %6, %7) : (tensor<?x?x16x16xf32>{%13, %14}, index, index, index, index) -> tensor<?x?xf32>{%6, %7} |

// Export the unpacked tensor as the "output0" buffer view. |
%18 = hal.tensor.export %17 "output0" : tensor<?x?xf32>{%6, %7} -> !hal.buffer_view |

return %18 : !hal.buffer_view |

} |
|
} |
|
|
|
|
|
// -----// IR Dump After IPO (iree-util-ipo) //----- // |
|
// Attribute alias for the executable target: "llvm-cpu" backend, "embedded-elf-x86_64" format, cpu = "znver4", native_vector_size = 64, ukernels = "all". |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "znver4", cpu_features = "+mmx,+popcnt,+sse,+sse2,+sse3,+ssse3,+sse4.1,+sse4.2,+avx,+avx2,+sse4a,+fma,+avx512f,+bmi,+bmi2,+aes,+pclmul,+avx512vl,+avx512bw,+avx512dq,+avx512cd,+avx512vbmi,+avx512ifma,+avx512vpopcntdq,+avx512vbmi2,+gfni,+vpclmulqdq,+avx512vnni,+avx512bitalg,+avx512bf16,+adx,+clflushopt,+clwb,+clzero,+cx16,+cx8,+f16c,+fsgsbase,+crc32,+invpcid,+rdpru,+sahf,+lzcnt,+movbe,+mwaitx,+x87,+pku,+prfchw,+rdpid,+rdrnd,+rdseed,+sha,+shstk,+vaes,+wbnoinvd,+xsave,+xsavec,+xsaveopt,+xsaves,+fxsr,+evex512", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "all"}> |

// Affine map computing ceildiv(s0, 16); the IR below applies it to dynamic dimensions to obtain counts of 16-element tiles. |
#map = affine_map<()[s0] -> (s0 ceildiv 16)> |

// Device target alias whose only executable target is the alias defined above. |
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#executable_target_embedded_elf_x86_64_]}> |
|
module attributes {hal.device.targets = [#device_target_llvm_cpu]} { |
|
// Executable for dispatch 0: packs a dynamically shaped 2-D f32 tensor into 16x1 inner tiles. |
flow.executable private @matmul_dynamic_dispatch_0 { |

// Export: the workgroup count is computed from the three index workload values. |
flow.executable.export public @matmul_dynamic_dispatch_0_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) { |

%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2 |

flow.return %x, %y, %z : index, index, index |

} |

builtin.module { |

// %arg0 is the read-only source, %arg1..%arg3 are the index workload values, %arg4 is the write-only packed result. |
func.func @matmul_dynamic_dispatch_0_pack_f32(%arg0: !flow.dispatch.tensor<readonly:tensor<?x?xf32>>, %arg1: index, %arg2: index, %arg3: index, %arg4: !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>) { |

// Padding value handed to tensor.pack below. |
%cst = arith.constant 0.000000e+00 : f32 |

// Bind the index arguments to workload ordinals 0, 1 and 2. |
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index |

%1 = flow.dispatch.workload.ordinal %arg2, 1 : index |

%2 = flow.dispatch.workload.ordinal %arg3, 2 : index |

// The source has dynamic shape {%0, %2}; the packed result has dynamic outer dims {%1, %2}. |
%3 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %2} |

%4 = flow.dispatch.tie_shape %arg4 : !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2} |

// Load the whole source tensor. |
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [%0, %2], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %2} -> tensor<?x?xf32> |

// #map is (s0 ceildiv 16), so %6 is the number of 16-element tiles needed to cover dim 0 (size %0). |
%6 = affine.apply #map()[%0] |

%7 = tensor.empty(%6, %2) : tensor<?x?x16x1xf32> |

// Tile source dim 0 by 16 and dim 1 by 1, using %cst as the padding value. |
%pack = tensor.pack %5 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 1] into %7 : tensor<?x?xf32> -> tensor<?x?x16x1xf32> |

// The store uses the workload-provided outer size %1, while the pack destination was sized with the locally computed %6. |
flow.dispatch.tensor.store %pack, %4, offsets = [0, 0, 0, 0], sizes = [%1, %2, 16, 1], strides = [1, 1, 1, 1] : tensor<?x?x16x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2} |

return |

} |

} |

} |
|
// Executable for dispatch 1: packs a dynamically shaped 2-D f32 tensor into 16x1 inner tiles with the outer dims transposed. |
flow.executable private @matmul_dynamic_dispatch_1 { |

// Export: the workgroup count is computed from the three index workload values. |
flow.executable.export public @matmul_dynamic_dispatch_1_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) { |

%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2 |

flow.return %x, %y, %z : index, index, index |

} |

builtin.module { |

// %arg0 is the read-only source, %arg1..%arg3 are the index workload values, %arg4 is the write-only packed result. |
func.func @matmul_dynamic_dispatch_1_pack_f32(%arg0: !flow.dispatch.tensor<readonly:tensor<?x?xf32>>, %arg1: index, %arg2: index, %arg3: index, %arg4: !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>) { |

// Padding value handed to tensor.pack below. |
%cst = arith.constant 0.000000e+00 : f32 |

// Bind the index arguments to workload ordinals 0, 1 and 2. |
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index |

%1 = flow.dispatch.workload.ordinal %arg2, 1 : index |

%2 = flow.dispatch.workload.ordinal %arg3, 2 : index |

// The source has dynamic shape {%2, %0} (note the order); the packed result has dynamic outer dims {%1, %2}. |
%3 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %0} |

%4 = flow.dispatch.tie_shape %arg4 : !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2} |

// Load the whole source tensor. |
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [%2, %0], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %0} -> tensor<?x?xf32> |

// #map is (s0 ceildiv 16), so %6 is the number of 16-element tiles needed to cover source dim 1 (size %0). |
%6 = affine.apply #map()[%0] |

%7 = tensor.empty(%6, %2) : tensor<?x?x16x1xf32> |

// inner_dims_pos = [1, 0]: tile source dim 1 by 16 and dim 0 by 1; outer_dims_perm = [1, 0] swaps the outer dims. %cst is the padding value. |
%pack = tensor.pack %5 padding_value(%cst : f32) outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [16, 1] into %7 : tensor<?x?xf32> -> tensor<?x?x16x1xf32> |

// The store uses the workload-provided outer size %1, while the pack destination was sized with the locally computed %6. |
flow.dispatch.tensor.store %pack, %4, offsets = [0, 0, 0, 0], sizes = [%1, %2, 16, 1], strides = [1, 1, 1, 1] : tensor<?x?x16x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2} |

return |

} |

} |

} |
|
// Executable for dispatch 2: packs a dynamically shaped 2-D f32 tensor into 16x16 inner tiles. |
flow.executable private @matmul_dynamic_dispatch_2 { |

// Export: the workgroup count is computed from the four index workload values. |
flow.executable.export public @matmul_dynamic_dispatch_2_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) { |

%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3 |

flow.return %x, %y, %z : index, index, index |

} |

builtin.module { |

// %arg0 is the read-only source, %arg1..%arg4 are the index workload values, %arg5 is the write-only packed result. |
func.func @matmul_dynamic_dispatch_2_pack_f32(%arg0: !flow.dispatch.tensor<readonly:tensor<?x?xf32>>, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>) { |

// Padding value handed to tensor.pack below. |
%cst = arith.constant 0.000000e+00 : f32 |

// Bind the index arguments to workload ordinals 0 through 3. |
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index |

%1 = flow.dispatch.workload.ordinal %arg2, 1 : index |

%2 = flow.dispatch.workload.ordinal %arg3, 2 : index |

%3 = flow.dispatch.workload.ordinal %arg4, 3 : index |

// The source has dynamic shape {%0, %1}; the packed result has dynamic outer dims {%2, %3}. |
%4 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1} |

%5 = flow.dispatch.tie_shape %arg5 : !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%2, %3} |

// Load the whole source tensor. |
%6 = flow.dispatch.tensor.load %4, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1} -> tensor<?x?xf32> |

// #map is (s0 ceildiv 16): %7 and %8 are the 16-element tile counts for dim 1 (size %1) and dim 0 (size %0). |
%7 = affine.apply #map()[%1] |

%8 = affine.apply #map()[%0] |

%9 = tensor.empty(%8, %7) : tensor<?x?x16x16xf32> |

// Tile both source dims by 16, using %cst as the padding value. |
%pack = tensor.pack %6 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %9 : tensor<?x?xf32> -> tensor<?x?x16x16xf32> |

// The store uses the workload-provided outer sizes %2 and %3, while the pack destination was sized with the locally computed %8 and %7. |
flow.dispatch.tensor.store %pack, %5, offsets = [0, 0, 0, 0], sizes = [%2, %3, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%2, %3} |

return |

} |

} |

} |
|
// Dispatch 3: linalg.mmt4d over two packed ?x?x16x1 f32 operands, accumulating into a readwrite ?x?x16x16 f32 tensor. |
flow.executable private @matmul_dynamic_dispatch_3 { |

// Export: six index workload values in, (x, y, z) workgroup counts out. |
flow.executable.export public @matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index) -> (index, index, index) { |

%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3, %arg4, %arg5 |

flow.return %x, %y, %z : index, index, index |

} |

builtin.module { |

func.func @matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32(%arg0: !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>, %arg1: !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>, %arg2: !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index) { |

// Ordinals 0-1 size %arg0, ordinals 2-3 size %arg1, ordinals 4-5 size %arg2 (see the tie_shape ops below). |
%0 = flow.dispatch.workload.ordinal %arg3, 0 : index |

%1 = flow.dispatch.workload.ordinal %arg4, 1 : index |

%2 = flow.dispatch.workload.ordinal %arg5, 2 : index |

%3 = flow.dispatch.workload.ordinal %arg6, 3 : index |

%4 = flow.dispatch.workload.ordinal %arg7, 4 : index |

%5 = flow.dispatch.workload.ordinal %arg8, 5 : index |

%6 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%0, %1} |

%7 = flow.dispatch.tie_shape %arg1 : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%2, %3} |

%8 = flow.dispatch.tie_shape %arg2 : !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5} |

%9 = flow.dispatch.tensor.load %6, offsets = [0, 0, 0, 0], sizes = [%0, %1, 16, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%0, %1} -> tensor<?x?x16x1xf32> |

%10 = flow.dispatch.tensor.load %7, offsets = [0, 0, 0, 0], sizes = [%2, %3, 16, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%2, %3} -> tensor<?x?x16x1xf32> |

// The accumulator is loaded from the readwrite tensor %8 and the result is stored back to the same %8. |
%11 = flow.dispatch.tensor.load %8, offsets = [0, 0, 0, 0], sizes = [%4, %5, 16, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5} -> tensor<?x?x16x16xf32> |

%12 = linalg.mmt4d ins(%9, %10 : tensor<?x?x16x1xf32>, tensor<?x?x16x1xf32>) outs(%11 : tensor<?x?x16x16xf32>) -> tensor<?x?x16x16xf32> |

flow.dispatch.tensor.store %12, %8, offsets = [0, 0, 0, 0], sizes = [%4, %5, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5} |

return |

} |

} |

} |
|
// Dispatch 4: unpacks a ?x?x16x16 f32 tensor (16x16 inner tiles) back into a plain ?x? f32 tensor. |
flow.executable private @matmul_dynamic_dispatch_4 { |

// Export: four index workload values in, (x, y, z) workgroup counts out. |
flow.executable.export public @matmul_dynamic_dispatch_4_unpack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) { |

%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3 |

flow.return %x, %y, %z : index, index, index |

} |

builtin.module { |

func.func @matmul_dynamic_dispatch_4_unpack_f32(%arg0: !flow.dispatch.tensor<readonly:tensor<?x?x16x16xf32>>, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>) { |

// Ordinals 0-1 are the outer dims of the packed source; ordinals 2-3 are the dims of the unpacked destination. |
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index |

%1 = flow.dispatch.workload.ordinal %arg2, 1 : index |

%2 = flow.dispatch.workload.ordinal %arg3, 2 : index |

%3 = flow.dispatch.workload.ordinal %arg4, 3 : index |

%4 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x?x16x16xf32>>{%0, %1} |

%5 = flow.dispatch.tie_shape %arg5 : !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%2, %3} |

%6 = flow.dispatch.tensor.load %4, offsets = [0, 0, 0, 0], sizes = [%0, %1, 16, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x16xf32>>{%0, %1} -> tensor<?x?x16x16xf32> |

// Destination for the unpack, sized directly from workload ordinals 2-3. |
%7 = tensor.empty(%2, %3) : tensor<?x?xf32> |

%unpack = tensor.unpack %6 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %7 : tensor<?x?x16x16xf32> -> tensor<?x?xf32> |

flow.dispatch.tensor.store %unpack, %5, offsets = [0, 0], sizes = [%2, %3], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%2, %3} |

return |

} |

} |

} |
|
// Host entry point at the flow level: imports three buffer views as dynamic ?x? f32 tensors, runs dispatches 0-2 (pack), 3 (mmt4d) and 4 (unpack), and exports the unpacked result as "output0". |
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} { |

// Each input's two dynamic dims are queried from its buffer view and attached to the imported tensor. |
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index |

%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index |

%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xf32>{%0, %1} |

%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index |

%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index |

%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xf32>{%3, %4} |

%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index |

%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index |

%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xf32>{%6, %7} |

// Dispatch 0 packs input0 into ?x?x16x1 with outer dims {%9, %1}; %9 is #map applied to input0's dim 0. |
%9 = affine.apply #map()[%0] |

%10 = flow.dispatch @matmul_dynamic_dispatch_0::@matmul_dynamic_dispatch_0_pack_f32[%0, %9, %1](%2, %0, %9, %1) : (tensor<?x?xf32>{%0, %1}, index, index, index) -> tensor<?x?x16x1xf32>{%9, %1} |

// Dispatch 1 packs input1 into ?x?x16x1 with outer dims {%11, %3}; %11 is #map applied to input1's dim 1. |
%11 = affine.apply #map()[%4] |

%12 = flow.dispatch @matmul_dynamic_dispatch_1::@matmul_dynamic_dispatch_1_pack_f32[%4, %11, %3](%5, %4, %11, %3) : (tensor<?x?xf32>{%3, %4}, index, index, index) -> tensor<?x?x16x1xf32>{%11, %3} |

// Dispatch 2 packs input2 into ?x?x16x16 with outer dims {%13, %14}. |
%13 = affine.apply #map()[%6] |

%14 = affine.apply #map()[%7] |

%15 = flow.dispatch @matmul_dynamic_dispatch_2::@matmul_dynamic_dispatch_2_pack_f32[%6, %7, %13, %14](%8, %6, %7, %13, %14) : (tensor<?x?xf32>{%6, %7}, index, index, index, index) -> tensor<?x?x16x16xf32>{%13, %14} |

// The mmt4d result type is written as "-> %15{%13, %14}": the result is tied to operand %15 instead of a new tensor type. |
%16 = flow.dispatch @matmul_dynamic_dispatch_3::@matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32[%9, %1, %11, %3, %13, %14](%10, %12, %15, %9, %1, %11, %3, %13, %14) : (tensor<?x?x16x1xf32>{%9, %1}, tensor<?x?x16x1xf32>{%11, %3}, tensor<?x?x16x16xf32>{%13, %14}, index, index, index, index, index, index) -> %15{%13, %14} |

// Dispatch 4 unpacks to the original input2 shape {%6, %7}. |
%17 = flow.dispatch @matmul_dynamic_dispatch_4::@matmul_dynamic_dispatch_4_unpack_f32[%13, %14, %6, %7](%16, %13, %14, %6, %7) : (tensor<?x?x16x16xf32>{%13, %14}, index, index, index, index) -> tensor<?x?xf32>{%6, %7} |

%18 = hal.tensor.export %17 "output0" : tensor<?x?xf32>{%6, %7} -> !hal.buffer_view |

return %18 : !hal.buffer_view |

} |
|
} |
|
|
|
|
|
// -----// IR Dump After ConvertToStreamPass (iree-stream-conversion) //----- // |
|
// Attribute aliases referenced by the module that follows. |
// Executable target: "llvm-cpu" backend, "embedded-elf-x86_64" format, cpu "znver4", native_vector_size 64, ukernels "all". |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "znver4", cpu_features = "+mmx,+popcnt,+sse,+sse2,+sse3,+ssse3,+sse4.1,+sse4.2,+avx,+avx2,+sse4a,+fma,+avx512f,+bmi,+bmi2,+aes,+pclmul,+avx512vl,+avx512bw,+avx512dq,+avx512cd,+avx512vbmi,+avx512ifma,+avx512vpopcntdq,+avx512vbmi2,+gfni,+vpclmulqdq,+avx512vnni,+avx512bitalg,+avx512bf16,+adx,+clflushopt,+clwb,+clzero,+cx16,+cx8,+f16c,+fsgsbase,+crc32,+invpcid,+rdpru,+sahf,+lzcnt,+movbe,+mwaitx,+x87,+pku,+prfchw,+rdpid,+rdrnd,+rdseed,+sha,+shstk,+vaes,+wbnoinvd,+xsave,+xsavec,+xsaveopt,+xsaves,+fxsr,+evex512", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "all"}> |

// Zero-dim, one-symbol affine map computing s0 ceildiv 16. |
#map = affine_map<()[s0] -> (s0 ceildiv 16)> |

// Device target whose executable_targets list holds the single executable target defined above. |
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#executable_target_embedded_elf_x86_64_]}> |
|
module attributes {hal.device.targets = [#device_target_llvm_cpu]} { |
|
// Dispatch 0 (stream form): packs a ?x? f32 tensor into ?x?x16x1 tiles, padding with 0.0; operands arrive as !stream.binding values. |
stream.executable private @matmul_dynamic_dispatch_0 { |

// Export: three index workload values in, (x, y, z) workgroup counts out. |
stream.executable.export public @matmul_dynamic_dispatch_0_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) { |

%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2 |

stream.return %x, %y, %z : index, index, index |

} |

builtin.module { |

func.func @matmul_dynamic_dispatch_0_pack_f32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: !stream.binding) { |

// Offset used for both binding subspans. |
%c0 = arith.constant 0 : index |

// Padding value handed to tensor.pack below. |
%cst = arith.constant 0.000000e+00 : f32 |

%0 = flow.dispatch.workload.ordinal %arg1, 0 : index |

%1 = flow.dispatch.workload.ordinal %arg2, 1 : index |

%2 = flow.dispatch.workload.ordinal %arg3, 2 : index |

// Source binding viewed as ?x? with shape {%0, %2}; destination binding viewed as ?x?x16x1 with outer dims {%1, %2}. |
%3 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %2} |

%4 = stream.binding.subspan %arg4[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2} |

%5 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [%0, %2], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %2} -> tensor<?x?xf32> |

// The outer tile count is recomputed locally as #map(%0); the store below uses workload ordinal %1 for the same dimension. |
%6 = affine.apply #map()[%0] |

%7 = tensor.empty(%6, %2) : tensor<?x?x16x1xf32> |

%pack = tensor.pack %5 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 1] into %7 : tensor<?x?xf32> -> tensor<?x?x16x1xf32> |

flow.dispatch.tensor.store %pack, %4, offsets = [0, 0, 0, 0], sizes = [%1, %2, 16, 1], strides = [1, 1, 1, 1] : tensor<?x?x16x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2} |

return |

} |

} |

} |
|
// Dispatch 1 (stream form): packs a ?x? f32 tensor into ?x?x16x1 tiles with swapped outer dims (outer_dims_perm = [1, 0], inner_dims_pos = [1, 0]), padding with 0.0. |
stream.executable private @matmul_dynamic_dispatch_1 { |

// Export: three index workload values in, (x, y, z) workgroup counts out. |
stream.executable.export public @matmul_dynamic_dispatch_1_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) { |

%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2 |

stream.return %x, %y, %z : index, index, index |

} |

builtin.module { |

func.func @matmul_dynamic_dispatch_1_pack_f32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: !stream.binding) { |

// Offset used for both binding subspans. |
%c0 = arith.constant 0 : index |

// Padding value handed to tensor.pack below. |
%cst = arith.constant 0.000000e+00 : f32 |

%0 = flow.dispatch.workload.ordinal %arg1, 0 : index |

%1 = flow.dispatch.workload.ordinal %arg2, 1 : index |

%2 = flow.dispatch.workload.ordinal %arg3, 2 : index |

// Note the source shape is {%2, %0}: ordinal 2 is source dim 0 and ordinal 0 is source dim 1. |
%3 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %0} |

%4 = stream.binding.subspan %arg4[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2} |

%5 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [%2, %0], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %0} -> tensor<?x?xf32> |

// Outer tile count along source dim 1, recomputed locally as #map(%0). |
%6 = affine.apply #map()[%0] |

%7 = tensor.empty(%6, %2) : tensor<?x?x16x1xf32> |

// inner_dims_pos = [1, 0] places the 16-wide tile on source dim 1; outer_dims_perm = [1, 0] swaps the outer dims. |
%pack = tensor.pack %5 padding_value(%cst : f32) outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [16, 1] into %7 : tensor<?x?xf32> -> tensor<?x?x16x1xf32> |

flow.dispatch.tensor.store %pack, %4, offsets = [0, 0, 0, 0], sizes = [%1, %2, 16, 1], strides = [1, 1, 1, 1] : tensor<?x?x16x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2} |

return |

} |

} |

} |
|
// Dispatch 2 (stream form): packs a ?x? f32 tensor into ?x?x16x16 tiles, padding with 0.0. |
stream.executable private @matmul_dynamic_dispatch_2 { |

// Export: four index workload values in, (x, y, z) workgroup counts out. |
stream.executable.export public @matmul_dynamic_dispatch_2_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) { |

%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3 |

stream.return %x, %y, %z : index, index, index |

} |

builtin.module { |

func.func @matmul_dynamic_dispatch_2_pack_f32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: !stream.binding) { |

// Offset used for both binding subspans. |
%c0 = arith.constant 0 : index |

// Padding value handed to tensor.pack below. |
%cst = arith.constant 0.000000e+00 : f32 |

// Ordinals 0-1 give the source shape; ordinals 2-3 give the outer dims of the packed destination. |
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index |

%1 = flow.dispatch.workload.ordinal %arg2, 1 : index |

%2 = flow.dispatch.workload.ordinal %arg3, 2 : index |

%3 = flow.dispatch.workload.ordinal %arg4, 3 : index |

%4 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1} |

%5 = stream.binding.subspan %arg5[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%2, %3} |

%6 = flow.dispatch.tensor.load %4, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1} -> tensor<?x?xf32> |

// Outer tile counts recomputed locally with #map (s0 ceildiv 16 in this dump's header). |
%7 = affine.apply #map()[%1] |

%8 = affine.apply #map()[%0] |

%9 = tensor.empty(%8, %7) : tensor<?x?x16x16xf32> |

%pack = tensor.pack %6 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %9 : tensor<?x?xf32> -> tensor<?x?x16x16xf32> |

flow.dispatch.tensor.store %pack, %5, offsets = [0, 0, 0, 0], sizes = [%2, %3, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%2, %3} |

return |

} |

} |

} |
|
// Dispatch 3 (stream form): linalg.mmt4d over two packed ?x?x16x1 f32 operands, accumulating into a readwrite ?x?x16x16 f32 binding. |
stream.executable private @matmul_dynamic_dispatch_3 { |

// Export: six index workload values in, (x, y, z) workgroup counts out. |
stream.executable.export public @matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index) -> (index, index, index) { |

%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3, %arg4, %arg5 |

stream.return %x, %y, %z : index, index, index |

} |

builtin.module { |

func.func @matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index) { |

// Offset used for all three binding subspans. |
%c0 = arith.constant 0 : index |

// Ordinals 0-1 size %arg0, ordinals 2-3 size %arg1, ordinals 4-5 size %arg2 (see the subspans below). |
%0 = flow.dispatch.workload.ordinal %arg3, 0 : index |

%1 = flow.dispatch.workload.ordinal %arg4, 1 : index |

%2 = flow.dispatch.workload.ordinal %arg5, 2 : index |

%3 = flow.dispatch.workload.ordinal %arg6, 3 : index |

%4 = flow.dispatch.workload.ordinal %arg7, 4 : index |

%5 = flow.dispatch.workload.ordinal %arg8, 5 : index |

%6 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%0, %1} |

%7 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%2, %3} |

%8 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5} |

%9 = flow.dispatch.tensor.load %6, offsets = [0, 0, 0, 0], sizes = [%0, %1, 16, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%0, %1} -> tensor<?x?x16x1xf32> |

%10 = flow.dispatch.tensor.load %7, offsets = [0, 0, 0, 0], sizes = [%2, %3, 16, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%2, %3} -> tensor<?x?x16x1xf32> |

// The accumulator is loaded from the readwrite tensor %8 and the result is stored back to the same %8. |
%11 = flow.dispatch.tensor.load %8, offsets = [0, 0, 0, 0], sizes = [%4, %5, 16, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5} -> tensor<?x?x16x16xf32> |

%12 = linalg.mmt4d ins(%9, %10 : tensor<?x?x16x1xf32>, tensor<?x?x16x1xf32>) outs(%11 : tensor<?x?x16x16xf32>) -> tensor<?x?x16x16xf32> |

flow.dispatch.tensor.store %12, %8, offsets = [0, 0, 0, 0], sizes = [%4, %5, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5} |

return |

} |

} |

} |
|
// Dispatch 4 (stream form): unpacks a ?x?x16x16 f32 tensor (16x16 inner tiles) back into a plain ?x? f32 tensor. |
stream.executable private @matmul_dynamic_dispatch_4 { |

// Export: four index workload values in, (x, y, z) workgroup counts out. |
stream.executable.export public @matmul_dynamic_dispatch_4_unpack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) { |

%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3 |

stream.return %x, %y, %z : index, index, index |

} |

builtin.module { |

func.func @matmul_dynamic_dispatch_4_unpack_f32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: !stream.binding) { |

// Offset used for both binding subspans. |
%c0 = arith.constant 0 : index |

// Ordinals 0-1 are the outer dims of the packed source; ordinals 2-3 are the dims of the unpacked destination. |
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index |

%1 = flow.dispatch.workload.ordinal %arg2, 1 : index |

%2 = flow.dispatch.workload.ordinal %arg3, 2 : index |

%3 = flow.dispatch.workload.ordinal %arg4, 3 : index |

%4 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?x16x16xf32>>{%0, %1} |

%5 = stream.binding.subspan %arg5[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%2, %3} |

%6 = flow.dispatch.tensor.load %4, offsets = [0, 0, 0, 0], sizes = [%0, %1, 16, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x16xf32>>{%0, %1} -> tensor<?x?x16x16xf32> |

// Destination for the unpack, sized directly from workload ordinals 2-3. |
%7 = tensor.empty(%2, %3) : tensor<?x?xf32> |

%unpack = tensor.unpack %6 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %7 : tensor<?x?x16x16xf32> -> tensor<?x?xf32> |

flow.dispatch.tensor.store %unpack, %5, offsets = [0, 0], sizes = [%2, %3], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%2, %3} |

return |

} |

} |

} |
|
// Host entry point after conversion to the stream dialect. Each input buffer view is asserted (shape, f32 element type, dense_row_major encoding), sized with stream.tensor.sizeof, imported as an external resource and transferred to a !stream.resource<*>. Five async dispatches (pack, pack, pack, mmt4d, unpack) follow; the result is transferred back to an external resource and exported. |
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} { |

// --- input0: dims %0 x %1, byte size %2 --- |
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index |

%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index |

%element_type_f32 = hal.element_type<f32> : i32 |

%dense_row_major = hal.encoding_type<dense_row_major> : i32 |

hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1]) type(%element_type_f32) encoding(%dense_row_major) |

%2 = stream.tensor.sizeof tensor<?x?xf32>{%0, %1} : index |

%3 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<?x?xf32>{%0, %1} in !stream.resource<external>{%2} |

%4 = stream.async.transfer %3 : !stream.resource<external>{%2} -> !stream.resource<*>{%2} |

// --- input1: dims %5 x %6, byte size %7 --- |
%5 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index |

%6 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index |

%element_type_f32_0 = hal.element_type<f32> : i32 |

%dense_row_major_1 = hal.encoding_type<dense_row_major> : i32 |

hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%5, %6]) type(%element_type_f32_0) encoding(%dense_row_major_1) |

%7 = stream.tensor.sizeof tensor<?x?xf32>{%5, %6} : index |

%8 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<?x?xf32>{%5, %6} in !stream.resource<external>{%7} |

%9 = stream.async.transfer %8 : !stream.resource<external>{%7} -> !stream.resource<*>{%7} |

// --- input2: dims %10 x %11, byte size %12 --- |
%10 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index |

%11 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index |

%element_type_f32_2 = hal.element_type<f32> : i32 |

%dense_row_major_3 = hal.encoding_type<dense_row_major> : i32 |

hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("input2") shape([%10, %11]) type(%element_type_f32_2) encoding(%dense_row_major_3) |

%12 = stream.tensor.sizeof tensor<?x?xf32>{%10, %11} : index |

%13 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<?x?xf32>{%10, %11} in !stream.resource<external>{%12} |

%14 = stream.async.transfer %13 : !stream.resource<external>{%12} -> !stream.resource<*>{%12} |

// --- dispatch 0: pack input0; result size %16 is that of a ?x?x16x1 tensor with outer dims {%15, %1} --- |
%15 = affine.apply #map()[%0] |

%c0 = arith.constant 0 : index |

%16 = stream.tensor.sizeof tensor<?x?x16x1xf32>{%15, %1} : index |

%17 = stream.async.dispatch @matmul_dynamic_dispatch_0::@matmul_dynamic_dispatch_0_pack_f32[%0, %15, %1](%4[%c0 to %2 for %2], %0, %15, %1) : (!stream.resource<*>{%2}, index, index, index) -> !stream.resource<*>{%16} |

// --- dispatch 1: pack input1; result size %19 is that of a ?x?x16x1 tensor with outer dims {%18, %5} --- |
%18 = affine.apply #map()[%6] |

%c0_4 = arith.constant 0 : index |

%19 = stream.tensor.sizeof tensor<?x?x16x1xf32>{%18, %5} : index |

%20 = stream.async.dispatch @matmul_dynamic_dispatch_1::@matmul_dynamic_dispatch_1_pack_f32[%6, %18, %5](%9[%c0_4 to %7 for %7], %6, %18, %5) : (!stream.resource<*>{%7}, index, index, index) -> !stream.resource<*>{%19} |

// --- dispatch 2: pack input2; result size %23 is that of a ?x?x16x16 tensor with outer dims {%21, %22} --- |
%21 = affine.apply #map()[%10] |

%22 = affine.apply #map()[%11] |

%c0_5 = arith.constant 0 : index |

%23 = stream.tensor.sizeof tensor<?x?x16x16xf32>{%21, %22} : index |

%24 = stream.async.dispatch @matmul_dynamic_dispatch_2::@matmul_dynamic_dispatch_2_pack_f32[%10, %11, %21, %22](%14[%c0_5 to %12 for %12], %10, %11, %21, %22) : (!stream.resource<*>{%12}, index, index, index, index) -> !stream.resource<*>{%23} |

%c0_6 = arith.constant 0 : index |

// --- dispatch 3: mmt4d; the result is written as "-> %24{%23}", i.e. tied to resource operand %24 rather than a new resource --- |
%25 = stream.async.dispatch @matmul_dynamic_dispatch_3::@matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32[%15, %1, %18, %5, %21, %22](%17[%c0_6 to %16 for %16], %20[%c0_6 to %19 for %19], %24[%c0_6 to %23 for %23], %15, %1, %18, %5, %21, %22) : (!stream.resource<*>{%16}, !stream.resource<*>{%19}, !stream.resource<*>{%23}, index, index, index, index, index, index) -> %24{%23} |

%c0_7 = arith.constant 0 : index |

// --- dispatch 4: unpack to a ?x? tensor of input2's dims {%10, %11}, byte size %26 --- |
%26 = stream.tensor.sizeof tensor<?x?xf32>{%10, %11} : index |

%27 = stream.async.dispatch @matmul_dynamic_dispatch_4::@matmul_dynamic_dispatch_4_unpack_f32[%21, %22, %10, %11](%25[%c0_7 to %23 for %23], %21, %22, %10, %11) : (!stream.resource<*>{%23}, index, index, index, index) -> !stream.resource<*>{%26} |

// Transfer the result to an external resource and export it as the returned buffer view. |
%28 = stream.async.transfer %27 : !stream.resource<*>{%26} -> !stream.resource<external>{%26} |

%29 = stream.tensor.export %28 : tensor<?x?xf32>{%10, %11} in !stream.resource<external>{%26} -> !hal.buffer_view |

return %29 : !hal.buffer_view |

} |
|
} |
|
|
|
|
|
// -----// IR Dump After VerifyLoweringToTensorsPass (iree-stream-verify-lowering-to-tensors) //----- // |
|
// Attribute aliases referenced by the module that follows. |
// Executable target: "llvm-cpu" backend, "embedded-elf-x86_64" format, cpu "znver4", native_vector_size 64, ukernels "all". |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "znver4", cpu_features = "+mmx,+popcnt,+sse,+sse2,+sse3,+ssse3,+sse4.1,+sse4.2,+avx,+avx2,+sse4a,+fma,+avx512f,+bmi,+bmi2,+aes,+pclmul,+avx512vl,+avx512bw,+avx512dq,+avx512cd,+avx512vbmi,+avx512ifma,+avx512vpopcntdq,+avx512vbmi2,+gfni,+vpclmulqdq,+avx512vnni,+avx512bitalg,+avx512bf16,+adx,+clflushopt,+clwb,+clzero,+cx16,+cx8,+f16c,+fsgsbase,+crc32,+invpcid,+rdpru,+sahf,+lzcnt,+movbe,+mwaitx,+x87,+pku,+prfchw,+rdpid,+rdrnd,+rdseed,+sha,+shstk,+vaes,+wbnoinvd,+xsave,+xsavec,+xsaveopt,+xsaves,+fxsr,+evex512", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "all"}> |

// Zero-dim, one-symbol affine map computing s0 ceildiv 16. |
#map = affine_map<()[s0] -> (s0 ceildiv 16)> |

// Device target whose executable_targets list holds the single executable target defined above. |
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#executable_target_embedded_elf_x86_64_]}> |
|
module attributes {hal.device.targets = [#device_target_llvm_cpu]} { |
|
// Dispatch 0 (stream form): packs a ?x? f32 tensor into ?x?x16x1 tiles, padding with 0.0; operands arrive as !stream.binding values. |
stream.executable private @matmul_dynamic_dispatch_0 { |

// Export: three index workload values in, (x, y, z) workgroup counts out. |
stream.executable.export public @matmul_dynamic_dispatch_0_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) { |

%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2 |

stream.return %x, %y, %z : index, index, index |

} |

builtin.module { |

func.func @matmul_dynamic_dispatch_0_pack_f32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: !stream.binding) { |

// Offset used for both binding subspans. |
%c0 = arith.constant 0 : index |

// Padding value handed to tensor.pack below. |
%cst = arith.constant 0.000000e+00 : f32 |

%0 = flow.dispatch.workload.ordinal %arg1, 0 : index |

%1 = flow.dispatch.workload.ordinal %arg2, 1 : index |

%2 = flow.dispatch.workload.ordinal %arg3, 2 : index |

// Source binding viewed as ?x? with shape {%0, %2}; destination binding viewed as ?x?x16x1 with outer dims {%1, %2}. |
%3 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %2} |

%4 = stream.binding.subspan %arg4[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2} |

%5 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [%0, %2], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %2} -> tensor<?x?xf32> |

// The outer tile count is recomputed locally as #map(%0); the store below uses workload ordinal %1 for the same dimension. |
%6 = affine.apply #map()[%0] |

%7 = tensor.empty(%6, %2) : tensor<?x?x16x1xf32> |

%pack = tensor.pack %5 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 1] into %7 : tensor<?x?xf32> -> tensor<?x?x16x1xf32> |

flow.dispatch.tensor.store %pack, %4, offsets = [0, 0, 0, 0], sizes = [%1, %2, 16, 1], strides = [1, 1, 1, 1] : tensor<?x?x16x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2} |

return |

} |

} |

} |
|
// Dispatch 1 (stream form): packs a ?x? f32 tensor into ?x?x16x1 tiles with swapped outer dims (outer_dims_perm = [1, 0], inner_dims_pos = [1, 0]), padding with 0.0. |
stream.executable private @matmul_dynamic_dispatch_1 { |

// Export: three index workload values in, (x, y, z) workgroup counts out. |
stream.executable.export public @matmul_dynamic_dispatch_1_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) { |

%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2 |

stream.return %x, %y, %z : index, index, index |

} |

builtin.module { |

func.func @matmul_dynamic_dispatch_1_pack_f32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: !stream.binding) { |

// Offset used for both binding subspans. |
%c0 = arith.constant 0 : index |

// Padding value handed to tensor.pack below. |
%cst = arith.constant 0.000000e+00 : f32 |

%0 = flow.dispatch.workload.ordinal %arg1, 0 : index |

%1 = flow.dispatch.workload.ordinal %arg2, 1 : index |

%2 = flow.dispatch.workload.ordinal %arg3, 2 : index |

// Note the source shape is {%2, %0}: ordinal 2 is source dim 0 and ordinal 0 is source dim 1. |
%3 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %0} |

%4 = stream.binding.subspan %arg4[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2} |

%5 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [%2, %0], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %0} -> tensor<?x?xf32> |

// Outer tile count along source dim 1, recomputed locally as #map(%0). |
%6 = affine.apply #map()[%0] |

%7 = tensor.empty(%6, %2) : tensor<?x?x16x1xf32> |

// inner_dims_pos = [1, 0] places the 16-wide tile on source dim 1; outer_dims_perm = [1, 0] swaps the outer dims. |
%pack = tensor.pack %5 padding_value(%cst : f32) outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [16, 1] into %7 : tensor<?x?xf32> -> tensor<?x?x16x1xf32> |

flow.dispatch.tensor.store %pack, %4, offsets = [0, 0, 0, 0], sizes = [%1, %2, 16, 1], strides = [1, 1, 1, 1] : tensor<?x?x16x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2} |

return |

} |

} |

} |
|
// Dispatch 2 (stream form): packs a ?x? f32 tensor into ?x?x16x16 tiles, padding with 0.0. |
stream.executable private @matmul_dynamic_dispatch_2 { |

// Export: four index workload values in, (x, y, z) workgroup counts out. |
stream.executable.export public @matmul_dynamic_dispatch_2_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) { |

%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3 |

stream.return %x, %y, %z : index, index, index |

} |

builtin.module { |

func.func @matmul_dynamic_dispatch_2_pack_f32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: !stream.binding) { |

// Offset used for both binding subspans. |
%c0 = arith.constant 0 : index |

// Padding value handed to tensor.pack below. |
%cst = arith.constant 0.000000e+00 : f32 |

// Ordinals 0-1 give the source shape; ordinals 2-3 give the outer dims of the packed destination. |
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index |

%1 = flow.dispatch.workload.ordinal %arg2, 1 : index |

%2 = flow.dispatch.workload.ordinal %arg3, 2 : index |

%3 = flow.dispatch.workload.ordinal %arg4, 3 : index |

%4 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1} |

%5 = stream.binding.subspan %arg5[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%2, %3} |

%6 = flow.dispatch.tensor.load %4, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1} -> tensor<?x?xf32> |

// Outer tile counts recomputed locally with #map (s0 ceildiv 16 in this dump's header). |
%7 = affine.apply #map()[%1] |

%8 = affine.apply #map()[%0] |

%9 = tensor.empty(%8, %7) : tensor<?x?x16x16xf32> |

%pack = tensor.pack %6 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %9 : tensor<?x?xf32> -> tensor<?x?x16x16xf32> |

flow.dispatch.tensor.store %pack, %5, offsets = [0, 0, 0, 0], sizes = [%2, %3, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%2, %3} |

return |

} |

} |

} |
|
// Dispatch 3 (stream form): linalg.mmt4d over two packed ?x?x16x1 f32 operands, accumulating into a readwrite ?x?x16x16 f32 binding. |
stream.executable private @matmul_dynamic_dispatch_3 { |

// Export: six index workload values in, (x, y, z) workgroup counts out. |
stream.executable.export public @matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index) -> (index, index, index) { |

%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3, %arg4, %arg5 |

stream.return %x, %y, %z : index, index, index |

} |

builtin.module { |

func.func @matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index) { |

// Offset used for all three binding subspans. |
%c0 = arith.constant 0 : index |

// Ordinals 0-1 size %arg0, ordinals 2-3 size %arg1, ordinals 4-5 size %arg2 (see the subspans below). |
%0 = flow.dispatch.workload.ordinal %arg3, 0 : index |

%1 = flow.dispatch.workload.ordinal %arg4, 1 : index |

%2 = flow.dispatch.workload.ordinal %arg5, 2 : index |

%3 = flow.dispatch.workload.ordinal %arg6, 3 : index |

%4 = flow.dispatch.workload.ordinal %arg7, 4 : index |

%5 = flow.dispatch.workload.ordinal %arg8, 5 : index |

%6 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%0, %1} |

%7 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%2, %3} |

%8 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5} |

%9 = flow.dispatch.tensor.load %6, offsets = [0, 0, 0, 0], sizes = [%0, %1, 16, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%0, %1} -> tensor<?x?x16x1xf32> |

%10 = flow.dispatch.tensor.load %7, offsets = [0, 0, 0, 0], sizes = [%2, %3, 16, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%2, %3} -> tensor<?x?x16x1xf32> |

// The accumulator is loaded from the readwrite tensor %8 and the result is stored back to the same %8. |
%11 = flow.dispatch.tensor.load %8, offsets = [0, 0, 0, 0], sizes = [%4, %5, 16, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5} -> tensor<?x?x16x16xf32> |

%12 = linalg.mmt4d ins(%9, %10 : tensor<?x?x16x1xf32>, tensor<?x?x16x1xf32>) outs(%11 : tensor<?x?x16x16xf32>) -> tensor<?x?x16x16xf32> |

flow.dispatch.tensor.store %12, %8, offsets = [0, 0, 0, 0], sizes = [%4, %5, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5} |

return |

} |

} |

} |
|
// Dispatch 4 (stream form): unpacks a ?x?x16x16 f32 tensor (16x16 inner tiles) back into a plain ?x? f32 tensor. |
stream.executable private @matmul_dynamic_dispatch_4 { |

// Export: four index workload values in, (x, y, z) workgroup counts out. |
stream.executable.export public @matmul_dynamic_dispatch_4_unpack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) { |

%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3 |

stream.return %x, %y, %z : index, index, index |

} |

builtin.module { |

func.func @matmul_dynamic_dispatch_4_unpack_f32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: !stream.binding) { |

// Offset used for both binding subspans. |
%c0 = arith.constant 0 : index |

// Ordinals 0-1 are the outer dims of the packed source; ordinals 2-3 are the dims of the unpacked destination. |
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index |

%1 = flow.dispatch.workload.ordinal %arg2, 1 : index |

%2 = flow.dispatch.workload.ordinal %arg3, 2 : index |

%3 = flow.dispatch.workload.ordinal %arg4, 3 : index |

%4 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?x16x16xf32>>{%0, %1} |

%5 = stream.binding.subspan %arg5[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%2, %3} |

%6 = flow.dispatch.tensor.load %4, offsets = [0, 0, 0, 0], sizes = [%0, %1, 16, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x16xf32>>{%0, %1} -> tensor<?x?x16x16xf32> |

// Destination for the unpack, sized directly from workload ordinals 2-3. |
%7 = tensor.empty(%2, %3) : tensor<?x?xf32> |

%unpack = tensor.unpack %6 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %7 : tensor<?x?x16x16xf32> -> tensor<?x?xf32> |

flow.dispatch.tensor.store %unpack, %5, offsets = [0, 0], sizes = [%2, %3], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%2, %3} |

return |

} |

} |

} |
|
// Host entry point in stream-dialect form. Each input buffer view is asserted (shape, f32 element type, dense_row_major encoding), sized with stream.tensor.sizeof, imported as an external resource and transferred to a !stream.resource<*>. Five async dispatches (pack, pack, pack, mmt4d, unpack) follow; the result is transferred back to an external resource and exported. |
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} { |

// --- input0: dims %0 x %1, byte size %2 --- |
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index |

%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index |

%element_type_f32 = hal.element_type<f32> : i32 |

%dense_row_major = hal.encoding_type<dense_row_major> : i32 |

hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1]) type(%element_type_f32) encoding(%dense_row_major) |

%2 = stream.tensor.sizeof tensor<?x?xf32>{%0, %1} : index |

%3 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<?x?xf32>{%0, %1} in !stream.resource<external>{%2} |

%4 = stream.async.transfer %3 : !stream.resource<external>{%2} -> !stream.resource<*>{%2} |

// --- input1: dims %5 x %6, byte size %7 --- |
%5 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index |

%6 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index |

%element_type_f32_0 = hal.element_type<f32> : i32 |

%dense_row_major_1 = hal.encoding_type<dense_row_major> : i32 |

hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%5, %6]) type(%element_type_f32_0) encoding(%dense_row_major_1) |

%7 = stream.tensor.sizeof tensor<?x?xf32>{%5, %6} : index |

%8 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<?x?xf32>{%5, %6} in !stream.resource<external>{%7} |

%9 = stream.async.transfer %8 : !stream.resource<external>{%7} -> !stream.resource<*>{%7} |

// --- input2: dims %10 x %11, byte size %12 --- |
%10 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index |

%11 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index |

%element_type_f32_2 = hal.element_type<f32> : i32 |

%dense_row_major_3 = hal.encoding_type<dense_row_major> : i32 |

hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("input2") shape([%10, %11]) type(%element_type_f32_2) encoding(%dense_row_major_3) |

%12 = stream.tensor.sizeof tensor<?x?xf32>{%10, %11} : index |

%13 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<?x?xf32>{%10, %11} in !stream.resource<external>{%12} |

%14 = stream.async.transfer %13 : !stream.resource<external>{%12} -> !stream.resource<*>{%12} |

// --- dispatch 0: pack input0; result size %16 is that of a ?x?x16x1 tensor with outer dims {%15, %1} --- |
%15 = affine.apply #map()[%0] |

%c0 = arith.constant 0 : index |

%16 = stream.tensor.sizeof tensor<?x?x16x1xf32>{%15, %1} : index |

%17 = stream.async.dispatch @matmul_dynamic_dispatch_0::@matmul_dynamic_dispatch_0_pack_f32[%0, %15, %1](%4[%c0 to %2 for %2], %0, %15, %1) : (!stream.resource<*>{%2}, index, index, index) -> !stream.resource<*>{%16} |

// --- dispatch 1: pack input1; result size %19 is that of a ?x?x16x1 tensor with outer dims {%18, %5} --- |
%18 = affine.apply #map()[%6] |

%c0_4 = arith.constant 0 : index |

%19 = stream.tensor.sizeof tensor<?x?x16x1xf32>{%18, %5} : index |

%20 = stream.async.dispatch @matmul_dynamic_dispatch_1::@matmul_dynamic_dispatch_1_pack_f32[%6, %18, %5](%9[%c0_4 to %7 for %7], %6, %18, %5) : (!stream.resource<*>{%7}, index, index, index) -> !stream.resource<*>{%19} |

// --- dispatch 2: pack input2; result size %23 is that of a ?x?x16x16 tensor with outer dims {%21, %22} --- |
%21 = affine.apply #map()[%10] |

%22 = affine.apply #map()[%11] |

%c0_5 = arith.constant 0 : index |

%23 = stream.tensor.sizeof tensor<?x?x16x16xf32>{%21, %22} : index |

%24 = stream.async.dispatch @matmul_dynamic_dispatch_2::@matmul_dynamic_dispatch_2_pack_f32[%10, %11, %21, %22](%14[%c0_5 to %12 for %12], %10, %11, %21, %22) : (!stream.resource<*>{%12}, index, index, index, index) -> !stream.resource<*>{%23} |

%c0_6 = arith.constant 0 : index |

// --- dispatch 3: mmt4d; the result is written as "-> %24{%23}", i.e. tied to resource operand %24 rather than a new resource --- |
%25 = stream.async.dispatch @matmul_dynamic_dispatch_3::@matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32[%15, %1, %18, %5, %21, %22](%17[%c0_6 to %16 for %16], %20[%c0_6 to %19 for %19], %24[%c0_6 to %23 for %23], %15, %1, %18, %5, %21, %22) : (!stream.resource<*>{%16}, !stream.resource<*>{%19}, !stream.resource<*>{%23}, index, index, index, index, index, index) -> %24{%23} |

%c0_7 = arith.constant 0 : index |

// --- dispatch 4: unpack to a ?x? tensor of input2's dims {%10, %11}, byte size %26 --- |
%26 = stream.tensor.sizeof tensor<?x?xf32>{%10, %11} : index |

%27 = stream.async.dispatch @matmul_dynamic_dispatch_4::@matmul_dynamic_dispatch_4_unpack_f32[%21, %22, %10, %11](%25[%c0_7 to %23 for %23], %21, %22, %10, %11) : (!stream.resource<*>{%23}, index, index, index, index) -> !stream.resource<*>{%26} |

// Transfer the result to an external resource and export it as the returned buffer view. |
%28 = stream.async.transfer %27 : !stream.resource<*>{%26} -> !stream.resource<external>{%26} |

%29 = stream.tensor.export %28 : tensor<?x?xf32>{%10, %11} in !stream.resource<external>{%26} -> !hal.buffer_view |

return %29 : !hal.buffer_view |

} |
|
} |
|
|
|
|
|
// -----// IR Dump After Canonicalizer (canonicalize) //----- // |
|
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} { |
|
%c0 = arith.constant 0 : index |
|
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index |
|
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index |
|
%element_type_f32 = hal.element_type<f32> : i32 |
|
%dense_row_major = hal.encoding_type<dense_row_major> : i32 |
|
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1]) type(%element_type_f32) encoding(%dense_row_major) |
|
%2 = stream.tensor.sizeof tensor<?x?xf32>{%0, %1} : index |
|
%3 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<?x?xf32>{%0, %1} in !stream.resource<external>{%2} |
|
%4 = stream.async.transfer %3 : !stream.resource<external>{%2} -> !stream.resource<*>{%2} |
|
%5 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index |
|
%6 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index |
|
%element_type_f32_0 = hal.element_type<f32> : i32 |
|
%dense_row_major_1 = hal.encoding_type<dense_row_major> : i32 |
|
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%5, %6]) type(%element_type_f32_0) encoding(%dense_row_major_1) |
|
%7 = stream.tensor.sizeof tensor<?x?xf32>{%5, %6} : index |
|
%8 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<?x?xf32>{%5, %6} in !stream.resource<external>{%7} |
|
%9 = stream.async.transfer %8 : !stream.resource<external>{%7} -> !stream.resource<*>{%7} |
|
%10 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index |
|
%11 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index |
|
%element_type_f32_2 = hal.element_type<f32> : i32 |
|
%dense_row_major_3 = hal.encoding_type<dense_row_major> : i32 |
|
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("input2") shape([%10, %11]) type(%element_type_f32_2) encoding(%dense_row_major_3) |
|
%12 = stream.tensor.sizeof tensor<?x?xf32>{%10, %11} : index |
|
%13 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<?x?xf32>{%10, %11} in !stream.resource<external>{%12} |
|
%14 = stream.async.transfer %13 : !stream.resource<external>{%12} -> !stream.resource<*>{%12} |
|
%15 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%0] |
|
%16 = stream.tensor.sizeof tensor<?x?x16x1xf32>{%15, %1} : index |
|
%17 = stream.async.dispatch @matmul_dynamic_dispatch_0::@matmul_dynamic_dispatch_0_pack_f32[%0, %15, %1](%4[%c0 to %2 for %2], %0, %15, %1) : (!stream.resource<*>{%2}, index, index, index) -> !stream.resource<*>{%16} |
|
%18 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%6] |
|
%19 = stream.tensor.sizeof tensor<?x?x16x1xf32>{%18, %5} : index |
|
%20 = stream.async.dispatch @matmul_dynamic_dispatch_1::@matmul_dynamic_dispatch_1_pack_f32[%6, %18, %5](%9[%c0 to %7 for %7], %6, %18, %5) : (!stream.resource<*>{%7}, index, index, index) -> !stream.resource<*>{%19} |
|
%21 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%10] |
|
%22 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%11] |
|
%23 = stream.tensor.sizeof tensor<?x?x16x16xf32>{%21, %22} : index |
|
%24 = stream.async.dispatch @matmul_dynamic_dispatch_2::@matmul_dynamic_dispatch_2_pack_f32[%10, %11, %21, %22](%14[%c0 to %12 for %12], %10, %11, %21, %22) : (!stream.resource<*>{%12}, index, index, index, index) -> !stream.resource<*>{%23} |
|
%25 = stream.async.dispatch @matmul_dynamic_dispatch_3::@matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32[%15, %1, %18, %5, %21, %22](%17[%c0 to %16 for %16], %20[%c0 to %19 for %19], %24[%c0 to %23 for %23], %15, %1, %18, %5, %21, %22) : (!stream.resource<*>{%16}, !stream.resource<*>{%19}, !stream.resource<*>{%23}, index, index, index, index, index, index) -> %24{%23} |
|
%26 = stream.tensor.sizeof tensor<?x?xf32>{%10, %11} : index |
|
%27 = stream.async.dispatch @matmul_dynamic_dispatch_4::@matmul_dynamic_dispatch_4_unpack_f32[%21, %22, %10, %11](%25[%c0 to %23 for %23], %21, %22, %10, %11) : (!stream.resource<*>{%23}, index, index, index, index) -> !stream.resource<*>{%26} |
|
%28 = stream.async.transfer %27 : !stream.resource<*>{%26} -> !stream.resource<external>{%26} |
|
%29 = stream.tensor.export %28 : tensor<?x?xf32>{%10, %11} in !stream.resource<external>{%26} -> !hal.buffer_view |
|
return %29 : !hal.buffer_view |
|
} |
|
|
|
// -----// IR Dump After CSE (cse) //----- // |
|
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} { |
|
%c0 = arith.constant 0 : index |
|
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index |
|
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index |
|
%element_type_f32 = hal.element_type<f32> : i32 |
|
%dense_row_major = hal.encoding_type<dense_row_major> : i32 |
|
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1]) type(%element_type_f32) encoding(%dense_row_major) |
|
%2 = stream.tensor.sizeof tensor<?x?xf32>{%0, %1} : index |
|
%3 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<?x?xf32>{%0, %1} in !stream.resource<external>{%2} |
|
%4 = stream.async.transfer %3 : !stream.resource<external>{%2} -> !stream.resource<*>{%2} |
|
%5 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index |
|
%6 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index |
|
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%5, %6]) type(%element_type_f32) encoding(%dense_row_major) |
|
%7 = stream.tensor.sizeof tensor<?x?xf32>{%5, %6} : index |
|
%8 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<?x?xf32>{%5, %6} in !stream.resource<external>{%7} |
|
%9 = stream.async.transfer %8 : !stream.resource<external>{%7} -> !stream.resource<*>{%7} |
|
%10 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index |
|
%11 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index |
|
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("input2") shape([%10, %11]) type(%element_type_f32) encoding(%dense_row_major) |
|
%12 = stream.tensor.sizeof tensor<?x?xf32>{%10, %11} : index |
|
%13 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<?x?xf32>{%10, %11} in !stream.resource<external>{%12} |
|
%14 = stream.async.transfer %13 : !stream.resource<external>{%12} -> !stream.resource<*>{%12} |
|
%15 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%0] |
|
%16 = stream.tensor.sizeof tensor<?x?x16x1xf32>{%15, %1} : index |
|
%17 = stream.async.dispatch @matmul_dynamic_dispatch_0::@matmul_dynamic_dispatch_0_pack_f32[%0, %15, %1](%4[%c0 to %2 for %2], %0, %15, %1) : (!stream.resource<*>{%2}, index, index, index) -> !stream.resource<*>{%16} |
|
%18 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%6] |
|
%19 = stream.tensor.sizeof tensor<?x?x16x1xf32>{%18, %5} : index |
|
%20 = stream.async.dispatch @matmul_dynamic_dispatch_1::@matmul_dynamic_dispatch_1_pack_f32[%6, %18, %5](%9[%c0 to %7 for %7], %6, %18, %5) : (!stream.resource<*>{%7}, index, index, index) -> !stream.resource<*>{%19} |
|
%21 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%10] |
|
%22 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%11] |
|
%23 = stream.tensor.sizeof tensor<?x?x16x16xf32>{%21, %22} : index |
|
%24 = stream.async.dispatch @matmul_dynamic_dispatch_2::@matmul_dynamic_dispatch_2_pack_f32[%10, %11, %21, %22](%14[%c0 to %12 for %12], %10, %11, %21, %22) : (!stream.resource<*>{%12}, index, index, index, index) -> !stream.resource<*>{%23} |
|
%25 = stream.async.dispatch @matmul_dynamic_dispatch_3::@matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32[%15, %1, %18, %5, %21, %22](%17[%c0 to %16 for %16], %20[%c0 to %19 for %19], %24[%c0 to %23 for %23], %15, %1, %18, %5, %21, %22) : (!stream.resource<*>{%16}, !stream.resource<*>{%19}, !stream.resource<*>{%23}, index, index, index, index, index, index) -> %24{%23} |
|
%26 = stream.async.dispatch @matmul_dynamic_dispatch_4::@matmul_dynamic_dispatch_4_unpack_f32[%21, %22, %10, %11](%25[%c0 to %23 for %23], %21, %22, %10, %11) : (!stream.resource<*>{%23}, index, index, index, index) -> !stream.resource<*>{%12} |
|
%27 = stream.async.transfer %26 : !stream.resource<*>{%12} -> !stream.resource<external>{%12} |
|
%28 = stream.tensor.export %27 : tensor<?x?xf32>{%10, %11} in !stream.resource<external>{%12} -> !hal.buffer_view |
|
return %28 : !hal.buffer_view |
|
} |
|
|
|
// -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- // |
|
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} { |
|
%c0 = arith.constant 0 : index |
|
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index |
|
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index |
|
%element_type_f32 = hal.element_type<f32> : i32 |
|
%dense_row_major = hal.encoding_type<dense_row_major> : i32 |
|
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1]) type(%element_type_f32) encoding(%dense_row_major) |
|
%2 = stream.tensor.sizeof tensor<?x?xf32>{%0, %1} : index |
|
%3 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<?x?xf32>{%0, %1} in !stream.resource<external>{%2} |
|
%4 = stream.async.transfer %3 : !stream.resource<external>{%2} -> !stream.resource<*>{%2} |
|
%5 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index |
|
%6 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index |
|
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%5, %6]) type(%element_type_f32) encoding(%dense_row_major) |
|
%7 = stream.tensor.sizeof tensor<?x?xf32>{%5, %6} : index |
|
%8 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<?x?xf32>{%5, %6} in !stream.resource<external>{%7} |
|
%9 = stream.async.transfer %8 : !stream.resource<external>{%7} -> !stream.resource<*>{%7} |
|
%10 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index |
|
%11 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index |
|
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("input2") shape([%10, %11]) type(%element_type_f32) encoding(%dense_row_major) |
|
%12 = stream.tensor.sizeof tensor<?x?xf32>{%10, %11} : index |
|
%13 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<?x?xf32>{%10, %11} in !stream.resource<external>{%12} |
|
%14 = stream.async.transfer %13 : !stream.resource<external>{%12} -> !stream.resource<*>{%12} |
|
%15 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%0] |
|
%16 = stream.tensor.sizeof tensor<?x?x16x1xf32>{%15, %1} : index |
|
%17 = stream.async.dispatch @matmul_dynamic_dispatch_0::@matmul_dynamic_dispatch_0_pack_f32[%0, %15, %1](%4[%c0 to %2 for %2], %0, %15, %1) : (!stream.resource<*>{%2}, index, index, index) -> !stream.resource<*>{%16} |
|
%18 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%6] |
|
%19 = stream.tensor.sizeof tensor<?x?x16x1xf32>{%18, %5} : index |
|
%20 = stream.async.dispatch @matmul_dynamic_dispatch_1::@matmul_dynamic_dispatch_1_pack_f32[%6, %18, %5](%9[%c0 to %7 for %7], %6, %18, %5) : (!stream.resource<*>{%7}, index, index, index) -> !stream.resource<*>{%19} |
|
%21 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%10] |
|
%22 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%11] |
|
%23 = stream.tensor.sizeof tensor<?x?x16x16xf32>{%21, %22} : index |
|
%24 = stream.async.dispatch @matmul_dynamic_dispatch_2::@matmul_dynamic_dispatch_2_pack_f32[%10, %11, %21, %22](%14[%c0 to %12 for %12], %10, %11, %21, %22) : (!stream.resource<*>{%12}, index, index, index, index) -> !stream.resource<*>{%23} |
|
%25 = stream.async.dispatch @matmul_dynamic_dispatch_3::@matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32[%15, %1, %18, %5, %21, %22](%17[%c0 to %16 for %16], %20[%c0 to %19 for %19], %24[%c0 to %23 for %23], %15, %1, %18, %5, %21, %22) : (!stream.resource<*>{%16}, !stream.resource<*>{%19}, !stream.resource<*>{%23}, index, index, index, index, index, index) -> %24{%23} |
|
%26 = stream.async.dispatch @matmul_dynamic_dispatch_4::@matmul_dynamic_dispatch_4_unpack_f32[%21, %22, %10, %11](%25[%c0 to %23 for %23], %21, %22, %10, %11) : (!stream.resource<*>{%23}, index, index, index, index) -> !stream.resource<*>{%12} |
|
%27 = stream.async.transfer %26 : !stream.resource<*>{%12} -> !stream.resource<external>{%12} |
|
%28 = stream.tensor.export %27 : tensor<?x?xf32>{%10, %11} in !stream.resource<external>{%12} -> !hal.buffer_view |
|
return %28 : !hal.buffer_view |
|
} |
|
|
|
// -----// IR Dump After ApplyPatterns (iree-util-apply-patterns) //----- // |
|
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "znver4", cpu_features = "+mmx,+popcnt,+sse,+sse2,+sse3,+ssse3,+sse4.1,+sse4.2,+avx,+avx2,+sse4a,+fma,+avx512f,+bmi,+bmi2,+aes,+pclmul,+avx512vl,+avx512bw,+avx512dq,+avx512cd,+avx512vbmi,+avx512ifma,+avx512vpopcntdq,+avx512vbmi2,+gfni,+vpclmulqdq,+avx512vnni,+avx512bitalg,+avx512bf16,+adx,+clflushopt,+clwb,+clzero,+cx16,+cx8,+f16c,+fsgsbase,+crc32,+invpcid,+rdpru,+sahf,+lzcnt,+movbe,+mwaitx,+x87,+pku,+prfchw,+rdpid,+rdrnd,+rdseed,+sha,+shstk,+vaes,+wbnoinvd,+xsave,+xsavec,+xsaveopt,+xsaves,+fxsr,+evex512", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "all"}> |
|
#map = affine_map<()[s0] -> (s0 ceildiv 16)> |
|
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#executable_target_embedded_elf_x86_64_]}> |
|
module attributes {hal.device.targets = [#device_target_llvm_cpu]} { |
|
stream.executable private @matmul_dynamic_dispatch_0 { |
|
stream.executable.export public @matmul_dynamic_dispatch_0_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) { |
|
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2 |
|
stream.return %x, %y, %z : index, index, index |
|
} |
|
builtin.module { |
|
func.func @matmul_dynamic_dispatch_0_pack_f32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: !stream.binding) { |
|
%c0 = arith.constant 0 : index |
|
%cst = arith.constant 0.000000e+00 : f32 |
|
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index |
|
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index |
|
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index |
|
%3 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %2} |
|
%4 = stream.binding.subspan %arg4[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2} |
|
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [%0, %2], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %2} -> tensor<?x?xf32> |
|
%6 = affine.apply #map()[%0] |
|
%7 = tensor.empty(%6, %2) : tensor<?x?x16x1xf32> |
|
%pack = tensor.pack %5 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 1] into %7 : tensor<?x?xf32> -> tensor<?x?x16x1xf32> |
|
flow.dispatch.tensor.store %pack, %4, offsets = [0, 0, 0, 0], sizes = [%1, %2, 16, 1], strides = [1, 1, 1, 1] : tensor<?x?x16x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2} |
|
return |
|
} |
|
} |
|
} |
|
stream.executable private @matmul_dynamic_dispatch_1 { |
|
stream.executable.export public @matmul_dynamic_dispatch_1_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) { |
|
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2 |
|
stream.return %x, %y, %z : index, index, index |
|
} |
|
builtin.module { |
|
func.func @matmul_dynamic_dispatch_1_pack_f32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: !stream.binding) { |
|
%c0 = arith.constant 0 : index |
|
%cst = arith.constant 0.000000e+00 : f32 |
|
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index |
|
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index |
|
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index |
|
%3 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %0} |
|
%4 = stream.binding.subspan %arg4[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2} |
|
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [%2, %0], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %0} -> tensor<?x?xf32> |
|
%6 = affine.apply #map()[%0] |
|
%7 = tensor.empty(%6, %2) : tensor<?x?x16x1xf32> |
|
%pack = tensor.pack %5 padding_value(%cst : f32) outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [16, 1] into %7 : tensor<?x?xf32> -> tensor<?x?x16x1xf32> |
|
flow.dispatch.tensor.store %pack, %4, offsets = [0, 0, 0, 0], sizes = [%1, %2, 16, 1], strides = [1, 1, 1, 1] : tensor<?x?x16x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2} |
|
return |
|
} |
|
} |
|
} |
|
stream.executable private @matmul_dynamic_dispatch_2 { |
|
stream.executable.export public @matmul_dynamic_dispatch_2_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) { |
|
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3 |
|
stream.return %x, %y, %z : index, index, index |
|
} |
|
builtin.module { |
|
func.func @matmul_dynamic_dispatch_2_pack_f32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: !stream.binding) { |
|
%c0 = arith.constant 0 : index |
|
%cst = arith.constant 0.000000e+00 : f32 |
|
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index |
|
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index |
|
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index |
|
%3 = flow.dispatch.workload.ordinal %arg4, 3 : index |
|
%4 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1} |
|
%5 = stream.binding.subspan %arg5[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%2, %3} |
|
%6 = flow.dispatch.tensor.load %4, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1} -> tensor<?x?xf32> |
|
%7 = affine.apply #map()[%1] |
|
%8 = affine.apply #map()[%0] |
|
%9 = tensor.empty(%8, %7) : tensor<?x?x16x16xf32> |
|
%pack = tensor.pack %6 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %9 : tensor<?x?xf32> -> tensor<?x?x16x16xf32> |
|
flow.dispatch.tensor.store %pack, %5, offsets = [0, 0, 0, 0], sizes = [%2, %3, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%2, %3} |
|
return |
|
} |
|
} |
|
} |
|
stream.executable private @matmul_dynamic_dispatch_3 { |
|
stream.executable.export public @matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index) -> (index, index, index) { |
|
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3, %arg4, %arg5 |
|
stream.return %x, %y, %z : index, index, index |
|
} |
|
builtin.module { |
|
func.func @matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index) { |
|
%c0 = arith.constant 0 : index |
|
%0 = flow.dispatch.workload.ordinal %arg3, 0 : index |
|
%1 = flow.dispatch.workload.ordinal %arg4, 1 : index |
|
%2 = flow.dispatch.workload.ordinal %arg5, 2 : index |
|
%3 = flow.dispatch.workload.ordinal %arg6, 3 : index |
|
%4 = flow.dispatch.workload.ordinal %arg7, 4 : index |
|
%5 = flow.dispatch.workload.ordinal %arg8, 5 : index |
|
%6 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%0, %1} |
|
%7 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%2, %3} |
|
%8 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5} |
|
%9 = flow.dispatch.tensor.load %6, offsets = [0, 0, 0, 0], sizes = [%0, %1, 16, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%0, %1} -> tensor<?x?x16x1xf32> |
|
%10 = flow.dispatch.tensor.load %7, offsets = [0, 0, 0, 0], sizes = [%2, %3, 16, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%2, %3} -> tensor<?x?x16x1xf32> |
|
%11 = flow.dispatch.tensor.load %8, offsets = [0, 0, 0, 0], sizes = [%4, %5, 16, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5} -> tensor<?x?x16x16xf32> |
|
%12 = linalg.mmt4d ins(%9, %10 : tensor<?x?x16x1xf32>, tensor<?x?x16x1xf32>) outs(%11 : tensor<?x?x16x16xf32>) -> tensor<?x?x16x16xf32> |
|
flow.dispatch.tensor.store %12, %8, offsets = [0, 0, 0, 0], sizes = [%4, %5, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5} |
|
return |
|
} |
|
} |
|
} |
|
stream.executable private @matmul_dynamic_dispatch_4 { |
|
stream.executable.export public @matmul_dynamic_dispatch_4_unpack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) { |
|
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3 |
|
stream.return %x, %y, %z : index, index, index |
|
} |
|
builtin.module { |
|
func.func @matmul_dynamic_dispatch_4_unpack_f32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: !stream.binding) { |
|
%c0 = arith.constant 0 : index |
|
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index |
|
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index |
|
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index |
|
%3 = flow.dispatch.workload.ordinal %arg4, 3 : index |
|
%4 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?x16x16xf32>>{%0, %1} |
|
%5 = stream.binding.subspan %arg5[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%2, %3} |
|
%6 = flow.dispatch.tensor.load %4, offsets = [0, 0, 0, 0], sizes = [%0, %1, 16, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x16xf32>>{%0, %1} -> tensor<?x?x16x16xf32> |
|
%7 = tensor.empty(%2, %3) : tensor<?x?xf32> |
|
%unpack = tensor.unpack %6 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %7 : tensor<?x?x16x16xf32> -> tensor<?x?xf32> |
|
flow.dispatch.tensor.store %unpack, %5, offsets = [0, 0], sizes = [%2, %3], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%2, %3} |
|
return |
|
} |
|
} |
|
} |
|
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} { |
|
%c0 = arith.constant 0 : index |
|
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index |
|
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index |
|
%element_type_f32 = hal.element_type<f32> : i32 |
|
%dense_row_major = hal.encoding_type<dense_row_major> : i32 |
|
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1]) type(%element_type_f32) encoding(%dense_row_major) |
|
%2 = stream.tensor.sizeof tensor<?x?xf32>{%0, %1} : index |
|
%3 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<?x?xf32>{%0, %1} in !stream.resource<external>{%2} |
|
%4 = stream.async.transfer %3 : !stream.resource<external>{%2} -> !stream.resource<*>{%2} |
|
%5 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index |
|
%6 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index |
|
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%5, %6]) type(%element_type_f32) encoding(%dense_row_major) |
|
%7 = stream.tensor.sizeof tensor<?x?xf32>{%5, %6} : index |
|
%8 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<?x?xf32>{%5, %6} in !stream.resource<external>{%7} |
|
%9 = stream.async.transfer %8 : !stream.resource<external>{%7} -> !stream.resource<*>{%7} |
|
%10 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index |
|
%11 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index |
|
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("input2") shape([%10, %11]) type(%element_type_f32) encoding(%dense_row_major) |
|
%12 = stream.tensor.sizeof tensor<?x?xf32>{%10, %11} : index |
|
%13 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<?x?xf32>{%10, %11} in !stream.resource<external>{%12} |
|
%14 = stream.async.transfer %13 : !stream.resource<external>{%12} -> !stream.resource<*>{%12} |
|
%15 = affine.apply #map()[%0] |
|
%16 = stream.tensor.sizeof tensor<?x?x16x1xf32>{%15, %1} : index |
|
%17 = stream.async.dispatch @matmul_dynamic_dispatch_0::@matmul_dynamic_dispatch_0_pack_f32[%0, %15, %1](%4[%c0 to %2 for %2], %0, %15, %1) : (!stream.resource<*>{%2}, index, index, index) -> !stream.resource<*>{%16} |
|
%18 = affine.apply #map()[%6] |
|
%19 = stream.tensor.sizeof tensor<?x?x16x1xf32>{%18, %5} : index |
|
%20 = stream.async.dispatch @matmul_dynamic_dispatch_1::@matmul_dynamic_dispatch_1_pack_f32[%6, %18, %5](%9[%c0 to %7 for %7], %6, %18, %5) : (!stream.resource<*>{%7}, index, index, index) -> !stream.resource<*>{%19} |
|
%21 = affine.apply #map()[%10] |
|
%22 = affine.apply #map()[%11] |
|
%23 = stream.tensor.sizeof tensor<?x?x16x16xf32>{%21, %22} : index |
|
%24 = stream.async.dispatch @matmul_dynamic_dispatch_2::@matmul_dynamic_dispatch_2_pack_f32[%10, %11, %21, %22](%14[%c0 to %12 for %12], %10, %11, %21, %22) : (!stream.resource<*>{%12}, index, index, index, index) -> !stream.resource<*>{%23} |
|
%25 = stream.async.dispatch @matmul_dynamic_dispatch_3::@matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32[%15, %1, %18, %5, %21, %22](%17[%c0 to %16 for %16], %20[%c0 to %19 for %19], %24[%c0 to %23 for %23], %15, %1, %18, %5, %21, %22) : (!stream.resource<*>{%16}, !stream.resource<*>{%19}, !stream.resource<*>{%23}, index, index, index, index, index, index) -> %24{%23} |
|
%26 = stream.async.dispatch @matmul_dynamic_dispatch_4::@matmul_dynamic_dispatch_4_unpack_f32[%21, %22, %10, %11](%25[%c0 to %23 for %23], %21, %22, %10, %11) : (!stream.resource<*>{%23}, index, index, index, index) -> !stream.resource<*>{%12} |
|
%27 = stream.async.transfer %26 : !stream.resource<*>{%12} -> !stream.resource<external>{%12} |
|
%28 = stream.tensor.export %27 : tensor<?x?xf32>{%10, %11} in !stream.resource<external>{%12} -> !hal.buffer_view |
|
return %28 : !hal.buffer_view |
|
} |
|
} |
|
|
|
|
|
// -----// IR Dump After FoldGlobals (iree-util-fold-globals) //----- // |
|
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "znver4", cpu_features = "+mmx,+popcnt,+sse,+sse2,+sse3,+ssse3,+sse4.1,+sse4.2,+avx,+avx2,+sse4a,+fma,+avx512f,+bmi,+bmi2,+aes,+pclmul,+avx512vl,+avx512bw,+avx512dq,+avx512cd,+avx512vbmi,+avx512ifma,+avx512vpopcntdq,+avx512vbmi2,+gfni,+vpclmulqdq,+avx512vnni,+avx512bitalg,+avx512bf16,+adx,+clflushopt,+clwb,+clzero,+cx16,+cx8,+f16c,+fsgsbase,+crc32,+invpcid,+rdpru,+sahf,+lzcnt,+movbe,+mwaitx,+x87,+pku,+prfchw,+rdpid,+rdrnd,+rdseed,+sha,+shstk,+vaes,+wbnoinvd,+xsave,+xsavec,+xsaveopt,+xsaves,+fxsr,+evex512", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "all"}> |
|
#map = affine_map<()[s0] -> (s0 ceildiv 16)> |
|
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#executable_target_embedded_elf_x86_64_]}> |
|
module attributes {hal.device.targets = [#device_target_llvm_cpu]} { |
|
// Dispatch 0 executable: packs a dynamically shaped 2-D f32 tensor into a 4-D tensor with 16x1 inner tiles. |
stream.executable private @matmul_dynamic_dispatch_0 { |
|
// Export: receives three index workload values and returns the (x, y, z) workgroup count produced by flow.dispatch.workgroup_count_from_slice. |
stream.executable.export public @matmul_dynamic_dispatch_0_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) { |
|
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2 |
|
stream.return %x, %y, %z : index, index, index |
|
} |
|
builtin.module { |
|
// Kernel body. %arg0 is the input binding and %arg4 the output binding; %arg1..%arg3 are index operands tagged below as workload ordinals 0..2. |
func.func @matmul_dynamic_dispatch_0_pack_f32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: !stream.binding) { |
|
%c0 = arith.constant 0 : index |
|
// Zero constant; supplied as the padding_value of tensor.pack below. |
%cst = arith.constant 0.000000e+00 : f32 |
|
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index |
|
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index |
|
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index |
|
// Input binding viewed at offset %c0 as a readonly tensor<?x?xf32> with dynamic sizes {%0, %2}. |
%3 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %2} |
|
// Output binding viewed at offset %c0 as a writeonly tensor<?x?x16x1xf32> with dynamic outer sizes {%1, %2}. |
%4 = stream.binding.subspan %arg4[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2} |
|
// Load the whole input: zero offsets, unit strides, sizes equal to the full dynamic shape. |
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [%0, %2], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %2} -> tensor<?x?xf32> |
|
// %6 = %0 ceildiv 16, via the module-level #map. |
%6 = affine.apply #map()[%0] |
|
%7 = tensor.empty(%6, %2) : tensor<?x?x16x1xf32> |
|
// Source dim 0 is tiled by 16 and dim 1 by 1 (inner_dims_pos = [0, 1], inner_tiles = [16, 1]); outer dims keep their order; %cst is the padding value. |
%pack = tensor.pack %5 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 1] into %7 : tensor<?x?xf32> -> tensor<?x?x16x1xf32> |
|
// NOTE(review): the store sizes use %1 (ordinal 1) while the packed tensor was sized with %6; presumably the dispatch site passes (%0 ceildiv 16) as ordinal 1 - confirm against the caller. |
flow.dispatch.tensor.store %pack, %4, offsets = [0, 0, 0, 0], sizes = [%1, %2, 16, 1], strides = [1, 1, 1, 1] : tensor<?x?x16x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2} |
|
return |
|
} |
|
} |
|
} |
|
// Dispatch 1 executable: packs a dynamically shaped 2-D f32 tensor into 16x1 inner tiles with the two dimensions swapped in the packed layout. |
stream.executable private @matmul_dynamic_dispatch_1 { |
|
// Export: receives three index workload values and returns the (x, y, z) workgroup count produced by flow.dispatch.workgroup_count_from_slice. |
stream.executable.export public @matmul_dynamic_dispatch_1_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) { |
|
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2 |
|
stream.return %x, %y, %z : index, index, index |
|
} |
|
builtin.module { |
|
// Kernel body. %arg0 is the input binding and %arg4 the output binding; %arg1..%arg3 are index operands tagged below as workload ordinals 0..2. |
func.func @matmul_dynamic_dispatch_1_pack_f32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: !stream.binding) { |
|
%c0 = arith.constant 0 : index |
|
// Zero constant; supplied as the padding_value of tensor.pack below. |
%cst = arith.constant 0.000000e+00 : f32 |
|
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index |
|
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index |
|
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index |
|
// Input binding viewed at offset %c0 as a readonly tensor<?x?xf32>; note the dynamic sizes are {%2, %0}, i.e. ordinal 2 is dim 0 and ordinal 0 is dim 1. |
%3 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %0} |
|
// Output binding viewed at offset %c0 as a writeonly tensor<?x?x16x1xf32> with dynamic outer sizes {%1, %2}. |
%4 = stream.binding.subspan %arg4[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2} |
|
// Load the whole input: zero offsets, unit strides, sizes equal to the full dynamic shape. |
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [%2, %0], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %0} -> tensor<?x?xf32> |
|
// %6 = %0 ceildiv 16, via the module-level #map (%0 is the size of source dim 1 here). |
%6 = affine.apply #map()[%0] |
|
%7 = tensor.empty(%6, %2) : tensor<?x?x16x1xf32> |
|
// inner_dims_pos = [1, 0] with inner_tiles = [16, 1] tiles source dim 1 by 16 and dim 0 by 1; outer_dims_perm = [1, 0] puts the tiled dim-1 extent first in the result. |
%pack = tensor.pack %5 padding_value(%cst : f32) outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [16, 1] into %7 : tensor<?x?xf32> -> tensor<?x?x16x1xf32> |
|
// NOTE(review): the store sizes use %1 (ordinal 1) while the packed tensor was sized with %6; presumably the dispatch site passes (%0 ceildiv 16) as ordinal 1 - confirm against the caller. |
flow.dispatch.tensor.store %pack, %4, offsets = [0, 0, 0, 0], sizes = [%1, %2, 16, 1], strides = [1, 1, 1, 1] : tensor<?x?x16x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2} |
|
return |
|
} |
|
} |
|
} |
|
// Dispatch 2 executable: packs a dynamically shaped 2-D f32 tensor into a 4-D tensor with 16x16 inner tiles. |
stream.executable private @matmul_dynamic_dispatch_2 { |
|
// Export: receives four index workload values and returns the (x, y, z) workgroup count produced by flow.dispatch.workgroup_count_from_slice. |
stream.executable.export public @matmul_dynamic_dispatch_2_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) { |
|
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3 |
|
stream.return %x, %y, %z : index, index, index |
|
} |
|
builtin.module { |
|
// Kernel body. %arg0 is the input binding and %arg5 the output binding; %arg1..%arg4 are index operands tagged below as workload ordinals 0..3. |
func.func @matmul_dynamic_dispatch_2_pack_f32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: !stream.binding) { |
|
%c0 = arith.constant 0 : index |
|
// Zero constant; supplied as the padding_value of tensor.pack below. |
%cst = arith.constant 0.000000e+00 : f32 |
|
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index |
|
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index |
|
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index |
|
%3 = flow.dispatch.workload.ordinal %arg4, 3 : index |
|
// Input binding viewed at offset %c0 as a readonly tensor<?x?xf32> with dynamic sizes {%0, %1}. |
%4 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1} |
|
// Output binding viewed at offset %c0 as a writeonly tensor<?x?x16x16xf32> with dynamic outer sizes {%2, %3}. |
%5 = stream.binding.subspan %arg5[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%2, %3} |
|
// Load the whole input: zero offsets, unit strides, sizes equal to the full dynamic shape. |
%6 = flow.dispatch.tensor.load %4, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1} -> tensor<?x?xf32> |
|
// %7 = %1 ceildiv 16 and %8 = %0 ceildiv 16, both via the module-level #map. |
%7 = affine.apply #map()[%1] |
|
%8 = affine.apply #map()[%0] |
|
%9 = tensor.empty(%8, %7) : tensor<?x?x16x16xf32> |
|
// Both source dims are tiled by 16 (inner_tiles = [16, 16]) with no permutation; %cst is the padding value. |
%pack = tensor.pack %6 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %9 : tensor<?x?xf32> -> tensor<?x?x16x16xf32> |
|
// NOTE(review): the store sizes use ordinals %2/%3 while the packed tensor was sized with %8/%7; presumably the dispatch site passes the matching ceildiv values - confirm against the caller. |
flow.dispatch.tensor.store %pack, %5, offsets = [0, 0, 0, 0], sizes = [%2, %3, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%2, %3} |
|
return |
|
} |
|
} |
|
} |
|
// Dispatch 3 executable: runs linalg.mmt4d on two packed ?x?x16x1 f32 inputs, accumulating into a packed ?x?x16x16 f32 tensor that is read and written through the same binding. |
stream.executable private @matmul_dynamic_dispatch_3 { |
|
// Export: receives six index workload values and returns the (x, y, z) workgroup count produced by flow.dispatch.workgroup_count_from_slice. |
stream.executable.export public @matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index) -> (index, index, index) { |
|
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3, %arg4, %arg5 |
|
stream.return %x, %y, %z : index, index, index |
|
} |
|
builtin.module { |
|
// Kernel body. %arg0 and %arg1 are the input bindings, %arg2 the read/write binding; %arg3..%arg8 are index operands tagged below as workload ordinals 0..5. |
func.func @matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index) { |
|
%c0 = arith.constant 0 : index |
|
%0 = flow.dispatch.workload.ordinal %arg3, 0 : index |
|
%1 = flow.dispatch.workload.ordinal %arg4, 1 : index |
|
%2 = flow.dispatch.workload.ordinal %arg5, 2 : index |
|
%3 = flow.dispatch.workload.ordinal %arg6, 3 : index |
|
%4 = flow.dispatch.workload.ordinal %arg7, 4 : index |
|
%5 = flow.dispatch.workload.ordinal %arg8, 5 : index |
|
// First input: readonly tensor<?x?x16x1xf32> with dynamic outer sizes {%0, %1}. |
%6 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%0, %1} |
|
// Second input: readonly tensor<?x?x16x1xf32> with dynamic outer sizes {%2, %3}. |
%7 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%2, %3} |
|
// Accumulator: readwrite tensor<?x?x16x16xf32> with dynamic outer sizes {%4, %5}; it is both loaded and stored below. |
%8 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5} |
|
// Load all three tensors in full (zero offsets, unit strides). |
%9 = flow.dispatch.tensor.load %6, offsets = [0, 0, 0, 0], sizes = [%0, %1, 16, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%0, %1} -> tensor<?x?x16x1xf32> |
|
%10 = flow.dispatch.tensor.load %7, offsets = [0, 0, 0, 0], sizes = [%2, %3, 16, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%2, %3} -> tensor<?x?x16x1xf32> |
|
%11 = flow.dispatch.tensor.load %8, offsets = [0, 0, 0, 0], sizes = [%4, %5, 16, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5} -> tensor<?x?x16x16xf32> |
|
// mmt4d with %9 and %10 as ins and the loaded accumulator %11 as outs. |
%12 = linalg.mmt4d ins(%9, %10 : tensor<?x?x16x1xf32>, tensor<?x?x16x1xf32>) outs(%11 : tensor<?x?x16x16xf32>) -> tensor<?x?x16x16xf32> |
|
// Write the result back through the same readwrite binding %8 it was loaded from. |
flow.dispatch.tensor.store %12, %8, offsets = [0, 0, 0, 0], sizes = [%4, %5, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5} |
|
return |
|
} |
|
} |
|
} |
|
// Dispatch 4 executable: unpacks a 4-D f32 tensor with 16x16 inner tiles back into a dynamically shaped 2-D f32 tensor. |
stream.executable private @matmul_dynamic_dispatch_4 { |
|
// Export: receives four index workload values and returns the (x, y, z) workgroup count produced by flow.dispatch.workgroup_count_from_slice. |
stream.executable.export public @matmul_dynamic_dispatch_4_unpack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) { |
|
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3 |
|
stream.return %x, %y, %z : index, index, index |
|
} |
|
builtin.module { |
|
// Kernel body. %arg0 is the input binding and %arg5 the output binding; %arg1..%arg4 are index operands tagged below as workload ordinals 0..3. |
func.func @matmul_dynamic_dispatch_4_unpack_f32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: !stream.binding) { |
|
%c0 = arith.constant 0 : index |
|
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index |
|
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index |
|
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index |
|
%3 = flow.dispatch.workload.ordinal %arg4, 3 : index |
|
// Input binding viewed at offset %c0 as a readonly tensor<?x?x16x16xf32> with dynamic outer sizes {%0, %1}. |
%4 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?x16x16xf32>>{%0, %1} |
|
// Output binding viewed at offset %c0 as a writeonly tensor<?x?xf32> with dynamic sizes {%2, %3}. |
%5 = stream.binding.subspan %arg5[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%2, %3} |
|
// Load the whole packed input: zero offsets, unit strides. |
%6 = flow.dispatch.tensor.load %4, offsets = [0, 0, 0, 0], sizes = [%0, %1, 16, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x16xf32>>{%0, %1} -> tensor<?x?x16x16xf32> |
|
// Destination is sized directly from ordinals 2 and 3 (the unpacked extents). |
%7 = tensor.empty(%2, %3) : tensor<?x?xf32> |
|
// Unpack 16x16 inner tiles with no permutation into the %2 x %3 destination. |
%unpack = tensor.unpack %6 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %7 : tensor<?x?x16x16xf32> -> tensor<?x?xf32> |
|
flow.dispatch.tensor.store %unpack, %5, offsets = [0, 0], sizes = [%2, %3], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%2, %3} |
|
return |
|
} |
|
} |
|
} |
|
// Host entry point. Takes three !hal.buffer_view arguments and returns one; the reflection attribute declares them as tensor<?x?xf32> inputs %input0..%input2 and output %output0. |
// Flow: import each input -> pack each one (dispatches 0, 1, 2) -> mmt4d (dispatch 3) -> unpack (dispatch 4) -> export the result. |
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} { |
|
%c0 = arith.constant 0 : index |
|
// input0: query both dims (%0, %1). |
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index |
|
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index |
|
// Element type and encoding values shared by all three input assertions. |
%element_type_f32 = hal.element_type<f32> : i32 |
|
%dense_row_major = hal.encoding_type<dense_row_major> : i32 |
|
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1]) type(%element_type_f32) encoding(%dense_row_major) |
|
// input0: compute its size (%2), import it as an external stream resource, then transfer it to !stream.resource<*>. |
%2 = stream.tensor.sizeof tensor<?x?xf32>{%0, %1} : index |
|
%3 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<?x?xf32>{%0, %1} in !stream.resource<external>{%2} |
|
%4 = stream.async.transfer %3 : !stream.resource<external>{%2} -> !stream.resource<*>{%2} |
|
// input1: same sequence - dims (%5, %6), assert, size (%7), import, transfer. |
%5 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index |
|
%6 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index |
|
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%5, %6]) type(%element_type_f32) encoding(%dense_row_major) |
|
%7 = stream.tensor.sizeof tensor<?x?xf32>{%5, %6} : index |
|
%8 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<?x?xf32>{%5, %6} in !stream.resource<external>{%7} |
|
%9 = stream.async.transfer %8 : !stream.resource<external>{%7} -> !stream.resource<*>{%7} |
|
// input2: same sequence - dims (%10, %11), assert, size (%12), import, transfer. |
%10 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index |
|
%11 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index |
|
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("input2") shape([%10, %11]) type(%element_type_f32) encoding(%dense_row_major) |
|
%12 = stream.tensor.sizeof tensor<?x?xf32>{%10, %11} : index |
|
%13 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<?x?xf32>{%10, %11} in !stream.resource<external>{%12} |
|
%14 = stream.async.transfer %13 : !stream.resource<external>{%12} -> !stream.resource<*>{%12} |
|
// Pack input0 (dispatch 0): %15 = dim 0 of input0 ceildiv 16; %16 is the size of the packed tensor<?x?x16x1xf32>{%15, %1}. |
%15 = affine.apply #map()[%0] |
|
%16 = stream.tensor.sizeof tensor<?x?x16x1xf32>{%15, %1} : index |
|
%17 = stream.async.dispatch @matmul_dynamic_dispatch_0::@matmul_dynamic_dispatch_0_pack_f32[%0, %15, %1](%4[%c0 to %2 for %2], %0, %15, %1) : (!stream.resource<*>{%2}, index, index, index) -> !stream.resource<*>{%16} |
|
// Pack input1 (dispatch 1): %18 = dim 1 of input1 ceildiv 16; the packed tensor is sized {%18, %5}, i.e. tiled dim 1 first. |
%18 = affine.apply #map()[%6] |
|
%19 = stream.tensor.sizeof tensor<?x?x16x1xf32>{%18, %5} : index |
|
%20 = stream.async.dispatch @matmul_dynamic_dispatch_1::@matmul_dynamic_dispatch_1_pack_f32[%6, %18, %5](%9[%c0 to %7 for %7], %6, %18, %5) : (!stream.resource<*>{%7}, index, index, index) -> !stream.resource<*>{%19} |
|
// Pack input2 (dispatch 2): both dims are ceildiv'd by 16 (%21, %22); %23 is the size of the packed tensor<?x?x16x16xf32>. |
%21 = affine.apply #map()[%10] |
|
%22 = affine.apply #map()[%11] |
|
%23 = stream.tensor.sizeof tensor<?x?x16x16xf32>{%21, %22} : index |
|
%24 = stream.async.dispatch @matmul_dynamic_dispatch_2::@matmul_dynamic_dispatch_2_pack_f32[%10, %11, %21, %22](%14[%c0 to %12 for %12], %10, %11, %21, %22) : (!stream.resource<*>{%12}, index, index, index, index) -> !stream.resource<*>{%23} |
|
// mmt4d (dispatch 3) over the three packed resources. The result is written as `%24{%23}`: it names operand %24 itself instead of a fresh resource type. |
%25 = stream.async.dispatch @matmul_dynamic_dispatch_3::@matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32[%15, %1, %18, %5, %21, %22](%17[%c0 to %16 for %16], %20[%c0 to %19 for %19], %24[%c0 to %23 for %23], %15, %1, %18, %5, %21, %22) : (!stream.resource<*>{%16}, !stream.resource<*>{%19}, !stream.resource<*>{%23}, index, index, index, index, index, index) -> %24{%23} |
|
// Unpack (dispatch 4) the packed result into a resource of size %12, the same size value computed for input2. |
%26 = stream.async.dispatch @matmul_dynamic_dispatch_4::@matmul_dynamic_dispatch_4_unpack_f32[%21, %22, %10, %11](%25[%c0 to %23 for %23], %21, %22, %10, %11) : (!stream.resource<*>{%23}, index, index, index, index) -> !stream.resource<*>{%12} |
|
// Transfer the result to an external resource and export it as a tensor<?x?xf32>{%10, %11} buffer view. |
%27 = stream.async.transfer %26 : !stream.resource<*>{%12} -> !stream.resource<external>{%12} |
|
%28 = stream.tensor.export %27 : tensor<?x?xf32>{%10, %11} in !stream.resource<external>{%12} -> !hal.buffer_view |
|
return %28 : !hal.buffer_view |
|
} |
|
} |
|
|
|
|
|
// -----// IR Dump After FuseGlobals (iree-util-fuse-globals) //----- // |
|
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "znver4", cpu_features = "+mmx,+popcnt,+sse,+sse2,+sse3,+ssse3,+sse4.1,+sse4.2,+avx,+avx2,+sse4a,+fma,+avx512f,+bmi,+bmi2,+aes,+pclmul,+avx512vl,+avx512bw,+avx512dq,+avx512cd,+avx512vbmi,+avx512ifma,+avx512vpopcntdq,+avx512vbmi2,+gfni,+vpclmulqdq,+avx512vnni,+avx512bitalg,+avx512bf16,+adx,+clflushopt,+clwb,+clzero,+cx16,+cx8,+f16c,+fsgsbase,+crc32,+invpcid,+rdpru,+sahf,+lzcnt,+movbe,+mwaitx,+x87,+pku,+prfchw,+rdpid,+rdrnd,+rdseed,+sha,+shstk,+vaes,+wbnoinvd,+xsave,+xsavec,+xsaveopt,+xsaves,+fxsr,+evex512", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "all"}> |
|
#map = affine_map<()[s0] -> (s0 ceildiv 16)> |
|
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#executable_target_embedded_elf_x86_64_]}> |
|
module attributes {hal.device.targets = [#device_target_llvm_cpu]} { |
|
// Dispatch 0 executable: packs a dynamically shaped 2-D f32 tensor into a 4-D tensor with 16x1 inner tiles. |
stream.executable private @matmul_dynamic_dispatch_0 { |
|
// Export: receives three index workload values and returns the (x, y, z) workgroup count produced by flow.dispatch.workgroup_count_from_slice. |
stream.executable.export public @matmul_dynamic_dispatch_0_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) { |
|
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2 |
|
stream.return %x, %y, %z : index, index, index |
|
} |
|
builtin.module { |
|
// Kernel body. %arg0 is the input binding and %arg4 the output binding; %arg1..%arg3 are index operands tagged below as workload ordinals 0..2. |
func.func @matmul_dynamic_dispatch_0_pack_f32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: !stream.binding) { |
|
%c0 = arith.constant 0 : index |
|
// Zero constant; supplied as the padding_value of tensor.pack below. |
%cst = arith.constant 0.000000e+00 : f32 |
|
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index |
|
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index |
|
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index |
|
// Input binding viewed at offset %c0 as a readonly tensor<?x?xf32> with dynamic sizes {%0, %2}. |
%3 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %2} |
|
// Output binding viewed at offset %c0 as a writeonly tensor<?x?x16x1xf32> with dynamic outer sizes {%1, %2}. |
%4 = stream.binding.subspan %arg4[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2} |
|
// Load the whole input: zero offsets, unit strides, sizes equal to the full dynamic shape. |
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [%0, %2], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %2} -> tensor<?x?xf32> |
|
// %6 = %0 ceildiv 16, via the module-level #map. |
%6 = affine.apply #map()[%0] |
|
%7 = tensor.empty(%6, %2) : tensor<?x?x16x1xf32> |
|
// Source dim 0 is tiled by 16 and dim 1 by 1 (inner_dims_pos = [0, 1], inner_tiles = [16, 1]); outer dims keep their order; %cst is the padding value. |
%pack = tensor.pack %5 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 1] into %7 : tensor<?x?xf32> -> tensor<?x?x16x1xf32> |
|
// NOTE(review): the store sizes use %1 (ordinal 1) while the packed tensor was sized with %6; presumably the dispatch site passes (%0 ceildiv 16) as ordinal 1 - confirm against the caller. |
flow.dispatch.tensor.store %pack, %4, offsets = [0, 0, 0, 0], sizes = [%1, %2, 16, 1], strides = [1, 1, 1, 1] : tensor<?x?x16x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2} |
|
return |
|
} |
|
} |
|
} |
|
// Dispatch 1 executable: packs a dynamically shaped 2-D f32 tensor into 16x1 inner tiles with the two dimensions swapped in the packed layout. |
stream.executable private @matmul_dynamic_dispatch_1 { |
|
// Export: receives three index workload values and returns the (x, y, z) workgroup count produced by flow.dispatch.workgroup_count_from_slice. |
stream.executable.export public @matmul_dynamic_dispatch_1_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) { |
|
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2 |
|
stream.return %x, %y, %z : index, index, index |
|
} |
|
builtin.module { |
|
// Kernel body. %arg0 is the input binding and %arg4 the output binding; %arg1..%arg3 are index operands tagged below as workload ordinals 0..2. |
func.func @matmul_dynamic_dispatch_1_pack_f32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: !stream.binding) { |
|
%c0 = arith.constant 0 : index |
|
// Zero constant; supplied as the padding_value of tensor.pack below. |
%cst = arith.constant 0.000000e+00 : f32 |
|
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index |
|
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index |
|
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index |
|
// Input binding viewed at offset %c0 as a readonly tensor<?x?xf32>; note the dynamic sizes are {%2, %0}, i.e. ordinal 2 is dim 0 and ordinal 0 is dim 1. |
%3 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %0} |
|
// Output binding viewed at offset %c0 as a writeonly tensor<?x?x16x1xf32> with dynamic outer sizes {%1, %2}. |
%4 = stream.binding.subspan %arg4[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2} |
|
// Load the whole input: zero offsets, unit strides, sizes equal to the full dynamic shape. |
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [%2, %0], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %0} -> tensor<?x?xf32> |
|
// %6 = %0 ceildiv 16, via the module-level #map (%0 is the size of source dim 1 here). |
%6 = affine.apply #map()[%0] |
|
%7 = tensor.empty(%6, %2) : tensor<?x?x16x1xf32> |
|
// inner_dims_pos = [1, 0] with inner_tiles = [16, 1] tiles source dim 1 by 16 and dim 0 by 1; outer_dims_perm = [1, 0] puts the tiled dim-1 extent first in the result. |
%pack = tensor.pack %5 padding_value(%cst : f32) outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [16, 1] into %7 : tensor<?x?xf32> -> tensor<?x?x16x1xf32> |
|
// NOTE(review): the store sizes use %1 (ordinal 1) while the packed tensor was sized with %6; presumably the dispatch site passes (%0 ceildiv 16) as ordinal 1 - confirm against the caller. |
flow.dispatch.tensor.store %pack, %4, offsets = [0, 0, 0, 0], sizes = [%1, %2, 16, 1], strides = [1, 1, 1, 1] : tensor<?x?x16x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2} |
|
return |
|
} |
|
} |
|
} |
|
// Dispatch 2 executable: packs a dynamically shaped 2-D f32 tensor into a 4-D tensor with 16x16 inner tiles. |
stream.executable private @matmul_dynamic_dispatch_2 { |
|
// Export: receives four index workload values and returns the (x, y, z) workgroup count produced by flow.dispatch.workgroup_count_from_slice. |
stream.executable.export public @matmul_dynamic_dispatch_2_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) { |
|
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3 |
|
stream.return %x, %y, %z : index, index, index |
|
} |
|
builtin.module { |
|
// Kernel body. %arg0 is the input binding and %arg5 the output binding; %arg1..%arg4 are index operands tagged below as workload ordinals 0..3. |
func.func @matmul_dynamic_dispatch_2_pack_f32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: !stream.binding) { |
|
%c0 = arith.constant 0 : index |
|
// Zero constant; supplied as the padding_value of tensor.pack below. |
%cst = arith.constant 0.000000e+00 : f32 |
|
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index |
|
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index |
|
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index |
|
%3 = flow.dispatch.workload.ordinal %arg4, 3 : index |
|
// Input binding viewed at offset %c0 as a readonly tensor<?x?xf32> with dynamic sizes {%0, %1}. |
%4 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1} |
|
// Output binding viewed at offset %c0 as a writeonly tensor<?x?x16x16xf32> with dynamic outer sizes {%2, %3}. |
%5 = stream.binding.subspan %arg5[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%2, %3} |
|
// Load the whole input: zero offsets, unit strides, sizes equal to the full dynamic shape. |
%6 = flow.dispatch.tensor.load %4, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1} -> tensor<?x?xf32> |
|
// %7 = %1 ceildiv 16 and %8 = %0 ceildiv 16, both via the module-level #map. |
%7 = affine.apply #map()[%1] |
|
%8 = affine.apply #map()[%0] |
|
%9 = tensor.empty(%8, %7) : tensor<?x?x16x16xf32> |
|
// Both source dims are tiled by 16 (inner_tiles = [16, 16]) with no permutation; %cst is the padding value. |
%pack = tensor.pack %6 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %9 : tensor<?x?xf32> -> tensor<?x?x16x16xf32> |
|
// NOTE(review): the store sizes use ordinals %2/%3 while the packed tensor was sized with %8/%7; presumably the dispatch site passes the matching ceildiv values - confirm against the caller. |
flow.dispatch.tensor.store %pack, %5, offsets = [0, 0, 0, 0], sizes = [%2, %3, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%2, %3} |
|
return |
|
} |
|
} |
|
} |
|
// Dispatch 3 executable: runs linalg.mmt4d on two packed ?x?x16x1 f32 inputs, accumulating into a packed ?x?x16x16 f32 tensor that is read and written through the same binding. |
stream.executable private @matmul_dynamic_dispatch_3 { |
|
// Export: receives six index workload values and returns the (x, y, z) workgroup count produced by flow.dispatch.workgroup_count_from_slice. |
stream.executable.export public @matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index) -> (index, index, index) { |
|
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3, %arg4, %arg5 |
|
stream.return %x, %y, %z : index, index, index |
|
} |
|
builtin.module { |
|
// Kernel body. %arg0 and %arg1 are the input bindings, %arg2 the read/write binding; %arg3..%arg8 are index operands tagged below as workload ordinals 0..5. |
func.func @matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index) { |
|
%c0 = arith.constant 0 : index |
|
%0 = flow.dispatch.workload.ordinal %arg3, 0 : index |
|
%1 = flow.dispatch.workload.ordinal %arg4, 1 : index |
|
%2 = flow.dispatch.workload.ordinal %arg5, 2 : index |
|
%3 = flow.dispatch.workload.ordinal %arg6, 3 : index |
|
%4 = flow.dispatch.workload.ordinal %arg7, 4 : index |
|
%5 = flow.dispatch.workload.ordinal %arg8, 5 : index |
|
// First input: readonly tensor<?x?x16x1xf32> with dynamic outer sizes {%0, %1}. |
%6 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%0, %1} |
|
// Second input: readonly tensor<?x?x16x1xf32> with dynamic outer sizes {%2, %3}. |
%7 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%2, %3} |
|
// Accumulator: readwrite tensor<?x?x16x16xf32> with dynamic outer sizes {%4, %5}; it is both loaded and stored below. |
%8 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5} |
|
// Load all three tensors in full (zero offsets, unit strides). |
%9 = flow.dispatch.tensor.load %6, offsets = [0, 0, 0, 0], sizes = [%0, %1, 16, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%0, %1} -> tensor<?x?x16x1xf32> |
|
%10 = flow.dispatch.tensor.load %7, offsets = [0, 0, 0, 0], sizes = [%2, %3, 16, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%2, %3} -> tensor<?x?x16x1xf32> |
|
%11 = flow.dispatch.tensor.load %8, offsets = [0, 0, 0, 0], sizes = [%4, %5, 16, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5} -> tensor<?x?x16x16xf32> |
|
// mmt4d with %9 and %10 as ins and the loaded accumulator %11 as outs. |
%12 = linalg.mmt4d ins(%9, %10 : tensor<?x?x16x1xf32>, tensor<?x?x16x1xf32>) outs(%11 : tensor<?x?x16x16xf32>) -> tensor<?x?x16x16xf32> |
|
// Write the result back through the same readwrite binding %8 it was loaded from. |
flow.dispatch.tensor.store %12, %8, offsets = [0, 0, 0, 0], sizes = [%4, %5, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5} |
|
return |
|
} |
|
} |
|
} |
|
// Dispatch 4 executable: unpacks a 4-D f32 tensor with 16x16 inner tiles back into a dynamically shaped 2-D f32 tensor. |
stream.executable private @matmul_dynamic_dispatch_4 { |
|
// Export: receives four index workload values and returns the (x, y, z) workgroup count produced by flow.dispatch.workgroup_count_from_slice. |
stream.executable.export public @matmul_dynamic_dispatch_4_unpack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) { |
|
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3 |
|
stream.return %x, %y, %z : index, index, index |
|
} |
|
builtin.module { |
|
// Kernel body. %arg0 is the input binding and %arg5 the output binding; %arg1..%arg4 are index operands tagged below as workload ordinals 0..3. |
func.func @matmul_dynamic_dispatch_4_unpack_f32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: !stream.binding) { |
|
%c0 = arith.constant 0 : index |
|
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index |
|
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index |
|
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index |
|
%3 = flow.dispatch.workload.ordinal %arg4, 3 : index |
|
// Input binding viewed at offset %c0 as a readonly tensor<?x?x16x16xf32> with dynamic outer sizes {%0, %1}. |
%4 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?x16x16xf32>>{%0, %1} |
|
// Output binding viewed at offset %c0 as a writeonly tensor<?x?xf32> with dynamic sizes {%2, %3}. |
%5 = stream.binding.subspan %arg5[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%2, %3} |
|
// Load the whole packed input: zero offsets, unit strides. |
%6 = flow.dispatch.tensor.load %4, offsets = [0, 0, 0, 0], sizes = [%0, %1, 16, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x16xf32>>{%0, %1} -> tensor<?x?x16x16xf32> |
|
// Destination is sized directly from ordinals 2 and 3 (the unpacked extents). |
%7 = tensor.empty(%2, %3) : tensor<?x?xf32> |
|
// Unpack 16x16 inner tiles with no permutation into the %2 x %3 destination. |
%unpack = tensor.unpack %6 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %7 : tensor<?x?x16x16xf32> -> tensor<?x?xf32> |
|
flow.dispatch.tensor.store %unpack, %5, offsets = [0, 0], sizes = [%2, %3], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%2, %3} |
|
return |
|
} |
|
} |
|
} |
|
// Host entry point. Takes three !hal.buffer_view arguments and returns one; the reflection attribute declares them as tensor<?x?xf32> inputs %input0..%input2 and output %output0. |
// Flow: import each input -> pack each one (dispatches 0, 1, 2) -> mmt4d (dispatch 3) -> unpack (dispatch 4) -> export the result. |
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} { |
|
%c0 = arith.constant 0 : index |
|
// input0: query both dims (%0, %1). |
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index |
|
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index |
|
// Element type and encoding values shared by all three input assertions. |
%element_type_f32 = hal.element_type<f32> : i32 |
|
%dense_row_major = hal.encoding_type<dense_row_major> : i32 |
|
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1]) type(%element_type_f32) encoding(%dense_row_major) |
|
// input0: compute its size (%2), import it as an external stream resource, then transfer it to !stream.resource<*>. |
%2 = stream.tensor.sizeof tensor<?x?xf32>{%0, %1} : index |
|
%3 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<?x?xf32>{%0, %1} in !stream.resource<external>{%2} |
|
%4 = stream.async.transfer %3 : !stream.resource<external>{%2} -> !stream.resource<*>{%2} |
|
// input1: same sequence - dims (%5, %6), assert, size (%7), import, transfer. |
%5 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index |
|
%6 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index |
|
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%5, %6]) type(%element_type_f32) encoding(%dense_row_major) |
|
%7 = stream.tensor.sizeof tensor<?x?xf32>{%5, %6} : index |
|
%8 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<?x?xf32>{%5, %6} in !stream.resource<external>{%7} |
|
%9 = stream.async.transfer %8 : !stream.resource<external>{%7} -> !stream.resource<*>{%7} |
|
// input2: same sequence - dims (%10, %11), assert, size (%12), import, transfer. |
%10 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index |
|
%11 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index |
|
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("input2") shape([%10, %11]) type(%element_type_f32) encoding(%dense_row_major) |
|
%12 = stream.tensor.sizeof tensor<?x?xf32>{%10, %11} : index |
|
%13 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<?x?xf32>{%10, %11} in !stream.resource<external>{%12} |
|
%14 = stream.async.transfer %13 : !stream.resource<external>{%12} -> !stream.resource<*>{%12} |
|
// Pack input0 (dispatch 0): %15 = dim 0 of input0 ceildiv 16; %16 is the size of the packed tensor<?x?x16x1xf32>{%15, %1}. |
%15 = affine.apply #map()[%0] |
|
%16 = stream.tensor.sizeof tensor<?x?x16x1xf32>{%15, %1} : index |
|
%17 = stream.async.dispatch @matmul_dynamic_dispatch_0::@matmul_dynamic_dispatch_0_pack_f32[%0, %15, %1](%4[%c0 to %2 for %2], %0, %15, %1) : (!stream.resource<*>{%2}, index, index, index) -> !stream.resource<*>{%16} |
|
// Pack input1 (dispatch 1): %18 = dim 1 of input1 ceildiv 16; the packed tensor is sized {%18, %5}, i.e. tiled dim 1 first. |
%18 = affine.apply #map()[%6] |
|
%19 = stream.tensor.sizeof tensor<?x?x16x1xf32>{%18, %5} : index |
|
%20 = stream.async.dispatch @matmul_dynamic_dispatch_1::@matmul_dynamic_dispatch_1_pack_f32[%6, %18, %5](%9[%c0 to %7 for %7], %6, %18, %5) : (!stream.resource<*>{%7}, index, index, index) -> !stream.resource<*>{%19} |
|
// Pack input2 (dispatch 2): both dims are ceildiv'd by 16 (%21, %22); %23 is the size of the packed tensor<?x?x16x16xf32>. |
%21 = affine.apply #map()[%10] |
|
%22 = affine.apply #map()[%11] |
|
%23 = stream.tensor.sizeof tensor<?x?x16x16xf32>{%21, %22} : index |
|
%24 = stream.async.dispatch @matmul_dynamic_dispatch_2::@matmul_dynamic_dispatch_2_pack_f32[%10, %11, %21, %22](%14[%c0 to %12 for %12], %10, %11, %21, %22) : (!stream.resource<*>{%12}, index, index, index, index) -> !stream.resource<*>{%23} |
|
// mmt4d (dispatch 3) over the three packed resources. The result is written as `%24{%23}`: it names operand %24 itself instead of a fresh resource type. |
%25 = stream.async.dispatch @matmul_dynamic_dispatch_3::@matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32[%15, %1, %18, %5, %21, %22](%17[%c0 to %16 for %16], %20[%c0 to %19 for %19], %24[%c0 to %23 for %23], %15, %1, %18, %5, %21, %22) : (!stream.resource<*>{%16}, !stream.resource<*>{%19}, !stream.resource<*>{%23}, index, index, index, index, index, index) -> %24{%23} |
|
// Unpack (dispatch 4) the packed result into a resource of size %12, the same size value computed for input2. |
%26 = stream.async.dispatch @matmul_dynamic_dispatch_4::@matmul_dynamic_dispatch_4_unpack_f32[%21, %22, %10, %11](%25[%c0 to %23 for %23], %21, %22, %10, %11) : (!stream.resource<*>{%23}, index, index, index, index) -> !stream.resource<*>{%12} |
|
// Transfer the result to an external resource and export it as a tensor<?x?xf32>{%10, %11} buffer view. |
%27 = stream.async.transfer %26 : !stream.resource<*>{%12} -> !stream.resource<external>{%12} |
|
%28 = stream.tensor.export %27 : tensor<?x?xf32>{%10, %11} in !stream.resource<external>{%12} -> !hal.buffer_view |
|
return %28 : !hal.buffer_view |
|
} |
|
} |
|
|
|
|
|
// -----// IR Dump After IPO (iree-util-ipo) //----- // |
|
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "znver4", cpu_features = "+mmx,+popcnt,+sse,+sse2,+sse3,+ssse3,+sse4.1,+sse4.2,+avx,+avx2,+sse4a,+fma,+avx512f,+bmi,+bmi2,+aes,+pclmul,+avx512vl,+avx512bw,+avx512dq,+avx512cd,+avx512vbmi,+avx512ifma,+avx512vpopcntdq,+avx512vbmi2,+gfni,+vpclmulqdq,+avx512vnni,+avx512bitalg,+avx512bf16,+adx,+clflushopt,+clwb,+clzero,+cx16,+cx8,+f16c,+fsgsbase,+crc32,+invpcid,+rdpru,+sahf,+lzcnt,+movbe,+mwaitx,+x87,+pku,+prfchw,+rdpid,+rdrnd,+rdseed,+sha,+shstk,+vaes,+wbnoinvd,+xsave,+xsavec,+xsaveopt,+xsaves,+fxsr,+evex512", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "all"}> |
|
#map = affine_map<()[s0] -> (s0 ceildiv 16)> |
|
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#executable_target_embedded_elf_x86_64_]}> |
|
module attributes {hal.device.targets = [#device_target_llvm_cpu]} { |
|
// Dispatch 0 executable: packs a dynamic ?x? f32 tensor into ?x?x16x1 tiles (tensor.pack, inner_tiles = [16, 1]). |
stream.executable private @matmul_dynamic_dispatch_0 { |

// Export: the workgroup count (%x, %y, %z) is derived from the three workload values. |
stream.executable.export public @matmul_dynamic_dispatch_0_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) { |

%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2 |

stream.return %x, %y, %z : index, index, index |

} |

builtin.module { |

// Kernel body: %arg0 is the read-only source binding, %arg4 the write-only destination binding; %arg1..%arg3 carry workload ordinals 0..2. |
func.func @matmul_dynamic_dispatch_0_pack_f32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: !stream.binding) { |

%c0 = arith.constant 0 : index |

// Padding value handed to tensor.pack below. |
%cst = arith.constant 0.000000e+00 : f32 |

%0 = flow.dispatch.workload.ordinal %arg1, 0 : index |

%1 = flow.dispatch.workload.ordinal %arg2, 1 : index |

%2 = flow.dispatch.workload.ordinal %arg3, 2 : index |

// Source view has dynamic dims {%0, %2}; destination view has outer dims {%1, %2}. Both start at offset %c0. |
%3 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %2} |

%4 = stream.binding.subspan %arg4[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2} |

// Load the whole source tensor (zero offsets, unit strides). |
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [%0, %2], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %2} -> tensor<?x?xf32> |

// %6 = %0 ceildiv 16 (#map is declared at module scope); used as outer dim 0 of the packed tensor. |
%6 = affine.apply #map()[%0] |

%7 = tensor.empty(%6, %2) : tensor<?x?x16x1xf32> |

// Tile dim 0 by 16 and dim 1 by 1, keeping the outer dim order; %cst (0.0) is the padding value. |
%pack = tensor.pack %5 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 1] into %7 : tensor<?x?xf32> -> tensor<?x?x16x1xf32> |

// The store sizes use ordinal %1 for outer dim 0, whereas the tensor was built with %6. |
// NOTE(review): the host call site in this module passes ceildiv(dim0, 16) as ordinal 1, so the two should agree - confirm if the caller changes. |
flow.dispatch.tensor.store %pack, %4, offsets = [0, 0, 0, 0], sizes = [%1, %2, 16, 1], strides = [1, 1, 1, 1] : tensor<?x?x16x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2} |

return |

} |

} |

} |
|
// Dispatch 1 executable: packs a dynamic ?x? f32 tensor into ?x?x16x1 tiles with transposed outer dims (outer_dims_perm = [1, 0]). |
stream.executable private @matmul_dynamic_dispatch_1 { |

// Export: the workgroup count (%x, %y, %z) is derived from the three workload values. |
stream.executable.export public @matmul_dynamic_dispatch_1_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) { |

%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2 |

stream.return %x, %y, %z : index, index, index |

} |

builtin.module { |

// Kernel body: %arg0 is the read-only source binding, %arg4 the write-only destination binding; %arg1..%arg3 carry workload ordinals 0..2. |
func.func @matmul_dynamic_dispatch_1_pack_f32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: !stream.binding) { |

%c0 = arith.constant 0 : index |

// Padding value handed to tensor.pack below. |
%cst = arith.constant 0.000000e+00 : f32 |

%0 = flow.dispatch.workload.ordinal %arg1, 0 : index |

%1 = flow.dispatch.workload.ordinal %arg2, 1 : index |

%2 = flow.dispatch.workload.ordinal %arg3, 2 : index |

// Note the source dims are {%2, %0} (ordinal 2 first), unlike dispatch 0; destination outer dims are {%1, %2}. |
%3 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %0} |

%4 = stream.binding.subspan %arg4[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2} |

// Load the whole source tensor (zero offsets, unit strides). |
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [%2, %0], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %0} -> tensor<?x?xf32> |

// %6 = %0 ceildiv 16 (#map is declared at module scope); %0 is the source's dim 1 here. |
%6 = affine.apply #map()[%0] |

%7 = tensor.empty(%6, %2) : tensor<?x?x16x1xf32> |

// inner_dims_pos = [1, 0] with inner_tiles = [16, 1]: source dim 1 is tiled by 16 and dim 0 by 1; outer_dims_perm = [1, 0] swaps the outer dims. |
%pack = tensor.pack %5 padding_value(%cst : f32) outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [16, 1] into %7 : tensor<?x?xf32> -> tensor<?x?x16x1xf32> |

// The store sizes use ordinal %1 for outer dim 0, whereas the tensor was built with %6. |
// NOTE(review): the host call site in this module passes the ceildiv-by-16 value as ordinal 1, so the two should agree - confirm if the caller changes. |
flow.dispatch.tensor.store %pack, %4, offsets = [0, 0, 0, 0], sizes = [%1, %2, 16, 1], strides = [1, 1, 1, 1] : tensor<?x?x16x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2} |

return |

} |

} |

} |
|
// Dispatch 2 executable: packs a dynamic ?x? f32 tensor into ?x?x16x16 tiles (tensor.pack, inner_tiles = [16, 16]). |
stream.executable private @matmul_dynamic_dispatch_2 { |

// Export: the workgroup count (%x, %y, %z) is derived from the four workload values. |
stream.executable.export public @matmul_dynamic_dispatch_2_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) { |

%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3 |

stream.return %x, %y, %z : index, index, index |

} |

builtin.module { |

// Kernel body: %arg0 is the read-only source binding, %arg5 the write-only destination binding; %arg1..%arg4 carry workload ordinals 0..3. |
func.func @matmul_dynamic_dispatch_2_pack_f32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: !stream.binding) { |

%c0 = arith.constant 0 : index |

// Padding value handed to tensor.pack below. |
%cst = arith.constant 0.000000e+00 : f32 |

%0 = flow.dispatch.workload.ordinal %arg1, 0 : index |

%1 = flow.dispatch.workload.ordinal %arg2, 1 : index |

%2 = flow.dispatch.workload.ordinal %arg3, 2 : index |

%3 = flow.dispatch.workload.ordinal %arg4, 3 : index |

// Source view has dynamic dims {%0, %1}; destination view has outer dims {%2, %3}. |
%4 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1} |

%5 = stream.binding.subspan %arg5[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%2, %3} |

// Load the whole source tensor (zero offsets, unit strides). |
%6 = flow.dispatch.tensor.load %4, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1} -> tensor<?x?xf32> |

// %7 = %1 ceildiv 16 and %8 = %0 ceildiv 16 (#map is declared at module scope); note they are computed in the opposite order to their use in tensor.empty. |
%7 = affine.apply #map()[%1] |

%8 = affine.apply #map()[%0] |

%9 = tensor.empty(%8, %7) : tensor<?x?x16x16xf32> |

// Tile both dims by 16, keeping the outer dim order; %cst (0.0) is the padding value. |
%pack = tensor.pack %6 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %9 : tensor<?x?xf32> -> tensor<?x?x16x16xf32> |

// The store sizes use ordinals %2/%3 for the outer dims, whereas the tensor was built with %8/%7. |
// NOTE(review): the host call site in this module passes the ceildiv-by-16 values as ordinals 2 and 3, so they should agree - confirm if the caller changes. |
flow.dispatch.tensor.store %pack, %5, offsets = [0, 0, 0, 0], sizes = [%2, %3, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%2, %3} |

return |

} |

} |

} |
|
// Dispatch 3 executable: linalg.mmt4d over two ?x?x16x1 f32 inputs, accumulating into a ?x?x16x16 f32 tensor that is read and written through the same binding. |
stream.executable private @matmul_dynamic_dispatch_3 { |

// Export: the workgroup count (%x, %y, %z) is derived from the six workload values. |
stream.executable.export public @matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index) -> (index, index, index) { |

%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3, %arg4, %arg5 |

stream.return %x, %y, %z : index, index, index |

} |

builtin.module { |

// Kernel body: %arg0 and %arg1 are read-only input bindings, %arg2 is the read-write accumulator binding; %arg3..%arg8 carry workload ordinals 0..5. |
func.func @matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index) { |

%c0 = arith.constant 0 : index |

// Ordinals come in pairs: (%0, %1) size the first input, (%2, %3) the second, (%4, %5) the accumulator. |
%0 = flow.dispatch.workload.ordinal %arg3, 0 : index |

%1 = flow.dispatch.workload.ordinal %arg4, 1 : index |

%2 = flow.dispatch.workload.ordinal %arg5, 2 : index |

%3 = flow.dispatch.workload.ordinal %arg6, 3 : index |

%4 = flow.dispatch.workload.ordinal %arg7, 4 : index |

%5 = flow.dispatch.workload.ordinal %arg8, 5 : index |

// Views over the three bindings, all at offset %c0. |
%6 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%0, %1} |

%7 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%2, %3} |

%8 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5} |

// Load all three tensors in full (zero offsets, unit strides). |
%9 = flow.dispatch.tensor.load %6, offsets = [0, 0, 0, 0], sizes = [%0, %1, 16, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%0, %1} -> tensor<?x?x16x1xf32> |

%10 = flow.dispatch.tensor.load %7, offsets = [0, 0, 0, 0], sizes = [%2, %3, 16, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%2, %3} -> tensor<?x?x16x1xf32> |

%11 = flow.dispatch.tensor.load %8, offsets = [0, 0, 0, 0], sizes = [%4, %5, 16, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5} -> tensor<?x?x16x16xf32> |

// %11 (the current contents of the read-write binding) is the outs operand, so the result accumulates onto it. |
%12 = linalg.mmt4d ins(%9, %10 : tensor<?x?x16x1xf32>, tensor<?x?x16x1xf32>) outs(%11 : tensor<?x?x16x16xf32>) -> tensor<?x?x16x16xf32> |

// Write the result back through the same view (%8) that %11 was loaded from. |
flow.dispatch.tensor.store %12, %8, offsets = [0, 0, 0, 0], sizes = [%4, %5, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5} |

return |

} |

} |

} |
|
// Dispatch 4 executable: unpacks a ?x?x16x16 f32 tiled tensor back into a dynamic ?x? f32 tensor (tensor.unpack, inner_tiles = [16, 16]). |
stream.executable private @matmul_dynamic_dispatch_4 { |

// Export: the workgroup count (%x, %y, %z) is derived from the four workload values. |
stream.executable.export public @matmul_dynamic_dispatch_4_unpack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) { |

%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3 |

stream.return %x, %y, %z : index, index, index |

} |

builtin.module { |

// Kernel body: %arg0 is the read-only tiled source binding, %arg5 the write-only destination binding; %arg1..%arg4 carry workload ordinals 0..3. |
func.func @matmul_dynamic_dispatch_4_unpack_f32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: !stream.binding) { |

%c0 = arith.constant 0 : index |

%0 = flow.dispatch.workload.ordinal %arg1, 0 : index |

%1 = flow.dispatch.workload.ordinal %arg2, 1 : index |

%2 = flow.dispatch.workload.ordinal %arg3, 2 : index |

%3 = flow.dispatch.workload.ordinal %arg4, 3 : index |

// Tiled source view has outer dims {%0, %1}; destination view has dims {%2, %3}. |
%4 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?x16x16xf32>>{%0, %1} |

%5 = stream.binding.subspan %arg5[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%2, %3} |

// Load the whole tiled tensor (zero offsets, unit strides). |
%6 = flow.dispatch.tensor.load %4, offsets = [0, 0, 0, 0], sizes = [%0, %1, 16, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x16xf32>>{%0, %1} -> tensor<?x?x16x16xf32> |

// Destination of the unpack, sized {%2, %3} directly from the workload ordinals. |
%7 = tensor.empty(%2, %3) : tensor<?x?xf32> |

// No padding_value operand here: unpack writes into the {%2, %3}-shaped destination %7. |
%unpack = tensor.unpack %6 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %7 : tensor<?x?x16x16xf32> -> tensor<?x?xf32> |

flow.dispatch.tensor.store %unpack, %5, offsets = [0, 0], sizes = [%2, %3], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%2, %3} |

return |

} |

} |

} |
|
// Host-side entry point (marked iree.abi.stub): imports three buffer views, runs dispatches 0-4 (pack, pack, pack, mmt4d, unpack) and exports the result as a buffer view. |
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} { |

%c0 = arith.constant 0 : index |

// --- input0: query dims (%0, %1), assert f32 + dense_row_major, size it (%2), import as an external resource and transfer to a `*` resource (%4). |
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index |

%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index |

// Element/encoding type constants shared by all three buffer-view asserts below. |
%element_type_f32 = hal.element_type<f32> : i32 |

%dense_row_major = hal.encoding_type<dense_row_major> : i32 |

hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1]) type(%element_type_f32) encoding(%dense_row_major) |

%2 = stream.tensor.sizeof tensor<?x?xf32>{%0, %1} : index |

%3 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<?x?xf32>{%0, %1} in !stream.resource<external>{%2} |

%4 = stream.async.transfer %3 : !stream.resource<external>{%2} -> !stream.resource<*>{%2} |

// --- input1: same sequence; dims (%5, %6), size %7, resource %9. |
%5 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index |

%6 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index |

hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%5, %6]) type(%element_type_f32) encoding(%dense_row_major) |

%7 = stream.tensor.sizeof tensor<?x?xf32>{%5, %6} : index |

%8 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<?x?xf32>{%5, %6} in !stream.resource<external>{%7} |

%9 = stream.async.transfer %8 : !stream.resource<external>{%7} -> !stream.resource<*>{%7} |

// --- input2: same sequence; dims (%10, %11), size %12, resource %14. |
%10 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index |

%11 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index |

hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("input2") shape([%10, %11]) type(%element_type_f32) encoding(%dense_row_major) |

%12 = stream.tensor.sizeof tensor<?x?xf32>{%10, %11} : index |

%13 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<?x?xf32>{%10, %11} in !stream.resource<external>{%12} |

%14 = stream.async.transfer %13 : !stream.resource<external>{%12} -> !stream.resource<*>{%12} |

// --- Dispatch 0: pack input0. %15 = ceildiv(%0, 16) is the outer tile count along dim 0; the packed result %17 has size %16. |
%15 = affine.apply #map()[%0] |

%16 = stream.tensor.sizeof tensor<?x?x16x1xf32>{%15, %1} : index |

%17 = stream.async.dispatch @matmul_dynamic_dispatch_0::@matmul_dynamic_dispatch_0_pack_f32[%0, %15, %1](%4[%c0 to %2 for %2], %0, %15, %1) : (!stream.resource<*>{%2}, index, index, index) -> !stream.resource<*>{%16} |

// --- Dispatch 1: pack input1. Here the tiled dim is dim 1 (%6): %18 = ceildiv(%6, 16); the packed result %20 has size %19. |
%18 = affine.apply #map()[%6] |

%19 = stream.tensor.sizeof tensor<?x?x16x1xf32>{%18, %5} : index |

%20 = stream.async.dispatch @matmul_dynamic_dispatch_1::@matmul_dynamic_dispatch_1_pack_f32[%6, %18, %5](%9[%c0 to %7 for %7], %6, %18, %5) : (!stream.resource<*>{%7}, index, index, index) -> !stream.resource<*>{%19} |

// --- Dispatch 2: pack input2 into 16x16 tiles. %21/%22 = ceildiv of %10/%11 by 16; the packed result %24 has size %23. |
%21 = affine.apply #map()[%10] |

%22 = affine.apply #map()[%11] |

%23 = stream.tensor.sizeof tensor<?x?x16x16xf32>{%21, %22} : index |

%24 = stream.async.dispatch @matmul_dynamic_dispatch_2::@matmul_dynamic_dispatch_2_pack_f32[%10, %11, %21, %22](%14[%c0 to %12 for %12], %10, %11, %21, %22) : (!stream.resource<*>{%12}, index, index, index, index) -> !stream.resource<*>{%23} |

// --- Dispatch 3: mmt4d over the packed %17 and %20 with packed %24 as the third resource operand. |
// The result type is written `-> %24{%23}`, i.e. the result is tied to operand %24 rather than declared as a new resource type. |
%25 = stream.async.dispatch @matmul_dynamic_dispatch_3::@matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32[%15, %1, %18, %5, %21, %22](%17[%c0 to %16 for %16], %20[%c0 to %19 for %19], %24[%c0 to %23 for %23], %15, %1, %18, %5, %21, %22) : (!stream.resource<*>{%16}, !stream.resource<*>{%19}, !stream.resource<*>{%23}, index, index, index, index, index, index) -> %24{%23} |

// --- Dispatch 4: unpack %25 back to a {%10, %11} tensor; the result size %12 matches input2's size. |
%26 = stream.async.dispatch @matmul_dynamic_dispatch_4::@matmul_dynamic_dispatch_4_unpack_f32[%21, %22, %10, %11](%25[%c0 to %23 for %23], %21, %22, %10, %11) : (!stream.resource<*>{%23}, index, index, index, index) -> !stream.resource<*>{%12} |

// --- Transfer the result to an external resource and export it as a buffer view with input2's dims. |
%27 = stream.async.transfer %26 : !stream.resource<*>{%12} -> !stream.resource<external>{%12} |

%28 = stream.tensor.export %27 : tensor<?x?xf32>{%10, %11} in !stream.resource<external>{%12} -> !hal.buffer_view |

return %28 : !hal.buffer_view |

} |
|
} |
|
|
|
|
|
// -----// IR Dump After CombineInitializers (iree-util-combine-initializers) //----- // |
|
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "znver4", cpu_features = "+mmx,+popcnt,+sse,+sse2,+sse3,+ssse3,+sse4.1,+sse4.2,+avx,+avx2,+sse4a,+fma,+avx512f,+bmi,+bmi2,+aes,+pclmul,+avx512vl,+avx512bw,+avx512dq,+avx512cd,+avx512vbmi,+avx512ifma,+avx512vpopcntdq,+avx512vbmi2,+gfni,+vpclmulqdq,+avx512vnni,+avx512bitalg,+avx512bf16,+adx,+clflushopt,+clwb,+clzero,+cx16,+cx8,+f16c,+fsgsbase,+crc32,+invpcid,+rdpru,+sahf,+lzcnt,+movbe,+mwaitx,+x87,+pku,+prfchw,+rdpid,+rdrnd,+rdseed,+sha,+shstk,+vaes,+wbnoinvd,+xsave,+xsavec,+xsaveopt,+xsaves,+fxsr,+evex512", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "all"}> |
|
#map = affine_map<()[s0] -> (s0 ceildiv 16)> |
|
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#executable_target_embedded_elf_x86_64_]}> |
|
module attributes {hal.device.targets = [#device_target_llvm_cpu]} { |
|
// Dispatch 0 executable: packs a dynamic ?x? f32 tensor into ?x?x16x1 tiles (tensor.pack, inner_tiles = [16, 1]). |
stream.executable private @matmul_dynamic_dispatch_0 { |

// Export: the workgroup count (%x, %y, %z) is derived from the three workload values. |
stream.executable.export public @matmul_dynamic_dispatch_0_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) { |

%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2 |

stream.return %x, %y, %z : index, index, index |

} |

builtin.module { |

// Kernel body: %arg0 is the read-only source binding, %arg4 the write-only destination binding; %arg1..%arg3 carry workload ordinals 0..2. |
func.func @matmul_dynamic_dispatch_0_pack_f32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: !stream.binding) { |

%c0 = arith.constant 0 : index |

// Padding value handed to tensor.pack below. |
%cst = arith.constant 0.000000e+00 : f32 |

%0 = flow.dispatch.workload.ordinal %arg1, 0 : index |

%1 = flow.dispatch.workload.ordinal %arg2, 1 : index |

%2 = flow.dispatch.workload.ordinal %arg3, 2 : index |

// Source view has dynamic dims {%0, %2}; destination view has outer dims {%1, %2}. Both start at offset %c0. |
%3 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %2} |

%4 = stream.binding.subspan %arg4[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2} |

// Load the whole source tensor (zero offsets, unit strides). |
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [%0, %2], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %2} -> tensor<?x?xf32> |

// %6 = %0 ceildiv 16 (#map is declared at module scope); used as outer dim 0 of the packed tensor. |
%6 = affine.apply #map()[%0] |

%7 = tensor.empty(%6, %2) : tensor<?x?x16x1xf32> |

// Tile dim 0 by 16 and dim 1 by 1, keeping the outer dim order; %cst (0.0) is the padding value. |
%pack = tensor.pack %5 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 1] into %7 : tensor<?x?xf32> -> tensor<?x?x16x1xf32> |

// The store sizes use ordinal %1 for outer dim 0, whereas the tensor was built with %6. |
// NOTE(review): the host call site in this module passes ceildiv(dim0, 16) as ordinal 1, so the two should agree - confirm if the caller changes. |
flow.dispatch.tensor.store %pack, %4, offsets = [0, 0, 0, 0], sizes = [%1, %2, 16, 1], strides = [1, 1, 1, 1] : tensor<?x?x16x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2} |

return |

} |

} |

} |
|
// Dispatch 1 executable: packs a dynamic ?x? f32 tensor into ?x?x16x1 tiles with transposed outer dims (outer_dims_perm = [1, 0]). |
stream.executable private @matmul_dynamic_dispatch_1 { |

// Export: the workgroup count (%x, %y, %z) is derived from the three workload values. |
stream.executable.export public @matmul_dynamic_dispatch_1_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) { |

%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2 |

stream.return %x, %y, %z : index, index, index |

} |

builtin.module { |

// Kernel body: %arg0 is the read-only source binding, %arg4 the write-only destination binding; %arg1..%arg3 carry workload ordinals 0..2. |
func.func @matmul_dynamic_dispatch_1_pack_f32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: !stream.binding) { |

%c0 = arith.constant 0 : index |

// Padding value handed to tensor.pack below. |
%cst = arith.constant 0.000000e+00 : f32 |

%0 = flow.dispatch.workload.ordinal %arg1, 0 : index |

%1 = flow.dispatch.workload.ordinal %arg2, 1 : index |

%2 = flow.dispatch.workload.ordinal %arg3, 2 : index |

// Note the source dims are {%2, %0} (ordinal 2 first), unlike dispatch 0; destination outer dims are {%1, %2}. |
%3 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %0} |

%4 = stream.binding.subspan %arg4[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2} |

// Load the whole source tensor (zero offsets, unit strides). |
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [%2, %0], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %0} -> tensor<?x?xf32> |

// %6 = %0 ceildiv 16 (#map is declared at module scope); %0 is the source's dim 1 here. |
%6 = affine.apply #map()[%0] |

%7 = tensor.empty(%6, %2) : tensor<?x?x16x1xf32> |

// inner_dims_pos = [1, 0] with inner_tiles = [16, 1]: source dim 1 is tiled by 16 and dim 0 by 1; outer_dims_perm = [1, 0] swaps the outer dims. |
%pack = tensor.pack %5 padding_value(%cst : f32) outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [16, 1] into %7 : tensor<?x?xf32> -> tensor<?x?x16x1xf32> |

// The store sizes use ordinal %1 for outer dim 0, whereas the tensor was built with %6. |
// NOTE(review): the host call site in this module passes the ceildiv-by-16 value as ordinal 1, so the two should agree - confirm if the caller changes. |
flow.dispatch.tensor.store %pack, %4, offsets = [0, 0, 0, 0], sizes = [%1, %2, 16, 1], strides = [1, 1, 1, 1] : tensor<?x?x16x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2} |

return |

} |

} |

} |
|
// Dispatch 2 executable: packs a dynamic ?x? f32 tensor into ?x?x16x16 tiles (tensor.pack, inner_tiles = [16, 16]). |
stream.executable private @matmul_dynamic_dispatch_2 { |

// Export: the workgroup count (%x, %y, %z) is derived from the four workload values. |
stream.executable.export public @matmul_dynamic_dispatch_2_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) { |

%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3 |

stream.return %x, %y, %z : index, index, index |

} |

builtin.module { |

// Kernel body: %arg0 is the read-only source binding, %arg5 the write-only destination binding; %arg1..%arg4 carry workload ordinals 0..3. |
func.func @matmul_dynamic_dispatch_2_pack_f32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: !stream.binding) { |

%c0 = arith.constant 0 : index |

// Padding value handed to tensor.pack below. |
%cst = arith.constant 0.000000e+00 : f32 |

%0 = flow.dispatch.workload.ordinal %arg1, 0 : index |

%1 = flow.dispatch.workload.ordinal %arg2, 1 : index |

%2 = flow.dispatch.workload.ordinal %arg3, 2 : index |

%3 = flow.dispatch.workload.ordinal %arg4, 3 : index |

// Source view has dynamic dims {%0, %1}; destination view has outer dims {%2, %3}. |
%4 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1} |

%5 = stream.binding.subspan %arg5[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%2, %3} |

// Load the whole source tensor (zero offsets, unit strides). |
%6 = flow.dispatch.tensor.load %4, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1} -> tensor<?x?xf32> |

// %7 = %1 ceildiv 16 and %8 = %0 ceildiv 16 (#map is declared at module scope); note they are computed in the opposite order to their use in tensor.empty. |
%7 = affine.apply #map()[%1] |

%8 = affine.apply #map()[%0] |

%9 = tensor.empty(%8, %7) : tensor<?x?x16x16xf32> |

// Tile both dims by 16, keeping the outer dim order; %cst (0.0) is the padding value. |
%pack = tensor.pack %6 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %9 : tensor<?x?xf32> -> tensor<?x?x16x16xf32> |

// The store sizes use ordinals %2/%3 for the outer dims, whereas the tensor was built with %8/%7. |
// NOTE(review): the host call site in this module passes the ceildiv-by-16 values as ordinals 2 and 3, so they should agree - confirm if the caller changes. |
flow.dispatch.tensor.store %pack, %5, offsets = [0, 0, 0, 0], sizes = [%2, %3, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%2, %3} |

return |

} |

} |

} |
|
// Dispatch 3 executable: linalg.mmt4d over two ?x?x16x1 f32 inputs, accumulating into a ?x?x16x16 f32 tensor that is read and written through the same binding. |
stream.executable private @matmul_dynamic_dispatch_3 { |

// Export: the workgroup count (%x, %y, %z) is derived from the six workload values. |
stream.executable.export public @matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index) -> (index, index, index) { |

%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3, %arg4, %arg5 |

stream.return %x, %y, %z : index, index, index |

} |

builtin.module { |

// Kernel body: %arg0 and %arg1 are read-only input bindings, %arg2 is the read-write accumulator binding; %arg3..%arg8 carry workload ordinals 0..5. |
func.func @matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index) { |

%c0 = arith.constant 0 : index |

// Ordinals come in pairs: (%0, %1) size the first input, (%2, %3) the second, (%4, %5) the accumulator. |
%0 = flow.dispatch.workload.ordinal %arg3, 0 : index |

%1 = flow.dispatch.workload.ordinal %arg4, 1 : index |

%2 = flow.dispatch.workload.ordinal %arg5, 2 : index |

%3 = flow.dispatch.workload.ordinal %arg6, 3 : index |

%4 = flow.dispatch.workload.ordinal %arg7, 4 : index |

%5 = flow.dispatch.workload.ordinal %arg8, 5 : index |

// Views over the three bindings, all at offset %c0. |
%6 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%0, %1} |

%7 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%2, %3} |

%8 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5} |

// Load all three tensors in full (zero offsets, unit strides). |
%9 = flow.dispatch.tensor.load %6, offsets = [0, 0, 0, 0], sizes = [%0, %1, 16, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%0, %1} -> tensor<?x?x16x1xf32> |

%10 = flow.dispatch.tensor.load %7, offsets = [0, 0, 0, 0], sizes = [%2, %3, 16, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%2, %3} -> tensor<?x?x16x1xf32> |

%11 = flow.dispatch.tensor.load %8, offsets = [0, 0, 0, 0], sizes = [%4, %5, 16, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5} -> tensor<?x?x16x16xf32> |

// %11 (the current contents of the read-write binding) is the outs operand, so the result accumulates onto it. |
%12 = linalg.mmt4d ins(%9, %10 : tensor<?x?x16x1xf32>, tensor<?x?x16x1xf32>) outs(%11 : tensor<?x?x16x16xf32>) -> tensor<?x?x16x16xf32> |

// Write the result back through the same view (%8) that %11 was loaded from. |
flow.dispatch.tensor.store %12, %8, offsets = [0, 0, 0, 0], sizes = [%4, %5, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5} |

return |

} |

} |

} |
|
// Dispatch 4 executable: unpacks a ?x?x16x16 f32 tiled tensor back into a dynamic ?x? f32 tensor (tensor.unpack, inner_tiles = [16, 16]). |
stream.executable private @matmul_dynamic_dispatch_4 { |

// Export: the workgroup count (%x, %y, %z) is derived from the four workload values. |
stream.executable.export public @matmul_dynamic_dispatch_4_unpack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) { |

%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3 |

stream.return %x, %y, %z : index, index, index |

} |

builtin.module { |

// Kernel body: %arg0 is the read-only tiled source binding, %arg5 the write-only destination binding; %arg1..%arg4 carry workload ordinals 0..3. |
func.func @matmul_dynamic_dispatch_4_unpack_f32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: !stream.binding) { |

%c0 = arith.constant 0 : index |

%0 = flow.dispatch.workload.ordinal %arg1, 0 : index |

%1 = flow.dispatch.workload.ordinal %arg2, 1 : index |

%2 = flow.dispatch.workload.ordinal %arg3, 2 : index |

%3 = flow.dispatch.workload.ordinal %arg4, 3 : index |

// Tiled source view has outer dims {%0, %1}; destination view has dims {%2, %3}. |
%4 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?x16x16xf32>>{%0, %1} |

%5 = stream.binding.subspan %arg5[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%2, %3} |

// Load the whole tiled tensor (zero offsets, unit strides). |
%6 = flow.dispatch.tensor.load %4, offsets = [0, 0, 0, 0], sizes = [%0, %1, 16, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x16xf32>>{%0, %1} -> tensor<?x?x16x16xf32> |

// Destination of the unpack, sized {%2, %3} directly from the workload ordinals. |
%7 = tensor.empty(%2, %3) : tensor<?x?xf32> |

// No padding_value operand here: unpack writes into the {%2, %3}-shaped destination %7. |
%unpack = tensor.unpack %6 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %7 : tensor<?x?x16x16xf32> -> tensor<?x?xf32> |

flow.dispatch.tensor.store %unpack, %5, offsets = [0, 0], sizes = [%2, %3], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%2, %3} |

return |

} |

} |

} |
|
// Host-side entry point (marked iree.abi.stub): imports three buffer views, runs dispatches 0-4 (pack, pack, pack, mmt4d, unpack) and exports the result as a buffer view. |
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} { |

%c0 = arith.constant 0 : index |

// --- input0: query dims (%0, %1), assert f32 + dense_row_major, size it (%2), import as an external resource and transfer to a `*` resource (%4). |
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index |

%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index |

// Element/encoding type constants shared by all three buffer-view asserts below. |
%element_type_f32 = hal.element_type<f32> : i32 |

%dense_row_major = hal.encoding_type<dense_row_major> : i32 |

hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1]) type(%element_type_f32) encoding(%dense_row_major) |

%2 = stream.tensor.sizeof tensor<?x?xf32>{%0, %1} : index |

%3 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<?x?xf32>{%0, %1} in !stream.resource<external>{%2} |

%4 = stream.async.transfer %3 : !stream.resource<external>{%2} -> !stream.resource<*>{%2} |

// --- input1: same sequence; dims (%5, %6), size %7, resource %9. |
%5 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index |

%6 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index |

hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%5, %6]) type(%element_type_f32) encoding(%dense_row_major) |

%7 = stream.tensor.sizeof tensor<?x?xf32>{%5, %6} : index |

%8 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<?x?xf32>{%5, %6} in !stream.resource<external>{%7} |

%9 = stream.async.transfer %8 : !stream.resource<external>{%7} -> !stream.resource<*>{%7} |

// --- input2: same sequence; dims (%10, %11), size %12, resource %14. |
%10 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index |

%11 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index |

hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("input2") shape([%10, %11]) type(%element_type_f32) encoding(%dense_row_major) |

%12 = stream.tensor.sizeof tensor<?x?xf32>{%10, %11} : index |

%13 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<?x?xf32>{%10, %11} in !stream.resource<external>{%12} |

%14 = stream.async.transfer %13 : !stream.resource<external>{%12} -> !stream.resource<*>{%12} |

// --- Dispatch 0: pack input0. %15 = ceildiv(%0, 16) is the outer tile count along dim 0; the packed result %17 has size %16. |
%15 = affine.apply #map()[%0] |

%16 = stream.tensor.sizeof tensor<?x?x16x1xf32>{%15, %1} : index |

%17 = stream.async.dispatch @matmul_dynamic_dispatch_0::@matmul_dynamic_dispatch_0_pack_f32[%0, %15, %1](%4[%c0 to %2 for %2], %0, %15, %1) : (!stream.resource<*>{%2}, index, index, index) -> !stream.resource<*>{%16} |

// --- Dispatch 1: pack input1. Here the tiled dim is dim 1 (%6): %18 = ceildiv(%6, 16); the packed result %20 has size %19. |
%18 = affine.apply #map()[%6] |

%19 = stream.tensor.sizeof tensor<?x?x16x1xf32>{%18, %5} : index |

%20 = stream.async.dispatch @matmul_dynamic_dispatch_1::@matmul_dynamic_dispatch_1_pack_f32[%6, %18, %5](%9[%c0 to %7 for %7], %6, %18, %5) : (!stream.resource<*>{%7}, index, index, index) -> !stream.resource<*>{%19} |

// --- Dispatch 2: pack input2 into 16x16 tiles. %21/%22 = ceildiv of %10/%11 by 16; the packed result %24 has size %23. |
%21 = affine.apply #map()[%10] |

%22 = affine.apply #map()[%11] |

%23 = stream.tensor.sizeof tensor<?x?x16x16xf32>{%21, %22} : index |

%24 = stream.async.dispatch @matmul_dynamic_dispatch_2::@matmul_dynamic_dispatch_2_pack_f32[%10, %11, %21, %22](%14[%c0 to %12 for %12], %10, %11, %21, %22) : (!stream.resource<*>{%12}, index, index, index, index) -> !stream.resource<*>{%23} |

// --- Dispatch 3: mmt4d over the packed %17 and %20 with packed %24 as the third resource operand. |
// The result type is written `-> %24{%23}`, i.e. the result is tied to operand %24 rather than declared as a new resource type. |
%25 = stream.async.dispatch @matmul_dynamic_dispatch_3::@matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32[%15, %1, %18, %5, %21, %22](%17[%c0 to %16 for %16], %20[%c0 to %19 for %19], %24[%c0 to %23 for %23], %15, %1, %18, %5, %21, %22) : (!stream.resource<*>{%16}, !stream.resource<*>{%19}, !stream.resource<*>{%23}, index, index, index, index, index, index) -> %24{%23} |

// --- Dispatch 4: unpack %25 back to a {%10, %11} tensor; the result size %12 matches input2's size. |
%26 = stream.async.dispatch @matmul_dynamic_dispatch_4::@matmul_dynamic_dispatch_4_unpack_f32[%21, %22, %10, %11](%25[%c0 to %23 for %23], %21, %22, %10, %11) : (!stream.resource<*>{%23}, index, index, index, index) -> !stream.resource<*>{%12} |

// --- Transfer the result to an external resource and export it as a buffer view with input2's dims. |
%27 = stream.async.transfer %26 : !stream.resource<*>{%12} -> !stream.resource<external>{%12} |

%28 = stream.tensor.export %27 : tensor<?x?xf32>{%10, %11} in !stream.resource<external>{%12} -> !hal.buffer_view |

return %28 : !hal.buffer_view |

} |
|
} |
|
|
|
|
|
// -----// IR Dump After EncodeDeviceTensorsPass (iree-stream-encode-device-tensors) //----- // |
|
// Dispatch 0 executable (printed standalone by this pass, so the affine map appears inline): packs a dynamic ?x? f32 tensor into ?x?x16x1 tiles. |
stream.executable private @matmul_dynamic_dispatch_0 { |

// Export: the workgroup count (%x, %y, %z) is derived from the three workload values. |
stream.executable.export public @matmul_dynamic_dispatch_0_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) { |

%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2 |

stream.return %x, %y, %z : index, index, index |

} |

builtin.module { |

// Kernel body: %arg0 is the read-only source binding, %arg4 the write-only destination binding; %arg1..%arg3 carry workload ordinals 0..2. |
func.func @matmul_dynamic_dispatch_0_pack_f32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: !stream.binding) { |

%c0 = arith.constant 0 : index |

// Padding value handed to tensor.pack below. |
%cst = arith.constant 0.000000e+00 : f32 |

%0 = flow.dispatch.workload.ordinal %arg1, 0 : index |

%1 = flow.dispatch.workload.ordinal %arg2, 1 : index |

%2 = flow.dispatch.workload.ordinal %arg3, 2 : index |

// Source view has dynamic dims {%0, %2}; destination view has outer dims {%1, %2}. Both start at offset %c0. |
%3 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %2} |

%4 = stream.binding.subspan %arg4[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2} |

// Load the whole source tensor (zero offsets, unit strides). |
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [%0, %2], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %2} -> tensor<?x?xf32> |

// %6 = %0 ceildiv 16; used as outer dim 0 of the packed tensor. |
%6 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%0] |

%7 = tensor.empty(%6, %2) : tensor<?x?x16x1xf32> |

// Tile dim 0 by 16 and dim 1 by 1, keeping the outer dim order; %cst (0.0) is the padding value. |
%pack = tensor.pack %5 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 1] into %7 : tensor<?x?xf32> -> tensor<?x?x16x1xf32> |

// The store sizes use ordinal %1 for outer dim 0, whereas the tensor was built with %6. |
// NOTE(review): presumably the caller passes ceildiv(dim0, 16) as ordinal 1 so the two agree - the call site is not part of this standalone dump; confirm. |
flow.dispatch.tensor.store %pack, %4, offsets = [0, 0, 0, 0], sizes = [%1, %2, 16, 1], strides = [1, 1, 1, 1] : tensor<?x?x16x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2} |

return |

} |

} |

} |
|
|
|
// -----// IR Dump After EncodeDeviceTensorsPass (iree-stream-encode-device-tensors) //----- // |
|
// Dispatch 1 executable (printed standalone by this pass, so the affine map appears inline): packs a dynamic ?x? f32 tensor into ?x?x16x1 tiles with transposed outer dims. |
stream.executable private @matmul_dynamic_dispatch_1 { |

// Export: the workgroup count (%x, %y, %z) is derived from the three workload values. |
stream.executable.export public @matmul_dynamic_dispatch_1_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) { |

%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2 |

stream.return %x, %y, %z : index, index, index |

} |

builtin.module { |

// Kernel body: %arg0 is the read-only source binding, %arg4 the write-only destination binding; %arg1..%arg3 carry workload ordinals 0..2. |
func.func @matmul_dynamic_dispatch_1_pack_f32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: !stream.binding) { |

%c0 = arith.constant 0 : index |

// Padding value handed to tensor.pack below. |
%cst = arith.constant 0.000000e+00 : f32 |

%0 = flow.dispatch.workload.ordinal %arg1, 0 : index |

%1 = flow.dispatch.workload.ordinal %arg2, 1 : index |

%2 = flow.dispatch.workload.ordinal %arg3, 2 : index |

// Note the source dims are {%2, %0} (ordinal 2 first); destination outer dims are {%1, %2}. |
%3 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %0} |

%4 = stream.binding.subspan %arg4[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2} |

// Load the whole source tensor (zero offsets, unit strides). |
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [%2, %0], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %0} -> tensor<?x?xf32> |

// %6 = %0 ceildiv 16; %0 is the source's dim 1 here. |
%6 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%0] |

%7 = tensor.empty(%6, %2) : tensor<?x?x16x1xf32> |

// inner_dims_pos = [1, 0] with inner_tiles = [16, 1]: source dim 1 is tiled by 16 and dim 0 by 1; outer_dims_perm = [1, 0] swaps the outer dims. |
%pack = tensor.pack %5 padding_value(%cst : f32) outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [16, 1] into %7 : tensor<?x?xf32> -> tensor<?x?x16x1xf32> |

// The store sizes use ordinal %1 for outer dim 0, whereas the tensor was built with %6. |
// NOTE(review): presumably the caller passes the ceildiv-by-16 value as ordinal 1 so the two agree - the call site is not part of this standalone dump; confirm. |
flow.dispatch.tensor.store %pack, %4, offsets = [0, 0, 0, 0], sizes = [%1, %2, 16, 1], strides = [1, 1, 1, 1] : tensor<?x?x16x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2} |

return |

} |

} |

} |
|
|
|
// -----// IR Dump After EncodeDeviceTensorsPass (iree-stream-encode-device-tensors) //----- // |
|
// Executable with a single export that packs a dynamically shaped 2-D f32
// tensor into a ?x?x16x16 tiled layout (tensor.pack, padded with 0.0).
stream.executable private @matmul_dynamic_dispatch_2 { |
|
// Export region: derives the (x, y, z) workgroup count from the four workload values.
stream.executable.export public @matmul_dynamic_dispatch_2_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) { |
|
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3 |
|
stream.return %x, %y, %z : index, index, index |
|
} |
|
builtin.module { |
|
// %arg0 is the read-only input binding and %arg5 the write-only output binding;
// %arg1..%arg4 are the workload values, tagged below as ordinals 0..3.
func.func @matmul_dynamic_dispatch_2_pack_f32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: !stream.binding) { |
|
%c0 = arith.constant 0 : index |
|
// Padding value handed to tensor.pack below.
%cst = arith.constant 0.000000e+00 : f32 |
|
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index |
|
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index |
|
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index |
|
%3 = flow.dispatch.workload.ordinal %arg4, 3 : index |
|
// Ordinals 0 and 1 are the dynamic dims of the 2-D input; ordinals 2 and 3 are the
// dynamic outer dims of the packed output binding.
%4 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1} |
|
%5 = stream.binding.subspan %arg5[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%2, %3} |
|
%6 = flow.dispatch.tensor.load %4, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1} -> tensor<?x?xf32> |
|
// Tile counts: %7 = ceildiv(%1, 16) for dim 1, %8 = ceildiv(%0, 16) for dim 0.
%7 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%1] |
|
%8 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%0] |
|
%9 = tensor.empty(%8, %7) : tensor<?x?x16x16xf32> |
|
// Identity permutations: both source dims are tiled by 16 in their original order.
%pack = tensor.pack %6 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %9 : tensor<?x?xf32> -> tensor<?x?x16x16xf32> |
|
// NOTE(review): the store is sized with {%2, %3} while the packed tensor was sized with
// {%8, %7}; this assumes the caller passes the matching ceildiv values as ordinals 2 and 3 - confirm at the dispatch site.
flow.dispatch.tensor.store %pack, %5, offsets = [0, 0, 0, 0], sizes = [%2, %3, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%2, %3} |
|
return |
|
} |
|
} |
|
} |
|
|
|
// -----// IR Dump After EncodeDeviceTensorsPass (iree-stream-encode-device-tensors) //----- // |
|
// Executable with a single export that runs linalg.mmt4d on two packed ?x?x16x1 f32
// inputs, using a packed ?x?x16x16 f32 read-write binding as the `outs` operand.
stream.executable private @matmul_dynamic_dispatch_3 { |
|
// Export region: derives the (x, y, z) workgroup count from the six workload values.
stream.executable.export public @matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index) -> (index, index, index) { |
|
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3, %arg4, %arg5 |
|
stream.return %x, %y, %z : index, index, index |
|
} |
|
builtin.module { |
|
// %arg0/%arg1 are the read-only input bindings and %arg2 the read-write binding;
// %arg3..%arg8 are the workload values, tagged below as ordinals 0..5.
func.func @matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index) { |
|
%c0 = arith.constant 0 : index |
|
%0 = flow.dispatch.workload.ordinal %arg3, 0 : index |
|
%1 = flow.dispatch.workload.ordinal %arg4, 1 : index |
|
%2 = flow.dispatch.workload.ordinal %arg5, 2 : index |
|
%3 = flow.dispatch.workload.ordinal %arg6, 3 : index |
|
%4 = flow.dispatch.workload.ordinal %arg7, 4 : index |
|
%5 = flow.dispatch.workload.ordinal %arg8, 5 : index |
|
// Each binding takes a consecutive ordinal pair as its dynamic outer dims:
// {%0, %1} for %arg0, {%2, %3} for %arg1 and {%4, %5} for %arg2.
%6 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%0, %1} |
|
%7 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%2, %3} |
|
%8 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5} |
|
%9 = flow.dispatch.tensor.load %6, offsets = [0, 0, 0, 0], sizes = [%0, %1, 16, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%0, %1} -> tensor<?x?x16x1xf32> |
|
%10 = flow.dispatch.tensor.load %7, offsets = [0, 0, 0, 0], sizes = [%2, %3, 16, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%2, %3} -> tensor<?x?x16x1xf32> |
|
// The read-write binding is loaded here and stored back below.
%11 = flow.dispatch.tensor.load %8, offsets = [0, 0, 0, 0], sizes = [%4, %5, 16, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5} -> tensor<?x?x16x16xf32> |
|
%12 = linalg.mmt4d ins(%9, %10 : tensor<?x?x16x1xf32>, tensor<?x?x16x1xf32>) outs(%11 : tensor<?x?x16x16xf32>) -> tensor<?x?x16x16xf32> |
|
// The result overwrites the full extent of the same binding (%8) that supplied %11.
flow.dispatch.tensor.store %12, %8, offsets = [0, 0, 0, 0], sizes = [%4, %5, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5} |
|
return |
|
} |
|
} |
|
} |
|
|
|
// -----// IR Dump After EncodeDeviceTensorsPass (iree-stream-encode-device-tensors) //----- // |
|
// Executable with a single export that unpacks a ?x?x16x16 tiled f32 tensor
// back into a dynamically shaped 2-D f32 tensor (tensor.unpack).
stream.executable private @matmul_dynamic_dispatch_4 { |
|
// Export region: derives the (x, y, z) workgroup count from the four workload values.
stream.executable.export public @matmul_dynamic_dispatch_4_unpack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) { |
|
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3 |
|
stream.return %x, %y, %z : index, index, index |
|
} |
|
builtin.module { |
|
// %arg0 is the read-only packed input binding and %arg5 the write-only 2-D output binding;
// %arg1..%arg4 are the workload values, tagged below as ordinals 0..3.
func.func @matmul_dynamic_dispatch_4_unpack_f32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: !stream.binding) { |
|
%c0 = arith.constant 0 : index |
|
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index |
|
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index |
|
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index |
|
%3 = flow.dispatch.workload.ordinal %arg4, 3 : index |
|
// Ordinals 0 and 1 are the dynamic outer dims of the packed input; ordinals 2 and 3
// are the dims of the unpacked 2-D output.
%4 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?x16x16xf32>>{%0, %1} |
|
%5 = stream.binding.subspan %arg5[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%2, %3} |
|
%6 = flow.dispatch.tensor.load %4, offsets = [0, 0, 0, 0], sizes = [%0, %1, 16, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x16xf32>>{%0, %1} -> tensor<?x?x16x16xf32> |
|
// Destination of the unpack, sized directly from ordinals 2 and 3.
%7 = tensor.empty(%2, %3) : tensor<?x?xf32> |
|
// Identity permutations with 16x16 inner tiles, mirroring the pack parameters in the op's own attributes.
%unpack = tensor.unpack %6 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %7 : tensor<?x?x16x16xf32> -> tensor<?x?xf32> |
|
flow.dispatch.tensor.store %unpack, %5, offsets = [0, 0], sizes = [%2, %3], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%2, %3} |
|
return |
|
} |
|
} |
|
} |
|
|
|
// -----// IR Dump After EncodeHostTensorsPass (iree-stream-encode-host-tensors) //----- // |
|
// Host-side entry point: imports the three input buffer views, issues five dispatches
// (three packs, one mmt4d, one unpack) and exports the result as a buffer view.
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} { |
|
// Multipliers used in the resource-size computations below: %c4 for the 2-D inputs,
// %c64 and %c1024 for the packed buffers (presumably bytes per f32 and per 16x1 / 16x16 f32 tile - confirm).
%c1024 = arith.constant 1024 : index |
|
%c64 = arith.constant 64 : index |
|
%c4 = arith.constant 4 : index |
|
%c0 = arith.constant 0 : index |
|
// input0: query both dims, assert shape / element type / encoding, then import.
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index |
|
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index |
|
%element_type_f32 = hal.element_type<f32> : i32 |
|
%dense_row_major = hal.encoding_type<dense_row_major> : i32 |
|
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1]) type(%element_type_f32) encoding(%dense_row_major) |
|
// Resource size of input0: %3 = %0 * 4 * %1.
%2 = arith.muli %0, %c4 : index |
|
%3 = arith.muli %2, %1 : index |
|
%4 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<?x?xf32>{%0, %1} in !stream.resource<external>{%3} |
|
%5 = stream.async.transfer %4 : !stream.resource<external>{%3} -> !stream.resource<*>{%3} |
|
// input1: same sequence as input0 (dims %6, %7; resource size %9 = %6 * 4 * %7).
%6 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index |
|
%7 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index |
|
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%6, %7]) type(%element_type_f32) encoding(%dense_row_major) |
|
%8 = arith.muli %6, %c4 : index |
|
%9 = arith.muli %8, %7 : index |
|
%10 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<?x?xf32>{%6, %7} in !stream.resource<external>{%9} |
|
%11 = stream.async.transfer %10 : !stream.resource<external>{%9} -> !stream.resource<*>{%9} |
|
// input2: same sequence again (dims %12, %13; resource size %15 = %12 * 4 * %13).
%12 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index |
|
%13 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index |
|
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("input2") shape([%12, %13]) type(%element_type_f32) encoding(%dense_row_major) |
|
%14 = arith.muli %12, %c4 : index |
|
%15 = arith.muli %14, %13 : index |
|
%16 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<?x?xf32>{%12, %13} in !stream.resource<external>{%15} |
|
%17 = stream.async.transfer %16 : !stream.resource<external>{%15} -> !stream.resource<*>{%15} |
|
// Pack input0 via dispatch_0: result size %20 = ceildiv(%0, 16) * 64 * %1.
%18 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%0] |
|
%19 = arith.muli %18, %c64 : index |
|
%20 = arith.muli %19, %1 : index |
|
%21 = stream.async.dispatch @matmul_dynamic_dispatch_0::@matmul_dynamic_dispatch_0_pack_f32[%0, %18, %1](%5[%c0 to %3 for %3], %0, %18, %1) : (!stream.resource<*>{%3}, index, index, index) -> !stream.resource<*>{%20} |
|
// Pack input1 via dispatch_1; the ceildiv is taken over %7 (dim 1 of input1).
// Result size %24 = ceildiv(%7, 16) * 64 * %6.
%22 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%7] |
|
%23 = arith.muli %22, %c64 : index |
|
%24 = arith.muli %23, %6 : index |
|
%25 = stream.async.dispatch @matmul_dynamic_dispatch_1::@matmul_dynamic_dispatch_1_pack_f32[%7, %22, %6](%11[%c0 to %9 for %9], %7, %22, %6) : (!stream.resource<*>{%9}, index, index, index) -> !stream.resource<*>{%24} |
|
// Pack input2 via dispatch_2: result size %29 = ceildiv(%12, 16) * 1024 * ceildiv(%13, 16).
%26 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%12] |
|
%27 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%13] |
|
%28 = arith.muli %26, %c1024 : index |
|
%29 = arith.muli %28, %27 : index |
|
%30 = stream.async.dispatch @matmul_dynamic_dispatch_2::@matmul_dynamic_dispatch_2_pack_f32[%12, %13, %26, %27](%17[%c0 to %15 for %15], %12, %13, %26, %27) : (!stream.resource<*>{%15}, index, index, index, index) -> !stream.resource<*>{%29} |
|
// mmt4d over the three packed buffers. The result is tied to operand %30 (`-> %30{%29}`),
// so the packed input2 resource is presumably updated in place - confirm against the stream dialect docs.
%31 = stream.async.dispatch @matmul_dynamic_dispatch_3::@matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32[%18, %1, %22, %6, %26, %27](%21[%c0 to %20 for %20], %25[%c0 to %24 for %24], %30[%c0 to %29 for %29], %18, %1, %22, %6, %26, %27) : (!stream.resource<*>{%20}, !stream.resource<*>{%24}, !stream.resource<*>{%29}, index, index, index, index, index, index) -> %30{%29} |
|
// Unpack the mmt4d result via dispatch_4 into a resource of size %15 (the same size as input2).
%32 = stream.async.dispatch @matmul_dynamic_dispatch_4::@matmul_dynamic_dispatch_4_unpack_f32[%26, %27, %12, %13](%31[%c0 to %29 for %29], %26, %27, %12, %13) : (!stream.resource<*>{%29}, index, index, index, index) -> !stream.resource<*>{%15} |
|
// Transfer to an external resource and export it as the returned {%12, %13} buffer view.
%33 = stream.async.transfer %32 : !stream.resource<*>{%15} -> !stream.resource<external>{%15} |
|
%34 = stream.tensor.export %33 : tensor<?x?xf32>{%12, %13} in !stream.resource<external>{%15} -> !hal.buffer_view |
|
return %34 : !hal.buffer_view |
|
} |
|
|
|
// -----// IR Dump After Canonicalizer (canonicalize) //----- // |
|
// Host-side entry point: imports the three input buffer views, issues five dispatches
// (three packs, one mmt4d, one unpack) and exports the result as a buffer view.
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} { |
|
// Multipliers used in the resource-size computations below: %c4 for the 2-D inputs,
// %c64 and %c1024 for the packed buffers (presumably bytes per f32 and per 16x1 / 16x16 f32 tile - confirm).
%c1024 = arith.constant 1024 : index |
|
%c64 = arith.constant 64 : index |
|
%c4 = arith.constant 4 : index |
|
%c0 = arith.constant 0 : index |
|
// input0: query both dims, assert shape / element type / encoding, then import.
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index |
|
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index |
|
%element_type_f32 = hal.element_type<f32> : i32 |
|
%dense_row_major = hal.encoding_type<dense_row_major> : i32 |
|
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1]) type(%element_type_f32) encoding(%dense_row_major) |
|
// Resource size of input0: %3 = %0 * 4 * %1.
%2 = arith.muli %0, %c4 : index |
|
%3 = arith.muli %2, %1 : index |
|
%4 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<?x?xf32>{%0, %1} in !stream.resource<external>{%3} |
|
%5 = stream.async.transfer %4 : !stream.resource<external>{%3} -> !stream.resource<*>{%3} |
|
// input1: same sequence as input0 (dims %6, %7; resource size %9 = %6 * 4 * %7).
%6 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index |
|
%7 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index |
|
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%6, %7]) type(%element_type_f32) encoding(%dense_row_major) |
|
%8 = arith.muli %6, %c4 : index |
|
%9 = arith.muli %8, %7 : index |
|
%10 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<?x?xf32>{%6, %7} in !stream.resource<external>{%9} |
|
%11 = stream.async.transfer %10 : !stream.resource<external>{%9} -> !stream.resource<*>{%9} |
|
// input2: same sequence again (dims %12, %13; resource size %15 = %12 * 4 * %13).
%12 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index |
|
%13 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index |
|
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("input2") shape([%12, %13]) type(%element_type_f32) encoding(%dense_row_major) |
|
%14 = arith.muli %12, %c4 : index |
|
%15 = arith.muli %14, %13 : index |
|
%16 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<?x?xf32>{%12, %13} in !stream.resource<external>{%15} |
|
%17 = stream.async.transfer %16 : !stream.resource<external>{%15} -> !stream.resource<*>{%15} |
|
// Pack input0 via dispatch_0: result size %20 = ceildiv(%0, 16) * 64 * %1.
%18 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%0] |
|
%19 = arith.muli %18, %c64 : index |
|
%20 = arith.muli %19, %1 : index |
|
%21 = stream.async.dispatch @matmul_dynamic_dispatch_0::@matmul_dynamic_dispatch_0_pack_f32[%0, %18, %1](%5[%c0 to %3 for %3], %0, %18, %1) : (!stream.resource<*>{%3}, index, index, index) -> !stream.resource<*>{%20} |
|
// Pack input1 via dispatch_1; the ceildiv is taken over %7 (dim 1 of input1).
// Result size %24 = ceildiv(%7, 16) * 64 * %6.
%22 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%7] |
|
%23 = arith.muli %22, %c64 : index |
|
%24 = arith.muli %23, %6 : index |
|
%25 = stream.async.dispatch @matmul_dynamic_dispatch_1::@matmul_dynamic_dispatch_1_pack_f32[%7, %22, %6](%11[%c0 to %9 for %9], %7, %22, %6) : (!stream.resource<*>{%9}, index, index, index) -> !stream.resource<*>{%24} |
|
// Pack input2 via dispatch_2: result size %29 = ceildiv(%12, 16) * 1024 * ceildiv(%13, 16).
%26 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%12] |
|
%27 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%13] |
|
%28 = arith.muli %26, %c1024 : index |
|
%29 = arith.muli %28, %27 : index |
|
%30 = stream.async.dispatch @matmul_dynamic_dispatch_2::@matmul_dynamic_dispatch_2_pack_f32[%12, %13, %26, %27](%17[%c0 to %15 for %15], %12, %13, %26, %27) : (!stream.resource<*>{%15}, index, index, index, index) -> !stream.resource<*>{%29} |
|
// mmt4d over the three packed buffers. The result is tied to operand %30 (`-> %30{%29}`),
// so the packed input2 resource is presumably updated in place - confirm against the stream dialect docs.
%31 = stream.async.dispatch @matmul_dynamic_dispatch_3::@matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32[%18, %1, %22, %6, %26, %27](%21[%c0 to %20 for %20], %25[%c0 to %24 for %24], %30[%c0 to %29 for %29], %18, %1, %22, %6, %26, %27) : (!stream.resource<*>{%20}, !stream.resource<*>{%24}, !stream.resource<*>{%29}, index, index, index, index, index, index) -> %30{%29} |
|
// Unpack the mmt4d result via dispatch_4 into a resource of size %15 (the same size as input2).
%32 = stream.async.dispatch @matmul_dynamic_dispatch_4::@matmul_dynamic_dispatch_4_unpack_f32[%26, %27, %12, %13](%31[%c0 to %29 for %29], %26, %27, %12, %13) : (!stream.resource<*>{%29}, index, index, index, index) -> !stream.resource<*>{%15} |
|
// Transfer to an external resource and export it as the returned {%12, %13} buffer view.
%33 = stream.async.transfer %32 : !stream.resource<*>{%15} -> !stream.resource<external>{%15} |
|
%34 = stream.tensor.export %33 : tensor<?x?xf32>{%12, %13} in !stream.resource<external>{%15} -> !hal.buffer_view |
|
return %34 : !hal.buffer_view |
|
} |
|
|
|
// -----// IR Dump After CSE (cse) //----- // |
|
// Host-side entry point: imports the three input buffer views, issues five dispatches
// (three packs, one mmt4d, one unpack) and exports the result as a buffer view.
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} { |
|
// Multipliers used in the resource-size computations below: %c4 for the 2-D inputs,
// %c64 and %c1024 for the packed buffers (presumably bytes per f32 and per 16x1 / 16x16 f32 tile - confirm).
%c1024 = arith.constant 1024 : index |
|
%c64 = arith.constant 64 : index |
|
%c4 = arith.constant 4 : index |
|
%c0 = arith.constant 0 : index |
|
// input0: query both dims, assert shape / element type / encoding, then import.
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index |
|
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index |
|
%element_type_f32 = hal.element_type<f32> : i32 |
|
%dense_row_major = hal.encoding_type<dense_row_major> : i32 |
|
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1]) type(%element_type_f32) encoding(%dense_row_major) |
|
// Resource size of input0: %3 = %0 * 4 * %1.
%2 = arith.muli %0, %c4 : index |
|
%3 = arith.muli %2, %1 : index |
|
%4 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<?x?xf32>{%0, %1} in !stream.resource<external>{%3} |
|
%5 = stream.async.transfer %4 : !stream.resource<external>{%3} -> !stream.resource<*>{%3} |
|
// input1: same sequence as input0 (dims %6, %7; resource size %9 = %6 * 4 * %7).
%6 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index |
|
%7 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index |
|
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%6, %7]) type(%element_type_f32) encoding(%dense_row_major) |
|
%8 = arith.muli %6, %c4 : index |
|
%9 = arith.muli %8, %7 : index |
|
%10 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<?x?xf32>{%6, %7} in !stream.resource<external>{%9} |
|
%11 = stream.async.transfer %10 : !stream.resource<external>{%9} -> !stream.resource<*>{%9} |
|
// input2: same sequence again (dims %12, %13; resource size %15 = %12 * 4 * %13).
%12 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index |
|
%13 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index |
|
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("input2") shape([%12, %13]) type(%element_type_f32) encoding(%dense_row_major) |
|
%14 = arith.muli %12, %c4 : index |
|
%15 = arith.muli %14, %13 : index |
|
%16 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<?x?xf32>{%12, %13} in !stream.resource<external>{%15} |
|
%17 = stream.async.transfer %16 : !stream.resource<external>{%15} -> !stream.resource<*>{%15} |
|
// Pack input0 via dispatch_0: result size %20 = ceildiv(%0, 16) * 64 * %1.
%18 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%0] |
|
%19 = arith.muli %18, %c64 : index |
|
%20 = arith.muli %19, %1 : index |
|
%21 = stream.async.dispatch @matmul_dynamic_dispatch_0::@matmul_dynamic_dispatch_0_pack_f32[%0, %18, %1](%5[%c0 to %3 for %3], %0, %18, %1) : (!stream.resource<*>{%3}, index, index, index) -> !stream.resource<*>{%20} |
|
// Pack input1 via dispatch_1; the ceildiv is taken over %7 (dim 1 of input1).
// Result size %24 = ceildiv(%7, 16) * 64 * %6.
%22 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%7] |
|
%23 = arith.muli %22, %c64 : index |
|
%24 = arith.muli %23, %6 : index |
|
%25 = stream.async.dispatch @matmul_dynamic_dispatch_1::@matmul_dynamic_dispatch_1_pack_f32[%7, %22, %6](%11[%c0 to %9 for %9], %7, %22, %6) : (!stream.resource<*>{%9}, index, index, index) -> !stream.resource<*>{%24} |
|
// Pack input2 via dispatch_2: result size %29 = ceildiv(%12, 16) * 1024 * ceildiv(%13, 16).
%26 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%12] |
|
%27 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%13] |
|
%28 = arith.muli %26, %c1024 : index |
|
%29 = arith.muli %28, %27 : index |
|
%30 = stream.async.dispatch @matmul_dynamic_dispatch_2::@matmul_dynamic_dispatch_2_pack_f32[%12, %13, %26, %27](%17[%c0 to %15 for %15], %12, %13, %26, %27) : (!stream.resource<*>{%15}, index, index, index, index) -> !stream.resource<*>{%29} |
|
// mmt4d over the three packed buffers. The result is tied to operand %30 (`-> %30{%29}`),
// so the packed input2 resource is presumably updated in place - confirm against the stream dialect docs.
%31 = stream.async.dispatch @matmul_dynamic_dispatch_3::@matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32[%18, %1, %22, %6, %26, %27](%21[%c0 to %20 for %20], %25[%c0 to %24 for %24], %30[%c0 to %29 for %29], %18, %1, %22, %6, %26, %27) : (!stream.resource<*>{%20}, !stream.resource<*>{%24}, !stream.resource<*>{%29}, index, index, index, index, index, index) -> %30{%29} |
|
// Unpack the mmt4d result via dispatch_4 into a resource of size %15 (the same size as input2).
%32 = stream.async.dispatch @matmul_dynamic_dispatch_4::@matmul_dynamic_dispatch_4_unpack_f32[%26, %27, %12, %13](%31[%c0 to %29 for %29], %26, %27, %12, %13) : (!stream.resource<*>{%29}, index, index, index, index) -> !stream.resource<*>{%15} |
|
// Transfer to an external resource and export it as the returned {%12, %13} buffer view.
%33 = stream.async.transfer %32 : !stream.resource<*>{%15} -> !stream.resource<external>{%15} |
|
%34 = stream.tensor.export %33 : tensor<?x?xf32>{%12, %13} in !stream.resource<external>{%15} -> !hal.buffer_view |
|
return %34 : !hal.buffer_view |
|
} |
|
|
|
// -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- // |
|
// Host-side entry point: imports the three input buffer views, issues five dispatches
// (three packs, one mmt4d, one unpack) and exports the result as a buffer view.
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} { |
|
// Multipliers used in the resource-size computations below: %c4 for the 2-D inputs,
// %c64 and %c1024 for the packed buffers (presumably bytes per f32 and per 16x1 / 16x16 f32 tile - confirm).
%c1024 = arith.constant 1024 : index |
|
%c64 = arith.constant 64 : index |
|
%c4 = arith.constant 4 : index |
|
%c0 = arith.constant 0 : index |
|
// input0: query both dims, assert shape / element type / encoding, then import.
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index |
|
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index |
|
%element_type_f32 = hal.element_type<f32> : i32 |
|
%dense_row_major = hal.encoding_type<dense_row_major> : i32 |
|
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1]) type(%element_type_f32) encoding(%dense_row_major) |
|
// Resource size of input0: %3 = %0 * 4 * %1.
%2 = arith.muli %0, %c4 : index |
|
%3 = arith.muli %2, %1 : index |
|
%4 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<?x?xf32>{%0, %1} in !stream.resource<external>{%3} |
|
%5 = stream.async.transfer %4 : !stream.resource<external>{%3} -> !stream.resource<*>{%3} |
|
// input1: same sequence as input0 (dims %6, %7; resource size %9 = %6 * 4 * %7).
%6 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index |
|
%7 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index |
|
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%6, %7]) type(%element_type_f32) encoding(%dense_row_major) |
|
%8 = arith.muli %6, %c4 : index |
|
%9 = arith.muli %8, %7 : index |
|
%10 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<?x?xf32>{%6, %7} in !stream.resource<external>{%9} |
|
%11 = stream.async.transfer %10 : !stream.resource<external>{%9} -> !stream.resource<*>{%9} |
|
// input2: same sequence again (dims %12, %13; resource size %15 = %12 * 4 * %13).
%12 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index |
|
%13 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index |
|
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("input2") shape([%12, %13]) type(%element_type_f32) encoding(%dense_row_major) |
|
%14 = arith.muli %12, %c4 : index |
|
%15 = arith.muli %14, %13 : index |
|
%16 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<?x?xf32>{%12, %13} in !stream.resource<external>{%15} |
|
%17 = stream.async.transfer %16 : !stream.resource<external>{%15} -> !stream.resource<*>{%15} |
|
// Pack input0 via dispatch_0: result size %20 = ceildiv(%0, 16) * 64 * %1.
%18 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%0] |
|
%19 = arith.muli %18, %c64 : index |
|
%20 = arith.muli %19, %1 : index |
|
%21 = stream.async.dispatch @matmul_dynamic_dispatch_0::@matmul_dynamic_dispatch_0_pack_f32[%0, %18, %1](%5[%c0 to %3 for %3], %0, %18, %1) : (!stream.resource<*>{%3}, index, index, index) -> !stream.resource<*>{%20} |
|
// Pack input1 via dispatch_1; the ceildiv is taken over %7 (dim 1 of input1).
// Result size %24 = ceildiv(%7, 16) * 64 * %6.
%22 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%7] |
|
%23 = arith.muli %22, %c64 : index |
|
%24 = arith.muli %23, %6 : index |
|
%25 = stream.async.dispatch @matmul_dynamic_dispatch_1::@matmul_dynamic_dispatch_1_pack_f32[%7, %22, %6](%11[%c0 to %9 for %9], %7, %22, %6) : (!stream.resource<*>{%9}, index, index, index) -> !stream.resource<*>{%24} |
|
// Pack input2 via dispatch_2: result size %29 = ceildiv(%12, 16) * 1024 * ceildiv(%13, 16).
%26 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%12] |
|
%27 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%13] |
|
%28 = arith.muli %26, %c1024 : index |
|
%29 = arith.muli %28, %27 : index |
|
%30 = stream.async.dispatch @matmul_dynamic_dispatch_2::@matmul_dynamic_dispatch_2_pack_f32[%12, %13, %26, %27](%17[%c0 to %15 for %15], %12, %13, %26, %27) : (!stream.resource<*>{%15}, index, index, index, index) -> !stream.resource<*>{%29} |
|
// mmt4d over the three packed buffers. The result is tied to operand %30 (`-> %30{%29}`),
// so the packed input2 resource is presumably updated in place - confirm against the stream dialect docs.
%31 = stream.async.dispatch @matmul_dynamic_dispatch_3::@matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32[%18, %1, %22, %6, %26, %27](%21[%c0 to %20 for %20], %25[%c0 to %24 for %24], %30[%c0 to %29 for %29], %18, %1, %22, %6, %26, %27) : (!stream.resource<*>{%20}, !stream.resource<*>{%24}, !stream.resource<*>{%29}, index, index, index, index, index, index) -> %30{%29} |
|
// Unpack the mmt4d result via dispatch_4 into a resource of size %15 (the same size as input2).
%32 = stream.async.dispatch @matmul_dynamic_dispatch_4::@matmul_dynamic_dispatch_4_unpack_f32[%26, %27, %12, %13](%31[%c0 to %29 for %29], %26, %27, %12, %13) : (!stream.resource<*>{%29}, index, index, index, index) -> !stream.resource<*>{%15} |
|
// Transfer to an external resource and export it as the returned {%12, %13} buffer view.
%33 = stream.async.transfer %32 : !stream.resource<*>{%15} -> !stream.resource<external>{%15} |
|
%34 = stream.tensor.export %33 : tensor<?x?xf32>{%12, %13} in !stream.resource<external>{%15} -> !hal.buffer_view |
|
return %34 : !hal.buffer_view |
|
} |
|
|
|
// -----// IR Dump After ApplyPatterns (iree-util-apply-patterns) //----- // |
|
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "znver4", cpu_features = "+mmx,+popcnt,+sse,+sse2,+sse3,+ssse3,+sse4.1,+sse4.2,+avx,+avx2,+sse4a,+fma,+avx512f,+bmi,+bmi2,+aes,+pclmul,+avx512vl,+avx512bw,+avx512dq,+avx512cd,+avx512vbmi,+avx512ifma,+avx512vpopcntdq,+avx512vbmi2,+gfni,+vpclmulqdq,+avx512vnni,+avx512bitalg,+avx512bf16,+adx,+clflushopt,+clwb,+clzero,+cx16,+cx8,+f16c,+fsgsbase,+crc32,+invpcid,+rdpru,+sahf,+lzcnt,+movbe,+mwaitx,+x87,+pku,+prfchw,+rdpid,+rdrnd,+rdseed,+sha,+shstk,+vaes,+wbnoinvd,+xsave,+xsavec,+xsaveopt,+xsaves,+fxsr,+evex512", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "all"}> |
|
#map = affine_map<()[s0] -> (s0 ceildiv 16)> |
|
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#executable_target_embedded_elf_x86_64_]}> |
|
module attributes {hal.device.targets = [#device_target_llvm_cpu]} { |
|
// Executable with a single export that packs a dynamically shaped 2-D f32
// tensor into a ?x?x16x1 tiled layout (tensor.pack, padded with 0.0).
stream.executable private @matmul_dynamic_dispatch_0 { |
|
// Export region: derives the (x, y, z) workgroup count from the three workload values.
stream.executable.export public @matmul_dynamic_dispatch_0_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) { |
|
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2 |
|
stream.return %x, %y, %z : index, index, index |
|
} |
|
builtin.module { |
|
// %arg0 is the read-only input binding and %arg4 the write-only output binding;
// %arg1..%arg3 are the workload values, tagged below as ordinals 0..2.
func.func @matmul_dynamic_dispatch_0_pack_f32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: !stream.binding) { |
|
%c0 = arith.constant 0 : index |
|
// Padding value handed to tensor.pack below.
%cst = arith.constant 0.000000e+00 : f32 |
|
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index |
|
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index |
|
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index |
|
// The input is viewed as a {%0, %2} tensor: ordinal 0 is dim 0 and ordinal 2 is dim 1.
%3 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %2} |
|
%4 = stream.binding.subspan %arg4[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2} |
|
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [%0, %2], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %2} -> tensor<?x?xf32> |
|
// #map is the module-level alias for (s0 ceildiv 16), so %6 = ceildiv(%0, 16).
%6 = affine.apply #map()[%0] |
|
%7 = tensor.empty(%6, %2) : tensor<?x?x16x1xf32> |
|
// Identity permutations with inner_tiles = [16, 1]: source dim 0 is tiled by 16, dim 1 by 1.
%pack = tensor.pack %5 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 1] into %7 : tensor<?x?xf32> -> tensor<?x?x16x1xf32> |
|
// NOTE(review): the store is sized with {%1, %2} while the packed tensor was sized with
// {%6, %2}; this assumes the caller passes ceildiv(ordinal 0, 16) as ordinal 1 - confirm at the dispatch site.
flow.dispatch.tensor.store %pack, %4, offsets = [0, 0, 0, 0], sizes = [%1, %2, 16, 1], strides = [1, 1, 1, 1] : tensor<?x?x16x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2} |
|
return |
|
} |
|
} |
|
} |
|
// Executable with a single export that packs a dynamically shaped 2-D f32
// tensor into a ?x?x16x1 tiled layout (tensor.pack, padded with 0.0).
stream.executable private @matmul_dynamic_dispatch_1 { |
|
// Export region: derives the (x, y, z) workgroup count from the three workload values.
stream.executable.export public @matmul_dynamic_dispatch_1_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) { |
|
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2 |
|
stream.return %x, %y, %z : index, index, index |
|
} |
|
builtin.module { |
|
// %arg0 is the read-only input binding and %arg4 the write-only output binding;
// %arg1..%arg3 are the workload values, tagged below as ordinals 0..2.
func.func @matmul_dynamic_dispatch_1_pack_f32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: !stream.binding) { |
|
%c0 = arith.constant 0 : index |
|
// Padding value handed to tensor.pack below.
%cst = arith.constant 0.000000e+00 : f32 |
|
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index |
|
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index |
|
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index |
|
// The input is viewed as a {%2, %0} tensor: ordinal 2 is dim 0 and ordinal 0 is dim 1.
%3 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %0} |
|
%4 = stream.binding.subspan %arg4[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2} |
|
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [%2, %0], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %0} -> tensor<?x?xf32> |
|
// #map is the module-level alias for (s0 ceildiv 16), so %6 = ceildiv(%0, 16).
%6 = affine.apply #map()[%0] |
|
%7 = tensor.empty(%6, %2) : tensor<?x?x16x1xf32> |
|
// inner_dims_pos = [1, 0] with inner_tiles = [16, 1]: source dim 1 is tiled by 16 and
// source dim 0 by 1; outer_dims_perm = [1, 0] swaps the outer dims so the tiled dim comes first.
%pack = tensor.pack %5 padding_value(%cst : f32) outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [16, 1] into %7 : tensor<?x?xf32> -> tensor<?x?x16x1xf32> |
|
// NOTE(review): the store is sized with {%1, %2} while the packed tensor was sized with
// {%6, %2}; this assumes the caller passes ceildiv(ordinal 0, 16) as ordinal 1 - confirm at the dispatch site.
flow.dispatch.tensor.store %pack, %4, offsets = [0, 0, 0, 0], sizes = [%1, %2, 16, 1], strides = [1, 1, 1, 1] : tensor<?x?x16x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2} |
|
return |
|
} |
|
} |
|
} |
|
// Executable with a single export that packs a dynamically shaped 2-D f32
// tensor into a ?x?x16x16 tiled layout (tensor.pack, padded with 0.0).
stream.executable private @matmul_dynamic_dispatch_2 { |
|
// Export region: derives the (x, y, z) workgroup count from the four workload values.
stream.executable.export public @matmul_dynamic_dispatch_2_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) { |
|
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3 |
|
stream.return %x, %y, %z : index, index, index |
|
} |
|
builtin.module { |
|
// %arg0 is the read-only input binding and %arg5 the write-only output binding;
// %arg1..%arg4 are the workload values, tagged below as ordinals 0..3.
func.func @matmul_dynamic_dispatch_2_pack_f32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: !stream.binding) { |
|
%c0 = arith.constant 0 : index |
|
// Padding value handed to tensor.pack below.
%cst = arith.constant 0.000000e+00 : f32 |
|
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index |
|
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index |
|
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index |
|
%3 = flow.dispatch.workload.ordinal %arg4, 3 : index |
|
// Ordinals 0 and 1 are the dynamic dims of the 2-D input; ordinals 2 and 3 are the
// dynamic outer dims of the packed output binding.
%4 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1} |
|
%5 = stream.binding.subspan %arg5[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%2, %3} |
|
%6 = flow.dispatch.tensor.load %4, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1} -> tensor<?x?xf32> |
|
// #map is the module-level alias for (s0 ceildiv 16): %7 = ceildiv(%1, 16), %8 = ceildiv(%0, 16).
%7 = affine.apply #map()[%1] |
|
%8 = affine.apply #map()[%0] |
|
%9 = tensor.empty(%8, %7) : tensor<?x?x16x16xf32> |
|
// Identity permutations: both source dims are tiled by 16 in their original order.
%pack = tensor.pack %6 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %9 : tensor<?x?xf32> -> tensor<?x?x16x16xf32> |
|
// NOTE(review): the store is sized with {%2, %3} while the packed tensor was sized with
// {%8, %7}; this assumes the caller passes the matching ceildiv values as ordinals 2 and 3 - confirm at the dispatch site.
flow.dispatch.tensor.store %pack, %5, offsets = [0, 0, 0, 0], sizes = [%2, %3, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%2, %3} |
|
return |
|
} |
|
} |
|
} |
|
// Executable with a single export that runs linalg.mmt4d on two packed ?x?x16x1 f32
// inputs, using a packed ?x?x16x16 f32 read-write binding as the `outs` operand.
stream.executable private @matmul_dynamic_dispatch_3 { |
|
// Export region: derives the (x, y, z) workgroup count from the six workload values.
stream.executable.export public @matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index) -> (index, index, index) { |
|
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3, %arg4, %arg5 |
|
stream.return %x, %y, %z : index, index, index |
|
} |
|
builtin.module { |
|
// %arg0/%arg1 are the read-only input bindings and %arg2 the read-write binding;
// %arg3..%arg8 are the workload values, tagged below as ordinals 0..5.
func.func @matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index) { |
|
%c0 = arith.constant 0 : index |
|
%0 = flow.dispatch.workload.ordinal %arg3, 0 : index |
|
%1 = flow.dispatch.workload.ordinal %arg4, 1 : index |
|
%2 = flow.dispatch.workload.ordinal %arg5, 2 : index |
|
%3 = flow.dispatch.workload.ordinal %arg6, 3 : index |
|
%4 = flow.dispatch.workload.ordinal %arg7, 4 : index |
|
%5 = flow.dispatch.workload.ordinal %arg8, 5 : index |
|
// Each binding takes a consecutive ordinal pair as its dynamic outer dims:
// {%0, %1} for %arg0, {%2, %3} for %arg1 and {%4, %5} for %arg2.
%6 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%0, %1} |
|
%7 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%2, %3} |
|
%8 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5} |
|
%9 = flow.dispatch.tensor.load %6, offsets = [0, 0, 0, 0], sizes = [%0, %1, 16, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%0, %1} -> tensor<?x?x16x1xf32> |
|
%10 = flow.dispatch.tensor.load %7, offsets = [0, 0, 0, 0], sizes = [%2, %3, 16, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%2, %3} -> tensor<?x?x16x1xf32> |
|
// The read-write binding is loaded here and stored back below.
%11 = flow.dispatch.tensor.load %8, offsets = [0, 0, 0, 0], sizes = [%4, %5, 16, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5} -> tensor<?x?x16x16xf32> |
|
%12 = linalg.mmt4d ins(%9, %10 : tensor<?x?x16x1xf32>, tensor<?x?x16x1xf32>) outs(%11 : tensor<?x?x16x16xf32>) -> tensor<?x?x16x16xf32> |
|
// The result overwrites the full extent of the same binding (%8) that supplied %11.
flow.dispatch.tensor.store %12, %8, offsets = [0, 0, 0, 0], sizes = [%4, %5, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5} |
|
return |
|
} |
|
} |
|
} |
|
// Executable for the unpack dispatch: converts a ?x?x16x16xf32 tensor back to ?x?xf32. |
stream.executable private @matmul_dynamic_dispatch_4 { |
|
// Export: returns the (x, y, z) workgroup count computed from the four index workload operands. |
stream.executable.export public @matmul_dynamic_dispatch_4_unpack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) { |
|
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3 |
|
stream.return %x, %y, %z : index, index, index |
|
} |
|
builtin.module { |
|
// Body: %arg0 is the read-only source binding, %arg5 the write-only destination; %arg1..%arg4 are workload ordinals 0..3. |
// Ordinals 0 and 1 size the tiled source; ordinals 2 and 3 size the ?x?xf32 destination. |
func.func @matmul_dynamic_dispatch_4_unpack_f32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: !stream.binding) { |
|
%c0 = arith.constant 0 : index |
|
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index |
|
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index |
|
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index |
|
%3 = flow.dispatch.workload.ordinal %arg4, 3 : index |
|
%4 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?x16x16xf32>>{%0, %1} |
|
%5 = stream.binding.subspan %arg5[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%2, %3} |
|
%6 = flow.dispatch.tensor.load %4, offsets = [0, 0, 0, 0], sizes = [%0, %1, 16, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x16xf32>>{%0, %1} -> tensor<?x?x16x16xf32> |
|
// Destination of the unpack: an uninitialized %2 x %3 tensor. |
%7 = tensor.empty(%2, %3) : tensor<?x?xf32> |
|
// tensor.unpack with 16x16 inner tiles and identity outer_dims_perm / inner_dims_pos. |
%unpack = tensor.unpack %6 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %7 : tensor<?x?x16x16xf32> -> tensor<?x?xf32> |
|
flow.dispatch.tensor.store %unpack, %5, offsets = [0, 0], sizes = [%2, %3], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%2, %3} |
|
return |
|
} |
|
} |
|
} |
|
// Host entry point (marked iree.abi.stub). Takes three !hal.buffer_view arguments, imports each as a tensor<?x?xf32> stream resource, |
// chains dispatches 0-4 (pack, pack, pack, mmt4d, unpack) and exports the final resource as a !hal.buffer_view. |
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} { |
|
%c1024 = arith.constant 1024 : index |
|
%c64 = arith.constant 64 : index |
|
%c4 = arith.constant 4 : index |
|
%c0 = arith.constant 0 : index |
|
// input0: query both dims, assert shape / f32 element type / dense_row_major encoding, then import with size %3 = dim0 * %c4 * dim1. |
// NOTE(review): %c4 = 4 is presumably the f32 element size in bytes - confirm. |
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index |
|
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index |
|
%element_type_f32 = hal.element_type<f32> : i32 |
|
%dense_row_major = hal.encoding_type<dense_row_major> : i32 |
|
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1]) type(%element_type_f32) encoding(%dense_row_major) |
|
%2 = arith.muli %0, %c4 : index |
|
%3 = arith.muli %2, %1 : index |
|
%4 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<?x?xf32>{%0, %1} in !stream.resource<external>{%3} |
|
%5 = stream.async.transfer %4 : !stream.resource<external>{%3} -> !stream.resource<*>{%3} |
|
// input1: same dim query / assert / import / transfer sequence; size is %9. |
%6 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index |
|
%7 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index |
|
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%6, %7]) type(%element_type_f32) encoding(%dense_row_major) |
|
%8 = arith.muli %6, %c4 : index |
|
%9 = arith.muli %8, %7 : index |
|
%10 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<?x?xf32>{%6, %7} in !stream.resource<external>{%9} |
|
%11 = stream.async.transfer %10 : !stream.resource<external>{%9} -> !stream.resource<*>{%9} |
|
// input2: same sequence; size is %15. This size is reused for the final result below. |
%12 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index |
|
%13 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index |
|
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("input2") shape([%12, %13]) type(%element_type_f32) encoding(%dense_row_major) |
|
%14 = arith.muli %12, %c4 : index |
|
%15 = arith.muli %14, %13 : index |
|
%16 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<?x?xf32>{%12, %13} in !stream.resource<external>{%15} |
|
%17 = stream.async.transfer %16 : !stream.resource<external>{%15} -> !stream.resource<*>{%15} |
|
// Dispatch 0 packs input0; its result size is declared as %20 = %18 * %c64 * %1, where %18 = #map applied to %0. |
// NOTE(review): #map is declared outside this excerpt; the later dumps in this file define it as s0 ceildiv 16 - confirm for this dump. |
%18 = affine.apply #map()[%0] |
|
%19 = arith.muli %18, %c64 : index |
|
%20 = arith.muli %19, %1 : index |
|
%21 = stream.async.dispatch @matmul_dynamic_dispatch_0::@matmul_dynamic_dispatch_0_pack_f32[%0, %18, %1](%5[%c0 to %3 for %3], %0, %18, %1) : (!stream.resource<*>{%3}, index, index, index) -> !stream.resource<*>{%20} |
|
// Dispatch 1 packs input1; note the workload starts with %7 (input1 dim 1) and #map is applied to %7. Result size is %24 = %22 * %c64 * %6. |
%22 = affine.apply #map()[%7] |
|
%23 = arith.muli %22, %c64 : index |
|
%24 = arith.muli %23, %6 : index |
|
%25 = stream.async.dispatch @matmul_dynamic_dispatch_1::@matmul_dynamic_dispatch_1_pack_f32[%7, %22, %6](%11[%c0 to %9 for %9], %7, %22, %6) : (!stream.resource<*>{%9}, index, index, index) -> !stream.resource<*>{%24} |
|
// Dispatch 2 packs input2; #map is applied to both of its dims and the result size is %29 = %26 * %c1024 * %27. |
%26 = affine.apply #map()[%12] |
|
%27 = affine.apply #map()[%13] |
|
%28 = arith.muli %26, %c1024 : index |
|
%29 = arith.muli %28, %27 : index |
|
%30 = stream.async.dispatch @matmul_dynamic_dispatch_2::@matmul_dynamic_dispatch_2_pack_f32[%12, %13, %26, %27](%17[%c0 to %15 for %15], %12, %13, %26, %27) : (!stream.resource<*>{%15}, index, index, index, index) -> !stream.resource<*>{%29} |
|
// Dispatch 3 (mmt4d) consumes the three packed resources; its result is written `-> %30{%29}`, i.e. tied to the dispatch-2 output rather than a new resource type. |
%31 = stream.async.dispatch @matmul_dynamic_dispatch_3::@matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32[%18, %1, %22, %6, %26, %27](%21[%c0 to %20 for %20], %25[%c0 to %24 for %24], %30[%c0 to %29 for %29], %18, %1, %22, %6, %26, %27) : (!stream.resource<*>{%20}, !stream.resource<*>{%24}, !stream.resource<*>{%29}, index, index, index, index, index, index) -> %30{%29} |
|
// Dispatch 4 unpacks into a resource of size %15, which is then transferred to external and exported with dims %12 x %13. |
%32 = stream.async.dispatch @matmul_dynamic_dispatch_4::@matmul_dynamic_dispatch_4_unpack_f32[%26, %27, %12, %13](%31[%c0 to %29 for %29], %26, %27, %12, %13) : (!stream.resource<*>{%29}, index, index, index, index) -> !stream.resource<*>{%15} |
|
%33 = stream.async.transfer %32 : !stream.resource<*>{%15} -> !stream.resource<external>{%15} |
|
%34 = stream.tensor.export %33 : tensor<?x?xf32>{%12, %13} in !stream.resource<external>{%15} -> !hal.buffer_view |
|
return %34 : !hal.buffer_view |
|
} |
|
} |
|
|
|
|
|
// -----// IR Dump After FoldGlobals (iree-util-fold-globals) //----- // |
|
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "znver4", cpu_features = "+mmx,+popcnt,+sse,+sse2,+sse3,+ssse3,+sse4.1,+sse4.2,+avx,+avx2,+sse4a,+fma,+avx512f,+bmi,+bmi2,+aes,+pclmul,+avx512vl,+avx512bw,+avx512dq,+avx512cd,+avx512vbmi,+avx512ifma,+avx512vpopcntdq,+avx512vbmi2,+gfni,+vpclmulqdq,+avx512vnni,+avx512bitalg,+avx512bf16,+adx,+clflushopt,+clwb,+clzero,+cx16,+cx8,+f16c,+fsgsbase,+crc32,+invpcid,+rdpru,+sahf,+lzcnt,+movbe,+mwaitx,+x87,+pku,+prfchw,+rdpid,+rdrnd,+rdseed,+sha,+shstk,+vaes,+wbnoinvd,+xsave,+xsavec,+xsaveopt,+xsaves,+fxsr,+evex512", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "all"}> |
|
#map = affine_map<()[s0] -> (s0 ceildiv 16)> |
|
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#executable_target_embedded_elf_x86_64_]}> |
|
module attributes {hal.device.targets = [#device_target_llvm_cpu]} { |
|
// Executable for dispatch 0: pads and packs a ?x?xf32 tensor into ?x?x16x1xf32 tiles. |
stream.executable private @matmul_dynamic_dispatch_0 { |
|
// Export: returns the (x, y, z) workgroup count computed from the three index workload operands. |
stream.executable.export public @matmul_dynamic_dispatch_0_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) { |
|
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2 |
|
stream.return %x, %y, %z : index, index, index |
|
} |
|
builtin.module { |
|
// Body: %arg0 is the read-only source binding (dims {%0, %2}), %arg4 the write-only destination (outer dims {%1, %2}). |
// %arg1..%arg3 are workload ordinals 0..2. |
func.func @matmul_dynamic_dispatch_0_pack_f32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: !stream.binding) { |
|
%c0 = arith.constant 0 : index |
|
%cst = arith.constant 0.000000e+00 : f32 |
|
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index |
|
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index |
|
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index |
|
%3 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %2} |
|
%4 = stream.binding.subspan %arg4[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2} |
|
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [%0, %2], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %2} -> tensor<?x?xf32> |
|
// %6 = ceildiv(%0, 16) per #map; it is the leading dim of the tensor.empty pack destination (the second is %2). |
%6 = affine.apply #map()[%0] |
|
%7 = tensor.empty(%6, %2) : tensor<?x?x16x1xf32> |
|
// tensor.pack with inner_tiles = [16, 1] and identity outer_dims_perm / inner_dims_pos; %cst (0.0) is the padding value. |
%pack = tensor.pack %5 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 1] into %7 : tensor<?x?xf32> -> tensor<?x?x16x1xf32> |
|
flow.dispatch.tensor.store %pack, %4, offsets = [0, 0, 0, 0], sizes = [%1, %2, 16, 1], strides = [1, 1, 1, 1] : tensor<?x?x16x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2} |
|
return |
|
} |
|
} |
|
} |
|
// Executable for dispatch 1: pads and packs a ?x?xf32 tensor into ?x?x16x1xf32 tiles with both permutations set to [1, 0]. |
stream.executable private @matmul_dynamic_dispatch_1 { |
|
// Export: returns the (x, y, z) workgroup count computed from the three index workload operands. |
stream.executable.export public @matmul_dynamic_dispatch_1_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) { |
|
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2 |
|
stream.return %x, %y, %z : index, index, index |
|
} |
|
builtin.module { |
|
// Body: the source is bound with dims {%2, %0} (ordinal 2 first), the destination with outer dims {%1, %2}. |
// %arg1..%arg3 are workload ordinals 0..2. |
func.func @matmul_dynamic_dispatch_1_pack_f32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: !stream.binding) { |
|
%c0 = arith.constant 0 : index |
|
%cst = arith.constant 0.000000e+00 : f32 |
|
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index |
|
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index |
|
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index |
|
%3 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %0} |
|
%4 = stream.binding.subspan %arg4[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2} |
|
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [%2, %0], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %0} -> tensor<?x?xf32> |
|
// %6 = ceildiv(%0, 16) per #map, so the destination's leading dim is derived from the source's second dim. |
%6 = affine.apply #map()[%0] |
|
%7 = tensor.empty(%6, %2) : tensor<?x?x16x1xf32> |
|
// outer_dims_perm = [1, 0] and inner_dims_pos = [1, 0] with inner_tiles = [16, 1]; %cst (0.0) is the padding value. |
%pack = tensor.pack %5 padding_value(%cst : f32) outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [16, 1] into %7 : tensor<?x?xf32> -> tensor<?x?x16x1xf32> |
|
flow.dispatch.tensor.store %pack, %4, offsets = [0, 0, 0, 0], sizes = [%1, %2, 16, 1], strides = [1, 1, 1, 1] : tensor<?x?x16x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2} |
|
return |
|
} |
|
} |
|
} |
|
// Executable for dispatch 2: pads and packs a ?x?xf32 tensor into ?x?x16x16xf32 tiles. |
stream.executable private @matmul_dynamic_dispatch_2 { |
|
// Export: returns the (x, y, z) workgroup count computed from the four index workload operands. |
stream.executable.export public @matmul_dynamic_dispatch_2_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) { |
|
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3 |
|
stream.return %x, %y, %z : index, index, index |
|
} |
|
builtin.module { |
|
// Body: %arg0 is the read-only source binding (dims {%0, %1}), %arg5 the write-only destination (outer dims {%2, %3}). |
// %arg1..%arg4 are workload ordinals 0..3. |
func.func @matmul_dynamic_dispatch_2_pack_f32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: !stream.binding) { |
|
%c0 = arith.constant 0 : index |
|
%cst = arith.constant 0.000000e+00 : f32 |
|
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index |
|
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index |
|
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index |
|
%3 = flow.dispatch.workload.ordinal %arg4, 3 : index |
|
%4 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1} |
|
%5 = stream.binding.subspan %arg5[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%2, %3} |
|
%6 = flow.dispatch.tensor.load %4, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1} -> tensor<?x?xf32> |
|
// %7 and %8 are ceildiv(%1, 16) and ceildiv(%0, 16) per #map; tensor.empty takes them in the order (%8, %7). |
%7 = affine.apply #map()[%1] |
|
%8 = affine.apply #map()[%0] |
|
%9 = tensor.empty(%8, %7) : tensor<?x?x16x16xf32> |
|
// tensor.pack with 16x16 inner tiles and identity outer_dims_perm / inner_dims_pos; %cst (0.0) is the padding value. |
%pack = tensor.pack %6 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %9 : tensor<?x?xf32> -> tensor<?x?x16x16xf32> |
|
flow.dispatch.tensor.store %pack, %5, offsets = [0, 0, 0, 0], sizes = [%2, %3, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%2, %3} |
|
return |
|
} |
|
} |
|
} |
|
// Executable for the mmt4d dispatch: one public export (the workgroup-count region) plus a builtin.module holding the dispatch body. |
stream.executable private @matmul_dynamic_dispatch_3 { |
|
// Export: forwards its six index workload operands to flow.dispatch.workgroup_count_from_slice and returns the resulting (x, y, z) triple. |
stream.executable.export public @matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index) -> (index, index, index) { |
|
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3, %arg4, %arg5 |
|
stream.return %x, %y, %z : index, index, index |
|
} |
|
builtin.module { |
|
// Body: %arg0 and %arg1 are bound read-only as ?x?x16x1xf32, %arg2 is bound read-write as ?x?x16x16xf32. |
// %arg3..%arg8 are workload ordinals 0..5; pairs of them supply the two dynamic outer dims of each binding. |
func.func @matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index) { |
|
%c0 = arith.constant 0 : index |
|
%0 = flow.dispatch.workload.ordinal %arg3, 0 : index |
|
%1 = flow.dispatch.workload.ordinal %arg4, 1 : index |
|
%2 = flow.dispatch.workload.ordinal %arg5, 2 : index |
|
%3 = flow.dispatch.workload.ordinal %arg6, 3 : index |
|
%4 = flow.dispatch.workload.ordinal %arg7, 4 : index |
|
%5 = flow.dispatch.workload.ordinal %arg8, 5 : index |
|
%6 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%0, %1} |
|
%7 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%2, %3} |
|
%8 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5} |
|
// All three loads use zero offsets, unit strides and the full dynamic sizes, i.e. each binding is read in its entirety. |
%9 = flow.dispatch.tensor.load %6, offsets = [0, 0, 0, 0], sizes = [%0, %1, 16, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%0, %1} -> tensor<?x?x16x1xf32> |
|
%10 = flow.dispatch.tensor.load %7, offsets = [0, 0, 0, 0], sizes = [%2, %3, 16, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%2, %3} -> tensor<?x?x16x1xf32> |
|
%11 = flow.dispatch.tensor.load %8, offsets = [0, 0, 0, 0], sizes = [%4, %5, 16, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5} -> tensor<?x?x16x16xf32> |
|
// linalg.mmt4d takes %11 (loaded from the read-write binding %8) as its outs operand; the result is stored back to %8 below. |
%12 = linalg.mmt4d ins(%9, %10 : tensor<?x?x16x1xf32>, tensor<?x?x16x1xf32>) outs(%11 : tensor<?x?x16x16xf32>) -> tensor<?x?x16x16xf32> |
|
flow.dispatch.tensor.store %12, %8, offsets = [0, 0, 0, 0], sizes = [%4, %5, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5} |
|
return |
|
} |
|
} |
|
} |
|
// Executable for the unpack dispatch: converts a ?x?x16x16xf32 tensor back to ?x?xf32. |
stream.executable private @matmul_dynamic_dispatch_4 { |
|
// Export: returns the (x, y, z) workgroup count computed from the four index workload operands. |
stream.executable.export public @matmul_dynamic_dispatch_4_unpack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) { |
|
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3 |
|
stream.return %x, %y, %z : index, index, index |
|
} |
|
builtin.module { |
|
// Body: %arg0 is the read-only source binding, %arg5 the write-only destination; %arg1..%arg4 are workload ordinals 0..3. |
// Ordinals 0 and 1 size the tiled source; ordinals 2 and 3 size the ?x?xf32 destination. |
func.func @matmul_dynamic_dispatch_4_unpack_f32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: !stream.binding) { |
|
%c0 = arith.constant 0 : index |
|
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index |
|
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index |
|
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index |
|
%3 = flow.dispatch.workload.ordinal %arg4, 3 : index |
|
%4 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?x16x16xf32>>{%0, %1} |
|
%5 = stream.binding.subspan %arg5[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%2, %3} |
|
%6 = flow.dispatch.tensor.load %4, offsets = [0, 0, 0, 0], sizes = [%0, %1, 16, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x16xf32>>{%0, %1} -> tensor<?x?x16x16xf32> |
|
// Destination of the unpack: an uninitialized %2 x %3 tensor. |
%7 = tensor.empty(%2, %3) : tensor<?x?xf32> |
|
// tensor.unpack with 16x16 inner tiles and identity outer_dims_perm / inner_dims_pos. |
%unpack = tensor.unpack %6 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %7 : tensor<?x?x16x16xf32> -> tensor<?x?xf32> |
|
flow.dispatch.tensor.store %unpack, %5, offsets = [0, 0], sizes = [%2, %3], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%2, %3} |
|
return |
|
} |
|
} |
|
} |
|
// Host entry point (marked iree.abi.stub). Takes three !hal.buffer_view arguments, imports each as a tensor<?x?xf32> stream resource, |
// chains dispatches 0-4 (pack, pack, pack, mmt4d, unpack) and exports the final resource as a !hal.buffer_view. |
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} { |
|
%c1024 = arith.constant 1024 : index |
|
%c64 = arith.constant 64 : index |
|
%c4 = arith.constant 4 : index |
|
%c0 = arith.constant 0 : index |
|
// input0: query both dims, assert shape / f32 element type / dense_row_major encoding, then import with size %3 = dim0 * %c4 * dim1. |
// NOTE(review): %c4 = 4 is presumably the f32 element size in bytes - confirm. |
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index |
|
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index |
|
%element_type_f32 = hal.element_type<f32> : i32 |
|
%dense_row_major = hal.encoding_type<dense_row_major> : i32 |
|
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1]) type(%element_type_f32) encoding(%dense_row_major) |
|
%2 = arith.muli %0, %c4 : index |
|
%3 = arith.muli %2, %1 : index |
|
%4 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<?x?xf32>{%0, %1} in !stream.resource<external>{%3} |
|
%5 = stream.async.transfer %4 : !stream.resource<external>{%3} -> !stream.resource<*>{%3} |
|
// input1: same dim query / assert / import / transfer sequence; size is %9. |
%6 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index |
|
%7 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index |
|
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%6, %7]) type(%element_type_f32) encoding(%dense_row_major) |
|
%8 = arith.muli %6, %c4 : index |
|
%9 = arith.muli %8, %7 : index |
|
%10 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<?x?xf32>{%6, %7} in !stream.resource<external>{%9} |
|
%11 = stream.async.transfer %10 : !stream.resource<external>{%9} -> !stream.resource<*>{%9} |
|
// input2: same sequence; size is %15. This size is reused for the final result below. |
%12 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index |
|
%13 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index |
|
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("input2") shape([%12, %13]) type(%element_type_f32) encoding(%dense_row_major) |
|
%14 = arith.muli %12, %c4 : index |
|
%15 = arith.muli %14, %13 : index |
|
%16 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<?x?xf32>{%12, %13} in !stream.resource<external>{%15} |
|
%17 = stream.async.transfer %16 : !stream.resource<external>{%15} -> !stream.resource<*>{%15} |
|
// Dispatch 0 packs input0; %18 = ceildiv(%0, 16) per #map and the result size is declared as %20 = %18 * %c64 * %1. |
%18 = affine.apply #map()[%0] |
|
%19 = arith.muli %18, %c64 : index |
|
%20 = arith.muli %19, %1 : index |
|
%21 = stream.async.dispatch @matmul_dynamic_dispatch_0::@matmul_dynamic_dispatch_0_pack_f32[%0, %18, %1](%5[%c0 to %3 for %3], %0, %18, %1) : (!stream.resource<*>{%3}, index, index, index) -> !stream.resource<*>{%20} |
|
// Dispatch 1 packs input1; note the workload starts with %7 (input1 dim 1) and #map is applied to %7. Result size is %24 = %22 * %c64 * %6. |
%22 = affine.apply #map()[%7] |
|
%23 = arith.muli %22, %c64 : index |
|
%24 = arith.muli %23, %6 : index |
|
%25 = stream.async.dispatch @matmul_dynamic_dispatch_1::@matmul_dynamic_dispatch_1_pack_f32[%7, %22, %6](%11[%c0 to %9 for %9], %7, %22, %6) : (!stream.resource<*>{%9}, index, index, index) -> !stream.resource<*>{%24} |
|
// Dispatch 2 packs input2; #map is applied to both of its dims and the result size is %29 = %26 * %c1024 * %27. |
%26 = affine.apply #map()[%12] |
|
%27 = affine.apply #map()[%13] |
|
%28 = arith.muli %26, %c1024 : index |
|
%29 = arith.muli %28, %27 : index |
|
%30 = stream.async.dispatch @matmul_dynamic_dispatch_2::@matmul_dynamic_dispatch_2_pack_f32[%12, %13, %26, %27](%17[%c0 to %15 for %15], %12, %13, %26, %27) : (!stream.resource<*>{%15}, index, index, index, index) -> !stream.resource<*>{%29} |
|
// Dispatch 3 (mmt4d) consumes the three packed resources; its result is written `-> %30{%29}`, i.e. tied to the dispatch-2 output rather than a new resource type. |
%31 = stream.async.dispatch @matmul_dynamic_dispatch_3::@matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32[%18, %1, %22, %6, %26, %27](%21[%c0 to %20 for %20], %25[%c0 to %24 for %24], %30[%c0 to %29 for %29], %18, %1, %22, %6, %26, %27) : (!stream.resource<*>{%20}, !stream.resource<*>{%24}, !stream.resource<*>{%29}, index, index, index, index, index, index) -> %30{%29} |
|
// Dispatch 4 unpacks into a resource of size %15, which is then transferred to external and exported with dims %12 x %13. |
%32 = stream.async.dispatch @matmul_dynamic_dispatch_4::@matmul_dynamic_dispatch_4_unpack_f32[%26, %27, %12, %13](%31[%c0 to %29 for %29], %26, %27, %12, %13) : (!stream.resource<*>{%29}, index, index, index, index) -> !stream.resource<*>{%15} |
|
%33 = stream.async.transfer %32 : !stream.resource<*>{%15} -> !stream.resource<external>{%15} |
|
%34 = stream.tensor.export %33 : tensor<?x?xf32>{%12, %13} in !stream.resource<external>{%15} -> !hal.buffer_view |
|
return %34 : !hal.buffer_view |
|
} |
|
} |
|
|
|
|
|
// -----// IR Dump After FuseGlobals (iree-util-fuse-globals) //----- // |
|
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "znver4", cpu_features = "+mmx,+popcnt,+sse,+sse2,+sse3,+ssse3,+sse4.1,+sse4.2,+avx,+avx2,+sse4a,+fma,+avx512f,+bmi,+bmi2,+aes,+pclmul,+avx512vl,+avx512bw,+avx512dq,+avx512cd,+avx512vbmi,+avx512ifma,+avx512vpopcntdq,+avx512vbmi2,+gfni,+vpclmulqdq,+avx512vnni,+avx512bitalg,+avx512bf16,+adx,+clflushopt,+clwb,+clzero,+cx16,+cx8,+f16c,+fsgsbase,+crc32,+invpcid,+rdpru,+sahf,+lzcnt,+movbe,+mwaitx,+x87,+pku,+prfchw,+rdpid,+rdrnd,+rdseed,+sha,+shstk,+vaes,+wbnoinvd,+xsave,+xsavec,+xsaveopt,+xsaves,+fxsr,+evex512", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "all"}> |
|
#map = affine_map<()[s0] -> (s0 ceildiv 16)> |
|
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#executable_target_embedded_elf_x86_64_]}> |
|
module attributes {hal.device.targets = [#device_target_llvm_cpu]} { |
|
// Executable for dispatch 0: pads and packs a ?x?xf32 tensor into ?x?x16x1xf32 tiles. |
stream.executable private @matmul_dynamic_dispatch_0 { |
|
// Export: returns the (x, y, z) workgroup count computed from the three index workload operands. |
stream.executable.export public @matmul_dynamic_dispatch_0_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) { |
|
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2 |
|
stream.return %x, %y, %z : index, index, index |
|
} |
|
builtin.module { |
|
// Body: %arg0 is the read-only source binding (dims {%0, %2}), %arg4 the write-only destination (outer dims {%1, %2}). |
// %arg1..%arg3 are workload ordinals 0..2. |
func.func @matmul_dynamic_dispatch_0_pack_f32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: !stream.binding) { |
|
%c0 = arith.constant 0 : index |
|
%cst = arith.constant 0.000000e+00 : f32 |
|
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index |
|
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index |
|
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index |
|
%3 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %2} |
|
%4 = stream.binding.subspan %arg4[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2} |
|
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [%0, %2], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %2} -> tensor<?x?xf32> |
|
// %6 = ceildiv(%0, 16) per #map; it is the leading dim of the tensor.empty pack destination (the second is %2). |
%6 = affine.apply #map()[%0] |
|
%7 = tensor.empty(%6, %2) : tensor<?x?x16x1xf32> |
|
// tensor.pack with inner_tiles = [16, 1] and identity outer_dims_perm / inner_dims_pos; %cst (0.0) is the padding value. |
%pack = tensor.pack %5 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 1] into %7 : tensor<?x?xf32> -> tensor<?x?x16x1xf32> |
|
flow.dispatch.tensor.store %pack, %4, offsets = [0, 0, 0, 0], sizes = [%1, %2, 16, 1], strides = [1, 1, 1, 1] : tensor<?x?x16x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2} |
|
return |
|
} |
|
} |
|
} |
|
// Executable for dispatch 1: pads and packs a ?x?xf32 tensor into ?x?x16x1xf32 tiles with both permutations set to [1, 0]. |
stream.executable private @matmul_dynamic_dispatch_1 { |
|
// Export: returns the (x, y, z) workgroup count computed from the three index workload operands. |
stream.executable.export public @matmul_dynamic_dispatch_1_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) { |
|
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2 |
|
stream.return %x, %y, %z : index, index, index |
|
} |
|
builtin.module { |
|
// Body: the source is bound with dims {%2, %0} (ordinal 2 first), the destination with outer dims {%1, %2}. |
// %arg1..%arg3 are workload ordinals 0..2. |
func.func @matmul_dynamic_dispatch_1_pack_f32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: !stream.binding) { |
|
%c0 = arith.constant 0 : index |
|
%cst = arith.constant 0.000000e+00 : f32 |
|
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index |
|
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index |
|
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index |
|
%3 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %0} |
|
%4 = stream.binding.subspan %arg4[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2} |
|
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [%2, %0], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %0} -> tensor<?x?xf32> |
|
// %6 = ceildiv(%0, 16) per #map, so the destination's leading dim is derived from the source's second dim. |
%6 = affine.apply #map()[%0] |
|
%7 = tensor.empty(%6, %2) : tensor<?x?x16x1xf32> |
|
// outer_dims_perm = [1, 0] and inner_dims_pos = [1, 0] with inner_tiles = [16, 1]; %cst (0.0) is the padding value. |
%pack = tensor.pack %5 padding_value(%cst : f32) outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [16, 1] into %7 : tensor<?x?xf32> -> tensor<?x?x16x1xf32> |
|
flow.dispatch.tensor.store %pack, %4, offsets = [0, 0, 0, 0], sizes = [%1, %2, 16, 1], strides = [1, 1, 1, 1] : tensor<?x?x16x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2} |
|
return |
|
} |
|
} |
|
} |
|
// Executable for dispatch 2: pads and packs a ?x?xf32 tensor into ?x?x16x16xf32 tiles. |
stream.executable private @matmul_dynamic_dispatch_2 { |
|
// Export: returns the (x, y, z) workgroup count computed from the four index workload operands. |
stream.executable.export public @matmul_dynamic_dispatch_2_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) { |
|
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3 |
|
stream.return %x, %y, %z : index, index, index |
|
} |
|
builtin.module { |
|
// Body: %arg0 is the read-only source binding (dims {%0, %1}), %arg5 the write-only destination (outer dims {%2, %3}). |
// %arg1..%arg4 are workload ordinals 0..3. |
func.func @matmul_dynamic_dispatch_2_pack_f32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: !stream.binding) { |
|
%c0 = arith.constant 0 : index |
|
%cst = arith.constant 0.000000e+00 : f32 |
|
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index |
|
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index |
|
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index |
|
%3 = flow.dispatch.workload.ordinal %arg4, 3 : index |
|
%4 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1} |
|
%5 = stream.binding.subspan %arg5[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%2, %3} |
|
%6 = flow.dispatch.tensor.load %4, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1} -> tensor<?x?xf32> |
|
// %7 and %8 are ceildiv(%1, 16) and ceildiv(%0, 16) per #map; tensor.empty takes them in the order (%8, %7). |
%7 = affine.apply #map()[%1] |
|
%8 = affine.apply #map()[%0] |
|
%9 = tensor.empty(%8, %7) : tensor<?x?x16x16xf32> |
|
// tensor.pack with 16x16 inner tiles and identity outer_dims_perm / inner_dims_pos; %cst (0.0) is the padding value. |
%pack = tensor.pack %6 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %9 : tensor<?x?xf32> -> tensor<?x?x16x16xf32> |
|
flow.dispatch.tensor.store %pack, %5, offsets = [0, 0, 0, 0], sizes = [%2, %3, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%2, %3} |
|
return |
|
} |
|
} |
|
} |
|
// Executable for the mmt4d dispatch: one public export (the workgroup-count region) plus a builtin.module holding the dispatch body. |
stream.executable private @matmul_dynamic_dispatch_3 { |
|
// Export: forwards its six index workload operands to flow.dispatch.workgroup_count_from_slice and returns the resulting (x, y, z) triple. |
stream.executable.export public @matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index) -> (index, index, index) { |
|
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3, %arg4, %arg5 |
|
stream.return %x, %y, %z : index, index, index |
|
} |
|
builtin.module { |
|
// Body: %arg0 and %arg1 are bound read-only as ?x?x16x1xf32, %arg2 is bound read-write as ?x?x16x16xf32. |
// %arg3..%arg8 are workload ordinals 0..5; pairs of them supply the two dynamic outer dims of each binding. |
func.func @matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index) { |
|
%c0 = arith.constant 0 : index |
|
%0 = flow.dispatch.workload.ordinal %arg3, 0 : index |
|
%1 = flow.dispatch.workload.ordinal %arg4, 1 : index |
|
%2 = flow.dispatch.workload.ordinal %arg5, 2 : index |
|
%3 = flow.dispatch.workload.ordinal %arg6, 3 : index |
|
%4 = flow.dispatch.workload.ordinal %arg7, 4 : index |
|
%5 = flow.dispatch.workload.ordinal %arg8, 5 : index |
|
%6 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%0, %1} |
|
%7 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%2, %3} |
|
%8 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5} |
|
// All three loads use zero offsets, unit strides and the full dynamic sizes, i.e. each binding is read in its entirety. |
%9 = flow.dispatch.tensor.load %6, offsets = [0, 0, 0, 0], sizes = [%0, %1, 16, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%0, %1} -> tensor<?x?x16x1xf32> |
|
%10 = flow.dispatch.tensor.load %7, offsets = [0, 0, 0, 0], sizes = [%2, %3, 16, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%2, %3} -> tensor<?x?x16x1xf32> |
|
%11 = flow.dispatch.tensor.load %8, offsets = [0, 0, 0, 0], sizes = [%4, %5, 16, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5} -> tensor<?x?x16x16xf32> |
|
// linalg.mmt4d takes %11 (loaded from the read-write binding %8) as its outs operand; the result is stored back to %8 below. |
%12 = linalg.mmt4d ins(%9, %10 : tensor<?x?x16x1xf32>, tensor<?x?x16x1xf32>) outs(%11 : tensor<?x?x16x16xf32>) -> tensor<?x?x16x16xf32> |
|
flow.dispatch.tensor.store %12, %8, offsets = [0, 0, 0, 0], sizes = [%4, %5, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5} |
|
return |
|
} |
|
} |
|
} |
|
// Executable for the unpack dispatch: converts a ?x?x16x16xf32 tensor back to ?x?xf32. |
stream.executable private @matmul_dynamic_dispatch_4 { |
|
// Export: returns the (x, y, z) workgroup count computed from the four index workload operands. |
stream.executable.export public @matmul_dynamic_dispatch_4_unpack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) { |
|
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3 |
|
stream.return %x, %y, %z : index, index, index |
|
} |
|
builtin.module { |
|
// Body: %arg0 is the read-only source binding, %arg5 the write-only destination; %arg1..%arg4 are workload ordinals 0..3. |
// Ordinals 0 and 1 size the tiled source; ordinals 2 and 3 size the ?x?xf32 destination. |
func.func @matmul_dynamic_dispatch_4_unpack_f32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: !stream.binding) { |
|
%c0 = arith.constant 0 : index |
|
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index |
|
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index |
|
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index |
|
%3 = flow.dispatch.workload.ordinal %arg4, 3 : index |
|
%4 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?x16x16xf32>>{%0, %1} |
|
%5 = stream.binding.subspan %arg5[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%2, %3} |
|
%6 = flow.dispatch.tensor.load %4, offsets = [0, 0, 0, 0], sizes = [%0, %1, 16, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x16xf32>>{%0, %1} -> tensor<?x?x16x16xf32> |
|
// Destination of the unpack: an uninitialized %2 x %3 tensor. |
%7 = tensor.empty(%2, %3) : tensor<?x?xf32> |
|
// tensor.unpack with 16x16 inner tiles and identity outer_dims_perm / inner_dims_pos. |
%unpack = tensor.unpack %6 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %7 : tensor<?x?x16x16xf32> -> tensor<?x?xf32> |
|
flow.dispatch.tensor.store %unpack, %5, offsets = [0, 0], sizes = [%2, %3], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%2, %3} |
|
return |
|
} |
|
} |
|
} |
|
// Host entry point (marked iree.abi.stub). Takes three !hal.buffer_view arguments, imports each as a tensor<?x?xf32> stream resource, |
// chains dispatches 0-4 (pack, pack, pack, mmt4d, unpack) and exports the final resource as a !hal.buffer_view. |
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} { |
|
%c1024 = arith.constant 1024 : index |
|
%c64 = arith.constant 64 : index |
|
%c4 = arith.constant 4 : index |
|
%c0 = arith.constant 0 : index |
|
// input0: query both dims, assert shape / f32 element type / dense_row_major encoding, then import with size %3 = dim0 * %c4 * dim1. |
// NOTE(review): %c4 = 4 is presumably the f32 element size in bytes - confirm. |
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index |
|
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index |
|
%element_type_f32 = hal.element_type<f32> : i32 |
|
%dense_row_major = hal.encoding_type<dense_row_major> : i32 |
|
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1]) type(%element_type_f32) encoding(%dense_row_major) |
|
%2 = arith.muli %0, %c4 : index |
|
%3 = arith.muli %2, %1 : index |
|
%4 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<?x?xf32>{%0, %1} in !stream.resource<external>{%3} |
|
%5 = stream.async.transfer %4 : !stream.resource<external>{%3} -> !stream.resource<*>{%3} |
|
// input1: same dim query / assert / import / transfer sequence; size is %9. |
%6 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index |
|
%7 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index |
|
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%6, %7]) type(%element_type_f32) encoding(%dense_row_major) |
|
%8 = arith.muli %6, %c4 : index |
|
%9 = arith.muli %8, %7 : index |
|
%10 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<?x?xf32>{%6, %7} in !stream.resource<external>{%9} |
|
%11 = stream.async.transfer %10 : !stream.resource<external>{%9} -> !stream.resource<*>{%9} |
|
// input2: same sequence; size is %15. This size is reused for the final result below. |
%12 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index |
|
%13 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index |
|
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("input2") shape([%12, %13]) type(%element_type_f32) encoding(%dense_row_major) |
|
%14 = arith.muli %12, %c4 : index |
|
%15 = arith.muli %14, %13 : index |
|
%16 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<?x?xf32>{%12, %13} in !stream.resource<external>{%15} |
|
%17 = stream.async.transfer %16 : !stream.resource<external>{%15} -> !stream.resource<*>{%15} |
|
// Dispatch 0 packs input0; %18 = ceildiv(%0, 16) per #map and the result size is declared as %20 = %18 * %c64 * %1. |
%18 = affine.apply #map()[%0] |
|
%19 = arith.muli %18, %c64 : index |
|
%20 = arith.muli %19, %1 : index |
|
%21 = stream.async.dispatch @matmul_dynamic_dispatch_0::@matmul_dynamic_dispatch_0_pack_f32[%0, %18, %1](%5[%c0 to %3 for %3], %0, %18, %1) : (!stream.resource<*>{%3}, index, index, index) -> !stream.resource<*>{%20} |
|
// Dispatch 1 packs input1; note the workload starts with %7 (input1 dim 1) and #map is applied to %7. Result size is %24 = %22 * %c64 * %6. |
%22 = affine.apply #map()[%7] |
|
%23 = arith.muli %22, %c64 : index |
|
%24 = arith.muli %23, %6 : index |
|
%25 = stream.async.dispatch @matmul_dynamic_dispatch_1::@matmul_dynamic_dispatch_1_pack_f32[%7, %22, %6](%11[%c0 to %9 for %9], %7, %22, %6) : (!stream.resource<*>{%9}, index, index, index) -> !stream.resource<*>{%24} |
|
// Dispatch 2 packs input2; #map is applied to both of its dims and the result size is %29 = %26 * %c1024 * %27. |
%26 = affine.apply #map()[%12] |
|
%27 = affine.apply #map()[%13] |
|
%28 = arith.muli %26, %c1024 : index |
|
%29 = arith.muli %28, %27 : index |
|
%30 = stream.async.dispatch @matmul_dynamic_dispatch_2::@matmul_dynamic_dispatch_2_pack_f32[%12, %13, %26, %27](%17[%c0 to %15 for %15], %12, %13, %26, %27) : (!stream.resource<*>{%15}, index, index, index, index) -> !stream.resource<*>{%29} |
|
// Dispatch 3 (mmt4d) consumes the three packed resources; its result is written `-> %30{%29}`, i.e. tied to the dispatch-2 output rather than a new resource type. |
%31 = stream.async.dispatch @matmul_dynamic_dispatch_3::@matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32[%18, %1, %22, %6, %26, %27](%21[%c0 to %20 for %20], %25[%c0 to %24 for %24], %30[%c0 to %29 for %29], %18, %1, %22, %6, %26, %27) : (!stream.resource<*>{%20}, !stream.resource<*>{%24}, !stream.resource<*>{%29}, index, index, index, index, index, index) -> %30{%29} |
|
// Dispatch 4 unpacks into a resource of size %15, which is then transferred to external and exported with dims %12 x %13. |
%32 = stream.async.dispatch @matmul_dynamic_dispatch_4::@matmul_dynamic_dispatch_4_unpack_f32[%26, %27, %12, %13](%31[%c0 to %29 for %29], %26, %27, %12, %13) : (!stream.resource<*>{%29}, index, index, index, index) -> !stream.resource<*>{%15} |
|
%33 = stream.async.transfer %32 : !stream.resource<*>{%15} -> !stream.resource<external>{%15} |
|
%34 = stream.tensor.export %33 : tensor<?x?xf32>{%12, %13} in !stream.resource<external>{%15} -> !hal.buffer_view |
|
return %34 : !hal.buffer_view |
|
} |
|
} |
|
|
|
|
|
// -----// IR Dump After IPO (iree-util-ipo) //----- // |
|
// Executable target alias: "llvm-cpu" backend, "embedded-elf-x86_64" format, cpu "znver4" with the listed feature flags, native_vector_size = 64, ukernels = "all". |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "znver4", cpu_features = "+mmx,+popcnt,+sse,+sse2,+sse3,+ssse3,+sse4.1,+sse4.2,+avx,+avx2,+sse4a,+fma,+avx512f,+bmi,+bmi2,+aes,+pclmul,+avx512vl,+avx512bw,+avx512dq,+avx512cd,+avx512vbmi,+avx512ifma,+avx512vpopcntdq,+avx512vbmi2,+gfni,+vpclmulqdq,+avx512vnni,+avx512bitalg,+avx512bf16,+adx,+clflushopt,+clwb,+clzero,+cx16,+cx8,+f16c,+fsgsbase,+crc32,+invpcid,+rdpru,+sahf,+lzcnt,+movbe,+mwaitx,+x87,+pku,+prfchw,+rdpid,+rdrnd,+rdseed,+sha,+shstk,+vaes,+wbnoinvd,+xsave,+xsavec,+xsaveopt,+xsaves,+fxsr,+evex512", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "all"}> |

// #map takes one symbol s0 and yields s0 ceildiv 16. |
#map = affine_map<()[s0] -> (s0 ceildiv 16)> |

// Device target alias: an "llvm-cpu" device whose only executable target is the alias defined above. |
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#executable_target_embedded_elf_x86_64_]}> |
|
module attributes {hal.device.targets = [#device_target_llvm_cpu]} { |
|
// Dispatch 0: packs a dynamically sized 2-D f32 tensor into a ?x?x16x1 tiled tensor (inner tiles 16x1, padded with 0.0). |
stream.executable private @matmul_dynamic_dispatch_0 { |

// Export: the workgroup count (x, y, z) is derived from the three workload values. |
stream.executable.export public @matmul_dynamic_dispatch_0_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) { |

%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2 |

stream.return %x, %y, %z : index, index, index |

} |

builtin.module { |

// Kernel body: %arg0 is bound read-only (source), %arg4 write-only (destination); %arg1..%arg3 are index operands. |
func.func @matmul_dynamic_dispatch_0_pack_f32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: !stream.binding) { |

%c0 = arith.constant 0 : index |

// Padding value used by tensor.pack below. |
%cst = arith.constant 0.000000e+00 : f32 |

// Tag each index operand with its ordinal (0, 1, 2) in the dispatch workload list. |
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index |

%1 = flow.dispatch.workload.ordinal %arg2, 1 : index |

%2 = flow.dispatch.workload.ordinal %arg3, 2 : index |

// Source view at offset 0: ?x? f32 with dynamic dims {%0, %2}. |
%3 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %2} |

// Destination view at offset 0: ?x?x16x1 f32 with dynamic outer dims {%1, %2}. |
%4 = stream.binding.subspan %arg4[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2} |

// Load the whole source tensor. |
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [%0, %2], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %2} -> tensor<?x?xf32> |

// Outer dim 0 of the packed result, computed by applying #map to %0. |
// NOTE(review): the store below sizes that same dim with ordinal %1 instead; presumably the caller passes the same value - confirm. |
%6 = affine.apply #map()[%0] |

%7 = tensor.empty(%6, %2) : tensor<?x?x16x1xf32> |

// Tile dim 0 by 16 and dim 1 by 1, keeping the outer dim order; %cst is the padding value. |
%pack = tensor.pack %5 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 1] into %7 : tensor<?x?xf32> -> tensor<?x?x16x1xf32> |

// Store the full packed tensor to the destination binding. |
flow.dispatch.tensor.store %pack, %4, offsets = [0, 0, 0, 0], sizes = [%1, %2, 16, 1], strides = [1, 1, 1, 1] : tensor<?x?x16x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2} |

return |

} |

} |

} |
|
// Dispatch 1: packs a dynamically sized 2-D f32 tensor into ?x?x16x1 tiles with the outer dims permuted ([1, 0]), padded with 0.0. |
stream.executable private @matmul_dynamic_dispatch_1 { |

// Export: the workgroup count (x, y, z) is derived from the three workload values. |
stream.executable.export public @matmul_dynamic_dispatch_1_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) { |

%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2 |

stream.return %x, %y, %z : index, index, index |

} |

builtin.module { |

// Kernel body: %arg0 is bound read-only (source), %arg4 write-only (destination); %arg1..%arg3 are index operands. |
func.func @matmul_dynamic_dispatch_1_pack_f32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: !stream.binding) { |

%c0 = arith.constant 0 : index |

// Padding value used by tensor.pack below. |
%cst = arith.constant 0.000000e+00 : f32 |

// Tag each index operand with its ordinal (0, 1, 2) in the dispatch workload list. |
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index |

%1 = flow.dispatch.workload.ordinal %arg2, 1 : index |

%2 = flow.dispatch.workload.ordinal %arg3, 2 : index |

// Source view at offset 0: ?x? f32 with dynamic dims {%2, %0} (note the order: ordinal 2 is dim 0, ordinal 0 is dim 1). |
%3 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %0} |

// Destination view at offset 0: ?x?x16x1 f32 with dynamic outer dims {%1, %2}. |
%4 = stream.binding.subspan %arg4[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2} |

// Load the whole source tensor. |
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [%2, %0], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %0} -> tensor<?x?xf32> |

// Leading outer dim of the packed result, computed by applying #map to %0 (the source's dim 1). |
%6 = affine.apply #map()[%0] |

%7 = tensor.empty(%6, %2) : tensor<?x?x16x1xf32> |

// inner_dims_pos = [1, 0] with tiles [16, 1]: source dim 1 is tiled by 16 and dim 0 by 1; outer_dims_perm = [1, 0] swaps the outer dims. |
%pack = tensor.pack %5 padding_value(%cst : f32) outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [16, 1] into %7 : tensor<?x?xf32> -> tensor<?x?x16x1xf32> |

// Store the full packed tensor to the destination binding. |
flow.dispatch.tensor.store %pack, %4, offsets = [0, 0, 0, 0], sizes = [%1, %2, 16, 1], strides = [1, 1, 1, 1] : tensor<?x?x16x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2} |

return |

} |

} |

} |
|
// Dispatch 2: packs a dynamically sized 2-D f32 tensor into a ?x?x16x16 tiled tensor (inner tiles 16x16, padded with 0.0). |
stream.executable private @matmul_dynamic_dispatch_2 { |

// Export: the workgroup count (x, y, z) is derived from the four workload values. |
stream.executable.export public @matmul_dynamic_dispatch_2_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) { |

%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3 |

stream.return %x, %y, %z : index, index, index |

} |

builtin.module { |

// Kernel body: %arg0 is bound read-only (source), %arg5 write-only (destination); %arg1..%arg4 are index operands. |
func.func @matmul_dynamic_dispatch_2_pack_f32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: !stream.binding) { |

%c0 = arith.constant 0 : index |

// Padding value used by tensor.pack below. |
%cst = arith.constant 0.000000e+00 : f32 |

// Tag each index operand with its ordinal (0..3) in the dispatch workload list. |
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index |

%1 = flow.dispatch.workload.ordinal %arg2, 1 : index |

%2 = flow.dispatch.workload.ordinal %arg3, 2 : index |

%3 = flow.dispatch.workload.ordinal %arg4, 3 : index |

// Source view at offset 0: ?x? f32 with dynamic dims {%0, %1}. |
%4 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1} |

// Destination view at offset 0: ?x?x16x16 f32 with dynamic outer dims {%2, %3}. |
%5 = stream.binding.subspan %arg5[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%2, %3} |

// Load the whole source tensor. |
%6 = flow.dispatch.tensor.load %4, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1} -> tensor<?x?xf32> |

// Outer dims of the packed result: #map applied to dim 1 (%7) and to dim 0 (%8). |
%7 = affine.apply #map()[%1] |

%8 = affine.apply #map()[%0] |

%9 = tensor.empty(%8, %7) : tensor<?x?x16x16xf32> |

// Tile both dims by 16, keeping the outer dim order; %cst is the padding value. |
%pack = tensor.pack %6 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %9 : tensor<?x?xf32> -> tensor<?x?x16x16xf32> |

// Store the full packed tensor to the destination binding (outer dims given by ordinals %2, %3). |
flow.dispatch.tensor.store %pack, %5, offsets = [0, 0, 0, 0], sizes = [%2, %3, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%2, %3} |

return |

} |

} |

} |
|
// Dispatch 3: linalg.mmt4d on two packed ?x?x16x1 f32 operands, accumulating into a read-write ?x?x16x16 f32 binding. |
stream.executable private @matmul_dynamic_dispatch_3 { |

// Export: the workgroup count (x, y, z) is derived from the six workload values. |
stream.executable.export public @matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index) -> (index, index, index) { |

%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3, %arg4, %arg5 |

stream.return %x, %y, %z : index, index, index |

} |

builtin.module { |

// Kernel body: %arg0 and %arg1 are bound read-only (inputs), %arg2 read-write (accumulator/result); %arg3..%arg8 are index operands. |
func.func @matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index) { |

%c0 = arith.constant 0 : index |

// Tag each index operand with its ordinal (0..5) in the dispatch workload list. |
%0 = flow.dispatch.workload.ordinal %arg3, 0 : index |

%1 = flow.dispatch.workload.ordinal %arg4, 1 : index |

%2 = flow.dispatch.workload.ordinal %arg5, 2 : index |

%3 = flow.dispatch.workload.ordinal %arg6, 3 : index |

%4 = flow.dispatch.workload.ordinal %arg7, 4 : index |

%5 = flow.dispatch.workload.ordinal %arg8, 5 : index |

// First input view: ?x?x16x1 f32 with dynamic outer dims {%0, %1}. |
%6 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%0, %1} |

// Second input view: ?x?x16x1 f32 with dynamic outer dims {%2, %3}. |
%7 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%2, %3} |

// Accumulator view: ?x?x16x16 f32 with dynamic outer dims {%4, %5}; it is both loaded and stored below. |
%8 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5} |

// Load all three tensors in full. |
%9 = flow.dispatch.tensor.load %6, offsets = [0, 0, 0, 0], sizes = [%0, %1, 16, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%0, %1} -> tensor<?x?x16x1xf32> |

%10 = flow.dispatch.tensor.load %7, offsets = [0, 0, 0, 0], sizes = [%2, %3, 16, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%2, %3} -> tensor<?x?x16x1xf32> |

%11 = flow.dispatch.tensor.load %8, offsets = [0, 0, 0, 0], sizes = [%4, %5, 16, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5} -> tensor<?x?x16x16xf32> |

// mmt4d with %9 and %10 as inputs and the loaded accumulator %11 as the output operand. |
%12 = linalg.mmt4d ins(%9, %10 : tensor<?x?x16x1xf32>, tensor<?x?x16x1xf32>) outs(%11 : tensor<?x?x16x16xf32>) -> tensor<?x?x16x16xf32> |

// Write the result back to the same binding the accumulator was loaded from. |
flow.dispatch.tensor.store %12, %8, offsets = [0, 0, 0, 0], sizes = [%4, %5, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5} |

return |

} |

} |

} |
|
// Dispatch 4: unpacks a ?x?x16x16 tiled f32 tensor back into a plain ?x? f32 tensor. |
stream.executable private @matmul_dynamic_dispatch_4 { |

// Export: the workgroup count (x, y, z) is derived from the four workload values. |
stream.executable.export public @matmul_dynamic_dispatch_4_unpack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) { |

%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3 |

stream.return %x, %y, %z : index, index, index |

} |

builtin.module { |

// Kernel body: %arg0 is bound read-only (packed source), %arg5 write-only (unpacked destination); %arg1..%arg4 are index operands. |
func.func @matmul_dynamic_dispatch_4_unpack_f32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: !stream.binding) { |

%c0 = arith.constant 0 : index |

// Tag each index operand with its ordinal (0..3) in the dispatch workload list. |
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index |

%1 = flow.dispatch.workload.ordinal %arg2, 1 : index |

%2 = flow.dispatch.workload.ordinal %arg3, 2 : index |

%3 = flow.dispatch.workload.ordinal %arg4, 3 : index |

// Source view at offset 0: ?x?x16x16 f32 with dynamic outer dims {%0, %1}. |
%4 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?x16x16xf32>>{%0, %1} |

// Destination view at offset 0: ?x? f32 with dynamic dims {%2, %3}. |
%5 = stream.binding.subspan %arg5[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%2, %3} |

// Load the whole packed tensor. |
%6 = flow.dispatch.tensor.load %4, offsets = [0, 0, 0, 0], sizes = [%0, %1, 16, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x16xf32>>{%0, %1} -> tensor<?x?x16x16xf32> |

// Destination tensor for the unpack, sized {%2, %3}. |
%7 = tensor.empty(%2, %3) : tensor<?x?xf32> |

// Undo the 16x16 tiling into the %2 x %3 destination. |
%unpack = tensor.unpack %6 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %7 : tensor<?x?x16x16xf32> -> tensor<?x?xf32> |

// Store the full unpacked tensor to the destination binding. |
flow.dispatch.tensor.store %unpack, %5, offsets = [0, 0], sizes = [%2, %3], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%2, %3} |

return |

} |

} |

} |
|
// Entry point: imports three buffer views as ?x? f32 tensors, runs dispatches 0-4 (three packs, mmt4d, unpack) and exports the result as a buffer view. |
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} { |

// Size multipliers: %c4 scales the plain f32 tensors, %c64 and %c1024 the packed ones. |
// NOTE(review): 64 and 1024 presumably equal 16*1*4 and 16*16*4 bytes per tile - confirm. |
%c1024 = arith.constant 1024 : index |

%c64 = arith.constant 64 : index |

%c4 = arith.constant 4 : index |

%c0 = arith.constant 0 : index |

// input0: query both dims, assert shape / element type / encoding, then import with size %3 = dim0 * 4 * dim1. |
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index |

%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index |

%element_type_f32 = hal.element_type<f32> : i32 |

%dense_row_major = hal.encoding_type<dense_row_major> : i32 |

hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1]) type(%element_type_f32) encoding(%dense_row_major) |

%2 = arith.muli %0, %c4 : index |

%3 = arith.muli %2, %1 : index |

%4 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<?x?xf32>{%0, %1} in !stream.resource<external>{%3} |

// Transfer from the external resource to a resource of lifetime `*`. |
%5 = stream.async.transfer %4 : !stream.resource<external>{%3} -> !stream.resource<*>{%3} |

// input1: same sequence; size %9 = dim0 * 4 * dim1. |
%6 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index |

%7 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index |

hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%6, %7]) type(%element_type_f32) encoding(%dense_row_major) |

%8 = arith.muli %6, %c4 : index |

%9 = arith.muli %8, %7 : index |

%10 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<?x?xf32>{%6, %7} in !stream.resource<external>{%9} |

%11 = stream.async.transfer %10 : !stream.resource<external>{%9} -> !stream.resource<*>{%9} |

// input2: same sequence; size %15 = dim0 * 4 * dim1. |
%12 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index |

%13 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index |

hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("input2") shape([%12, %13]) type(%element_type_f32) encoding(%dense_row_major) |

%14 = arith.muli %12, %c4 : index |

%15 = arith.muli %14, %13 : index |

%16 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<?x?xf32>{%12, %13} in !stream.resource<external>{%15} |

%17 = stream.async.transfer %16 : !stream.resource<external>{%15} -> !stream.resource<*>{%15} |

// Pack input0 (dispatch 0): %18 = #map applied to its dim 0; result size %20 = %18 * 64 * dim1. |
%18 = affine.apply #map()[%0] |

%19 = arith.muli %18, %c64 : index |

%20 = arith.muli %19, %1 : index |

%21 = stream.async.dispatch @matmul_dynamic_dispatch_0::@matmul_dynamic_dispatch_0_pack_f32[%0, %18, %1](%5[%c0 to %3 for %3], %0, %18, %1) : (!stream.resource<*>{%3}, index, index, index) -> !stream.resource<*>{%20} |

// Pack input1 (dispatch 1): %22 = #map applied to its dim 1; result size %24 = %22 * 64 * dim0. |
%22 = affine.apply #map()[%7] |

%23 = arith.muli %22, %c64 : index |

%24 = arith.muli %23, %6 : index |

%25 = stream.async.dispatch @matmul_dynamic_dispatch_1::@matmul_dynamic_dispatch_1_pack_f32[%7, %22, %6](%11[%c0 to %9 for %9], %7, %22, %6) : (!stream.resource<*>{%9}, index, index, index) -> !stream.resource<*>{%24} |

// Pack input2 (dispatch 2): #map applied to both dims; result size %29 = %26 * 1024 * %27. |
%26 = affine.apply #map()[%12] |

%27 = affine.apply #map()[%13] |

%28 = arith.muli %26, %c1024 : index |

%29 = arith.muli %28, %27 : index |

%30 = stream.async.dispatch @matmul_dynamic_dispatch_2::@matmul_dynamic_dispatch_2_pack_f32[%12, %13, %26, %27](%17[%c0 to %15 for %15], %12, %13, %26, %27) : (!stream.resource<*>{%15}, index, index, index, index) -> !stream.resource<*>{%29} |

// mmt4d (dispatch 3) over the three packed resources; the result is written as `%30{%29}`, i.e. tied to the packed input2 operand rather than a new resource type. |
%31 = stream.async.dispatch @matmul_dynamic_dispatch_3::@matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32[%18, %1, %22, %6, %26, %27](%21[%c0 to %20 for %20], %25[%c0 to %24 for %24], %30[%c0 to %29 for %29], %18, %1, %22, %6, %26, %27) : (!stream.resource<*>{%20}, !stream.resource<*>{%24}, !stream.resource<*>{%29}, index, index, index, index, index, index) -> %30{%29} |

// Unpack (dispatch 4) to a resource of size %15, the same size as input2. |
%32 = stream.async.dispatch @matmul_dynamic_dispatch_4::@matmul_dynamic_dispatch_4_unpack_f32[%26, %27, %12, %13](%31[%c0 to %29 for %29], %26, %27, %12, %13) : (!stream.resource<*>{%29}, index, index, index, index) -> !stream.resource<*>{%15} |

// Transfer back to an external resource and export as a buffer view with input2's dims. |
%33 = stream.async.transfer %32 : !stream.resource<*>{%15} -> !stream.resource<external>{%15} |

%34 = stream.tensor.export %33 : tensor<?x?xf32>{%12, %13} in !stream.resource<external>{%15} -> !hal.buffer_view |

return %34 : !hal.buffer_view |

} |
|
} |
|
|
|
|
|
// -----// IR Dump After VerifyLoweringToAsyncResourcesPass (iree-stream-verify-lowering-to-async-resources) //----- // |
|
// Executable target alias: "llvm-cpu" backend, "embedded-elf-x86_64" format, cpu "znver4" with the listed feature flags, native_vector_size = 64, ukernels = "all". |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "znver4", cpu_features = "+mmx,+popcnt,+sse,+sse2,+sse3,+ssse3,+sse4.1,+sse4.2,+avx,+avx2,+sse4a,+fma,+avx512f,+bmi,+bmi2,+aes,+pclmul,+avx512vl,+avx512bw,+avx512dq,+avx512cd,+avx512vbmi,+avx512ifma,+avx512vpopcntdq,+avx512vbmi2,+gfni,+vpclmulqdq,+avx512vnni,+avx512bitalg,+avx512bf16,+adx,+clflushopt,+clwb,+clzero,+cx16,+cx8,+f16c,+fsgsbase,+crc32,+invpcid,+rdpru,+sahf,+lzcnt,+movbe,+mwaitx,+x87,+pku,+prfchw,+rdpid,+rdrnd,+rdseed,+sha,+shstk,+vaes,+wbnoinvd,+xsave,+xsavec,+xsaveopt,+xsaves,+fxsr,+evex512", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "all"}> |

// #map takes one symbol s0 and yields s0 ceildiv 16. |
#map = affine_map<()[s0] -> (s0 ceildiv 16)> |

// Device target alias: an "llvm-cpu" device whose only executable target is the alias defined above. |
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#executable_target_embedded_elf_x86_64_]}> |
|
module attributes {hal.device.targets = [#device_target_llvm_cpu]} { |
|
// Dispatch 0: packs a dynamically sized 2-D f32 tensor into a ?x?x16x1 tiled tensor (inner tiles 16x1, padded with 0.0). |
stream.executable private @matmul_dynamic_dispatch_0 { |

// Export: the workgroup count (x, y, z) is derived from the three workload values. |
stream.executable.export public @matmul_dynamic_dispatch_0_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) { |

%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2 |

stream.return %x, %y, %z : index, index, index |

} |

builtin.module { |

// Kernel body: %arg0 is bound read-only (source), %arg4 write-only (destination); %arg1..%arg3 are index operands. |
func.func @matmul_dynamic_dispatch_0_pack_f32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: !stream.binding) { |

%c0 = arith.constant 0 : index |

// Padding value used by tensor.pack below. |
%cst = arith.constant 0.000000e+00 : f32 |

// Tag each index operand with its ordinal (0, 1, 2) in the dispatch workload list. |
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index |

%1 = flow.dispatch.workload.ordinal %arg2, 1 : index |

%2 = flow.dispatch.workload.ordinal %arg3, 2 : index |

// Source view at offset 0: ?x? f32 with dynamic dims {%0, %2}. |
%3 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %2} |

// Destination view at offset 0: ?x?x16x1 f32 with dynamic outer dims {%1, %2}. |
%4 = stream.binding.subspan %arg4[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2} |

// Load the whole source tensor. |
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [%0, %2], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %2} -> tensor<?x?xf32> |

// Outer dim 0 of the packed result, computed by applying #map to %0. |
// NOTE(review): the store below sizes that same dim with ordinal %1 instead; presumably the caller passes the same value - confirm. |
%6 = affine.apply #map()[%0] |

%7 = tensor.empty(%6, %2) : tensor<?x?x16x1xf32> |

// Tile dim 0 by 16 and dim 1 by 1, keeping the outer dim order; %cst is the padding value. |
%pack = tensor.pack %5 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 1] into %7 : tensor<?x?xf32> -> tensor<?x?x16x1xf32> |

// Store the full packed tensor to the destination binding. |
flow.dispatch.tensor.store %pack, %4, offsets = [0, 0, 0, 0], sizes = [%1, %2, 16, 1], strides = [1, 1, 1, 1] : tensor<?x?x16x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2} |

return |

} |

} |

} |
|
// Dispatch 1: packs a dynamically sized 2-D f32 tensor into ?x?x16x1 tiles with the outer dims permuted ([1, 0]), padded with 0.0. |
stream.executable private @matmul_dynamic_dispatch_1 { |

// Export: the workgroup count (x, y, z) is derived from the three workload values. |
stream.executable.export public @matmul_dynamic_dispatch_1_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) { |

%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2 |

stream.return %x, %y, %z : index, index, index |

} |

builtin.module { |

// Kernel body: %arg0 is bound read-only (source), %arg4 write-only (destination); %arg1..%arg3 are index operands. |
func.func @matmul_dynamic_dispatch_1_pack_f32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: !stream.binding) { |

%c0 = arith.constant 0 : index |

// Padding value used by tensor.pack below. |
%cst = arith.constant 0.000000e+00 : f32 |

// Tag each index operand with its ordinal (0, 1, 2) in the dispatch workload list. |
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index |

%1 = flow.dispatch.workload.ordinal %arg2, 1 : index |

%2 = flow.dispatch.workload.ordinal %arg3, 2 : index |

// Source view at offset 0: ?x? f32 with dynamic dims {%2, %0} (note the order: ordinal 2 is dim 0, ordinal 0 is dim 1). |
%3 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %0} |

// Destination view at offset 0: ?x?x16x1 f32 with dynamic outer dims {%1, %2}. |
%4 = stream.binding.subspan %arg4[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2} |

// Load the whole source tensor. |
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [%2, %0], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %0} -> tensor<?x?xf32> |

// Leading outer dim of the packed result, computed by applying #map to %0 (the source's dim 1). |
%6 = affine.apply #map()[%0] |

%7 = tensor.empty(%6, %2) : tensor<?x?x16x1xf32> |

// inner_dims_pos = [1, 0] with tiles [16, 1]: source dim 1 is tiled by 16 and dim 0 by 1; outer_dims_perm = [1, 0] swaps the outer dims. |
%pack = tensor.pack %5 padding_value(%cst : f32) outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [16, 1] into %7 : tensor<?x?xf32> -> tensor<?x?x16x1xf32> |

// Store the full packed tensor to the destination binding. |
flow.dispatch.tensor.store %pack, %4, offsets = [0, 0, 0, 0], sizes = [%1, %2, 16, 1], strides = [1, 1, 1, 1] : tensor<?x?x16x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2} |

return |

} |

} |

} |
|
// Dispatch 2: packs a dynamically sized 2-D f32 tensor into a ?x?x16x16 tiled tensor (inner tiles 16x16, padded with 0.0). |
stream.executable private @matmul_dynamic_dispatch_2 { |

// Export: the workgroup count (x, y, z) is derived from the four workload values. |
stream.executable.export public @matmul_dynamic_dispatch_2_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) { |

%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3 |

stream.return %x, %y, %z : index, index, index |

} |

builtin.module { |

// Kernel body: %arg0 is bound read-only (source), %arg5 write-only (destination); %arg1..%arg4 are index operands. |
func.func @matmul_dynamic_dispatch_2_pack_f32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: !stream.binding) { |

%c0 = arith.constant 0 : index |

// Padding value used by tensor.pack below. |
%cst = arith.constant 0.000000e+00 : f32 |

// Tag each index operand with its ordinal (0..3) in the dispatch workload list. |
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index |

%1 = flow.dispatch.workload.ordinal %arg2, 1 : index |

%2 = flow.dispatch.workload.ordinal %arg3, 2 : index |

%3 = flow.dispatch.workload.ordinal %arg4, 3 : index |

// Source view at offset 0: ?x? f32 with dynamic dims {%0, %1}. |
%4 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1} |

// Destination view at offset 0: ?x?x16x16 f32 with dynamic outer dims {%2, %3}. |
%5 = stream.binding.subspan %arg5[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%2, %3} |

// Load the whole source tensor. |
%6 = flow.dispatch.tensor.load %4, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1} -> tensor<?x?xf32> |

// Outer dims of the packed result: #map applied to dim 1 (%7) and to dim 0 (%8). |
%7 = affine.apply #map()[%1] |

%8 = affine.apply #map()[%0] |

%9 = tensor.empty(%8, %7) : tensor<?x?x16x16xf32> |

// Tile both dims by 16, keeping the outer dim order; %cst is the padding value. |
%pack = tensor.pack %6 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %9 : tensor<?x?xf32> -> tensor<?x?x16x16xf32> |

// Store the full packed tensor to the destination binding (outer dims given by ordinals %2, %3). |
flow.dispatch.tensor.store %pack, %5, offsets = [0, 0, 0, 0], sizes = [%2, %3, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%2, %3} |

return |

} |

} |

} |
|
// Dispatch 3: linalg.mmt4d on two packed ?x?x16x1 f32 operands, accumulating into a read-write ?x?x16x16 f32 binding. |
stream.executable private @matmul_dynamic_dispatch_3 { |

// Export: the workgroup count (x, y, z) is derived from the six workload values. |
stream.executable.export public @matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index) -> (index, index, index) { |

%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3, %arg4, %arg5 |

stream.return %x, %y, %z : index, index, index |

} |

builtin.module { |

// Kernel body: %arg0 and %arg1 are bound read-only (inputs), %arg2 read-write (accumulator/result); %arg3..%arg8 are index operands. |
func.func @matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index) { |

%c0 = arith.constant 0 : index |

// Tag each index operand with its ordinal (0..5) in the dispatch workload list. |
%0 = flow.dispatch.workload.ordinal %arg3, 0 : index |

%1 = flow.dispatch.workload.ordinal %arg4, 1 : index |

%2 = flow.dispatch.workload.ordinal %arg5, 2 : index |

%3 = flow.dispatch.workload.ordinal %arg6, 3 : index |

%4 = flow.dispatch.workload.ordinal %arg7, 4 : index |

%5 = flow.dispatch.workload.ordinal %arg8, 5 : index |

// First input view: ?x?x16x1 f32 with dynamic outer dims {%0, %1}. |
%6 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%0, %1} |

// Second input view: ?x?x16x1 f32 with dynamic outer dims {%2, %3}. |
%7 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%2, %3} |

// Accumulator view: ?x?x16x16 f32 with dynamic outer dims {%4, %5}; it is both loaded and stored below. |
%8 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5} |

// Load all three tensors in full. |
%9 = flow.dispatch.tensor.load %6, offsets = [0, 0, 0, 0], sizes = [%0, %1, 16, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%0, %1} -> tensor<?x?x16x1xf32> |

%10 = flow.dispatch.tensor.load %7, offsets = [0, 0, 0, 0], sizes = [%2, %3, 16, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%2, %3} -> tensor<?x?x16x1xf32> |

%11 = flow.dispatch.tensor.load %8, offsets = [0, 0, 0, 0], sizes = [%4, %5, 16, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5} -> tensor<?x?x16x16xf32> |

// mmt4d with %9 and %10 as inputs and the loaded accumulator %11 as the output operand. |
%12 = linalg.mmt4d ins(%9, %10 : tensor<?x?x16x1xf32>, tensor<?x?x16x1xf32>) outs(%11 : tensor<?x?x16x16xf32>) -> tensor<?x?x16x16xf32> |

// Write the result back to the same binding the accumulator was loaded from. |
flow.dispatch.tensor.store %12, %8, offsets = [0, 0, 0, 0], sizes = [%4, %5, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5} |

return |

} |

} |

} |
|
// Dispatch 4: unpacks a ?x?x16x16 tiled f32 tensor back into a plain ?x? f32 tensor. |
stream.executable private @matmul_dynamic_dispatch_4 { |

// Export: the workgroup count (x, y, z) is derived from the four workload values. |
stream.executable.export public @matmul_dynamic_dispatch_4_unpack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) { |

%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3 |

stream.return %x, %y, %z : index, index, index |

} |

builtin.module { |

// Kernel body: %arg0 is bound read-only (packed source), %arg5 write-only (unpacked destination); %arg1..%arg4 are index operands. |
func.func @matmul_dynamic_dispatch_4_unpack_f32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: !stream.binding) { |

%c0 = arith.constant 0 : index |

// Tag each index operand with its ordinal (0..3) in the dispatch workload list. |
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index |

%1 = flow.dispatch.workload.ordinal %arg2, 1 : index |

%2 = flow.dispatch.workload.ordinal %arg3, 2 : index |

%3 = flow.dispatch.workload.ordinal %arg4, 3 : index |

// Source view at offset 0: ?x?x16x16 f32 with dynamic outer dims {%0, %1}. |
%4 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?x16x16xf32>>{%0, %1} |

// Destination view at offset 0: ?x? f32 with dynamic dims {%2, %3}. |
%5 = stream.binding.subspan %arg5[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%2, %3} |

// Load the whole packed tensor. |
%6 = flow.dispatch.tensor.load %4, offsets = [0, 0, 0, 0], sizes = [%0, %1, 16, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x16xf32>>{%0, %1} -> tensor<?x?x16x16xf32> |

// Destination tensor for the unpack, sized {%2, %3}. |
%7 = tensor.empty(%2, %3) : tensor<?x?xf32> |

// Undo the 16x16 tiling into the %2 x %3 destination. |
%unpack = tensor.unpack %6 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %7 : tensor<?x?x16x16xf32> -> tensor<?x?xf32> |

// Store the full unpacked tensor to the destination binding. |
flow.dispatch.tensor.store %unpack, %5, offsets = [0, 0], sizes = [%2, %3], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%2, %3} |

return |

} |

} |

} |
|
// Entry point: imports three buffer views as ?x? f32 tensors, runs dispatches 0-4 (three packs, mmt4d, unpack) and exports the result as a buffer view. |
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} { |

// Size multipliers: %c4 scales the plain f32 tensors, %c64 and %c1024 the packed ones. |
// NOTE(review): 64 and 1024 presumably equal 16*1*4 and 16*16*4 bytes per tile - confirm. |
%c1024 = arith.constant 1024 : index |

%c64 = arith.constant 64 : index |

%c4 = arith.constant 4 : index |

%c0 = arith.constant 0 : index |

// input0: query both dims, assert shape / element type / encoding, then import with size %3 = dim0 * 4 * dim1. |
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index |

%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index |

%element_type_f32 = hal.element_type<f32> : i32 |

%dense_row_major = hal.encoding_type<dense_row_major> : i32 |

hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1]) type(%element_type_f32) encoding(%dense_row_major) |

%2 = arith.muli %0, %c4 : index |

%3 = arith.muli %2, %1 : index |

%4 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<?x?xf32>{%0, %1} in !stream.resource<external>{%3} |

// Transfer from the external resource to a resource of lifetime `*`. |
%5 = stream.async.transfer %4 : !stream.resource<external>{%3} -> !stream.resource<*>{%3} |

// input1: same sequence; size %9 = dim0 * 4 * dim1. |
%6 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index |

%7 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index |

hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%6, %7]) type(%element_type_f32) encoding(%dense_row_major) |

%8 = arith.muli %6, %c4 : index |

%9 = arith.muli %8, %7 : index |

%10 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<?x?xf32>{%6, %7} in !stream.resource<external>{%9} |

%11 = stream.async.transfer %10 : !stream.resource<external>{%9} -> !stream.resource<*>{%9} |

// input2: same sequence; size %15 = dim0 * 4 * dim1. |
%12 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index |

%13 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index |

hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("input2") shape([%12, %13]) type(%element_type_f32) encoding(%dense_row_major) |

%14 = arith.muli %12, %c4 : index |

%15 = arith.muli %14, %13 : index |

%16 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<?x?xf32>{%12, %13} in !stream.resource<external>{%15} |

%17 = stream.async.transfer %16 : !stream.resource<external>{%15} -> !stream.resource<*>{%15} |

// Pack input0 (dispatch 0): %18 = #map applied to its dim 0; result size %20 = %18 * 64 * dim1. |
%18 = affine.apply #map()[%0] |

%19 = arith.muli %18, %c64 : index |

%20 = arith.muli %19, %1 : index |

%21 = stream.async.dispatch @matmul_dynamic_dispatch_0::@matmul_dynamic_dispatch_0_pack_f32[%0, %18, %1](%5[%c0 to %3 for %3], %0, %18, %1) : (!stream.resource<*>{%3}, index, index, index) -> !stream.resource<*>{%20} |

// Pack input1 (dispatch 1): %22 = #map applied to its dim 1; result size %24 = %22 * 64 * dim0. |
%22 = affine.apply #map()[%7] |

%23 = arith.muli %22, %c64 : index |

%24 = arith.muli %23, %6 : index |

%25 = stream.async.dispatch @matmul_dynamic_dispatch_1::@matmul_dynamic_dispatch_1_pack_f32[%7, %22, %6](%11[%c0 to %9 for %9], %7, %22, %6) : (!stream.resource<*>{%9}, index, index, index) -> !stream.resource<*>{%24} |

// Pack input2 (dispatch 2): #map applied to both dims; result size %29 = %26 * 1024 * %27. |
%26 = affine.apply #map()[%12] |

%27 = affine.apply #map()[%13] |

%28 = arith.muli %26, %c1024 : index |

%29 = arith.muli %28, %27 : index |

%30 = stream.async.dispatch @matmul_dynamic_dispatch_2::@matmul_dynamic_dispatch_2_pack_f32[%12, %13, %26, %27](%17[%c0 to %15 for %15], %12, %13, %26, %27) : (!stream.resource<*>{%15}, index, index, index, index) -> !stream.resource<*>{%29} |

// mmt4d (dispatch 3) over the three packed resources; the result is written as `%30{%29}`, i.e. tied to the packed input2 operand rather than a new resource type. |
%31 = stream.async.dispatch @matmul_dynamic_dispatch_3::@matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32[%18, %1, %22, %6, %26, %27](%21[%c0 to %20 for %20], %25[%c0 to %24 for %24], %30[%c0 to %29 for %29], %18, %1, %22, %6, %26, %27) : (!stream.resource<*>{%20}, !stream.resource<*>{%24}, !stream.resource<*>{%29}, index, index, index, index, index, index) -> %30{%29} |

// Unpack (dispatch 4) to a resource of size %15, the same size as input2. |
%32 = stream.async.dispatch @matmul_dynamic_dispatch_4::@matmul_dynamic_dispatch_4_unpack_f32[%26, %27, %12, %13](%31[%c0 to %29 for %29], %26, %27, %12, %13) : (!stream.resource<*>{%29}, index, index, index, index) -> !stream.resource<*>{%15} |

// Transfer back to an external resource and export as a buffer view with input2's dims. |
%33 = stream.async.transfer %32 : !stream.resource<*>{%15} -> !stream.resource<external>{%15} |

%34 = stream.tensor.export %33 : tensor<?x?xf32>{%12, %13} in !stream.resource<external>{%15} -> !hal.buffer_view |

return %34 : !hal.buffer_view |

} |
|
} |
|
|
|
|
|
// -----// IR Dump After MaterializeCopyOnWritePass (iree-stream-materialize-copy-on-write) //----- // |
|
// Entry point: imports three buffer views as ?x? f32 tensors, runs dispatches 0-4 (three packs, mmt4d, unpack) and exports the result as a buffer view. |
// In this function-scoped dump the affine map (s0 ceildiv 16) is printed inline at each affine.apply. |
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} { |

// Size multipliers: %c4 scales the plain f32 tensors, %c64 and %c1024 the packed ones. |
// NOTE(review): 64 and 1024 presumably equal 16*1*4 and 16*16*4 bytes per tile - confirm. |
%c1024 = arith.constant 1024 : index |

%c64 = arith.constant 64 : index |

%c4 = arith.constant 4 : index |

%c0 = arith.constant 0 : index |

// input0: query both dims, assert shape / element type / encoding, then import with size %3 = dim0 * 4 * dim1. |
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index |

%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index |

%element_type_f32 = hal.element_type<f32> : i32 |

%dense_row_major = hal.encoding_type<dense_row_major> : i32 |

hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1]) type(%element_type_f32) encoding(%dense_row_major) |

%2 = arith.muli %0, %c4 : index |

%3 = arith.muli %2, %1 : index |

%4 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<?x?xf32>{%0, %1} in !stream.resource<external>{%3} |

// Transfer from the external resource to a resource of lifetime `*`. |
%5 = stream.async.transfer %4 : !stream.resource<external>{%3} -> !stream.resource<*>{%3} |

// input1: same sequence; size %9 = dim0 * 4 * dim1. |
%6 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index |

%7 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index |

hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%6, %7]) type(%element_type_f32) encoding(%dense_row_major) |

%8 = arith.muli %6, %c4 : index |

%9 = arith.muli %8, %7 : index |

%10 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<?x?xf32>{%6, %7} in !stream.resource<external>{%9} |

%11 = stream.async.transfer %10 : !stream.resource<external>{%9} -> !stream.resource<*>{%9} |

// input2: same sequence; size %15 = dim0 * 4 * dim1. |
%12 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index |

%13 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index |

hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("input2") shape([%12, %13]) type(%element_type_f32) encoding(%dense_row_major) |

%14 = arith.muli %12, %c4 : index |

%15 = arith.muli %14, %13 : index |

%16 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<?x?xf32>{%12, %13} in !stream.resource<external>{%15} |

%17 = stream.async.transfer %16 : !stream.resource<external>{%15} -> !stream.resource<*>{%15} |

// Pack input0 (dispatch 0): %18 = dim0 ceildiv 16; result size %20 = %18 * 64 * dim1. |
%18 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%0] |

%19 = arith.muli %18, %c64 : index |

%20 = arith.muli %19, %1 : index |

%21 = stream.async.dispatch @matmul_dynamic_dispatch_0::@matmul_dynamic_dispatch_0_pack_f32[%0, %18, %1](%5[%c0 to %3 for %3], %0, %18, %1) : (!stream.resource<*>{%3}, index, index, index) -> !stream.resource<*>{%20} |

// Pack input1 (dispatch 1): %22 = dim1 ceildiv 16; result size %24 = %22 * 64 * dim0. |
%22 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%7] |

%23 = arith.muli %22, %c64 : index |

%24 = arith.muli %23, %6 : index |

%25 = stream.async.dispatch @matmul_dynamic_dispatch_1::@matmul_dynamic_dispatch_1_pack_f32[%7, %22, %6](%11[%c0 to %9 for %9], %7, %22, %6) : (!stream.resource<*>{%9}, index, index, index) -> !stream.resource<*>{%24} |

// Pack input2 (dispatch 2): both dims ceildiv 16; result size %29 = %26 * 1024 * %27. |
%26 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%12] |

%27 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%13] |

%28 = arith.muli %26, %c1024 : index |

%29 = arith.muli %28, %27 : index |

%30 = stream.async.dispatch @matmul_dynamic_dispatch_2::@matmul_dynamic_dispatch_2_pack_f32[%12, %13, %26, %27](%17[%c0 to %15 for %15], %12, %13, %26, %27) : (!stream.resource<*>{%15}, index, index, index, index) -> !stream.resource<*>{%29} |

// mmt4d (dispatch 3) over the three packed resources; the result is written as `%30{%29}`, i.e. tied to the packed input2 operand rather than a new resource type. |
%31 = stream.async.dispatch @matmul_dynamic_dispatch_3::@matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32[%18, %1, %22, %6, %26, %27](%21[%c0 to %20 for %20], %25[%c0 to %24 for %24], %30[%c0 to %29 for %29], %18, %1, %22, %6, %26, %27) : (!stream.resource<*>{%20}, !stream.resource<*>{%24}, !stream.resource<*>{%29}, index, index, index, index, index, index) -> %30{%29} |

// Unpack (dispatch 4) to a resource of size %15, the same size as input2. |
%32 = stream.async.dispatch @matmul_dynamic_dispatch_4::@matmul_dynamic_dispatch_4_unpack_f32[%26, %27, %12, %13](%31[%c0 to %29 for %29], %26, %27, %12, %13) : (!stream.resource<*>{%29}, index, index, index, index) -> !stream.resource<*>{%15} |

// Transfer back to an external resource and export as a buffer view with input2's dims. |
%33 = stream.async.transfer %32 : !stream.resource<*>{%15} -> !stream.resource<external>{%15} |

%34 = stream.tensor.export %33 : tensor<?x?xf32>{%12, %13} in !stream.resource<external>{%15} -> !hal.buffer_view |

return %34 : !hal.buffer_view |

} |
|
|
|
// -----// IR Dump After ElideAsyncCopiesPass (iree-stream-elide-async-copies) //----- // |
|
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "znver4", cpu_features = "+mmx,+popcnt,+sse,+sse2,+sse3,+ssse3,+sse4.1,+sse4.2,+avx,+avx2,+sse4a,+fma,+avx512f,+bmi,+bmi2,+aes,+pclmul,+avx512vl,+avx512bw,+avx512dq,+avx512cd,+avx512vbmi,+avx512ifma,+avx512vpopcntdq,+avx512vbmi2,+gfni,+vpclmulqdq,+avx512vnni,+avx512bitalg,+avx512bf16,+adx,+clflushopt,+clwb,+clzero,+cx16,+cx8,+f16c,+fsgsbase,+crc32,+invpcid,+rdpru,+sahf,+lzcnt,+movbe,+mwaitx,+x87,+pku,+prfchw,+rdpid,+rdrnd,+rdseed,+sha,+shstk,+vaes,+wbnoinvd,+xsave,+xsavec,+xsaveopt,+xsaves,+fxsr,+evex512", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "all"}> |
|
#map = affine_map<()[s0] -> (s0 ceildiv 16)> |
|
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#executable_target_embedded_elf_x86_64_]}> |
|
module attributes {hal.device.targets = [#device_target_llvm_cpu]} { |
|
// Executable for dispatch 0: packs a dynamically shaped 2-D f32 tensor into a ?x?x16x1 tiled layout. |
// Contains one export (workgroup-count region) and one function with the dispatch body. |

stream.executable private @matmul_dynamic_dispatch_0 { |

// Export region: derives the x/y/z workgroup counts from the three workload values. |

stream.executable.export public @matmul_dynamic_dispatch_0_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) { |

%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2 |

stream.return %x, %y, %z : index, index, index |

} |

builtin.module { |

// %arg0: input binding (read-only), %arg4: output binding (write-only); %arg1..%arg3 are workload ordinals 0..2. |

func.func @matmul_dynamic_dispatch_0_pack_f32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: !stream.binding) { |

%c0 = arith.constant 0 : index |

// Zero used as the padding value for partial tiles in the pack below. |

%cst = arith.constant 0.000000e+00 : f32 |

%0 = flow.dispatch.workload.ordinal %arg1, 0 : index |

%1 = flow.dispatch.workload.ordinal %arg2, 1 : index |

%2 = flow.dispatch.workload.ordinal %arg3, 2 : index |

// Source view has dynamic dims {%0, %2}; destination view has dynamic outer dims {%1, %2}. |

%3 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %2} |

%4 = stream.binding.subspan %arg4[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2} |

// Load the whole source tensor (zero offsets, unit strides). |

%5 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [%0, %2], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %2} -> tensor<?x?xf32> |

// #map is the module-level alias for (s0 ceildiv 16): number of 16-row tiles along dim 0. |
// NOTE(review): the store below sizes the same dim with %1; presumably the caller passes ceildiv(%0, 16) as ordinal 1 - confirm. |

%6 = affine.apply #map()[%0] |

%7 = tensor.empty(%6, %2) : tensor<?x?x16x1xf32> |

// Tile dim 0 by 16 and dim 1 by 1, keeping the outer dim order, padding with %cst. |

%pack = tensor.pack %5 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 1] into %7 : tensor<?x?xf32> -> tensor<?x?x16x1xf32> |

flow.dispatch.tensor.store %pack, %4, offsets = [0, 0, 0, 0], sizes = [%1, %2, 16, 1], strides = [1, 1, 1, 1] : tensor<?x?x16x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2} |

return |

} |

} |

} |
|
// Executable for dispatch 1: packs a dynamically shaped 2-D f32 tensor into ?x?x16x1 tiles with the outer dims transposed. |

stream.executable private @matmul_dynamic_dispatch_1 { |

// Export region: derives the x/y/z workgroup counts from the three workload values. |

stream.executable.export public @matmul_dynamic_dispatch_1_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) { |

%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2 |

stream.return %x, %y, %z : index, index, index |

} |

builtin.module { |

// %arg0: input binding (read-only), %arg4: output binding (write-only); %arg1..%arg3 are workload ordinals 0..2. |

func.func @matmul_dynamic_dispatch_1_pack_f32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: !stream.binding) { |

%c0 = arith.constant 0 : index |

// Zero used as the padding value for partial tiles in the pack below. |

%cst = arith.constant 0.000000e+00 : f32 |

%0 = flow.dispatch.workload.ordinal %arg1, 0 : index |

%1 = flow.dispatch.workload.ordinal %arg2, 1 : index |

%2 = flow.dispatch.workload.ordinal %arg3, 2 : index |

// Source view has dynamic dims {%2, %0} (note the order); destination view has dynamic outer dims {%1, %2}. |

%3 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %0} |

%4 = stream.binding.subspan %arg4[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2} |

// Load the whole source tensor (zero offsets, unit strides). |

%5 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [%2, %0], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %0} -> tensor<?x?xf32> |

// #map is the module-level alias for (s0 ceildiv 16): number of 16-wide tiles along source dim 1 (size %0). |
// NOTE(review): the store below sizes the same dim with %1; presumably the caller passes ceildiv(%0, 16) as ordinal 1 - confirm. |

%6 = affine.apply #map()[%0] |

%7 = tensor.empty(%6, %2) : tensor<?x?x16x1xf32> |

// Tile source dim 1 by 16 and source dim 0 by 1, swap the outer dims ([1, 0]), padding with %cst. |

%pack = tensor.pack %5 padding_value(%cst : f32) outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [16, 1] into %7 : tensor<?x?xf32> -> tensor<?x?x16x1xf32> |

flow.dispatch.tensor.store %pack, %4, offsets = [0, 0, 0, 0], sizes = [%1, %2, 16, 1], strides = [1, 1, 1, 1] : tensor<?x?x16x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2} |

return |

} |

} |

} |
|
// Executable for dispatch 2: packs a dynamically shaped 2-D f32 tensor into a ?x?x16x16 tiled layout. |

stream.executable private @matmul_dynamic_dispatch_2 { |

// Export region: derives the x/y/z workgroup counts from the four workload values. |

stream.executable.export public @matmul_dynamic_dispatch_2_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) { |

%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3 |

stream.return %x, %y, %z : index, index, index |

} |

builtin.module { |

// %arg0: input binding (read-only), %arg5: output binding (write-only); %arg1..%arg4 are workload ordinals 0..3. |

func.func @matmul_dynamic_dispatch_2_pack_f32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: !stream.binding) { |

%c0 = arith.constant 0 : index |

// Zero used as the padding value for partial tiles in the pack below. |

%cst = arith.constant 0.000000e+00 : f32 |

%0 = flow.dispatch.workload.ordinal %arg1, 0 : index |

%1 = flow.dispatch.workload.ordinal %arg2, 1 : index |

%2 = flow.dispatch.workload.ordinal %arg3, 2 : index |

%3 = flow.dispatch.workload.ordinal %arg4, 3 : index |

// Source view has dynamic dims {%0, %1}; destination view has dynamic outer dims {%2, %3}. |

%4 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1} |

%5 = stream.binding.subspan %arg5[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%2, %3} |

// Load the whole source tensor (zero offsets, unit strides). |

%6 = flow.dispatch.tensor.load %4, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1} -> tensor<?x?xf32> |

// Tile counts along each source dim: #map is the module-level alias for (s0 ceildiv 16). |
// NOTE(review): the store below sizes the outer dims with %2/%3; presumably the caller passes these same ceildiv values - confirm. |

%7 = affine.apply #map()[%1] |

%8 = affine.apply #map()[%0] |

%9 = tensor.empty(%8, %7) : tensor<?x?x16x16xf32> |

// Tile both dims by 16, keeping the outer dim order, padding with %cst. |

%pack = tensor.pack %6 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %9 : tensor<?x?xf32> -> tensor<?x?x16x16xf32> |

flow.dispatch.tensor.store %pack, %5, offsets = [0, 0, 0, 0], sizes = [%2, %3, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%2, %3} |

return |

} |

} |

} |
|
// Executable for dispatch 3: linalg.mmt4d over two packed ?x?x16x1 operands, accumulating into a packed ?x?x16x16 tensor. |

stream.executable private @matmul_dynamic_dispatch_3 { |

// Export region: derives the x/y/z workgroup counts from the six workload values. |

stream.executable.export public @matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index) -> (index, index, index) { |

%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3, %arg4, %arg5 |

stream.return %x, %y, %z : index, index, index |

} |

builtin.module { |

// %arg0/%arg1: read-only operand bindings; %arg2: read-write accumulator binding; %arg3..%arg8 are workload ordinals 0..5. |

func.func @matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index) { |

%c0 = arith.constant 0 : index |

%0 = flow.dispatch.workload.ordinal %arg3, 0 : index |

%1 = flow.dispatch.workload.ordinal %arg4, 1 : index |

%2 = flow.dispatch.workload.ordinal %arg5, 2 : index |

%3 = flow.dispatch.workload.ordinal %arg6, 3 : index |

%4 = flow.dispatch.workload.ordinal %arg7, 4 : index |

%5 = flow.dispatch.workload.ordinal %arg8, 5 : index |

// Ordinal pairs give the dynamic outer dims of each view: {%0, %1}, {%2, %3} and {%4, %5}. |

%6 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%0, %1} |

%7 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%2, %3} |

%8 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5} |

// Load all three tensors in full (zero offsets, unit strides). |

%9 = flow.dispatch.tensor.load %6, offsets = [0, 0, 0, 0], sizes = [%0, %1, 16, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%0, %1} -> tensor<?x?x16x1xf32> |

%10 = flow.dispatch.tensor.load %7, offsets = [0, 0, 0, 0], sizes = [%2, %3, 16, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%2, %3} -> tensor<?x?x16x1xf32> |

%11 = flow.dispatch.tensor.load %8, offsets = [0, 0, 0, 0], sizes = [%4, %5, 16, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5} -> tensor<?x?x16x16xf32> |

// The loaded accumulator %11 is the outs operand; the result is written back to the same binding it was loaded from. |

%12 = linalg.mmt4d ins(%9, %10 : tensor<?x?x16x1xf32>, tensor<?x?x16x1xf32>) outs(%11 : tensor<?x?x16x16xf32>) -> tensor<?x?x16x16xf32> |

flow.dispatch.tensor.store %12, %8, offsets = [0, 0, 0, 0], sizes = [%4, %5, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5} |

return |

} |

} |

} |
|
// Executable for dispatch 4: unpacks a ?x?x16x16 tiled f32 tensor back into a plain 2-D tensor. |

stream.executable private @matmul_dynamic_dispatch_4 { |

// Export region: derives the x/y/z workgroup counts from the four workload values. |

stream.executable.export public @matmul_dynamic_dispatch_4_unpack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) { |

%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3 |

stream.return %x, %y, %z : index, index, index |

} |

builtin.module { |

// %arg0: input binding (read-only), %arg5: output binding (write-only); %arg1..%arg4 are workload ordinals 0..3. |

func.func @matmul_dynamic_dispatch_4_unpack_f32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: !stream.binding) { |

%c0 = arith.constant 0 : index |

%0 = flow.dispatch.workload.ordinal %arg1, 0 : index |

%1 = flow.dispatch.workload.ordinal %arg2, 1 : index |

%2 = flow.dispatch.workload.ordinal %arg3, 2 : index |

%3 = flow.dispatch.workload.ordinal %arg4, 3 : index |

// Packed source has dynamic outer dims {%0, %1}; unpacked destination has dynamic dims {%2, %3}. |

%4 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?x16x16xf32>>{%0, %1} |

%5 = stream.binding.subspan %arg5[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%2, %3} |

// Load the whole packed tensor (zero offsets, unit strides). |

%6 = flow.dispatch.tensor.load %4, offsets = [0, 0, 0, 0], sizes = [%0, %1, 16, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x16xf32>>{%0, %1} -> tensor<?x?x16x16xf32> |

// Destination of the unpack is sized by the unpacked dims %2 x %3. |

%7 = tensor.empty(%2, %3) : tensor<?x?xf32> |

%unpack = tensor.unpack %6 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %7 : tensor<?x?x16x16xf32> -> tensor<?x?xf32> |

flow.dispatch.tensor.store %unpack, %5, offsets = [0, 0], sizes = [%2, %3], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%2, %3} |

return |

} |

} |

} |
|
// Entry point @matmul_dynamic: imports three buffer views, packs each one (dispatches 0-2), runs the mmt4d |
// dispatch (3) with its result tied to the packed third operand, unpacks (dispatch 4) and exports the result. |

func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} { |

// Multipliers used in the size computations below. NOTE(review): presumably 4 = bytes per f32, |
// 64 = 16x1 tile * 4 bytes, 1024 = 16x16 tile * 4 bytes - confirm. |

%c1024 = arith.constant 1024 : index |

%c64 = arith.constant 64 : index |

%c4 = arith.constant 4 : index |

%c0 = arith.constant 0 : index |

// Input 0: query both dims, assert shape / element type / encoding, then import and transfer. |

%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index |

%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index |

%element_type_f32 = hal.element_type<f32> : i32 |

%dense_row_major = hal.encoding_type<dense_row_major> : i32 |

hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1]) type(%element_type_f32) encoding(%dense_row_major) |

// Resource size %3 = dim0 * 4 * dim1. |

%2 = arith.muli %0, %c4 : index |

%3 = arith.muli %2, %1 : index |

%4 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<?x?xf32>{%0, %1} in !stream.resource<external>{%3} |

%5 = stream.async.transfer %4 : !stream.resource<external>{%3} -> !stream.resource<*>{%3} |

// Input 1: same sequence; resource size %9 = dim0 * 4 * dim1. |

%6 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index |

%7 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index |

hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%6, %7]) type(%element_type_f32) encoding(%dense_row_major) |

%8 = arith.muli %6, %c4 : index |

%9 = arith.muli %8, %7 : index |

%10 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<?x?xf32>{%6, %7} in !stream.resource<external>{%9} |

%11 = stream.async.transfer %10 : !stream.resource<external>{%9} -> !stream.resource<*>{%9} |

// Input 2: same sequence; resource size %15 = dim0 * 4 * dim1. %15 is also the size of the final result. |

%12 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index |

%13 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index |

hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("input2") shape([%12, %13]) type(%element_type_f32) encoding(%dense_row_major) |

%14 = arith.muli %12, %c4 : index |

%15 = arith.muli %14, %13 : index |

%16 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<?x?xf32>{%12, %13} in !stream.resource<external>{%15} |

%17 = stream.async.transfer %16 : !stream.resource<external>{%15} -> !stream.resource<*>{%15} |

// Pack input 0: result size %20 = ceildiv(%0, 16) * 64 * %1 (#map is the alias for s0 ceildiv 16). |

%18 = affine.apply #map()[%0] |

%19 = arith.muli %18, %c64 : index |

%20 = arith.muli %19, %1 : index |

%21 = stream.async.dispatch @matmul_dynamic_dispatch_0::@matmul_dynamic_dispatch_0_pack_f32[%0, %18, %1](%5[%c0 to %3 for %3], %0, %18, %1) : (!stream.resource<*>{%3}, index, index, index) -> !stream.resource<*>{%20} |

// Pack input 1: tiles along its dim 1 (%7); result size %24 = ceildiv(%7, 16) * 64 * %6. |

%22 = affine.apply #map()[%7] |

%23 = arith.muli %22, %c64 : index |

%24 = arith.muli %23, %6 : index |

%25 = stream.async.dispatch @matmul_dynamic_dispatch_1::@matmul_dynamic_dispatch_1_pack_f32[%7, %22, %6](%11[%c0 to %9 for %9], %7, %22, %6) : (!stream.resource<*>{%9}, index, index, index) -> !stream.resource<*>{%24} |

// Pack input 2: result size %29 = ceildiv(%12, 16) * 1024 * ceildiv(%13, 16). |

%26 = affine.apply #map()[%12] |

%27 = affine.apply #map()[%13] |

%28 = arith.muli %26, %c1024 : index |

%29 = arith.muli %28, %27 : index |

%30 = stream.async.dispatch @matmul_dynamic_dispatch_2::@matmul_dynamic_dispatch_2_pack_f32[%12, %13, %26, %27](%17[%c0 to %15 for %15], %12, %13, %26, %27) : (!stream.resource<*>{%15}, index, index, index, index) -> !stream.resource<*>{%29} |

// mmt4d over the three packed resources; the result is tied to operand %30 ("-> %30{%29}"). |

%31 = stream.async.dispatch @matmul_dynamic_dispatch_3::@matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32[%18, %1, %22, %6, %26, %27](%21[%c0 to %20 for %20], %25[%c0 to %24 for %24], %30[%c0 to %29 for %29], %18, %1, %22, %6, %26, %27) : (!stream.resource<*>{%20}, !stream.resource<*>{%24}, !stream.resource<*>{%29}, index, index, index, index, index, index) -> %30{%29} |

// Unpack back to a %12 x %13 tensor occupying %15 bytes. |

%32 = stream.async.dispatch @matmul_dynamic_dispatch_4::@matmul_dynamic_dispatch_4_unpack_f32[%26, %27, %12, %13](%31[%c0 to %29 for %29], %26, %27, %12, %13) : (!stream.resource<*>{%29}, index, index, index, index) -> !stream.resource<*>{%15} |

// Transfer the result to an external resource and export it as a buffer view. |

%33 = stream.async.transfer %32 : !stream.resource<*>{%15} -> !stream.resource<external>{%15} |

%34 = stream.tensor.export %33 : tensor<?x?xf32>{%12, %13} in !stream.resource<external>{%15} -> !hal.buffer_view |

return %34 : !hal.buffer_view |

} |
|
} |
|
|
|
|
|
// -----// IR Dump After Canonicalizer (canonicalize) //----- // |
|
// Entry point @matmul_dynamic: imports three buffer views, packs each one (dispatches 0-2), runs the mmt4d |
// dispatch (3) with its result tied to the packed third operand, unpacks (dispatch 4) and exports the result. |

func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} { |

// Multipliers used in the size computations below. NOTE(review): presumably 4 = bytes per f32, |
// 64 = 16x1 tile * 4 bytes, 1024 = 16x16 tile * 4 bytes - confirm. |

%c1024 = arith.constant 1024 : index |

%c64 = arith.constant 64 : index |

%c4 = arith.constant 4 : index |

%c0 = arith.constant 0 : index |

// Input 0: query both dims, assert shape / element type / encoding, then import and transfer. |

%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index |

%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index |

%element_type_f32 = hal.element_type<f32> : i32 |

%dense_row_major = hal.encoding_type<dense_row_major> : i32 |

hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1]) type(%element_type_f32) encoding(%dense_row_major) |

// Resource size %3 = dim0 * 4 * dim1. |

%2 = arith.muli %0, %c4 : index |

%3 = arith.muli %2, %1 : index |

%4 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<?x?xf32>{%0, %1} in !stream.resource<external>{%3} |

%5 = stream.async.transfer %4 : !stream.resource<external>{%3} -> !stream.resource<*>{%3} |

// Input 1: same sequence; resource size %9 = dim0 * 4 * dim1. |

%6 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index |

%7 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index |

hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%6, %7]) type(%element_type_f32) encoding(%dense_row_major) |

%8 = arith.muli %6, %c4 : index |

%9 = arith.muli %8, %7 : index |

%10 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<?x?xf32>{%6, %7} in !stream.resource<external>{%9} |

%11 = stream.async.transfer %10 : !stream.resource<external>{%9} -> !stream.resource<*>{%9} |

// Input 2: same sequence; resource size %15 = dim0 * 4 * dim1. %15 is also the size of the final result. |

%12 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index |

%13 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index |

hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("input2") shape([%12, %13]) type(%element_type_f32) encoding(%dense_row_major) |

%14 = arith.muli %12, %c4 : index |

%15 = arith.muli %14, %13 : index |

%16 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<?x?xf32>{%12, %13} in !stream.resource<external>{%15} |

%17 = stream.async.transfer %16 : !stream.resource<external>{%15} -> !stream.resource<*>{%15} |

// Pack input 0: result size %20 = ceildiv(%0, 16) * 64 * %1. |

%18 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%0] |

%19 = arith.muli %18, %c64 : index |

%20 = arith.muli %19, %1 : index |

%21 = stream.async.dispatch @matmul_dynamic_dispatch_0::@matmul_dynamic_dispatch_0_pack_f32[%0, %18, %1](%5[%c0 to %3 for %3], %0, %18, %1) : (!stream.resource<*>{%3}, index, index, index) -> !stream.resource<*>{%20} |

// Pack input 1: tiles along its dim 1 (%7); result size %24 = ceildiv(%7, 16) * 64 * %6. |

%22 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%7] |

%23 = arith.muli %22, %c64 : index |

%24 = arith.muli %23, %6 : index |

%25 = stream.async.dispatch @matmul_dynamic_dispatch_1::@matmul_dynamic_dispatch_1_pack_f32[%7, %22, %6](%11[%c0 to %9 for %9], %7, %22, %6) : (!stream.resource<*>{%9}, index, index, index) -> !stream.resource<*>{%24} |

// Pack input 2: result size %29 = ceildiv(%12, 16) * 1024 * ceildiv(%13, 16). |

%26 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%12] |

%27 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%13] |

%28 = arith.muli %26, %c1024 : index |

%29 = arith.muli %28, %27 : index |

%30 = stream.async.dispatch @matmul_dynamic_dispatch_2::@matmul_dynamic_dispatch_2_pack_f32[%12, %13, %26, %27](%17[%c0 to %15 for %15], %12, %13, %26, %27) : (!stream.resource<*>{%15}, index, index, index, index) -> !stream.resource<*>{%29} |

// mmt4d over the three packed resources; the result is tied to operand %30 ("-> %30{%29}"). |

%31 = stream.async.dispatch @matmul_dynamic_dispatch_3::@matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32[%18, %1, %22, %6, %26, %27](%21[%c0 to %20 for %20], %25[%c0 to %24 for %24], %30[%c0 to %29 for %29], %18, %1, %22, %6, %26, %27) : (!stream.resource<*>{%20}, !stream.resource<*>{%24}, !stream.resource<*>{%29}, index, index, index, index, index, index) -> %30{%29} |

// Unpack back to a %12 x %13 tensor occupying %15 bytes. |

%32 = stream.async.dispatch @matmul_dynamic_dispatch_4::@matmul_dynamic_dispatch_4_unpack_f32[%26, %27, %12, %13](%31[%c0 to %29 for %29], %26, %27, %12, %13) : (!stream.resource<*>{%29}, index, index, index, index) -> !stream.resource<*>{%15} |

// Transfer the result to an external resource and export it as a buffer view. |

%33 = stream.async.transfer %32 : !stream.resource<*>{%15} -> !stream.resource<external>{%15} |

%34 = stream.tensor.export %33 : tensor<?x?xf32>{%12, %13} in !stream.resource<external>{%15} -> !hal.buffer_view |

return %34 : !hal.buffer_view |

} |
|
|
|
// -----// IR Dump After EmplaceAllocationsPass (iree-stream-emplace-allocations) //----- // |
|
// Entry point @matmul_dynamic: imports three buffer views, packs each one (dispatches 0-2), runs the mmt4d |
// dispatch (3) with its result tied to the packed third operand, unpacks (dispatch 4) and exports the result. |

func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} { |

// Multipliers used in the size computations below. NOTE(review): presumably 4 = bytes per f32, |
// 64 = 16x1 tile * 4 bytes, 1024 = 16x16 tile * 4 bytes - confirm. |

%c1024 = arith.constant 1024 : index |

%c64 = arith.constant 64 : index |

%c4 = arith.constant 4 : index |

%c0 = arith.constant 0 : index |

// Input 0: query both dims, assert shape / element type / encoding, then import and transfer. |

%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index |

%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index |

%element_type_f32 = hal.element_type<f32> : i32 |

%dense_row_major = hal.encoding_type<dense_row_major> : i32 |

hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1]) type(%element_type_f32) encoding(%dense_row_major) |

// Resource size %3 = dim0 * 4 * dim1. |

%2 = arith.muli %0, %c4 : index |

%3 = arith.muli %2, %1 : index |

%4 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<?x?xf32>{%0, %1} in !stream.resource<external>{%3} |

%5 = stream.async.transfer %4 : !stream.resource<external>{%3} -> !stream.resource<*>{%3} |

// Input 1: same sequence; resource size %9 = dim0 * 4 * dim1. |

%6 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index |

%7 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index |

hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%6, %7]) type(%element_type_f32) encoding(%dense_row_major) |

%8 = arith.muli %6, %c4 : index |

%9 = arith.muli %8, %7 : index |

%10 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<?x?xf32>{%6, %7} in !stream.resource<external>{%9} |

%11 = stream.async.transfer %10 : !stream.resource<external>{%9} -> !stream.resource<*>{%9} |

// Input 2: same sequence; resource size %15 = dim0 * 4 * dim1. %15 is also the size of the final result. |

%12 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index |

%13 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index |

hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("input2") shape([%12, %13]) type(%element_type_f32) encoding(%dense_row_major) |

%14 = arith.muli %12, %c4 : index |

%15 = arith.muli %14, %13 : index |

%16 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<?x?xf32>{%12, %13} in !stream.resource<external>{%15} |

%17 = stream.async.transfer %16 : !stream.resource<external>{%15} -> !stream.resource<*>{%15} |

// Pack input 0: result size %20 = ceildiv(%0, 16) * 64 * %1. |

%18 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%0] |

%19 = arith.muli %18, %c64 : index |

%20 = arith.muli %19, %1 : index |

%21 = stream.async.dispatch @matmul_dynamic_dispatch_0::@matmul_dynamic_dispatch_0_pack_f32[%0, %18, %1](%5[%c0 to %3 for %3], %0, %18, %1) : (!stream.resource<*>{%3}, index, index, index) -> !stream.resource<*>{%20} |

// Pack input 1: tiles along its dim 1 (%7); result size %24 = ceildiv(%7, 16) * 64 * %6. |

%22 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%7] |

%23 = arith.muli %22, %c64 : index |

%24 = arith.muli %23, %6 : index |

%25 = stream.async.dispatch @matmul_dynamic_dispatch_1::@matmul_dynamic_dispatch_1_pack_f32[%7, %22, %6](%11[%c0 to %9 for %9], %7, %22, %6) : (!stream.resource<*>{%9}, index, index, index) -> !stream.resource<*>{%24} |

// Pack input 2: result size %29 = ceildiv(%12, 16) * 1024 * ceildiv(%13, 16). |

%26 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%12] |

%27 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%13] |

%28 = arith.muli %26, %c1024 : index |

%29 = arith.muli %28, %27 : index |

%30 = stream.async.dispatch @matmul_dynamic_dispatch_2::@matmul_dynamic_dispatch_2_pack_f32[%12, %13, %26, %27](%17[%c0 to %15 for %15], %12, %13, %26, %27) : (!stream.resource<*>{%15}, index, index, index, index) -> !stream.resource<*>{%29} |

// mmt4d over the three packed resources; the result is tied to operand %30 ("-> %30{%29}"). |

%31 = stream.async.dispatch @matmul_dynamic_dispatch_3::@matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32[%18, %1, %22, %6, %26, %27](%21[%c0 to %20 for %20], %25[%c0 to %24 for %24], %30[%c0 to %29 for %29], %18, %1, %22, %6, %26, %27) : (!stream.resource<*>{%20}, !stream.resource<*>{%24}, !stream.resource<*>{%29}, index, index, index, index, index, index) -> %30{%29} |

// Unpack back to a %12 x %13 tensor occupying %15 bytes. |

%32 = stream.async.dispatch @matmul_dynamic_dispatch_4::@matmul_dynamic_dispatch_4_unpack_f32[%26, %27, %12, %13](%31[%c0 to %29 for %29], %26, %27, %12, %13) : (!stream.resource<*>{%29}, index, index, index, index) -> !stream.resource<*>{%15} |

// Transfer the result to an external resource and export it as a buffer view. |

%33 = stream.async.transfer %32 : !stream.resource<*>{%15} -> !stream.resource<external>{%15} |

%34 = stream.tensor.export %33 : tensor<?x?xf32>{%12, %13} in !stream.resource<external>{%15} -> !hal.buffer_view |

return %34 : !hal.buffer_view |

} |
|
|
|
// -----// IR Dump After RefineUsagePass (iree-stream-refine-usage) //----- // |
|
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "znver4", cpu_features = "+mmx,+popcnt,+sse,+sse2,+sse3,+ssse3,+sse4.1,+sse4.2,+avx,+avx2,+sse4a,+fma,+avx512f,+bmi,+bmi2,+aes,+pclmul,+avx512vl,+avx512bw,+avx512dq,+avx512cd,+avx512vbmi,+avx512ifma,+avx512vpopcntdq,+avx512vbmi2,+gfni,+vpclmulqdq,+avx512vnni,+avx512bitalg,+avx512bf16,+adx,+clflushopt,+clwb,+clzero,+cx16,+cx8,+f16c,+fsgsbase,+crc32,+invpcid,+rdpru,+sahf,+lzcnt,+movbe,+mwaitx,+x87,+pku,+prfchw,+rdpid,+rdrnd,+rdseed,+sha,+shstk,+vaes,+wbnoinvd,+xsave,+xsavec,+xsaveopt,+xsaves,+fxsr,+evex512", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "all"}> |
|
#map = affine_map<()[s0] -> (s0 ceildiv 16)> |
|
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#executable_target_embedded_elf_x86_64_]}> |
|
module attributes {hal.device.targets = [#device_target_llvm_cpu]} { |
|
// Executable for dispatch 0: packs a dynamically shaped 2-D f32 tensor into a ?x?x16x1 tiled layout. |
// Contains one export (workgroup-count region) and one function with the dispatch body. |

stream.executable private @matmul_dynamic_dispatch_0 { |

// Export region: derives the x/y/z workgroup counts from the three workload values. |

stream.executable.export public @matmul_dynamic_dispatch_0_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) { |

%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2 |

stream.return %x, %y, %z : index, index, index |

} |

builtin.module { |

// %arg0: input binding (read-only), %arg4: output binding (write-only); %arg1..%arg3 are workload ordinals 0..2. |

func.func @matmul_dynamic_dispatch_0_pack_f32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: !stream.binding) { |

%c0 = arith.constant 0 : index |

// Zero used as the padding value for partial tiles in the pack below. |

%cst = arith.constant 0.000000e+00 : f32 |

%0 = flow.dispatch.workload.ordinal %arg1, 0 : index |

%1 = flow.dispatch.workload.ordinal %arg2, 1 : index |

%2 = flow.dispatch.workload.ordinal %arg3, 2 : index |

// Source view has dynamic dims {%0, %2}; destination view has dynamic outer dims {%1, %2}. |

%3 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %2} |

%4 = stream.binding.subspan %arg4[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2} |

// Load the whole source tensor (zero offsets, unit strides). |

%5 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [%0, %2], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %2} -> tensor<?x?xf32> |

// #map is the module-level alias for (s0 ceildiv 16): number of 16-row tiles along dim 0. |
// NOTE(review): the store below sizes the same dim with %1; presumably the caller passes ceildiv(%0, 16) as ordinal 1 - confirm. |

%6 = affine.apply #map()[%0] |

%7 = tensor.empty(%6, %2) : tensor<?x?x16x1xf32> |

// Tile dim 0 by 16 and dim 1 by 1, keeping the outer dim order, padding with %cst. |

%pack = tensor.pack %5 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 1] into %7 : tensor<?x?xf32> -> tensor<?x?x16x1xf32> |

flow.dispatch.tensor.store %pack, %4, offsets = [0, 0, 0, 0], sizes = [%1, %2, 16, 1], strides = [1, 1, 1, 1] : tensor<?x?x16x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2} |

return |

} |

} |

} |
|
// Executable for dispatch 1: packs a dynamically shaped 2-D f32 tensor into ?x?x16x1 tiles with the outer dims transposed. |

stream.executable private @matmul_dynamic_dispatch_1 { |

// Export region: derives the x/y/z workgroup counts from the three workload values. |

stream.executable.export public @matmul_dynamic_dispatch_1_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) { |

%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2 |

stream.return %x, %y, %z : index, index, index |

} |

builtin.module { |

// %arg0: input binding (read-only), %arg4: output binding (write-only); %arg1..%arg3 are workload ordinals 0..2. |

func.func @matmul_dynamic_dispatch_1_pack_f32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: !stream.binding) { |

%c0 = arith.constant 0 : index |

// Zero used as the padding value for partial tiles in the pack below. |

%cst = arith.constant 0.000000e+00 : f32 |

%0 = flow.dispatch.workload.ordinal %arg1, 0 : index |

%1 = flow.dispatch.workload.ordinal %arg2, 1 : index |

%2 = flow.dispatch.workload.ordinal %arg3, 2 : index |

// Source view has dynamic dims {%2, %0} (note the order); destination view has dynamic outer dims {%1, %2}. |

%3 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %0} |

%4 = stream.binding.subspan %arg4[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2} |

// Load the whole source tensor (zero offsets, unit strides). |

%5 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [%2, %0], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %0} -> tensor<?x?xf32> |

// #map is the module-level alias for (s0 ceildiv 16): number of 16-wide tiles along source dim 1 (size %0). |
// NOTE(review): the store below sizes the same dim with %1; presumably the caller passes ceildiv(%0, 16) as ordinal 1 - confirm. |

%6 = affine.apply #map()[%0] |

%7 = tensor.empty(%6, %2) : tensor<?x?x16x1xf32> |

// Tile source dim 1 by 16 and source dim 0 by 1, swap the outer dims ([1, 0]), padding with %cst. |

%pack = tensor.pack %5 padding_value(%cst : f32) outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [16, 1] into %7 : tensor<?x?xf32> -> tensor<?x?x16x1xf32> |

flow.dispatch.tensor.store %pack, %4, offsets = [0, 0, 0, 0], sizes = [%1, %2, 16, 1], strides = [1, 1, 1, 1] : tensor<?x?x16x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2} |

return |

} |

} |

} |
|
// Executable for dispatch 2: packs a dynamically shaped 2-D f32 tensor into a ?x?x16x16 tiled layout. |

stream.executable private @matmul_dynamic_dispatch_2 { |

// Export region: derives the x/y/z workgroup counts from the four workload values. |

stream.executable.export public @matmul_dynamic_dispatch_2_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) { |

%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3 |

stream.return %x, %y, %z : index, index, index |

} |

builtin.module { |

// %arg0: input binding (read-only), %arg5: output binding (write-only); %arg1..%arg4 are workload ordinals 0..3. |

func.func @matmul_dynamic_dispatch_2_pack_f32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: !stream.binding) { |

%c0 = arith.constant 0 : index |

// Zero used as the padding value for partial tiles in the pack below. |

%cst = arith.constant 0.000000e+00 : f32 |

%0 = flow.dispatch.workload.ordinal %arg1, 0 : index |

%1 = flow.dispatch.workload.ordinal %arg2, 1 : index |

%2 = flow.dispatch.workload.ordinal %arg3, 2 : index |

%3 = flow.dispatch.workload.ordinal %arg4, 3 : index |

// Source view has dynamic dims {%0, %1}; destination view has dynamic outer dims {%2, %3}. |

%4 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1} |

%5 = stream.binding.subspan %arg5[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%2, %3} |

// Load the whole source tensor (zero offsets, unit strides). |

%6 = flow.dispatch.tensor.load %4, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1} -> tensor<?x?xf32> |

// Tile counts along each source dim: #map is the module-level alias for (s0 ceildiv 16). |
// NOTE(review): the store below sizes the outer dims with %2/%3; presumably the caller passes these same ceildiv values - confirm. |

%7 = affine.apply #map()[%1] |

%8 = affine.apply #map()[%0] |

%9 = tensor.empty(%8, %7) : tensor<?x?x16x16xf32> |

// Tile both dims by 16, keeping the outer dim order, padding with %cst. |

%pack = tensor.pack %6 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %9 : tensor<?x?xf32> -> tensor<?x?x16x16xf32> |

flow.dispatch.tensor.store %pack, %5, offsets = [0, 0, 0, 0], sizes = [%2, %3, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%2, %3} |

return |

} |

} |

} |
|
// Executable for dispatch 3: linalg.mmt4d over two packed ?x?x16x1 operands, accumulating into a packed ?x?x16x16 tensor. |

stream.executable private @matmul_dynamic_dispatch_3 { |

// Export region: derives the x/y/z workgroup counts from the six workload values. |

stream.executable.export public @matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index) -> (index, index, index) { |

%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3, %arg4, %arg5 |

stream.return %x, %y, %z : index, index, index |

} |

builtin.module { |

// %arg0/%arg1: read-only operand bindings; %arg2: read-write accumulator binding; %arg3..%arg8 are workload ordinals 0..5. |

func.func @matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index) { |

%c0 = arith.constant 0 : index |

%0 = flow.dispatch.workload.ordinal %arg3, 0 : index |

%1 = flow.dispatch.workload.ordinal %arg4, 1 : index |

%2 = flow.dispatch.workload.ordinal %arg5, 2 : index |

%3 = flow.dispatch.workload.ordinal %arg6, 3 : index |

%4 = flow.dispatch.workload.ordinal %arg7, 4 : index |

%5 = flow.dispatch.workload.ordinal %arg8, 5 : index |

// Ordinal pairs give the dynamic outer dims of each view: {%0, %1}, {%2, %3} and {%4, %5}. |

%6 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%0, %1} |

%7 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%2, %3} |

%8 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5} |

// Load all three tensors in full (zero offsets, unit strides). |

%9 = flow.dispatch.tensor.load %6, offsets = [0, 0, 0, 0], sizes = [%0, %1, 16, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%0, %1} -> tensor<?x?x16x1xf32> |

%10 = flow.dispatch.tensor.load %7, offsets = [0, 0, 0, 0], sizes = [%2, %3, 16, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%2, %3} -> tensor<?x?x16x1xf32> |

%11 = flow.dispatch.tensor.load %8, offsets = [0, 0, 0, 0], sizes = [%4, %5, 16, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5} -> tensor<?x?x16x16xf32> |

// The loaded accumulator %11 is the outs operand; the result is written back to the same binding it was loaded from. |

%12 = linalg.mmt4d ins(%9, %10 : tensor<?x?x16x1xf32>, tensor<?x?x16x1xf32>) outs(%11 : tensor<?x?x16x16xf32>) -> tensor<?x?x16x16xf32> |

flow.dispatch.tensor.store %12, %8, offsets = [0, 0, 0, 0], sizes = [%4, %5, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5} |

return |

} |

} |

} |
|
// Executable holding the unpack kernel: turns a tiled ?x?x16x16 f32 tensor back into |
// a plain ?x? f32 tensor. |
stream.executable private @matmul_dynamic_dispatch_4 { |
|
// Export: the workgroup count is derived from the four workload values. |
stream.executable.export public @matmul_dynamic_dispatch_4_unpack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) { |
|
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3 |
|
stream.return %x, %y, %z : index, index, index |
|
} |
|
builtin.module { |
|
// Kernel body: %arg0 is the packed source binding, %arg5 the unpacked destination. |
func.func @matmul_dynamic_dispatch_4_unpack_f32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: !stream.binding) { |
|
%c0 = arith.constant 0 : index |
|
// Workload ordinals 0 and 1 are the outer dims of the packed source; ordinals 2 |
// and 3 are the dims of the unpacked result. |
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index |
|
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index |
|
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index |
|
%3 = flow.dispatch.workload.ordinal %arg4, 3 : index |
|
%4 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?x16x16xf32>>{%0, %1} |
|
%5 = stream.binding.subspan %arg5[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%2, %3} |
|
%6 = flow.dispatch.tensor.load %4, offsets = [0, 0, 0, 0], sizes = [%0, %1, 16, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x16xf32>>{%0, %1} -> tensor<?x?x16x16xf32> |
|
// Destination tensor for the unpack, sized with the result dims. |
%7 = tensor.empty(%2, %3) : tensor<?x?xf32> |
|
// 16x16 inner tiles, identity outer permutation. |
%unpack = tensor.unpack %6 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %7 : tensor<?x?x16x16xf32> -> tensor<?x?xf32> |
|
flow.dispatch.tensor.store %unpack, %5, offsets = [0, 0], sizes = [%2, %3], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%2, %3} |
|
return |
|
} |
|
} |
|
} |
|
// Host-side entry point. Imports the three !hal.buffer_view arguments as stream |
// resources, dispatches three pack kernels, the mmt4d kernel and the unpack kernel, |
// and exports the final resource as a !hal.buffer_view. |
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} { |
|
// Multipliers used in the resource-size computations below (%c4 for the imported |
// tensors, %c64 and %c1024 for the packed ones). NOTE(review): these look like the |
// byte sizes of one f32 (4), one 16x1 f32 tile (64) and one 16x16 f32 tile (1024) - confirm. |
%c1024 = arith.constant 1024 : index |
|
%c64 = arith.constant 64 : index |
|
%c4 = arith.constant 4 : index |
|
%c0 = arith.constant 0 : index |
|
// input0: query both dims, assert shape/element type/encoding, then import with |
// size %3 = dim0 * 4 * dim1. |
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index |
|
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index |
|
%element_type_f32 = hal.element_type<f32> : i32 |
|
%dense_row_major = hal.encoding_type<dense_row_major> : i32 |
|
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1]) type(%element_type_f32) encoding(%dense_row_major) |
|
%2 = arith.muli %0, %c4 : index |
|
%3 = arith.muli %2, %1 : index |
|
%4 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<?x?xf32>{%0, %1} in !stream.resource<external>{%3} |
|
// input1: same sequence; the resource size is %8. |
%5 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index |
|
%6 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index |
|
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%5, %6]) type(%element_type_f32) encoding(%dense_row_major) |
|
%7 = arith.muli %5, %c4 : index |
|
%8 = arith.muli %7, %6 : index |
|
%9 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<?x?xf32>{%5, %6} in !stream.resource<external>{%8} |
|
// input2: same sequence; the resource size is %13. |
%10 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index |
|
%11 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index |
|
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("input2") shape([%10, %11]) type(%element_type_f32) encoding(%dense_row_major) |
|
%12 = arith.muli %10, %c4 : index |
|
%13 = arith.muli %12, %11 : index |
|
%14 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<?x?xf32>{%10, %11} in !stream.resource<external>{%13} |
|
// Pack input0: %15 = #map(dim0 of input0) is passed as the second workload value and |
// scales the transient result size %17 = %15 * 64 * dim1. #map is declared outside |
// this block; presumably s0 ceildiv 16 - confirm against the alias. |
%15 = affine.apply #map()[%0] |
|
%16 = arith.muli %15, %c64 : index |
|
%17 = arith.muli %16, %1 : index |
|
%18 = stream.async.dispatch @matmul_dynamic_dispatch_0::@matmul_dynamic_dispatch_0_pack_f32[%0, %15, %1](%4[%c0 to %3 for %3], %0, %15, %1) : (!stream.resource<external>{%3}, index, index, index) -> !stream.resource<transient>{%17} |
|
// Pack input1: the workload is [dim1, #map(dim1), dim0], i.e. the dims of input1 |
// are passed in swapped order relative to the input0 pack. |
%19 = affine.apply #map()[%6] |
|
%20 = arith.muli %19, %c64 : index |
|
%21 = arith.muli %20, %5 : index |
|
%22 = stream.async.dispatch @matmul_dynamic_dispatch_1::@matmul_dynamic_dispatch_1_pack_f32[%6, %19, %5](%9[%c0 to %8 for %8], %6, %19, %5) : (!stream.resource<external>{%8}, index, index, index) -> !stream.resource<transient>{%21} |
|
// Pack input2: #map is applied to both dims; result size %26 = %23 * 1024 * %24. |
%23 = affine.apply #map()[%10] |
|
%24 = affine.apply #map()[%11] |
|
%25 = arith.muli %23, %c1024 : index |
|
%26 = arith.muli %25, %24 : index |
|
%27 = stream.async.dispatch @matmul_dynamic_dispatch_2::@matmul_dynamic_dispatch_2_pack_f32[%10, %11, %23, %24](%14[%c0 to %13 for %13], %10, %11, %23, %24) : (!stream.resource<external>{%13}, index, index, index, index) -> !stream.resource<transient>{%26} |
|
// mmt4d over the three packed resources. The result is written as `%27{%26}`, i.e. |
// it is tied to operand %27 rather than declared as a new resource. |
%28 = stream.async.dispatch @matmul_dynamic_dispatch_3::@matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32[%15, %1, %19, %5, %23, %24](%18[%c0 to %17 for %17], %22[%c0 to %21 for %21], %27[%c0 to %26 for %26], %15, %1, %19, %5, %23, %24) : (!stream.resource<transient>{%17}, !stream.resource<transient>{%21}, !stream.resource<transient>{%26}, index, index, index, index, index, index) -> %27{%26} |
|
// Unpack the accumulated result into an external resource of size %13, the same |
// size as the imported input2. |
%29 = stream.async.dispatch @matmul_dynamic_dispatch_4::@matmul_dynamic_dispatch_4_unpack_f32[%23, %24, %10, %11](%28[%c0 to %26 for %26], %23, %24, %10, %11) : (!stream.resource<transient>{%26}, index, index, index, index) -> !stream.resource<external>{%13} |
|
// Export with input2's dynamic dims (%10, %11). |
%30 = stream.tensor.export %29 : tensor<?x?xf32>{%10, %11} in !stream.resource<external>{%13} -> !hal.buffer_view |
|
return %30 : !hal.buffer_view |
|
} |
|
} |
|
|
|
|
|
// -----// IR Dump After Canonicalizer (canonicalize) //----- // |
|
// Host-side entry point. Imports the three !hal.buffer_view arguments as stream |
// resources, dispatches three pack kernels, the mmt4d kernel and the unpack kernel, |
// and exports the final resource as a !hal.buffer_view. |
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} { |
|
// Multipliers used in the resource-size computations below (%c4 for the imported |
// tensors, %c64 and %c1024 for the packed ones). NOTE(review): these look like the |
// byte sizes of one f32 (4), one 16x1 f32 tile (64) and one 16x16 f32 tile (1024) - confirm. |
%c1024 = arith.constant 1024 : index |
|
%c64 = arith.constant 64 : index |
|
%c4 = arith.constant 4 : index |
|
%c0 = arith.constant 0 : index |
|
// input0: query both dims, assert shape/element type/encoding, then import with |
// size %3 = dim0 * 4 * dim1. |
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index |
|
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index |
|
%element_type_f32 = hal.element_type<f32> : i32 |
|
%dense_row_major = hal.encoding_type<dense_row_major> : i32 |
|
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1]) type(%element_type_f32) encoding(%dense_row_major) |
|
%2 = arith.muli %0, %c4 : index |
|
%3 = arith.muli %2, %1 : index |
|
%4 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<?x?xf32>{%0, %1} in !stream.resource<external>{%3} |
|
// input1: same sequence; the resource size is %8. |
%5 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index |
|
%6 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index |
|
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%5, %6]) type(%element_type_f32) encoding(%dense_row_major) |
|
%7 = arith.muli %5, %c4 : index |
|
%8 = arith.muli %7, %6 : index |
|
%9 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<?x?xf32>{%5, %6} in !stream.resource<external>{%8} |
|
// input2: same sequence; the resource size is %13. |
%10 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index |
|
%11 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index |
|
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("input2") shape([%10, %11]) type(%element_type_f32) encoding(%dense_row_major) |
|
%12 = arith.muli %10, %c4 : index |
|
%13 = arith.muli %12, %11 : index |
|
%14 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<?x?xf32>{%10, %11} in !stream.resource<external>{%13} |
|
// Pack input0: %15 = ceildiv(dim0 of input0, 16) is passed as the second workload |
// value and scales the transient result size %17 = %15 * 64 * dim1. |
%15 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%0] |
|
%16 = arith.muli %15, %c64 : index |
|
%17 = arith.muli %16, %1 : index |
|
%18 = stream.async.dispatch @matmul_dynamic_dispatch_0::@matmul_dynamic_dispatch_0_pack_f32[%0, %15, %1](%4[%c0 to %3 for %3], %0, %15, %1) : (!stream.resource<external>{%3}, index, index, index) -> !stream.resource<transient>{%17} |
|
// Pack input1: the workload is [dim1, ceildiv(dim1, 16), dim0], i.e. the dims of |
// input1 are passed in swapped order relative to the input0 pack. |
%19 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%6] |
|
%20 = arith.muli %19, %c64 : index |
|
%21 = arith.muli %20, %5 : index |
|
%22 = stream.async.dispatch @matmul_dynamic_dispatch_1::@matmul_dynamic_dispatch_1_pack_f32[%6, %19, %5](%9[%c0 to %8 for %8], %6, %19, %5) : (!stream.resource<external>{%8}, index, index, index) -> !stream.resource<transient>{%21} |
|
// Pack input2: ceildiv 16 is applied to both dims; result size %26 = %23 * 1024 * %24. |
%23 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%10] |
|
%24 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%11] |
|
%25 = arith.muli %23, %c1024 : index |
|
%26 = arith.muli %25, %24 : index |
|
%27 = stream.async.dispatch @matmul_dynamic_dispatch_2::@matmul_dynamic_dispatch_2_pack_f32[%10, %11, %23, %24](%14[%c0 to %13 for %13], %10, %11, %23, %24) : (!stream.resource<external>{%13}, index, index, index, index) -> !stream.resource<transient>{%26} |
|
// mmt4d over the three packed resources. The result is written as `%27{%26}`, i.e. |
// it is tied to operand %27 rather than declared as a new resource. |
%28 = stream.async.dispatch @matmul_dynamic_dispatch_3::@matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32[%15, %1, %19, %5, %23, %24](%18[%c0 to %17 for %17], %22[%c0 to %21 for %21], %27[%c0 to %26 for %26], %15, %1, %19, %5, %23, %24) : (!stream.resource<transient>{%17}, !stream.resource<transient>{%21}, !stream.resource<transient>{%26}, index, index, index, index, index, index) -> %27{%26} |
|
// Unpack the accumulated result into an external resource of size %13, the same |
// size as the imported input2. |
%29 = stream.async.dispatch @matmul_dynamic_dispatch_4::@matmul_dynamic_dispatch_4_unpack_f32[%23, %24, %10, %11](%28[%c0 to %26 for %26], %23, %24, %10, %11) : (!stream.resource<transient>{%26}, index, index, index, index) -> !stream.resource<external>{%13} |
|
// Export with input2's dynamic dims (%10, %11). |
%30 = stream.tensor.export %29 : tensor<?x?xf32>{%10, %11} in !stream.resource<external>{%13} -> !hal.buffer_view |
|
return %30 : !hal.buffer_view |
|
} |
|
|
|
// -----// IR Dump After CSE (cse) //----- // |
|
// Host-side entry point. Imports the three !hal.buffer_view arguments as stream |
// resources, dispatches three pack kernels, the mmt4d kernel and the unpack kernel, |
// and exports the final resource as a !hal.buffer_view. |
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} { |
|
// Multipliers used in the resource-size computations below (%c4 for the imported |
// tensors, %c64 and %c1024 for the packed ones). NOTE(review): these look like the |
// byte sizes of one f32 (4), one 16x1 f32 tile (64) and one 16x16 f32 tile (1024) - confirm. |
%c1024 = arith.constant 1024 : index |
|
%c64 = arith.constant 64 : index |
|
%c4 = arith.constant 4 : index |
|
%c0 = arith.constant 0 : index |
|
// input0: query both dims, assert shape/element type/encoding, then import with |
// size %3 = dim0 * 4 * dim1. |
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index |
|
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index |
|
%element_type_f32 = hal.element_type<f32> : i32 |
|
%dense_row_major = hal.encoding_type<dense_row_major> : i32 |
|
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1]) type(%element_type_f32) encoding(%dense_row_major) |
|
%2 = arith.muli %0, %c4 : index |
|
%3 = arith.muli %2, %1 : index |
|
%4 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<?x?xf32>{%0, %1} in !stream.resource<external>{%3} |
|
// input1: same sequence; the resource size is %8. |
%5 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index |
|
%6 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index |
|
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%5, %6]) type(%element_type_f32) encoding(%dense_row_major) |
|
%7 = arith.muli %5, %c4 : index |
|
%8 = arith.muli %7, %6 : index |
|
%9 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<?x?xf32>{%5, %6} in !stream.resource<external>{%8} |
|
// input2: same sequence; the resource size is %13. |
%10 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index |
|
%11 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index |
|
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("input2") shape([%10, %11]) type(%element_type_f32) encoding(%dense_row_major) |
|
%12 = arith.muli %10, %c4 : index |
|
%13 = arith.muli %12, %11 : index |
|
%14 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<?x?xf32>{%10, %11} in !stream.resource<external>{%13} |
|
// Pack input0: %15 = ceildiv(dim0 of input0, 16) is passed as the second workload |
// value and scales the transient result size %17 = %15 * 64 * dim1. |
%15 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%0] |
|
%16 = arith.muli %15, %c64 : index |
|
%17 = arith.muli %16, %1 : index |
|
%18 = stream.async.dispatch @matmul_dynamic_dispatch_0::@matmul_dynamic_dispatch_0_pack_f32[%0, %15, %1](%4[%c0 to %3 for %3], %0, %15, %1) : (!stream.resource<external>{%3}, index, index, index) -> !stream.resource<transient>{%17} |
|
// Pack input1: the workload is [dim1, ceildiv(dim1, 16), dim0], i.e. the dims of |
// input1 are passed in swapped order relative to the input0 pack. |
%19 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%6] |
|
%20 = arith.muli %19, %c64 : index |
|
%21 = arith.muli %20, %5 : index |
|
%22 = stream.async.dispatch @matmul_dynamic_dispatch_1::@matmul_dynamic_dispatch_1_pack_f32[%6, %19, %5](%9[%c0 to %8 for %8], %6, %19, %5) : (!stream.resource<external>{%8}, index, index, index) -> !stream.resource<transient>{%21} |
|
// Pack input2: ceildiv 16 is applied to both dims; result size %26 = %23 * 1024 * %24. |
%23 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%10] |
|
%24 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%11] |
|
%25 = arith.muli %23, %c1024 : index |
|
%26 = arith.muli %25, %24 : index |
|
%27 = stream.async.dispatch @matmul_dynamic_dispatch_2::@matmul_dynamic_dispatch_2_pack_f32[%10, %11, %23, %24](%14[%c0 to %13 for %13], %10, %11, %23, %24) : (!stream.resource<external>{%13}, index, index, index, index) -> !stream.resource<transient>{%26} |
|
// mmt4d over the three packed resources. The result is written as `%27{%26}`, i.e. |
// it is tied to operand %27 rather than declared as a new resource. |
%28 = stream.async.dispatch @matmul_dynamic_dispatch_3::@matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32[%15, %1, %19, %5, %23, %24](%18[%c0 to %17 for %17], %22[%c0 to %21 for %21], %27[%c0 to %26 for %26], %15, %1, %19, %5, %23, %24) : (!stream.resource<transient>{%17}, !stream.resource<transient>{%21}, !stream.resource<transient>{%26}, index, index, index, index, index, index) -> %27{%26} |
|
// Unpack the accumulated result into an external resource of size %13, the same |
// size as the imported input2. |
%29 = stream.async.dispatch @matmul_dynamic_dispatch_4::@matmul_dynamic_dispatch_4_unpack_f32[%23, %24, %10, %11](%28[%c0 to %26 for %26], %23, %24, %10, %11) : (!stream.resource<transient>{%26}, index, index, index, index) -> !stream.resource<external>{%13} |
|
// Export with input2's dynamic dims (%10, %11). |
%30 = stream.tensor.export %29 : tensor<?x?xf32>{%10, %11} in !stream.resource<external>{%13} -> !hal.buffer_view |
|
return %30 : !hal.buffer_view |
|
} |
|
|
|
// -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- // |
|
// Host-side entry point. Imports the three !hal.buffer_view arguments as stream |
// resources, dispatches three pack kernels, the mmt4d kernel and the unpack kernel, |
// and exports the final resource as a !hal.buffer_view. |
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} { |
|
// Multipliers used in the resource-size computations below (%c4 for the imported |
// tensors, %c64 and %c1024 for the packed ones). NOTE(review): these look like the |
// byte sizes of one f32 (4), one 16x1 f32 tile (64) and one 16x16 f32 tile (1024) - confirm. |
%c1024 = arith.constant 1024 : index |
|
%c64 = arith.constant 64 : index |
|
%c4 = arith.constant 4 : index |
|
%c0 = arith.constant 0 : index |
|
// input0: query both dims, assert shape/element type/encoding, then import with |
// size %3 = dim0 * 4 * dim1. |
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index |
|
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index |
|
%element_type_f32 = hal.element_type<f32> : i32 |
|
%dense_row_major = hal.encoding_type<dense_row_major> : i32 |
|
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1]) type(%element_type_f32) encoding(%dense_row_major) |
|
%2 = arith.muli %0, %c4 : index |
|
%3 = arith.muli %2, %1 : index |
|
%4 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<?x?xf32>{%0, %1} in !stream.resource<external>{%3} |
|
// input1: same sequence; the resource size is %8. |
%5 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index |
|
%6 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index |
|
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%5, %6]) type(%element_type_f32) encoding(%dense_row_major) |
|
%7 = arith.muli %5, %c4 : index |
|
%8 = arith.muli %7, %6 : index |
|
%9 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<?x?xf32>{%5, %6} in !stream.resource<external>{%8} |
|
// input2: same sequence; the resource size is %13. |
%10 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index |
|
%11 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index |
|
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("input2") shape([%10, %11]) type(%element_type_f32) encoding(%dense_row_major) |
|
%12 = arith.muli %10, %c4 : index |
|
%13 = arith.muli %12, %11 : index |
|
%14 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<?x?xf32>{%10, %11} in !stream.resource<external>{%13} |
|
// Pack input0: %15 = ceildiv(dim0 of input0, 16) is passed as the second workload |
// value and scales the transient result size %17 = %15 * 64 * dim1. |
%15 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%0] |
|
%16 = arith.muli %15, %c64 : index |
|
%17 = arith.muli %16, %1 : index |
|
%18 = stream.async.dispatch @matmul_dynamic_dispatch_0::@matmul_dynamic_dispatch_0_pack_f32[%0, %15, %1](%4[%c0 to %3 for %3], %0, %15, %1) : (!stream.resource<external>{%3}, index, index, index) -> !stream.resource<transient>{%17} |
|
// Pack input1: the workload is [dim1, ceildiv(dim1, 16), dim0], i.e. the dims of |
// input1 are passed in swapped order relative to the input0 pack. |
%19 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%6] |
|
%20 = arith.muli %19, %c64 : index |
|
%21 = arith.muli %20, %5 : index |
|
%22 = stream.async.dispatch @matmul_dynamic_dispatch_1::@matmul_dynamic_dispatch_1_pack_f32[%6, %19, %5](%9[%c0 to %8 for %8], %6, %19, %5) : (!stream.resource<external>{%8}, index, index, index) -> !stream.resource<transient>{%21} |
|
// Pack input2: ceildiv 16 is applied to both dims; result size %26 = %23 * 1024 * %24. |
%23 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%10] |
|
%24 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%11] |
|
%25 = arith.muli %23, %c1024 : index |
|
%26 = arith.muli %25, %24 : index |
|
%27 = stream.async.dispatch @matmul_dynamic_dispatch_2::@matmul_dynamic_dispatch_2_pack_f32[%10, %11, %23, %24](%14[%c0 to %13 for %13], %10, %11, %23, %24) : (!stream.resource<external>{%13}, index, index, index, index) -> !stream.resource<transient>{%26} |
|
// mmt4d over the three packed resources. The result is written as `%27{%26}`, i.e. |
// it is tied to operand %27 rather than declared as a new resource. |
%28 = stream.async.dispatch @matmul_dynamic_dispatch_3::@matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32[%15, %1, %19, %5, %23, %24](%18[%c0 to %17 for %17], %22[%c0 to %21 for %21], %27[%c0 to %26 for %26], %15, %1, %19, %5, %23, %24) : (!stream.resource<transient>{%17}, !stream.resource<transient>{%21}, !stream.resource<transient>{%26}, index, index, index, index, index, index) -> %27{%26} |
|
// Unpack the accumulated result into an external resource of size %13, the same |
// size as the imported input2. |
%29 = stream.async.dispatch @matmul_dynamic_dispatch_4::@matmul_dynamic_dispatch_4_unpack_f32[%23, %24, %10, %11](%28[%c0 to %26 for %26], %23, %24, %10, %11) : (!stream.resource<transient>{%26}, index, index, index, index) -> !stream.resource<external>{%13} |
|
// Export with input2's dynamic dims (%10, %11). |
%30 = stream.tensor.export %29 : tensor<?x?xf32>{%10, %11} in !stream.resource<external>{%13} -> !hal.buffer_view |
|
return %30 : !hal.buffer_view |
|
} |
|
|
|
// -----// IR Dump After ApplyPatterns (iree-util-apply-patterns) //----- // |
|
// Executable target description: llvm-cpu backend, embedded-elf-x86_64 format, |
// cpu = znver4, native_vector_size = 64, ukernels = "all". |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "znver4", cpu_features = "+mmx,+popcnt,+sse,+sse2,+sse3,+ssse3,+sse4.1,+sse4.2,+avx,+avx2,+sse4a,+fma,+avx512f,+bmi,+bmi2,+aes,+pclmul,+avx512vl,+avx512bw,+avx512dq,+avx512cd,+avx512vbmi,+avx512ifma,+avx512vpopcntdq,+avx512vbmi2,+gfni,+vpclmulqdq,+avx512vnni,+avx512bitalg,+avx512bf16,+adx,+clflushopt,+clwb,+clzero,+cx16,+cx8,+f16c,+fsgsbase,+crc32,+invpcid,+rdpru,+sahf,+lzcnt,+movbe,+mwaitx,+x87,+pku,+prfchw,+rdpid,+rdrnd,+rdseed,+sha,+shstk,+vaes,+wbnoinvd,+xsave,+xsavec,+xsaveopt,+xsaves,+fxsr,+evex512", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "all"}> |
|
// Maps a symbol s0 to ceil(s0 / 16); applied to dynamic dimensions in the IR below. |
#map = affine_map<()[s0] -> (s0 ceildiv 16)> |
|
// Device target whose only executable target is the alias defined above. |
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#executable_target_embedded_elf_x86_64_]}> |
|
module attributes {hal.device.targets = [#device_target_llvm_cpu]} { |
|
// Executable holding a pack kernel: ?x? f32 source -> ?x?x16x1 f32 tiles. |
stream.executable private @matmul_dynamic_dispatch_0 { |
|
// Export: the workgroup count is derived from the three workload values. |
stream.executable.export public @matmul_dynamic_dispatch_0_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) { |
|
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2 |
|
stream.return %x, %y, %z : index, index, index |
|
} |
|
builtin.module { |
|
// Kernel body: %arg0 is the read-only source binding, %arg4 the write-only |
// destination binding; %arg1..%arg3 are the workload values. |
func.func @matmul_dynamic_dispatch_0_pack_f32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: !stream.binding) { |
|
%c0 = arith.constant 0 : index |
|
// Padding value handed to tensor.pack below. |
%cst = arith.constant 0.000000e+00 : f32 |
|
// %0 and %2 are the source dims; %1 and %2 are the outer dims of the packed |
// destination binding. |
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index |
|
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index |
|
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index |
|
%3 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %2} |
|
%4 = stream.binding.subspan %arg4[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2} |
|
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [%0, %2], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %2} -> tensor<?x?xf32> |
|
// Outer dim 0 of the pack destination is recomputed locally as #map(%0); the |
// binding and store use workload value %1 for the same dimension. |
%6 = affine.apply #map()[%0] |
|
%7 = tensor.empty(%6, %2) : tensor<?x?x16x1xf32> |
|
// Tiles of 16 along dim 0 and 1 along dim 1, identity outer permutation, |
// padded with %cst. |
%pack = tensor.pack %5 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 1] into %7 : tensor<?x?xf32> -> tensor<?x?x16x1xf32> |
|
flow.dispatch.tensor.store %pack, %4, offsets = [0, 0, 0, 0], sizes = [%1, %2, 16, 1], strides = [1, 1, 1, 1] : tensor<?x?x16x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2} |
|
return |
|
} |
|
} |
|
} |
|
// Executable holding a transposing pack kernel: ?x? f32 source -> ?x?x16x1 f32 tiles |
// with the outer dims permuted. |
stream.executable private @matmul_dynamic_dispatch_1 { |
|
// Export: the workgroup count is derived from the three workload values. |
stream.executable.export public @matmul_dynamic_dispatch_1_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) { |
|
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2 |
|
stream.return %x, %y, %z : index, index, index |
|
} |
|
builtin.module { |
|
// Kernel body: %arg0 is the read-only source binding, %arg4 the write-only |
// destination binding; %arg1..%arg3 are the workload values. |
func.func @matmul_dynamic_dispatch_1_pack_f32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: !stream.binding) { |
|
%c0 = arith.constant 0 : index |
|
// Padding value handed to tensor.pack below. |
%cst = arith.constant 0.000000e+00 : f32 |
|
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index |
|
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index |
|
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index |
|
// The source is shaped {%2, %0}: workload value 2 is its dim 0 and workload |
// value 0 is its dim 1. |
%3 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %0} |
|
%4 = stream.binding.subspan %arg4[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2} |
|
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [%2, %0], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %0} -> tensor<?x?xf32> |
|
// #map applied to the source's dim 1 (%0); used as outer dim 0 of the destination. |
%6 = affine.apply #map()[%0] |
|
%7 = tensor.empty(%6, %2) : tensor<?x?x16x1xf32> |
|
// outer_dims_perm and inner_dims_pos are both [1, 0]: the 16-wide tile is taken |
// along source dim 1 and the outer dims are swapped relative to the source. |
%pack = tensor.pack %5 padding_value(%cst : f32) outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [16, 1] into %7 : tensor<?x?xf32> -> tensor<?x?x16x1xf32> |
|
flow.dispatch.tensor.store %pack, %4, offsets = [0, 0, 0, 0], sizes = [%1, %2, 16, 1], strides = [1, 1, 1, 1] : tensor<?x?x16x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2} |
|
return |
|
} |
|
} |
|
} |
|
// Executable holding a pack kernel: ?x? f32 source -> ?x?x16x16 f32 tiles. |
stream.executable private @matmul_dynamic_dispatch_2 { |
|
// Export: the workgroup count is derived from the four workload values. |
stream.executable.export public @matmul_dynamic_dispatch_2_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) { |
|
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3 |
|
stream.return %x, %y, %z : index, index, index |
|
} |
|
builtin.module { |
|
// Kernel body: %arg0 is the read-only source binding, %arg5 the write-only |
// destination binding; %arg1..%arg4 are the workload values. |
func.func @matmul_dynamic_dispatch_2_pack_f32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: !stream.binding) { |
|
%c0 = arith.constant 0 : index |
|
// Padding value handed to tensor.pack below. |
%cst = arith.constant 0.000000e+00 : f32 |
|
// %0 and %1 are the source dims; %2 and %3 are the outer dims of the packed |
// destination binding. |
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index |
|
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index |
|
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index |
|
%3 = flow.dispatch.workload.ordinal %arg4, 3 : index |
|
%4 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1} |
|
%5 = stream.binding.subspan %arg5[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%2, %3} |
|
%6 = flow.dispatch.tensor.load %4, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1} -> tensor<?x?xf32> |
|
// Outer dims of the pack destination are recomputed locally: %8 = #map(%0) for |
// dim 0 and %7 = #map(%1) for dim 1; the binding and store use %2 and %3. |
%7 = affine.apply #map()[%1] |
|
%8 = affine.apply #map()[%0] |
|
%9 = tensor.empty(%8, %7) : tensor<?x?x16x16xf32> |
|
// 16x16 inner tiles, identity outer permutation, padded with %cst. |
%pack = tensor.pack %6 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %9 : tensor<?x?xf32> -> tensor<?x?x16x16xf32> |
|
flow.dispatch.tensor.store %pack, %5, offsets = [0, 0, 0, 0], sizes = [%2, %3, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%2, %3} |
|
return |
|
} |
|
} |
|
} |
|
// Executable holding the mmt4d kernel that consumes the packed operands. |
stream.executable private @matmul_dynamic_dispatch_3 { |
|
// Export: the workgroup count is derived from the six workload values. |
stream.executable.export public @matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index) -> (index, index, index) { |
|
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3, %arg4, %arg5 |
|
stream.return %x, %y, %z : index, index, index |
|
} |
|
builtin.module { |
|
// Kernel body: loads the two read-only packed operands and the read-write |
// accumulator, runs linalg.mmt4d, and stores back into the read-write binding. |
func.func @matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index) { |
|
%c0 = arith.constant 0 : index |
|
// %arg3..%arg8 are tagged as workload ordinals 0..5; they supply the dynamic |
// outer dimensions of the three tensors bound below (two per tensor). |
%0 = flow.dispatch.workload.ordinal %arg3, 0 : index |
|
%1 = flow.dispatch.workload.ordinal %arg4, 1 : index |
|
%2 = flow.dispatch.workload.ordinal %arg5, 2 : index |
|
%3 = flow.dispatch.workload.ordinal %arg6, 3 : index |
|
%4 = flow.dispatch.workload.ordinal %arg7, 4 : index |
|
%5 = flow.dispatch.workload.ordinal %arg8, 5 : index |
|
%6 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%0, %1} |
|
%7 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%2, %3} |
|
// %arg2 is bound readwrite: it is loaded as the mmt4d `outs` operand and is |
// also the target of the final store. |
%8 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5} |
|
%9 = flow.dispatch.tensor.load %6, offsets = [0, 0, 0, 0], sizes = [%0, %1, 16, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%0, %1} -> tensor<?x?x16x1xf32> |
|
%10 = flow.dispatch.tensor.load %7, offsets = [0, 0, 0, 0], sizes = [%2, %3, 16, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%2, %3} -> tensor<?x?x16x1xf32> |
|
%11 = flow.dispatch.tensor.load %8, offsets = [0, 0, 0, 0], sizes = [%4, %5, 16, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5} -> tensor<?x?x16x16xf32> |
|
// Inputs are ?x?x16x1, accumulator/result is ?x?x16x16. |
%12 = linalg.mmt4d ins(%9, %10 : tensor<?x?x16x1xf32>, tensor<?x?x16x1xf32>) outs(%11 : tensor<?x?x16x16xf32>) -> tensor<?x?x16x16xf32> |
|
flow.dispatch.tensor.store %12, %8, offsets = [0, 0, 0, 0], sizes = [%4, %5, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5} |
|
return |
|
} |
|
} |
|
} |
|
// Executable holding the unpack kernel: turns a tiled ?x?x16x16 f32 tensor back into |
// a plain ?x? f32 tensor. |
stream.executable private @matmul_dynamic_dispatch_4 { |
|
// Export: the workgroup count is derived from the four workload values. |
stream.executable.export public @matmul_dynamic_dispatch_4_unpack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) { |
|
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3 |
|
stream.return %x, %y, %z : index, index, index |
|
} |
|
builtin.module { |
|
// Kernel body: %arg0 is the packed source binding, %arg5 the unpacked destination. |
func.func @matmul_dynamic_dispatch_4_unpack_f32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: !stream.binding) { |
|
%c0 = arith.constant 0 : index |
|
// Workload ordinals 0 and 1 are the outer dims of the packed source; ordinals 2 |
// and 3 are the dims of the unpacked result. |
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index |
|
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index |
|
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index |
|
%3 = flow.dispatch.workload.ordinal %arg4, 3 : index |
|
%4 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?x16x16xf32>>{%0, %1} |
|
%5 = stream.binding.subspan %arg5[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%2, %3} |
|
%6 = flow.dispatch.tensor.load %4, offsets = [0, 0, 0, 0], sizes = [%0, %1, 16, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x16xf32>>{%0, %1} -> tensor<?x?x16x16xf32> |
|
// Destination tensor for the unpack, sized with the result dims. |
%7 = tensor.empty(%2, %3) : tensor<?x?xf32> |
|
// 16x16 inner tiles, identity outer permutation. |
%unpack = tensor.unpack %6 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %7 : tensor<?x?x16x16xf32> -> tensor<?x?xf32> |
|
flow.dispatch.tensor.store %unpack, %5, offsets = [0, 0], sizes = [%2, %3], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%2, %3} |
|
return |
|
} |
|
} |
|
} |
|
// Host-side entry point. Imports the three !hal.buffer_view arguments as stream |
// resources, dispatches three pack kernels, the mmt4d kernel and the unpack kernel, |
// and exports the final resource as a !hal.buffer_view. |
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} { |
|
// Multipliers used in the resource-size computations below (%c4 for the imported |
// tensors, %c64 and %c1024 for the packed ones). NOTE(review): these look like the |
// byte sizes of one f32 (4), one 16x1 f32 tile (64) and one 16x16 f32 tile (1024) - confirm. |
%c1024 = arith.constant 1024 : index |
|
%c64 = arith.constant 64 : index |
|
%c4 = arith.constant 4 : index |
|
%c0 = arith.constant 0 : index |
|
// input0: query both dims, assert shape/element type/encoding, then import with |
// size %3 = dim0 * 4 * dim1. |
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index |
|
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index |
|
%element_type_f32 = hal.element_type<f32> : i32 |
|
%dense_row_major = hal.encoding_type<dense_row_major> : i32 |
|
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1]) type(%element_type_f32) encoding(%dense_row_major) |
|
%2 = arith.muli %0, %c4 : index |
|
%3 = arith.muli %2, %1 : index |
|
%4 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<?x?xf32>{%0, %1} in !stream.resource<external>{%3} |
|
// input1: same sequence; the resource size is %8. |
%5 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index |
|
%6 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index |
|
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%5, %6]) type(%element_type_f32) encoding(%dense_row_major) |
|
%7 = arith.muli %5, %c4 : index |
|
%8 = arith.muli %7, %6 : index |
|
%9 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<?x?xf32>{%5, %6} in !stream.resource<external>{%8} |
|
// input2: same sequence; the resource size is %13. |
%10 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index |
|
%11 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index |
|
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("input2") shape([%10, %11]) type(%element_type_f32) encoding(%dense_row_major) |
|
%12 = arith.muli %10, %c4 : index |
|
%13 = arith.muli %12, %11 : index |
|
%14 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<?x?xf32>{%10, %11} in !stream.resource<external>{%13} |
|
// Pack input0: %15 = #map(dim0 of input0) is passed as the second workload value and |
// scales the transient result size %17 = %15 * 64 * dim1. |
%15 = affine.apply #map()[%0] |
|
%16 = arith.muli %15, %c64 : index |
|
%17 = arith.muli %16, %1 : index |
|
%18 = stream.async.dispatch @matmul_dynamic_dispatch_0::@matmul_dynamic_dispatch_0_pack_f32[%0, %15, %1](%4[%c0 to %3 for %3], %0, %15, %1) : (!stream.resource<external>{%3}, index, index, index) -> !stream.resource<transient>{%17} |
|
// Pack input1: the workload is [dim1, #map(dim1), dim0], i.e. the dims of input1 |
// are passed in swapped order relative to the input0 pack. |
%19 = affine.apply #map()[%6] |
|
%20 = arith.muli %19, %c64 : index |
|
%21 = arith.muli %20, %5 : index |
|
%22 = stream.async.dispatch @matmul_dynamic_dispatch_1::@matmul_dynamic_dispatch_1_pack_f32[%6, %19, %5](%9[%c0 to %8 for %8], %6, %19, %5) : (!stream.resource<external>{%8}, index, index, index) -> !stream.resource<transient>{%21} |
|
// Pack input2: #map is applied to both dims; result size %26 = %23 * 1024 * %24. |
%23 = affine.apply #map()[%10] |
|
%24 = affine.apply #map()[%11] |
|
%25 = arith.muli %23, %c1024 : index |
|
%26 = arith.muli %25, %24 : index |
|
%27 = stream.async.dispatch @matmul_dynamic_dispatch_2::@matmul_dynamic_dispatch_2_pack_f32[%10, %11, %23, %24](%14[%c0 to %13 for %13], %10, %11, %23, %24) : (!stream.resource<external>{%13}, index, index, index, index) -> !stream.resource<transient>{%26} |
|
// mmt4d over the three packed resources. The result is written as `%27{%26}`, i.e. |
// it is tied to operand %27 rather than declared as a new resource. |
%28 = stream.async.dispatch @matmul_dynamic_dispatch_3::@matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32[%15, %1, %19, %5, %23, %24](%18[%c0 to %17 for %17], %22[%c0 to %21 for %21], %27[%c0 to %26 for %26], %15, %1, %19, %5, %23, %24) : (!stream.resource<transient>{%17}, !stream.resource<transient>{%21}, !stream.resource<transient>{%26}, index, index, index, index, index, index) -> %27{%26} |
|
// Unpack the accumulated result into an external resource of size %13, the same |
// size as the imported input2. |
%29 = stream.async.dispatch @matmul_dynamic_dispatch_4::@matmul_dynamic_dispatch_4_unpack_f32[%23, %24, %10, %11](%28[%c0 to %26 for %26], %23, %24, %10, %11) : (!stream.resource<transient>{%26}, index, index, index, index) -> !stream.resource<external>{%13} |
|
// Export with input2's dynamic dims (%10, %11). |
%30 = stream.tensor.export %29 : tensor<?x?xf32>{%10, %11} in !stream.resource<external>{%13} -> !hal.buffer_view |
|
return %30 : !hal.buffer_view |
|
} |
|
} |
|
|
|
|
|
// -----// IR Dump After FoldGlobals (iree-util-fold-globals) //----- // |
|
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "znver4", cpu_features = "+mmx,+popcnt,+sse,+sse2,+sse3,+ssse3,+sse4.1,+sse4.2,+avx,+avx2,+sse4a,+fma,+avx512f,+bmi,+bmi2,+aes,+pclmul,+avx512vl,+avx512bw,+avx512dq,+avx512cd,+avx512vbmi,+avx512ifma,+avx512vpopcntdq,+avx512vbmi2,+gfni,+vpclmulqdq,+avx512vnni,+avx512bitalg,+avx512bf16,+adx,+clflushopt,+clwb,+clzero,+cx16,+cx8,+f16c,+fsgsbase,+crc32,+invpcid,+rdpru,+sahf,+lzcnt,+movbe,+mwaitx,+x87,+pku,+prfchw,+rdpid,+rdrnd,+rdseed,+sha,+shstk,+vaes,+wbnoinvd,+xsave,+xsavec,+xsaveopt,+xsaves,+fxsr,+evex512", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "all"}> |
|
#map = affine_map<()[s0] -> (s0 ceildiv 16)> |
|
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#executable_target_embedded_elf_x86_64_]}> |
|
module attributes {hal.device.targets = [#device_target_llvm_cpu]} { |
|
stream.executable private @matmul_dynamic_dispatch_0 { |
|
stream.executable.export public @matmul_dynamic_dispatch_0_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) { |
|
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2 |
|
stream.return %x, %y, %z : index, index, index |
|
} |
|
builtin.module { |
|
func.func @matmul_dynamic_dispatch_0_pack_f32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: !stream.binding) { |
|
%c0 = arith.constant 0 : index |
|
%cst = arith.constant 0.000000e+00 : f32 |
|
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index |
|
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index |
|
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index |
|
%3 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %2} |
|
%4 = stream.binding.subspan %arg4[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2} |
|
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [%0, %2], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %2} -> tensor<?x?xf32> |
|
%6 = affine.apply #map()[%0] |
|
%7 = tensor.empty(%6, %2) : tensor<?x?x16x1xf32> |
|
%pack = tensor.pack %5 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 1] into %7 : tensor<?x?xf32> -> tensor<?x?x16x1xf32> |
|
flow.dispatch.tensor.store %pack, %4, offsets = [0, 0, 0, 0], sizes = [%1, %2, 16, 1], strides = [1, 1, 1, 1] : tensor<?x?x16x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2} |
|
return |
|
} |
|
} |
|
} |
|
stream.executable private @matmul_dynamic_dispatch_1 { |
|
stream.executable.export public @matmul_dynamic_dispatch_1_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) { |
|
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2 |
|
stream.return %x, %y, %z : index, index, index |
|
} |
|
builtin.module { |
|
func.func @matmul_dynamic_dispatch_1_pack_f32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: !stream.binding) { |
|
%c0 = arith.constant 0 : index |
|
%cst = arith.constant 0.000000e+00 : f32 |
|
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index |
|
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index |
|
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index |
|
%3 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %0} |
|
%4 = stream.binding.subspan %arg4[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2} |
|
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [%2, %0], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %0} -> tensor<?x?xf32> |
|
%6 = affine.apply #map()[%0] |
|
%7 = tensor.empty(%6, %2) : tensor<?x?x16x1xf32> |
|
%pack = tensor.pack %5 padding_value(%cst : f32) outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [16, 1] into %7 : tensor<?x?xf32> -> tensor<?x?x16x1xf32> |
|
flow.dispatch.tensor.store %pack, %4, offsets = [0, 0, 0, 0], sizes = [%1, %2, 16, 1], strides = [1, 1, 1, 1] : tensor<?x?x16x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2} |
|
return |
|
} |
|
} |
|
} |
|
stream.executable private @matmul_dynamic_dispatch_2 { |
|
stream.executable.export public @matmul_dynamic_dispatch_2_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) { |
|
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3 |
|
stream.return %x, %y, %z : index, index, index |
|
} |
|
builtin.module { |
|
func.func @matmul_dynamic_dispatch_2_pack_f32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: !stream.binding) { |
|
%c0 = arith.constant 0 : index |
|
%cst = arith.constant 0.000000e+00 : f32 |
|
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index |
|
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index |
|
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index |
|
%3 = flow.dispatch.workload.ordinal %arg4, 3 : index |
|
%4 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1} |
|
%5 = stream.binding.subspan %arg5[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%2, %3} |
|
%6 = flow.dispatch.tensor.load %4, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1} -> tensor<?x?xf32> |
|
%7 = affine.apply #map()[%1] |
|
%8 = affine.apply #map()[%0] |
|
%9 = tensor.empty(%8, %7) : tensor<?x?x16x16xf32> |
|
%pack = tensor.pack %6 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %9 : tensor<?x?xf32> -> tensor<?x?x16x16xf32> |
|
flow.dispatch.tensor.store %pack, %5, offsets = [0, 0, 0, 0], sizes = [%2, %3, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%2, %3} |
|
return |
|
} |
|
} |
|
} |
|
stream.executable private @matmul_dynamic_dispatch_3 { |
|
stream.executable.export public @matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index) -> (index, index, index) { |
|
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3, %arg4, %arg5 |
|
stream.return %x, %y, %z : index, index, index |
|
} |
|
builtin.module { |
|
func.func @matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index) { |
|
%c0 = arith.constant 0 : index |
|
%0 = flow.dispatch.workload.ordinal %arg3, 0 : index |
|
%1 = flow.dispatch.workload.ordinal %arg4, 1 : index |
|
%2 = flow.dispatch.workload.ordinal %arg5, 2 : index |
|
%3 = flow.dispatch.workload.ordinal %arg6, 3 : index |
|
%4 = flow.dispatch.workload.ordinal %arg7, 4 : index |
|
%5 = flow.dispatch.workload.ordinal %arg8, 5 : index |
|
%6 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%0, %1} |
|
%7 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%2, %3} |
|
%8 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5} |
|
%9 = flow.dispatch.tensor.load %6, offsets = [0, 0, 0, 0], sizes = [%0, %1, 16, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%0, %1} -> tensor<?x?x16x1xf32> |
|
%10 = flow.dispatch.tensor.load %7, offsets = [0, 0, 0, 0], sizes = [%2, %3, 16, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%2, %3} -> tensor<?x?x16x1xf32> |
|
%11 = flow.dispatch.tensor.load %8, offsets = [0, 0, 0, 0], sizes = [%4, %5, 16, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5} -> tensor<?x?x16x16xf32> |
|
%12 = linalg.mmt4d ins(%9, %10 : tensor<?x?x16x1xf32>, tensor<?x?x16x1xf32>) outs(%11 : tensor<?x?x16x16xf32>) -> tensor<?x?x16x16xf32> |
|
flow.dispatch.tensor.store %12, %8, offsets = [0, 0, 0, 0], sizes = [%4, %5, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5} |
|
return |
|
} |
|
} |
|
} |
|
stream.executable private @matmul_dynamic_dispatch_4 { |
|
stream.executable.export public @matmul_dynamic_dispatch_4_unpack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) { |
|
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3 |
|
stream.return %x, %y, %z : index, index, index |
|
} |
|
builtin.module { |
|
func.func @matmul_dynamic_dispatch_4_unpack_f32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: !stream.binding) { |
|
%c0 = arith.constant 0 : index |
|
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index |
|
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index |
|
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index |
|
%3 = flow.dispatch.workload.ordinal %arg4, 3 : index |
|
%4 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?x16x16xf32>>{%0, %1} |
|
%5 = stream.binding.subspan %arg5[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%2, %3} |
|
%6 = flow.dispatch.tensor.load %4, offsets = [0, 0, 0, 0], sizes = [%0, %1, 16, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x16xf32>>{%0, %1} -> tensor<?x?x16x16xf32> |
|
%7 = tensor.empty(%2, %3) : tensor<?x?xf32> |
|
%unpack = tensor.unpack %6 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %7 : tensor<?x?x16x16xf32> -> tensor<?x?xf32> |
|
flow.dispatch.tensor.store %unpack, %5, offsets = [0, 0], sizes = [%2, %3], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%2, %3} |
|
return |
|
} |
|
} |
|
} |
|
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} { |
|
%c1024 = arith.constant 1024 : index |
|
%c64 = arith.constant 64 : index |
|
%c4 = arith.constant 4 : index |
|
%c0 = arith.constant 0 : index |
|
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index |
|
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index |
|
%element_type_f32 = hal.element_type<f32> : i32 |
|
%dense_row_major = hal.encoding_type<dense_row_major> : i32 |
|
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1]) type(%element_type_f32) encoding(%dense_row_major) |
|
%2 = arith.muli %0, %c4 : index |
|
%3 = arith.muli %2, %1 : index |
|
%4 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<?x?xf32>{%0, %1} in !stream.resource<external>{%3} |
|
%5 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index |
|
%6 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index |
|
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%5, %6]) type(%element_type_f32) encoding(%dense_row_major) |
|
%7 = arith.muli %5, %c4 : index |
|
%8 = arith.muli %7, %6 : index |
|
%9 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<?x?xf32>{%5, %6} in !stream.resource<external>{%8} |
|
%10 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index |
|
%11 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index |
|
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("input2") shape([%10, %11]) type(%element_type_f32) encoding(%dense_row_major) |
|
%12 = arith.muli %10, %c4 : index |
|
%13 = arith.muli %12, %11 : index |
|
%14 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<?x?xf32>{%10, %11} in !stream.resource<external>{%13} |
|
%15 = affine.apply #map()[%0] |
|
%16 = arith.muli %15, %c64 : index |
|
%17 = arith.muli %16, %1 : index |
|
%18 = stream.async.dispatch @matmul_dynamic_dispatch_0::@matmul_dynamic_dispatch_0_pack_f32[%0, %15, %1](%4[%c0 to %3 for %3], %0, %15, %1) : (!stream.resource<external>{%3}, index, index, index) -> !stream.resource<transient>{%17} |
|
%19 = affine.apply #map()[%6] |
|
%20 = arith.muli %19, %c64 : index |
|
%21 = arith.muli %20, %5 : index |
|
%22 = stream.async.dispatch @matmul_dynamic_dispatch_1::@matmul_dynamic_dispatch_1_pack_f32[%6, %19, %5](%9[%c0 to %8 for %8], %6, %19, %5) : (!stream.resource<external>{%8}, index, index, index) -> !stream.resource<transient>{%21} |
|
%23 = affine.apply #map()[%10] |
|
%24 = affine.apply #map()[%11] |
|
%25 = arith.muli %23, %c1024 : index |
|
%26 = arith.muli %25, %24 : index |
|
%27 = stream.async.dispatch @matmul_dynamic_dispatch_2::@matmul_dynamic_dispatch_2_pack_f32[%10, %11, %23, %24](%14[%c0 to %13 for %13], %10, %11, %23, %24) : (!stream.resource<external>{%13}, index, index, index, index) -> !stream.resource<transient>{%26} |
|
%28 = stream.async.dispatch @matmul_dynamic_dispatch_3::@matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32[%15, %1, %19, %5, %23, %24](%18[%c0 to %17 for %17], %22[%c0 to %21 for %21], %27[%c0 to %26 for %26], %15, %1, %19, %5, %23, %24) : (!stream.resource<transient>{%17}, !stream.resource<transient>{%21}, !stream.resource<transient>{%26}, index, index, index, index, index, index) -> %27{%26} |
|
%29 = stream.async.dispatch @matmul_dynamic_dispatch_4::@matmul_dynamic_dispatch_4_unpack_f32[%23, %24, %10, %11](%28[%c0 to %26 for %26], %23, %24, %10, %11) : (!stream.resource<transient>{%26}, index, index, index, index) -> !stream.resource<external>{%13} |
|
%30 = stream.tensor.export %29 : tensor<?x?xf32>{%10, %11} in !stream.resource<external>{%13} -> !hal.buffer_view |
|
return %30 : !hal.buffer_view |
|
} |
|
} |
|
|
|
|
|
// -----// IR Dump After FuseGlobals (iree-util-fuse-globals) //----- // |
|
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "znver4", cpu_features = "+mmx,+popcnt,+sse,+sse2,+sse3,+ssse3,+sse4.1,+sse4.2,+avx,+avx2,+sse4a,+fma,+avx512f,+bmi,+bmi2,+aes,+pclmul,+avx512vl,+avx512bw,+avx512dq,+avx512cd,+avx512vbmi,+avx512ifma,+avx512vpopcntdq,+avx512vbmi2,+gfni,+vpclmulqdq,+avx512vnni,+avx512bitalg,+avx512bf16,+adx,+clflushopt,+clwb,+clzero,+cx16,+cx8,+f16c,+fsgsbase,+crc32,+invpcid,+rdpru,+sahf,+lzcnt,+movbe,+mwaitx,+x87,+pku,+prfchw,+rdpid,+rdrnd,+rdseed,+sha,+shstk,+vaes,+wbnoinvd,+xsave,+xsavec,+xsaveopt,+xsaves,+fxsr,+evex512", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "all"}> |
|
#map = affine_map<()[s0] -> (s0 ceildiv 16)> |
|
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#executable_target_embedded_elf_x86_64_]}> |
|
module attributes {hal.device.targets = [#device_target_llvm_cpu]} { |
|
stream.executable private @matmul_dynamic_dispatch_0 { |
|
stream.executable.export public @matmul_dynamic_dispatch_0_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) { |
|
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2 |
|
stream.return %x, %y, %z : index, index, index |
|
} |
|
builtin.module { |
|
func.func @matmul_dynamic_dispatch_0_pack_f32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: !stream.binding) { |
|
%c0 = arith.constant 0 : index |
|
%cst = arith.constant 0.000000e+00 : f32 |
|
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index |
|
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index |
|
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index |
|
%3 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %2} |
|
%4 = stream.binding.subspan %arg4[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2} |
|
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [%0, %2], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %2} -> tensor<?x?xf32> |
|
%6 = affine.apply #map()[%0] |
|
%7 = tensor.empty(%6, %2) : tensor<?x?x16x1xf32> |
|
%pack = tensor.pack %5 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 1] into %7 : tensor<?x?xf32> -> tensor<?x?x16x1xf32> |
|
flow.dispatch.tensor.store %pack, %4, offsets = [0, 0, 0, 0], sizes = [%1, %2, 16, 1], strides = [1, 1, 1, 1] : tensor<?x?x16x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2} |
|
return |
|
} |
|
} |
|
} |
|
stream.executable private @matmul_dynamic_dispatch_1 { |
|
stream.executable.export public @matmul_dynamic_dispatch_1_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) { |
|
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2 |
|
stream.return %x, %y, %z : index, index, index |
|
} |
|
builtin.module { |
|
func.func @matmul_dynamic_dispatch_1_pack_f32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: !stream.binding) { |
|
%c0 = arith.constant 0 : index |
|
%cst = arith.constant 0.000000e+00 : f32 |
|
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index |
|
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index |
|
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index |
|
%3 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %0} |
|
%4 = stream.binding.subspan %arg4[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2} |
|
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [%2, %0], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %0} -> tensor<?x?xf32> |
|
%6 = affine.apply #map()[%0] |
|
%7 = tensor.empty(%6, %2) : tensor<?x?x16x1xf32> |
|
%pack = tensor.pack %5 padding_value(%cst : f32) outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [16, 1] into %7 : tensor<?x?xf32> -> tensor<?x?x16x1xf32> |
|
flow.dispatch.tensor.store %pack, %4, offsets = [0, 0, 0, 0], sizes = [%1, %2, 16, 1], strides = [1, 1, 1, 1] : tensor<?x?x16x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2} |
|
return |
|
} |
|
} |
|
} |
|
stream.executable private @matmul_dynamic_dispatch_2 { |
|
stream.executable.export public @matmul_dynamic_dispatch_2_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) { |
|
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3 |
|
stream.return %x, %y, %z : index, index, index |
|
} |
|
builtin.module { |
|
func.func @matmul_dynamic_dispatch_2_pack_f32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: !stream.binding) { |
|
%c0 = arith.constant 0 : index |
|
%cst = arith.constant 0.000000e+00 : f32 |
|
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index |
|
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index |
|
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index |
|
%3 = flow.dispatch.workload.ordinal %arg4, 3 : index |
|
%4 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1} |
|
%5 = stream.binding.subspan %arg5[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%2, %3} |
|
%6 = flow.dispatch.tensor.load %4, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1} -> tensor<?x?xf32> |
|
%7 = affine.apply #map()[%1] |
|
%8 = affine.apply #map()[%0] |
|
%9 = tensor.empty(%8, %7) : tensor<?x?x16x16xf32> |
|
%pack = tensor.pack %6 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %9 : tensor<?x?xf32> -> tensor<?x?x16x16xf32> |
|
flow.dispatch.tensor.store %pack, %5, offsets = [0, 0, 0, 0], sizes = [%2, %3, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%2, %3} |
|
return |
|
} |
|
} |
|
} |
|
stream.executable private @matmul_dynamic_dispatch_3 { |
|
stream.executable.export public @matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index) -> (index, index, index) { |
|
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3, %arg4, %arg5 |
|
stream.return %x, %y, %z : index, index, index |
|
} |
|
builtin.module { |
|
func.func @matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index) { |
|
%c0 = arith.constant 0 : index |
|
%0 = flow.dispatch.workload.ordinal %arg3, 0 : index |
|
%1 = flow.dispatch.workload.ordinal %arg4, 1 : index |
|
%2 = flow.dispatch.workload.ordinal %arg5, 2 : index |
|
%3 = flow.dispatch.workload.ordinal %arg6, 3 : index |
|
%4 = flow.dispatch.workload.ordinal %arg7, 4 : index |
|
%5 = flow.dispatch.workload.ordinal %arg8, 5 : index |
|
%6 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%0, %1} |
|
%7 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%2, %3} |
|
%8 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5} |
|
%9 = flow.dispatch.tensor.load %6, offsets = [0, 0, 0, 0], sizes = [%0, %1, 16, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%0, %1} -> tensor<?x?x16x1xf32> |
|
%10 = flow.dispatch.tensor.load %7, offsets = [0, 0, 0, 0], sizes = [%2, %3, 16, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%2, %3} -> tensor<?x?x16x1xf32> |
|
%11 = flow.dispatch.tensor.load %8, offsets = [0, 0, 0, 0], sizes = [%4, %5, 16, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5} -> tensor<?x?x16x16xf32> |
|
%12 = linalg.mmt4d ins(%9, %10 : tensor<?x?x16x1xf32>, tensor<?x?x16x1xf32>) outs(%11 : tensor<?x?x16x16xf32>) -> tensor<?x?x16x16xf32> |
|
flow.dispatch.tensor.store %12, %8, offsets = [0, 0, 0, 0], sizes = [%4, %5, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5} |
|
return |
|
} |
|
} |
|
} |
|
stream.executable private @matmul_dynamic_dispatch_4 { |
|
stream.executable.export public @matmul_dynamic_dispatch_4_unpack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) { |
|
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3 |
|
stream.return %x, %y, %z : index, index, index |
|
} |
|
builtin.module { |
|
func.func @matmul_dynamic_dispatch_4_unpack_f32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: !stream.binding) { |
|
%c0 = arith.constant 0 : index |
|
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index |
|
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index |
|
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index |
|
%3 = flow.dispatch.workload.ordinal %arg4, 3 : index |
|
%4 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?x16x16xf32>>{%0, %1} |
|
%5 = stream.binding.subspan %arg5[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%2, %3} |
|
%6 = flow.dispatch.tensor.load %4, offsets = [0, 0, 0, 0], sizes = [%0, %1, 16, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x16xf32>>{%0, %1} -> tensor<?x?x16x16xf32> |
|
%7 = tensor.empty(%2, %3) : tensor<?x?xf32> |
|
%unpack = tensor.unpack %6 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %7 : tensor<?x?x16x16xf32> -> tensor<?x?xf32> |
|
flow.dispatch.tensor.store %unpack, %5, offsets = [0, 0], sizes = [%2, %3], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%2, %3} |
|
return |
|
} |
|
} |
|
} |
|
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} { |
|
%c1024 = arith.constant 1024 : index |
|
%c64 = arith.constant 64 : index |
|
%c4 = arith.constant 4 : index |
|
%c0 = arith.constant 0 : index |
|
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index |
|
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index |
|
%element_type_f32 = hal.element_type<f32> : i32 |
|
%dense_row_major = hal.encoding_type<dense_row_major> : i32 |
|
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1]) type(%element_type_f32) encoding(%dense_row_major) |
|
%2 = arith.muli %0, %c4 : index |
|
%3 = arith.muli %2, %1 : index |
|
%4 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<?x?xf32>{%0, %1} in !stream.resource<external>{%3} |
|
%5 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index |
|
%6 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index |
|
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%5, %6]) type(%element_type_f32) encoding(%dense_row_major) |
|
%7 = arith.muli %5, %c4 : index |
|
%8 = arith.muli %7, %6 : index |
|
%9 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<?x?xf32>{%5, %6} in !stream.resource<external>{%8} |
|
%10 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index |
|
%11 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index |
|
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("input2") shape([%10, %11]) type(%element_type_f32) encoding(%dense_row_major) |
|
%12 = arith.muli %10, %c4 : index |
|
%13 = arith.muli %12, %11 : index |
|
%14 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<?x?xf32>{%10, %11} in !stream.resource<external>{%13} |
|
%15 = affine.apply #map()[%0] |
|
%16 = arith.muli %15, %c64 : index |
|
%17 = arith.muli %16, %1 : index |
|
%18 = stream.async.dispatch @matmul_dynamic_dispatch_0::@matmul_dynamic_dispatch_0_pack_f32[%0, %15, %1](%4[%c0 to %3 for %3], %0, %15, %1) : (!stream.resource<external>{%3}, index, index, index) -> !stream.resource<transient>{%17} |
|
%19 = affine.apply #map()[%6] |
|
%20 = arith.muli %19, %c64 : index |
|
%21 = arith.muli %20, %5 : index |
|
%22 = stream.async.dispatch @matmul_dynamic_dispatch_1::@matmul_dynamic_dispatch_1_pack_f32[%6, %19, %5](%9[%c0 to %8 for %8], %6, %19, %5) : (!stream.resource<external>{%8}, index, index, index) -> !stream.resource<transient>{%21} |
|
%23 = affine.apply #map()[%10] |
|
%24 = affine.apply #map()[%11] |
|
%25 = arith.muli %23, %c1024 : index |
|
%26 = arith.muli %25, %24 : index |
|
%27 = stream.async.dispatch @matmul_dynamic_dispatch_2::@matmul_dynamic_dispatch_2_pack_f32[%10, %11, %23, %24](%14[%c0 to %13 for %13], %10, %11, %23, %24) : (!stream.resource<external>{%13}, index, index, index, index) -> !stream.resource<transient>{%26} |
|
%28 = stream.async.dispatch @matmul_dynamic_dispatch_3::@matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32[%15, %1, %19, %5, %23, %24](%18[%c0 to %17 for %17], %22[%c0 to %21 for %21], %27[%c0 to %26 for %26], %15, %1, %19, %5, %23, %24) : (!stream.resource<transient>{%17}, !stream.resource<transient>{%21}, !stream.resource<transient>{%26}, index, index, index, index, index, index) -> %27{%26} |
|
%29 = stream.async.dispatch @matmul_dynamic_dispatch_4::@matmul_dynamic_dispatch_4_unpack_f32[%23, %24, %10, %11](%28[%c0 to %26 for %26], %23, %24, %10, %11) : (!stream.resource<transient>{%26}, index, index, index, index) -> !stream.resource<external>{%13} |
|
%30 = stream.tensor.export %29 : tensor<?x?xf32>{%10, %11} in !stream.resource<external>{%13} -> !hal.buffer_view |
|
return %30 : !hal.buffer_view |
|
} |
|
} |
|
|
|
|
|
// -----// IR Dump After IPO (iree-util-ipo) //----- // |
|
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "znver4", cpu_features = "+mmx,+popcnt,+sse,+sse2,+sse3,+ssse3,+sse4.1,+sse4.2,+avx,+avx2,+sse4a,+fma,+avx512f,+bmi,+bmi2,+aes,+pclmul,+avx512vl,+avx512bw,+avx512dq,+avx512cd,+avx512vbmi,+avx512ifma,+avx512vpopcntdq,+avx512vbmi2,+gfni,+vpclmulqdq,+avx512vnni,+avx512bitalg,+avx512bf16,+adx,+clflushopt,+clwb,+clzero,+cx16,+cx8,+f16c,+fsgsbase,+crc32,+invpcid,+rdpru,+sahf,+lzcnt,+movbe,+mwaitx,+x87,+pku,+prfchw,+rdpid,+rdrnd,+rdseed,+sha,+shstk,+vaes,+wbnoinvd,+xsave,+xsavec,+xsaveopt,+xsaves,+fxsr,+evex512", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "all"}> |
|
#map = affine_map<()[s0] -> (s0 ceildiv 16)> |
|
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#executable_target_embedded_elf_x86_64_]}> |
|
module attributes {hal.device.targets = [#device_target_llvm_cpu]} { |
|
stream.executable private @matmul_dynamic_dispatch_0 { |
|
stream.executable.export public @matmul_dynamic_dispatch_0_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) { |
|
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2 |
|
stream.return %x, %y, %z : index, index, index |
|
} |
|
builtin.module { |
|
func.func @matmul_dynamic_dispatch_0_pack_f32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: !stream.binding) { |
|
%c0 = arith.constant 0 : index |
|
%cst = arith.constant 0.000000e+00 : f32 |
|
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index |
|
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index |
|
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index |
|
%3 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %2} |
|
%4 = stream.binding.subspan %arg4[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2} |
|
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [%0, %2], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %2} -> tensor<?x?xf32> |
|
%6 = affine.apply #map()[%0] |
|
%7 = tensor.empty(%6, %2) : tensor<?x?x16x1xf32> |
|
%pack = tensor.pack %5 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 1] into %7 : tensor<?x?xf32> -> tensor<?x?x16x1xf32> |
|
flow.dispatch.tensor.store %pack, %4, offsets = [0, 0, 0, 0], sizes = [%1, %2, 16, 1], strides = [1, 1, 1, 1] : tensor<?x?x16x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2} |
|
return |
|
} |
|
} |
|
} |
|
stream.executable private @matmul_dynamic_dispatch_1 { |
|
stream.executable.export public @matmul_dynamic_dispatch_1_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) { |
|
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2 |
|
stream.return %x, %y, %z : index, index, index |
|
} |
|
builtin.module { |
|
func.func @matmul_dynamic_dispatch_1_pack_f32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: !stream.binding) { |
|
%c0 = arith.constant 0 : index |
|
%cst = arith.constant 0.000000e+00 : f32 |
|
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index |
|
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index |
|
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index |
|
%3 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %0} |
|
%4 = stream.binding.subspan %arg4[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2} |
|
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [%2, %0], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %0} -> tensor<?x?xf32> |
|
%6 = affine.apply #map()[%0] |
|
%7 = tensor.empty(%6, %2) : tensor<?x?x16x1xf32> |
|
%pack = tensor.pack %5 padding_value(%cst : f32) outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [16, 1] into %7 : tensor<?x?xf32> -> tensor<?x?x16x1xf32> |
|
flow.dispatch.tensor.store %pack, %4, offsets = [0, 0, 0, 0], sizes = [%1, %2, 16, 1], strides = [1, 1, 1, 1] : tensor<?x?x16x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2} |
|
return |
|
} |
|
} |
|
} |
|
stream.executable private @matmul_dynamic_dispatch_2 { |
|
stream.executable.export public @matmul_dynamic_dispatch_2_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) { |
|
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3 |
|
stream.return %x, %y, %z : index, index, index |
|
} |
|
builtin.module { |
|
func.func @matmul_dynamic_dispatch_2_pack_f32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: !stream.binding) { |
|
%c0 = arith.constant 0 : index |
|
%cst = arith.constant 0.000000e+00 : f32 |
|
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index |
|
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index |
|
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index |
|
%3 = flow.dispatch.workload.ordinal %arg4, 3 : index |
|
%4 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1} |
|
%5 = stream.binding.subspan %arg5[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%2, %3} |
|
%6 = flow.dispatch.tensor.load %4, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1} -> tensor<?x?xf32> |
|
%7 = affine.apply #map()[%1] |
|
%8 = affine.apply #map()[%0] |
|
%9 = tensor.empty(%8, %7) : tensor<?x?x16x16xf32> |
|
%pack = tensor.pack %6 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %9 : tensor<?x?xf32> -> tensor<?x?x16x16xf32> |
|
flow.dispatch.tensor.store %pack, %5, offsets = [0, 0, 0, 0], sizes = [%2, %3, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%2, %3} |
|
return |
|
} |
|
} |
|
} |
|
stream.executable private @matmul_dynamic_dispatch_3 { |
|
stream.executable.export public @matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index) -> (index, index, index) { |
|
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3, %arg4, %arg5 |
|
stream.return %x, %y, %z : index, index, index |
|
} |
|
builtin.module { |
|
func.func @matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index) { |
|
%c0 = arith.constant 0 : index |
|
%0 = flow.dispatch.workload.ordinal %arg3, 0 : index |
|
%1 = flow.dispatch.workload.ordinal %arg4, 1 : index |
|
%2 = flow.dispatch.workload.ordinal %arg5, 2 : index |
|
%3 = flow.dispatch.workload.ordinal %arg6, 3 : index |
|
%4 = flow.dispatch.workload.ordinal %arg7, 4 : index |
|
%5 = flow.dispatch.workload.ordinal %arg8, 5 : index |
|
%6 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%0, %1} |
|
%7 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%2, %3} |
|
%8 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5} |
|
%9 = flow.dispatch.tensor.load %6, offsets = [0, 0, 0, 0], sizes = [%0, %1, 16, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%0, %1} -> tensor<?x?x16x1xf32> |
|
%10 = flow.dispatch.tensor.load %7, offsets = [0, 0, 0, 0], sizes = [%2, %3, 16, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%2, %3} -> tensor<?x?x16x1xf32> |
|
%11 = flow.dispatch.tensor.load %8, offsets = [0, 0, 0, 0], sizes = [%4, %5, 16, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5} -> tensor<?x?x16x16xf32> |
|
%12 = linalg.mmt4d ins(%9, %10 : tensor<?x?x16x1xf32>, tensor<?x?x16x1xf32>) outs(%11 : tensor<?x?x16x16xf32>) -> tensor<?x?x16x16xf32> |
|
flow.dispatch.tensor.store %12, %8, offsets = [0, 0, 0, 0], sizes = [%4, %5, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5} |
|
return |
|
} |
|
} |
|
} |
|
stream.executable private @matmul_dynamic_dispatch_4 { |
|
stream.executable.export public @matmul_dynamic_dispatch_4_unpack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) { |
|
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3 |
|
stream.return %x, %y, %z : index, index, index |
|
} |
|
builtin.module { |
|
func.func @matmul_dynamic_dispatch_4_unpack_f32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: !stream.binding) { |
|
%c0 = arith.constant 0 : index |
|
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index |
|
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index |
|
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index |
|
%3 = flow.dispatch.workload.ordinal %arg4, 3 : index |
|
%4 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?x16x16xf32>>{%0, %1} |
|
%5 = stream.binding.subspan %arg5[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%2, %3} |
|
%6 = flow.dispatch.tensor.load %4, offsets = [0, 0, 0, 0], sizes = [%0, %1, 16, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x16xf32>>{%0, %1} -> tensor<?x?x16x16xf32> |
|
%7 = tensor.empty(%2, %3) : tensor<?x?xf32> |
|
%unpack = tensor.unpack %6 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %7 : tensor<?x?x16x16xf32> -> tensor<?x?xf32> |
|
flow.dispatch.tensor.store %unpack, %5, offsets = [0, 0], sizes = [%2, %3], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%2, %3} |
|
return |
|
} |
|
} |
|
} |
|
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} { |
|
%c1024 = arith.constant 1024 : index |
|
%c64 = arith.constant 64 : index |
|
%c4 = arith.constant 4 : index |
|
%c0 = arith.constant 0 : index |
|
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index |
|
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index |
|
%element_type_f32 = hal.element_type<f32> : i32 |
|
%dense_row_major = hal.encoding_type<dense_row_major> : i32 |
|
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1]) type(%element_type_f32) encoding(%dense_row_major) |
|
%2 = arith.muli %0, %c4 : index |
|
%3 = arith.muli %2, %1 : index |
|
%4 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<?x?xf32>{%0, %1} in !stream.resource<external>{%3} |
|
%5 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index |
|
%6 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index |
|
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%5, %6]) type(%element_type_f32) encoding(%dense_row_major) |
|
%7 = arith.muli %5, %c4 : index |
|
%8 = arith.muli %7, %6 : index |
|
%9 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<?x?xf32>{%5, %6} in !stream.resource<external>{%8} |
|
%10 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index |
|
%11 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index |
|
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("input2") shape([%10, %11]) type(%element_type_f32) encoding(%dense_row_major) |
|
%12 = arith.muli %10, %c4 : index |
|
%13 = arith.muli %12, %11 : index |
|
%14 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<?x?xf32>{%10, %11} in !stream.resource<external>{%13} |
|
%15 = affine.apply #map()[%0] |
|
%16 = arith.muli %15, %c64 : index |
|
%17 = arith.muli %16, %1 : index |
|
%18 = stream.async.dispatch @matmul_dynamic_dispatch_0::@matmul_dynamic_dispatch_0_pack_f32[%0, %15, %1](%4[%c0 to %3 for %3], %0, %15, %1) : (!stream.resource<external>{%3}, index, index, index) -> !stream.resource<transient>{%17} |
|
%19 = affine.apply #map()[%6] |
|
%20 = arith.muli %19, %c64 : index |
|
%21 = arith.muli %20, %5 : index |
|
%22 = stream.async.dispatch @matmul_dynamic_dispatch_1::@matmul_dynamic_dispatch_1_pack_f32[%6, %19, %5](%9[%c0 to %8 for %8], %6, %19, %5) : (!stream.resource<external>{%8}, index, index, index) -> !stream.resource<transient>{%21} |
|
%23 = affine.apply #map()[%10] |
|
%24 = affine.apply #map()[%11] |
|
%25 = arith.muli %23, %c1024 : index |
|
%26 = arith.muli %25, %24 : index |
|
%27 = stream.async.dispatch @matmul_dynamic_dispatch_2::@matmul_dynamic_dispatch_2_pack_f32[%10, %11, %23, %24](%14[%c0 to %13 for %13], %10, %11, %23, %24) : (!stream.resource<external>{%13}, index, index, index, index) -> !stream.resource<transient>{%26} |
|
%28 = stream.async.dispatch @matmul_dynamic_dispatch_3::@matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32[%15, %1, %19, %5, %23, %24](%18[%c0 to %17 for %17], %22[%c0 to %21 for %21], %27[%c0 to %26 for %26], %15, %1, %19, %5, %23, %24) : (!stream.resource<transient>{%17}, !stream.resource<transient>{%21}, !stream.resource<transient>{%26}, index, index, index, index, index, index) -> %27{%26} |
|
%29 = stream.async.dispatch @matmul_dynamic_dispatch_4::@matmul_dynamic_dispatch_4_unpack_f32[%23, %24, %10, %11](%28[%c0 to %26 for %26], %23, %24, %10, %11) : (!stream.resource<transient>{%26}, index, index, index, index) -> !stream.resource<external>{%13} |
|
%30 = stream.tensor.export %29 : tensor<?x?xf32>{%10, %11} in !stream.resource<external>{%13} -> !hal.buffer_view |
|
return %30 : !hal.buffer_view |
|
} |
|
} |
|
|
|
|
|
// -----// IR Dump After VerifyAsyncAccessRangesPass (iree-stream-verify-async-access-ranges) //----- // |
|
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "znver4", cpu_features = "+mmx,+popcnt,+sse,+sse2,+sse3,+ssse3,+sse4.1,+sse4.2,+avx,+avx2,+sse4a,+fma,+avx512f,+bmi,+bmi2,+aes,+pclmul,+avx512vl,+avx512bw,+avx512dq,+avx512cd,+avx512vbmi,+avx512ifma,+avx512vpopcntdq,+avx512vbmi2,+gfni,+vpclmulqdq,+avx512vnni,+avx512bitalg,+avx512bf16,+adx,+clflushopt,+clwb,+clzero,+cx16,+cx8,+f16c,+fsgsbase,+crc32,+invpcid,+rdpru,+sahf,+lzcnt,+movbe,+mwaitx,+x87,+pku,+prfchw,+rdpid,+rdrnd,+rdseed,+sha,+shstk,+vaes,+wbnoinvd,+xsave,+xsavec,+xsaveopt,+xsaves,+fxsr,+evex512", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "all"}> |
|
#map = affine_map<()[s0] -> (s0 ceildiv 16)> |
|
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#executable_target_embedded_elf_x86_64_]}> |
|
module attributes {hal.device.targets = [#device_target_llvm_cpu]} { |
|
stream.executable private @matmul_dynamic_dispatch_0 { |
|
stream.executable.export public @matmul_dynamic_dispatch_0_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) { |
|
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2 |
|
stream.return %x, %y, %z : index, index, index |
|
} |
|
builtin.module { |
|
func.func @matmul_dynamic_dispatch_0_pack_f32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: !stream.binding) { |
|
%c0 = arith.constant 0 : index |
|
%cst = arith.constant 0.000000e+00 : f32 |
|
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index |
|
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index |
|
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index |
|
%3 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %2} |
|
%4 = stream.binding.subspan %arg4[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2} |
|
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [%0, %2], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %2} -> tensor<?x?xf32> |
|
%6 = affine.apply #map()[%0] |
|
%7 = tensor.empty(%6, %2) : tensor<?x?x16x1xf32> |
|
%pack = tensor.pack %5 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 1] into %7 : tensor<?x?xf32> -> tensor<?x?x16x1xf32> |
|
flow.dispatch.tensor.store %pack, %4, offsets = [0, 0, 0, 0], sizes = [%1, %2, 16, 1], strides = [1, 1, 1, 1] : tensor<?x?x16x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2} |
|
return |
|
} |
|
} |
|
} |
|
stream.executable private @matmul_dynamic_dispatch_1 { |
|
stream.executable.export public @matmul_dynamic_dispatch_1_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) { |
|
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2 |
|
stream.return %x, %y, %z : index, index, index |
|
} |
|
builtin.module { |
|
func.func @matmul_dynamic_dispatch_1_pack_f32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: !stream.binding) { |
|
%c0 = arith.constant 0 : index |
|
%cst = arith.constant 0.000000e+00 : f32 |
|
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index |
|
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index |
|
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index |
|
%3 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %0} |
|
%4 = stream.binding.subspan %arg4[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2} |
|
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [%2, %0], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %0} -> tensor<?x?xf32> |
|
%6 = affine.apply #map()[%0] |
|
%7 = tensor.empty(%6, %2) : tensor<?x?x16x1xf32> |
|
%pack = tensor.pack %5 padding_value(%cst : f32) outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [16, 1] into %7 : tensor<?x?xf32> -> tensor<?x?x16x1xf32> |
|
flow.dispatch.tensor.store %pack, %4, offsets = [0, 0, 0, 0], sizes = [%1, %2, 16, 1], strides = [1, 1, 1, 1] : tensor<?x?x16x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2} |
|
return |
|
} |
|
} |
|
} |
|
stream.executable private @matmul_dynamic_dispatch_2 { |
|
stream.executable.export public @matmul_dynamic_dispatch_2_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) { |
|
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3 |
|
stream.return %x, %y, %z : index, index, index |
|
} |
|
builtin.module { |
|
func.func @matmul_dynamic_dispatch_2_pack_f32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: !stream.binding) { |
|
%c0 = arith.constant 0 : index |
|
%cst = arith.constant 0.000000e+00 : f32 |
|
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index |
|
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index |
|
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index |
|
%3 = flow.dispatch.workload.ordinal %arg4, 3 : index |
|
%4 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1} |
|
%5 = stream.binding.subspan %arg5[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%2, %3} |
|
%6 = flow.dispatch.tensor.load %4, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1} -> tensor<?x?xf32> |
|
%7 = affine.apply #map()[%1] |
|
%8 = affine.apply #map()[%0] |
|
%9 = tensor.empty(%8, %7) : tensor<?x?x16x16xf32> |
|
%pack = tensor.pack %6 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %9 : tensor<?x?xf32> -> tensor<?x?x16x16xf32> |
|
flow.dispatch.tensor.store %pack, %5, offsets = [0, 0, 0, 0], sizes = [%2, %3, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%2, %3} |
|
return |
|
} |
|
} |
|
} |
|
stream.executable private @matmul_dynamic_dispatch_3 { |
|
stream.executable.export public @matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index) -> (index, index, index) { |
|
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3, %arg4, %arg5 |
|
stream.return %x, %y, %z : index, index, index |
|
} |
|
builtin.module { |
|
func.func @matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index) { |
|
%c0 = arith.constant 0 : index |
|
%0 = flow.dispatch.workload.ordinal %arg3, 0 : index |
|
%1 = flow.dispatch.workload.ordinal %arg4, 1 : index |
|
%2 = flow.dispatch.workload.ordinal %arg5, 2 : index |
|
%3 = flow.dispatch.workload.ordinal %arg6, 3 : index |
|
%4 = flow.dispatch.workload.ordinal %arg7, 4 : index |
|
%5 = flow.dispatch.workload.ordinal %arg8, 5 : index |
|
%6 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%0, %1} |
|
%7 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%2, %3} |
|
%8 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5} |
|
%9 = flow.dispatch.tensor.load %6, offsets = [0, 0, 0, 0], sizes = [%0, %1, 16, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%0, %1} -> tensor<?x?x16x1xf32> |
|
%10 = flow.dispatch.tensor.load %7, offsets = [0, 0, 0, 0], sizes = [%2, %3, 16, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%2, %3} -> tensor<?x?x16x1xf32> |
|
%11 = flow.dispatch.tensor.load %8, offsets = [0, 0, 0, 0], sizes = [%4, %5, 16, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5} -> tensor<?x?x16x16xf32> |
|
%12 = linalg.mmt4d ins(%9, %10 : tensor<?x?x16x1xf32>, tensor<?x?x16x1xf32>) outs(%11 : tensor<?x?x16x16xf32>) -> tensor<?x?x16x16xf32> |
|
flow.dispatch.tensor.store %12, %8, offsets = [0, 0, 0, 0], sizes = [%4, %5, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5} |
|
return |
|
} |
|
} |
|
} |
|
stream.executable private @matmul_dynamic_dispatch_4 { |
|
stream.executable.export public @matmul_dynamic_dispatch_4_unpack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) { |
|
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3 |
|
stream.return %x, %y, %z : index, index, index |
|
} |
|
builtin.module { |
|
func.func @matmul_dynamic_dispatch_4_unpack_f32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: !stream.binding) { |
|
%c0 = arith.constant 0 : index |
|
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index |
|
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index |
|
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index |
|
%3 = flow.dispatch.workload.ordinal %arg4, 3 : index |
|
%4 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?x16x16xf32>>{%0, %1} |
|
%5 = stream.binding.subspan %arg5[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%2, %3} |
|
%6 = flow.dispatch.tensor.load %4, offsets = [0, 0, 0, 0], sizes = [%0, %1, 16, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x16xf32>>{%0, %1} -> tensor<?x?x16x16xf32> |
|
%7 = tensor.empty(%2, %3) : tensor<?x?xf32> |
|
%unpack = tensor.unpack %6 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %7 : tensor<?x?x16x16xf32> -> tensor<?x?xf32> |
|
flow.dispatch.tensor.store %unpack, %5, offsets = [0, 0], sizes = [%2, %3], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%2, %3} |
|
return |
|
} |
|
} |
|
} |
|
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} { |
|
%c1024 = arith.constant 1024 : index |
|
%c64 = arith.constant 64 : index |
|
%c4 = arith.constant 4 : index |
|
%c0 = arith.constant 0 : index |
|
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index |
|
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index |
|
%element_type_f32 = hal.element_type<f32> : i32 |
|
%dense_row_major = hal.encoding_type<dense_row_major> : i32 |
|
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1]) type(%element_type_f32) encoding(%dense_row_major) |
|
%2 = arith.muli %0, %c4 : index |
|
%3 = arith.muli %2, %1 : index |
|
%4 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<?x?xf32>{%0, %1} in !stream.resource<external>{%3} |
|
%5 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index |
|
%6 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index |
|
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%5, %6]) type(%element_type_f32) encoding(%dense_row_major) |
|
%7 = arith.muli %5, %c4 : index |
|
%8 = arith.muli %7, %6 : index |
|
%9 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<?x?xf32>{%5, %6} in !stream.resource<external>{%8} |
|
%10 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index |
|
%11 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index |
|
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("input2") shape([%10, %11]) type(%element_type_f32) encoding(%dense_row_major) |
|
%12 = arith.muli %10, %c4 : index |
|
%13 = arith.muli %12, %11 : index |
|
%14 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<?x?xf32>{%10, %11} in !stream.resource<external>{%13} |
|
%15 = affine.apply #map()[%0] |
|
%16 = arith.muli %15, %c64 : index |
|
%17 = arith.muli %16, %1 : index |
|
%18 = stream.async.dispatch @matmul_dynamic_dispatch_0::@matmul_dynamic_dispatch_0_pack_f32[%0, %15, %1](%4[%c0 to %3 for %3], %0, %15, %1) : (!stream.resource<external>{%3}, index, index, index) -> !stream.resource<transient>{%17} |
|
%19 = affine.apply #map()[%6] |
|
%20 = arith.muli %19, %c64 : index |
|
%21 = arith.muli %20, %5 : index |
|
%22 = stream.async.dispatch @matmul_dynamic_dispatch_1::@matmul_dynamic_dispatch_1_pack_f32[%6, %19, %5](%9[%c0 to %8 for %8], %6, %19, %5) : (!stream.resource<external>{%8}, index, index, index) -> !stream.resource<transient>{%21} |
|
%23 = affine.apply #map()[%10] |
|
%24 = affine.apply #map()[%11] |
|
%25 = arith.muli %23, %c1024 : index |
|
%26 = arith.muli %25, %24 : index |
|
%27 = stream.async.dispatch @matmul_dynamic_dispatch_2::@matmul_dynamic_dispatch_2_pack_f32[%10, %11, %23, %24](%14[%c0 to %13 for %13], %10, %11, %23, %24) : (!stream.resource<external>{%13}, index, index, index, index) -> !stream.resource<transient>{%26} |
|
%28 = stream.async.dispatch @matmul_dynamic_dispatch_3::@matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32[%15, %1, %19, %5, %23, %24](%18[%c0 to %17 for %17], %22[%c0 to %21 for %21], %27[%c0 to %26 for %26], %15, %1, %19, %5, %23, %24) : (!stream.resource<transient>{%17}, !stream.resource<transient>{%21}, !stream.resource<transient>{%26}, index, index, index, index, index, index) -> %27{%26} |
|
%29 = stream.async.dispatch @matmul_dynamic_dispatch_4::@matmul_dynamic_dispatch_4_unpack_f32[%23, %24, %10, %11](%28[%c0 to %26 for %26], %23, %24, %10, %11) : (!stream.resource<transient>{%26}, index, index, index, index) -> !stream.resource<external>{%13} |
|
%30 = stream.tensor.export %29 : tensor<?x?xf32>{%10, %11} in !stream.resource<external>{%13} -> !hal.buffer_view |
|
return %30 : !hal.buffer_view |
|
} |
|
} |
|
|
|
|
|
// -----// IR Dump After ScheduleExecutionPass (iree-stream-schedule-execution) //----- // |
|
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} { |
|
%c1024 = arith.constant 1024 : index |
|
%c64 = arith.constant 64 : index |
|
%c4 = arith.constant 4 : index |
|
%c0 = arith.constant 0 : index |
|
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index |
|
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index |
|
%element_type_f32 = hal.element_type<f32> : i32 |
|
%dense_row_major = hal.encoding_type<dense_row_major> : i32 |
|
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1]) type(%element_type_f32) encoding(%dense_row_major) |
|
%2 = arith.muli %0, %c4 : index |
|
%3 = arith.muli %2, %1 : index |
|
%4 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<?x?xf32>{%0, %1} in !stream.resource<external>{%3} |
|
%5 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index |
|
%6 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index |
|
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%5, %6]) type(%element_type_f32) encoding(%dense_row_major) |
|
%7 = arith.muli %5, %c4 : index |
|
%8 = arith.muli %7, %6 : index |
|
%9 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<?x?xf32>{%5, %6} in !stream.resource<external>{%8} |
|
%10 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index |
|
%11 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index |
|
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("input2") shape([%10, %11]) type(%element_type_f32) encoding(%dense_row_major) |
|
%12 = arith.muli %10, %c4 : index |
|
%13 = arith.muli %12, %11 : index |
|
%14 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<?x?xf32>{%10, %11} in !stream.resource<external>{%13} |
|
%15 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%0] |
|
%16 = arith.muli %15, %c64 : index |
|
%17 = arith.muli %16, %1 : index |
|
%18 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%6] |
|
%19 = arith.muli %18, %c64 : index |
|
%20 = arith.muli %19, %5 : index |
|
%21 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%10] |
|
%22 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%11] |
|
%23 = arith.muli %21, %c1024 : index |
|
%24 = arith.muli %23, %22 : index |
|
%results, %result_timepoint = stream.async.execute with(%4 as %arg3: !stream.resource<external>{%3}, %9 as %arg4: !stream.resource<external>{%8}, %14 as %arg5: !stream.resource<external>{%13}) -> !stream.resource<external>{%13} { |
|
%27 = stream.async.dispatch @matmul_dynamic_dispatch_0::@matmul_dynamic_dispatch_0_pack_f32[%0, %15, %1](%arg3[%c0 to %3 for %3], %0, %15, %1) : (!stream.resource<external>{%3}, index, index, index) -> !stream.resource<transient>{%17} |
|
%28 = stream.async.dispatch @matmul_dynamic_dispatch_1::@matmul_dynamic_dispatch_1_pack_f32[%6, %18, %5](%arg4[%c0 to %8 for %8], %6, %18, %5) : (!stream.resource<external>{%8}, index, index, index) -> !stream.resource<transient>{%20} |
|
%29 = stream.async.dispatch @matmul_dynamic_dispatch_2::@matmul_dynamic_dispatch_2_pack_f32[%10, %11, %21, %22](%arg5[%c0 to %13 for %13], %10, %11, %21, %22) : (!stream.resource<external>{%13}, index, index, index, index) -> !stream.resource<transient>{%24} |
|
%30 = stream.async.dispatch @matmul_dynamic_dispatch_3::@matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32[%15, %1, %18, %5, %21, %22](%27[%c0 to %17 for %17], %28[%c0 to %20 for %20], %29[%c0 to %24 for %24], %15, %1, %18, %5, %21, %22) : (!stream.resource<transient>{%17}, !stream.resource<transient>{%20}, !stream.resource<transient>{%24}, index, index, index, index, index, index) -> %29{%24} |
|
%31 = stream.async.dispatch @matmul_dynamic_dispatch_4::@matmul_dynamic_dispatch_4_unpack_f32[%21, %22, %10, %11](%30[%c0 to %24 for %24], %21, %22, %10, %11) : (!stream.resource<transient>{%24}, index, index, index, index) -> !stream.resource<external>{%13} |
|
stream.yield %31 : !stream.resource<external>{%13} |
|
} => !stream.timepoint |
|
%25 = stream.timepoint.await %result_timepoint => %results : !stream.resource<external>{%13} |
|
%26 = stream.tensor.export %25 : tensor<?x?xf32>{%10, %11} in !stream.resource<external>{%13} -> !hal.buffer_view |
|
return %26 : !hal.buffer_view |
|
} |
|
|
|
// -----// IR Dump After ScheduleConcurrencyPass (iree-stream-schedule-concurrency) //----- // |
|
// Host-side entry point as printed after ScheduleConcurrencyPass. |
// Imports the three !hal.buffer_view arguments as external stream resources, runs three pack dispatches (grouped in a stream.async.concurrent region), one mmt4d dispatch and one unpack dispatch inside a single stream.async.execute region, then awaits the result and exports it as a !hal.buffer_view. |
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} { |

%c1024 = arith.constant 1024 : index |

%c64 = arith.constant 64 : index |

%c4 = arith.constant 4 : index |

%c0 = arith.constant 0 : index |

// input0: query both dims, assert shape/type/encoding, then import as an external resource of %3 = %0 * 4 * %1 bytes. |
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index |

%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index |

%element_type_f32 = hal.element_type<f32> : i32 |

%dense_row_major = hal.encoding_type<dense_row_major> : i32 |

hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1]) type(%element_type_f32) encoding(%dense_row_major) |

%2 = arith.muli %0, %c4 : index |

%3 = arith.muli %2, %1 : index |

%4 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<?x?xf32>{%0, %1} in !stream.resource<external>{%3} |

// input1: same sequence; resource size is %8 = %5 * 4 * %6. |
%5 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index |

%6 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index |

hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%5, %6]) type(%element_type_f32) encoding(%dense_row_major) |

%7 = arith.muli %5, %c4 : index |

%8 = arith.muli %7, %6 : index |

%9 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<?x?xf32>{%5, %6} in !stream.resource<external>{%8} |

// input2: same sequence; resource size is %13 = %10 * 4 * %11. %13 is also the size of the final result resource. |
%10 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index |

%11 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index |

hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("input2") shape([%10, %11]) type(%element_type_f32) encoding(%dense_row_major) |

%12 = arith.muli %10, %c4 : index |

%13 = arith.muli %12, %11 : index |

%14 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<?x?xf32>{%10, %11} in !stream.resource<external>{%13} |

// %17 = ceildiv(%0, 16) * 64 * %1: size of the transient produced by dispatch_0. NOTE(review): 64 presumably is a 16x1 tile of 4-byte f32 - confirm against the dispatch_0 output type. |
%15 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%0] |

%16 = arith.muli %15, %c64 : index |

%17 = arith.muli %16, %1 : index |

// %20 = ceildiv(%6, 16) * 64 * %5: size of the transient produced by dispatch_1. |
%18 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%6] |

%19 = arith.muli %18, %c64 : index |

%20 = arith.muli %19, %5 : index |

// %24 = ceildiv(%10, 16) * 1024 * ceildiv(%11, 16): size of the transient produced by dispatch_2. NOTE(review): 1024 presumably is a 16x16 tile of 4-byte f32 - confirm against the dispatch_2 output type. |
%21 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%10] |

%22 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%11] |

%23 = arith.muli %21, %c1024 : index |

%24 = arith.muli %23, %22 : index |

// Single execution region capturing the three imported resources; yields one external resource of %13 bytes plus a timepoint. |
%results, %result_timepoint = stream.async.execute with(%4 as %arg3: !stream.resource<external>{%3}, %9 as %arg4: !stream.resource<external>{%8}, %14 as %arg5: !stream.resource<external>{%13}) -> !stream.resource<external>{%13} { |

// The three pack dispatches each read a different captured resource and are grouped in one stream.async.concurrent region; their results come back as %27#0, %27#1, %27#2. |
%27:3 = stream.async.concurrent with(%arg3 as %arg6: !stream.resource<external>{%3}, %arg4 as %arg7: !stream.resource<external>{%8}, %arg5 as %arg8: !stream.resource<external>{%13}) -> (!stream.resource<transient>{%17}, !stream.resource<transient>{%20}, !stream.resource<transient>{%24}) { |

%30 = stream.async.dispatch @matmul_dynamic_dispatch_0::@matmul_dynamic_dispatch_0_pack_f32[%0, %15, %1](%arg6[%c0 to %3 for %3], %0, %15, %1) : (!stream.resource<external>{%3}, index, index, index) -> !stream.resource<transient>{%17} |

%31 = stream.async.dispatch @matmul_dynamic_dispatch_1::@matmul_dynamic_dispatch_1_pack_f32[%6, %18, %5](%arg7[%c0 to %8 for %8], %6, %18, %5) : (!stream.resource<external>{%8}, index, index, index) -> !stream.resource<transient>{%20} |

%32 = stream.async.dispatch @matmul_dynamic_dispatch_2::@matmul_dynamic_dispatch_2_pack_f32[%10, %11, %21, %22](%arg8[%c0 to %13 for %13], %10, %11, %21, %22) : (!stream.resource<external>{%13}, index, index, index, index) -> !stream.resource<transient>{%24} |

stream.yield %30, %31, %32 : !stream.resource<transient>{%17}, !stream.resource<transient>{%20}, !stream.resource<transient>{%24} |

} |

// mmt4d over the three packed transients; the result is spelled `%27#2{%24}`, i.e. it is tied to the third operand rather than a fresh resource. |
%28 = stream.async.dispatch @matmul_dynamic_dispatch_3::@matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32[%15, %1, %18, %5, %21, %22](%27#0[%c0 to %17 for %17], %27#1[%c0 to %20 for %20], %27#2[%c0 to %24 for %24], %15, %1, %18, %5, %21, %22) : (!stream.resource<transient>{%17}, !stream.resource<transient>{%20}, !stream.resource<transient>{%24}, index, index, index, index, index, index) -> %27#2{%24} |

// Unpack the mmt4d result into an external resource of %13 bytes (same size as the imported input2). |
%29 = stream.async.dispatch @matmul_dynamic_dispatch_4::@matmul_dynamic_dispatch_4_unpack_f32[%21, %22, %10, %11](%28[%c0 to %24 for %24], %21, %22, %10, %11) : (!stream.resource<transient>{%24}, index, index, index, index) -> !stream.resource<external>{%13} |

stream.yield %29 : !stream.resource<external>{%13} |

} => !stream.timepoint |

// Wait on the region's timepoint before exporting the result with the dims of input2 (%10 x %11). |
%25 = stream.timepoint.await %result_timepoint => %results : !stream.resource<external>{%13} |

%26 = stream.tensor.export %25 : tensor<?x?xf32>{%10, %11} in !stream.resource<external>{%13} -> !hal.buffer_view |

return %26 : !hal.buffer_view |

} |
|
|
|
// -----// IR Dump After PropagateTimepointsPass (iree-stream-propagate-timepoints) //----- // |
|
// Attribute aliases referenced by the module below. |
// Executable target: "llvm-cpu" backend, "embedded-elf-x86_64" format, cpu "znver4", native_vector_size 64, ukernels "all". |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "znver4", cpu_features = "+mmx,+popcnt,+sse,+sse2,+sse3,+ssse3,+sse4.1,+sse4.2,+avx,+avx2,+sse4a,+fma,+avx512f,+bmi,+bmi2,+aes,+pclmul,+avx512vl,+avx512bw,+avx512dq,+avx512cd,+avx512vbmi,+avx512ifma,+avx512vpopcntdq,+avx512vbmi2,+gfni,+vpclmulqdq,+avx512vnni,+avx512bitalg,+avx512bf16,+adx,+clflushopt,+clwb,+clzero,+cx16,+cx8,+f16c,+fsgsbase,+crc32,+invpcid,+rdpru,+sahf,+lzcnt,+movbe,+mwaitx,+x87,+pku,+prfchw,+rdpid,+rdrnd,+rdseed,+sha,+shstk,+vaes,+wbnoinvd,+xsave,+xsavec,+xsaveopt,+xsaves,+fxsr,+evex512", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "all"}> |

// #map: one symbol s0 -> s0 ceildiv 16; applied below wherever a dimension is divided into 16-wide tiles. |
#map = affine_map<()[s0] -> (s0 ceildiv 16)> |

// Device target listing the single executable target above. |
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#executable_target_embedded_elf_x86_64_]}> |
|
module attributes {hal.device.targets = [#device_target_llvm_cpu]} { |
|
// Executable holding one pack kernel: reads a tensor<?x?xf32> from binding %arg0 and writes a tensor<?x?x16x1xf32> to binding %arg4. |
stream.executable private @matmul_dynamic_dispatch_0 { |

// Export: the workgroup count is derived from the three workload values via flow.dispatch.workgroup_count_from_slice. |
stream.executable.export public @matmul_dynamic_dispatch_0_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) { |

%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2 |

stream.return %x, %y, %z : index, index, index |

} |

builtin.module { |

func.func @matmul_dynamic_dispatch_0_pack_f32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: !stream.binding) { |

%c0 = arith.constant 0 : index |

// Padding value used by tensor.pack below. |
%cst = arith.constant 0.000000e+00 : f32 |

// Workload ordinals: %0 and %2 are the source dims, %1 and %2 are the dynamic outer dims of the packed destination. |
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index |

%1 = flow.dispatch.workload.ordinal %arg2, 1 : index |

%2 = flow.dispatch.workload.ordinal %arg3, 2 : index |

%3 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %2} |

%4 = stream.binding.subspan %arg4[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2} |

%5 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [%0, %2], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %2} -> tensor<?x?xf32> |

// %6 recomputes ceildiv(%0, 16) locally for the destination's first outer dim, while the store below uses ordinal %1 for the same dim. NOTE(review): presumably equal by construction at the call site - confirm. |
%6 = affine.apply #map()[%0] |

%7 = tensor.empty(%6, %2) : tensor<?x?x16x1xf32> |

// Tile source dim 0 by 16 and dim 1 by 1, padding partial tiles with %cst. |
%pack = tensor.pack %5 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 1] into %7 : tensor<?x?xf32> -> tensor<?x?x16x1xf32> |

flow.dispatch.tensor.store %pack, %4, offsets = [0, 0, 0, 0], sizes = [%1, %2, 16, 1], strides = [1, 1, 1, 1] : tensor<?x?x16x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2} |

return |

} |

} |

} |
|
// Executable holding one pack kernel with permuted dims: reads a tensor<?x?xf32> of shape {%2, %0} from binding %arg0 and writes a tensor<?x?x16x1xf32> of outer shape {%1, %2} to binding %arg4. |
stream.executable private @matmul_dynamic_dispatch_1 { |

// Export: the workgroup count is derived from the three workload values via flow.dispatch.workgroup_count_from_slice. |
stream.executable.export public @matmul_dynamic_dispatch_1_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) { |

%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2 |

stream.return %x, %y, %z : index, index, index |

} |

builtin.module { |

func.func @matmul_dynamic_dispatch_1_pack_f32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: !stream.binding) { |

%c0 = arith.constant 0 : index |

// Padding value used by tensor.pack below. |
%cst = arith.constant 0.000000e+00 : f32 |

// Workload ordinals: the source is shaped {%2, %0} (ordinal 2 first), the destination outer dims are {%1, %2}. |
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index |

%1 = flow.dispatch.workload.ordinal %arg2, 1 : index |

%2 = flow.dispatch.workload.ordinal %arg3, 2 : index |

%3 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %0} |

%4 = stream.binding.subspan %arg4[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2} |

%5 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [%2, %0], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %0} -> tensor<?x?xf32> |

// %6 recomputes ceildiv(%0, 16) locally, while the store below uses ordinal %1 for the same dim. NOTE(review): presumably equal by construction at the call site - confirm. |
%6 = affine.apply #map()[%0] |

%7 = tensor.empty(%6, %2) : tensor<?x?x16x1xf32> |

// inner_dims_pos = [1, 0] with inner_tiles = [16, 1]: source dim 1 is tiled by 16 and dim 0 by 1; outer_dims_perm = [1, 0] swaps the outer dims. Partial tiles are padded with %cst. |
%pack = tensor.pack %5 padding_value(%cst : f32) outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [16, 1] into %7 : tensor<?x?xf32> -> tensor<?x?x16x1xf32> |

flow.dispatch.tensor.store %pack, %4, offsets = [0, 0, 0, 0], sizes = [%1, %2, 16, 1], strides = [1, 1, 1, 1] : tensor<?x?x16x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2} |

return |

} |

} |

} |
|
// Executable holding one pack kernel: reads a tensor<?x?xf32> from binding %arg0 and writes a tensor<?x?x16x16xf32> to binding %arg5. |
stream.executable private @matmul_dynamic_dispatch_2 { |

// Export: the workgroup count is derived from the four workload values via flow.dispatch.workgroup_count_from_slice. |
stream.executable.export public @matmul_dynamic_dispatch_2_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) { |

%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3 |

stream.return %x, %y, %z : index, index, index |

} |

builtin.module { |

func.func @matmul_dynamic_dispatch_2_pack_f32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: !stream.binding) { |

%c0 = arith.constant 0 : index |

// Padding value used by tensor.pack below. |
%cst = arith.constant 0.000000e+00 : f32 |

// Workload ordinals: %0, %1 are the source dims; %2, %3 are the dynamic outer dims of the packed destination. |
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index |

%1 = flow.dispatch.workload.ordinal %arg2, 1 : index |

%2 = flow.dispatch.workload.ordinal %arg3, 2 : index |

%3 = flow.dispatch.workload.ordinal %arg4, 3 : index |

%4 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1} |

%5 = stream.binding.subspan %arg5[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%2, %3} |

%6 = flow.dispatch.tensor.load %4, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1} -> tensor<?x?xf32> |

// %8 = ceildiv(%0, 16) and %7 = ceildiv(%1, 16) are recomputed locally for tensor.empty, while the store below uses ordinals %2 and %3. NOTE(review): presumably equal by construction at the call site - confirm. |
%7 = affine.apply #map()[%1] |

%8 = affine.apply #map()[%0] |

%9 = tensor.empty(%8, %7) : tensor<?x?x16x16xf32> |

// Tile both source dims by 16, padding partial tiles with %cst. |
%pack = tensor.pack %6 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %9 : tensor<?x?xf32> -> tensor<?x?x16x16xf32> |

flow.dispatch.tensor.store %pack, %5, offsets = [0, 0, 0, 0], sizes = [%2, %3, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%2, %3} |

return |

} |

} |

} |
|
// Executable holding the linalg.mmt4d kernel: two readonly tensor<?x?x16x1xf32> inputs (bindings %arg0, %arg1) and one readwrite tensor<?x?x16x16xf32> accumulator (binding %arg2) that is both loaded and stored. |
stream.executable private @matmul_dynamic_dispatch_3 { |

// Export: the workgroup count is derived from the six workload values via flow.dispatch.workgroup_count_from_slice. |
stream.executable.export public @matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index) -> (index, index, index) { |

%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3, %arg4, %arg5 |

stream.return %x, %y, %z : index, index, index |

} |

builtin.module { |

func.func @matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index) { |

%c0 = arith.constant 0 : index |

// Workload ordinals: {%0, %1} are the outer dims of the first input, {%2, %3} of the second, {%4, %5} of the accumulator. |
%0 = flow.dispatch.workload.ordinal %arg3, 0 : index |

%1 = flow.dispatch.workload.ordinal %arg4, 1 : index |

%2 = flow.dispatch.workload.ordinal %arg5, 2 : index |

%3 = flow.dispatch.workload.ordinal %arg6, 3 : index |

%4 = flow.dispatch.workload.ordinal %arg7, 4 : index |

%5 = flow.dispatch.workload.ordinal %arg8, 5 : index |

%6 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%0, %1} |

%7 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%2, %3} |

%8 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5} |

%9 = flow.dispatch.tensor.load %6, offsets = [0, 0, 0, 0], sizes = [%0, %1, 16, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%0, %1} -> tensor<?x?x16x1xf32> |

%10 = flow.dispatch.tensor.load %7, offsets = [0, 0, 0, 0], sizes = [%2, %3, 16, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%2, %3} -> tensor<?x?x16x1xf32> |

// The accumulator is loaded from the readwrite binding and passed as the `outs` operand of mmt4d. |
%11 = flow.dispatch.tensor.load %8, offsets = [0, 0, 0, 0], sizes = [%4, %5, 16, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5} -> tensor<?x?x16x16xf32> |

%12 = linalg.mmt4d ins(%9, %10 : tensor<?x?x16x1xf32>, tensor<?x?x16x1xf32>) outs(%11 : tensor<?x?x16x16xf32>) -> tensor<?x?x16x16xf32> |

// The result is stored back into the same binding (%8) it was loaded from. |
flow.dispatch.tensor.store %12, %8, offsets = [0, 0, 0, 0], sizes = [%4, %5, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5} |

return |

} |

} |

} |
|
// Executable holding one unpack kernel: reads a tensor<?x?x16x16xf32> from binding %arg0 and writes a tensor<?x?xf32> to binding %arg5. |
stream.executable private @matmul_dynamic_dispatch_4 { |

// Export: the workgroup count is derived from the four workload values via flow.dispatch.workgroup_count_from_slice. |
stream.executable.export public @matmul_dynamic_dispatch_4_unpack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) { |

%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3 |

stream.return %x, %y, %z : index, index, index |

} |

builtin.module { |

func.func @matmul_dynamic_dispatch_4_unpack_f32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: !stream.binding) { |

%c0 = arith.constant 0 : index |

// Workload ordinals: {%0, %1} are the outer dims of the packed source, {%2, %3} the dims of the unpacked destination. |
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index |

%1 = flow.dispatch.workload.ordinal %arg2, 1 : index |

%2 = flow.dispatch.workload.ordinal %arg3, 2 : index |

%3 = flow.dispatch.workload.ordinal %arg4, 3 : index |

%4 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?x16x16xf32>>{%0, %1} |

%5 = stream.binding.subspan %arg5[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%2, %3} |

%6 = flow.dispatch.tensor.load %4, offsets = [0, 0, 0, 0], sizes = [%0, %1, 16, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x16xf32>>{%0, %1} -> tensor<?x?x16x16xf32> |

// Destination of the unpack is sized directly from ordinals %2 and %3. |
%7 = tensor.empty(%2, %3) : tensor<?x?xf32> |

// Undo the 16x16 tiling into the %2 x %3 destination. |
%unpack = tensor.unpack %6 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %7 : tensor<?x?x16x16xf32> -> tensor<?x?xf32> |

flow.dispatch.tensor.store %unpack, %5, offsets = [0, 0], sizes = [%2, %3], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%2, %3} |

return |

} |

} |

} |
|
// Host-side entry point as printed after PropagateTimepointsPass. |
// Imports the three !hal.buffer_view arguments as external stream resources, joins three immediate timepoints into one that the execute region awaits, runs the pack / mmt4d / unpack dispatches inside that region, then awaits the result and exports it as a !hal.buffer_view. |
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} { |

%c1024 = arith.constant 1024 : index |

%c64 = arith.constant 64 : index |

%c4 = arith.constant 4 : index |

%c0 = arith.constant 0 : index |

// input0: query both dims, assert shape/type/encoding, then import as an external resource of %3 = %0 * 4 * %1 bytes. |
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index |

%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index |

%element_type_f32 = hal.element_type<f32> : i32 |

%dense_row_major = hal.encoding_type<dense_row_major> : i32 |

hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1]) type(%element_type_f32) encoding(%dense_row_major) |

%2 = arith.muli %0, %c4 : index |

%3 = arith.muli %2, %1 : index |

%4 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<?x?xf32>{%0, %1} in !stream.resource<external>{%3} |

// input1: same sequence; resource size is %8 = %5 * 4 * %6. |
%5 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index |

%6 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index |

hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%5, %6]) type(%element_type_f32) encoding(%dense_row_major) |

%7 = arith.muli %5, %c4 : index |

%8 = arith.muli %7, %6 : index |

%9 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<?x?xf32>{%5, %6} in !stream.resource<external>{%8} |

// input2: same sequence; resource size is %13 = %10 * 4 * %11. %13 is also the size of the final result resource. |
%10 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index |

%11 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index |

hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("input2") shape([%10, %11]) type(%element_type_f32) encoding(%dense_row_major) |

%12 = arith.muli %10, %c4 : index |

%13 = arith.muli %12, %11 : index |

%14 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<?x?xf32>{%10, %11} in !stream.resource<external>{%13} |

// %17 = ceildiv(%0, 16) * 64 * %1: size of the transient produced by dispatch_0. NOTE(review): 64 presumably is a 16x1 tile of 4-byte f32 - confirm against the dispatch_0 output type. |
%15 = affine.apply #map()[%0] |

%16 = arith.muli %15, %c64 : index |

%17 = arith.muli %16, %1 : index |

// %20 = ceildiv(%6, 16) * 64 * %5: size of the transient produced by dispatch_1. |
%18 = affine.apply #map()[%6] |

%19 = arith.muli %18, %c64 : index |

%20 = arith.muli %19, %5 : index |

// %24 = ceildiv(%10, 16) * 1024 * ceildiv(%11, 16): size of the transient produced by dispatch_2. NOTE(review): 1024 presumably is a 16x16 tile of 4-byte f32 - confirm against the dispatch_2 output type. |
%21 = affine.apply #map()[%10] |

%22 = affine.apply #map()[%11] |

%23 = arith.muli %21, %c1024 : index |

%24 = arith.muli %23, %22 : index |

// Three immediate timepoints (one per captured resource, by count) joined into %28, which the execute region awaits. |
%25 = stream.timepoint.immediate => !stream.timepoint |

%26 = stream.timepoint.immediate => !stream.timepoint |

%27 = stream.timepoint.immediate => !stream.timepoint |

%28 = stream.timepoint.join max(%25, %26, %27) => !stream.timepoint |

// Single execution region capturing the three imported resources; yields one external resource of %13 bytes plus a timepoint. |
%results, %result_timepoint = stream.async.execute await(%28) => with(%4 as %arg3: !stream.resource<external>{%3}, %9 as %arg4: !stream.resource<external>{%8}, %14 as %arg5: !stream.resource<external>{%13}) -> !stream.resource<external>{%13} { |

// The three pack dispatches each read a different captured resource and are grouped in one stream.async.concurrent region; their results come back as %31#0, %31#1, %31#2. |
%31:3 = stream.async.concurrent with(%arg3 as %arg6: !stream.resource<external>{%3}, %arg4 as %arg7: !stream.resource<external>{%8}, %arg5 as %arg8: !stream.resource<external>{%13}) -> (!stream.resource<transient>{%17}, !stream.resource<transient>{%20}, !stream.resource<transient>{%24}) { |

%34 = stream.async.dispatch @matmul_dynamic_dispatch_0::@matmul_dynamic_dispatch_0_pack_f32[%0, %15, %1](%arg6[%c0 to %3 for %3], %0, %15, %1) : (!stream.resource<external>{%3}, index, index, index) -> !stream.resource<transient>{%17} |

%35 = stream.async.dispatch @matmul_dynamic_dispatch_1::@matmul_dynamic_dispatch_1_pack_f32[%6, %18, %5](%arg7[%c0 to %8 for %8], %6, %18, %5) : (!stream.resource<external>{%8}, index, index, index) -> !stream.resource<transient>{%20} |

%36 = stream.async.dispatch @matmul_dynamic_dispatch_2::@matmul_dynamic_dispatch_2_pack_f32[%10, %11, %21, %22](%arg8[%c0 to %13 for %13], %10, %11, %21, %22) : (!stream.resource<external>{%13}, index, index, index, index) -> !stream.resource<transient>{%24} |

stream.yield %34, %35, %36 : !stream.resource<transient>{%17}, !stream.resource<transient>{%20}, !stream.resource<transient>{%24} |

} |

// mmt4d over the three packed transients; the result is spelled `%31#2{%24}`, i.e. it is tied to the third operand rather than a fresh resource. |
%32 = stream.async.dispatch @matmul_dynamic_dispatch_3::@matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32[%15, %1, %18, %5, %21, %22](%31#0[%c0 to %17 for %17], %31#1[%c0 to %20 for %20], %31#2[%c0 to %24 for %24], %15, %1, %18, %5, %21, %22) : (!stream.resource<transient>{%17}, !stream.resource<transient>{%20}, !stream.resource<transient>{%24}, index, index, index, index, index, index) -> %31#2{%24} |

// Unpack the mmt4d result into an external resource of %13 bytes (same size as the imported input2). |
%33 = stream.async.dispatch @matmul_dynamic_dispatch_4::@matmul_dynamic_dispatch_4_unpack_f32[%21, %22, %10, %11](%32[%c0 to %24 for %24], %21, %22, %10, %11) : (!stream.resource<transient>{%24}, index, index, index, index) -> !stream.resource<external>{%13} |

stream.yield %33 : !stream.resource<external>{%13} |

} => !stream.timepoint |

// Wait on the region's timepoint before exporting the result with the dims of input2 (%10 x %11). |
%29 = stream.timepoint.await %result_timepoint => %results : !stream.resource<external>{%13} |

%30 = stream.tensor.export %29 : tensor<?x?xf32>{%10, %11} in !stream.resource<external>{%13} -> !hal.buffer_view |

return %30 : !hal.buffer_view |

} |
|
} |
|
|
|
|
|
// -----// IR Dump After MaterializeBuiltinsPass (iree-stream-materialize-builtins) //----- // |
|
// Attribute aliases referenced by the module below. |
// Executable target: "llvm-cpu" backend, "embedded-elf-x86_64" format, cpu "znver4", native_vector_size 64, ukernels "all". |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "znver4", cpu_features = "+mmx,+popcnt,+sse,+sse2,+sse3,+ssse3,+sse4.1,+sse4.2,+avx,+avx2,+sse4a,+fma,+avx512f,+bmi,+bmi2,+aes,+pclmul,+avx512vl,+avx512bw,+avx512dq,+avx512cd,+avx512vbmi,+avx512ifma,+avx512vpopcntdq,+avx512vbmi2,+gfni,+vpclmulqdq,+avx512vnni,+avx512bitalg,+avx512bf16,+adx,+clflushopt,+clwb,+clzero,+cx16,+cx8,+f16c,+fsgsbase,+crc32,+invpcid,+rdpru,+sahf,+lzcnt,+movbe,+mwaitx,+x87,+pku,+prfchw,+rdpid,+rdrnd,+rdseed,+sha,+shstk,+vaes,+wbnoinvd,+xsave,+xsavec,+xsaveopt,+xsaves,+fxsr,+evex512", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "all"}> |

// #map: one symbol s0 -> s0 ceildiv 16; applied below wherever a dimension is divided into 16-wide tiles. |
#map = affine_map<()[s0] -> (s0 ceildiv 16)> |

// Device target listing the single executable target above. |
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#executable_target_embedded_elf_x86_64_]}> |
|
module attributes {hal.device.targets = [#device_target_llvm_cpu]} { |
|
// Executable holding one pack kernel: reads a tensor<?x?xf32> from binding %arg0 and writes a tensor<?x?x16x1xf32> to binding %arg4. |
stream.executable private @matmul_dynamic_dispatch_0 { |

// Export: the workgroup count is derived from the three workload values via flow.dispatch.workgroup_count_from_slice. |
stream.executable.export public @matmul_dynamic_dispatch_0_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) { |

%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2 |

stream.return %x, %y, %z : index, index, index |

} |

builtin.module { |

func.func @matmul_dynamic_dispatch_0_pack_f32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: !stream.binding) { |

%c0 = arith.constant 0 : index |

// Padding value used by tensor.pack below. |
%cst = arith.constant 0.000000e+00 : f32 |

// Workload ordinals: %0 and %2 are the source dims, %1 and %2 are the dynamic outer dims of the packed destination. |
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index |

%1 = flow.dispatch.workload.ordinal %arg2, 1 : index |

%2 = flow.dispatch.workload.ordinal %arg3, 2 : index |

%3 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %2} |

%4 = stream.binding.subspan %arg4[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2} |

%5 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [%0, %2], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %2} -> tensor<?x?xf32> |

// %6 recomputes ceildiv(%0, 16) locally for the destination's first outer dim, while the store below uses ordinal %1 for the same dim. NOTE(review): presumably equal by construction at the call site - confirm. |
%6 = affine.apply #map()[%0] |

%7 = tensor.empty(%6, %2) : tensor<?x?x16x1xf32> |

// Tile source dim 0 by 16 and dim 1 by 1, padding partial tiles with %cst. |
%pack = tensor.pack %5 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 1] into %7 : tensor<?x?xf32> -> tensor<?x?x16x1xf32> |

flow.dispatch.tensor.store %pack, %4, offsets = [0, 0, 0, 0], sizes = [%1, %2, 16, 1], strides = [1, 1, 1, 1] : tensor<?x?x16x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2} |

return |

} |

} |

} |
|
// Executable holding one pack kernel with permuted dims: reads a tensor<?x?xf32> of shape {%2, %0} from binding %arg0 and writes a tensor<?x?x16x1xf32> of outer shape {%1, %2} to binding %arg4. |
stream.executable private @matmul_dynamic_dispatch_1 { |

// Export: the workgroup count is derived from the three workload values via flow.dispatch.workgroup_count_from_slice. |
stream.executable.export public @matmul_dynamic_dispatch_1_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) { |

%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2 |

stream.return %x, %y, %z : index, index, index |

} |

builtin.module { |

func.func @matmul_dynamic_dispatch_1_pack_f32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: !stream.binding) { |

%c0 = arith.constant 0 : index |

// Padding value used by tensor.pack below. |
%cst = arith.constant 0.000000e+00 : f32 |

// Workload ordinals: the source is shaped {%2, %0} (ordinal 2 first), the destination outer dims are {%1, %2}. |
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index |

%1 = flow.dispatch.workload.ordinal %arg2, 1 : index |

%2 = flow.dispatch.workload.ordinal %arg3, 2 : index |

%3 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %0} |

%4 = stream.binding.subspan %arg4[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2} |

%5 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [%2, %0], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %0} -> tensor<?x?xf32> |

// %6 recomputes ceildiv(%0, 16) locally, while the store below uses ordinal %1 for the same dim. NOTE(review): presumably equal by construction at the call site - confirm. |
%6 = affine.apply #map()[%0] |

%7 = tensor.empty(%6, %2) : tensor<?x?x16x1xf32> |

// inner_dims_pos = [1, 0] with inner_tiles = [16, 1]: source dim 1 is tiled by 16 and dim 0 by 1; outer_dims_perm = [1, 0] swaps the outer dims. Partial tiles are padded with %cst. |
%pack = tensor.pack %5 padding_value(%cst : f32) outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [16, 1] into %7 : tensor<?x?xf32> -> tensor<?x?x16x1xf32> |

flow.dispatch.tensor.store %pack, %4, offsets = [0, 0, 0, 0], sizes = [%1, %2, 16, 1], strides = [1, 1, 1, 1] : tensor<?x?x16x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2} |

return |

} |

} |

} |
|
// Executable holding one pack kernel: reads a tensor<?x?xf32> from binding %arg0 and writes a tensor<?x?x16x16xf32> to binding %arg5. |
stream.executable private @matmul_dynamic_dispatch_2 { |

// Export: the workgroup count is derived from the four workload values via flow.dispatch.workgroup_count_from_slice. |
stream.executable.export public @matmul_dynamic_dispatch_2_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) { |

%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3 |

stream.return %x, %y, %z : index, index, index |

} |

builtin.module { |

func.func @matmul_dynamic_dispatch_2_pack_f32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: !stream.binding) { |

%c0 = arith.constant 0 : index |

// Padding value used by tensor.pack below. |
%cst = arith.constant 0.000000e+00 : f32 |

// Workload ordinals: %0, %1 are the source dims; %2, %3 are the dynamic outer dims of the packed destination. |
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index |

%1 = flow.dispatch.workload.ordinal %arg2, 1 : index |

%2 = flow.dispatch.workload.ordinal %arg3, 2 : index |

%3 = flow.dispatch.workload.ordinal %arg4, 3 : index |

%4 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1} |

%5 = stream.binding.subspan %arg5[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%2, %3} |

%6 = flow.dispatch.tensor.load %4, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1} -> tensor<?x?xf32> |

// %8 = ceildiv(%0, 16) and %7 = ceildiv(%1, 16) are recomputed locally for tensor.empty, while the store below uses ordinals %2 and %3. NOTE(review): presumably equal by construction at the call site - confirm. |
%7 = affine.apply #map()[%1] |

%8 = affine.apply #map()[%0] |

%9 = tensor.empty(%8, %7) : tensor<?x?x16x16xf32> |

// Tile both source dims by 16, padding partial tiles with %cst. |
%pack = tensor.pack %6 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %9 : tensor<?x?xf32> -> tensor<?x?x16x16xf32> |

flow.dispatch.tensor.store %pack, %5, offsets = [0, 0, 0, 0], sizes = [%2, %3, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%2, %3} |

return |

} |

} |

} |
|
// Executable holding the linalg.mmt4d kernel: two readonly tensor<?x?x16x1xf32> inputs (bindings %arg0, %arg1) and one readwrite tensor<?x?x16x16xf32> accumulator (binding %arg2) that is both loaded and stored. |
stream.executable private @matmul_dynamic_dispatch_3 { |

// Export: the workgroup count is derived from the six workload values via flow.dispatch.workgroup_count_from_slice. |
stream.executable.export public @matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index) -> (index, index, index) { |

%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3, %arg4, %arg5 |

stream.return %x, %y, %z : index, index, index |

} |

builtin.module { |

func.func @matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index) { |

%c0 = arith.constant 0 : index |

// Workload ordinals: {%0, %1} are the outer dims of the first input, {%2, %3} of the second, {%4, %5} of the accumulator. |
%0 = flow.dispatch.workload.ordinal %arg3, 0 : index |

%1 = flow.dispatch.workload.ordinal %arg4, 1 : index |

%2 = flow.dispatch.workload.ordinal %arg5, 2 : index |

%3 = flow.dispatch.workload.ordinal %arg6, 3 : index |

%4 = flow.dispatch.workload.ordinal %arg7, 4 : index |

%5 = flow.dispatch.workload.ordinal %arg8, 5 : index |

%6 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%0, %1} |

%7 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%2, %3} |

%8 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5} |

%9 = flow.dispatch.tensor.load %6, offsets = [0, 0, 0, 0], sizes = [%0, %1, 16, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%0, %1} -> tensor<?x?x16x1xf32> |

%10 = flow.dispatch.tensor.load %7, offsets = [0, 0, 0, 0], sizes = [%2, %3, 16, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%2, %3} -> tensor<?x?x16x1xf32> |

// The accumulator is loaded from the readwrite binding and passed as the `outs` operand of mmt4d. |
%11 = flow.dispatch.tensor.load %8, offsets = [0, 0, 0, 0], sizes = [%4, %5, 16, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5} -> tensor<?x?x16x16xf32> |

%12 = linalg.mmt4d ins(%9, %10 : tensor<?x?x16x1xf32>, tensor<?x?x16x1xf32>) outs(%11 : tensor<?x?x16x16xf32>) -> tensor<?x?x16x16xf32> |

// The result is stored back into the same binding (%8) it was loaded from. |
flow.dispatch.tensor.store %12, %8, offsets = [0, 0, 0, 0], sizes = [%4, %5, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5} |

return |

} |

} |

} |
|
// Executable holding one unpack kernel: reads a tensor<?x?x16x16xf32> from binding %arg0 and writes a tensor<?x?xf32> to binding %arg5. |
stream.executable private @matmul_dynamic_dispatch_4 { |

// Export: the workgroup count is derived from the four workload values via flow.dispatch.workgroup_count_from_slice. |
stream.executable.export public @matmul_dynamic_dispatch_4_unpack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) { |

%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3 |

stream.return %x, %y, %z : index, index, index |

} |

builtin.module { |

func.func @matmul_dynamic_dispatch_4_unpack_f32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: !stream.binding) { |

%c0 = arith.constant 0 : index |

// Workload ordinals: {%0, %1} are the outer dims of the packed source, {%2, %3} the dims of the unpacked destination. |
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index |

%1 = flow.dispatch.workload.ordinal %arg2, 1 : index |

%2 = flow.dispatch.workload.ordinal %arg3, 2 : index |

%3 = flow.dispatch.workload.ordinal %arg4, 3 : index |

%4 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?x16x16xf32>>{%0, %1} |

%5 = stream.binding.subspan %arg5[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%2, %3} |

%6 = flow.dispatch.tensor.load %4, offsets = [0, 0, 0, 0], sizes = [%0, %1, 16, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x16xf32>>{%0, %1} -> tensor<?x?x16x16xf32> |

// Destination of the unpack is sized directly from ordinals %2 and %3. |
%7 = tensor.empty(%2, %3) : tensor<?x?xf32> |

// Undo the 16x16 tiling into the %2 x %3 destination. |
%unpack = tensor.unpack %6 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %7 : tensor<?x?x16x16xf32> -> tensor<?x?xf32> |

flow.dispatch.tensor.store %unpack, %5, offsets = [0, 0], sizes = [%2, %3], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%2, %3} |

return |

} |

} |

} |
|
// Host-side entry point as printed after MaterializeBuiltinsPass. |
// Imports the three !hal.buffer_view arguments as external stream resources, joins three immediate timepoints into one that the execute region awaits, runs the pack / mmt4d / unpack dispatches inside that region, then awaits the result and exports it as a !hal.buffer_view. |
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} { |

%c1024 = arith.constant 1024 : index |

%c64 = arith.constant 64 : index |

%c4 = arith.constant 4 : index |

%c0 = arith.constant 0 : index |

// input0: query both dims, assert shape/type/encoding, then import as an external resource of %3 = %0 * 4 * %1 bytes. |
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index |

%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index |

%element_type_f32 = hal.element_type<f32> : i32 |

%dense_row_major = hal.encoding_type<dense_row_major> : i32 |

hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1]) type(%element_type_f32) encoding(%dense_row_major) |

%2 = arith.muli %0, %c4 : index |

%3 = arith.muli %2, %1 : index |

%4 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<?x?xf32>{%0, %1} in !stream.resource<external>{%3} |

// input1: same sequence; resource size is %8 = %5 * 4 * %6. |
%5 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index |

%6 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index |

hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%5, %6]) type(%element_type_f32) encoding(%dense_row_major) |

%7 = arith.muli %5, %c4 : index |

%8 = arith.muli %7, %6 : index |

%9 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<?x?xf32>{%5, %6} in !stream.resource<external>{%8} |

// input2: same sequence; resource size is %13 = %10 * 4 * %11. %13 is also the size of the final result resource. |
%10 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index |

%11 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index |

hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("input2") shape([%10, %11]) type(%element_type_f32) encoding(%dense_row_major) |

%12 = arith.muli %10, %c4 : index |

%13 = arith.muli %12, %11 : index |

%14 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<?x?xf32>{%10, %11} in !stream.resource<external>{%13} |

// %17 = ceildiv(%0, 16) * 64 * %1: size of the transient produced by dispatch_0. NOTE(review): 64 presumably is a 16x1 tile of 4-byte f32 - confirm against the dispatch_0 output type. |
%15 = affine.apply #map()[%0] |

%16 = arith.muli %15, %c64 : index |

%17 = arith.muli %16, %1 : index |

// %20 = ceildiv(%6, 16) * 64 * %5: size of the transient produced by dispatch_1. |
%18 = affine.apply #map()[%6] |

%19 = arith.muli %18, %c64 : index |

%20 = arith.muli %19, %5 : index |

// %24 = ceildiv(%10, 16) * 1024 * ceildiv(%11, 16): size of the transient produced by dispatch_2. NOTE(review): 1024 presumably is a 16x16 tile of 4-byte f32 - confirm against the dispatch_2 output type. |
%21 = affine.apply #map()[%10] |

%22 = affine.apply #map()[%11] |

%23 = arith.muli %21, %c1024 : index |

%24 = arith.muli %23, %22 : index |

// Three immediate timepoints (one per captured resource, by count) joined into %28, which the execute region awaits. |
%25 = stream.timepoint.immediate => !stream.timepoint |

%26 = stream.timepoint.immediate => !stream.timepoint |

%27 = stream.timepoint.immediate => !stream.timepoint |

%28 = stream.timepoint.join max(%25, %26, %27) => !stream.timepoint |

// Single execution region capturing the three imported resources; yields one external resource of %13 bytes plus a timepoint. |
%results, %result_timepoint = stream.async.execute await(%28) => with(%4 as %arg3: !stream.resource<external>{%3}, %9 as %arg4: !stream.resource<external>{%8}, %14 as %arg5: !stream.resource<external>{%13}) -> !stream.resource<external>{%13} { |

// The three pack dispatches each read a different captured resource and are grouped in one stream.async.concurrent region; their results come back as %31#0, %31#1, %31#2. |
%31:3 = stream.async.concurrent with(%arg3 as %arg6: !stream.resource<external>{%3}, %arg4 as %arg7: !stream.resource<external>{%8}, %arg5 as %arg8: !stream.resource<external>{%13}) -> (!stream.resource<transient>{%17}, !stream.resource<transient>{%20}, !stream.resource<transient>{%24}) { |

%34 = stream.async.dispatch @matmul_dynamic_dispatch_0::@matmul_dynamic_dispatch_0_pack_f32[%0, %15, %1](%arg6[%c0 to %3 for %3], %0, %15, %1) : (!stream.resource<external>{%3}, index, index, index) -> !stream.resource<transient>{%17} |

%35 = stream.async.dispatch @matmul_dynamic_dispatch_1::@matmul_dynamic_dispatch_1_pack_f32[%6, %18, %5](%arg7[%c0 to %8 for %8], %6, %18, %5) : (!stream.resource<external>{%8}, index, index, index) -> !stream.resource<transient>{%20} |

%36 = stream.async.dispatch @matmul_dynamic_dispatch_2::@matmul_dynamic_dispatch_2_pack_f32[%10, %11, %21, %22](%arg8[%c0 to %13 for %13], %10, %11, %21, %22) : (!stream.resource<external>{%13}, index, index, index, index) -> !stream.resource<transient>{%24} |

stream.yield %34, %35, %36 : !stream.resource<transient>{%17}, !stream.resource<transient>{%20}, !stream.resource<transient>{%24} |

} |

// mmt4d over the three packed transients; the result is spelled `%31#2{%24}`, i.e. it is tied to the third operand rather than a fresh resource. |
%32 = stream.async.dispatch @matmul_dynamic_dispatch_3::@matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32[%15, %1, %18, %5, %21, %22](%31#0[%c0 to %17 for %17], %31#1[%c0 to %20 for %20], %31#2[%c0 to %24 for %24], %15, %1, %18, %5, %21, %22) : (!stream.resource<transient>{%17}, !stream.resource<transient>{%20}, !stream.resource<transient>{%24}, index, index, index, index, index, index) -> %31#2{%24} |

// Unpack the mmt4d result into an external resource of %13 bytes (same size as the imported input2). |
%33 = stream.async.dispatch @matmul_dynamic_dispatch_4::@matmul_dynamic_dispatch_4_unpack_f32[%21, %22, %10, %11](%32[%c0 to %24 for %24], %21, %22, %10, %11) : (!stream.resource<transient>{%24}, index, index, index, index) -> !stream.resource<external>{%13} |

stream.yield %33 : !stream.resource<external>{%13} |

} => !stream.timepoint |

// Wait on the region's timepoint before exporting the result with the dims of input2 (%10 x %11). |
%29 = stream.timepoint.await %result_timepoint => %results : !stream.resource<external>{%13} |

%30 = stream.tensor.export %29 : tensor<?x?xf32>{%10, %11} in !stream.resource<external>{%13} -> !hal.buffer_view |

return %30 : !hal.buffer_view |

} |
|
} |
|
|
|
|
|
// -----// IR Dump After Canonicalizer (canonicalize) //----- // |
|
// Snapshot of @matmul_dynamic as printed after the canonicalize pass.
// Takes three !hal.buffer_view arguments, imports each as an external stream resource, runs five
// dispatches inside one stream.async.execute region, and returns the result as a !hal.buffer_view.
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} { |

// Index constants: %c4, %c64 and %c1024 are multipliers in the resource-size computations below; %c0 is the start offset of every dispatch operand range.
%c1024 = arith.constant 1024 : index |

%c64 = arith.constant 64 : index |

%c4 = arith.constant 4 : index |

%c0 = arith.constant 0 : index |

// input0: query both dims, assert shape / f32 element type / dense row-major encoding, then import.
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index |

%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index |

%element_type_f32 = hal.element_type<f32> : i32 |

%dense_row_major = hal.encoding_type<dense_row_major> : i32 |

hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1]) type(%element_type_f32) encoding(%dense_row_major) |

// %3 = %0 * 4 * %1 is the size attached to the imported external resource.
%2 = arith.muli %0, %c4 : index |

%3 = arith.muli %2, %1 : index |

%4 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<?x?xf32>{%0, %1} in !stream.resource<external>{%3} |

// input1: same sequence as input0; %8 = %5 * 4 * %6 is its resource size.
%5 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index |

%6 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index |

hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%5, %6]) type(%element_type_f32) encoding(%dense_row_major) |

%7 = arith.muli %5, %c4 : index |

%8 = arith.muli %7, %6 : index |

%9 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<?x?xf32>{%5, %6} in !stream.resource<external>{%8} |

// input2: same sequence; %13 = %10 * 4 * %11 is also the size of the final result resource.
%10 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index |

%11 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index |

hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("input2") shape([%10, %11]) type(%element_type_f32) encoding(%dense_row_major) |

%12 = arith.muli %10, %c4 : index |

%13 = arith.muli %12, %11 : index |

%14 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<?x?xf32>{%10, %11} in !stream.resource<external>{%13} |

// Size of the first transient: ceildiv(%0, 16) * 64 * %1.
%15 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%0] |

%16 = arith.muli %15, %c64 : index |

%17 = arith.muli %16, %1 : index |

// Size of the second transient: ceildiv(%6, 16) * 64 * %5.
%18 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%6] |

%19 = arith.muli %18, %c64 : index |

%20 = arith.muli %19, %5 : index |

// Size of the third transient: ceildiv(%10, 16) * 1024 * ceildiv(%11, 16).
%21 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%10] |

%22 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%11] |

%23 = arith.muli %21, %c1024 : index |

%24 = arith.muli %23, %22 : index |

// Execute region: captures the three imported resources and yields one external resource of size %13 plus a timepoint.
%results, %result_timepoint = stream.async.execute with(%4 as %arg3: !stream.resource<external>{%3}, %9 as %arg4: !stream.resource<external>{%8}, %14 as %arg5: !stream.resource<external>{%13}) -> !stream.resource<external>{%13} { |

// Concurrent region: three *_pack_f32 dispatches, each reading one captured input over its full range and producing one transient.
%27:3 = stream.async.concurrent with(%arg3 as %arg6: !stream.resource<external>{%3}, %arg4 as %arg7: !stream.resource<external>{%8}, %arg5 as %arg8: !stream.resource<external>{%13}) -> (!stream.resource<transient>{%17}, !stream.resource<transient>{%20}, !stream.resource<transient>{%24}) { |

%30 = stream.async.dispatch @matmul_dynamic_dispatch_0::@matmul_dynamic_dispatch_0_pack_f32[%0, %15, %1](%arg6[%c0 to %3 for %3], %0, %15, %1) : (!stream.resource<external>{%3}, index, index, index) -> !stream.resource<transient>{%17} |

%31 = stream.async.dispatch @matmul_dynamic_dispatch_1::@matmul_dynamic_dispatch_1_pack_f32[%6, %18, %5](%arg7[%c0 to %8 for %8], %6, %18, %5) : (!stream.resource<external>{%8}, index, index, index) -> !stream.resource<transient>{%20} |

%32 = stream.async.dispatch @matmul_dynamic_dispatch_2::@matmul_dynamic_dispatch_2_pack_f32[%10, %11, %21, %22](%arg8[%c0 to %13 for %13], %10, %11, %21, %22) : (!stream.resource<external>{%13}, index, index, index, index) -> !stream.resource<transient>{%24} |

stream.yield %30, %31, %32 : !stream.resource<transient>{%17}, !stream.resource<transient>{%20}, !stream.resource<transient>{%24} |

} |

// mmt4d dispatch over the three transients; its result is written as %27#2{%24}, i.e. it names the third operand instead of a fresh resource type.
%28 = stream.async.dispatch @matmul_dynamic_dispatch_3::@matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32[%15, %1, %18, %5, %21, %22](%27#0[%c0 to %17 for %17], %27#1[%c0 to %20 for %20], %27#2[%c0 to %24 for %24], %15, %1, %18, %5, %21, %22) : (!stream.resource<transient>{%17}, !stream.resource<transient>{%20}, !stream.resource<transient>{%24}, index, index, index, index, index, index) -> %27#2{%24} |

// unpack dispatch: reads %28 and produces the external resource of size %13 that the region yields.
%29 = stream.async.dispatch @matmul_dynamic_dispatch_4::@matmul_dynamic_dispatch_4_unpack_f32[%21, %22, %10, %11](%28[%c0 to %24 for %24], %21, %22, %10, %11) : (!stream.resource<transient>{%24}, index, index, index, index) -> !stream.resource<external>{%13} |

stream.yield %29 : !stream.resource<external>{%13} |

} => !stream.timepoint |

// Await the region's timepoint, then export the result with input2's dims (%10, %11).
%25 = stream.timepoint.await %result_timepoint => %results : !stream.resource<external>{%13} |

%26 = stream.tensor.export %25 : tensor<?x?xf32>{%10, %11} in !stream.resource<external>{%13} -> !hal.buffer_view |

return %26 : !hal.buffer_view |

} |
|
|
|
// -----// IR Dump After CSE (cse) //----- // |
|
// Snapshot of @matmul_dynamic as printed after the cse pass.
// Imports three !hal.buffer_view inputs, packs them, runs an mmt4d dispatch, unpacks, and exports the result.
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} { |

// Size multipliers (%c4, %c64, %c1024) and the zero start offset (%c0) used by the dispatch operand ranges.
%c1024 = arith.constant 1024 : index |

%c64 = arith.constant 64 : index |

%c4 = arith.constant 4 : index |

%c0 = arith.constant 0 : index |

// --- input0: dims %0 x %1, validated and imported with size %3 = %0 * 4 * %1 ---
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index |

%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index |

%element_type_f32 = hal.element_type<f32> : i32 |

%dense_row_major = hal.encoding_type<dense_row_major> : i32 |

hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1]) type(%element_type_f32) encoding(%dense_row_major) |

%2 = arith.muli %0, %c4 : index |

%3 = arith.muli %2, %1 : index |

%4 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<?x?xf32>{%0, %1} in !stream.resource<external>{%3} |

// --- input1: dims %5 x %6, validated and imported with size %8 = %5 * 4 * %6 ---
%5 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index |

%6 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index |

hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%5, %6]) type(%element_type_f32) encoding(%dense_row_major) |

%7 = arith.muli %5, %c4 : index |

%8 = arith.muli %7, %6 : index |

%9 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<?x?xf32>{%5, %6} in !stream.resource<external>{%8} |

// --- input2: dims %10 x %11, validated and imported with size %13 = %10 * 4 * %11 ---
%10 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index |

%11 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index |

hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("input2") shape([%10, %11]) type(%element_type_f32) encoding(%dense_row_major) |

%12 = arith.muli %10, %c4 : index |

%13 = arith.muli %12, %11 : index |

%14 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<?x?xf32>{%10, %11} in !stream.resource<external>{%13} |

// --- transient sizes: %17, %20 and %24 are the sizes of the three results of the concurrent region below ---
%15 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%0] |

%16 = arith.muli %15, %c64 : index |

%17 = arith.muli %16, %1 : index |

%18 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%6] |

%19 = arith.muli %18, %c64 : index |

%20 = arith.muli %19, %5 : index |

%21 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%10] |

%22 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%11] |

%23 = arith.muli %21, %c1024 : index |

%24 = arith.muli %23, %22 : index |

// --- execution: one async region producing an external resource of size %13 and a timepoint ---
%results, %result_timepoint = stream.async.execute with(%4 as %arg3: !stream.resource<external>{%3}, %9 as %arg4: !stream.resource<external>{%8}, %14 as %arg5: !stream.resource<external>{%13}) -> !stream.resource<external>{%13} { |

// The three pack dispatches have no data dependency on each other and are grouped in a stream.async.concurrent region.
%27:3 = stream.async.concurrent with(%arg3 as %arg6: !stream.resource<external>{%3}, %arg4 as %arg7: !stream.resource<external>{%8}, %arg5 as %arg8: !stream.resource<external>{%13}) -> (!stream.resource<transient>{%17}, !stream.resource<transient>{%20}, !stream.resource<transient>{%24}) { |

%30 = stream.async.dispatch @matmul_dynamic_dispatch_0::@matmul_dynamic_dispatch_0_pack_f32[%0, %15, %1](%arg6[%c0 to %3 for %3], %0, %15, %1) : (!stream.resource<external>{%3}, index, index, index) -> !stream.resource<transient>{%17} |

%31 = stream.async.dispatch @matmul_dynamic_dispatch_1::@matmul_dynamic_dispatch_1_pack_f32[%6, %18, %5](%arg7[%c0 to %8 for %8], %6, %18, %5) : (!stream.resource<external>{%8}, index, index, index) -> !stream.resource<transient>{%20} |

%32 = stream.async.dispatch @matmul_dynamic_dispatch_2::@matmul_dynamic_dispatch_2_pack_f32[%10, %11, %21, %22](%arg8[%c0 to %13 for %13], %10, %11, %21, %22) : (!stream.resource<external>{%13}, index, index, index, index) -> !stream.resource<transient>{%24} |

stream.yield %30, %31, %32 : !stream.resource<transient>{%17}, !stream.resource<transient>{%20}, !stream.resource<transient>{%24} |

} |

// mmt4d consumes all three packed transients; the result type is spelled %27#2{%24}, referring to the third operand.
%28 = stream.async.dispatch @matmul_dynamic_dispatch_3::@matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32[%15, %1, %18, %5, %21, %22](%27#0[%c0 to %17 for %17], %27#1[%c0 to %20 for %20], %27#2[%c0 to %24 for %24], %15, %1, %18, %5, %21, %22) : (!stream.resource<transient>{%17}, !stream.resource<transient>{%20}, !stream.resource<transient>{%24}, index, index, index, index, index, index) -> %27#2{%24} |

// unpack turns the mmt4d result into the external resource of size %13 yielded by the region.
%29 = stream.async.dispatch @matmul_dynamic_dispatch_4::@matmul_dynamic_dispatch_4_unpack_f32[%21, %22, %10, %11](%28[%c0 to %24 for %24], %21, %22, %10, %11) : (!stream.resource<transient>{%24}, index, index, index, index) -> !stream.resource<external>{%13} |

stream.yield %29 : !stream.resource<external>{%13} |

} => !stream.timepoint |

// --- result: await the timepoint and export as tensor<?x?xf32>{%10, %11} ---
%25 = stream.timepoint.await %result_timepoint => %results : !stream.resource<external>{%13} |

%26 = stream.tensor.export %25 : tensor<?x?xf32>{%10, %11} in !stream.resource<external>{%13} -> !hal.buffer_view |

return %26 : !hal.buffer_view |

} |
|
|
|
// -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- // |
|
// Snapshot of @matmul_dynamic as printed after the iree-util-simplify-global-accesses pass.
// The function contains no global accesses; it imports three buffer views, dispatches pack/mmt4d/unpack, and exports one buffer view.
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} { |

// Constants feeding the size arithmetic (%c4, %c64, %c1024) and the dispatch operand offsets (%c0).
%c1024 = arith.constant 1024 : index |

%c64 = arith.constant 64 : index |

%c4 = arith.constant 4 : index |

%c0 = arith.constant 0 : index |

// [input0] read dims, check shape/type/encoding, import as external resource sized %3 = %0 * 4 * %1.
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index |

%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index |

%element_type_f32 = hal.element_type<f32> : i32 |

%dense_row_major = hal.encoding_type<dense_row_major> : i32 |

hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1]) type(%element_type_f32) encoding(%dense_row_major) |

%2 = arith.muli %0, %c4 : index |

%3 = arith.muli %2, %1 : index |

%4 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<?x?xf32>{%0, %1} in !stream.resource<external>{%3} |

// [input1] read dims, check shape/type/encoding, import as external resource sized %8 = %5 * 4 * %6.
%5 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index |

%6 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index |

hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%5, %6]) type(%element_type_f32) encoding(%dense_row_major) |

%7 = arith.muli %5, %c4 : index |

%8 = arith.muli %7, %6 : index |

%9 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<?x?xf32>{%5, %6} in !stream.resource<external>{%8} |

// [input2] read dims, check shape/type/encoding, import as external resource sized %13 = %10 * 4 * %11.
%10 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index |

%11 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index |

hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("input2") shape([%10, %11]) type(%element_type_f32) encoding(%dense_row_major) |

%12 = arith.muli %10, %c4 : index |

%13 = arith.muli %12, %11 : index |

%14 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<?x?xf32>{%10, %11} in !stream.resource<external>{%13} |

// [sizes] %15, %18, %21, %22 are ceildiv-by-16 of %0, %6, %10, %11; %17, %20, %24 are the transient resource sizes derived from them.
%15 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%0] |

%16 = arith.muli %15, %c64 : index |

%17 = arith.muli %16, %1 : index |

%18 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%6] |

%19 = arith.muli %18, %c64 : index |

%20 = arith.muli %19, %5 : index |

%21 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%10] |

%22 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%11] |

%23 = arith.muli %21, %c1024 : index |

%24 = arith.muli %23, %22 : index |

// [execute] async region over the three imported resources; returns the result resource and a timepoint.
%results, %result_timepoint = stream.async.execute with(%4 as %arg3: !stream.resource<external>{%3}, %9 as %arg4: !stream.resource<external>{%8}, %14 as %arg5: !stream.resource<external>{%13}) -> !stream.resource<external>{%13} { |

// [pack x3] one dispatch per input, each yielding one transient of size %17, %20 and %24 respectively.
%27:3 = stream.async.concurrent with(%arg3 as %arg6: !stream.resource<external>{%3}, %arg4 as %arg7: !stream.resource<external>{%8}, %arg5 as %arg8: !stream.resource<external>{%13}) -> (!stream.resource<transient>{%17}, !stream.resource<transient>{%20}, !stream.resource<transient>{%24}) { |

%30 = stream.async.dispatch @matmul_dynamic_dispatch_0::@matmul_dynamic_dispatch_0_pack_f32[%0, %15, %1](%arg6[%c0 to %3 for %3], %0, %15, %1) : (!stream.resource<external>{%3}, index, index, index) -> !stream.resource<transient>{%17} |

%31 = stream.async.dispatch @matmul_dynamic_dispatch_1::@matmul_dynamic_dispatch_1_pack_f32[%6, %18, %5](%arg7[%c0 to %8 for %8], %6, %18, %5) : (!stream.resource<external>{%8}, index, index, index) -> !stream.resource<transient>{%20} |

%32 = stream.async.dispatch @matmul_dynamic_dispatch_2::@matmul_dynamic_dispatch_2_pack_f32[%10, %11, %21, %22](%arg8[%c0 to %13 for %13], %10, %11, %21, %22) : (!stream.resource<external>{%13}, index, index, index, index) -> !stream.resource<transient>{%24} |

stream.yield %30, %31, %32 : !stream.resource<transient>{%17}, !stream.resource<transient>{%20}, !stream.resource<transient>{%24} |

} |

// [mmt4d] takes %27#0, %27#1, %27#2 over their full ranges; the result is declared as %27#2{%24} rather than a new resource type.
%28 = stream.async.dispatch @matmul_dynamic_dispatch_3::@matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32[%15, %1, %18, %5, %21, %22](%27#0[%c0 to %17 for %17], %27#1[%c0 to %20 for %20], %27#2[%c0 to %24 for %24], %15, %1, %18, %5, %21, %22) : (!stream.resource<transient>{%17}, !stream.resource<transient>{%20}, !stream.resource<transient>{%24}, index, index, index, index, index, index) -> %27#2{%24} |

// [unpack] reads %28 and writes a new external resource of size %13.
%29 = stream.async.dispatch @matmul_dynamic_dispatch_4::@matmul_dynamic_dispatch_4_unpack_f32[%21, %22, %10, %11](%28[%c0 to %24 for %24], %21, %22, %10, %11) : (!stream.resource<transient>{%24}, index, index, index, index) -> !stream.resource<external>{%13} |

stream.yield %29 : !stream.resource<external>{%13} |

} => !stream.timepoint |

// [result] wait for the region to finish, then export with dims (%10, %11).
%25 = stream.timepoint.await %result_timepoint => %results : !stream.resource<external>{%13} |

%26 = stream.tensor.export %25 : tensor<?x?xf32>{%10, %11} in !stream.resource<external>{%13} -> !hal.buffer_view |

return %26 : !hal.buffer_view |

} |
|
|
|
// -----// IR Dump After ApplyPatterns (iree-util-apply-patterns) //----- // |
|
// Attribute aliases printed ahead of the module in the iree-util-apply-patterns dump.
// Executable target: "llvm-cpu" backend, "embedded-elf-x86_64" format, cpu "znver4", native_vector_size 64, ukernels "all".
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "znver4", cpu_features = "+mmx,+popcnt,+sse,+sse2,+sse3,+ssse3,+sse4.1,+sse4.2,+avx,+avx2,+sse4a,+fma,+avx512f,+bmi,+bmi2,+aes,+pclmul,+avx512vl,+avx512bw,+avx512dq,+avx512cd,+avx512vbmi,+avx512ifma,+avx512vpopcntdq,+avx512vbmi2,+gfni,+vpclmulqdq,+avx512vnni,+avx512bitalg,+avx512bf16,+adx,+clflushopt,+clwb,+clzero,+cx16,+cx8,+f16c,+fsgsbase,+crc32,+invpcid,+rdpru,+sahf,+lzcnt,+movbe,+mwaitx,+x87,+pku,+prfchw,+rdpid,+rdrnd,+rdseed,+sha,+shstk,+vaes,+wbnoinvd,+xsave,+xsavec,+xsaveopt,+xsaves,+fxsr,+evex512", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "all"}> |

// Ceiling division of a single symbol by 16; referenced as #map by the affine.apply ops in this module.
#map = affine_map<()[s0] -> (s0 ceildiv 16)> |

// Device target whose executable_targets list holds the alias defined above; referenced by the module's hal.device.targets attribute.
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#executable_target_embedded_elf_x86_64_]}> |
|
module attributes {hal.device.targets = [#device_target_llvm_cpu]} { |
|
// Executable holding the first pack dispatch: packs a tensor<?x?xf32> into tensor<?x?x16x1xf32>.
stream.executable private @matmul_dynamic_dispatch_0 { |

// Export: the workgroup count is derived from the three workload values via flow.dispatch.workgroup_count_from_slice.
stream.executable.export public @matmul_dynamic_dispatch_0_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) { |

%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2 |

stream.return %x, %y, %z : index, index, index |

} |

builtin.module { |

// %arg0 is the read-only input binding, %arg4 the write-only output binding; %arg1..%arg3 are workload ordinals 0..2.
func.func @matmul_dynamic_dispatch_0_pack_f32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: !stream.binding) { |

%c0 = arith.constant 0 : index |

// Padding value handed to tensor.pack.
%cst = arith.constant 0.000000e+00 : f32 |

%0 = flow.dispatch.workload.ordinal %arg1, 0 : index |

%1 = flow.dispatch.workload.ordinal %arg2, 1 : index |

%2 = flow.dispatch.workload.ordinal %arg3, 2 : index |

// Input has dims {%0, %2}; output has outer dims {%1, %2} with inner tile 16x1.
%3 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %2} |

%4 = stream.binding.subspan %arg4[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2} |

%5 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [%0, %2], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %2} -> tensor<?x?xf32> |

// Destination of the pack is sized with ceildiv(%0, 16) outer rows, computed locally rather than taken from ordinal 1.
%6 = affine.apply #map()[%0] |

%7 = tensor.empty(%6, %2) : tensor<?x?x16x1xf32> |

// Tile dim 0 by 16 and dim 1 by 1, keeping the outer dim order.
%pack = tensor.pack %5 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 1] into %7 : tensor<?x?xf32> -> tensor<?x?x16x1xf32> |

flow.dispatch.tensor.store %pack, %4, offsets = [0, 0, 0, 0], sizes = [%1, %2, 16, 1], strides = [1, 1, 1, 1] : tensor<?x?x16x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2} |

return |

} |

} |

} |
|
// Executable holding the second pack dispatch: packs a tensor<?x?xf32> into tensor<?x?x16x1xf32> with the two dims swapped.
stream.executable private @matmul_dynamic_dispatch_1 { |

// Export: the workgroup count is derived from the three workload values.
stream.executable.export public @matmul_dynamic_dispatch_1_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) { |

%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2 |

stream.return %x, %y, %z : index, index, index |

} |

builtin.module { |

// %arg0 is the read-only input binding, %arg4 the write-only output binding; %arg1..%arg3 are workload ordinals 0..2.
func.func @matmul_dynamic_dispatch_1_pack_f32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: !stream.binding) { |

%c0 = arith.constant 0 : index |

// Padding value handed to tensor.pack.
%cst = arith.constant 0.000000e+00 : f32 |

%0 = flow.dispatch.workload.ordinal %arg1, 0 : index |

%1 = flow.dispatch.workload.ordinal %arg2, 1 : index |

%2 = flow.dispatch.workload.ordinal %arg3, 2 : index |

// Input has dims {%2, %0} (ordinal 2 first); output has outer dims {%1, %2} with inner tile 16x1.
%3 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %0} |

%4 = stream.binding.subspan %arg4[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2} |

%5 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [%2, %0], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %0} -> tensor<?x?xf32> |

// Destination of the pack is sized with ceildiv(%0, 16) as its first outer dim.
%6 = affine.apply #map()[%0] |

%7 = tensor.empty(%6, %2) : tensor<?x?x16x1xf32> |

// outer_dims_perm and inner_dims_pos are both [1, 0]: source dim 1 is tiled by 16 and becomes the leading outer dim.
%pack = tensor.pack %5 padding_value(%cst : f32) outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [16, 1] into %7 : tensor<?x?xf32> -> tensor<?x?x16x1xf32> |

flow.dispatch.tensor.store %pack, %4, offsets = [0, 0, 0, 0], sizes = [%1, %2, 16, 1], strides = [1, 1, 1, 1] : tensor<?x?x16x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2} |

return |

} |

} |

} |
|
// Executable holding the third pack dispatch: packs a tensor<?x?xf32> into tensor<?x?x16x16xf32>.
stream.executable private @matmul_dynamic_dispatch_2 { |

// Export: the workgroup count is derived from the four workload values.
stream.executable.export public @matmul_dynamic_dispatch_2_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) { |

%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3 |

stream.return %x, %y, %z : index, index, index |

} |

builtin.module { |

// %arg0 is the read-only input binding, %arg5 the write-only output binding; %arg1..%arg4 are workload ordinals 0..3.
func.func @matmul_dynamic_dispatch_2_pack_f32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: !stream.binding) { |

%c0 = arith.constant 0 : index |

// Padding value handed to tensor.pack.
%cst = arith.constant 0.000000e+00 : f32 |

%0 = flow.dispatch.workload.ordinal %arg1, 0 : index |

%1 = flow.dispatch.workload.ordinal %arg2, 1 : index |

%2 = flow.dispatch.workload.ordinal %arg3, 2 : index |

%3 = flow.dispatch.workload.ordinal %arg4, 3 : index |

// Input has dims {%0, %1}; output has outer dims {%2, %3} with inner tile 16x16.
%4 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1} |

%5 = stream.binding.subspan %arg5[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%2, %3} |

%6 = flow.dispatch.tensor.load %4, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1} -> tensor<?x?xf32> |

// Destination outer dims are ceildiv(%0, 16) x ceildiv(%1, 16), computed locally.
%7 = affine.apply #map()[%1] |

%8 = affine.apply #map()[%0] |

%9 = tensor.empty(%8, %7) : tensor<?x?x16x16xf32> |

// Tile both dims by 16, keeping the outer dim order.
%pack = tensor.pack %6 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %9 : tensor<?x?xf32> -> tensor<?x?x16x16xf32> |

flow.dispatch.tensor.store %pack, %5, offsets = [0, 0, 0, 0], sizes = [%2, %3, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%2, %3} |

return |

} |

} |

} |
|
// Executable holding the linalg.mmt4d dispatch over two packed ?x?x16x1 operands and a ?x?x16x16 accumulator.
stream.executable private @matmul_dynamic_dispatch_3 { |

// Export: the workgroup count is derived from the six workload values.
stream.executable.export public @matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index) -> (index, index, index) { |

%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3, %arg4, %arg5 |

stream.return %x, %y, %z : index, index, index |

} |

builtin.module { |

// %arg0 and %arg1 are read-only bindings, %arg2 is a read-write binding; %arg3..%arg8 are workload ordinals 0..5.
func.func @matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index) { |

%c0 = arith.constant 0 : index |

%0 = flow.dispatch.workload.ordinal %arg3, 0 : index |

%1 = flow.dispatch.workload.ordinal %arg4, 1 : index |

%2 = flow.dispatch.workload.ordinal %arg5, 2 : index |

%3 = flow.dispatch.workload.ordinal %arg6, 3 : index |

%4 = flow.dispatch.workload.ordinal %arg7, 4 : index |

%5 = flow.dispatch.workload.ordinal %arg8, 5 : index |

// Ordinal pairs give the outer dims of each operand: {%0, %1}, {%2, %3} and {%4, %5}.
%6 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%0, %1} |

%7 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%2, %3} |

%8 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5} |

%9 = flow.dispatch.tensor.load %6, offsets = [0, 0, 0, 0], sizes = [%0, %1, 16, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%0, %1} -> tensor<?x?x16x1xf32> |

%10 = flow.dispatch.tensor.load %7, offsets = [0, 0, 0, 0], sizes = [%2, %3, 16, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%2, %3} -> tensor<?x?x16x1xf32> |

// The accumulator is loaded from the read-write binding, used as the mmt4d outs operand, and stored back to the same binding.
%11 = flow.dispatch.tensor.load %8, offsets = [0, 0, 0, 0], sizes = [%4, %5, 16, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5} -> tensor<?x?x16x16xf32> |

%12 = linalg.mmt4d ins(%9, %10 : tensor<?x?x16x1xf32>, tensor<?x?x16x1xf32>) outs(%11 : tensor<?x?x16x16xf32>) -> tensor<?x?x16x16xf32> |

flow.dispatch.tensor.store %12, %8, offsets = [0, 0, 0, 0], sizes = [%4, %5, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5} |

return |

} |

} |

} |
|
// Executable holding the unpack dispatch: converts tensor<?x?x16x16xf32> back into tensor<?x?xf32>.
stream.executable private @matmul_dynamic_dispatch_4 { |

// Export: the workgroup count is derived from the four workload values.
stream.executable.export public @matmul_dynamic_dispatch_4_unpack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) { |

%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3 |

stream.return %x, %y, %z : index, index, index |

} |

builtin.module { |

// %arg0 is the read-only packed input binding, %arg5 the write-only output binding; %arg1..%arg4 are workload ordinals 0..3.
func.func @matmul_dynamic_dispatch_4_unpack_f32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: !stream.binding) { |

%c0 = arith.constant 0 : index |

%0 = flow.dispatch.workload.ordinal %arg1, 0 : index |

%1 = flow.dispatch.workload.ordinal %arg2, 1 : index |

%2 = flow.dispatch.workload.ordinal %arg3, 2 : index |

%3 = flow.dispatch.workload.ordinal %arg4, 3 : index |

// Packed input has outer dims {%0, %1}; unpacked output has dims {%2, %3}.
%4 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?x16x16xf32>>{%0, %1} |

%5 = stream.binding.subspan %arg5[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%2, %3} |

%6 = flow.dispatch.tensor.load %4, offsets = [0, 0, 0, 0], sizes = [%0, %1, 16, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x16xf32>>{%0, %1} -> tensor<?x?x16x16xf32> |

// Destination for the unpack, sized directly from ordinals 2 and 3.
%7 = tensor.empty(%2, %3) : tensor<?x?xf32> |

%unpack = tensor.unpack %6 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %7 : tensor<?x?x16x16xf32> -> tensor<?x?xf32> |

flow.dispatch.tensor.store %unpack, %5, offsets = [0, 0], sizes = [%2, %3], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%2, %3} |

return |

} |

} |

} |
|
// Host-side entry point as printed after the iree-util-apply-patterns pass.
// Imports three !hal.buffer_view inputs, dispatches the five @matmul_dynamic_dispatch_* executables, and exports one !hal.buffer_view.
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} { |

// Size multipliers (%c4, %c64, %c1024) and the zero offset (%c0) used for every dispatch operand range.
%c1024 = arith.constant 1024 : index |

%c64 = arith.constant 64 : index |

%c4 = arith.constant 4 : index |

%c0 = arith.constant 0 : index |

// input0 (dims %0 x %1): assert shape / f32 / dense row-major, then import with size %3 = %0 * 4 * %1.
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index |

%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index |

%element_type_f32 = hal.element_type<f32> : i32 |

%dense_row_major = hal.encoding_type<dense_row_major> : i32 |

hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1]) type(%element_type_f32) encoding(%dense_row_major) |

%2 = arith.muli %0, %c4 : index |

%3 = arith.muli %2, %1 : index |

%4 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<?x?xf32>{%0, %1} in !stream.resource<external>{%3} |

// input1 (dims %5 x %6): same checks, imported with size %8 = %5 * 4 * %6.
%5 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index |

%6 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index |

hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%5, %6]) type(%element_type_f32) encoding(%dense_row_major) |

%7 = arith.muli %5, %c4 : index |

%8 = arith.muli %7, %6 : index |

%9 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<?x?xf32>{%5, %6} in !stream.resource<external>{%8} |

// input2 (dims %10 x %11): same checks, imported with size %13 = %10 * 4 * %11.
%10 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index |

%11 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index |

hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("input2") shape([%10, %11]) type(%element_type_f32) encoding(%dense_row_major) |

%12 = arith.muli %10, %c4 : index |

%13 = arith.muli %12, %11 : index |

%14 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<?x?xf32>{%10, %11} in !stream.resource<external>{%13} |

// Transient size for dispatch_0's output: #map(%0) * 64 * %1.
%15 = affine.apply #map()[%0] |

%16 = arith.muli %15, %c64 : index |

%17 = arith.muli %16, %1 : index |

// Transient size for dispatch_1's output: #map(%6) * 64 * %5.
%18 = affine.apply #map()[%6] |

%19 = arith.muli %18, %c64 : index |

%20 = arith.muli %19, %5 : index |

// Transient size for dispatch_2's output: #map(%10) * 1024 * #map(%11).
%21 = affine.apply #map()[%10] |

%22 = affine.apply #map()[%11] |

%23 = arith.muli %21, %c1024 : index |

%24 = arith.muli %23, %22 : index |

// Async execute region over the three imported resources; yields an external resource of size %13 and a timepoint.
%results, %result_timepoint = stream.async.execute with(%4 as %arg3: !stream.resource<external>{%3}, %9 as %arg4: !stream.resource<external>{%8}, %14 as %arg5: !stream.resource<external>{%13}) -> !stream.resource<external>{%13} { |

// The three pack dispatches are grouped in one concurrent region; each reads its input over [%c0, size).
%27:3 = stream.async.concurrent with(%arg3 as %arg6: !stream.resource<external>{%3}, %arg4 as %arg7: !stream.resource<external>{%8}, %arg5 as %arg8: !stream.resource<external>{%13}) -> (!stream.resource<transient>{%17}, !stream.resource<transient>{%20}, !stream.resource<transient>{%24}) { |

%30 = stream.async.dispatch @matmul_dynamic_dispatch_0::@matmul_dynamic_dispatch_0_pack_f32[%0, %15, %1](%arg6[%c0 to %3 for %3], %0, %15, %1) : (!stream.resource<external>{%3}, index, index, index) -> !stream.resource<transient>{%17} |

%31 = stream.async.dispatch @matmul_dynamic_dispatch_1::@matmul_dynamic_dispatch_1_pack_f32[%6, %18, %5](%arg7[%c0 to %8 for %8], %6, %18, %5) : (!stream.resource<external>{%8}, index, index, index) -> !stream.resource<transient>{%20} |

%32 = stream.async.dispatch @matmul_dynamic_dispatch_2::@matmul_dynamic_dispatch_2_pack_f32[%10, %11, %21, %22](%arg8[%c0 to %13 for %13], %10, %11, %21, %22) : (!stream.resource<external>{%13}, index, index, index, index) -> !stream.resource<transient>{%24} |

stream.yield %30, %31, %32 : !stream.resource<transient>{%17}, !stream.resource<transient>{%20}, !stream.resource<transient>{%24} |

} |

// mmt4d over the packed operands; the result is written as %27#2{%24}, naming the third operand, which dispatch_3 binds as readwrite.
%28 = stream.async.dispatch @matmul_dynamic_dispatch_3::@matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32[%15, %1, %18, %5, %21, %22](%27#0[%c0 to %17 for %17], %27#1[%c0 to %20 for %20], %27#2[%c0 to %24 for %24], %15, %1, %18, %5, %21, %22) : (!stream.resource<transient>{%17}, !stream.resource<transient>{%20}, !stream.resource<transient>{%24}, index, index, index, index, index, index) -> %27#2{%24} |

// unpack reads the mmt4d result and produces the external resource of size %13 yielded below.
%29 = stream.async.dispatch @matmul_dynamic_dispatch_4::@matmul_dynamic_dispatch_4_unpack_f32[%21, %22, %10, %11](%28[%c0 to %24 for %24], %21, %22, %10, %11) : (!stream.resource<transient>{%24}, index, index, index, index) -> !stream.resource<external>{%13} |

stream.yield %29 : !stream.resource<external>{%13} |

} => !stream.timepoint |

// Await the region's timepoint, then export the result with input2's dims (%10, %11).
%25 = stream.timepoint.await %result_timepoint => %results : !stream.resource<external>{%13} |

%26 = stream.tensor.export %25 : tensor<?x?xf32>{%10, %11} in !stream.resource<external>{%13} -> !hal.buffer_view |

return %26 : !hal.buffer_view |

} |
|
} |
|
|
|
|
|
// -----// IR Dump After FoldGlobals (iree-util-fold-globals) //----- // |
|
// Attribute aliases printed ahead of the module in the iree-util-fold-globals dump.
// Executable target: "llvm-cpu" backend, "embedded-elf-x86_64" format, cpu "znver4", native_vector_size 64, ukernels "all".
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "znver4", cpu_features = "+mmx,+popcnt,+sse,+sse2,+sse3,+ssse3,+sse4.1,+sse4.2,+avx,+avx2,+sse4a,+fma,+avx512f,+bmi,+bmi2,+aes,+pclmul,+avx512vl,+avx512bw,+avx512dq,+avx512cd,+avx512vbmi,+avx512ifma,+avx512vpopcntdq,+avx512vbmi2,+gfni,+vpclmulqdq,+avx512vnni,+avx512bitalg,+avx512bf16,+adx,+clflushopt,+clwb,+clzero,+cx16,+cx8,+f16c,+fsgsbase,+crc32,+invpcid,+rdpru,+sahf,+lzcnt,+movbe,+mwaitx,+x87,+pku,+prfchw,+rdpid,+rdrnd,+rdseed,+sha,+shstk,+vaes,+wbnoinvd,+xsave,+xsavec,+xsaveopt,+xsaves,+fxsr,+evex512", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "all"}> |

// Ceiling division of a single symbol by 16; referenced as #map by the affine.apply ops in this module.
#map = affine_map<()[s0] -> (s0 ceildiv 16)> |

// Device target whose executable_targets list holds the alias defined above; referenced by the module's hal.device.targets attribute.
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#executable_target_embedded_elf_x86_64_]}> |
|
module attributes {hal.device.targets = [#device_target_llvm_cpu]} { |
|
// Stream executable with a single export, @matmul_dynamic_dispatch_0_pack_f32. |
stream.executable private @matmul_dynamic_dispatch_0 { |
|
// Export region: forwards its three index operands to |
// flow.dispatch.workgroup_count_from_slice and returns the resulting %x, %y, %z. |
stream.executable.export public @matmul_dynamic_dispatch_0_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) { |
|
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2 |
|
stream.return %x, %y, %z : index, index, index |
|
} |
|
builtin.module { |
|
// Loads a ?x? f32 tensor from binding %arg0, packs it into a ?x?x16x1 tensor |
// (padding value 0.0), and stores the result to binding %arg4. |
// %arg1..%arg3 are index operands tagged below as workload ordinals 0..2. |
func.func @matmul_dynamic_dispatch_0_pack_f32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: !stream.binding) { |
|
%c0 = arith.constant 0 : index |
|
// Padding value handed to tensor.pack below. |
%cst = arith.constant 0.000000e+00 : f32 |
|
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index |
|
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index |
|
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index |
|
// Source tensor has dynamic dims {%0, %2}; destination has dynamic outer dims {%1, %2}. |
%3 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %2} |
|
%4 = stream.binding.subspan %arg4[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2} |
|
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [%0, %2], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %2} -> tensor<?x?xf32> |
|
// #map is declared at module scope of this dump as (s0 ceildiv 16). |
%6 = affine.apply #map()[%0] |
|
%7 = tensor.empty(%6, %2) : tensor<?x?x16x1xf32> |
|
// inner_tiles = [16, 1] applied to inner_dims_pos = [0, 1]; outer dims keep their order. |
%pack = tensor.pack %5 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 1] into %7 : tensor<?x?xf32> -> tensor<?x?x16x1xf32> |
|
// NOTE(review): the store sizes use ordinal %1 for the leading outer dim while the |
// tensor.empty above uses %6; presumably the caller passes the same ceildiv value - confirm. |
flow.dispatch.tensor.store %pack, %4, offsets = [0, 0, 0, 0], sizes = [%1, %2, 16, 1], strides = [1, 1, 1, 1] : tensor<?x?x16x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2} |
|
return |
|
} |
|
} |
|
} |
|
// Stream executable with a single export, @matmul_dynamic_dispatch_1_pack_f32. |
stream.executable private @matmul_dynamic_dispatch_1 { |
|
// Export region: forwards its three index operands to |
// flow.dispatch.workgroup_count_from_slice and returns the resulting %x, %y, %z. |
stream.executable.export public @matmul_dynamic_dispatch_1_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) { |
|
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2 |
|
stream.return %x, %y, %z : index, index, index |
|
} |
|
builtin.module { |
|
// Loads a ?x? f32 tensor from binding %arg0, packs it into a ?x?x16x1 tensor with |
// transposed outer dims (padding value 0.0), and stores the result to binding %arg4. |
// %arg1..%arg3 are index operands tagged below as workload ordinals 0..2. |
func.func @matmul_dynamic_dispatch_1_pack_f32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: !stream.binding) { |
|
%c0 = arith.constant 0 : index |
|
// Padding value handed to tensor.pack below. |
%cst = arith.constant 0.000000e+00 : f32 |
|
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index |
|
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index |
|
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index |
|
// Source tensor has dynamic dims {%2, %0} (note the order); destination outer dims are {%1, %2}. |
%3 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %0} |
|
%4 = stream.binding.subspan %arg4[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2} |
|
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [%2, %0], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %0} -> tensor<?x?xf32> |
|
// #map is declared at module scope of this dump as (s0 ceildiv 16). |
%6 = affine.apply #map()[%0] |
|
%7 = tensor.empty(%6, %2) : tensor<?x?x16x1xf32> |
|
// outer_dims_perm = [1, 0] and inner_dims_pos = [1, 0]: the 16-wide tile is taken along |
// source dim 1 (size %0) and that dim becomes the leading outer dim of the result. |
%pack = tensor.pack %5 padding_value(%cst : f32) outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [16, 1] into %7 : tensor<?x?xf32> -> tensor<?x?x16x1xf32> |
|
// NOTE(review): the store sizes use ordinal %1 for the leading outer dim while the |
// tensor.empty above uses %6; presumably the caller passes the same ceildiv value - confirm. |
flow.dispatch.tensor.store %pack, %4, offsets = [0, 0, 0, 0], sizes = [%1, %2, 16, 1], strides = [1, 1, 1, 1] : tensor<?x?x16x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2} |
|
return |
|
} |
|
} |
|
} |
|
// Stream executable with a single export, @matmul_dynamic_dispatch_2_pack_f32. |
stream.executable private @matmul_dynamic_dispatch_2 { |
|
// Export region: forwards its four index operands to |
// flow.dispatch.workgroup_count_from_slice and returns the resulting %x, %y, %z. |
stream.executable.export public @matmul_dynamic_dispatch_2_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) { |
|
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3 |
|
stream.return %x, %y, %z : index, index, index |
|
} |
|
builtin.module { |
|
// Loads a ?x? f32 tensor from binding %arg0, packs it into a ?x?x16x16 tensor |
// (padding value 0.0), and stores the result to binding %arg5. |
// %arg1..%arg4 are index operands tagged below as workload ordinals 0..3. |
func.func @matmul_dynamic_dispatch_2_pack_f32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: !stream.binding) { |
|
%c0 = arith.constant 0 : index |
|
// Padding value handed to tensor.pack below. |
%cst = arith.constant 0.000000e+00 : f32 |
|
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index |
|
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index |
|
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index |
|
%3 = flow.dispatch.workload.ordinal %arg4, 3 : index |
|
// Source tensor has dynamic dims {%0, %1}; destination has dynamic outer dims {%2, %3}. |
%4 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1} |
|
%5 = stream.binding.subspan %arg5[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%2, %3} |
|
%6 = flow.dispatch.tensor.load %4, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1} -> tensor<?x?xf32> |
|
// #map is declared at module scope of this dump as (s0 ceildiv 16); |
// %7 and %8 are the ceildiv-by-16 values of %1 and %0 respectively. |
%7 = affine.apply #map()[%1] |
|
%8 = affine.apply #map()[%0] |
|
%9 = tensor.empty(%8, %7) : tensor<?x?x16x16xf32> |
|
// inner_tiles = [16, 16] applied to inner_dims_pos = [0, 1]; outer dims keep their order. |
%pack = tensor.pack %6 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %9 : tensor<?x?xf32> -> tensor<?x?x16x16xf32> |
|
// NOTE(review): the store sizes use ordinals %2/%3 for the outer dims while the |
// tensor.empty above uses %8/%7; presumably the caller passes the same ceildiv values - confirm. |
flow.dispatch.tensor.store %pack, %5, offsets = [0, 0, 0, 0], sizes = [%2, %3, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%2, %3} |
|
return |
|
} |
|
} |
|
} |
|
// Stream executable with a single export, @matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32. |
stream.executable private @matmul_dynamic_dispatch_3 { |
|
// Export region: forwards its six index operands to |
// flow.dispatch.workgroup_count_from_slice and returns the resulting %x, %y, %z. |
stream.executable.export public @matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index) -> (index, index, index) { |
|
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3, %arg4, %arg5 |
|
stream.return %x, %y, %z : index, index, index |
|
} |
|
builtin.module { |
|
// Runs linalg.mmt4d on two read-only ?x?x16x1 operands (bindings %arg0, %arg1) with a |
// ?x?x16x16 accumulator that is loaded from and stored back to the read-write binding %arg2. |
// %arg3..%arg8 are index operands tagged below as workload ordinals 0..5. |
func.func @matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index) { |
|
%c0 = arith.constant 0 : index |
|
%0 = flow.dispatch.workload.ordinal %arg3, 0 : index |
|
%1 = flow.dispatch.workload.ordinal %arg4, 1 : index |
|
%2 = flow.dispatch.workload.ordinal %arg5, 2 : index |
|
%3 = flow.dispatch.workload.ordinal %arg6, 3 : index |
|
%4 = flow.dispatch.workload.ordinal %arg7, 4 : index |
|
%5 = flow.dispatch.workload.ordinal %arg8, 5 : index |
|
// Dynamic outer dims per binding: %arg0 -> {%0, %1}, %arg1 -> {%2, %3}, %arg2 -> {%4, %5}. |
%6 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%0, %1} |
|
%7 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%2, %3} |
|
%8 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5} |
|
// Full-extent loads of all three operands (zero offsets, unit strides). |
%9 = flow.dispatch.tensor.load %6, offsets = [0, 0, 0, 0], sizes = [%0, %1, 16, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%0, %1} -> tensor<?x?x16x1xf32> |
|
%10 = flow.dispatch.tensor.load %7, offsets = [0, 0, 0, 0], sizes = [%2, %3, 16, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%2, %3} -> tensor<?x?x16x1xf32> |
|
%11 = flow.dispatch.tensor.load %8, offsets = [0, 0, 0, 0], sizes = [%4, %5, 16, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5} -> tensor<?x?x16x16xf32> |
|
// %11 (loaded from the read-write binding) is the outs operand of the mmt4d. |
%12 = linalg.mmt4d ins(%9, %10 : tensor<?x?x16x1xf32>, tensor<?x?x16x1xf32>) outs(%11 : tensor<?x?x16x16xf32>) -> tensor<?x?x16x16xf32> |
|
// Result is written back to the same binding subspan (%8) it was loaded from. |
flow.dispatch.tensor.store %12, %8, offsets = [0, 0, 0, 0], sizes = [%4, %5, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5} |
|
return |
|
} |
|
} |
|
} |
|
// Stream executable with a single export, @matmul_dynamic_dispatch_4_unpack_f32. |
stream.executable private @matmul_dynamic_dispatch_4 { |
|
// Export region: forwards its four index operands to |
// flow.dispatch.workgroup_count_from_slice and returns the resulting %x, %y, %z. |
stream.executable.export public @matmul_dynamic_dispatch_4_unpack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) { |
|
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3 |
|
stream.return %x, %y, %z : index, index, index |
|
} |
|
builtin.module { |
|
// Loads a ?x?x16x16 f32 tensor from binding %arg0, unpacks it into a ?x? tensor, |
// and stores the result to binding %arg5. |
// %arg1..%arg4 are index operands tagged below as workload ordinals 0..3. |
func.func @matmul_dynamic_dispatch_4_unpack_f32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: !stream.binding) { |
|
%c0 = arith.constant 0 : index |
|
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index |
|
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index |
|
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index |
|
%3 = flow.dispatch.workload.ordinal %arg4, 3 : index |
|
// Packed source has dynamic outer dims {%0, %1}; unpacked destination has dims {%2, %3}. |
%4 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?x16x16xf32>>{%0, %1} |
|
%5 = stream.binding.subspan %arg5[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%2, %3} |
|
%6 = flow.dispatch.tensor.load %4, offsets = [0, 0, 0, 0], sizes = [%0, %1, 16, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x16xf32>>{%0, %1} -> tensor<?x?x16x16xf32> |
|
// Destination of the unpack, sized directly from ordinals %2 and %3. |
%7 = tensor.empty(%2, %3) : tensor<?x?xf32> |
|
// inner_tiles = [16, 16] on inner_dims_pos = [0, 1]; outer dims keep their order. |
%unpack = tensor.unpack %6 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %7 : tensor<?x?x16x16xf32> -> tensor<?x?xf32> |
|
flow.dispatch.tensor.store %unpack, %5, offsets = [0, 0], sizes = [%2, %3], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%2, %3} |
|
return |
|
} |
|
} |
|
} |
|
// Entry point: takes three !hal.buffer_view arguments and returns one. |
// For each argument it queries two dims, asserts shape / element type / encoding, and imports it |
// as a stream external resource. It then issues three pack dispatches, one mmt4d dispatch and one |
// unpack dispatch inside a single stream.async.execute region, awaits the region's timepoint, and |
// exports the result as a buffer view. |
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} { |
|
// Multipliers used in the resource-size computations below. |
// NOTE(review): 4 is presumably the f32 byte width, 64 = 16*1*4 and 1024 = 16*16*4 - confirm. |
%c1024 = arith.constant 1024 : index |
|
%c64 = arith.constant 64 : index |
|
%c4 = arith.constant 4 : index |
|
%c0 = arith.constant 0 : index |
|
// --- %arg0: dims %0 x %1, import size %3 = %0 * 4 * %1 --- |
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index |
|
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index |
|
%element_type_f32 = hal.element_type<f32> : i32 |
|
%dense_row_major = hal.encoding_type<dense_row_major> : i32 |
|
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1]) type(%element_type_f32) encoding(%dense_row_major) |
|
%2 = arith.muli %0, %c4 : index |
|
%3 = arith.muli %2, %1 : index |
|
%4 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<?x?xf32>{%0, %1} in !stream.resource<external>{%3} |
|
// --- %arg1: dims %5 x %6, import size %8 = %5 * 4 * %6 --- |
%5 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index |
|
%6 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index |
|
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%5, %6]) type(%element_type_f32) encoding(%dense_row_major) |
|
%7 = arith.muli %5, %c4 : index |
|
%8 = arith.muli %7, %6 : index |
|
%9 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<?x?xf32>{%5, %6} in !stream.resource<external>{%8} |
|
// --- %arg2: dims %10 x %11, import size %13 = %10 * 4 * %11 --- |
%10 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index |
|
%11 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index |
|
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("input2") shape([%10, %11]) type(%element_type_f32) encoding(%dense_row_major) |
|
%12 = arith.muli %10, %c4 : index |
|
%13 = arith.muli %12, %11 : index |
|
%14 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<?x?xf32>{%10, %11} in !stream.resource<external>{%13} |
|
// Sizes of the three transient (packed) resources; #map is (s0 ceildiv 16): |
//   %17 = ceildiv(%0, 16) * 64 * %1 |
//   %20 = ceildiv(%6, 16) * 64 * %5 |
//   %24 = ceildiv(%10, 16) * 1024 * ceildiv(%11, 16) |
%15 = affine.apply #map()[%0] |
|
%16 = arith.muli %15, %c64 : index |
|
%17 = arith.muli %16, %1 : index |
|
%18 = affine.apply #map()[%6] |
|
%19 = arith.muli %18, %c64 : index |
|
%20 = arith.muli %19, %5 : index |
|
%21 = affine.apply #map()[%10] |
|
%22 = affine.apply #map()[%11] |
|
%23 = arith.muli %21, %c1024 : index |
|
%24 = arith.muli %23, %22 : index |
|
// Async region capturing the three imported resources; it yields one external resource of |
// size %13 together with a timepoint. |
%results, %result_timepoint = stream.async.execute with(%4 as %arg3: !stream.resource<external>{%3}, %9 as %arg4: !stream.resource<external>{%8}, %14 as %arg5: !stream.resource<external>{%13}) -> !stream.resource<external>{%13} { |
|
// The three pack dispatches each consume only their own captured resource and are grouped |
// in one stream.async.concurrent region that yields all three transient results. |
%27:3 = stream.async.concurrent with(%arg3 as %arg6: !stream.resource<external>{%3}, %arg4 as %arg7: !stream.resource<external>{%8}, %arg5 as %arg8: !stream.resource<external>{%13}) -> (!stream.resource<transient>{%17}, !stream.resource<transient>{%20}, !stream.resource<transient>{%24}) { |
|
%30 = stream.async.dispatch @matmul_dynamic_dispatch_0::@matmul_dynamic_dispatch_0_pack_f32[%0, %15, %1](%arg6[%c0 to %3 for %3], %0, %15, %1) : (!stream.resource<external>{%3}, index, index, index) -> !stream.resource<transient>{%17} |
|
%31 = stream.async.dispatch @matmul_dynamic_dispatch_1::@matmul_dynamic_dispatch_1_pack_f32[%6, %18, %5](%arg7[%c0 to %8 for %8], %6, %18, %5) : (!stream.resource<external>{%8}, index, index, index) -> !stream.resource<transient>{%20} |
|
%32 = stream.async.dispatch @matmul_dynamic_dispatch_2::@matmul_dynamic_dispatch_2_pack_f32[%10, %11, %21, %22](%arg8[%c0 to %13 for %13], %10, %11, %21, %22) : (!stream.resource<external>{%13}, index, index, index, index) -> !stream.resource<transient>{%24} |
|
stream.yield %30, %31, %32 : !stream.resource<transient>{%17}, !stream.resource<transient>{%20}, !stream.resource<transient>{%24} |
|
} |
|
// mmt4d over the three packed results. The result is written as `%27#2{%24}` rather than a |
// fresh resource type, i.e. it is expressed in terms of the third operand. |
%28 = stream.async.dispatch @matmul_dynamic_dispatch_3::@matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32[%15, %1, %18, %5, %21, %22](%27#0[%c0 to %17 for %17], %27#1[%c0 to %20 for %20], %27#2[%c0 to %24 for %24], %15, %1, %18, %5, %21, %22) : (!stream.resource<transient>{%17}, !stream.resource<transient>{%20}, !stream.resource<transient>{%24}, index, index, index, index, index, index) -> %27#2{%24} |
|
// Unpack the mmt4d result into an external resource of size %13 (same size as the %arg2 import). |
%29 = stream.async.dispatch @matmul_dynamic_dispatch_4::@matmul_dynamic_dispatch_4_unpack_f32[%21, %22, %10, %11](%28[%c0 to %24 for %24], %21, %22, %10, %11) : (!stream.resource<transient>{%24}, index, index, index, index) -> !stream.resource<external>{%13} |
|
stream.yield %29 : !stream.resource<external>{%13} |
|
} => !stream.timepoint |
|
// Wait on the region's timepoint before exporting the result with dims {%10, %11}. |
%25 = stream.timepoint.await %result_timepoint => %results : !stream.resource<external>{%13} |
|
%26 = stream.tensor.export %25 : tensor<?x?xf32>{%10, %11} in !stream.resource<external>{%13} -> !hal.buffer_view |
|
return %26 : !hal.buffer_view |
|
} |
|
} |
|
|
|
|
|
// -----// IR Dump After FuseGlobals (iree-util-fuse-globals) //----- // |
|
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "znver4", cpu_features = "+mmx,+popcnt,+sse,+sse2,+sse3,+ssse3,+sse4.1,+sse4.2,+avx,+avx2,+sse4a,+fma,+avx512f,+bmi,+bmi2,+aes,+pclmul,+avx512vl,+avx512bw,+avx512dq,+avx512cd,+avx512vbmi,+avx512ifma,+avx512vpopcntdq,+avx512vbmi2,+gfni,+vpclmulqdq,+avx512vnni,+avx512bitalg,+avx512bf16,+adx,+clflushopt,+clwb,+clzero,+cx16,+cx8,+f16c,+fsgsbase,+crc32,+invpcid,+rdpru,+sahf,+lzcnt,+movbe,+mwaitx,+x87,+pku,+prfchw,+rdpid,+rdrnd,+rdseed,+sha,+shstk,+vaes,+wbnoinvd,+xsave,+xsavec,+xsaveopt,+xsaves,+fxsr,+evex512", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "all"}> |
|
#map = affine_map<()[s0] -> (s0 ceildiv 16)> |
|
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#executable_target_embedded_elf_x86_64_]}> |
|
module attributes {hal.device.targets = [#device_target_llvm_cpu]} { |
|
// Stream executable with a single export, @matmul_dynamic_dispatch_0_pack_f32. |
stream.executable private @matmul_dynamic_dispatch_0 { |
|
// Export region: forwards its three index operands to |
// flow.dispatch.workgroup_count_from_slice and returns the resulting %x, %y, %z. |
stream.executable.export public @matmul_dynamic_dispatch_0_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) { |
|
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2 |
|
stream.return %x, %y, %z : index, index, index |
|
} |
|
builtin.module { |
|
// Loads a ?x? f32 tensor from binding %arg0, packs it into a ?x?x16x1 tensor |
// (padding value 0.0), and stores the result to binding %arg4. |
// %arg1..%arg3 are index operands tagged below as workload ordinals 0..2. |
func.func @matmul_dynamic_dispatch_0_pack_f32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: !stream.binding) { |
|
%c0 = arith.constant 0 : index |
|
// Padding value handed to tensor.pack below. |
%cst = arith.constant 0.000000e+00 : f32 |
|
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index |
|
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index |
|
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index |
|
// Source tensor has dynamic dims {%0, %2}; destination has dynamic outer dims {%1, %2}. |
%3 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %2} |
|
%4 = stream.binding.subspan %arg4[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2} |
|
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [%0, %2], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %2} -> tensor<?x?xf32> |
|
// #map is declared at module scope of this dump as (s0 ceildiv 16). |
%6 = affine.apply #map()[%0] |
|
%7 = tensor.empty(%6, %2) : tensor<?x?x16x1xf32> |
|
// inner_tiles = [16, 1] applied to inner_dims_pos = [0, 1]; outer dims keep their order. |
%pack = tensor.pack %5 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 1] into %7 : tensor<?x?xf32> -> tensor<?x?x16x1xf32> |
|
// NOTE(review): the store sizes use ordinal %1 for the leading outer dim while the |
// tensor.empty above uses %6; presumably the caller passes the same ceildiv value - confirm. |
flow.dispatch.tensor.store %pack, %4, offsets = [0, 0, 0, 0], sizes = [%1, %2, 16, 1], strides = [1, 1, 1, 1] : tensor<?x?x16x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2} |
|
return |
|
} |
|
} |
|
} |
|
// Stream executable with a single export, @matmul_dynamic_dispatch_1_pack_f32. |
stream.executable private @matmul_dynamic_dispatch_1 { |
|
// Export region: forwards its three index operands to |
// flow.dispatch.workgroup_count_from_slice and returns the resulting %x, %y, %z. |
stream.executable.export public @matmul_dynamic_dispatch_1_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) { |
|
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2 |
|
stream.return %x, %y, %z : index, index, index |
|
} |
|
builtin.module { |
|
// Loads a ?x? f32 tensor from binding %arg0, packs it into a ?x?x16x1 tensor with |
// transposed outer dims (padding value 0.0), and stores the result to binding %arg4. |
// %arg1..%arg3 are index operands tagged below as workload ordinals 0..2. |
func.func @matmul_dynamic_dispatch_1_pack_f32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: !stream.binding) { |
|
%c0 = arith.constant 0 : index |
|
// Padding value handed to tensor.pack below. |
%cst = arith.constant 0.000000e+00 : f32 |
|
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index |
|
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index |
|
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index |
|
// Source tensor has dynamic dims {%2, %0} (note the order); destination outer dims are {%1, %2}. |
%3 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %0} |
|
%4 = stream.binding.subspan %arg4[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2} |
|
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [%2, %0], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %0} -> tensor<?x?xf32> |
|
// #map is declared at module scope of this dump as (s0 ceildiv 16). |
%6 = affine.apply #map()[%0] |
|
%7 = tensor.empty(%6, %2) : tensor<?x?x16x1xf32> |
|
// outer_dims_perm = [1, 0] and inner_dims_pos = [1, 0]: the 16-wide tile is taken along |
// source dim 1 (size %0) and that dim becomes the leading outer dim of the result. |
%pack = tensor.pack %5 padding_value(%cst : f32) outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [16, 1] into %7 : tensor<?x?xf32> -> tensor<?x?x16x1xf32> |
|
// NOTE(review): the store sizes use ordinal %1 for the leading outer dim while the |
// tensor.empty above uses %6; presumably the caller passes the same ceildiv value - confirm. |
flow.dispatch.tensor.store %pack, %4, offsets = [0, 0, 0, 0], sizes = [%1, %2, 16, 1], strides = [1, 1, 1, 1] : tensor<?x?x16x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2} |
|
return |
|
} |
|
} |
|
} |
|
// Stream executable with a single export, @matmul_dynamic_dispatch_2_pack_f32. |
stream.executable private @matmul_dynamic_dispatch_2 { |
|
// Export region: forwards its four index operands to |
// flow.dispatch.workgroup_count_from_slice and returns the resulting %x, %y, %z. |
stream.executable.export public @matmul_dynamic_dispatch_2_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) { |
|
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3 |
|
stream.return %x, %y, %z : index, index, index |
|
} |
|
builtin.module { |
|
// Loads a ?x? f32 tensor from binding %arg0, packs it into a ?x?x16x16 tensor |
// (padding value 0.0), and stores the result to binding %arg5. |
// %arg1..%arg4 are index operands tagged below as workload ordinals 0..3. |
func.func @matmul_dynamic_dispatch_2_pack_f32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: !stream.binding) { |
|
%c0 = arith.constant 0 : index |
|
// Padding value handed to tensor.pack below. |
%cst = arith.constant 0.000000e+00 : f32 |
|
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index |
|
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index |
|
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index |
|
%3 = flow.dispatch.workload.ordinal %arg4, 3 : index |
|
// Source tensor has dynamic dims {%0, %1}; destination has dynamic outer dims {%2, %3}. |
%4 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1} |
|
%5 = stream.binding.subspan %arg5[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%2, %3} |
|
%6 = flow.dispatch.tensor.load %4, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1} -> tensor<?x?xf32> |
|
// #map is declared at module scope of this dump as (s0 ceildiv 16); |
// %7 and %8 are the ceildiv-by-16 values of %1 and %0 respectively. |
%7 = affine.apply #map()[%1] |
|
%8 = affine.apply #map()[%0] |
|
%9 = tensor.empty(%8, %7) : tensor<?x?x16x16xf32> |
|
// inner_tiles = [16, 16] applied to inner_dims_pos = [0, 1]; outer dims keep their order. |
%pack = tensor.pack %6 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %9 : tensor<?x?xf32> -> tensor<?x?x16x16xf32> |
|
// NOTE(review): the store sizes use ordinals %2/%3 for the outer dims while the |
// tensor.empty above uses %8/%7; presumably the caller passes the same ceildiv values - confirm. |
flow.dispatch.tensor.store %pack, %5, offsets = [0, 0, 0, 0], sizes = [%2, %3, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%2, %3} |
|
return |
|
} |
|
} |
|
} |
|
// Stream executable with a single export, @matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32. |
stream.executable private @matmul_dynamic_dispatch_3 { |
|
// Export region: forwards its six index operands to |
// flow.dispatch.workgroup_count_from_slice and returns the resulting %x, %y, %z. |
stream.executable.export public @matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index) -> (index, index, index) { |
|
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3, %arg4, %arg5 |
|
stream.return %x, %y, %z : index, index, index |
|
} |
|
builtin.module { |
|
// Runs linalg.mmt4d on two read-only ?x?x16x1 operands (bindings %arg0, %arg1) with a |
// ?x?x16x16 accumulator that is loaded from and stored back to the read-write binding %arg2. |
// %arg3..%arg8 are index operands tagged below as workload ordinals 0..5. |
func.func @matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index) { |
|
%c0 = arith.constant 0 : index |
|
%0 = flow.dispatch.workload.ordinal %arg3, 0 : index |
|
%1 = flow.dispatch.workload.ordinal %arg4, 1 : index |
|
%2 = flow.dispatch.workload.ordinal %arg5, 2 : index |
|
%3 = flow.dispatch.workload.ordinal %arg6, 3 : index |
|
%4 = flow.dispatch.workload.ordinal %arg7, 4 : index |
|
%5 = flow.dispatch.workload.ordinal %arg8, 5 : index |
|
// Dynamic outer dims per binding: %arg0 -> {%0, %1}, %arg1 -> {%2, %3}, %arg2 -> {%4, %5}. |
%6 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%0, %1} |
|
%7 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%2, %3} |
|
%8 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5} |
|
// Full-extent loads of all three operands (zero offsets, unit strides). |
%9 = flow.dispatch.tensor.load %6, offsets = [0, 0, 0, 0], sizes = [%0, %1, 16, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%0, %1} -> tensor<?x?x16x1xf32> |
|
%10 = flow.dispatch.tensor.load %7, offsets = [0, 0, 0, 0], sizes = [%2, %3, 16, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%2, %3} -> tensor<?x?x16x1xf32> |
|
%11 = flow.dispatch.tensor.load %8, offsets = [0, 0, 0, 0], sizes = [%4, %5, 16, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5} -> tensor<?x?x16x16xf32> |
|
// %11 (loaded from the read-write binding) is the outs operand of the mmt4d. |
%12 = linalg.mmt4d ins(%9, %10 : tensor<?x?x16x1xf32>, tensor<?x?x16x1xf32>) outs(%11 : tensor<?x?x16x16xf32>) -> tensor<?x?x16x16xf32> |
|
// Result is written back to the same binding subspan (%8) it was loaded from. |
flow.dispatch.tensor.store %12, %8, offsets = [0, 0, 0, 0], sizes = [%4, %5, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5} |
|
return |
|
} |
|
} |
|
} |
|
// Stream executable with a single export, @matmul_dynamic_dispatch_4_unpack_f32. |
stream.executable private @matmul_dynamic_dispatch_4 { |
|
// Export region: forwards its four index operands to |
// flow.dispatch.workgroup_count_from_slice and returns the resulting %x, %y, %z. |
stream.executable.export public @matmul_dynamic_dispatch_4_unpack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) { |
|
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3 |
|
stream.return %x, %y, %z : index, index, index |
|
} |
|
builtin.module { |
|
// Loads a ?x?x16x16 f32 tensor from binding %arg0, unpacks it into a ?x? tensor, |
// and stores the result to binding %arg5. |
// %arg1..%arg4 are index operands tagged below as workload ordinals 0..3. |
func.func @matmul_dynamic_dispatch_4_unpack_f32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: !stream.binding) { |
|
%c0 = arith.constant 0 : index |
|
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index |
|
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index |
|
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index |
|
%3 = flow.dispatch.workload.ordinal %arg4, 3 : index |
|
// Packed source has dynamic outer dims {%0, %1}; unpacked destination has dims {%2, %3}. |
%4 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?x16x16xf32>>{%0, %1} |
|
%5 = stream.binding.subspan %arg5[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%2, %3} |
|
%6 = flow.dispatch.tensor.load %4, offsets = [0, 0, 0, 0], sizes = [%0, %1, 16, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x16xf32>>{%0, %1} -> tensor<?x?x16x16xf32> |
|
// Destination of the unpack, sized directly from ordinals %2 and %3. |
%7 = tensor.empty(%2, %3) : tensor<?x?xf32> |
|
// inner_tiles = [16, 16] on inner_dims_pos = [0, 1]; outer dims keep their order. |
%unpack = tensor.unpack %6 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %7 : tensor<?x?x16x16xf32> -> tensor<?x?xf32> |
|
flow.dispatch.tensor.store %unpack, %5, offsets = [0, 0], sizes = [%2, %3], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%2, %3} |
|
return |
|
} |
|
} |
|
} |
|
// Entry point: takes three !hal.buffer_view arguments and returns one. |
// For each argument it queries two dims, asserts shape / element type / encoding, and imports it |
// as a stream external resource. It then issues three pack dispatches, one mmt4d dispatch and one |
// unpack dispatch inside a single stream.async.execute region, awaits the region's timepoint, and |
// exports the result as a buffer view. |
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} { |
|
// Multipliers used in the resource-size computations below. |
// NOTE(review): 4 is presumably the f32 byte width, 64 = 16*1*4 and 1024 = 16*16*4 - confirm. |
%c1024 = arith.constant 1024 : index |
|
%c64 = arith.constant 64 : index |
|
%c4 = arith.constant 4 : index |
|
%c0 = arith.constant 0 : index |
|
// --- %arg0: dims %0 x %1, import size %3 = %0 * 4 * %1 --- |
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index |
|
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index |
|
%element_type_f32 = hal.element_type<f32> : i32 |
|
%dense_row_major = hal.encoding_type<dense_row_major> : i32 |
|
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1]) type(%element_type_f32) encoding(%dense_row_major) |
|
%2 = arith.muli %0, %c4 : index |
|
%3 = arith.muli %2, %1 : index |
|
%4 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<?x?xf32>{%0, %1} in !stream.resource<external>{%3} |
|
// --- %arg1: dims %5 x %6, import size %8 = %5 * 4 * %6 --- |
%5 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index |
|
%6 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index |
|
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%5, %6]) type(%element_type_f32) encoding(%dense_row_major) |
|
%7 = arith.muli %5, %c4 : index |
|
%8 = arith.muli %7, %6 : index |
|
%9 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<?x?xf32>{%5, %6} in !stream.resource<external>{%8} |
|
// --- %arg2: dims %10 x %11, import size %13 = %10 * 4 * %11 --- |
%10 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index |
|
%11 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index |
|
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("input2") shape([%10, %11]) type(%element_type_f32) encoding(%dense_row_major) |
|
%12 = arith.muli %10, %c4 : index |
|
%13 = arith.muli %12, %11 : index |
|
%14 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<?x?xf32>{%10, %11} in !stream.resource<external>{%13} |
|
// Sizes of the three transient (packed) resources; #map is (s0 ceildiv 16): |
//   %17 = ceildiv(%0, 16) * 64 * %1 |
//   %20 = ceildiv(%6, 16) * 64 * %5 |
//   %24 = ceildiv(%10, 16) * 1024 * ceildiv(%11, 16) |
%15 = affine.apply #map()[%0] |
|
%16 = arith.muli %15, %c64 : index |
|
%17 = arith.muli %16, %1 : index |
|
%18 = affine.apply #map()[%6] |
|
%19 = arith.muli %18, %c64 : index |
|
%20 = arith.muli %19, %5 : index |
|
%21 = affine.apply #map()[%10] |
|
%22 = affine.apply #map()[%11] |
|
%23 = arith.muli %21, %c1024 : index |
|
%24 = arith.muli %23, %22 : index |
|
// Async region capturing the three imported resources; it yields one external resource of |
// size %13 together with a timepoint. |
%results, %result_timepoint = stream.async.execute with(%4 as %arg3: !stream.resource<external>{%3}, %9 as %arg4: !stream.resource<external>{%8}, %14 as %arg5: !stream.resource<external>{%13}) -> !stream.resource<external>{%13} { |
|
// The three pack dispatches each consume only their own captured resource and are grouped |
// in one stream.async.concurrent region that yields all three transient results. |
%27:3 = stream.async.concurrent with(%arg3 as %arg6: !stream.resource<external>{%3}, %arg4 as %arg7: !stream.resource<external>{%8}, %arg5 as %arg8: !stream.resource<external>{%13}) -> (!stream.resource<transient>{%17}, !stream.resource<transient>{%20}, !stream.resource<transient>{%24}) { |
|
%30 = stream.async.dispatch @matmul_dynamic_dispatch_0::@matmul_dynamic_dispatch_0_pack_f32[%0, %15, %1](%arg6[%c0 to %3 for %3], %0, %15, %1) : (!stream.resource<external>{%3}, index, index, index) -> !stream.resource<transient>{%17} |
|
%31 = stream.async.dispatch @matmul_dynamic_dispatch_1::@matmul_dynamic_dispatch_1_pack_f32[%6, %18, %5](%arg7[%c0 to %8 for %8], %6, %18, %5) : (!stream.resource<external>{%8}, index, index, index) -> !stream.resource<transient>{%20} |
|
%32 = stream.async.dispatch @matmul_dynamic_dispatch_2::@matmul_dynamic_dispatch_2_pack_f32[%10, %11, %21, %22](%arg8[%c0 to %13 for %13], %10, %11, %21, %22) : (!stream.resource<external>{%13}, index, index, index, index) -> !stream.resource<transient>{%24} |
|
stream.yield %30, %31, %32 : !stream.resource<transient>{%17}, !stream.resource<transient>{%20}, !stream.resource<transient>{%24} |
|
} |
|
// mmt4d over the three packed results. The result is written as `%27#2{%24}` rather than a |
// fresh resource type, i.e. it is expressed in terms of the third operand. |
%28 = stream.async.dispatch @matmul_dynamic_dispatch_3::@matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32[%15, %1, %18, %5, %21, %22](%27#0[%c0 to %17 for %17], %27#1[%c0 to %20 for %20], %27#2[%c0 to %24 for %24], %15, %1, %18, %5, %21, %22) : (!stream.resource<transient>{%17}, !stream.resource<transient>{%20}, !stream.resource<transient>{%24}, index, index, index, index, index, index) -> %27#2{%24} |
|
// Unpack the mmt4d result into an external resource of size %13 (same size as the %arg2 import). |
%29 = stream.async.dispatch @matmul_dynamic_dispatch_4::@matmul_dynamic_dispatch_4_unpack_f32[%21, %22, %10, %11](%28[%c0 to %24 for %24], %21, %22, %10, %11) : (!stream.resource<transient>{%24}, index, index, index, index) -> !stream.resource<external>{%13} |
|
stream.yield %29 : !stream.resource<external>{%13} |
|
} => !stream.timepoint |
|
// Wait on the region's timepoint before exporting the result with dims {%10, %11}. |
%25 = stream.timepoint.await %result_timepoint => %results : !stream.resource<external>{%13} |
|
%26 = stream.tensor.export %25 : tensor<?x?xf32>{%10, %11} in !stream.resource<external>{%13} -> !hal.buffer_view |
|
return %26 : !hal.buffer_view |
|
} |
|
} |
|
|
|
|
|
// -----// IR Dump After IPO (iree-util-ipo) //----- // |
|
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "znver4", cpu_features = "+mmx,+popcnt,+sse,+sse2,+sse3,+ssse3,+sse4.1,+sse4.2,+avx,+avx2,+sse4a,+fma,+avx512f,+bmi,+bmi2,+aes,+pclmul,+avx512vl,+avx512bw,+avx512dq,+avx512cd,+avx512vbmi,+avx512ifma,+avx512vpopcntdq,+avx512vbmi2,+gfni,+vpclmulqdq,+avx512vnni,+avx512bitalg,+avx512bf16,+adx,+clflushopt,+clwb,+clzero,+cx16,+cx8,+f16c,+fsgsbase,+crc32,+invpcid,+rdpru,+sahf,+lzcnt,+movbe,+mwaitx,+x87,+pku,+prfchw,+rdpid,+rdrnd,+rdseed,+sha,+shstk,+vaes,+wbnoinvd,+xsave,+xsavec,+xsaveopt,+xsaves,+fxsr,+evex512", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "all"}> |
|
#map = affine_map<()[s0] -> (s0 ceildiv 16)> |
|
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#executable_target_embedded_elf_x86_64_]}> |
|
module attributes {hal.device.targets = [#device_target_llvm_cpu]} { |
|
// Dispatch 0: packs a dynamically shaped 2-D f32 tensor into 16x1 inner tiles (tensor<?x?x16x1xf32>), filling partial tiles with 0.0. |
stream.executable private @matmul_dynamic_dispatch_0 { |
|
// Export entry point: the (x, y, z) workgroup count is derived from the three workload values. |
stream.executable.export public @matmul_dynamic_dispatch_0_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) { |
|
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2 |
|
stream.return %x, %y, %z : index, index, index |
|
} |
|
builtin.module { |
|
// %arg0: source binding; %arg1..%arg3: workload values (ordinals 0..2); %arg4: destination binding. |
func.func @matmul_dynamic_dispatch_0_pack_f32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: !stream.binding) { |
|
%c0 = arith.constant 0 : index |
|
// Padding value used by tensor.pack below. |
%cst = arith.constant 0.000000e+00 : f32 |
|
// Tag each workload value with its ordinal position. |
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index |
|
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index |
|
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index |
|
// Source viewed at byte offset 0 as a read-only ?x? tensor with dynamic dims (%0, %2). |
%3 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %2} |
|
// Destination viewed at byte offset 0 as a write-only ?x?x16x1 tensor with outer dims (%1, %2). |
%4 = stream.binding.subspan %arg4[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2} |
|
// Load the entire source tensor. |
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [%0, %2], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %2} -> tensor<?x?xf32> |
|
// Outer tile count along source dim 0: ceildiv(%0, 16) via #map. |
%6 = affine.apply #map()[%0] |
|
// NOTE(review): the pack destination is sized with %6 while the store below uses %1 - presumably the dispatch site passes %1 == ceildiv(%0, 16); confirm against the caller. |
%7 = tensor.empty(%6, %2) : tensor<?x?x16x1xf32> |
|
// Tile source dim 0 by 16 and dim 1 by 1 (no outer-dim permutation); positions past the source extent take %cst. |
%pack = tensor.pack %5 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 1] into %7 : tensor<?x?xf32> -> tensor<?x?x16x1xf32> |
|
// Write the whole packed tensor to the destination binding. |
flow.dispatch.tensor.store %pack, %4, offsets = [0, 0, 0, 0], sizes = [%1, %2, 16, 1], strides = [1, 1, 1, 1] : tensor<?x?x16x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2} |
|
return |
|
} |
|
} |
|
} |
|
// Dispatch 1: packs a dynamically shaped 2-D f32 tensor into 16x1 inner tiles with the outer dims transposed (outer_dims_perm = [1, 0]), filling partial tiles with 0.0. |
stream.executable private @matmul_dynamic_dispatch_1 { |
|
// Export entry point: the (x, y, z) workgroup count is derived from the three workload values. |
stream.executable.export public @matmul_dynamic_dispatch_1_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) { |
|
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2 |
|
stream.return %x, %y, %z : index, index, index |
|
} |
|
builtin.module { |
|
// %arg0: source binding; %arg1..%arg3: workload values (ordinals 0..2); %arg4: destination binding. |
func.func @matmul_dynamic_dispatch_1_pack_f32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: !stream.binding) { |
|
%c0 = arith.constant 0 : index |
|
// Padding value used by tensor.pack below. |
%cst = arith.constant 0.000000e+00 : f32 |
|
// Tag each workload value with its ordinal position. |
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index |
|
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index |
|
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index |
|
// Source viewed as a read-only ?x? tensor with dynamic dims (%2, %0) - note the order differs from dispatch 0, which uses (%0, %2). |
%3 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %0} |
|
// Destination viewed as a write-only ?x?x16x1 tensor with outer dims (%1, %2). |
%4 = stream.binding.subspan %arg4[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2} |
|
// Load the entire source tensor. |
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [%2, %0], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %0} -> tensor<?x?xf32> |
|
// Outer tile count for the 16-tiled source dim (dim 1, extent %0): ceildiv(%0, 16) via #map. |
%6 = affine.apply #map()[%0] |
|
// NOTE(review): the pack destination is sized with %6 while the store below uses %1 - presumably the dispatch site passes %1 == ceildiv(%0, 16); confirm against the caller. |
%7 = tensor.empty(%6, %2) : tensor<?x?x16x1xf32> |
|
// Tile source dim 1 by 16 and dim 0 by 1, and swap the outer dims; positions past the source extent take %cst. |
%pack = tensor.pack %5 padding_value(%cst : f32) outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [16, 1] into %7 : tensor<?x?xf32> -> tensor<?x?x16x1xf32> |
|
// Write the whole packed tensor to the destination binding. |
flow.dispatch.tensor.store %pack, %4, offsets = [0, 0, 0, 0], sizes = [%1, %2, 16, 1], strides = [1, 1, 1, 1] : tensor<?x?x16x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2} |
|
return |
|
} |
|
} |
|
} |
|
// Dispatch 2: packs a dynamically shaped 2-D f32 tensor into 16x16 inner tiles (tensor<?x?x16x16xf32>), filling partial tiles with 0.0. |
stream.executable private @matmul_dynamic_dispatch_2 { |
|
// Export entry point: the (x, y, z) workgroup count is derived from the four workload values. |
stream.executable.export public @matmul_dynamic_dispatch_2_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) { |
|
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3 |
|
stream.return %x, %y, %z : index, index, index |
|
} |
|
builtin.module { |
|
// %arg0: source binding; %arg1..%arg4: workload values (ordinals 0..3); %arg5: destination binding. |
func.func @matmul_dynamic_dispatch_2_pack_f32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: !stream.binding) { |
|
%c0 = arith.constant 0 : index |
|
// Padding value used by tensor.pack below. |
%cst = arith.constant 0.000000e+00 : f32 |
|
// Tag each workload value with its ordinal position. |
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index |
|
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index |
|
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index |
|
%3 = flow.dispatch.workload.ordinal %arg4, 3 : index |
|
// Source viewed as a read-only ?x? tensor with dynamic dims (%0, %1). |
%4 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1} |
|
// Destination viewed as a write-only ?x?x16x16 tensor with outer dims (%2, %3). |
%5 = stream.binding.subspan %arg5[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%2, %3} |
|
// Load the entire source tensor. |
%6 = flow.dispatch.tensor.load %4, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1} -> tensor<?x?xf32> |
|
// Outer tile counts: %7 = ceildiv(%1, 16) for dim 1, %8 = ceildiv(%0, 16) for dim 0. |
%7 = affine.apply #map()[%1] |
|
%8 = affine.apply #map()[%0] |
|
// NOTE(review): the pack destination is sized with (%8, %7) while the store below uses (%2, %3) - presumably the dispatch site passes matching ceildiv values; confirm against the caller. |
%9 = tensor.empty(%8, %7) : tensor<?x?x16x16xf32> |
|
// Tile both source dims by 16 (no outer-dim permutation); positions past the source extent take %cst. |
%pack = tensor.pack %6 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %9 : tensor<?x?xf32> -> tensor<?x?x16x16xf32> |
|
// Write the whole packed tensor to the destination binding. |
flow.dispatch.tensor.store %pack, %5, offsets = [0, 0, 0, 0], sizes = [%2, %3, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%2, %3} |
|
return |
|
} |
|
} |
|
} |
|
// Dispatch 3: runs linalg.mmt4d on two packed ?x?x16x1 f32 operands, accumulating into a packed ?x?x16x16 f32 tensor that is read from and written back to the same binding. |
stream.executable private @matmul_dynamic_dispatch_3 { |
|
// Export entry point: the (x, y, z) workgroup count is derived from the six workload values. |
stream.executable.export public @matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index) -> (index, index, index) { |
|
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3, %arg4, %arg5 |
|
stream.return %x, %y, %z : index, index, index |
|
} |
|
builtin.module { |
|
// %arg0, %arg1: read-only operand bindings; %arg2: read-write accumulator binding; %arg3..%arg8: workload values (ordinals 0..5). |
func.func @matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index) { |
|
%c0 = arith.constant 0 : index |
|
// Tag each workload value with its ordinal position; pairs (%0, %1), (%2, %3), (%4, %5) are the outer dims of the three tensors below. |
%0 = flow.dispatch.workload.ordinal %arg3, 0 : index |
|
%1 = flow.dispatch.workload.ordinal %arg4, 1 : index |
|
%2 = flow.dispatch.workload.ordinal %arg5, 2 : index |
|
%3 = flow.dispatch.workload.ordinal %arg6, 3 : index |
|
%4 = flow.dispatch.workload.ordinal %arg7, 4 : index |
|
%5 = flow.dispatch.workload.ordinal %arg8, 5 : index |
|
// First operand: read-only ?x?x16x1 tensor with outer dims (%0, %1). |
%6 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%0, %1} |
|
// Second operand: read-only ?x?x16x1 tensor with outer dims (%2, %3). |
%7 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%2, %3} |
|
// Accumulator: read-write ?x?x16x16 tensor with outer dims (%4, %5). |
%8 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5} |
|
// Load all three tensors in full. |
%9 = flow.dispatch.tensor.load %6, offsets = [0, 0, 0, 0], sizes = [%0, %1, 16, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%0, %1} -> tensor<?x?x16x1xf32> |
|
%10 = flow.dispatch.tensor.load %7, offsets = [0, 0, 0, 0], sizes = [%2, %3, 16, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%2, %3} -> tensor<?x?x16x1xf32> |
|
%11 = flow.dispatch.tensor.load %8, offsets = [0, 0, 0, 0], sizes = [%4, %5, 16, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5} -> tensor<?x?x16x16xf32> |
|
// Tiled matmul over the packed operands; %11 is passed as the outs (initial accumulator) operand. |
%12 = linalg.mmt4d ins(%9, %10 : tensor<?x?x16x1xf32>, tensor<?x?x16x1xf32>) outs(%11 : tensor<?x?x16x16xf32>) -> tensor<?x?x16x16xf32> |
|
// Store the result back over the same read-write binding it was loaded from. |
flow.dispatch.tensor.store %12, %8, offsets = [0, 0, 0, 0], sizes = [%4, %5, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5} |
|
return |
|
} |
|
} |
|
} |
|
// Dispatch 4: unpacks a tensor<?x?x16x16xf32> with 16x16 inner tiles back into a plain 2-D tensor<?x?xf32>. |
stream.executable private @matmul_dynamic_dispatch_4 { |
|
// Export entry point: the (x, y, z) workgroup count is derived from the four workload values. |
stream.executable.export public @matmul_dynamic_dispatch_4_unpack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) { |
|
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3 |
|
stream.return %x, %y, %z : index, index, index |
|
} |
|
builtin.module { |
|
// %arg0: packed source binding; %arg1..%arg4: workload values (ordinals 0..3); %arg5: destination binding. |
func.func @matmul_dynamic_dispatch_4_unpack_f32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: !stream.binding) { |
|
%c0 = arith.constant 0 : index |
|
// Tag each workload value with its ordinal position: (%0, %1) are the packed outer dims, (%2, %3) the unpacked result dims. |
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index |
|
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index |
|
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index |
|
%3 = flow.dispatch.workload.ordinal %arg4, 3 : index |
|
// Source viewed as a read-only ?x?x16x16 tensor with outer dims (%0, %1). |
%4 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?x16x16xf32>>{%0, %1} |
|
// Destination viewed as a write-only ?x? tensor with dims (%2, %3). |
%5 = stream.binding.subspan %arg5[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%2, %3} |
|
// Load the entire packed tensor. |
%6 = flow.dispatch.tensor.load %4, offsets = [0, 0, 0, 0], sizes = [%0, %1, 16, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x16xf32>>{%0, %1} -> tensor<?x?x16x16xf32> |
|
// Unpack destination with the final (%2, %3) extent. |
%7 = tensor.empty(%2, %3) : tensor<?x?xf32> |
|
// Undo the 16x16 tiling on both dims (no outer-dim permutation). |
%unpack = tensor.unpack %6 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %7 : tensor<?x?x16x16xf32> -> tensor<?x?xf32> |
|
// Write the whole unpacked tensor to the destination binding. |
flow.dispatch.tensor.store %unpack, %5, offsets = [0, 0], sizes = [%2, %3], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%2, %3} |
|
return |
|
} |
|
} |
|
} |
|
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} { |
|
%c1024 = arith.constant 1024 : index |
|
%c64 = arith.constant 64 : index |
|
%c4 = arith.constant 4 : index |
|
%c0 = arith.constant 0 : index |
|
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index |
|
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index |
|
%element_type_f32 = hal.element_type<f32> : i32 |
|
%dense_row_major = hal.encoding_type<dense_row_major> : i32 |
|
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1]) type(%element_type_f32) encoding(%dense_row_major) |
|
%2 = arith.muli %0, %c4 : index |
|
%3 = arith.muli %2, %1 : index |
|
%4 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<?x?xf32>{%0, %1} in !stream.resource<external>{%3} |
|
%5 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index |
|
%6 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index |
|
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%5, %6]) type(%element_type_f32) encoding(%dense_row_major) |
|
%7 = arith.muli %5, %c4 : index |
|
%8 = arith.muli %7, %6 : index |
|
%9 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<?x?xf32>{%5, %6} in !stream.resource<external>{%8} |
|
%10 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index |
|
%11 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index |
|
hal.bu |