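// A dump like this can be reproduced by running iree-compile with MLIR's
// per-pass IR printing enabled. A plausible invocation (input path and output
// name hypothetical; flags as of IREE circa March 2024; the dumps go to
// stderr):
//
//   iree-compile batch_matmul.mlir \
//     --iree-hal-target-backends=rocm \
//     --iree-rocm-target-chip=gfx942 \
//     --mlir-print-ir-after-all \
//     -o batch_matmul.vmfb 2> log2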
// -----// IR Dump After AssignTargetDevicesPass (iree-hal-assign-target-devices) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mfma_layout<F16_16x16x16_F32>, #iree_gpu.mfma_layout<F16_32x32x8_F32>], target_arch = "gfx942", ukernels = "none"}>
#device_target_rocm = #hal.device.target<"rocm", {legacy_sync}, [#executable_target_rocm_hsaco_fb]>
module attributes {hal.device.targets = [#device_target_rocm]} {
func.func @batch_matmul(%arg0: tensor<64x968x1280xf16>, %arg1: tensor<64x1280x1280xf16>) -> tensor<64x968x1280xf16> {
%cst = arith.constant 0.000000e+00 : f16
%0 = tensor.empty() : tensor<64x968x1280xf16>
%1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%2 = linalg.batch_matmul ins(%arg0, %arg1 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%1 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
return %2 : tensor<64x968x1280xf16>
}
}
// -----// IR Dump After AutoInputConversionPipeline (iree-auto-input-conversion) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mfma_layout<F16_16x16x16_F32>, #iree_gpu.mfma_layout<F16_32x32x8_F32>], target_arch = "gfx942", ukernels = "none"}>
#device_target_rocm = #hal.device.target<"rocm", {legacy_sync}, [#executable_target_rocm_hsaco_fb]>
module attributes {hal.device.targets = [#device_target_rocm]} {
func.func @batch_matmul(%arg0: tensor<64x968x1280xf16>, %arg1: tensor<64x1280x1280xf16>) -> tensor<64x968x1280xf16> {
%cst = arith.constant 0.000000e+00 : f16
%0 = tensor.empty() : tensor<64x968x1280xf16>
%1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%2 = linalg.batch_matmul ins(%arg0, %arg1 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%1 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
return %2 : tensor<64x968x1280xf16>
}
}
// -----// IR Dump After IREEImportPublic (iree-import-public) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mfma_layout<F16_16x16x16_F32>, #iree_gpu.mfma_layout<F16_32x32x8_F32>], target_arch = "gfx942", ukernels = "none"}>
#device_target_rocm = #hal.device.target<"rocm", {legacy_sync}, [#executable_target_rocm_hsaco_fb]>
module attributes {hal.device.targets = [#device_target_rocm]} {
util.func public @batch_matmul(%arg0: tensor<64x968x1280xf16>, %arg1: tensor<64x1280x1280xf16>) -> tensor<64x968x1280xf16> {
%cst = arith.constant 0.000000e+00 : f16
%0 = tensor.empty() : tensor<64x968x1280xf16>
%1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%2 = linalg.batch_matmul ins(%arg0, %arg1 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%1 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
util.return %2 : tensor<64x968x1280xf16>
}
}
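// Note: iree-import-public only rewrites the entry point into IREE's util
// dialect (func.func -> util.func, return -> util.return); the linalg body is
// unchanged from the previous dump.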
// -----// IR Dump After ImportMLProgram (iree-import-ml-program) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mfma_layout<F16_16x16x16_F32>, #iree_gpu.mfma_layout<F16_32x32x8_F32>], target_arch = "gfx942", ukernels = "none"}>
#device_target_rocm = #hal.device.target<"rocm", {legacy_sync}, [#executable_target_rocm_hsaco_fb]>
module attributes {hal.device.targets = [#device_target_rocm]} {
util.func public @batch_matmul(%arg0: tensor<64x968x1280xf16>, %arg1: tensor<64x1280x1280xf16>) -> tensor<64x968x1280xf16> {
%cst = arith.constant 0.000000e+00 : f16
%0 = tensor.empty() : tensor<64x968x1280xf16>
%1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%2 = linalg.batch_matmul ins(%arg0, %arg1 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%1 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
util.return %2 : tensor<64x968x1280xf16>
}
}
// -----// IR Dump After SanitizeModuleNames (iree-sanitize-module-names) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mfma_layout<F16_16x16x16_F32>, #iree_gpu.mfma_layout<F16_32x32x8_F32>], target_arch = "gfx942", ukernels = "none"}>
#device_target_rocm = #hal.device.target<"rocm", {legacy_sync}, [#executable_target_rocm_hsaco_fb]>
module attributes {hal.device.targets = [#device_target_rocm]} {
util.func public @batch_matmul(%arg0: tensor<64x968x1280xf16>, %arg1: tensor<64x1280x1280xf16>) -> tensor<64x968x1280xf16> {
%cst = arith.constant 0.000000e+00 : f16
%0 = tensor.empty() : tensor<64x968x1280xf16>
%1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%2 = linalg.batch_matmul ins(%arg0, %arg1 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%1 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
util.return %2 : tensor<64x968x1280xf16>
}
}
// -----// IR Dump After ConvertMeshToFlow (iree-convert-mesh-to-flow) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mfma_layout<F16_16x16x16_F32>, #iree_gpu.mfma_layout<F16_32x32x8_F32>], target_arch = "gfx942", ukernels = "none"}>
#device_target_rocm = #hal.device.target<"rocm", {legacy_sync}, [#executable_target_rocm_hsaco_fb]>
module attributes {hal.device.targets = [#device_target_rocm]} {
util.func public @batch_matmul(%arg0: tensor<64x968x1280xf16>, %arg1: tensor<64x1280x1280xf16>) -> tensor<64x968x1280xf16> {
%cst = arith.constant 0.000000e+00 : f16
%0 = tensor.empty() : tensor<64x968x1280xf16>
%1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%2 = linalg.batch_matmul ins(%arg0, %arg1 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%1 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
util.return %2 : tensor<64x968x1280xf16>
}
}
// -----// IR Dump After mlir::iree_compiler::IREE::ABI::ConvertStreamableOpsPass (iree-abi-convert-streamable-ops) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mfma_layout<F16_16x16x16_F32>, #iree_gpu.mfma_layout<F16_32x32x8_F32>], target_arch = "gfx942", ukernels = "none"}>
#device_target_rocm = #hal.device.target<"rocm", {legacy_sync}, [#executable_target_rocm_hsaco_fb]>
module attributes {hal.device.targets = [#device_target_rocm]} {
util.func public @batch_matmul(%arg0: tensor<64x968x1280xf16>, %arg1: tensor<64x1280x1280xf16>) -> tensor<64x968x1280xf16> {
%cst = arith.constant 0.000000e+00 : f16
%0 = tensor.empty() : tensor<64x968x1280xf16>
%1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%2 = linalg.batch_matmul ins(%arg0, %arg1 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%1 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
util.return %2 : tensor<64x968x1280xf16>
}
}
// -----// IR Dump After mlir::iree_compiler::IREE::ABI::WrapEntryPointsPass (iree-abi-wrap-entry-points) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mfma_layout<F16_16x16x16_F32>, #iree_gpu.mfma_layout<F16_32x32x8_F32>], target_arch = "gfx942", ukernels = "none"}>
#device_target_rocm = #hal.device.target<"rocm", {legacy_sync}, [#executable_target_rocm_hsaco_fb]>
module attributes {hal.device.targets = [#device_target_rocm]} {
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<64x968x1280xf16>
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<64x1280x1280xf16>
%2 = util.call @_batch_matmul(%0, %1) : (tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) -> tensor<64x968x1280xf16>
%3 = hal.tensor.export %2 "output0" : tensor<64x968x1280xf16> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
util.func private @_batch_matmul(%arg0: tensor<64x968x1280xf16>, %arg1: tensor<64x1280x1280xf16>) -> tensor<64x968x1280xf16> {
%cst = arith.constant 0.000000e+00 : f16
%0 = tensor.empty() : tensor<64x968x1280xf16>
%1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%2 = linalg.batch_matmul ins(%arg0, %arg1 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%1 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
util.return %2 : tensor<64x968x1280xf16>
}
}
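// Note: iree-abi-wrap-entry-points split the entry point in two: a public
// @batch_matmul that imports/exports !hal.buffer_view at the ABI boundary,
// and a private @_batch_matmul holding the original tensor-level body. The
// canonicalizer and inliner dumps that follow fold the private function back
// into the wrapper.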
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
util.func private @_batch_matmul(%arg0: tensor<64x968x1280xf16>, %arg1: tensor<64x1280x1280xf16>) -> tensor<64x968x1280xf16> {
%cst = arith.constant 0.000000e+00 : f16
%0 = tensor.empty() : tensor<64x968x1280xf16>
%1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%2 = linalg.batch_matmul ins(%arg0, %arg1 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%1 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
util.return %2 : tensor<64x968x1280xf16>
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<64x968x1280xf16>
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<64x1280x1280xf16>
%2 = util.call @_batch_matmul(%0, %1) : (tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) -> tensor<64x968x1280xf16>
%3 = hal.tensor.export %2 "output0" : tensor<64x968x1280xf16> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%cst = arith.constant 0.000000e+00 : f16
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<64x968x1280xf16>
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<64x1280x1280xf16>
%2 = tensor.empty() : tensor<64x968x1280xf16>
%3 = linalg.fill ins(%cst : f16) outs(%2 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%4 = linalg.batch_matmul ins(%0, %1 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%3 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%5 = hal.tensor.export %4 "output0" : tensor<64x968x1280xf16> -> !hal.buffer_view
util.return %5 : !hal.buffer_view
}
// -----// IR Dump After Inliner (inline) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mfma_layout<F16_16x16x16_F32>, #iree_gpu.mfma_layout<F16_32x32x8_F32>], target_arch = "gfx942", ukernels = "none"}>
#device_target_rocm = #hal.device.target<"rocm", {legacy_sync}, [#executable_target_rocm_hsaco_fb]>
module attributes {hal.device.targets = [#device_target_rocm]} {
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%cst = arith.constant 0.000000e+00 : f16
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<64x968x1280xf16>
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<64x1280x1280xf16>
%2 = tensor.empty() : tensor<64x968x1280xf16>
%3 = linalg.fill ins(%cst : f16) outs(%2 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%4 = linalg.batch_matmul ins(%0, %1 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%3 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%5 = hal.tensor.export %4 "output0" : tensor<64x968x1280xf16> -> !hal.buffer_view
util.return %5 : !hal.buffer_view
}
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%cst = arith.constant 0.000000e+00 : f16
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<64x968x1280xf16>
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<64x1280x1280xf16>
%2 = tensor.empty() : tensor<64x968x1280xf16>
%3 = linalg.fill ins(%cst : f16) outs(%2 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%4 = linalg.batch_matmul ins(%0, %1 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%3 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%5 = hal.tensor.export %4 "output0" : tensor<64x968x1280xf16> -> !hal.buffer_view
util.return %5 : !hal.buffer_view
}
// -----// IR Dump After CSE (cse) //----- //
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%cst = arith.constant 0.000000e+00 : f16
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<64x968x1280xf16>
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<64x1280x1280xf16>
%2 = tensor.empty() : tensor<64x968x1280xf16>
%3 = linalg.fill ins(%cst : f16) outs(%2 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%4 = linalg.batch_matmul ins(%0, %1 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%3 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%5 = hal.tensor.export %4 "output0" : tensor<64x968x1280xf16> -> !hal.buffer_view
util.return %5 : !hal.buffer_view
}
// -----// IR Dump After SymbolDCE (symbol-dce) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mfma_layout<F16_16x16x16_F32>, #iree_gpu.mfma_layout<F16_32x32x8_F32>], target_arch = "gfx942", ukernels = "none"}>
#device_target_rocm = #hal.device.target<"rocm", {legacy_sync}, [#executable_target_rocm_hsaco_fb]>
module attributes {hal.device.targets = [#device_target_rocm]} {
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%cst = arith.constant 0.000000e+00 : f16
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<64x968x1280xf16>
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<64x1280x1280xf16>
%2 = tensor.empty() : tensor<64x968x1280xf16>
%3 = linalg.fill ins(%cst : f16) outs(%2 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%4 = linalg.batch_matmul ins(%0, %1 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%3 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%5 = hal.tensor.export %4 "output0" : tensor<64x968x1280xf16> -> !hal.buffer_view
util.return %5 : !hal.buffer_view
}
}
// -----// IR Dump After DemoteF64ToF32 (iree-util-demote-f64-to-f32) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mfma_layout<F16_16x16x16_F32>, #iree_gpu.mfma_layout<F16_32x32x8_F32>], target_arch = "gfx942", ukernels = "none"}>
#device_target_rocm = #hal.device.target<"rocm", {legacy_sync}, [#executable_target_rocm_hsaco_fb]>
module attributes {hal.device.targets = [#device_target_rocm]} {
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%cst = arith.constant 0.000000e+00 : f16
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<64x968x1280xf16>
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<64x1280x1280xf16>
%2 = tensor.empty() : tensor<64x968x1280xf16>
%3 = linalg.fill ins(%cst : f16) outs(%2 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%4 = linalg.batch_matmul ins(%0, %1 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%3 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%5 = hal.tensor.export %4 "output0" : tensor<64x968x1280xf16> -> !hal.buffer_view
util.return %5 : !hal.buffer_view
}
}
// -----// IR Dump After RemoveZeroExtentTensors (iree-global-opt-remove-zero-extent-tensors) //----- //
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%cst = arith.constant 0.000000e+00 : f16
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<64x968x1280xf16>
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<64x1280x1280xf16>
%2 = tensor.empty() : tensor<64x968x1280xf16>
%3 = linalg.fill ins(%cst : f16) outs(%2 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%4 = linalg.batch_matmul ins(%0, %1 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%3 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%5 = hal.tensor.export %4 "output0" : tensor<64x968x1280xf16> -> !hal.buffer_view
util.return %5 : !hal.buffer_view
}
// -----// IR Dump After DetachElementwiseFromNamedOps (iree-global-opt-detach-elementwise-from-named-ops) //----- //
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%cst = arith.constant 0.000000e+00 : f16
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<64x968x1280xf16>
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<64x1280x1280xf16>
%2 = tensor.empty() : tensor<64x968x1280xf16>
%3 = linalg.fill ins(%cst : f16) outs(%2 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%4 = linalg.batch_matmul ins(%0, %1 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%3 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%5 = hal.tensor.export %4 "output0" : tensor<64x968x1280xf16> -> !hal.buffer_view
util.return %5 : !hal.buffer_view
}
// -----// IR Dump After LinalgNamedOpConversionPass (linalg-named-op-conversion) //----- //
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%cst = arith.constant 0.000000e+00 : f16
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<64x968x1280xf16>
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<64x1280x1280xf16>
%2 = tensor.empty() : tensor<64x968x1280xf16>
%3 = linalg.fill ins(%cst : f16) outs(%2 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%4 = linalg.batch_matmul ins(%0, %1 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%3 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%5 = hal.tensor.export %4 "output0" : tensor<64x968x1280xf16> -> !hal.buffer_view
util.return %5 : !hal.buffer_view
}
// -----// IR Dump After Convert1X1FilterConv2DToMatmul (iree-global-opt-convert-1x1-filter-conv2d-to-matmul) //----- //
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%cst = arith.constant 0.000000e+00 : f16
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<64x968x1280xf16>
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<64x1280x1280xf16>
%2 = tensor.empty() : tensor<64x968x1280xf16>
%3 = linalg.fill ins(%cst : f16) outs(%2 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%4 = linalg.batch_matmul ins(%0, %1 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%3 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%5 = hal.tensor.export %4 "output0" : tensor<64x968x1280xf16> -> !hal.buffer_view
util.return %5 : !hal.buffer_view
}
// -----// IR Dump After EraseUnusedLinalgOperands (iree-global-opt-erase-unused-linalg-operands) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mfma_layout<F16_16x16x16_F32>, #iree_gpu.mfma_layout<F16_32x32x8_F32>], target_arch = "gfx942", ukernels = "none"}>
#device_target_rocm = #hal.device.target<"rocm", {legacy_sync}, [#executable_target_rocm_hsaco_fb]>
module attributes {hal.device.targets = [#device_target_rocm]} {
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%cst = arith.constant 0.000000e+00 : f16
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<64x968x1280xf16>
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<64x1280x1280xf16>
%2 = tensor.empty() : tensor<64x968x1280xf16>
%3 = linalg.fill ins(%cst : f16) outs(%2 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%4 = linalg.batch_matmul ins(%0, %1 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%3 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%5 = hal.tensor.export %4 "output0" : tensor<64x968x1280xf16> -> !hal.buffer_view
util.return %5 : !hal.buffer_view
}
}
// -----// IR Dump After ExpandTensorShapes (iree-global-opt-expand-tensor-shapes) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mfma_layout<F16_16x16x16_F32>, #iree_gpu.mfma_layout<F16_32x32x8_F32>], target_arch = "gfx942", ukernels = "none"}>
#device_target_rocm = #hal.device.target<"rocm", {legacy_sync}, [#executable_target_rocm_hsaco_fb]>
module attributes {hal.device.targets = [#device_target_rocm]} {
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%cst = arith.constant 0.000000e+00 : f16
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<64x968x1280xf16>
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<64x1280x1280xf16>
%2 = tensor.empty() : tensor<64x968x1280xf16>
%3 = linalg.fill ins(%cst : f16) outs(%2 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%4 = linalg.batch_matmul ins(%0, %1 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%3 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%5 = hal.tensor.export %4 "output0" : tensor<64x968x1280xf16> -> !hal.buffer_view
util.return %5 : !hal.buffer_view
}
}
// -----// IR Dump After ConvertElementwiseToLinalgPass (convert-elementwise-to-linalg) //----- //
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%cst = arith.constant 0.000000e+00 : f16
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<64x968x1280xf16>
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<64x1280x1280xf16>
%2 = tensor.empty() : tensor<64x968x1280xf16>
%3 = linalg.fill ins(%cst : f16) outs(%2 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%4 = linalg.batch_matmul ins(%0, %1 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%3 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%5 = hal.tensor.export %4 "output0" : tensor<64x968x1280xf16> -> !hal.buffer_view
util.return %5 : !hal.buffer_view
}
// -----// IR Dump After RaiseSpecialOps (iree-global-opt-raise-special-ops) //----- //
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%cst = arith.constant 0.000000e+00 : f16
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<64x968x1280xf16>
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<64x1280x1280xf16>
%2 = tensor.empty() : tensor<64x968x1280xf16>
%3 = linalg.fill ins(%cst : f16) outs(%2 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%4 = linalg.batch_matmul ins(%0, %1 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%3 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%5 = hal.tensor.export %4 "output0" : tensor<64x968x1280xf16> -> !hal.buffer_view
util.return %5 : !hal.buffer_view
}
// -----// IR Dump After DecomposeConcat (iree-global-opt-decompose-concat) //----- //
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%cst = arith.constant 0.000000e+00 : f16
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<64x968x1280xf16>
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<64x1280x1280xf16>
%2 = tensor.empty() : tensor<64x968x1280xf16>
%3 = linalg.fill ins(%cst : f16) outs(%2 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%4 = linalg.batch_matmul ins(%0, %1 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%3 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%5 = hal.tensor.export %4 "output0" : tensor<64x968x1280xf16> -> !hal.buffer_view
util.return %5 : !hal.buffer_view
}
// -----// IR Dump After GeneralizeLinalgNamedOps (iree-global-opt-generalize-linalg-named-ops) //----- //
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%cst = arith.constant 0.000000e+00 : f16
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<64x968x1280xf16>
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<64x1280x1280xf16>
%2 = tensor.empty() : tensor<64x968x1280xf16>
%3 = linalg.fill ins(%cst : f16) outs(%2 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%4 = linalg.batch_matmul ins(%0, %1 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%3 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%5 = hal.tensor.export %4 "output0" : tensor<64x968x1280xf16> -> !hal.buffer_view
util.return %5 : !hal.buffer_view
}
// -----// IR Dump After FoldUnitExtentDims (iree-flow-fold-unit-extent-dims) //----- //
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%cst = arith.constant 0.000000e+00 : f16
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<64x968x1280xf16>
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<64x1280x1280xf16>
%2 = tensor.empty() : tensor<64x968x1280xf16>
%3 = linalg.fill ins(%cst : f16) outs(%2 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%4 = linalg.batch_matmul ins(%0, %1 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%3 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%5 = hal.tensor.export %4 "output0" : tensor<64x968x1280xf16> -> !hal.buffer_view
util.return %5 : !hal.buffer_view
}
// -----// IR Dump After FuseDequantizationMatmul (iree-global-opt-fuse-dequantization-matmul) //----- //
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%cst = arith.constant 0.000000e+00 : f16
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<64x968x1280xf16>
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<64x1280x1280xf16>
%2 = tensor.empty() : tensor<64x968x1280xf16>
%3 = linalg.fill ins(%cst : f16) outs(%2 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%4 = linalg.batch_matmul ins(%0, %1 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%3 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%5 = hal.tensor.export %4 "output0" : tensor<64x968x1280xf16> -> !hal.buffer_view
util.return %5 : !hal.buffer_view
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%cst = arith.constant 0.000000e+00 : f16
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<64x968x1280xf16>
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<64x1280x1280xf16>
%2 = tensor.empty() : tensor<64x968x1280xf16>
%3 = linalg.fill ins(%cst : f16) outs(%2 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%4 = linalg.batch_matmul ins(%0, %1 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%3 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%5 = hal.tensor.export %4 "output0" : tensor<64x968x1280xf16> -> !hal.buffer_view
util.return %5 : !hal.buffer_view
}
// -----// IR Dump After CSE (cse) //----- //
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%cst = arith.constant 0.000000e+00 : f16
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<64x968x1280xf16>
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<64x1280x1280xf16>
%2 = tensor.empty() : tensor<64x968x1280xf16>
%3 = linalg.fill ins(%cst : f16) outs(%2 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%4 = linalg.batch_matmul ins(%0, %1 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%3 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%5 = hal.tensor.export %4 "output0" : tensor<64x968x1280xf16> -> !hal.buffer_view
util.return %5 : !hal.buffer_view
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%cst = arith.constant 0.000000e+00 : f16
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<64x968x1280xf16>
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<64x1280x1280xf16>
%2 = tensor.empty() : tensor<64x968x1280xf16>
%3 = linalg.fill ins(%cst : f16) outs(%2 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%4 = linalg.batch_matmul ins(%0, %1 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%3 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%5 = hal.tensor.export %4 "output0" : tensor<64x968x1280xf16> -> !hal.buffer_view
util.return %5 : !hal.buffer_view
}
// -----// IR Dump After CSE (cse) //----- //
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%cst = arith.constant 0.000000e+00 : f16
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<64x968x1280xf16>
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<64x1280x1280xf16>
%2 = tensor.empty() : tensor<64x968x1280xf16>
%3 = linalg.fill ins(%cst : f16) outs(%2 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%4 = linalg.batch_matmul ins(%0, %1 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%3 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%5 = hal.tensor.export %4 "output0" : tensor<64x968x1280xf16> -> !hal.buffer_view
util.return %5 : !hal.buffer_view
}
// -----// IR Dump After FuseHorizontalContractions (iree-global-opt-fuse-horizontal-contractions) //----- //
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%cst = arith.constant 0.000000e+00 : f16
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<64x968x1280xf16>
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<64x1280x1280xf16>
%2 = tensor.empty() : tensor<64x968x1280xf16>
%3 = linalg.fill ins(%cst : f16) outs(%2 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%4 = linalg.batch_matmul ins(%0, %1 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%3 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%5 = hal.tensor.export %4 "output0" : tensor<64x968x1280xf16> -> !hal.buffer_view
util.return %5 : !hal.buffer_view
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%cst = arith.constant 0.000000e+00 : f16
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<64x968x1280xf16>
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<64x1280x1280xf16>
%2 = tensor.empty() : tensor<64x968x1280xf16>
%3 = linalg.fill ins(%cst : f16) outs(%2 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%4 = linalg.batch_matmul ins(%0, %1 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%3 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%5 = hal.tensor.export %4 "output0" : tensor<64x968x1280xf16> -> !hal.buffer_view
util.return %5 : !hal.buffer_view
}
// -----// IR Dump After CSE (cse) //----- //
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%cst = arith.constant 0.000000e+00 : f16
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<64x968x1280xf16>
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<64x1280x1280xf16>
%2 = tensor.empty() : tensor<64x968x1280xf16>
%3 = linalg.fill ins(%cst : f16) outs(%2 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%4 = linalg.batch_matmul ins(%0, %1 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%3 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%5 = hal.tensor.export %4 "output0" : tensor<64x968x1280xf16> -> !hal.buffer_view
util.return %5 : !hal.buffer_view
}
// -----// IR Dump After SetEncoding (iree-global-opt-set-encoding) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mfma_layout<F16_16x16x16_F32>, #iree_gpu.mfma_layout<F16_32x32x8_F32>], target_arch = "gfx942", ukernels = "none"}>
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)>
#map2 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>
#map3 = affine_map<()[s0, s1] -> (-s1 + (s1 ceildiv s0) * s0)>
#map4 = affine_map<()[s0] -> ((64 ceildiv s0) * s0)>
#map5 = affine_map<()[s0] -> ((968 ceildiv s0) * s0)>
#map6 = affine_map<()[s0] -> ((1280 ceildiv s0) * s0)>
#device_target_rocm = #hal.device.target<"rocm", {legacy_sync}, [#executable_target_rocm_hsaco_fb]>
module attributes {hal.device.targets = [#device_target_rocm]} {
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%c1280 = arith.constant 1280 : index
%c968 = arith.constant 968 : index
%c64 = arith.constant 64 : index
%cst = arith.constant 0.000000e+00 : f16
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<64x968x1280xf16>
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<64x1280x1280xf16>
%2:3 = iree_linalg_ext.upper_bound_tile_size tensor<64x968x1280xf16, #iree_linalg_ext.encoding<role = LHS, element_types = [f16, f16, f16], user_indexing_maps = [#map, #map1, #map2]>> -> index, index, index
%3 = affine.apply #map3()[%2#0, %c64]
%4 = affine.apply #map3()[%2#1, %c968]
%5 = affine.apply #map3()[%2#2, %c1280]
%padded = tensor.pad %0 low[0, 0, 0] high[%3, %4, %5] {
^bb0(%arg2: index, %arg3: index, %arg4: index):
tensor.yield %cst : f16
} : tensor<64x968x1280xf16> to tensor<?x?x?xf16>
%6 = iree_linalg_ext.set_encoding %padded : tensor<?x?x?xf16> -> tensor<?x?x?xf16, #iree_linalg_ext.encoding<role = LHS, element_types = [f16, f16, f16], original_type = tensor<64x968x1280xf16>, user_indexing_maps = [#map, #map1, #map2]>>
%7:3 = iree_linalg_ext.upper_bound_tile_size tensor<64x1280x1280xf16, #iree_linalg_ext.encoding<role = RHS, element_types = [f16, f16, f16], user_indexing_maps = [#map, #map1, #map2]>> -> index, index, index
%8 = affine.apply #map3()[%7#0, %c64]
%9 = affine.apply #map3()[%7#1, %c1280]
%10 = affine.apply #map3()[%7#2, %c1280]
%padded_0 = tensor.pad %1 low[0, 0, 0] high[%8, %9, %10] {
^bb0(%arg2: index, %arg3: index, %arg4: index):
tensor.yield %cst : f16
} : tensor<64x1280x1280xf16> to tensor<?x?x?xf16>
%11 = iree_linalg_ext.set_encoding %padded_0 : tensor<?x?x?xf16> -> tensor<?x?x?xf16, #iree_linalg_ext.encoding<role = RHS, element_types = [f16, f16, f16], original_type = tensor<64x1280x1280xf16>, user_indexing_maps = [#map, #map1, #map2]>>
%12:3 = iree_linalg_ext.upper_bound_tile_size tensor<64x968x1280xf16, #iree_linalg_ext.encoding<role = RESULT, element_types = [f16, f16, f16], user_indexing_maps = [#map, #map1, #map2]>> -> index, index, index
%13 = affine.apply #map4()[%12#0]
%14 = affine.apply #map5()[%12#1]
%15 = affine.apply #map6()[%12#2]
%16 = tensor.empty(%13, %14, %15) : tensor<?x?x?xf16, #iree_linalg_ext.encoding<role = RESULT, element_types = [f16, f16, f16], original_type = tensor<64x968x1280xf16>, user_indexing_maps = [#map, #map1, #map2]>>
%17 = linalg.fill ins(%cst : f16) outs(%16 : tensor<?x?x?xf16, #iree_linalg_ext.encoding<role = RESULT, element_types = [f16, f16, f16], original_type = tensor<64x968x1280xf16>, user_indexing_maps = [#map, #map1, #map2]>>) -> tensor<?x?x?xf16, #iree_linalg_ext.encoding<role = RESULT, element_types = [f16, f16, f16], original_type = tensor<64x968x1280xf16>, user_indexing_maps = [#map, #map1, #map2]>>
%18 = linalg.batch_matmul ins(%6, %11 : tensor<?x?x?xf16, #iree_linalg_ext.encoding<role = LHS, element_types = [f16, f16, f16], original_type = tensor<64x968x1280xf16>, user_indexing_maps = [#map, #map1, #map2]>>, tensor<?x?x?xf16, #iree_linalg_ext.encoding<role = RHS, element_types = [f16, f16, f16], original_type = tensor<64x1280x1280xf16>, user_indexing_maps = [#map, #map1, #map2]>>) outs(%17 : tensor<?x?x?xf16, #iree_linalg_ext.encoding<role = RESULT, element_types = [f16, f16, f16], original_type = tensor<64x968x1280xf16>, user_indexing_maps = [#map, #map1, #map2]>>) -> tensor<?x?x?xf16, #iree_linalg_ext.encoding<role = RESULT, element_types = [f16, f16, f16], original_type = tensor<64x968x1280xf16>, user_indexing_maps = [#map, #map1, #map2]>>
%19 = iree_linalg_ext.unset_encoding %18 : tensor<?x?x?xf16, #iree_linalg_ext.encoding<role = RESULT, element_types = [f16, f16, f16], original_type = tensor<64x968x1280xf16>, user_indexing_maps = [#map, #map1, #map2]>> -> tensor<?x?x?xf16>
%extracted_slice = tensor.extract_slice %19[0, 0, 0] [64, 968, 1280] [1, 1, 1] : tensor<?x?x?xf16> to tensor<64x968x1280xf16>
%20 = hal.tensor.export %extracted_slice "output0" : tensor<64x968x1280xf16> -> !hal.buffer_view
util.return %20 : !hal.buffer_view
}
}
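// Note: #map3()[s0, s1] above computes the high padding that rounds a dim of
// size s1 up to a multiple of a tile size s0, i.e. ceildiv(s1, s0) * s0 - s1.
// As a hand-worked check, assuming a hypothetical tile size s0 = 16 for the
// M dim s1 = 968: ceildiv(968, 16) = 61 and 61 * 16 = 976, so 8 rows of high
// padding. #map4..#map6 compute the corresponding rounded-up sizes for the
// static dims 64, 968, and 1280.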
// -----// IR Dump After MaterializeEncodingIntoNop (iree-codegen-materialize-encoding-into-nop) //----- //
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%c1280 = arith.constant 1280 : index
%c968 = arith.constant 968 : index
%c64 = arith.constant 64 : index
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f16
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<64x968x1280xf16>
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<64x1280x1280xf16>
%padded = tensor.pad %0 low[0, 0, 0] high[%c0, %c0, %c0] {
^bb0(%arg2: index, %arg3: index, %arg4: index):
tensor.yield %cst : f16
} : tensor<64x968x1280xf16> to tensor<?x?x?xf16>
%padded_0 = tensor.pad %1 low[0, 0, 0] high[%c0, %c0, %c0] {
^bb0(%arg2: index, %arg3: index, %arg4: index):
tensor.yield %cst : f16
} : tensor<64x1280x1280xf16> to tensor<?x?x?xf16>
%2 = tensor.empty(%c64, %c968, %c1280) : tensor<?x?x?xf16>
%3 = linalg.fill ins(%cst : f16) outs(%2 : tensor<?x?x?xf16>) -> tensor<?x?x?xf16>
%4 = linalg.batch_matmul ins(%padded, %padded_0 : tensor<?x?x?xf16>, tensor<?x?x?xf16>) outs(%3 : tensor<?x?x?xf16>) -> tensor<?x?x?xf16>
%extracted_slice = tensor.extract_slice %4[0, 0, 0] [64, 968, 1280] [1, 1, 1] : tensor<?x?x?xf16> to tensor<64x968x1280xf16>
%5 = hal.tensor.export %extracted_slice "output0" : tensor<64x968x1280xf16> -> !hal.buffer_view
util.return %5 : !hal.buffer_view
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%cst = arith.constant 0.000000e+00 : f16
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<64x968x1280xf16>
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<64x1280x1280xf16>
%2 = tensor.empty() : tensor<64x968x1280xf16>
%3 = linalg.fill ins(%cst : f16) outs(%2 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%4 = linalg.batch_matmul ins(%0, %1 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%3 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%5 = hal.tensor.export %4 "output0" : tensor<64x968x1280xf16> -> !hal.buffer_view
util.return %5 : !hal.buffer_view
}
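// Note: iree-codegen-materialize-encoding-into-nop discards the encodings:
// set_encoding/unset_encoding disappear and the upper_bound_tile_size queries
// resolve so that the tensor.pad ops carry all-zero high padding. The
// canonicalizer dump above then folds the zero-extent pads and the
// extract_slice away, restoring the original static 64x968x1280 batch_matmul.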
// -----// IR Dump After MaterializeHomogeneousEncodings (iree-global-opt-materialize-homogeneous-encodings) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mfma_layout<F16_16x16x16_F32>, #iree_gpu.mfma_layout<F16_32x32x8_F32>], target_arch = "gfx942", ukernels = "none"}>
#device_target_rocm = #hal.device.target<"rocm", {legacy_sync}, [#executable_target_rocm_hsaco_fb]>
module attributes {hal.device.targets = [#device_target_rocm]} {
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%cst = arith.constant 0.000000e+00 : f16
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<64x968x1280xf16>
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<64x1280x1280xf16>
%2 = tensor.empty() : tensor<64x968x1280xf16>
%3 = linalg.fill ins(%cst : f16) outs(%2 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%4 = linalg.batch_matmul ins(%0, %1 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%3 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%5 = hal.tensor.export %4 "output0" : tensor<64x968x1280xf16> -> !hal.buffer_view
util.return %5 : !hal.buffer_view
}
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mfma_layout<F16_16x16x16_F32>, #iree_gpu.mfma_layout<F16_32x32x8_F32>], target_arch = "gfx942", ukernels = "none"}>
#device_target_rocm = #hal.device.target<"rocm", {legacy_sync}, [#executable_target_rocm_hsaco_fb]>
module attributes {hal.device.targets = [#device_target_rocm]} {
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%cst = arith.constant 0.000000e+00 : f16
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<64x968x1280xf16>
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<64x1280x1280xf16>
%2 = tensor.empty() : tensor<64x968x1280xf16>
%3 = linalg.fill ins(%cst : f16) outs(%2 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%4 = linalg.batch_matmul ins(%0, %1 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%3 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%5 = hal.tensor.export %4 "output0" : tensor<64x968x1280xf16> -> !hal.buffer_view
util.return %5 : !hal.buffer_view
}
}
// -----// IR Dump After CSE (cse) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mfma_layout<F16_16x16x16_F32>, #iree_gpu.mfma_layout<F16_32x32x8_F32>], target_arch = "gfx942", ukernels = "none"}>
#device_target_rocm = #hal.device.target<"rocm", {legacy_sync}, [#executable_target_rocm_hsaco_fb]>
module attributes {hal.device.targets = [#device_target_rocm]} {
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%cst = arith.constant 0.000000e+00 : f16
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<64x968x1280xf16>
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<64x1280x1280xf16>
%2 = tensor.empty() : tensor<64x968x1280xf16>
%3 = linalg.fill ins(%cst : f16) outs(%2 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%4 = linalg.batch_matmul ins(%0, %1 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%3 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%5 = hal.tensor.export %4 "output0" : tensor<64x968x1280xf16> -> !hal.buffer_view
util.return %5 : !hal.buffer_view
}
}
// -----// IR Dump After SimplifyPackUnpack (iree-global-opt-simplify-pack-unpack) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mfma_layout<F16_16x16x16_F32>, #iree_gpu.mfma_layout<F16_32x32x8_F32>], target_arch = "gfx942", ukernels = "none"}>
#device_target_rocm = #hal.device.target<"rocm", {legacy_sync}, [#executable_target_rocm_hsaco_fb]>
module attributes {hal.device.targets = [#device_target_rocm]} {
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%cst = arith.constant 0.000000e+00 : f16
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<64x968x1280xf16>
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<64x1280x1280xf16>
%2 = tensor.empty() : tensor<64x968x1280xf16>
%3 = linalg.fill ins(%cst : f16) outs(%2 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%4 = linalg.batch_matmul ins(%0, %1 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%3 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%5 = hal.tensor.export %4 "output0" : tensor<64x968x1280xf16> -> !hal.buffer_view
util.return %5 : !hal.buffer_view
}
}
// -----// IR Dump After GeneralizeLinalgNamedOps (iree-global-opt-generalize-linalg-named-ops) //----- //
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%cst = arith.constant 0.000000e+00 : f16
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<64x968x1280xf16>
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<64x1280x1280xf16>
%2 = tensor.empty() : tensor<64x968x1280xf16>
%3 = linalg.fill ins(%cst : f16) outs(%2 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%4 = linalg.batch_matmul ins(%0, %1 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%3 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%5 = hal.tensor.export %4 "output0" : tensor<64x968x1280xf16> -> !hal.buffer_view
util.return %5 : !hal.buffer_view
}
// -----// IR Dump After GlobalLoopInvariantCodeMotion (iree-global-opt-loop-invariant-code-motion) //----- //
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%cst = arith.constant 0.000000e+00 : f16
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<64x968x1280xf16>
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<64x1280x1280xf16>
%2 = tensor.empty() : tensor<64x968x1280xf16>
%3 = linalg.fill ins(%cst : f16) outs(%2 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%4 = linalg.batch_matmul ins(%0, %1 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%3 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%5 = hal.tensor.export %4 "output0" : tensor<64x968x1280xf16> -> !hal.buffer_view
util.return %5 : !hal.buffer_view
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%cst = arith.constant 0.000000e+00 : f16
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<64x968x1280xf16>
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<64x1280x1280xf16>
%2 = tensor.empty() : tensor<64x968x1280xf16>
%3 = linalg.fill ins(%cst : f16) outs(%2 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%4 = linalg.batch_matmul ins(%0, %1 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%3 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%5 = hal.tensor.export %4 "output0" : tensor<64x968x1280xf16> -> !hal.buffer_view
util.return %5 : !hal.buffer_view
}
// -----// IR Dump After CSE (cse) //----- //
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%cst = arith.constant 0.000000e+00 : f16
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<64x968x1280xf16>
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<64x1280x1280xf16>
%2 = tensor.empty() : tensor<64x968x1280xf16>
%3 = linalg.fill ins(%cst : f16) outs(%2 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%4 = linalg.batch_matmul ins(%0, %1 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%3 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%5 = hal.tensor.export %4 "output0" : tensor<64x968x1280xf16> -> !hal.buffer_view
util.return %5 : !hal.buffer_view
}
// -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- //
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%cst = arith.constant 0.000000e+00 : f16
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<64x968x1280xf16>
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<64x1280x1280xf16>
%2 = tensor.empty() : tensor<64x968x1280xf16>
%3 = linalg.fill ins(%cst : f16) outs(%2 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%4 = linalg.batch_matmul ins(%0, %1 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%3 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%5 = hal.tensor.export %4 "output0" : tensor<64x968x1280xf16> -> !hal.buffer_view
util.return %5 : !hal.buffer_view
}
// -----// IR Dump After ApplyPatterns (iree-util-apply-patterns) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mfma_layout<F16_16x16x16_F32>, #iree_gpu.mfma_layout<F16_32x32x8_F32>], target_arch = "gfx942", ukernels = "none"}>
#device_target_rocm = #hal.device.target<"rocm", {legacy_sync}, [#executable_target_rocm_hsaco_fb]>
module attributes {hal.device.targets = [#device_target_rocm], iree.fixedpoint.iteration = 0 : index} {
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%cst = arith.constant 0.000000e+00 : f16
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<64x968x1280xf16>
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<64x1280x1280xf16>
%2 = tensor.empty() : tensor<64x968x1280xf16>
%3 = linalg.fill ins(%cst : f16) outs(%2 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%4 = linalg.batch_matmul ins(%0, %1 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%3 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%5 = hal.tensor.export %4 "output0" : tensor<64x968x1280xf16> -> !hal.buffer_view
util.return %5 : !hal.buffer_view
}
}
// -----// IR Dump After FoldGlobals (iree-util-fold-globals) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mfma_layout<F16_16x16x16_F32>, #iree_gpu.mfma_layout<F16_32x32x8_F32>], target_arch = "gfx942", ukernels = "none"}>
#device_target_rocm = #hal.device.target<"rocm", {legacy_sync}, [#executable_target_rocm_hsaco_fb]>
module attributes {hal.device.targets = [#device_target_rocm], iree.fixedpoint.iteration = 0 : index} {
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%cst = arith.constant 0.000000e+00 : f16
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<64x968x1280xf16>
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<64x1280x1280xf16>
%2 = tensor.empty() : tensor<64x968x1280xf16>
%3 = linalg.fill ins(%cst : f16) outs(%2 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%4 = linalg.batch_matmul ins(%0, %1 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%3 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%5 = hal.tensor.export %4 "output0" : tensor<64x968x1280xf16> -> !hal.buffer_view
util.return %5 : !hal.buffer_view
}
}
// -----// IR Dump After IPO (iree-util-ipo) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mfma_layout<F16_16x16x16_F32>, #iree_gpu.mfma_layout<F16_32x32x8_F32>], target_arch = "gfx942", ukernels = "none"}>
#device_target_rocm = #hal.device.target<"rocm", {legacy_sync}, [#executable_target_rocm_hsaco_fb]>
module attributes {hal.device.targets = [#device_target_rocm], iree.fixedpoint.iteration = 0 : index} {
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%cst = arith.constant 0.000000e+00 : f16
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<64x968x1280xf16>
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<64x1280x1280xf16>
%2 = tensor.empty() : tensor<64x968x1280xf16>
%3 = linalg.fill ins(%cst : f16) outs(%2 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%4 = linalg.batch_matmul ins(%0, %1 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%3 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%5 = hal.tensor.export %4 "output0" : tensor<64x968x1280xf16> -> !hal.buffer_view
util.return %5 : !hal.buffer_view
}
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mfma_layout<F16_16x16x16_F32>, #iree_gpu.mfma_layout<F16_32x32x8_F32>], target_arch = "gfx942", ukernels = "none"}>
#device_target_rocm = #hal.device.target<"rocm", {legacy_sync}, [#executable_target_rocm_hsaco_fb]>
module attributes {hal.device.targets = [#device_target_rocm], iree.fixedpoint.iteration = 0 : index} {
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%cst = arith.constant 0.000000e+00 : f16
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<64x968x1280xf16>
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<64x1280x1280xf16>
%2 = tensor.empty() : tensor<64x968x1280xf16>
%3 = linalg.fill ins(%cst : f16) outs(%2 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%4 = linalg.batch_matmul ins(%0, %1 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%3 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%5 = hal.tensor.export %4 "output0" : tensor<64x968x1280xf16> -> !hal.buffer_view
util.return %5 : !hal.buffer_view
}
}
// -----// IR Dump After CSE (cse) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mfma_layout<F16_16x16x16_F32>, #iree_gpu.mfma_layout<F16_32x32x8_F32>], target_arch = "gfx942", ukernels = "none"}>
#device_target_rocm = #hal.device.target<"rocm", {legacy_sync}, [#executable_target_rocm_hsaco_fb]>
module attributes {hal.device.targets = [#device_target_rocm], iree.fixedpoint.iteration = 0 : index} {
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%cst = arith.constant 0.000000e+00 : f16
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<64x968x1280xf16>
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<64x1280x1280xf16>
%2 = tensor.empty() : tensor<64x968x1280xf16>
%3 = linalg.fill ins(%cst : f16) outs(%2 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%4 = linalg.batch_matmul ins(%0, %1 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%3 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%5 = hal.tensor.export %4 "output0" : tensor<64x968x1280xf16> -> !hal.buffer_view
util.return %5 : !hal.buffer_view
}
}
// -----// IR Dump After HoistIntoGlobals (iree-util-hoist-into-globals) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mfma_layout<F16_16x16x16_F32>, #iree_gpu.mfma_layout<F16_32x32x8_F32>], target_arch = "gfx942", ukernels = "none"}>
#device_target_rocm = #hal.device.target<"rocm", {legacy_sync}, [#executable_target_rocm_hsaco_fb]>
module attributes {hal.device.targets = [#device_target_rocm], iree.fixedpoint.iteration = 0 : index} {
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%cst = arith.constant 0.000000e+00 : f16
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<64x968x1280xf16>
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<64x1280x1280xf16>
%2 = tensor.empty() : tensor<64x968x1280xf16>
%3 = linalg.fill ins(%cst : f16) outs(%2 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%4 = linalg.batch_matmul ins(%0, %1 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%3 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%5 = hal.tensor.export %4 "output0" : tensor<64x968x1280xf16> -> !hal.buffer_view
util.return %5 : !hal.buffer_view
}
}
// -----// IR Dump After JitGlobals (iree-consteval-jit-globals) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mfma_layout<F16_16x16x16_F32>, #iree_gpu.mfma_layout<F16_32x32x8_F32>], target_arch = "gfx942", ukernels = "none"}>
#device_target_rocm = #hal.device.target<"rocm", {legacy_sync}, [#executable_target_rocm_hsaco_fb]>
module attributes {hal.device.targets = [#device_target_rocm], iree.fixedpoint.iteration = 0 : index} {
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%cst = arith.constant 0.000000e+00 : f16
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<64x968x1280xf16>
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<64x1280x1280xf16>
%2 = tensor.empty() : tensor<64x968x1280xf16>
%3 = linalg.fill ins(%cst : f16) outs(%2 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%4 = linalg.batch_matmul ins(%0, %1 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%3 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%5 = hal.tensor.export %4 "output0" : tensor<64x968x1280xf16> -> !hal.buffer_view
util.return %5 : !hal.buffer_view
}
}
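// Note: HoistIntoGlobals did not create any util.global ops above (there are no hoistable
// constant expressions in this module), so JitGlobals has nothing to evaluate at compile
// time and this dump is unchanged.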
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%cst = arith.constant 0.000000e+00 : f16
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<64x968x1280xf16>
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<64x1280x1280xf16>
%2 = tensor.empty() : tensor<64x968x1280xf16>
%3 = linalg.fill ins(%cst : f16) outs(%2 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%4 = linalg.batch_matmul ins(%0, %1 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%3 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%5 = hal.tensor.export %4 "output0" : tensor<64x968x1280xf16> -> !hal.buffer_view
util.return %5 : !hal.buffer_view
}
// -----// IR Dump After CSE (cse) //----- //
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%cst = arith.constant 0.000000e+00 : f16
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<64x968x1280xf16>
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<64x1280x1280xf16>
%2 = tensor.empty() : tensor<64x968x1280xf16>
%3 = linalg.fill ins(%cst : f16) outs(%2 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%4 = linalg.batch_matmul ins(%0, %1 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%3 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%5 = hal.tensor.export %4 "output0" : tensor<64x968x1280xf16> -> !hal.buffer_view
util.return %5 : !hal.buffer_view
}
// -----// IR Dump After FixedPointIterator (iree-util-fixed-point-iterator) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mfma_layout<F16_16x16x16_F32>, #iree_gpu.mfma_layout<F16_32x32x8_F32>], target_arch = "gfx942", ukernels = "none"}>
#device_target_rocm = #hal.device.target<"rocm", {legacy_sync}, [#executable_target_rocm_hsaco_fb]>
module attributes {hal.device.targets = [#device_target_rocm]} {
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%cst = arith.constant 0.000000e+00 : f16
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<64x968x1280xf16>
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<64x1280x1280xf16>
%2 = tensor.empty() : tensor<64x968x1280xf16>
%3 = linalg.fill ins(%cst : f16) outs(%2 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%4 = linalg.batch_matmul ins(%0, %1 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%3 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%5 = hal.tensor.export %4 "output0" : tensor<64x968x1280xf16> -> !hal.buffer_view
util.return %5 : !hal.buffer_view
}
}
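// Note: the iree.fixedpoint.iteration = 0 module attribute carried through the preceding
// dumps is gone here, which suggests the fixed-point driver converged after a single
// iteration with no further IR changes.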
// -----// IR Dump After RaiseSpecialOps (iree-global-opt-raise-special-ops) //----- //
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%cst = arith.constant 0.000000e+00 : f16
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<64x968x1280xf16>
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<64x1280x1280xf16>
%2 = tensor.empty() : tensor<64x968x1280xf16>
%3 = linalg.fill ins(%cst : f16) outs(%2 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%4 = linalg.batch_matmul ins(%0, %1 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%3 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%5 = hal.tensor.export %4 "output0" : tensor<64x968x1280xf16> -> !hal.buffer_view
util.return %5 : !hal.buffer_view
}
// -----// IR Dump After VerifyInputLegality (iree-verify-input-legality) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mfma_layout<F16_16x16x16_F32>, #iree_gpu.mfma_layout<F16_32x32x8_F32>], target_arch = "gfx942", ukernels = "none"}>
#device_target_rocm = #hal.device.target<"rocm", {legacy_sync}, [#executable_target_rocm_hsaco_fb]>
module attributes {hal.device.targets = [#device_target_rocm]} {
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%cst = arith.constant 0.000000e+00 : f16
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<64x968x1280xf16>
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<64x1280x1280xf16>
%2 = tensor.empty() : tensor<64x968x1280xf16>
%3 = linalg.fill ins(%cst : f16) outs(%2 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%4 = linalg.batch_matmul ins(%0, %1 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%3 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%5 = hal.tensor.export %4 "output0" : tensor<64x968x1280xf16> -> !hal.buffer_view
util.return %5 : !hal.buffer_view
}
}
// -----// IR Dump After InjectTensorTracing (iree-flow-inject-tensor-tracing) //----- //
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%cst = arith.constant 0.000000e+00 : f16
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<64x968x1280xf16>
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<64x1280x1280xf16>
%2 = tensor.empty() : tensor<64x968x1280xf16>
%3 = linalg.fill ins(%cst : f16) outs(%2 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%4 = linalg.batch_matmul ins(%0, %1 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%3 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%5 = hal.tensor.export %4 "output0" : tensor<64x968x1280xf16> -> !hal.buffer_view
util.return %5 : !hal.buffer_view
}
// -----// IR Dump After TensorPadToTensorInsertSlice (iree-flow-tensor-pad-to-tensor-insert-slice) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mfma_layout<F16_16x16x16_F32>, #iree_gpu.mfma_layout<F16_32x32x8_F32>], target_arch = "gfx942", ukernels = "none"}>
#device_target_rocm = #hal.device.target<"rocm", {legacy_sync}, [#executable_target_rocm_hsaco_fb]>
module attributes {hal.device.targets = [#device_target_rocm]} {
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%cst = arith.constant 0.000000e+00 : f16
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<64x968x1280xf16>
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<64x1280x1280xf16>
%2 = tensor.empty() : tensor<64x968x1280xf16>
%3 = linalg.fill ins(%cst : f16) outs(%2 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%4 = linalg.batch_matmul ins(%0, %1 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%3 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%5 = hal.tensor.export %4 "output0" : tensor<64x968x1280xf16> -> !hal.buffer_view
util.return %5 : !hal.buffer_view
}
}
// -----// IR Dump After InterchangeGenericOps (iree-flow-interchange-generic-ops) //----- //
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%cst = arith.constant 0.000000e+00 : f16
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<64x968x1280xf16>
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<64x1280x1280xf16>
%2 = tensor.empty() : tensor<64x968x1280xf16>
%3 = linalg.fill ins(%cst : f16) outs(%2 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%4 = linalg.batch_matmul ins(%0, %1 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%3 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%5 = hal.tensor.export %4 "output0" : tensor<64x968x1280xf16> -> !hal.buffer_view
util.return %5 : !hal.buffer_view
}
// -----// IR Dump After ResolveShapedTypeResultDims (resolve-shaped-type-result-dims) //----- //
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%cst = arith.constant 0.000000e+00 : f16
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<64x968x1280xf16>
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<64x1280x1280xf16>
%2 = tensor.empty() : tensor<64x968x1280xf16>
%3 = linalg.fill ins(%cst : f16) outs(%2 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%4 = linalg.batch_matmul ins(%0, %1 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%3 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%5 = hal.tensor.export %4 "output0" : tensor<64x968x1280xf16> -> !hal.buffer_view
util.return %5 : !hal.buffer_view
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%cst = arith.constant 0.000000e+00 : f16
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<64x968x1280xf16>
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<64x1280x1280xf16>
%2 = tensor.empty() : tensor<64x968x1280xf16>
%3 = linalg.fill ins(%cst : f16) outs(%2 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%4 = linalg.batch_matmul ins(%0, %1 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%3 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%5 = hal.tensor.export %4 "output0" : tensor<64x968x1280xf16> -> !hal.buffer_view
util.return %5 : !hal.buffer_view
}
// -----// IR Dump After CSE (cse) //----- //
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%cst = arith.constant 0.000000e+00 : f16
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<64x968x1280xf16>
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<64x1280x1280xf16>
%2 = tensor.empty() : tensor<64x968x1280xf16>
%3 = linalg.fill ins(%cst : f16) outs(%2 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%4 = linalg.batch_matmul ins(%0, %1 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%3 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%5 = hal.tensor.export %4 "output0" : tensor<64x968x1280xf16> -> !hal.buffer_view
util.return %5 : !hal.buffer_view
}
// -----// IR Dump After FusionOfTensorOps (iree-flow-fusion-of-tensor-ops) //----- //
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%cst = arith.constant 0.000000e+00 : f16
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<64x968x1280xf16>
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<64x1280x1280xf16>
%2 = tensor.empty() : tensor<64x968x1280xf16>
%3 = linalg.fill ins(%cst : f16) outs(%2 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%4 = linalg.batch_matmul ins(%0, %1 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%3 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%5 = hal.tensor.export %4 "output0" : tensor<64x968x1280xf16> -> !hal.buffer_view
util.return %5 : !hal.buffer_view
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%cst = arith.constant 0.000000e+00 : f16
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<64x968x1280xf16>
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<64x1280x1280xf16>
%2 = tensor.empty() : tensor<64x968x1280xf16>
%3 = linalg.fill ins(%cst : f16) outs(%2 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%4 = linalg.batch_matmul ins(%0, %1 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%3 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%5 = hal.tensor.export %4 "output0" : tensor<64x968x1280xf16> -> !hal.buffer_view
util.return %5 : !hal.buffer_view
}
// -----// IR Dump After CSE (cse) //----- //
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%cst = arith.constant 0.000000e+00 : f16
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<64x968x1280xf16>
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<64x1280x1280xf16>
%2 = tensor.empty() : tensor<64x968x1280xf16>
%3 = linalg.fill ins(%cst : f16) outs(%2 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%4 = linalg.batch_matmul ins(%0, %1 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%3 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%5 = hal.tensor.export %4 "output0" : tensor<64x968x1280xf16> -> !hal.buffer_view
util.return %5 : !hal.buffer_view
}
// -----// IR Dump After SplitReduction (iree-flow-split-reduction-ops) //----- //
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%cst = arith.constant 0.000000e+00 : f16
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<64x968x1280xf16>
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<64x1280x1280xf16>
%2 = tensor.empty() : tensor<64x968x1280xf16>
%3 = linalg.fill ins(%cst : f16) outs(%2 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%4 = linalg.batch_matmul ins(%0, %1 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%3 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%5 = hal.tensor.export %4 "output0" : tensor<64x968x1280xf16> -> !hal.buffer_view
util.return %5 : !hal.buffer_view
}
// -----// IR Dump After InterchangeGenericOps (iree-flow-interchange-generic-ops) //----- //
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%cst = arith.constant 0.000000e+00 : f16
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<64x968x1280xf16>
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<64x1280x1280xf16>
%2 = tensor.empty() : tensor<64x968x1280xf16>
%3 = linalg.fill ins(%cst : f16) outs(%2 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%4 = linalg.batch_matmul ins(%0, %1 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%3 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%5 = hal.tensor.export %4 "output0" : tensor<64x968x1280xf16> -> !hal.buffer_view
util.return %5 : !hal.buffer_view
}
// -----// IR Dump After InterchangeTransposeGenericOps (iree-flow-interchange-transpose-generic-ops) //----- //
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%cst = arith.constant 0.000000e+00 : f16
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<64x968x1280xf16>
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<64x1280x1280xf16>
%2 = tensor.empty() : tensor<64x968x1280xf16>
%3 = linalg.fill ins(%cst : f16) outs(%2 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%4 = linalg.batch_matmul ins(%0, %1 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%3 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%5 = hal.tensor.export %4 "output0" : tensor<64x968x1280xf16> -> !hal.buffer_view
util.return %5 : !hal.buffer_view
}
// -----// IR Dump After FormScalarDispatches (iree-flow-form-scalar-dispatches) //----- //
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%cst = arith.constant 0.000000e+00 : f16
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<64x968x1280xf16>
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<64x1280x1280xf16>
%2 = tensor.empty() : tensor<64x968x1280xf16>
%3 = linalg.fill ins(%cst : f16) outs(%2 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%4 = linalg.batch_matmul ins(%0, %1 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%3 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%5 = hal.tensor.export %4 "output0" : tensor<64x968x1280xf16> -> !hal.buffer_view
util.return %5 : !hal.buffer_view
}
// -----// IR Dump After FormDispatchRegions (iree-flow-form-dispatch-regions) //----- //
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%cst = arith.constant 0.000000e+00 : f16
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<64x968x1280xf16>
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<64x1280x1280xf16>
%2 = tensor.empty() : tensor<64x968x1280xf16>
%3 = linalg.fill ins(%cst : f16) outs(%2 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%4 = flow.dispatch.region -> (tensor<64x968x1280xf16>) {
%6 = linalg.batch_matmul ins(%0, %1 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%3 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
flow.return %6 : tensor<64x968x1280xf16>
}
%5 = hal.tensor.export %4 "output0" : tensor<64x968x1280xf16> -> !hal.buffer_view
util.return %5 : !hal.buffer_view
}
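// Note: FormDispatchRegions is the first pass in this section that changes the IR: the
// linalg.batch_matmul is wrapped in a flow.dispatch.region, while its tensor.empty and
// linalg.fill producers still live outside the region at this point.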
// -----// IR Dump After CloneProducersIntoDispatchRegions (iree-flow-clone-producers-into-dispatch-regions) //----- //
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%cst = arith.constant 0.000000e+00 : f16
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<64x968x1280xf16>
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<64x1280x1280xf16>
%2 = tensor.empty() : tensor<64x968x1280xf16>
%3 = linalg.fill ins(%cst : f16) outs(%2 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%4 = flow.dispatch.region -> (tensor<64x968x1280xf16>) {
%6 = tensor.empty() : tensor<64x968x1280xf16>
%cst_0 = arith.constant 0.000000e+00 : f16
%7 = linalg.fill ins(%cst_0 : f16) outs(%6 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%8 = linalg.batch_matmul ins(%0, %1 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%7 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
flow.return %8 : tensor<64x968x1280xf16>
}
%5 = hal.tensor.export %4 "output0" : tensor<64x968x1280xf16> -> !hal.buffer_view
util.return %5 : !hal.buffer_view
}
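// Note: CloneProducersIntoDispatchRegions has cloned the constant, tensor.empty, and
// linalg.fill into the region (%6, %cst_0, %7 above), making the region self-contained
// apart from its tensor operands; the now-redundant copies outside the region are cleaned
// up by later passes.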
// -----// IR Dump After CollapseDimensions (iree-flow-collapse-dimensions) //----- //
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%cst = arith.constant 0.000000e+00 : f16
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<64x968x1280xf16>
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<64x1280x1280xf16>
%2 = tensor.empty() : tensor<64x968x1280xf16>
%3 = linalg.fill ins(%cst : f16) outs(%2 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%4 = flow.dispatch.region -> (tensor<64x968x1280xf16>) {
%6 = tensor.empty() : tensor<64x968x1280xf16>
%cst_0 = arith.constant 0.000000e+00 : f16
%7 = linalg.fill ins(%cst_0 : f16) outs(%6 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%8 = linalg.batch_matmul ins(%0, %1 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%7 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
flow.return %8 : tensor<64x968x1280xf16>
}
%5 = hal.tensor.export %4 "output0" : tensor<64x968x1280xf16> -> !hal.buffer_view
util.return %5 : !hal.buffer_view
}
// -----// IR Dump After FormDispatchWorkgroups (iree-flow-form-dispatch-workgroups) //----- //
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<64x968x1280xf16>
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<64x1280x1280xf16>
%2 = flow.dispatch.workgroups(%0, %1) : (tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) -> tensor<64x968x1280xf16> =
(%arg2: !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>>, %arg3: !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>>, %arg4: !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>) {
%cst = arith.constant 0.000000e+00 : f16
%4 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>> -> tensor<64x968x1280xf16>
%5 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0], sizes = [64, 1280, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>> -> tensor<64x1280x1280xf16>
%6 = tensor.empty() : tensor<64x968x1280xf16>
%7 = linalg.fill ins(%cst : f16) outs(%6 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%8 = linalg.batch_matmul ins(%4, %5 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%7 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
flow.dispatch.tensor.store %8, %arg4, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : tensor<64x968x1280xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
flow.return
} count() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
flow.return %x, %y, %z : index, index, index
}
%3 = hal.tensor.export %2 "output0" : tensor<64x968x1280xf16> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
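// Note: FormDispatchWorkgroups rewrites the region into flow.dispatch.workgroups with an
// explicit !flow.dispatch.tensor ABI (tensor.load/store with offsets, sizes, and strides)
// plus a count() region. flow.dispatch.workgroup_count_from_slice leaves the workgroup
// count symbolic; it is expected to be materialized later during backend code generation
// once tile sizes are chosen.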
// -----// IR Dump After CaptureDispatchDynamicDims (iree-flow-capture-dispatch-dynamic-dims) //----- //
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<64x968x1280xf16>
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<64x1280x1280xf16>
%2 = flow.dispatch.workgroups(%0, %1) : (tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) -> tensor<64x968x1280xf16> =
(%arg2: !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>>, %arg3: !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>>, %arg4: !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>) {
%cst = arith.constant 0.000000e+00 : f16
%4 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>> -> tensor<64x968x1280xf16>
%5 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0], sizes = [64, 1280, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>> -> tensor<64x1280x1280xf16>
%6 = tensor.empty() : tensor<64x968x1280xf16>
%7 = linalg.fill ins(%cst : f16) outs(%6 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%8 = linalg.batch_matmul ins(%4, %5 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%7 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
flow.dispatch.tensor.store %8, %arg4, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : tensor<64x968x1280xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
flow.return
} count() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
flow.return %x, %y, %z : index, index, index
}
%3 = hal.tensor.export %2 "output0" : tensor<64x968x1280xf16> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<64x968x1280xf16>
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<64x1280x1280xf16>
%2 = flow.dispatch.workgroups(%0, %1) : (tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) -> tensor<64x968x1280xf16> =
(%arg2: !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>>, %arg3: !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>>, %arg4: !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>) {
%cst = arith.constant 0.000000e+00 : f16
%4 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>> -> tensor<64x968x1280xf16>
%5 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0], sizes = [64, 1280, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>> -> tensor<64x1280x1280xf16>
%6 = tensor.empty() : tensor<64x968x1280xf16>
%7 = linalg.fill ins(%cst : f16) outs(%6 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%8 = linalg.batch_matmul ins(%4, %5 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%7 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
flow.dispatch.tensor.store %8, %arg4, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : tensor<64x968x1280xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
flow.return
} count() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
flow.return %x, %y, %z : index, index, index
}
%3 = hal.tensor.export %2 "output0" : tensor<64x968x1280xf16> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
// -----// IR Dump After CSE (cse) //----- //
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<64x968x1280xf16>
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<64x1280x1280xf16>
%2 = flow.dispatch.workgroups(%0, %1) : (tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) -> tensor<64x968x1280xf16> =
(%arg2: !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>>, %arg3: !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>>, %arg4: !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>) {
%cst = arith.constant 0.000000e+00 : f16
%4 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>> -> tensor<64x968x1280xf16>
%5 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0], sizes = [64, 1280, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>> -> tensor<64x1280x1280xf16>
%6 = tensor.empty() : tensor<64x968x1280xf16>
%7 = linalg.fill ins(%cst : f16) outs(%6 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%8 = linalg.batch_matmul ins(%4, %5 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%7 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
flow.dispatch.tensor.store %8, %arg4, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : tensor<64x968x1280xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
flow.return
} count() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
flow.return %x, %y, %z : index, index, index
}
%3 = hal.tensor.export %2 "output0" : tensor<64x968x1280xf16> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
// -----// IR Dump After InitializeEmptyTensors (iree-flow-initialize-empty-tensors) //----- //
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<64x968x1280xf16>
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<64x1280x1280xf16>
%2 = flow.dispatch.workgroups(%0, %1) : (tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) -> tensor<64x968x1280xf16> =
(%arg2: !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>>, %arg3: !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>>, %arg4: !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>) {
%cst = arith.constant 0.000000e+00 : f16
%4 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>> -> tensor<64x968x1280xf16>
%5 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0], sizes = [64, 1280, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>> -> tensor<64x1280x1280xf16>
%6 = tensor.empty() : tensor<64x968x1280xf16>
%7 = linalg.fill ins(%cst : f16) outs(%6 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%8 = linalg.batch_matmul ins(%4, %5 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%7 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
flow.dispatch.tensor.store %8, %arg4, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : tensor<64x968x1280xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
flow.return
} count() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
flow.return %x, %y, %z : index, index, index
}
%3 = hal.tensor.export %2 "output0" : tensor<64x968x1280xf16> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
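// Note: InitializeEmptyTensors leaves the IR unchanged, likely because the only
// tensor.empty is already fully overwritten by the linalg.fill inside the dispatch, so
// there are no empty tensors escaping into the program that would need initialization.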
// -----// IR Dump After OutlineDispatchExterns (iree-flow-outline-dispatch-externs) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mfma_layout<F16_16x16x16_F32>, #iree_gpu.mfma_layout<F16_32x32x8_F32>], target_arch = "gfx942", ukernels = "none"}>
#device_target_rocm = #hal.device.target<"rocm", {legacy_sync}, [#executable_target_rocm_hsaco_fb]>
module attributes {hal.device.targets = [#device_target_rocm]} {
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<64x968x1280xf16>
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<64x1280x1280xf16>
%2 = flow.dispatch.workgroups(%0, %1) : (tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) -> tensor<64x968x1280xf16> =
(%arg2: !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>>, %arg3: !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>>, %arg4: !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>) {
%cst = arith.constant 0.000000e+00 : f16
%4 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>> -> tensor<64x968x1280xf16>
%5 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0], sizes = [64, 1280, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>> -> tensor<64x1280x1280xf16>
%6 = tensor.empty() : tensor<64x968x1280xf16>
%7 = linalg.fill ins(%cst : f16) outs(%6 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%8 = linalg.batch_matmul ins(%4, %5 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%7 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
flow.dispatch.tensor.store %8, %arg4, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : tensor<64x968x1280xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
flow.return
} count() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
flow.return %x, %y, %z : index, index, index
}
%3 = hal.tensor.export %2 "output0" : tensor<64x968x1280xf16> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
}
// -----// IR Dump After OutlineDispatchRegions (iree-flow-outline-dispatch-regions) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mfma_layout<F16_16x16x16_F32>, #iree_gpu.mfma_layout<F16_32x32x8_F32>], target_arch = "gfx942", ukernels = "none"}>
#device_target_rocm = #hal.device.target<"rocm", {legacy_sync}, [#executable_target_rocm_hsaco_fb]>
module attributes {hal.device.targets = [#device_target_rocm]} {
flow.executable private @batch_matmul_dispatch_0 {
flow.executable.export public @batch_matmul_dispatch_0 workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
flow.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @batch_matmul_dispatch_0(%arg0: !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>>, %arg1: !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>>, %arg2: !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>) {
%cst = arith.constant 0.000000e+00 : f16
%0 = flow.dispatch.tensor.load %arg0, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>> -> tensor<64x968x1280xf16>
%1 = flow.dispatch.tensor.load %arg1, offsets = [0, 0, 0], sizes = [64, 1280, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>> -> tensor<64x1280x1280xf16>
%2 = tensor.empty() : tensor<64x968x1280xf16>
%3 = linalg.fill ins(%cst : f16) outs(%2 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%4 = linalg.batch_matmul ins(%0, %1 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%3 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
flow.dispatch.tensor.store %4, %arg2, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : tensor<64x968x1280xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
return
}
}
}
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<64x968x1280xf16>
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<64x1280x1280xf16>
%2 = flow.dispatch @batch_matmul_dispatch_0::@batch_matmul_dispatch_0(%0, %1) : (tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) -> tensor<64x968x1280xf16>
%3 = hal.tensor.export %2 "output0" : tensor<64x968x1280xf16> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
}
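// Note: OutlineDispatchRegions hoists the workgroup body into the standalone
// flow.executable @batch_matmul_dispatch_0 with a flow.executable.export for the
// workgroup count, and the caller is reduced to a single flow.dispatch op between the
// hal.tensor.import and hal.tensor.export.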
// -----// IR Dump After AnnotateDispatches (iree-flow-annotate-dispatches) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mfma_layout<F16_16x16x16_F32>, #iree_gpu.mfma_layout<F16_32x32x8_F32>], target_arch = "gfx942", ukernels = "none"}>
#device_target_rocm = #hal.device.target<"rocm", {legacy_sync}, [#executable_target_rocm_hsaco_fb]>
module attributes {hal.device.targets = [#device_target_rocm]} {
flow.executable private @batch_matmul_dispatch_0 {
flow.executable.export public @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
flow.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%arg0: !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>>, %arg1: !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>>, %arg2: !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>) {
%cst = arith.constant 0.000000e+00 : f16
%0 = flow.dispatch.tensor.load %arg0, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>> -> tensor<64x968x1280xf16>
%1 = flow.dispatch.tensor.load %arg1, offsets = [0, 0, 0], sizes = [64, 1280, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>> -> tensor<64x1280x1280xf16>
%2 = tensor.empty() : tensor<64x968x1280xf16>
%3 = linalg.fill ins(%cst : f16) outs(%2 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%4 = linalg.batch_matmul ins(%0, %1 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%3 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
flow.dispatch.tensor.store %4, %arg2, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : tensor<64x968x1280xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
return
}
}
}
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<64x968x1280xf16>
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<64x1280x1280xf16>
%2 = flow.dispatch @batch_matmul_dispatch_0::@batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%0, %1) : (tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) -> tensor<64x968x1280xf16>
%3 = hal.tensor.export %2 "output0" : tensor<64x968x1280xf16> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
}
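// Note: AnnotateDispatches renames the export to encode the root op and problem size,
// batch_matmul_64x968x1280x1280_f16 (apparently the loop ranges B=64, M=968, N=1280,
// K=1280 with f16 operands), presumably to make later profiling and kernel dumps easier
// to attribute.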
// -----// IR Dump After StripDebugOps (iree-util-strip-debug-ops) //----- //
flow.executable private @batch_matmul_dispatch_0 {
flow.executable.export public @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
flow.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%arg0: !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>>, %arg1: !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>>, %arg2: !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>) {
%cst = arith.constant 0.000000e+00 : f16
%0 = flow.dispatch.tensor.load %arg0, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>> -> tensor<64x968x1280xf16>
%1 = flow.dispatch.tensor.load %arg1, offsets = [0, 0, 0], sizes = [64, 1280, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>> -> tensor<64x1280x1280xf16>
%2 = tensor.empty() : tensor<64x968x1280xf16>
%3 = linalg.fill ins(%cst : f16) outs(%2 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%4 = linalg.batch_matmul ins(%0, %1 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%3 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
flow.dispatch.tensor.store %4, %arg2, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : tensor<64x968x1280xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
return
}
}
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<64x968x1280xf16>
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<64x1280x1280xf16>
%2 = flow.dispatch @batch_matmul_dispatch_0::@batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%0, %1) : (tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) -> tensor<64x968x1280xf16>
%3 = hal.tensor.export %2 "output0" : tensor<64x968x1280xf16> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
// -----// IR Dump After DeduplicateExecutables (iree-flow-deduplicate-executables) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mfma_layout<F16_16x16x16_F32>, #iree_gpu.mfma_layout<F16_32x32x8_F32>], target_arch = "gfx942", ukernels = "none"}>
#device_target_rocm = #hal.device.target<"rocm", {legacy_sync}, [#executable_target_rocm_hsaco_fb]>
module attributes {hal.device.targets = [#device_target_rocm]} {
flow.executable private @batch_matmul_dispatch_0 {
flow.executable.export public @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
flow.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%arg0: !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>>, %arg1: !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>>, %arg2: !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>) {
%cst = arith.constant 0.000000e+00 : f16
%0 = flow.dispatch.tensor.load %arg0, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>> -> tensor<64x968x1280xf16>
%1 = flow.dispatch.tensor.load %arg1, offsets = [0, 0, 0], sizes = [64, 1280, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>> -> tensor<64x1280x1280xf16>
%2 = tensor.empty() : tensor<64x968x1280xf16>
%3 = linalg.fill ins(%cst : f16) outs(%2 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%4 = linalg.batch_matmul ins(%0, %1 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%3 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
flow.dispatch.tensor.store %4, %arg2, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : tensor<64x968x1280xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
return
}
}
}
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<64x968x1280xf16>
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<64x1280x1280xf16>
%2 = flow.dispatch @batch_matmul_dispatch_0::@batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%0, %1) : (tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) -> tensor<64x968x1280xf16>
%3 = hal.tensor.export %2 "output0" : tensor<64x968x1280xf16> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
}
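// Note: this dump is identical to the previous one; with a single executable in the
// module, DeduplicateExecutables has nothing to merge.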
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
flow.executable private @batch_matmul_dispatch_0 {
flow.executable.export public @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
flow.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%arg0: !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>>, %arg1: !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>>, %arg2: !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>) {
%cst = arith.constant 0.000000e+00 : f16
%0 = flow.dispatch.tensor.load %arg0, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>> -> tensor<64x968x1280xf16>
%1 = flow.dispatch.tensor.load %arg1, offsets = [0, 0, 0], sizes = [64, 1280, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>> -> tensor<64x1280x1280xf16>
%2 = tensor.empty() : tensor<64x968x1280xf16>
%3 = linalg.fill ins(%cst : f16) outs(%2 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%4 = linalg.batch_matmul ins(%0, %1 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%3 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
flow.dispatch.tensor.store %4, %arg2, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : tensor<64x968x1280xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
return
}
}
}
// -----// IR Dump After CSE (cse) //----- //
flow.executable private @batch_matmul_dispatch_0 {
flow.executable.export public @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
flow.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%arg0: !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>>, %arg1: !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>>, %arg2: !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>) {
%cst = arith.constant 0.000000e+00 : f16
%0 = flow.dispatch.tensor.load %arg0, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>> -> tensor<64x968x1280xf16>
%1 = flow.dispatch.tensor.load %arg1, offsets = [0, 0, 0], sizes = [64, 1280, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>> -> tensor<64x1280x1280xf16>
%2 = tensor.empty() : tensor<64x968x1280xf16>
%3 = linalg.fill ins(%cst : f16) outs(%2 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%4 = linalg.batch_matmul ins(%0, %1 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%3 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
flow.dispatch.tensor.store %4, %arg2, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : tensor<64x968x1280xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
return
}
}
}
// -----// IR Dump After InjectTensorTracing (iree-flow-inject-tensor-tracing) //----- //
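// NOTE: no tensors appear to be marked for tracing in this module, so the
// entry function below is unchanged.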
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<64x968x1280xf16>
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<64x1280x1280xf16>
%2 = flow.dispatch @batch_matmul_dispatch_0::@batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%0, %1) : (tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) -> tensor<64x968x1280xf16>
%3 = hal.tensor.export %2 "output0" : tensor<64x968x1280xf16> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
// -----// IR Dump After CleanupTensorShapes (iree-flow-cleanup-tensor-shapes) //----- //
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<64x968x1280xf16>
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<64x1280x1280xf16>
%2 = flow.dispatch @batch_matmul_dispatch_0::@batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%0, %1) : (tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) -> tensor<64x968x1280xf16>
%3 = hal.tensor.export %2 "output0" : tensor<64x968x1280xf16> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<64x968x1280xf16>
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<64x1280x1280xf16>
%2 = flow.dispatch @batch_matmul_dispatch_0::@batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%0, %1) : (tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) -> tensor<64x968x1280xf16>
%3 = hal.tensor.export %2 "output0" : tensor<64x968x1280xf16> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
// -----// IR Dump After CSE (cse) //----- //
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<64x968x1280xf16>
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<64x1280x1280xf16>
%2 = flow.dispatch @batch_matmul_dispatch_0::@batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%0, %1) : (tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) -> tensor<64x968x1280xf16>
%3 = hal.tensor.export %2 "output0" : tensor<64x968x1280xf16> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
// -----// IR Dump After SymbolDCE (symbol-dce) //----- //
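// NOTE: both symbols (@batch_matmul_dispatch_0 and the public @batch_matmul)
// are still referenced, so symbol-dce removes nothing; the full module,
// including the #hal.executable.target attributes, is simply reprinted.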
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mfma_layout<F16_16x16x16_F32>, #iree_gpu.mfma_layout<F16_32x32x8_F32>], target_arch = "gfx942", ukernels = "none"}>
#device_target_rocm = #hal.device.target<"rocm", {legacy_sync}, [#executable_target_rocm_hsaco_fb]>
module attributes {hal.device.targets = [#device_target_rocm]} {
flow.executable private @batch_matmul_dispatch_0 {
flow.executable.export public @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
flow.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%arg0: !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>>, %arg1: !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>>, %arg2: !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>) {
%cst = arith.constant 0.000000e+00 : f16
%0 = flow.dispatch.tensor.load %arg0, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>> -> tensor<64x968x1280xf16>
%1 = flow.dispatch.tensor.load %arg1, offsets = [0, 0, 0], sizes = [64, 1280, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>> -> tensor<64x1280x1280xf16>
%2 = tensor.empty() : tensor<64x968x1280xf16>
%3 = linalg.fill ins(%cst : f16) outs(%2 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%4 = linalg.batch_matmul ins(%0, %1 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%3 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
flow.dispatch.tensor.store %4, %arg2, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : tensor<64x968x1280xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
return
}
}
}
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<64x968x1280xf16>
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<64x1280x1280xf16>
%2 = flow.dispatch @batch_matmul_dispatch_0::@batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%0, %1) : (tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) -> tensor<64x968x1280xf16>
%3 = hal.tensor.export %2 "output0" : tensor<64x968x1280xf16> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
}
// -----// IR Dump After VerifyInputPass (iree-stream-verify-input) //----- //
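// NOTE: first pass of the Stream phase; verification only. The module below is
// unchanged from the SymbolDCE dump.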
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mfma_layout<F16_16x16x16_F32>, #iree_gpu.mfma_layout<F16_32x32x8_F32>], target_arch = "gfx942", ukernels = "none"}>
#device_target_rocm = #hal.device.target<"rocm", {legacy_sync}, [#executable_target_rocm_hsaco_fb]>
module attributes {hal.device.targets = [#device_target_rocm]} {
flow.executable private @batch_matmul_dispatch_0 {
flow.executable.export public @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
flow.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%arg0: !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>>, %arg1: !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>>, %arg2: !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>) {
%cst = arith.constant 0.000000e+00 : f16
%0 = flow.dispatch.tensor.load %arg0, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>> -> tensor<64x968x1280xf16>
%1 = flow.dispatch.tensor.load %arg1, offsets = [0, 0, 0], sizes = [64, 1280, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>> -> tensor<64x1280x1280xf16>
%2 = tensor.empty() : tensor<64x968x1280xf16>
%3 = linalg.fill ins(%cst : f16) outs(%2 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%4 = linalg.batch_matmul ins(%0, %1 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%3 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
flow.dispatch.tensor.store %4, %arg2, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : tensor<64x968x1280xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
return
}
}
}
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<64x968x1280xf16>
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<64x1280x1280xf16>
%2 = flow.dispatch @batch_matmul_dispatch_0::@batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%0, %1) : (tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) -> tensor<64x968x1280xf16>
%3 = hal.tensor.export %2 "output0" : tensor<64x968x1280xf16> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<64x968x1280xf16>
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<64x1280x1280xf16>
%2 = flow.dispatch @batch_matmul_dispatch_0::@batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%0, %1) : (tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) -> tensor<64x968x1280xf16>
%3 = hal.tensor.export %2 "output0" : tensor<64x968x1280xf16> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
// -----// IR Dump After CSE (cse) //----- //
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<64x968x1280xf16>
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<64x1280x1280xf16>
%2 = flow.dispatch @batch_matmul_dispatch_0::@batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%0, %1) : (tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) -> tensor<64x968x1280xf16>
%3 = hal.tensor.export %2 "output0" : tensor<64x968x1280xf16> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
// -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- //
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<64x968x1280xf16>
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<64x1280x1280xf16>
%2 = flow.dispatch @batch_matmul_dispatch_0::@batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%0, %1) : (tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) -> tensor<64x968x1280xf16>
%3 = hal.tensor.export %2 "output0" : tensor<64x968x1280xf16> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
// -----// IR Dump After ApplyPatterns (iree-util-apply-patterns) //----- //
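// NOTE: ApplyPatterns / FoldGlobals / FuseGlobals / IPO below appear to be
// IREE's standard util-dialect cleanup ladder (run twice here). With no
// util.global ops and a single dispatch in this module, each of those dumps is
// identical to this one.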
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mfma_layout<F16_16x16x16_F32>, #iree_gpu.mfma_layout<F16_32x32x8_F32>], target_arch = "gfx942", ukernels = "none"}>
#device_target_rocm = #hal.device.target<"rocm", {legacy_sync}, [#executable_target_rocm_hsaco_fb]>
module attributes {hal.device.targets = [#device_target_rocm]} {
flow.executable private @batch_matmul_dispatch_0 {
flow.executable.export public @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
flow.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%arg0: !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>>, %arg1: !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>>, %arg2: !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>) {
%cst = arith.constant 0.000000e+00 : f16
%0 = flow.dispatch.tensor.load %arg0, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>> -> tensor<64x968x1280xf16>
%1 = flow.dispatch.tensor.load %arg1, offsets = [0, 0, 0], sizes = [64, 1280, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>> -> tensor<64x1280x1280xf16>
%2 = tensor.empty() : tensor<64x968x1280xf16>
%3 = linalg.fill ins(%cst : f16) outs(%2 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%4 = linalg.batch_matmul ins(%0, %1 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%3 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
flow.dispatch.tensor.store %4, %arg2, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : tensor<64x968x1280xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
return
}
}
}
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<64x968x1280xf16>
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<64x1280x1280xf16>
%2 = flow.dispatch @batch_matmul_dispatch_0::@batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%0, %1) : (tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) -> tensor<64x968x1280xf16>
%3 = hal.tensor.export %2 "output0" : tensor<64x968x1280xf16> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
}
// -----// IR Dump After FoldGlobals (iree-util-fold-globals) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mfma_layout<F16_16x16x16_F32>, #iree_gpu.mfma_layout<F16_32x32x8_F32>], target_arch = "gfx942", ukernels = "none"}>
#device_target_rocm = #hal.device.target<"rocm", {legacy_sync}, [#executable_target_rocm_hsaco_fb]>
module attributes {hal.device.targets = [#device_target_rocm]} {
flow.executable private @batch_matmul_dispatch_0 {
flow.executable.export public @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
flow.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%arg0: !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>>, %arg1: !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>>, %arg2: !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>) {
%cst = arith.constant 0.000000e+00 : f16
%0 = flow.dispatch.tensor.load %arg0, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>> -> tensor<64x968x1280xf16>
%1 = flow.dispatch.tensor.load %arg1, offsets = [0, 0, 0], sizes = [64, 1280, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>> -> tensor<64x1280x1280xf16>
%2 = tensor.empty() : tensor<64x968x1280xf16>
%3 = linalg.fill ins(%cst : f16) outs(%2 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%4 = linalg.batch_matmul ins(%0, %1 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%3 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
flow.dispatch.tensor.store %4, %arg2, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : tensor<64x968x1280xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
return
}
}
}
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<64x968x1280xf16>
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<64x1280x1280xf16>
%2 = flow.dispatch @batch_matmul_dispatch_0::@batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%0, %1) : (tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) -> tensor<64x968x1280xf16>
%3 = hal.tensor.export %2 "output0" : tensor<64x968x1280xf16> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
}
// -----// IR Dump After FuseGlobals (iree-util-fuse-globals) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mfma_layout<F16_16x16x16_F32>, #iree_gpu.mfma_layout<F16_32x32x8_F32>], target_arch = "gfx942", ukernels = "none"}>
#device_target_rocm = #hal.device.target<"rocm", {legacy_sync}, [#executable_target_rocm_hsaco_fb]>
module attributes {hal.device.targets = [#device_target_rocm]} {
flow.executable private @batch_matmul_dispatch_0 {
flow.executable.export public @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
flow.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%arg0: !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>>, %arg1: !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>>, %arg2: !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>) {
%cst = arith.constant 0.000000e+00 : f16
%0 = flow.dispatch.tensor.load %arg0, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>> -> tensor<64x968x1280xf16>
%1 = flow.dispatch.tensor.load %arg1, offsets = [0, 0, 0], sizes = [64, 1280, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>> -> tensor<64x1280x1280xf16>
%2 = tensor.empty() : tensor<64x968x1280xf16>
%3 = linalg.fill ins(%cst : f16) outs(%2 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%4 = linalg.batch_matmul ins(%0, %1 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%3 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
flow.dispatch.tensor.store %4, %arg2, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : tensor<64x968x1280xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
return
}
}
}
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<64x968x1280xf16>
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<64x1280x1280xf16>
%2 = flow.dispatch @batch_matmul_dispatch_0::@batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%0, %1) : (tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) -> tensor<64x968x1280xf16>
%3 = hal.tensor.export %2 "output0" : tensor<64x968x1280xf16> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
}
// -----// IR Dump After IPO (iree-util-ipo) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mfma_layout<F16_16x16x16_F32>, #iree_gpu.mfma_layout<F16_32x32x8_F32>], target_arch = "gfx942", ukernels = "none"}>
#device_target_rocm = #hal.device.target<"rocm", {legacy_sync}, [#executable_target_rocm_hsaco_fb]>
module attributes {hal.device.targets = [#device_target_rocm]} {
flow.executable private @batch_matmul_dispatch_0 {
flow.executable.export public @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
flow.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%arg0: !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>>, %arg1: !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>>, %arg2: !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>) {
%cst = arith.constant 0.000000e+00 : f16
%0 = flow.dispatch.tensor.load %arg0, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>> -> tensor<64x968x1280xf16>
%1 = flow.dispatch.tensor.load %arg1, offsets = [0, 0, 0], sizes = [64, 1280, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>> -> tensor<64x1280x1280xf16>
%2 = tensor.empty() : tensor<64x968x1280xf16>
%3 = linalg.fill ins(%cst : f16) outs(%2 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%4 = linalg.batch_matmul ins(%0, %1 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%3 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
flow.dispatch.tensor.store %4, %arg2, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : tensor<64x968x1280xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
return
}
}
}
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<64x968x1280xf16>
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<64x1280x1280xf16>
%2 = flow.dispatch @batch_matmul_dispatch_0::@batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%0, %1) : (tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) -> tensor<64x968x1280xf16>
%3 = hal.tensor.export %2 "output0" : tensor<64x968x1280xf16> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
}
// -----// IR Dump After OutlineConstants (iree-util-outline-constants) //----- //
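// NOTE: the only constants in the module are scalars (the f16 zero and index
// values), so there is presumably nothing worth outlining into globals; the
// dump is unchanged.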
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mfma_layout<F16_16x16x16_F32>, #iree_gpu.mfma_layout<F16_32x32x8_F32>], target_arch = "gfx942", ukernels = "none"}>
#device_target_rocm = #hal.device.target<"rocm", {legacy_sync}, [#executable_target_rocm_hsaco_fb]>
module attributes {hal.device.targets = [#device_target_rocm]} {
flow.executable private @batch_matmul_dispatch_0 {
flow.executable.export public @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
flow.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%arg0: !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>>, %arg1: !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>>, %arg2: !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>) {
%cst = arith.constant 0.000000e+00 : f16
%0 = flow.dispatch.tensor.load %arg0, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>> -> tensor<64x968x1280xf16>
%1 = flow.dispatch.tensor.load %arg1, offsets = [0, 0, 0], sizes = [64, 1280, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>> -> tensor<64x1280x1280xf16>
%2 = tensor.empty() : tensor<64x968x1280xf16>
%3 = linalg.fill ins(%cst : f16) outs(%2 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%4 = linalg.batch_matmul ins(%0, %1 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%3 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
flow.dispatch.tensor.store %4, %arg2, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : tensor<64x968x1280xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
return
}
}
}
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<64x968x1280xf16>
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<64x1280x1280xf16>
%2 = flow.dispatch @batch_matmul_dispatch_0::@batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%0, %1) : (tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) -> tensor<64x968x1280xf16>
%3 = hal.tensor.export %2 "output0" : tensor<64x968x1280xf16> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<64x968x1280xf16>
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<64x1280x1280xf16>
%2 = flow.dispatch @batch_matmul_dispatch_0::@batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%0, %1) : (tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) -> tensor<64x968x1280xf16>
%3 = hal.tensor.export %2 "output0" : tensor<64x968x1280xf16> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
// -----// IR Dump After CSE (cse) //----- //
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<64x968x1280xf16>
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<64x1280x1280xf16>
%2 = flow.dispatch @batch_matmul_dispatch_0::@batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%0, %1) : (tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) -> tensor<64x968x1280xf16>
%3 = hal.tensor.export %2 "output0" : tensor<64x968x1280xf16> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
// -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- //
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<64x968x1280xf16>
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<64x1280x1280xf16>
%2 = flow.dispatch @batch_matmul_dispatch_0::@batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%0, %1) : (tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) -> tensor<64x968x1280xf16>
%3 = hal.tensor.export %2 "output0" : tensor<64x968x1280xf16> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
// -----// IR Dump After ApplyPatterns (iree-util-apply-patterns) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mfma_layout<F16_16x16x16_F32>, #iree_gpu.mfma_layout<F16_32x32x8_F32>], target_arch = "gfx942", ukernels = "none"}>
#device_target_rocm = #hal.device.target<"rocm", {legacy_sync}, [#executable_target_rocm_hsaco_fb]>
module attributes {hal.device.targets = [#device_target_rocm]} {
flow.executable private @batch_matmul_dispatch_0 {
flow.executable.export public @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
flow.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%arg0: !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>>, %arg1: !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>>, %arg2: !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>) {
%cst = arith.constant 0.000000e+00 : f16
%0 = flow.dispatch.tensor.load %arg0, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>> -> tensor<64x968x1280xf16>
%1 = flow.dispatch.tensor.load %arg1, offsets = [0, 0, 0], sizes = [64, 1280, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>> -> tensor<64x1280x1280xf16>
%2 = tensor.empty() : tensor<64x968x1280xf16>
%3 = linalg.fill ins(%cst : f16) outs(%2 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%4 = linalg.batch_matmul ins(%0, %1 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%3 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
flow.dispatch.tensor.store %4, %arg2, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : tensor<64x968x1280xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
return
}
}
}
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<64x968x1280xf16>
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<64x1280x1280xf16>
%2 = flow.dispatch @batch_matmul_dispatch_0::@batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%0, %1) : (tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) -> tensor<64x968x1280xf16>
%3 = hal.tensor.export %2 "output0" : tensor<64x968x1280xf16> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
}
// -----// IR Dump After FoldGlobals (iree-util-fold-globals) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mfma_layout<F16_16x16x16_F32>, #iree_gpu.mfma_layout<F16_32x32x8_F32>], target_arch = "gfx942", ukernels = "none"}>
#device_target_rocm = #hal.device.target<"rocm", {legacy_sync}, [#executable_target_rocm_hsaco_fb]>
module attributes {hal.device.targets = [#device_target_rocm]} {
flow.executable private @batch_matmul_dispatch_0 {
flow.executable.export public @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
flow.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%arg0: !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>>, %arg1: !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>>, %arg2: !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>) {
%cst = arith.constant 0.000000e+00 : f16
%0 = flow.dispatch.tensor.load %arg0, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>> -> tensor<64x968x1280xf16>
%1 = flow.dispatch.tensor.load %arg1, offsets = [0, 0, 0], sizes = [64, 1280, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>> -> tensor<64x1280x1280xf16>
%2 = tensor.empty() : tensor<64x968x1280xf16>
%3 = linalg.fill ins(%cst : f16) outs(%2 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%4 = linalg.batch_matmul ins(%0, %1 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%3 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
flow.dispatch.tensor.store %4, %arg2, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : tensor<64x968x1280xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
return
}
}
}
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<64x968x1280xf16>
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<64x1280x1280xf16>
%2 = flow.dispatch @batch_matmul_dispatch_0::@batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%0, %1) : (tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) -> tensor<64x968x1280xf16>
%3 = hal.tensor.export %2 "output0" : tensor<64x968x1280xf16> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
}
// -----// IR Dump After FuseGlobals (iree-util-fuse-globals) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mfma_layout<F16_16x16x16_F32>, #iree_gpu.mfma_layout<F16_32x32x8_F32>], target_arch = "gfx942", ukernels = "none"}>
#device_target_rocm = #hal.device.target<"rocm", {legacy_sync}, [#executable_target_rocm_hsaco_fb]>
module attributes {hal.device.targets = [#device_target_rocm]} {
flow.executable private @batch_matmul_dispatch_0 {
flow.executable.export public @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
flow.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%arg0: !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>>, %arg1: !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>>, %arg2: !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>) {
%cst = arith.constant 0.000000e+00 : f16
%0 = flow.dispatch.tensor.load %arg0, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>> -> tensor<64x968x1280xf16>
%1 = flow.dispatch.tensor.load %arg1, offsets = [0, 0, 0], sizes = [64, 1280, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>> -> tensor<64x1280x1280xf16>
%2 = tensor.empty() : tensor<64x968x1280xf16>
%3 = linalg.fill ins(%cst : f16) outs(%2 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%4 = linalg.batch_matmul ins(%0, %1 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%3 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
flow.dispatch.tensor.store %4, %arg2, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : tensor<64x968x1280xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
return
}
}
}
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<64x968x1280xf16>
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<64x1280x1280xf16>
%2 = flow.dispatch @batch_matmul_dispatch_0::@batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%0, %1) : (tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) -> tensor<64x968x1280xf16>
%3 = hal.tensor.export %2 "output0" : tensor<64x968x1280xf16> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
}
// -----// IR Dump After IPO (iree-util-ipo) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mfma_layout<F16_16x16x16_F32>, #iree_gpu.mfma_layout<F16_32x32x8_F32>], target_arch = "gfx942", ukernels = "none"}>
#device_target_rocm = #hal.device.target<"rocm", {legacy_sync}, [#executable_target_rocm_hsaco_fb]>
module attributes {hal.device.targets = [#device_target_rocm]} {
flow.executable private @batch_matmul_dispatch_0 {
flow.executable.export public @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
flow.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%arg0: !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>>, %arg1: !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>>, %arg2: !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>) {
%cst = arith.constant 0.000000e+00 : f16
%0 = flow.dispatch.tensor.load %arg0, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>> -> tensor<64x968x1280xf16>
%1 = flow.dispatch.tensor.load %arg1, offsets = [0, 0, 0], sizes = [64, 1280, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>> -> tensor<64x1280x1280xf16>
%2 = tensor.empty() : tensor<64x968x1280xf16>
%3 = linalg.fill ins(%cst : f16) outs(%2 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%4 = linalg.batch_matmul ins(%0, %1 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%3 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
flow.dispatch.tensor.store %4, %arg2, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : tensor<64x968x1280xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
return
}
}
}
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<64x968x1280xf16>
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<64x1280x1280xf16>
%2 = flow.dispatch @batch_matmul_dispatch_0::@batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%0, %1) : (tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) -> tensor<64x968x1280xf16>
%3 = hal.tensor.export %2 "output0" : tensor<64x968x1280xf16> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
}
// -----// IR Dump After ConvertToStreamPass (iree-stream-conversion) //----- //
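// NOTE: the first structural change in a while. Conversion to the stream
// dialect rewrites the module as visible below:
//   * flow.executable becomes stream.executable; the dispatch function now
//     takes opaque !stream.binding arguments and recovers its typed
//     !flow.dispatch.tensor views via stream.binding.subspan at offset %c0.
//   * each hal.tensor.import becomes a hal.buffer_view.assert (shape/dtype
//     check) plus a stream.tensor.import into !stream.resource<external>,
//     sized explicitly with stream.tensor.sizeof, followed by a
//     stream.async.transfer into the placeholder lifetime !stream.resource<*>.
//   * flow.dispatch becomes stream.async.dispatch with explicit resource
//     ranges of the form %res[%c0 to %size for %size].
//   * the result is transferred back to !stream.resource<external> and
//     exported with stream.tensor.export.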
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mfma_layout<F16_16x16x16_F32>, #iree_gpu.mfma_layout<F16_32x32x8_F32>], target_arch = "gfx942", ukernels = "none"}>
#device_target_rocm = #hal.device.target<"rocm", {legacy_sync}, [#executable_target_rocm_hsaco_fb]>
module attributes {hal.device.targets = [#device_target_rocm]} {
stream.executable private @batch_matmul_dispatch_0 {
stream.executable.export public @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
%cst = arith.constant 0.000000e+00 : f16
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>> -> tensor<64x968x1280xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [64, 1280, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>> -> tensor<64x1280x1280xf16>
%5 = tensor.empty() : tensor<64x968x1280xf16>
%6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%7 = linalg.batch_matmul ins(%3, %4 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%6 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : tensor<64x968x1280xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
return
}
}
}
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%element_type_f16 = hal.element_type<f16> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32
%c64 = arith.constant 64 : index
%c968 = arith.constant 968 : index
%c1280 = arith.constant 1280 : index
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c64, %c968, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%0 = stream.tensor.sizeof tensor<64x968x1280xf16> : index
%1 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<64x968x1280xf16> in !stream.resource<external>{%0}
%2 = stream.async.transfer %1 : !stream.resource<external>{%0} -> !stream.resource<*>{%0}
%element_type_f16_0 = hal.element_type<f16> : i32
%dense_row_major_1 = hal.encoding_type<dense_row_major> : i32
%c64_2 = arith.constant 64 : index
%c1280_3 = arith.constant 1280 : index
%c1280_4 = arith.constant 1280 : index
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c64_2, %c1280_3, %c1280_4]) type(%element_type_f16_0) encoding(%dense_row_major_1)
%3 = stream.tensor.sizeof tensor<64x1280x1280xf16> : index
%4 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<64x1280x1280xf16> in !stream.resource<external>{%3}
%5 = stream.async.transfer %4 : !stream.resource<external>{%3} -> !stream.resource<*>{%3}
%c0 = arith.constant 0 : index
%6 = stream.tensor.sizeof tensor<64x968x1280xf16> : index
%7 = stream.async.dispatch @batch_matmul_dispatch_0::@batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%2[%c0 to %0 for %0], %5[%c0 to %3 for %3]) : (!stream.resource<*>{%0}, !stream.resource<*>{%3}) -> !stream.resource<*>{%6}
%8 = stream.async.transfer %7 : !stream.resource<*>{%6} -> !stream.resource<external>{%6}
%9 = stream.tensor.export %8 : tensor<64x968x1280xf16> in !stream.resource<external>{%6} -> !hal.buffer_view
util.return %9 : !hal.buffer_view
}
}
// -----// IR Dump After VerifyLoweringToTensorsPass (iree-stream-verify-lowering-to-tensors) //----- //
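// NOTE: verification only; the module below matches the ConvertToStreamPass
// dump exactly.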
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mfma_layout<F16_16x16x16_F32>, #iree_gpu.mfma_layout<F16_32x32x8_F32>], target_arch = "gfx942", ukernels = "none"}>
#device_target_rocm = #hal.device.target<"rocm", {legacy_sync}, [#executable_target_rocm_hsaco_fb]>
module attributes {hal.device.targets = [#device_target_rocm]} {
stream.executable private @batch_matmul_dispatch_0 {
stream.executable.export public @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
%cst = arith.constant 0.000000e+00 : f16
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>> -> tensor<64x968x1280xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [64, 1280, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>> -> tensor<64x1280x1280xf16>
%5 = tensor.empty() : tensor<64x968x1280xf16>
%6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%7 = linalg.batch_matmul ins(%3, %4 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%6 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : tensor<64x968x1280xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
return
}
}
}
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%element_type_f16 = hal.element_type<f16> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32
%c64 = arith.constant 64 : index
%c968 = arith.constant 968 : index
%c1280 = arith.constant 1280 : index
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c64, %c968, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%0 = stream.tensor.sizeof tensor<64x968x1280xf16> : index
%1 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<64x968x1280xf16> in !stream.resource<external>{%0}
%2 = stream.async.transfer %1 : !stream.resource<external>{%0} -> !stream.resource<*>{%0}
%element_type_f16_0 = hal.element_type<f16> : i32
%dense_row_major_1 = hal.encoding_type<dense_row_major> : i32
%c64_2 = arith.constant 64 : index
%c1280_3 = arith.constant 1280 : index
%c1280_4 = arith.constant 1280 : index
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c64_2, %c1280_3, %c1280_4]) type(%element_type_f16_0) encoding(%dense_row_major_1)
%3 = stream.tensor.sizeof tensor<64x1280x1280xf16> : index
%4 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<64x1280x1280xf16> in !stream.resource<external>{%3}
%5 = stream.async.transfer %4 : !stream.resource<external>{%3} -> !stream.resource<*>{%3}
%c0 = arith.constant 0 : index
%6 = stream.tensor.sizeof tensor<64x968x1280xf16> : index
%7 = stream.async.dispatch @batch_matmul_dispatch_0::@batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%2[%c0 to %0 for %0], %5[%c0 to %3 for %3]) : (!stream.resource<*>{%0}, !stream.resource<*>{%3}) -> !stream.resource<*>{%6}
%8 = stream.async.transfer %7 : !stream.resource<*>{%6} -> !stream.resource<external>{%6}
%9 = stream.tensor.export %8 : tensor<64x968x1280xf16> in !stream.resource<external>{%6} -> !hal.buffer_view
util.return %9 : !hal.buffer_view
}
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
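// NOTE: canonicalization hoists the index constants to the top of the entry
// function and folds the duplicates (%c64_2, %c1280_3, %c1280_4 collapse into
// %c64/%c1280). The duplicated element-type/encoding constants for input1
// survive until the cse dump below.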
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%c0 = arith.constant 0 : index
%c1280 = arith.constant 1280 : index
%c968 = arith.constant 968 : index
%c64 = arith.constant 64 : index
%element_type_f16 = hal.element_type<f16> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c64, %c968, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%0 = stream.tensor.sizeof tensor<64x968x1280xf16> : index
%1 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<64x968x1280xf16> in !stream.resource<external>{%0}
%2 = stream.async.transfer %1 : !stream.resource<external>{%0} -> !stream.resource<*>{%0}
%element_type_f16_0 = hal.element_type<f16> : i32
%dense_row_major_1 = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c64, %c1280, %c1280]) type(%element_type_f16_0) encoding(%dense_row_major_1)
%3 = stream.tensor.sizeof tensor<64x1280x1280xf16> : index
%4 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<64x1280x1280xf16> in !stream.resource<external>{%3}
%5 = stream.async.transfer %4 : !stream.resource<external>{%3} -> !stream.resource<*>{%3}
%6 = stream.tensor.sizeof tensor<64x968x1280xf16> : index
%7 = stream.async.dispatch @batch_matmul_dispatch_0::@batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%2[%c0 to %0 for %0], %5[%c0 to %3 for %3]) : (!stream.resource<*>{%0}, !stream.resource<*>{%3}) -> !stream.resource<*>{%6}
%8 = stream.async.transfer %7 : !stream.resource<*>{%6} -> !stream.resource<external>{%6}
%9 = stream.tensor.export %8 : tensor<64x968x1280xf16> in !stream.resource<external>{%6} -> !hal.buffer_view
util.return %9 : !hal.buffer_view
}
// -----// IR Dump After CSE (cse) //----- //
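// NOTE: cse merges the repeated hal.element_type/hal.encoding_type constants
// and, more usefully, the two stream.tensor.sizeof tensor<64x968x1280xf16>
// values, so the dispatch result is now sized by %0 rather than a separate SSA
// value.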
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%c0 = arith.constant 0 : index
%c1280 = arith.constant 1280 : index
%c968 = arith.constant 968 : index
%c64 = arith.constant 64 : index
%element_type_f16 = hal.element_type<f16> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c64, %c968, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%0 = stream.tensor.sizeof tensor<64x968x1280xf16> : index
%1 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<64x968x1280xf16> in !stream.resource<external>{%0}
%2 = stream.async.transfer %1 : !stream.resource<external>{%0} -> !stream.resource<*>{%0}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c64, %c1280, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%3 = stream.tensor.sizeof tensor<64x1280x1280xf16> : index
%4 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<64x1280x1280xf16> in !stream.resource<external>{%3}
%5 = stream.async.transfer %4 : !stream.resource<external>{%3} -> !stream.resource<*>{%3}
%6 = stream.async.dispatch @batch_matmul_dispatch_0::@batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%2[%c0 to %0 for %0], %5[%c0 to %3 for %3]) : (!stream.resource<*>{%0}, !stream.resource<*>{%3}) -> !stream.resource<*>{%0}
%7 = stream.async.transfer %6 : !stream.resource<*>{%0} -> !stream.resource<external>{%0}
%8 = stream.tensor.export %7 : tensor<64x968x1280xf16> in !stream.resource<external>{%0} -> !hal.buffer_view
util.return %8 : !hal.buffer_view
}
// -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- //
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%c0 = arith.constant 0 : index
%c1280 = arith.constant 1280 : index
%c968 = arith.constant 968 : index
%c64 = arith.constant 64 : index
%element_type_f16 = hal.element_type<f16> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c64, %c968, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%0 = stream.tensor.sizeof tensor<64x968x1280xf16> : index
%1 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<64x968x1280xf16> in !stream.resource<external>{%0}
%2 = stream.async.transfer %1 : !stream.resource<external>{%0} -> !stream.resource<*>{%0}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c64, %c1280, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%3 = stream.tensor.sizeof tensor<64x1280x1280xf16> : index
%4 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<64x1280x1280xf16> in !stream.resource<external>{%3}
%5 = stream.async.transfer %4 : !stream.resource<external>{%3} -> !stream.resource<*>{%3}
%6 = stream.async.dispatch @batch_matmul_dispatch_0::@batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%2[%c0 to %0 for %0], %5[%c0 to %3 for %3]) : (!stream.resource<*>{%0}, !stream.resource<*>{%3}) -> !stream.resource<*>{%0}
%7 = stream.async.transfer %6 : !stream.resource<*>{%0} -> !stream.resource<external>{%0}
%8 = stream.tensor.export %7 : tensor<64x968x1280xf16> in !stream.resource<external>{%0} -> !hal.buffer_view
util.return %8 : !hal.buffer_view
}
// -----// IR Dump After ApplyPatterns (iree-util-apply-patterns) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mfma_layout<F16_16x16x16_F32>, #iree_gpu.mfma_layout<F16_32x32x8_F32>], target_arch = "gfx942", ukernels = "none"}>
#device_target_rocm = #hal.device.target<"rocm", {legacy_sync}, [#executable_target_rocm_hsaco_fb]>
module attributes {hal.device.targets = [#device_target_rocm]} {
stream.executable private @batch_matmul_dispatch_0 {
stream.executable.export public @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
%cst = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>> -> tensor<64x968x1280xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [64, 1280, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>> -> tensor<64x1280x1280xf16>
%5 = tensor.empty() : tensor<64x968x1280xf16>
%6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%7 = linalg.batch_matmul ins(%3, %4 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%6 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : tensor<64x968x1280xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
return
}
}
}
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%c0 = arith.constant 0 : index
%c1280 = arith.constant 1280 : index
%c968 = arith.constant 968 : index
%c64 = arith.constant 64 : index
%element_type_f16 = hal.element_type<f16> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c64, %c968, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%0 = stream.tensor.sizeof tensor<64x968x1280xf16> : index
%1 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<64x968x1280xf16> in !stream.resource<external>{%0}
%2 = stream.async.transfer %1 : !stream.resource<external>{%0} -> !stream.resource<*>{%0}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c64, %c1280, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%3 = stream.tensor.sizeof tensor<64x1280x1280xf16> : index
%4 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<64x1280x1280xf16> in !stream.resource<external>{%3}
%5 = stream.async.transfer %4 : !stream.resource<external>{%3} -> !stream.resource<*>{%3}
%6 = stream.async.dispatch @batch_matmul_dispatch_0::@batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%2[%c0 to %0 for %0], %5[%c0 to %3 for %3]) : (!stream.resource<*>{%0}, !stream.resource<*>{%3}) -> !stream.resource<*>{%0}
%7 = stream.async.transfer %6 : !stream.resource<*>{%0} -> !stream.resource<external>{%0}
%8 = stream.tensor.export %7 : tensor<64x968x1280xf16> in !stream.resource<external>{%0} -> !hal.buffer_view
util.return %8 : !hal.buffer_view
}
}
// -----// IR Dump After FoldGlobals (iree-util-fold-globals) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mfma_layout<F16_16x16x16_F32>, #iree_gpu.mfma_layout<F16_32x32x8_F32>], target_arch = "gfx942", ukernels = "none"}>
#device_target_rocm = #hal.device.target<"rocm", {legacy_sync}, [#executable_target_rocm_hsaco_fb]>
module attributes {hal.device.targets = [#device_target_rocm]} {
stream.executable private @batch_matmul_dispatch_0 {
stream.executable.export public @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
%cst = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>> -> tensor<64x968x1280xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [64, 1280, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>> -> tensor<64x1280x1280xf16>
%5 = tensor.empty() : tensor<64x968x1280xf16>
%6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%7 = linalg.batch_matmul ins(%3, %4 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%6 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : tensor<64x968x1280xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
return
}
}
}
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%c0 = arith.constant 0 : index
%c1280 = arith.constant 1280 : index
%c968 = arith.constant 968 : index
%c64 = arith.constant 64 : index
%element_type_f16 = hal.element_type<f16> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c64, %c968, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%0 = stream.tensor.sizeof tensor<64x968x1280xf16> : index
%1 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<64x968x1280xf16> in !stream.resource<external>{%0}
%2 = stream.async.transfer %1 : !stream.resource<external>{%0} -> !stream.resource<*>{%0}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c64, %c1280, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%3 = stream.tensor.sizeof tensor<64x1280x1280xf16> : index
%4 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<64x1280x1280xf16> in !stream.resource<external>{%3}
%5 = stream.async.transfer %4 : !stream.resource<external>{%3} -> !stream.resource<*>{%3}
%6 = stream.async.dispatch @batch_matmul_dispatch_0::@batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%2[%c0 to %0 for %0], %5[%c0 to %3 for %3]) : (!stream.resource<*>{%0}, !stream.resource<*>{%3}) -> !stream.resource<*>{%0}
%7 = stream.async.transfer %6 : !stream.resource<*>{%0} -> !stream.resource<external>{%0}
%8 = stream.tensor.export %7 : tensor<64x968x1280xf16> in !stream.resource<external>{%0} -> !hal.buffer_view
util.return %8 : !hal.buffer_view
}
}
// -----// IR Dump After FuseGlobals (iree-util-fuse-globals) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mfma_layout<F16_16x16x16_F32>, #iree_gpu.mfma_layout<F16_32x32x8_F32>], target_arch = "gfx942", ukernels = "none"}>
#device_target_rocm = #hal.device.target<"rocm", {legacy_sync}, [#executable_target_rocm_hsaco_fb]>
module attributes {hal.device.targets = [#device_target_rocm]} {
stream.executable private @batch_matmul_dispatch_0 {
stream.executable.export public @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
%cst = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>> -> tensor<64x968x1280xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [64, 1280, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>> -> tensor<64x1280x1280xf16>
%5 = tensor.empty() : tensor<64x968x1280xf16>
%6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%7 = linalg.batch_matmul ins(%3, %4 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%6 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : tensor<64x968x1280xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
return
}
}
}
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%c0 = arith.constant 0 : index
%c1280 = arith.constant 1280 : index
%c968 = arith.constant 968 : index
%c64 = arith.constant 64 : index
%element_type_f16 = hal.element_type<f16> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c64, %c968, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%0 = stream.tensor.sizeof tensor<64x968x1280xf16> : index
%1 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<64x968x1280xf16> in !stream.resource<external>{%0}
%2 = stream.async.transfer %1 : !stream.resource<external>{%0} -> !stream.resource<*>{%0}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c64, %c1280, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%3 = stream.tensor.sizeof tensor<64x1280x1280xf16> : index
%4 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<64x1280x1280xf16> in !stream.resource<external>{%3}
%5 = stream.async.transfer %4 : !stream.resource<external>{%3} -> !stream.resource<*>{%3}
%6 = stream.async.dispatch @batch_matmul_dispatch_0::@batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%2[%c0 to %0 for %0], %5[%c0 to %3 for %3]) : (!stream.resource<*>{%0}, !stream.resource<*>{%3}) -> !stream.resource<*>{%0}
%7 = stream.async.transfer %6 : !stream.resource<*>{%0} -> !stream.resource<external>{%0}
%8 = stream.tensor.export %7 : tensor<64x968x1280xf16> in !stream.resource<external>{%0} -> !hal.buffer_view
util.return %8 : !hal.buffer_view
}
}
// -----// IR Dump After IPO (iree-util-ipo) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mfma_layout<F16_16x16x16_F32>, #iree_gpu.mfma_layout<F16_32x32x8_F32>], target_arch = "gfx942", ukernels = "none"}>
#device_target_rocm = #hal.device.target<"rocm", {legacy_sync}, [#executable_target_rocm_hsaco_fb]>
module attributes {hal.device.targets = [#device_target_rocm]} {
stream.executable private @batch_matmul_dispatch_0 {
stream.executable.export public @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
%cst = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>> -> tensor<64x968x1280xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [64, 1280, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>> -> tensor<64x1280x1280xf16>
%5 = tensor.empty() : tensor<64x968x1280xf16>
%6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%7 = linalg.batch_matmul ins(%3, %4 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%6 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : tensor<64x968x1280xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
return
}
}
}
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%c0 = arith.constant 0 : index
%c1280 = arith.constant 1280 : index
%c968 = arith.constant 968 : index
%c64 = arith.constant 64 : index
%element_type_f16 = hal.element_type<f16> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c64, %c968, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%0 = stream.tensor.sizeof tensor<64x968x1280xf16> : index
%1 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<64x968x1280xf16> in !stream.resource<external>{%0}
%2 = stream.async.transfer %1 : !stream.resource<external>{%0} -> !stream.resource<*>{%0}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c64, %c1280, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%3 = stream.tensor.sizeof tensor<64x1280x1280xf16> : index
%4 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<64x1280x1280xf16> in !stream.resource<external>{%3}
%5 = stream.async.transfer %4 : !stream.resource<external>{%3} -> !stream.resource<*>{%3}
%6 = stream.async.dispatch @batch_matmul_dispatch_0::@batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%2[%c0 to %0 for %0], %5[%c0 to %3 for %3]) : (!stream.resource<*>{%0}, !stream.resource<*>{%3}) -> !stream.resource<*>{%0}
%7 = stream.async.transfer %6 : !stream.resource<*>{%0} -> !stream.resource<external>{%0}
%8 = stream.tensor.export %7 : tensor<64x968x1280xf16> in !stream.resource<external>{%0} -> !hal.buffer_view
util.return %8 : !hal.buffer_view
}
}
// -----// IR Dump After CombineInitializers (iree-util-combine-initializers) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mfma_layout<F16_16x16x16_F32>, #iree_gpu.mfma_layout<F16_32x32x8_F32>], target_arch = "gfx942", ukernels = "none"}>
#device_target_rocm = #hal.device.target<"rocm", {legacy_sync}, [#executable_target_rocm_hsaco_fb]>
module attributes {hal.device.targets = [#device_target_rocm]} {
stream.executable private @batch_matmul_dispatch_0 {
stream.executable.export public @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
%cst = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>> -> tensor<64x968x1280xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [64, 1280, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>> -> tensor<64x1280x1280xf16>
%5 = tensor.empty() : tensor<64x968x1280xf16>
%6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%7 = linalg.batch_matmul ins(%3, %4 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%6 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : tensor<64x968x1280xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
return
}
}
}
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%c0 = arith.constant 0 : index
%c1280 = arith.constant 1280 : index
%c968 = arith.constant 968 : index
%c64 = arith.constant 64 : index
%element_type_f16 = hal.element_type<f16> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c64, %c968, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%0 = stream.tensor.sizeof tensor<64x968x1280xf16> : index
%1 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<64x968x1280xf16> in !stream.resource<external>{%0}
%2 = stream.async.transfer %1 : !stream.resource<external>{%0} -> !stream.resource<*>{%0}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c64, %c1280, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%3 = stream.tensor.sizeof tensor<64x1280x1280xf16> : index
%4 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<64x1280x1280xf16> in !stream.resource<external>{%3}
%5 = stream.async.transfer %4 : !stream.resource<external>{%3} -> !stream.resource<*>{%3}
%6 = stream.async.dispatch @batch_matmul_dispatch_0::@batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%2[%c0 to %0 for %0], %5[%c0 to %3 for %3]) : (!stream.resource<*>{%0}, !stream.resource<*>{%3}) -> !stream.resource<*>{%0}
%7 = stream.async.transfer %6 : !stream.resource<*>{%0} -> !stream.resource<external>{%0}
%8 = stream.tensor.export %7 : tensor<64x968x1280xf16> in !stream.resource<external>{%0} -> !hal.buffer_view
util.return %8 : !hal.buffer_view
}
}
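// The ApplyPatterns, FoldGlobals, FuseGlobals, IPO, and CombineInitializers
// dumps above are identical to one another: with no globals, no initializers,
// and a single public function, these cleanup passes have nothing to rewrite
// in this module.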
// -----// IR Dump After EncodeDeviceTensorsPass (iree-stream-encode-device-tensors) //----- //
stream.executable private @batch_matmul_dispatch_0 {
stream.executable.export public @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
%cst = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>> -> tensor<64x968x1280xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [64, 1280, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>> -> tensor<64x1280x1280xf16>
%5 = tensor.empty() : tensor<64x968x1280xf16>
%6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%7 = linalg.batch_matmul ins(%3, %4 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%6 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : tensor<64x968x1280xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
return
}
}
}
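// EncodeDeviceTensorsPass re-prints only the stream.executable. No encoding
// ops are introduced, presumably because dense row-major f16 is already a
// directly storable layout for this target.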
// -----// IR Dump After EncodeHostTensorsPass (iree-stream-encode-host-tensors) //----- //
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%c209715200 = arith.constant 209715200 : index
%c158597120 = arith.constant 158597120 : index
%c0 = arith.constant 0 : index
%c1280 = arith.constant 1280 : index
%c968 = arith.constant 968 : index
%c64 = arith.constant 64 : index
%element_type_f16 = hal.element_type<f16> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c64, %c968, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120}
%1 = stream.async.transfer %0 : !stream.resource<external>{%c158597120} -> !stream.resource<*>{%c158597120}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c64, %c1280, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%2 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<64x1280x1280xf16> in !stream.resource<external>{%c209715200}
%3 = stream.async.transfer %2 : !stream.resource<external>{%c209715200} -> !stream.resource<*>{%c209715200}
%4 = stream.async.dispatch @batch_matmul_dispatch_0::@batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%1[%c0 to %c158597120 for %c158597120], %3[%c0 to %c209715200 for %c209715200]) : (!stream.resource<*>{%c158597120}, !stream.resource<*>{%c209715200}) -> !stream.resource<*>{%c158597120}
%5 = stream.async.transfer %4 : !stream.resource<*>{%c158597120} -> !stream.resource<external>{%c158597120}
%6 = stream.tensor.export %5 : tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120} -> !hal.buffer_view
util.return %6 : !hal.buffer_view
}
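// EncodeHostTensorsPass folds the symbolic stream.tensor.sizeof ops into byte
// constants. A quick check of the arithmetic (f16 = 2 bytes per element):
//   64 * 968  * 1280 * 2 = 158597120  (each 64x968x1280xf16 operand/result)
//   64 * 1280 * 1280 * 2 = 209715200  (the 64x1280x1280xf16 operand)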
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%c209715200 = arith.constant 209715200 : index
%c158597120 = arith.constant 158597120 : index
%c0 = arith.constant 0 : index
%c1280 = arith.constant 1280 : index
%c968 = arith.constant 968 : index
%c64 = arith.constant 64 : index
%element_type_f16 = hal.element_type<f16> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c64, %c968, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120}
%1 = stream.async.transfer %0 : !stream.resource<external>{%c158597120} -> !stream.resource<*>{%c158597120}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c64, %c1280, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%2 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<64x1280x1280xf16> in !stream.resource<external>{%c209715200}
%3 = stream.async.transfer %2 : !stream.resource<external>{%c209715200} -> !stream.resource<*>{%c209715200}
%4 = stream.async.dispatch @batch_matmul_dispatch_0::@batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%1[%c0 to %c158597120 for %c158597120], %3[%c0 to %c209715200 for %c209715200]) : (!stream.resource<*>{%c158597120}, !stream.resource<*>{%c209715200}) -> !stream.resource<*>{%c158597120}
%5 = stream.async.transfer %4 : !stream.resource<*>{%c158597120} -> !stream.resource<external>{%c158597120}
%6 = stream.tensor.export %5 : tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120} -> !hal.buffer_view
util.return %6 : !hal.buffer_view
}
// -----// IR Dump After CSE (cse) //----- //
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%c209715200 = arith.constant 209715200 : index
%c158597120 = arith.constant 158597120 : index
%c0 = arith.constant 0 : index
%c1280 = arith.constant 1280 : index
%c968 = arith.constant 968 : index
%c64 = arith.constant 64 : index
%element_type_f16 = hal.element_type<f16> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c64, %c968, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120}
%1 = stream.async.transfer %0 : !stream.resource<external>{%c158597120} -> !stream.resource<*>{%c158597120}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c64, %c1280, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%2 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<64x1280x1280xf16> in !stream.resource<external>{%c209715200}
%3 = stream.async.transfer %2 : !stream.resource<external>{%c209715200} -> !stream.resource<*>{%c209715200}
%4 = stream.async.dispatch @batch_matmul_dispatch_0::@batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%1[%c0 to %c158597120 for %c158597120], %3[%c0 to %c209715200 for %c209715200]) : (!stream.resource<*>{%c158597120}, !stream.resource<*>{%c209715200}) -> !stream.resource<*>{%c158597120}
%5 = stream.async.transfer %4 : !stream.resource<*>{%c158597120} -> !stream.resource<external>{%c158597120}
%6 = stream.tensor.export %5 : tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120} -> !hal.buffer_view
util.return %6 : !hal.buffer_view
}
// -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- //
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%c209715200 = arith.constant 209715200 : index
%c158597120 = arith.constant 158597120 : index
%c0 = arith.constant 0 : index
%c1280 = arith.constant 1280 : index
%c968 = arith.constant 968 : index
%c64 = arith.constant 64 : index
%element_type_f16 = hal.element_type<f16> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c64, %c968, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120}
%1 = stream.async.transfer %0 : !stream.resource<external>{%c158597120} -> !stream.resource<*>{%c158597120}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c64, %c1280, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%2 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<64x1280x1280xf16> in !stream.resource<external>{%c209715200}
%3 = stream.async.transfer %2 : !stream.resource<external>{%c209715200} -> !stream.resource<*>{%c209715200}
%4 = stream.async.dispatch @batch_matmul_dispatch_0::@batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%1[%c0 to %c158597120 for %c158597120], %3[%c0 to %c209715200 for %c209715200]) : (!stream.resource<*>{%c158597120}, !stream.resource<*>{%c209715200}) -> !stream.resource<*>{%c158597120}
%5 = stream.async.transfer %4 : !stream.resource<*>{%c158597120} -> !stream.resource<external>{%c158597120}
%6 = stream.tensor.export %5 : tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120} -> !hal.buffer_view
util.return %6 : !hal.buffer_view
}
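// The canonicalize, cse, and iree-util-simplify-global-accesses dumps above
// match the EncodeHostTensorsPass output exactly: the constants were already
// deduplicated, and there are no global accesses to simplify.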
// -----// IR Dump After ApplyPatterns (iree-util-apply-patterns) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mfma_layout<F16_16x16x16_F32>, #iree_gpu.mfma_layout<F16_32x32x8_F32>], target_arch = "gfx942", ukernels = "none"}>
#device_target_rocm = #hal.device.target<"rocm", {legacy_sync}, [#executable_target_rocm_hsaco_fb]>
module attributes {hal.device.targets = [#device_target_rocm]} {
stream.executable private @batch_matmul_dispatch_0 {
stream.executable.export public @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
%cst = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>> -> tensor<64x968x1280xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [64, 1280, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>> -> tensor<64x1280x1280xf16>
%5 = tensor.empty() : tensor<64x968x1280xf16>
%6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%7 = linalg.batch_matmul ins(%3, %4 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%6 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : tensor<64x968x1280xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
return
}
}
}
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%c209715200 = arith.constant 209715200 : index
%c158597120 = arith.constant 158597120 : index
%c0 = arith.constant 0 : index
%c1280 = arith.constant 1280 : index
%c968 = arith.constant 968 : index
%c64 = arith.constant 64 : index
%element_type_f16 = hal.element_type<f16> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c64, %c968, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120}
%1 = stream.async.transfer %0 : !stream.resource<external>{%c158597120} -> !stream.resource<*>{%c158597120}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c64, %c1280, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%2 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<64x1280x1280xf16> in !stream.resource<external>{%c209715200}
%3 = stream.async.transfer %2 : !stream.resource<external>{%c209715200} -> !stream.resource<*>{%c209715200}
%4 = stream.async.dispatch @batch_matmul_dispatch_0::@batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%1[%c0 to %c158597120 for %c158597120], %3[%c0 to %c209715200 for %c209715200]) : (!stream.resource<*>{%c158597120}, !stream.resource<*>{%c209715200}) -> !stream.resource<*>{%c158597120}
%5 = stream.async.transfer %4 : !stream.resource<*>{%c158597120} -> !stream.resource<external>{%c158597120}
%6 = stream.tensor.export %5 : tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120} -> !hal.buffer_view
util.return %6 : !hal.buffer_view
}
}
// -----// IR Dump After FoldGlobals (iree-util-fold-globals) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mfma_layout<F16_16x16x16_F32>, #iree_gpu.mfma_layout<F16_32x32x8_F32>], target_arch = "gfx942", ukernels = "none"}>
#device_target_rocm = #hal.device.target<"rocm", {legacy_sync}, [#executable_target_rocm_hsaco_fb]>
module attributes {hal.device.targets = [#device_target_rocm]} {
stream.executable private @batch_matmul_dispatch_0 {
stream.executable.export public @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
%cst = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>> -> tensor<64x968x1280xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [64, 1280, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>> -> tensor<64x1280x1280xf16>
%5 = tensor.empty() : tensor<64x968x1280xf16>
%6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%7 = linalg.batch_matmul ins(%3, %4 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%6 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : tensor<64x968x1280xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
return
}
}
}
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%c209715200 = arith.constant 209715200 : index
%c158597120 = arith.constant 158597120 : index
%c0 = arith.constant 0 : index
%c1280 = arith.constant 1280 : index
%c968 = arith.constant 968 : index
%c64 = arith.constant 64 : index
%element_type_f16 = hal.element_type<f16> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c64, %c968, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120}
%1 = stream.async.transfer %0 : !stream.resource<external>{%c158597120} -> !stream.resource<*>{%c158597120}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c64, %c1280, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%2 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<64x1280x1280xf16> in !stream.resource<external>{%c209715200}
%3 = stream.async.transfer %2 : !stream.resource<external>{%c209715200} -> !stream.resource<*>{%c209715200}
%4 = stream.async.dispatch @batch_matmul_dispatch_0::@batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%1[%c0 to %c158597120 for %c158597120], %3[%c0 to %c209715200 for %c209715200]) : (!stream.resource<*>{%c158597120}, !stream.resource<*>{%c209715200}) -> !stream.resource<*>{%c158597120}
%5 = stream.async.transfer %4 : !stream.resource<*>{%c158597120} -> !stream.resource<external>{%c158597120}
%6 = stream.tensor.export %5 : tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120} -> !hal.buffer_view
util.return %6 : !hal.buffer_view
}
}
// -----// IR Dump After FuseGlobals (iree-util-fuse-globals) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mfma_layout<F16_16x16x16_F32>, #iree_gpu.mfma_layout<F16_32x32x8_F32>], target_arch = "gfx942", ukernels = "none"}>
#device_target_rocm = #hal.device.target<"rocm", {legacy_sync}, [#executable_target_rocm_hsaco_fb]>
module attributes {hal.device.targets = [#device_target_rocm]} {
stream.executable private @batch_matmul_dispatch_0 {
stream.executable.export public @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
%cst = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>> -> tensor<64x968x1280xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [64, 1280, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>> -> tensor<64x1280x1280xf16>
%5 = tensor.empty() : tensor<64x968x1280xf16>
%6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%7 = linalg.batch_matmul ins(%3, %4 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%6 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : tensor<64x968x1280xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
return
}
}
}
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%c209715200 = arith.constant 209715200 : index
%c158597120 = arith.constant 158597120 : index
%c0 = arith.constant 0 : index
%c1280 = arith.constant 1280 : index
%c968 = arith.constant 968 : index
%c64 = arith.constant 64 : index
%element_type_f16 = hal.element_type<f16> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c64, %c968, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120}
%1 = stream.async.transfer %0 : !stream.resource<external>{%c158597120} -> !stream.resource<*>{%c158597120}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c64, %c1280, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%2 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<64x1280x1280xf16> in !stream.resource<external>{%c209715200}
%3 = stream.async.transfer %2 : !stream.resource<external>{%c209715200} -> !stream.resource<*>{%c209715200}
%4 = stream.async.dispatch @batch_matmul_dispatch_0::@batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%1[%c0 to %c158597120 for %c158597120], %3[%c0 to %c209715200 for %c209715200]) : (!stream.resource<*>{%c158597120}, !stream.resource<*>{%c209715200}) -> !stream.resource<*>{%c158597120}
%5 = stream.async.transfer %4 : !stream.resource<*>{%c158597120} -> !stream.resource<external>{%c158597120}
%6 = stream.tensor.export %5 : tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120} -> !hal.buffer_view
util.return %6 : !hal.buffer_view
}
}
// -----// IR Dump After IPO (iree-util-ipo) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mfma_layout<F16_16x16x16_F32>, #iree_gpu.mfma_layout<F16_32x32x8_F32>], target_arch = "gfx942", ukernels = "none"}>
#device_target_rocm = #hal.device.target<"rocm", {legacy_sync}, [#executable_target_rocm_hsaco_fb]>
module attributes {hal.device.targets = [#device_target_rocm]} {
stream.executable private @batch_matmul_dispatch_0 {
stream.executable.export public @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
%cst = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>> -> tensor<64x968x1280xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [64, 1280, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>> -> tensor<64x1280x1280xf16>
%5 = tensor.empty() : tensor<64x968x1280xf16>
%6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%7 = linalg.batch_matmul ins(%3, %4 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%6 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : tensor<64x968x1280xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
return
}
}
}
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%c209715200 = arith.constant 209715200 : index
%c158597120 = arith.constant 158597120 : index
%c0 = arith.constant 0 : index
%c1280 = arith.constant 1280 : index
%c968 = arith.constant 968 : index
%c64 = arith.constant 64 : index
%element_type_f16 = hal.element_type<f16> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c64, %c968, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120}
%1 = stream.async.transfer %0 : !stream.resource<external>{%c158597120} -> !stream.resource<*>{%c158597120}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c64, %c1280, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%2 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<64x1280x1280xf16> in !stream.resource<external>{%c209715200}
%3 = stream.async.transfer %2 : !stream.resource<external>{%c209715200} -> !stream.resource<*>{%c209715200}
%4 = stream.async.dispatch @batch_matmul_dispatch_0::@batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%1[%c0 to %c158597120 for %c158597120], %3[%c0 to %c209715200 for %c209715200]) : (!stream.resource<*>{%c158597120}, !stream.resource<*>{%c209715200}) -> !stream.resource<*>{%c158597120}
%5 = stream.async.transfer %4 : !stream.resource<*>{%c158597120} -> !stream.resource<external>{%c158597120}
%6 = stream.tensor.export %5 : tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120} -> !hal.buffer_view
util.return %6 : !hal.buffer_view
}
}
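// This second ApplyPatterns / FoldGlobals / FuseGlobals / IPO round is
// likewise a fixed point: the module is unchanged from its post-encoding form.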
// -----// IR Dump After VerifyLoweringToAsyncResourcesPass (iree-stream-verify-lowering-to-async-resources) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mfma_layout<F16_16x16x16_F32>, #iree_gpu.mfma_layout<F16_32x32x8_F32>], target_arch = "gfx942", ukernels = "none"}>
#device_target_rocm = #hal.device.target<"rocm", {legacy_sync}, [#executable_target_rocm_hsaco_fb]>
module attributes {hal.device.targets = [#device_target_rocm]} {
stream.executable private @batch_matmul_dispatch_0 {
stream.executable.export public @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
%cst = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>> -> tensor<64x968x1280xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [64, 1280, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>> -> tensor<64x1280x1280xf16>
%5 = tensor.empty() : tensor<64x968x1280xf16>
%6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%7 = linalg.batch_matmul ins(%3, %4 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%6 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : tensor<64x968x1280xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
return
}
}
}
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%c209715200 = arith.constant 209715200 : index
%c158597120 = arith.constant 158597120 : index
%c0 = arith.constant 0 : index
%c1280 = arith.constant 1280 : index
%c968 = arith.constant 968 : index
%c64 = arith.constant 64 : index
%element_type_f16 = hal.element_type<f16> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c64, %c968, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120}
%1 = stream.async.transfer %0 : !stream.resource<external>{%c158597120} -> !stream.resource<*>{%c158597120}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c64, %c1280, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%2 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<64x1280x1280xf16> in !stream.resource<external>{%c209715200}
%3 = stream.async.transfer %2 : !stream.resource<external>{%c209715200} -> !stream.resource<*>{%c209715200}
%4 = stream.async.dispatch @batch_matmul_dispatch_0::@batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%1[%c0 to %c158597120 for %c158597120], %3[%c0 to %c209715200 for %c209715200]) : (!stream.resource<*>{%c158597120}, !stream.resource<*>{%c209715200}) -> !stream.resource<*>{%c158597120}
%5 = stream.async.transfer %4 : !stream.resource<*>{%c158597120} -> !stream.resource<external>{%c158597120}
%6 = stream.tensor.export %5 : tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120} -> !hal.buffer_view
util.return %6 : !hal.buffer_view
}
}
// -----// IR Dump After MaterializeCopyOnWritePass (iree-stream-materialize-copy-on-write) //----- //
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%c209715200 = arith.constant 209715200 : index
%c158597120 = arith.constant 158597120 : index
%c0 = arith.constant 0 : index
%c1280 = arith.constant 1280 : index
%c968 = arith.constant 968 : index
%c64 = arith.constant 64 : index
%element_type_f16 = hal.element_type<f16> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c64, %c968, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120}
%1 = stream.async.transfer %0 : !stream.resource<external>{%c158597120} -> !stream.resource<*>{%c158597120}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c64, %c1280, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%2 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<64x1280x1280xf16> in !stream.resource<external>{%c209715200}
%3 = stream.async.transfer %2 : !stream.resource<external>{%c209715200} -> !stream.resource<*>{%c209715200}
%4 = stream.async.dispatch @batch_matmul_dispatch_0::@batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%1[%c0 to %c158597120 for %c158597120], %3[%c0 to %c209715200 for %c209715200]) : (!stream.resource<*>{%c158597120}, !stream.resource<*>{%c209715200}) -> !stream.resource<*>{%c158597120}
%5 = stream.async.transfer %4 : !stream.resource<*>{%c158597120} -> !stream.resource<external>{%c158597120}
%6 = stream.tensor.export %5 : tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120} -> !hal.buffer_view
util.return %6 : !hal.buffer_view
}
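// MaterializeCopyOnWritePass inserts no clones here: the dispatch writes a
// freshly allocated result and no resource is mutated in place, so there is
// nothing that needs copy-on-write treatment.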
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%c209715200 = arith.constant 209715200 : index
%c158597120 = arith.constant 158597120 : index
%c0 = arith.constant 0 : index
%c1280 = arith.constant 1280 : index
%c968 = arith.constant 968 : index
%c64 = arith.constant 64 : index
%element_type_f16 = hal.element_type<f16> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c64, %c968, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120}
%1 = stream.async.transfer %0 : !stream.resource<external>{%c158597120} -> !stream.resource<*>{%c158597120}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c64, %c1280, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%2 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<64x1280x1280xf16> in !stream.resource<external>{%c209715200}
%3 = stream.async.transfer %2 : !stream.resource<external>{%c209715200} -> !stream.resource<*>{%c209715200}
%4 = stream.async.dispatch @batch_matmul_dispatch_0::@batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%1[%c0 to %c158597120 for %c158597120], %3[%c0 to %c209715200 for %c209715200]) : (!stream.resource<*>{%c158597120}, !stream.resource<*>{%c209715200}) -> !stream.resource<*>{%c158597120}
%5 = stream.async.transfer %4 : !stream.resource<*>{%c158597120} -> !stream.resource<external>{%c158597120}
%6 = stream.tensor.export %5 : tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120} -> !hal.buffer_view
util.return %6 : !hal.buffer_view
}
// -----// IR Dump After ElideAsyncCopiesPass (iree-stream-elide-async-copies) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mfma_layout<F16_16x16x16_F32>, #iree_gpu.mfma_layout<F16_32x32x8_F32>], target_arch = "gfx942", ukernels = "none"}>
#device_target_rocm = #hal.device.target<"rocm", {legacy_sync}, [#executable_target_rocm_hsaco_fb]>
module attributes {hal.device.targets = [#device_target_rocm]} {
stream.executable private @batch_matmul_dispatch_0 {
stream.executable.export public @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
%cst = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>> -> tensor<64x968x1280xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [64, 1280, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>> -> tensor<64x1280x1280xf16>
%5 = tensor.empty() : tensor<64x968x1280xf16>
%6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%7 = linalg.batch_matmul ins(%3, %4 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%6 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : tensor<64x968x1280xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
return
}
}
}
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%c209715200 = arith.constant 209715200 : index
%c158597120 = arith.constant 158597120 : index
%c0 = arith.constant 0 : index
%c1280 = arith.constant 1280 : index
%c968 = arith.constant 968 : index
%c64 = arith.constant 64 : index
%element_type_f16 = hal.element_type<f16> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c64, %c968, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120}
%1 = stream.async.transfer %0 : !stream.resource<external>{%c158597120} -> !stream.resource<*>{%c158597120}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c64, %c1280, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%2 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<64x1280x1280xf16> in !stream.resource<external>{%c209715200}
%3 = stream.async.transfer %2 : !stream.resource<external>{%c209715200} -> !stream.resource<*>{%c209715200}
%4 = stream.async.dispatch @batch_matmul_dispatch_0::@batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%1[%c0 to %c158597120 for %c158597120], %3[%c0 to %c209715200 for %c209715200]) : (!stream.resource<*>{%c158597120}, !stream.resource<*>{%c209715200}) -> !stream.resource<*>{%c158597120}
%5 = stream.async.transfer %4 : !stream.resource<*>{%c158597120} -> !stream.resource<external>{%c158597120}
%6 = stream.tensor.export %5 : tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120} -> !hal.buffer_view
util.return %6 : !hal.buffer_view
}
}
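// Note: the %c158597120 and %c209715200 index constants threading through every dump
// below are simply the operand byte sizes (f16 = 2 bytes per element), and the export
// name batch_matmul_64x968x1280x1280_f16 reads as BxMxNxK with the element type.
// A quick sanity check in Python (mine, not part of the log):

def f16_bytes(*shape):
    n = 1
    for d in shape:
        n *= d
    return n * 2  # sizeof(f16)

assert f16_bytes(64, 968, 1280) == 158597120    # %c158597120: LHS and result buffers
assert f16_bytes(64, 1280, 1280) == 209715200   # %c209715200: RHS buffer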
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%c209715200 = arith.constant 209715200 : index
%c158597120 = arith.constant 158597120 : index
%c0 = arith.constant 0 : index
%c1280 = arith.constant 1280 : index
%c968 = arith.constant 968 : index
%c64 = arith.constant 64 : index
%element_type_f16 = hal.element_type<f16> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c64, %c968, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120}
%1 = stream.async.transfer %0 : !stream.resource<external>{%c158597120} -> !stream.resource<*>{%c158597120}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c64, %c1280, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%2 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<64x1280x1280xf16> in !stream.resource<external>{%c209715200}
%3 = stream.async.transfer %2 : !stream.resource<external>{%c209715200} -> !stream.resource<*>{%c209715200}
%4 = stream.async.dispatch @batch_matmul_dispatch_0::@batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%1[%c0 to %c158597120 for %c158597120], %3[%c0 to %c209715200 for %c209715200]) : (!stream.resource<*>{%c158597120}, !stream.resource<*>{%c209715200}) -> !stream.resource<*>{%c158597120}
%5 = stream.async.transfer %4 : !stream.resource<*>{%c158597120} -> !stream.resource<external>{%c158597120}
%6 = stream.tensor.export %5 : tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120} -> !hal.buffer_view
util.return %6 : !hal.buffer_view
}
// -----// IR Dump After EmplaceAllocationsPass (iree-stream-emplace-allocations) //----- //
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%c209715200 = arith.constant 209715200 : index
%c158597120 = arith.constant 158597120 : index
%c0 = arith.constant 0 : index
%c1280 = arith.constant 1280 : index
%c968 = arith.constant 968 : index
%c64 = arith.constant 64 : index
%element_type_f16 = hal.element_type<f16> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c64, %c968, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120}
%1 = stream.async.transfer %0 : !stream.resource<external>{%c158597120} -> !stream.resource<*>{%c158597120}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c64, %c1280, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%2 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<64x1280x1280xf16> in !stream.resource<external>{%c209715200}
%3 = stream.async.transfer %2 : !stream.resource<external>{%c209715200} -> !stream.resource<*>{%c209715200}
%4 = stream.async.dispatch @batch_matmul_dispatch_0::@batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%1[%c0 to %c158597120 for %c158597120], %3[%c0 to %c209715200 for %c209715200]) : (!stream.resource<*>{%c158597120}, !stream.resource<*>{%c209715200}) -> !stream.resource<*>{%c158597120}
%5 = stream.async.transfer %4 : !stream.resource<*>{%c158597120} -> !stream.resource<external>{%c158597120}
%6 = stream.tensor.export %5 : tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120} -> !hal.buffer_view
util.return %6 : !hal.buffer_view
}
// -----// IR Dump After RefineUsagePass (iree-stream-refine-usage) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mfma_layout<F16_16x16x16_F32>, #iree_gpu.mfma_layout<F16_32x32x8_F32>], target_arch = "gfx942", ukernels = "none"}>
#device_target_rocm = #hal.device.target<"rocm", {legacy_sync}, [#executable_target_rocm_hsaco_fb]>
module attributes {hal.device.targets = [#device_target_rocm]} {
stream.executable private @batch_matmul_dispatch_0 {
stream.executable.export public @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
%cst = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>> -> tensor<64x968x1280xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [64, 1280, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>> -> tensor<64x1280x1280xf16>
%5 = tensor.empty() : tensor<64x968x1280xf16>
%6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%7 = linalg.batch_matmul ins(%3, %4 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%6 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : tensor<64x968x1280xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
return
}
}
}
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%c209715200 = arith.constant 209715200 : index
%c158597120 = arith.constant 158597120 : index
%c0 = arith.constant 0 : index
%c1280 = arith.constant 1280 : index
%c968 = arith.constant 968 : index
%c64 = arith.constant 64 : index
%element_type_f16 = hal.element_type<f16> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c64, %c968, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c64, %c1280, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<64x1280x1280xf16> in !stream.resource<external>{%c209715200}
%2 = stream.async.dispatch @batch_matmul_dispatch_0::@batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%0[%c0 to %c158597120 for %c158597120], %1[%c0 to %c209715200 for %c209715200]) : (!stream.resource<external>{%c158597120}, !stream.resource<external>{%c209715200}) -> !stream.resource<external>{%c158597120}
%3 = stream.tensor.export %2 : tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120} -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
}
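// Note: compared to the dump above, iree-stream-refine-usage has deleted both
// stream.async.transfer ops by refining the placeholder !stream.resource<*> types to
// !stream.resource<external>, so the dispatch now reads the imported buffers directly.
// For reference, the dispatch body (linalg.fill + linalg.batch_matmul) computes
// out[b,i,j] = sum_k lhs[b,i,k] * rhs[b,k,j] over a zero-initialized accumulator.
// A NumPy sketch of the same semantics (illustrative only; f16 rounding may differ
// from the GPU's MFMA path):

import numpy as np

def batch_matmul_ref(lhs, rhs):
    # lhs: (64, 968, 1280) f16, rhs: (64, 1280, 1280) f16
    out = np.zeros((lhs.shape[0], lhs.shape[1], rhs.shape[2]), dtype=np.float16)  # linalg.fill
    for b in range(lhs.shape[0]):  # linalg.batch_matmul, one matmul per batch
        out[b] += lhs[b] @ rhs[b]
    return out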
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%c209715200 = arith.constant 209715200 : index
%c158597120 = arith.constant 158597120 : index
%c0 = arith.constant 0 : index
%c1280 = arith.constant 1280 : index
%c968 = arith.constant 968 : index
%c64 = arith.constant 64 : index
%element_type_f16 = hal.element_type<f16> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c64, %c968, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c64, %c1280, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<64x1280x1280xf16> in !stream.resource<external>{%c209715200}
%2 = stream.async.dispatch @batch_matmul_dispatch_0::@batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%0[%c0 to %c158597120 for %c158597120], %1[%c0 to %c209715200 for %c209715200]) : (!stream.resource<external>{%c158597120}, !stream.resource<external>{%c209715200}) -> !stream.resource<external>{%c158597120}
%3 = stream.tensor.export %2 : tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120} -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
// -----// IR Dump After CSE (cse) //----- //
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%c209715200 = arith.constant 209715200 : index
%c158597120 = arith.constant 158597120 : index
%c0 = arith.constant 0 : index
%c1280 = arith.constant 1280 : index
%c968 = arith.constant 968 : index
%c64 = arith.constant 64 : index
%element_type_f16 = hal.element_type<f16> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c64, %c968, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c64, %c1280, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<64x1280x1280xf16> in !stream.resource<external>{%c209715200}
%2 = stream.async.dispatch @batch_matmul_dispatch_0::@batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%0[%c0 to %c158597120 for %c158597120], %1[%c0 to %c209715200 for %c209715200]) : (!stream.resource<external>{%c158597120}, !stream.resource<external>{%c209715200}) -> !stream.resource<external>{%c158597120}
%3 = stream.tensor.export %2 : tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120} -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
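// Note: canonicalize and cse leave this function untouched; every constant is already
// unique and there are no duplicate subexpressions. As a loose illustration of what CSE
// would catch, a hash-based value-numbering sketch (my own, not MLIR's implementation):

def cse(ops):
    # ops: list of (result, opname, operands) tuples in SSA order
    seen, replaced = {}, {}
    kept = []
    for result, opname, operands in ops:
        key = (opname, tuple(replaced.get(o, o) for o in operands))
        if key in seen:
            replaced[result] = seen[key]  # identical op seen before: reuse its result
        else:
            seen[key] = result
            kept.append((result, opname, operands))
    return kept, replaced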
// -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- //
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%c209715200 = arith.constant 209715200 : index
%c158597120 = arith.constant 158597120 : index
%c0 = arith.constant 0 : index
%c1280 = arith.constant 1280 : index
%c968 = arith.constant 968 : index
%c64 = arith.constant 64 : index
%element_type_f16 = hal.element_type<f16> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c64, %c968, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c64, %c1280, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<64x1280x1280xf16> in !stream.resource<external>{%c209715200}
%2 = stream.async.dispatch @batch_matmul_dispatch_0::@batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%0[%c0 to %c158597120 for %c158597120], %1[%c0 to %c209715200 for %c209715200]) : (!stream.resource<external>{%c158597120}, !stream.resource<external>{%c209715200}) -> !stream.resource<external>{%c158597120}
%3 = stream.tensor.export %2 : tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120} -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
// -----// IR Dump After ApplyPatterns (iree-util-apply-patterns) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mfma_layout<F16_16x16x16_F32>, #iree_gpu.mfma_layout<F16_32x32x8_F32>], target_arch = "gfx942", ukernels = "none"}>
#device_target_rocm = #hal.device.target<"rocm", {legacy_sync}, [#executable_target_rocm_hsaco_fb]>
module attributes {hal.device.targets = [#device_target_rocm]} {
stream.executable private @batch_matmul_dispatch_0 {
stream.executable.export public @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
%cst = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>> -> tensor<64x968x1280xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [64, 1280, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>> -> tensor<64x1280x1280xf16>
%5 = tensor.empty() : tensor<64x968x1280xf16>
%6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%7 = linalg.batch_matmul ins(%3, %4 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%6 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : tensor<64x968x1280xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
return
}
}
}
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%c209715200 = arith.constant 209715200 : index
%c158597120 = arith.constant 158597120 : index
%c0 = arith.constant 0 : index
%c1280 = arith.constant 1280 : index
%c968 = arith.constant 968 : index
%c64 = arith.constant 64 : index
%element_type_f16 = hal.element_type<f16> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c64, %c968, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c64, %c1280, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<64x1280x1280xf16> in !stream.resource<external>{%c209715200}
%2 = stream.async.dispatch @batch_matmul_dispatch_0::@batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%0[%c0 to %c158597120 for %c158597120], %1[%c0 to %c209715200 for %c209715200]) : (!stream.resource<external>{%c158597120}, !stream.resource<external>{%c209715200}) -> !stream.resource<external>{%c158597120}
%3 = stream.tensor.export %2 : tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120} -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
}
// -----// IR Dump After FoldGlobals (iree-util-fold-globals) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mfma_layout<F16_16x16x16_F32>, #iree_gpu.mfma_layout<F16_32x32x8_F32>], target_arch = "gfx942", ukernels = "none"}>
#device_target_rocm = #hal.device.target<"rocm", {legacy_sync}, [#executable_target_rocm_hsaco_fb]>
module attributes {hal.device.targets = [#device_target_rocm]} {
stream.executable private @batch_matmul_dispatch_0 {
stream.executable.export public @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
%cst = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>> -> tensor<64x968x1280xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [64, 1280, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>> -> tensor<64x1280x1280xf16>
%5 = tensor.empty() : tensor<64x968x1280xf16>
%6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%7 = linalg.batch_matmul ins(%3, %4 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%6 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : tensor<64x968x1280xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
return
}
}
}
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%c209715200 = arith.constant 209715200 : index
%c158597120 = arith.constant 158597120 : index
%c0 = arith.constant 0 : index
%c1280 = arith.constant 1280 : index
%c968 = arith.constant 968 : index
%c64 = arith.constant 64 : index
%element_type_f16 = hal.element_type<f16> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c64, %c968, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c64, %c1280, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<64x1280x1280xf16> in !stream.resource<external>{%c209715200}
%2 = stream.async.dispatch @batch_matmul_dispatch_0::@batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%0[%c0 to %c158597120 for %c158597120], %1[%c0 to %c209715200 for %c209715200]) : (!stream.resource<external>{%c158597120}, !stream.resource<external>{%c209715200}) -> !stream.resource<external>{%c158597120}
%3 = stream.tensor.export %2 : tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120} -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
}
// -----// IR Dump After FuseGlobals (iree-util-fuse-globals) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mfma_layout<F16_16x16x16_F32>, #iree_gpu.mfma_layout<F16_32x32x8_F32>], target_arch = "gfx942", ukernels = "none"}>
#device_target_rocm = #hal.device.target<"rocm", {legacy_sync}, [#executable_target_rocm_hsaco_fb]>
module attributes {hal.device.targets = [#device_target_rocm]} {
stream.executable private @batch_matmul_dispatch_0 {
stream.executable.export public @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
%cst = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>> -> tensor<64x968x1280xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [64, 1280, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>> -> tensor<64x1280x1280xf16>
%5 = tensor.empty() : tensor<64x968x1280xf16>
%6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%7 = linalg.batch_matmul ins(%3, %4 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%6 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : tensor<64x968x1280xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
return
}
}
}
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%c209715200 = arith.constant 209715200 : index
%c158597120 = arith.constant 158597120 : index
%c0 = arith.constant 0 : index
%c1280 = arith.constant 1280 : index
%c968 = arith.constant 968 : index
%c64 = arith.constant 64 : index
%element_type_f16 = hal.element_type<f16> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c64, %c968, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c64, %c1280, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<64x1280x1280xf16> in !stream.resource<external>{%c209715200}
%2 = stream.async.dispatch @batch_matmul_dispatch_0::@batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%0[%c0 to %c158597120 for %c158597120], %1[%c0 to %c209715200 for %c209715200]) : (!stream.resource<external>{%c158597120}, !stream.resource<external>{%c209715200}) -> !stream.resource<external>{%c158597120}
%3 = stream.tensor.export %2 : tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120} -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
}
// -----// IR Dump After IPO (iree-util-ipo) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mfma_layout<F16_16x16x16_F32>, #iree_gpu.mfma_layout<F16_32x32x8_F32>], target_arch = "gfx942", ukernels = "none"}>
#device_target_rocm = #hal.device.target<"rocm", {legacy_sync}, [#executable_target_rocm_hsaco_fb]>
module attributes {hal.device.targets = [#device_target_rocm]} {
stream.executable private @batch_matmul_dispatch_0 {
stream.executable.export public @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
%cst = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>> -> tensor<64x968x1280xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [64, 1280, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>> -> tensor<64x1280x1280xf16>
%5 = tensor.empty() : tensor<64x968x1280xf16>
%6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%7 = linalg.batch_matmul ins(%3, %4 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%6 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : tensor<64x968x1280xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
return
}
}
}
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%c209715200 = arith.constant 209715200 : index
%c158597120 = arith.constant 158597120 : index
%c0 = arith.constant 0 : index
%c1280 = arith.constant 1280 : index
%c968 = arith.constant 968 : index
%c64 = arith.constant 64 : index
%element_type_f16 = hal.element_type<f16> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c64, %c968, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c64, %c1280, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<64x1280x1280xf16> in !stream.resource<external>{%c209715200}
%2 = stream.async.dispatch @batch_matmul_dispatch_0::@batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%0[%c0 to %c158597120 for %c158597120], %1[%c0 to %c209715200 for %c209715200]) : (!stream.resource<external>{%c158597120}, !stream.resource<external>{%c209715200}) -> !stream.resource<external>{%c158597120}
%3 = stream.tensor.export %2 : tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120} -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
}
// -----// IR Dump After VerifyAsyncAccessRangesPass (iree-stream-verify-async-access-ranges) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mfma_layout<F16_16x16x16_F32>, #iree_gpu.mfma_layout<F16_32x32x8_F32>], target_arch = "gfx942", ukernels = "none"}>
#device_target_rocm = #hal.device.target<"rocm", {legacy_sync}, [#executable_target_rocm_hsaco_fb]>
module attributes {hal.device.targets = [#device_target_rocm]} {
stream.executable private @batch_matmul_dispatch_0 {
stream.executable.export public @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
%cst = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>> -> tensor<64x968x1280xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [64, 1280, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>> -> tensor<64x1280x1280xf16>
%5 = tensor.empty() : tensor<64x968x1280xf16>
%6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%7 = linalg.batch_matmul ins(%3, %4 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%6 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : tensor<64x968x1280xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
return
}
}
}
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%c209715200 = arith.constant 209715200 : index
%c158597120 = arith.constant 158597120 : index
%c0 = arith.constant 0 : index
%c1280 = arith.constant 1280 : index
%c968 = arith.constant 968 : index
%c64 = arith.constant 64 : index
%element_type_f16 = hal.element_type<f16> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c64, %c968, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c64, %c1280, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<64x1280x1280xf16> in !stream.resource<external>{%c209715200}
%2 = stream.async.dispatch @batch_matmul_dispatch_0::@batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%0[%c0 to %c158597120 for %c158597120], %1[%c0 to %c209715200 for %c209715200]) : (!stream.resource<external>{%c158597120}, !stream.resource<external>{%c209715200}) -> !stream.resource<external>{%c158597120}
%3 = stream.tensor.export %2 : tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120} -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
}
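// Note: iree-stream-verify-async-access-ranges checks that each
// %resource[%offset to %end for %length] access on stream.async ops stays within the
// declared resource size; the dispatch above touches [%c0 to %c158597120 for %c158597120]
// of a {%c158597120}-byte resource, which trivially passes. A sketch of the invariant as
// I read the op syntax (not the pass's actual code):

def check_range(offset, end, length, resource_size):
    # mirrors the [%offset to %end for %length] triple on stream.async.dispatch
    assert end == offset + length
    assert 0 <= offset and end <= resource_size

check_range(0, 158597120, 158597120, 158597120)  # LHS operand
check_range(0, 209715200, 209715200, 209715200)  # RHS operand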
// -----// IR Dump After ScheduleExecutionPass (iree-stream-schedule-execution) //----- //
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%c209715200 = arith.constant 209715200 : index
%c158597120 = arith.constant 158597120 : index
%c0 = arith.constant 0 : index
%c1280 = arith.constant 1280 : index
%c968 = arith.constant 968 : index
%c64 = arith.constant 64 : index
%element_type_f16 = hal.element_type<f16> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c64, %c968, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c64, %c1280, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<64x1280x1280xf16> in !stream.resource<external>{%c209715200}
%results, %result_timepoint = stream.async.execute with(%0 as %arg2: !stream.resource<external>{%c158597120}, %1 as %arg3: !stream.resource<external>{%c209715200}) -> !stream.resource<external>{%c158597120} {
%4 = stream.async.dispatch @batch_matmul_dispatch_0::@batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%arg2[%c0 to %c158597120 for %c158597120], %arg3[%c0 to %c209715200 for %c209715200]) : (!stream.resource<external>{%c158597120}, !stream.resource<external>{%c209715200}) -> !stream.resource<external>{%c158597120}
stream.yield %4 : !stream.resource<external>{%c158597120}
} => !stream.timepoint
%2 = stream.timepoint.await %result_timepoint => %results : !stream.resource<external>{%c158597120}
%3 = stream.tensor.export %2 : tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120} -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
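// Note: iree-stream-schedule-execution has wrapped the dispatch in a
// stream.async.execute region whose completion is signaled by a !stream.timepoint; the
// host blocks on stream.timepoint.await before exporting the result. Loosely analogous
// to submitting work and waiting on a future (an analogy only, not IREE's runtime API):

from concurrent.futures import ThreadPoolExecutor

executor = ThreadPoolExecutor()

def async_execute(region, *resources):
    # stream.async.execute: record the work, get back a completion token
    return executor.submit(region, *resources)

# timepoint = async_execute(dispatch_region, lhs, rhs)
# results = timepoint.result()   # stream.timepoint.await %result_timepoint => %results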
// -----// IR Dump After ScheduleConcurrencyPass (iree-stream-schedule-concurrency) //----- //
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%c209715200 = arith.constant 209715200 : index
%c158597120 = arith.constant 158597120 : index
%c0 = arith.constant 0 : index
%c1280 = arith.constant 1280 : index
%c968 = arith.constant 968 : index
%c64 = arith.constant 64 : index
%element_type_f16 = hal.element_type<f16> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c64, %c968, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c64, %c1280, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<64x1280x1280xf16> in !stream.resource<external>{%c209715200}
%results, %result_timepoint = stream.async.execute with(%0 as %arg2: !stream.resource<external>{%c158597120}, %1 as %arg3: !stream.resource<external>{%c209715200}) -> !stream.resource<external>{%c158597120} {
%4 = stream.async.dispatch @batch_matmul_dispatch_0::@batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%arg2[%c0 to %c158597120 for %c158597120], %arg3[%c0 to %c209715200 for %c209715200]) : (!stream.resource<external>{%c158597120}, !stream.resource<external>{%c209715200}) -> !stream.resource<external>{%c158597120}
stream.yield %4 : !stream.resource<external>{%c158597120}
} => !stream.timepoint
%2 = stream.timepoint.await %result_timepoint => %results : !stream.resource<external>{%c158597120}
%3 = stream.tensor.export %2 : tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120} -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
// -----// IR Dump After PropagateTimepointsPass (iree-stream-propagate-timepoints) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mfma_layout<F16_16x16x16_F32>, #iree_gpu.mfma_layout<F16_32x32x8_F32>], target_arch = "gfx942", ukernels = "none"}>
#device_target_rocm = #hal.device.target<"rocm", {legacy_sync}, [#executable_target_rocm_hsaco_fb]>
module attributes {hal.device.targets = [#device_target_rocm]} {
stream.executable private @batch_matmul_dispatch_0 {
stream.executable.export public @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
%cst = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>> -> tensor<64x968x1280xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [64, 1280, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>> -> tensor<64x1280x1280xf16>
%5 = tensor.empty() : tensor<64x968x1280xf16>
%6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%7 = linalg.batch_matmul ins(%3, %4 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%6 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : tensor<64x968x1280xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
return
}
}
}
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%c209715200 = arith.constant 209715200 : index
%c158597120 = arith.constant 158597120 : index
%c0 = arith.constant 0 : index
%c1280 = arith.constant 1280 : index
%c968 = arith.constant 968 : index
%c64 = arith.constant 64 : index
%element_type_f16 = hal.element_type<f16> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c64, %c968, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c64, %c1280, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<64x1280x1280xf16> in !stream.resource<external>{%c209715200}
%2 = stream.timepoint.immediate => !stream.timepoint
%3 = stream.timepoint.immediate => !stream.timepoint
%4 = stream.timepoint.join max(%2, %3) => !stream.timepoint
%results, %result_timepoint = stream.async.execute await(%4) => with(%0 as %arg2: !stream.resource<external>{%c158597120}, %1 as %arg3: !stream.resource<external>{%c209715200}) -> !stream.resource<external>{%c158597120} {
%7 = stream.async.dispatch @batch_matmul_dispatch_0::@batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%arg2[%c0 to %c158597120 for %c158597120], %arg3[%c0 to %c209715200 for %c209715200]) : (!stream.resource<external>{%c158597120}, !stream.resource<external>{%c209715200}) -> !stream.resource<external>{%c158597120}
stream.yield %7 : !stream.resource<external>{%c158597120}
} => !stream.timepoint
%5 = stream.timepoint.await %result_timepoint => %results : !stream.resource<external>{%c158597120}
%6 = stream.tensor.export %5 : tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120} -> !hal.buffer_view
util.return %6 : !hal.buffer_view
}
}
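// Note: iree-stream-propagate-timepoints threads explicit timepoints through the
// program; since both imported buffers are already available it materializes a
// stream.timepoint.immediate for each and joins them ahead of the execute. The
// canonicalize dump a little further down (after iree-stream-materialize-builtins,
// which changes nothing here) folds all of that back out, because awaiting
// already-signaled timepoints is a no-op. The fold rules at work, as I paraphrase them:

IMMEDIATE = "immediate"

def fold_join(timepoints):
    # stream.timepoint.join max(...) over only-immediate timepoints is itself immediate
    pending = [t for t in timepoints if t is not IMMEDIATE]
    if not pending:
        return IMMEDIATE
    return pending[0] if len(pending) == 1 else ("join", pending)

def fold_await_clause(tp, execute_op):
    # stream.async.execute await(immediate) => drop the await clause entirely
    return execute_op if tp is IMMEDIATE else ("await", tp, execute_op)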
// -----// IR Dump After MaterializeBuiltinsPass (iree-stream-materialize-builtins) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mfma_layout<F16_16x16x16_F32>, #iree_gpu.mfma_layout<F16_32x32x8_F32>], target_arch = "gfx942", ukernels = "none"}>
#device_target_rocm = #hal.device.target<"rocm", {legacy_sync}, [#executable_target_rocm_hsaco_fb]>
module attributes {hal.device.targets = [#device_target_rocm]} {
stream.executable private @batch_matmul_dispatch_0 {
stream.executable.export public @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
%cst = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>> -> tensor<64x968x1280xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [64, 1280, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>> -> tensor<64x1280x1280xf16>
%5 = tensor.empty() : tensor<64x968x1280xf16>
%6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%7 = linalg.batch_matmul ins(%3, %4 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%6 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : tensor<64x968x1280xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
return
}
}
}
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%c209715200 = arith.constant 209715200 : index
%c158597120 = arith.constant 158597120 : index
%c0 = arith.constant 0 : index
%c1280 = arith.constant 1280 : index
%c968 = arith.constant 968 : index
%c64 = arith.constant 64 : index
%element_type_f16 = hal.element_type<f16> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c64, %c968, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c64, %c1280, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<64x1280x1280xf16> in !stream.resource<external>{%c209715200}
%2 = stream.timepoint.immediate => !stream.timepoint
%3 = stream.timepoint.immediate => !stream.timepoint
%4 = stream.timepoint.join max(%2, %3) => !stream.timepoint
%results, %result_timepoint = stream.async.execute await(%4) => with(%0 as %arg2: !stream.resource<external>{%c158597120}, %1 as %arg3: !stream.resource<external>{%c209715200}) -> !stream.resource<external>{%c158597120} {
%7 = stream.async.dispatch @batch_matmul_dispatch_0::@batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%arg2[%c0 to %c158597120 for %c158597120], %arg3[%c0 to %c209715200 for %c209715200]) : (!stream.resource<external>{%c158597120}, !stream.resource<external>{%c209715200}) -> !stream.resource<external>{%c158597120}
stream.yield %7 : !stream.resource<external>{%c158597120}
} => !stream.timepoint
%5 = stream.timepoint.await %result_timepoint => %results : !stream.resource<external>{%c158597120}
%6 = stream.tensor.export %5 : tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120} -> !hal.buffer_view
util.return %6 : !hal.buffer_view
}
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%c209715200 = arith.constant 209715200 : index
%c158597120 = arith.constant 158597120 : index
%c0 = arith.constant 0 : index
%c1280 = arith.constant 1280 : index
%c968 = arith.constant 968 : index
%c64 = arith.constant 64 : index
%element_type_f16 = hal.element_type<f16> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c64, %c968, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c64, %c1280, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<64x1280x1280xf16> in !stream.resource<external>{%c209715200}
%results, %result_timepoint = stream.async.execute with(%0 as %arg2: !stream.resource<external>{%c158597120}, %1 as %arg3: !stream.resource<external>{%c209715200}) -> !stream.resource<external>{%c158597120} {
%4 = stream.async.dispatch @batch_matmul_dispatch_0::@batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%arg2[%c0 to %c158597120 for %c158597120], %arg3[%c0 to %c209715200 for %c209715200]) : (!stream.resource<external>{%c158597120}, !stream.resource<external>{%c209715200}) -> !stream.resource<external>{%c158597120}
stream.yield %4 : !stream.resource<external>{%c158597120}
} => !stream.timepoint
%2 = stream.timepoint.await %result_timepoint => %results : !stream.resource<external>{%c158597120}
%3 = stream.tensor.export %2 : tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120} -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
// -----// IR Dump After CSE (cse) //----- //
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%c209715200 = arith.constant 209715200 : index
%c158597120 = arith.constant 158597120 : index
%c0 = arith.constant 0 : index
%c1280 = arith.constant 1280 : index
%c968 = arith.constant 968 : index
%c64 = arith.constant 64 : index
%element_type_f16 = hal.element_type<f16> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c64, %c968, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c64, %c1280, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<64x1280x1280xf16> in !stream.resource<external>{%c209715200}
%results, %result_timepoint = stream.async.execute with(%0 as %arg2: !stream.resource<external>{%c158597120}, %1 as %arg3: !stream.resource<external>{%c209715200}) -> !stream.resource<external>{%c158597120} {
%4 = stream.async.dispatch @batch_matmul_dispatch_0::@batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%arg2[%c0 to %c158597120 for %c158597120], %arg3[%c0 to %c209715200 for %c209715200]) : (!stream.resource<external>{%c158597120}, !stream.resource<external>{%c209715200}) -> !stream.resource<external>{%c158597120}
stream.yield %4 : !stream.resource<external>{%c158597120}
} => !stream.timepoint
%2 = stream.timepoint.await %result_timepoint => %results : !stream.resource<external>{%c158597120}
%3 = stream.tensor.export %2 : tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120} -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
// -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- //
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%c209715200 = arith.constant 209715200 : index
%c158597120 = arith.constant 158597120 : index
%c0 = arith.constant 0 : index
%c1280 = arith.constant 1280 : index
%c968 = arith.constant 968 : index
%c64 = arith.constant 64 : index
%element_type_f16 = hal.element_type<f16> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c64, %c968, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c64, %c1280, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<64x1280x1280xf16> in !stream.resource<external>{%c209715200}
%results, %result_timepoint = stream.async.execute with(%0 as %arg2: !stream.resource<external>{%c158597120}, %1 as %arg3: !stream.resource<external>{%c209715200}) -> !stream.resource<external>{%c158597120} {
%4 = stream.async.dispatch @batch_matmul_dispatch_0::@batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%arg2[%c0 to %c158597120 for %c158597120], %arg3[%c0 to %c209715200 for %c209715200]) : (!stream.resource<external>{%c158597120}, !stream.resource<external>{%c209715200}) -> !stream.resource<external>{%c158597120}
stream.yield %4 : !stream.resource<external>{%c158597120}
} => !stream.timepoint
%2 = stream.timepoint.await %result_timepoint => %results : !stream.resource<external>{%c158597120}
%3 = stream.tensor.export %2 : tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120} -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
// -----// IR Dump After ApplyPatterns (iree-util-apply-patterns) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mfma_layout<F16_16x16x16_F32>, #iree_gpu.mfma_layout<F16_32x32x8_F32>], target_arch = "gfx942", ukernels = "none"}>
#device_target_rocm = #hal.device.target<"rocm", {legacy_sync}, [#executable_target_rocm_hsaco_fb]>
module attributes {hal.device.targets = [#device_target_rocm]} {
stream.executable private @batch_matmul_dispatch_0 {
stream.executable.export public @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
%cst = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>> -> tensor<64x968x1280xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [64, 1280, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>> -> tensor<64x1280x1280xf16>
%5 = tensor.empty() : tensor<64x968x1280xf16>
%6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%7 = linalg.batch_matmul ins(%3, %4 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%6 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : tensor<64x968x1280xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
return
}
}
}
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%c209715200 = arith.constant 209715200 : index
%c158597120 = arith.constant 158597120 : index
%c0 = arith.constant 0 : index
%c1280 = arith.constant 1280 : index
%c968 = arith.constant 968 : index
%c64 = arith.constant 64 : index
%element_type_f16 = hal.element_type<f16> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c64, %c968, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c64, %c1280, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<64x1280x1280xf16> in !stream.resource<external>{%c209715200}
%results, %result_timepoint = stream.async.execute with(%0 as %arg2: !stream.resource<external>{%c158597120}, %1 as %arg3: !stream.resource<external>{%c209715200}) -> !stream.resource<external>{%c158597120} {
%4 = stream.async.dispatch @batch_matmul_dispatch_0::@batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%arg2[%c0 to %c158597120 for %c158597120], %arg3[%c0 to %c209715200 for %c209715200]) : (!stream.resource<external>{%c158597120}, !stream.resource<external>{%c209715200}) -> !stream.resource<external>{%c158597120}
stream.yield %4 : !stream.resource<external>{%c158597120}
} => !stream.timepoint
%2 = stream.timepoint.await %result_timepoint => %results : !stream.resource<external>{%c158597120}
%3 = stream.tensor.export %2 : tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120} -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
}
// -----// IR Dump After FoldGlobals (iree-util-fold-globals) //----- //
// (IR unchanged from previous dump.)
// -----// IR Dump After FuseGlobals (iree-util-fuse-globals) //----- //
// (IR unchanged from previous dump.)
// -----// IR Dump After IPO (iree-util-ipo) //----- //
// (IR unchanged from previous dump.)
// -----// IR Dump After VerifyLoweringToAsyncPass (iree-stream-verify-lowering-to-async) //----- //
// (IR unchanged from previous dump.)
// -----// IR Dump After ScheduleAllocationPass (iree-stream-schedule-allocation) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mfma_layout<F16_16x16x16_F32>, #iree_gpu.mfma_layout<F16_32x32x8_F32>], target_arch = "gfx942", ukernels = "none"}>
#device_target_rocm = #hal.device.target<"rocm", {legacy_sync}, [#executable_target_rocm_hsaco_fb]>
module attributes {hal.device.targets = [#device_target_rocm]} {
stream.executable private @batch_matmul_dispatch_0 {
stream.executable.export public @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
%cst = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>> -> tensor<64x968x1280xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [64, 1280, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>> -> tensor<64x1280x1280xf16>
%5 = tensor.empty() : tensor<64x968x1280xf16>
%6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%7 = linalg.batch_matmul ins(%3, %4 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%6 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : tensor<64x968x1280xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
return
}
}
}
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%c209715200 = arith.constant 209715200 : index
%c158597120 = arith.constant 158597120 : index
%c0 = arith.constant 0 : index
%c1280 = arith.constant 1280 : index
%c968 = arith.constant 968 : index
%c64 = arith.constant 64 : index
%element_type_f16 = hal.element_type<f16> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c64, %c968, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c64, %c1280, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<64x1280x1280xf16> in !stream.resource<external>{%c209715200}
%c0_0 = arith.constant 0 : index
%result, %result_timepoint = stream.resource.alloca uninitialized : !stream.resource<external>{%c158597120} => !stream.timepoint
%2 = stream.cmd.execute await(%result_timepoint) => with(%0 as %arg2: !stream.resource<external>{%c158597120}, %1 as %arg3: !stream.resource<external>{%c209715200}, %result as %arg4: !stream.resource<external>{%c158597120}) {
stream.cmd.dispatch @batch_matmul_dispatch_0::@batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 {
ro %arg2[%c0 for %c158597120] : !stream.resource<external>{%c158597120},
ro %arg3[%c0 for %c209715200] : !stream.resource<external>{%c209715200},
wo %arg4[%c0_0 for %c158597120] : !stream.resource<external>{%c158597120}
}
} => !stream.timepoint
%3 = stream.timepoint.await %2 => %result : !stream.resource<external>{%c158597120}
%4 = stream.tensor.export %3 : tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120} -> !hal.buffer_view
util.return %4 : !hal.buffer_view
}
}
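// NOTE: ScheduleAllocationPass makes the first structural change in this
// section: the stream.async.execute region becomes an explicit
// stream.cmd.execute, the 158597120-byte result is materialized up front via
// stream.resource.alloca uninitialized (gated by its allocation timepoint),
// and the dispatch operands gain explicit ro/ro/wo access annotations. The
// pass also introduces a duplicate zero constant %c0_0 for the write-only
// binding offset.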
// -----// IR Dump After PackConstantsPass (iree-stream-pack-constants) //----- //
// (Function IR unchanged from the previous dump; only @batch_matmul is printed here.)
// -----// IR Dump After LayoutSlicesPass (iree-stream-layout-slices) //----- //
// (Function IR unchanged from the previous dump; only @batch_matmul is printed here.)
// -----// IR Dump After PropagateSubranges (iree-util-propagate-subranges) //----- //
// (Module IR unchanged since the ScheduleAllocationPass dump above.)
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%c209715200 = arith.constant 209715200 : index
%c158597120 = arith.constant 158597120 : index
%c0 = arith.constant 0 : index
%c1280 = arith.constant 1280 : index
%c968 = arith.constant 968 : index
%c64 = arith.constant 64 : index
%element_type_f16 = hal.element_type<f16> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c64, %c968, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c64, %c1280, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<64x1280x1280xf16> in !stream.resource<external>{%c209715200}
%result, %result_timepoint = stream.resource.alloca uninitialized : !stream.resource<external>{%c158597120} => !stream.timepoint
%2 = stream.cmd.execute await(%result_timepoint) => with(%0 as %arg2: !stream.resource<external>{%c158597120}, %1 as %arg3: !stream.resource<external>{%c209715200}, %result as %arg4: !stream.resource<external>{%c158597120}) {
stream.cmd.dispatch @batch_matmul_dispatch_0::@batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 {
ro %arg2[%c0 for %c158597120] : !stream.resource<external>{%c158597120},
ro %arg3[%c0 for %c209715200] : !stream.resource<external>{%c209715200},
wo %arg4[%c0 for %c158597120] : !stream.resource<external>{%c158597120}
}
} => !stream.timepoint
%3 = stream.timepoint.await %2 => %result : !stream.resource<external>{%c158597120}
%4 = stream.tensor.export %3 : tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120} -> !hal.buffer_view
util.return %4 : !hal.buffer_view
}
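// NOTE: canonicalization folds the redundant %c0_0 introduced by allocation
// scheduling into %c0, so the write-only binding offset now reuses the same
// zero-index constant as the read-only bindings.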
// -----// IR Dump After CSE (cse) //----- //
// (Function IR unchanged from the previous dump.)
// -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- //
// (Function IR unchanged from the previous dump.)
// -----// IR Dump After ApplyPatterns (iree-util-apply-patterns) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mfma_layout<F16_16x16x16_F32>, #iree_gpu.mfma_layout<F16_32x32x8_F32>], target_arch = "gfx942", ukernels = "none"}>
#device_target_rocm = #hal.device.target<"rocm", {legacy_sync}, [#executable_target_rocm_hsaco_fb]>
module attributes {hal.device.targets = [#device_target_rocm]} {
stream.executable private @batch_matmul_dispatch_0 {
stream.executable.export public @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
%cst = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>> -> tensor<64x968x1280xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [64, 1280, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>> -> tensor<64x1280x1280xf16>
%5 = tensor.empty() : tensor<64x968x1280xf16>
%6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%7 = linalg.batch_matmul ins(%3, %4 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%6 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : tensor<64x968x1280xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
return
}
}
}
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%c209715200 = arith.constant 209715200 : index
%c158597120 = arith.constant 158597120 : index
%c0 = arith.constant 0 : index
%c1280 = arith.constant 1280 : index
%c968 = arith.constant 968 : index
%c64 = arith.constant 64 : index
%element_type_f16 = hal.element_type<f16> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c64, %c968, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c64, %c1280, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<64x1280x1280xf16> in !stream.resource<external>{%c209715200}
%result, %result_timepoint = stream.resource.alloca uninitialized : !stream.resource<external>{%c158597120} => !stream.timepoint
%2 = stream.cmd.execute await(%result_timepoint) => with(%0 as %arg2: !stream.resource<external>{%c158597120}, %1 as %arg3: !stream.resource<external>{%c209715200}, %result as %arg4: !stream.resource<external>{%c158597120}) {
stream.cmd.dispatch @batch_matmul_dispatch_0::@batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 {
ro %arg2[%c0 for %c158597120] : !stream.resource<external>{%c158597120},
ro %arg3[%c0 for %c209715200] : !stream.resource<external>{%c209715200},
wo %arg4[%c0 for %c158597120] : !stream.resource<external>{%c158597120}
}
} => !stream.timepoint
%3 = stream.timepoint.await %2 => %result : !stream.resource<external>{%c158597120}
%4 = stream.tensor.export %3 : tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120} -> !hal.buffer_view
util.return %4 : !hal.buffer_view
}
}
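// NOTE: this module-scope dump picks up the canonicalized function body; the
// stream.executable itself is byte-identical in every module-scope dump in
// this section. The global-folding, global-fusion, IPO, and verification
// passes below are again no-ops on this module.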
// -----// IR Dump After FoldGlobals (iree-util-fold-globals) //----- //
// (IR unchanged from previous dump.)
// -----// IR Dump After FuseGlobals (iree-util-fuse-globals) //----- //
// (IR unchanged from previous dump.)
// -----// IR Dump After IPO (iree-util-ipo) //----- //
// (IR unchanged from previous dump.)
// -----// IR Dump After VerifyLoweringToCmdPass (iree-stream-verify-lowering-to-cmd) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mfma_layout<F16_16x16x16_F32>, #iree_gpu.mfma_layout<F16_32x32x8_F32>], target_arch = "gfx942", ukernels = "none"}>
#device_target_rocm = #hal.device.target<"rocm", {legacy_sync}, [#executable_target_rocm_hsaco_fb]>
module attributes {hal.device.targets = [#device_target_rocm]} {
stream.executable private @batch_matmul_dispatch_0 {
stream.executable.export public @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
%cst = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>> -> tensor<64x968x1280xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [64, 1280, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>> -> tensor<64x1280x1280xf16>
%5 = tensor.empty() : tensor<64x968x1280xf16>
%6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%7 = linalg.batch_matmul ins(%3, %4 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%6 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : tensor<64x968x1280xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
return
}
}
}
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%c209715200 = arith.constant 209715200 : index
%c158597120 = arith.constant 158597120 : index
%c0 = arith.constant 0 : index
%c1280 = arith.constant 1280 : index
%c968 = arith.constant 968 : index
%c64 = arith.constant 64 : index
%element_type_f16 = hal.element_type<f16> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c64, %c968, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c64, %c1280, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<64x1280x1280xf16> in !stream.resource<external>{%c209715200}
%result, %result_timepoint = stream.resource.alloca uninitialized : !stream.resource<external>{%c158597120} => !stream.timepoint
%2 = stream.cmd.execute await(%result_timepoint) => with(%0 as %arg2: !stream.resource<external>{%c158597120}, %1 as %arg3: !stream.resource<external>{%c209715200}, %result as %arg4: !stream.resource<external>{%c158597120}) {
stream.cmd.dispatch @batch_matmul_dispatch_0::@batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 {
ro %arg2[%c0 for %c158597120] : !stream.resource<external>{%c158597120},
ro %arg3[%c0 for %c209715200] : !stream.resource<external>{%c209715200},
wo %arg4[%c0 for %c158597120] : !stream.resource<external>{%c158597120}
}
} => !stream.timepoint
%3 = stream.timepoint.await %2 => %result : !stream.resource<external>{%c158597120}
%4 = stream.tensor.export %3 : tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120} -> !hal.buffer_view
util.return %4 : !hal.buffer_view
}
}
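// The two constants threaded through every resource type above are simply the packed byte sizes of
// the tensors: f16 occupies two bytes per element, so 64x968x1280 elements give 158597120 bytes and
// 64x1280x1280 give 209715200. A quick check (plain Python, nothing IREE-specific):

import math

def packed_byte_size(shape, bytes_per_element=2):  # f16 = 2 bytes
    return math.prod(shape) * bytes_per_element

assert packed_byte_size([64, 968, 1280]) == 158597120    # input0 / output, %c158597120
assert packed_byte_size([64, 1280, 1280]) == 209715200   # input1, %c209715200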
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%c209715200 = arith.constant 209715200 : index
%c158597120 = arith.constant 158597120 : index
%c0 = arith.constant 0 : index
%c1280 = arith.constant 1280 : index
%c968 = arith.constant 968 : index
%c64 = arith.constant 64 : index
%element_type_f16 = hal.element_type<f16> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c64, %c968, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c64, %c1280, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<64x1280x1280xf16> in !stream.resource<external>{%c209715200}
%result, %result_timepoint = stream.resource.alloca uninitialized : !stream.resource<external>{%c158597120} => !stream.timepoint
%2 = stream.cmd.execute await(%result_timepoint) => with(%0 as %arg2: !stream.resource<external>{%c158597120}, %1 as %arg3: !stream.resource<external>{%c209715200}, %result as %arg4: !stream.resource<external>{%c158597120}) {
stream.cmd.dispatch @batch_matmul_dispatch_0::@batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 {
ro %arg2[%c0 for %c158597120] : !stream.resource<external>{%c158597120},
ro %arg3[%c0 for %c209715200] : !stream.resource<external>{%c209715200},
wo %arg4[%c0 for %c158597120] : !stream.resource<external>{%c158597120}
}
} => !stream.timepoint
%3 = stream.timepoint.await %2 => %result : !stream.resource<external>{%c158597120}
%4 = stream.tensor.export %3 : tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120} -> !hal.buffer_view
util.return %4 : !hal.buffer_view
}
// -----// IR Dump After CSE (cse) //----- //
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%c209715200 = arith.constant 209715200 : index
%c158597120 = arith.constant 158597120 : index
%c0 = arith.constant 0 : index
%c1280 = arith.constant 1280 : index
%c968 = arith.constant 968 : index
%c64 = arith.constant 64 : index
%element_type_f16 = hal.element_type<f16> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c64, %c968, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c64, %c1280, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<64x1280x1280xf16> in !stream.resource<external>{%c209715200}
%result, %result_timepoint = stream.resource.alloca uninitialized : !stream.resource<external>{%c158597120} => !stream.timepoint
%2 = stream.cmd.execute await(%result_timepoint) => with(%0 as %arg2: !stream.resource<external>{%c158597120}, %1 as %arg3: !stream.resource<external>{%c209715200}, %result as %arg4: !stream.resource<external>{%c158597120}) {
stream.cmd.dispatch @batch_matmul_dispatch_0::@batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 {
ro %arg2[%c0 for %c158597120] : !stream.resource<external>{%c158597120},
ro %arg3[%c0 for %c209715200] : !stream.resource<external>{%c209715200},
wo %arg4[%c0 for %c158597120] : !stream.resource<external>{%c158597120}
}
} => !stream.timepoint
%3 = stream.timepoint.await %2 => %result : !stream.resource<external>{%c158597120}
%4 = stream.tensor.export %3 : tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120} -> !hal.buffer_view
util.return %4 : !hal.buffer_view
}
// -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- //
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%c209715200 = arith.constant 209715200 : index
%c158597120 = arith.constant 158597120 : index
%c0 = arith.constant 0 : index
%c1280 = arith.constant 1280 : index
%c968 = arith.constant 968 : index
%c64 = arith.constant 64 : index
%element_type_f16 = hal.element_type<f16> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c64, %c968, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c64, %c1280, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<64x1280x1280xf16> in !stream.resource<external>{%c209715200}
%result, %result_timepoint = stream.resource.alloca uninitialized : !stream.resource<external>{%c158597120} => !stream.timepoint
%2 = stream.cmd.execute await(%result_timepoint) => with(%0 as %arg2: !stream.resource<external>{%c158597120}, %1 as %arg3: !stream.resource<external>{%c209715200}, %result as %arg4: !stream.resource<external>{%c158597120}) {
stream.cmd.dispatch @batch_matmul_dispatch_0::@batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 {
ro %arg2[%c0 for %c158597120] : !stream.resource<external>{%c158597120},
ro %arg3[%c0 for %c209715200] : !stream.resource<external>{%c209715200},
wo %arg4[%c0 for %c158597120] : !stream.resource<external>{%c158597120}
}
} => !stream.timepoint
%3 = stream.timepoint.await %2 => %result : !stream.resource<external>{%c158597120}
%4 = stream.tensor.export %3 : tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120} -> !hal.buffer_view
util.return %4 : !hal.buffer_view
}
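// The three dumps above (Canonicalizer, CSE, SimplifyGlobalAccesses) print only the util.func rather
// than the whole module because they run as function-scoped passes at this point in the pipeline;
// none of them changes the IR here, so the function body is identical in each dump.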
// -----// IR Dump After ApplyPatterns (iree-util-apply-patterns) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mfma_layout<F16_16x16x16_F32>, #iree_gpu.mfma_layout<F16_32x32x8_F32>], target_arch = "gfx942", ukernels = "none"}>
#device_target_rocm = #hal.device.target<"rocm", {legacy_sync}, [#executable_target_rocm_hsaco_fb]>
module attributes {hal.device.targets = [#device_target_rocm]} {
stream.executable private @batch_matmul_dispatch_0 {
stream.executable.export public @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
%cst = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>> -> tensor<64x968x1280xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [64, 1280, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>> -> tensor<64x1280x1280xf16>
%5 = tensor.empty() : tensor<64x968x1280xf16>
%6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%7 = linalg.batch_matmul ins(%3, %4 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%6 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : tensor<64x968x1280xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
return
}
}
}
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%c209715200 = arith.constant 209715200 : index
%c158597120 = arith.constant 158597120 : index
%c0 = arith.constant 0 : index
%c1280 = arith.constant 1280 : index
%c968 = arith.constant 968 : index
%c64 = arith.constant 64 : index
%element_type_f16 = hal.element_type<f16> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c64, %c968, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c64, %c1280, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<64x1280x1280xf16> in !stream.resource<external>{%c209715200}
%result, %result_timepoint = stream.resource.alloca uninitialized : !stream.resource<external>{%c158597120} => !stream.timepoint
%2 = stream.cmd.execute await(%result_timepoint) => with(%0 as %arg2: !stream.resource<external>{%c158597120}, %1 as %arg3: !stream.resource<external>{%c209715200}, %result as %arg4: !stream.resource<external>{%c158597120}) {
stream.cmd.dispatch @batch_matmul_dispatch_0::@batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 {
ro %arg2[%c0 for %c158597120] : !stream.resource<external>{%c158597120},
ro %arg3[%c0 for %c209715200] : !stream.resource<external>{%c209715200},
wo %arg4[%c0 for %c158597120] : !stream.resource<external>{%c158597120}
}
} => !stream.timepoint
%3 = stream.timepoint.await %2 => %result : !stream.resource<external>{%c158597120}
%4 = stream.tensor.export %3 : tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120} -> !hal.buffer_view
util.return %4 : !hal.buffer_view
}
}
// -----// IR Dump After FoldGlobals (iree-util-fold-globals) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mfma_layout<F16_16x16x16_F32>, #iree_gpu.mfma_layout<F16_32x32x8_F32>], target_arch = "gfx942", ukernels = "none"}>
#device_target_rocm = #hal.device.target<"rocm", {legacy_sync}, [#executable_target_rocm_hsaco_fb]>
module attributes {hal.device.targets = [#device_target_rocm]} {
stream.executable private @batch_matmul_dispatch_0 {
stream.executable.export public @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
%cst = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>> -> tensor<64x968x1280xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [64, 1280, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>> -> tensor<64x1280x1280xf16>
%5 = tensor.empty() : tensor<64x968x1280xf16>
%6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%7 = linalg.batch_matmul ins(%3, %4 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%6 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : tensor<64x968x1280xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
return
}
}
}
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%c209715200 = arith.constant 209715200 : index
%c158597120 = arith.constant 158597120 : index
%c0 = arith.constant 0 : index
%c1280 = arith.constant 1280 : index
%c968 = arith.constant 968 : index
%c64 = arith.constant 64 : index
%element_type_f16 = hal.element_type<f16> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c64, %c968, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c64, %c1280, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<64x1280x1280xf16> in !stream.resource<external>{%c209715200}
%result, %result_timepoint = stream.resource.alloca uninitialized : !stream.resource<external>{%c158597120} => !stream.timepoint
%2 = stream.cmd.execute await(%result_timepoint) => with(%0 as %arg2: !stream.resource<external>{%c158597120}, %1 as %arg3: !stream.resource<external>{%c209715200}, %result as %arg4: !stream.resource<external>{%c158597120}) {
stream.cmd.dispatch @batch_matmul_dispatch_0::@batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 {
ro %arg2[%c0 for %c158597120] : !stream.resource<external>{%c158597120},
ro %arg3[%c0 for %c209715200] : !stream.resource<external>{%c209715200},
wo %arg4[%c0 for %c158597120] : !stream.resource<external>{%c158597120}
}
} => !stream.timepoint
%3 = stream.timepoint.await %2 => %result : !stream.resource<external>{%c158597120}
%4 = stream.tensor.export %3 : tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120} -> !hal.buffer_view
util.return %4 : !hal.buffer_view
}
}
// -----// IR Dump After FuseGlobals (iree-util-fuse-globals) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mfma_layout<F16_16x16x16_F32>, #iree_gpu.mfma_layout<F16_32x32x8_F32>], target_arch = "gfx942", ukernels = "none"}>
#device_target_rocm = #hal.device.target<"rocm", {legacy_sync}, [#executable_target_rocm_hsaco_fb]>
module attributes {hal.device.targets = [#device_target_rocm]} {
stream.executable private @batch_matmul_dispatch_0 {
stream.executable.export public @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
%cst = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>> -> tensor<64x968x1280xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [64, 1280, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>> -> tensor<64x1280x1280xf16>
%5 = tensor.empty() : tensor<64x968x1280xf16>
%6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%7 = linalg.batch_matmul ins(%3, %4 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%6 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : tensor<64x968x1280xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
return
}
}
}
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%c209715200 = arith.constant 209715200 : index
%c158597120 = arith.constant 158597120 : index
%c0 = arith.constant 0 : index
%c1280 = arith.constant 1280 : index
%c968 = arith.constant 968 : index
%c64 = arith.constant 64 : index
%element_type_f16 = hal.element_type<f16> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c64, %c968, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c64, %c1280, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<64x1280x1280xf16> in !stream.resource<external>{%c209715200}
%result, %result_timepoint = stream.resource.alloca uninitialized : !stream.resource<external>{%c158597120} => !stream.timepoint
%2 = stream.cmd.execute await(%result_timepoint) => with(%0 as %arg2: !stream.resource<external>{%c158597120}, %1 as %arg3: !stream.resource<external>{%c209715200}, %result as %arg4: !stream.resource<external>{%c158597120}) {
stream.cmd.dispatch @batch_matmul_dispatch_0::@batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 {
ro %arg2[%c0 for %c158597120] : !stream.resource<external>{%c158597120},
ro %arg3[%c0 for %c209715200] : !stream.resource<external>{%c209715200},
wo %arg4[%c0 for %c158597120] : !stream.resource<external>{%c158597120}
}
} => !stream.timepoint
%3 = stream.timepoint.await %2 => %result : !stream.resource<external>{%c158597120}
%4 = stream.tensor.export %3 : tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120} -> !hal.buffer_view
util.return %4 : !hal.buffer_view
}
}
// -----// IR Dump After IPO (iree-util-ipo) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mfma_layout<F16_16x16x16_F32>, #iree_gpu.mfma_layout<F16_32x32x8_F32>], target_arch = "gfx942", ukernels = "none"}>
#device_target_rocm = #hal.device.target<"rocm", {legacy_sync}, [#executable_target_rocm_hsaco_fb]>
module attributes {hal.device.targets = [#device_target_rocm]} {
stream.executable private @batch_matmul_dispatch_0 {
stream.executable.export public @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
%cst = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>> -> tensor<64x968x1280xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [64, 1280, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>> -> tensor<64x1280x1280xf16>
%5 = tensor.empty() : tensor<64x968x1280xf16>
%6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%7 = linalg.batch_matmul ins(%3, %4 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%6 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : tensor<64x968x1280xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
return
}
}
}
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%c209715200 = arith.constant 209715200 : index
%c158597120 = arith.constant 158597120 : index
%c0 = arith.constant 0 : index
%c1280 = arith.constant 1280 : index
%c968 = arith.constant 968 : index
%c64 = arith.constant 64 : index
%element_type_f16 = hal.element_type<f16> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c64, %c968, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c64, %c1280, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<64x1280x1280xf16> in !stream.resource<external>{%c209715200}
%result, %result_timepoint = stream.resource.alloca uninitialized : !stream.resource<external>{%c158597120} => !stream.timepoint
%2 = stream.cmd.execute await(%result_timepoint) => with(%0 as %arg2: !stream.resource<external>{%c158597120}, %1 as %arg3: !stream.resource<external>{%c209715200}, %result as %arg4: !stream.resource<external>{%c158597120}) {
stream.cmd.dispatch @batch_matmul_dispatch_0::@batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 {
ro %arg2[%c0 for %c158597120] : !stream.resource<external>{%c158597120},
ro %arg3[%c0 for %c209715200] : !stream.resource<external>{%c209715200},
wo %arg4[%c0 for %c158597120] : !stream.resource<external>{%c158597120}
}
} => !stream.timepoint
%3 = stream.timepoint.await %2 => %result : !stream.resource<external>{%c158597120}
%4 = stream.tensor.export %3 : tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120} -> !hal.buffer_view
util.return %4 : !hal.buffer_view
}
}
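// ApplyPatterns, FoldGlobals, FuseGlobals, and IPO likewise leave the module unchanged: there are no
// util.global ops to fold or fuse, and the only function is the public entry point, so the
// interprocedural pass has nothing to propagate.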
// -----// IR Dump After SCFToControlFlow (convert-scf-to-cf) //----- //
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%c209715200 = arith.constant 209715200 : index
%c158597120 = arith.constant 158597120 : index
%c0 = arith.constant 0 : index
%c1280 = arith.constant 1280 : index
%c968 = arith.constant 968 : index
%c64 = arith.constant 64 : index
%element_type_f16 = hal.element_type<f16> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c64, %c968, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c64, %c1280, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<64x1280x1280xf16> in !stream.resource<external>{%c209715200}
%result, %result_timepoint = stream.resource.alloca uninitialized : !stream.resource<external>{%c158597120} => !stream.timepoint
%2 = stream.cmd.execute await(%result_timepoint) => with(%0 as %arg2: !stream.resource<external>{%c158597120}, %1 as %arg3: !stream.resource<external>{%c209715200}, %result as %arg4: !stream.resource<external>{%c158597120}) {
stream.cmd.dispatch @batch_matmul_dispatch_0::@batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 {
ro %arg2[%c0 for %c158597120] : !stream.resource<external>{%c158597120},
ro %arg3[%c0 for %c209715200] : !stream.resource<external>{%c209715200},
wo %arg4[%c0 for %c158597120] : !stream.resource<external>{%c158597120}
}
} => !stream.timepoint
%3 = stream.timepoint.await %2 => %result : !stream.resource<external>{%c158597120}
%4 = stream.tensor.export %3 : tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120} -> !hal.buffer_view
util.return %4 : !hal.buffer_view
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%c209715200 = arith.constant 209715200 : index
%c158597120 = arith.constant 158597120 : index
%c0 = arith.constant 0 : index
%c1280 = arith.constant 1280 : index
%c968 = arith.constant 968 : index
%c64 = arith.constant 64 : index
%element_type_f16 = hal.element_type<f16> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c64, %c968, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c64, %c1280, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<64x1280x1280xf16> in !stream.resource<external>{%c209715200}
%result, %result_timepoint = stream.resource.alloca uninitialized : !stream.resource<external>{%c158597120} => !stream.timepoint
%2 = stream.cmd.execute await(%result_timepoint) => with(%0 as %arg2: !stream.resource<external>{%c158597120}, %1 as %arg3: !stream.resource<external>{%c209715200}, %result as %arg4: !stream.resource<external>{%c158597120}) {
stream.cmd.dispatch @batch_matmul_dispatch_0::@batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 {
ro %arg2[%c0 for %c158597120] : !stream.resource<external>{%c158597120},
ro %arg3[%c0 for %c209715200] : !stream.resource<external>{%c209715200},
wo %arg4[%c0 for %c158597120] : !stream.resource<external>{%c158597120}
}
} => !stream.timepoint
%3 = stream.timepoint.await %2 => %result : !stream.resource<external>{%c158597120}
%4 = stream.tensor.export %3 : tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120} -> !hal.buffer_view
util.return %4 : !hal.buffer_view
}
// -----// IR Dump After CSE (cse) //----- //
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%c209715200 = arith.constant 209715200 : index
%c158597120 = arith.constant 158597120 : index
%c0 = arith.constant 0 : index
%c1280 = arith.constant 1280 : index
%c968 = arith.constant 968 : index
%c64 = arith.constant 64 : index
%element_type_f16 = hal.element_type<f16> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c64, %c968, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c64, %c1280, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<64x1280x1280xf16> in !stream.resource<external>{%c209715200}
%result, %result_timepoint = stream.resource.alloca uninitialized : !stream.resource<external>{%c158597120} => !stream.timepoint
%2 = stream.cmd.execute await(%result_timepoint) => with(%0 as %arg2: !stream.resource<external>{%c158597120}, %1 as %arg3: !stream.resource<external>{%c209715200}, %result as %arg4: !stream.resource<external>{%c158597120}) {
stream.cmd.dispatch @batch_matmul_dispatch_0::@batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 {
ro %arg2[%c0 for %c158597120] : !stream.resource<external>{%c158597120},
ro %arg3[%c0 for %c209715200] : !stream.resource<external>{%c209715200},
wo %arg4[%c0 for %c158597120] : !stream.resource<external>{%c158597120}
}
} => !stream.timepoint
%3 = stream.timepoint.await %2 => %result : !stream.resource<external>{%c158597120}
%4 = stream.tensor.export %3 : tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120} -> !hal.buffer_view
util.return %4 : !hal.buffer_view
}
// -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- //
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%c209715200 = arith.constant 209715200 : index
%c158597120 = arith.constant 158597120 : index
%c0 = arith.constant 0 : index
%c1280 = arith.constant 1280 : index
%c968 = arith.constant 968 : index
%c64 = arith.constant 64 : index
%element_type_f16 = hal.element_type<f16> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c64, %c968, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c64, %c1280, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<64x1280x1280xf16> in !stream.resource<external>{%c209715200}
%result, %result_timepoint = stream.resource.alloca uninitialized : !stream.resource<external>{%c158597120} => !stream.timepoint
%2 = stream.cmd.execute await(%result_timepoint) => with(%0 as %arg2: !stream.resource<external>{%c158597120}, %1 as %arg3: !stream.resource<external>{%c209715200}, %result as %arg4: !stream.resource<external>{%c158597120}) {
stream.cmd.dispatch @batch_matmul_dispatch_0::@batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 {
ro %arg2[%c0 for %c158597120] : !stream.resource<external>{%c158597120},
ro %arg3[%c0 for %c209715200] : !stream.resource<external>{%c209715200},
wo %arg4[%c0 for %c158597120] : !stream.resource<external>{%c158597120}
}
} => !stream.timepoint
%3 = stream.timepoint.await %2 => %result : !stream.resource<external>{%c158597120}
%4 = stream.tensor.export %3 : tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120} -> !hal.buffer_view
util.return %4 : !hal.buffer_view
}
// -----// IR Dump After ApplyPatterns (iree-util-apply-patterns) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mfma_layout<F16_16x16x16_F32>, #iree_gpu.mfma_layout<F16_32x32x8_F32>], target_arch = "gfx942", ukernels = "none"}>
#device_target_rocm = #hal.device.target<"rocm", {legacy_sync}, [#executable_target_rocm_hsaco_fb]>
module attributes {hal.device.targets = [#device_target_rocm], iree.fixedpoint.iteration = 0 : index} {
stream.executable private @batch_matmul_dispatch_0 {
stream.executable.export public @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
%cst = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>> -> tensor<64x968x1280xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [64, 1280, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>> -> tensor<64x1280x1280xf16>
%5 = tensor.empty() : tensor<64x968x1280xf16>
%6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%7 = linalg.batch_matmul ins(%3, %4 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%6 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : tensor<64x968x1280xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
return
}
}
}
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%c209715200 = arith.constant 209715200 : index
%c158597120 = arith.constant 158597120 : index
%c0 = arith.constant 0 : index
%c1280 = arith.constant 1280 : index
%c968 = arith.constant 968 : index
%c64 = arith.constant 64 : index
%element_type_f16 = hal.element_type<f16> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c64, %c968, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c64, %c1280, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<64x1280x1280xf16> in !stream.resource<external>{%c209715200}
%result, %result_timepoint = stream.resource.alloca uninitialized : !stream.resource<external>{%c158597120} => !stream.timepoint
%2 = stream.cmd.execute await(%result_timepoint) => with(%0 as %arg2: !stream.resource<external>{%c158597120}, %1 as %arg3: !stream.resource<external>{%c209715200}, %result as %arg4: !stream.resource<external>{%c158597120}) {
stream.cmd.dispatch @batch_matmul_dispatch_0::@batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 {
ro %arg2[%c0 for %c158597120] : !stream.resource<external>{%c158597120},
ro %arg3[%c0 for %c209715200] : !stream.resource<external>{%c209715200},
wo %arg4[%c0 for %c158597120] : !stream.resource<external>{%c158597120}
}
} => !stream.timepoint
%3 = stream.timepoint.await %2 => %result : !stream.resource<external>{%c158597120}
%4 = stream.tensor.export %3 : tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120} -> !hal.buffer_view
util.return %4 : !hal.buffer_view
}
}
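// This ApplyPatterns dump is the first taken inside the stream optimization fixed-point loop: the
// module now carries iree.fixedpoint.iteration = 0 : index, evidently used by the driver to track
// how many rounds of the pass set have run. Conceptually the loop looks like the sketch below (a
// hypothetical driver for illustration, not IREE's implementation):

def run_to_fixed_point(module, passes, max_iterations=8):
    for iteration in range(max_iterations):
        before = str(module)
        for run_pass in passes:     # ApplyPatterns, FoldGlobals, FuseGlobals, IPO, ...
            run_pass(module)
        if str(module) == before:   # no pass changed anything: converged
            return module
    raise RuntimeError("fixed point not reached")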
// -----// IR Dump After FoldGlobals (iree-util-fold-globals) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mfma_layout<F16_16x16x16_F32>, #iree_gpu.mfma_layout<F16_32x32x8_F32>], target_arch = "gfx942", ukernels = "none"}>
#device_target_rocm = #hal.device.target<"rocm", {legacy_sync}, [#executable_target_rocm_hsaco_fb]>
module attributes {hal.device.targets = [#device_target_rocm], iree.fixedpoint.iteration = 0 : index} {
stream.executable private @batch_matmul_dispatch_0 {
stream.executable.export public @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
%cst = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>> -> tensor<64x968x1280xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [64, 1280, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>> -> tensor<64x1280x1280xf16>
%5 = tensor.empty() : tensor<64x968x1280xf16>
%6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%7 = linalg.batch_matmul ins(%3, %4 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%6 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : tensor<64x968x1280xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
return
}
}
}
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%c209715200 = arith.constant 209715200 : index
%c158597120 = arith.constant 158597120 : index
%c0 = arith.constant 0 : index
%c1280 = arith.constant 1280 : index
%c968 = arith.constant 968 : index
%c64 = arith.constant 64 : index
%element_type_f16 = hal.element_type<f16> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c64, %c968, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c64, %c1280, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<64x1280x1280xf16> in !stream.resource<external>{%c209715200}
%result, %result_timepoint = stream.resource.alloca uninitialized : !stream.resource<external>{%c158597120} => !stream.timepoint
%2 = stream.cmd.execute await(%result_timepoint) => with(%0 as %arg2: !stream.resource<external>{%c158597120}, %1 as %arg3: !stream.resource<external>{%c209715200}, %result as %arg4: !stream.resource<external>{%c158597120}) {
stream.cmd.dispatch @batch_matmul_dispatch_0::@batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 {
ro %arg2[%c0 for %c158597120] : !stream.resource<external>{%c158597120},
ro %arg3[%c0 for %c209715200] : !stream.resource<external>{%c209715200},
wo %arg4[%c0 for %c158597120] : !stream.resource<external>{%c158597120}
}
} => !stream.timepoint
%3 = stream.timepoint.await %2 => %result : !stream.resource<external>{%c158597120}
%4 = stream.tensor.export %3 : tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120} -> !hal.buffer_view
util.return %4 : !hal.buffer_view
}
}
// -----// IR Dump After FuseGlobals (iree-util-fuse-globals) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mfma_layout<F16_16x16x16_F32>, #iree_gpu.mfma_layout<F16_32x32x8_F32>], target_arch = "gfx942", ukernels = "none"}>
#device_target_rocm = #hal.device.target<"rocm", {legacy_sync}, [#executable_target_rocm_hsaco_fb]>
module attributes {hal.device.targets = [#device_target_rocm], iree.fixedpoint.iteration = 0 : index} {
stream.executable private @batch_matmul_dispatch_0 {
stream.executable.export public @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
%cst = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>> -> tensor<64x968x1280xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [64, 1280, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>> -> tensor<64x1280x1280xf16>
%5 = tensor.empty() : tensor<64x968x1280xf16>
%6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%7 = linalg.batch_matmul ins(%3, %4 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%6 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : tensor<64x968x1280xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
return
}
}
}
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%c209715200 = arith.constant 209715200 : index
%c158597120 = arith.constant 158597120 : index
%c0 = arith.constant 0 : index
%c1280 = arith.constant 1280 : index
%c968 = arith.constant 968 : index
%c64 = arith.constant 64 : index
%element_type_f16 = hal.element_type<f16> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c64, %c968, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c64, %c1280, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<64x1280x1280xf16> in !stream.resource<external>{%c209715200}
%result, %result_timepoint = stream.resource.alloca uninitialized : !stream.resource<external>{%c158597120} => !stream.timepoint
%2 = stream.cmd.execute await(%result_timepoint) => with(%0 as %arg2: !stream.resource<external>{%c158597120}, %1 as %arg3: !stream.resource<external>{%c209715200}, %result as %arg4: !stream.resource<external>{%c158597120}) {
stream.cmd.dispatch @batch_matmul_dispatch_0::@batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 {
ro %arg2[%c0 for %c158597120] : !stream.resource<external>{%c158597120},
ro %arg3[%c0 for %c209715200] : !stream.resource<external>{%c209715200},
wo %arg4[%c0 for %c158597120] : !stream.resource<external>{%c158597120}
}
} => !stream.timepoint
%3 = stream.timepoint.await %2 => %result : !stream.resource<external>{%c158597120}
%4 = stream.tensor.export %3 : tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120} -> !hal.buffer_view
util.return %4 : !hal.buffer_view
}
}
// -----// IR Dump After IPO (iree-util-ipo) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mfma_layout<F16_16x16x16_F32>, #iree_gpu.mfma_layout<F16_32x32x8_F32>], target_arch = "gfx942", ukernels = "none"}>
#device_target_rocm = #hal.device.target<"rocm", {legacy_sync}, [#executable_target_rocm_hsaco_fb]>
module attributes {hal.device.targets = [#device_target_rocm], iree.fixedpoint.iteration = 0 : index} {
stream.executable private @batch_matmul_dispatch_0 {
stream.executable.export public @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
%cst = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>> -> tensor<64x968x1280xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [64, 1280, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>> -> tensor<64x1280x1280xf16>
%5 = tensor.empty() : tensor<64x968x1280xf16>
%6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%7 = linalg.batch_matmul ins(%3, %4 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%6 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : tensor<64x968x1280xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
return
}
}
}
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%c209715200 = arith.constant 209715200 : index
%c158597120 = arith.constant 158597120 : index
%c0 = arith.constant 0 : index
%c1280 = arith.constant 1280 : index
%c968 = arith.constant 968 : index
%c64 = arith.constant 64 : index
%element_type_f16 = hal.element_type<f16> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c64, %c968, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c64, %c1280, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<64x1280x1280xf16> in !stream.resource<external>{%c209715200}
%result, %result_timepoint = stream.resource.alloca uninitialized : !stream.resource<external>{%c158597120} => !stream.timepoint
%2 = stream.cmd.execute await(%result_timepoint) => with(%0 as %arg2: !stream.resource<external>{%c158597120}, %1 as %arg3: !stream.resource<external>{%c209715200}, %result as %arg4: !stream.resource<external>{%c158597120}) {
stream.cmd.dispatch @batch_matmul_dispatch_0::@batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 {
ro %arg2[%c0 for %c158597120] : !stream.resource<external>{%c158597120},
ro %arg3[%c0 for %c209715200] : !stream.resource<external>{%c209715200},
wo %arg4[%c0 for %c158597120] : !stream.resource<external>{%c158597120}
}
} => !stream.timepoint
%3 = stream.timepoint.await %2 => %result : !stream.resource<external>{%c158597120}
%4 = stream.tensor.export %3 : tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120} -> !hal.buffer_view
util.return %4 : !hal.buffer_view
}
}
// -----// IR Dump After ElideTimepointsPass (iree-stream-elide-timepoints) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mfma_layout<F16_16x16x16_F32>, #iree_gpu.mfma_layout<F16_32x32x8_F32>], target_arch = "gfx942", ukernels = "none"}>
#device_target_rocm = #hal.device.target<"rocm", {legacy_sync}, [#executable_target_rocm_hsaco_fb]>
module attributes {hal.device.targets = [#device_target_rocm], iree.fixedpoint.iteration = 0 : index} {
stream.executable private @batch_matmul_dispatch_0 {
stream.executable.export public @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
%cst = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>> -> tensor<64x968x1280xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [64, 1280, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>> -> tensor<64x1280x1280xf16>
%5 = tensor.empty() : tensor<64x968x1280xf16>
%6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%7 = linalg.batch_matmul ins(%3, %4 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%6 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : tensor<64x968x1280xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
return
}
}
}
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%c209715200 = arith.constant 209715200 : index
%c158597120 = arith.constant 158597120 : index
%c0 = arith.constant 0 : index
%c1280 = arith.constant 1280 : index
%c968 = arith.constant 968 : index
%c64 = arith.constant 64 : index
%element_type_f16 = hal.element_type<f16> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c64, %c968, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c64, %c1280, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<64x1280x1280xf16> in !stream.resource<external>{%c209715200}
%result, %result_timepoint = stream.resource.alloca uninitialized : !stream.resource<external>{%c158597120} => !stream.timepoint
%2 = stream.cmd.execute await(%result_timepoint) => with(%0 as %arg2: !stream.resource<external>{%c158597120}, %1 as %arg3: !stream.resource<external>{%c209715200}, %result as %arg4: !stream.resource<external>{%c158597120}) {
stream.cmd.dispatch @batch_matmul_dispatch_0::@batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 {
ro %arg2[%c0 for %c158597120] : !stream.resource<external>{%c158597120},
ro %arg3[%c0 for %c209715200] : !stream.resource<external>{%c209715200},
wo %arg4[%c0 for %c158597120] : !stream.resource<external>{%c158597120}
}
} => !stream.timepoint
%3 = stream.timepoint.await %2 => %result : !stream.resource<external>{%c158597120}
%4 = stream.tensor.export %3 : tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120} -> !hal.buffer_view
util.return %4 : !hal.buffer_view
}
}
// -----// IR Dump After FixedPointIterator (iree-util-fixed-point-iterator) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mfma_layout<F16_16x16x16_F32>, #iree_gpu.mfma_layout<F16_32x32x8_F32>], target_arch = "gfx942", ukernels = "none"}>
#device_target_rocm = #hal.device.target<"rocm", {legacy_sync}, [#executable_target_rocm_hsaco_fb]>
module attributes {hal.device.targets = [#device_target_rocm]} {
stream.executable private @batch_matmul_dispatch_0 {
stream.executable.export public @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
%cst = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>> -> tensor<64x968x1280xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [64, 1280, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>> -> tensor<64x1280x1280xf16>
%5 = tensor.empty() : tensor<64x968x1280xf16>
%6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%7 = linalg.batch_matmul ins(%3, %4 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%6 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : tensor<64x968x1280xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
return
}
}
}
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%c209715200 = arith.constant 209715200 : index
%c158597120 = arith.constant 158597120 : index
%c0 = arith.constant 0 : index
%c1280 = arith.constant 1280 : index
%c968 = arith.constant 968 : index
%c64 = arith.constant 64 : index
%element_type_f16 = hal.element_type<f16> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c64, %c968, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c64, %c1280, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<64x1280x1280xf16> in !stream.resource<external>{%c209715200}
%result, %result_timepoint = stream.resource.alloca uninitialized : !stream.resource<external>{%c158597120} => !stream.timepoint
%2 = stream.cmd.execute await(%result_timepoint) => with(%0 as %arg2: !stream.resource<external>{%c158597120}, %1 as %arg3: !stream.resource<external>{%c209715200}, %result as %arg4: !stream.resource<external>{%c158597120}) {
stream.cmd.dispatch @batch_matmul_dispatch_0::@batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 {
ro %arg2[%c0 for %c158597120] : !stream.resource<external>{%c158597120},
ro %arg3[%c0 for %c209715200] : !stream.resource<external>{%c209715200},
wo %arg4[%c0 for %c158597120] : !stream.resource<external>{%c158597120}
}
} => !stream.timepoint
%3 = stream.timepoint.await %2 => %result : !stream.resource<external>{%c158597120}
%4 = stream.tensor.export %3 : tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120} -> !hal.buffer_view
util.return %4 : !hal.buffer_view
}
}
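// Note: this fixed-point iteration dump is unchanged from the preceding dump;
// iree-util-fixed-point-iterator reruns its nested pass set until the IR stops
// changing, and it has converged here.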
// -----// IR Dump After FuseDispatchBindingsPass (iree-stream-fuse-dispatch-bindings) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mfma_layout<F16_16x16x16_F32>, #iree_gpu.mfma_layout<F16_32x32x8_F32>], target_arch = "gfx942", ukernels = "none"}>
#device_target_rocm = #hal.device.target<"rocm", {legacy_sync}, [#executable_target_rocm_hsaco_fb]>
module attributes {hal.device.targets = [#device_target_rocm]} {
stream.executable private @batch_matmul_dispatch_0 {
stream.executable.export public @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: index, %arg4: index, %arg5: index) {
%cst = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%arg3] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>>
%1 = stream.binding.subspan %arg1[%arg4] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>>
%2 = stream.binding.subspan %arg2[%arg5] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>> -> tensor<64x968x1280xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [64, 1280, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>> -> tensor<64x1280x1280xf16>
%5 = tensor.empty() : tensor<64x968x1280xf16>
%6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%7 = linalg.batch_matmul ins(%3, %4 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%6 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : tensor<64x968x1280xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
return
}
}
}
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%c209715200 = arith.constant 209715200 : index
%c158597120 = arith.constant 158597120 : index
%c0 = arith.constant 0 : index
%c1280 = arith.constant 1280 : index
%c968 = arith.constant 968 : index
%c64 = arith.constant 64 : index
%element_type_f16 = hal.element_type<f16> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c64, %c968, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c64, %c1280, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<64x1280x1280xf16> in !stream.resource<external>{%c209715200}
%result, %result_timepoint = stream.resource.alloca uninitialized : !stream.resource<external>{%c158597120} => !stream.timepoint
%c0_0 = arith.constant 0 : index
%2 = stream.cmd.execute await(%result_timepoint) => with(%0 as %arg2: !stream.resource<external>{%c158597120}, %1 as %arg3: !stream.resource<external>{%c209715200}, %result as %arg4: !stream.resource<external>{%c158597120}) {
stream.cmd.dispatch @batch_matmul_dispatch_0::@batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%c0, %c0, %c0 : index, index, index) {
ro %arg2[%c0_0 for %c158597120] : !stream.resource<external>{%c158597120},
ro %arg3[%c0_0 for %c209715200] : !stream.resource<external>{%c209715200},
wo %arg4[%c0_0 for %c158597120] : !stream.resource<external>{%c158597120}
}
} => !stream.timepoint
%3 = stream.timepoint.await %2 => %result : !stream.resource<external>{%c158597120}
%4 = stream.tensor.export %3 : tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120} -> !hal.buffer_view
util.return %4 : !hal.buffer_view
}
}
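// Note: FuseDispatchBindingsPass rewrote the dispatch interface so that each
// binding's byte offset is passed as an index operand (%arg3-%arg5) and the
// stream.binding.subspan ops index with those operands instead of a constant.
// This lets dispatch sites with different base offsets share one executable;
// here every site passes %c0, so the operands are uniformly zero and get
// folded away again later in the pipeline.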
// -----// IR Dump After AnnotateDispatchArgumentsPass (iree-stream-annotate-dispatch-arguments) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mfma_layout<F16_16x16x16_F32>, #iree_gpu.mfma_layout<F16_32x32x8_F32>], target_arch = "gfx942", ukernels = "none"}>
#device_target_rocm = #hal.device.target<"rocm", {legacy_sync}, [#executable_target_rocm_hsaco_fb]>
module attributes {hal.device.targets = [#device_target_rocm]} {
stream.executable private @batch_matmul_dispatch_0 {
stream.executable.export public @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}, %arg3: index {stream.values = [0 : index]}, %arg4: index {stream.values = [0 : index]}, %arg5: index {stream.values = [0 : index]}) {
%cst = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%arg3] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>>
%1 = stream.binding.subspan %arg1[%arg4] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>>
%2 = stream.binding.subspan %arg2[%arg5] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>> -> tensor<64x968x1280xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [64, 1280, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>> -> tensor<64x1280x1280xf16>
%5 = tensor.empty() : tensor<64x968x1280xf16>
%6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%7 = linalg.batch_matmul ins(%3, %4 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%6 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : tensor<64x968x1280xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
return
}
}
}
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%c209715200 = arith.constant 209715200 : index
%c158597120 = arith.constant 158597120 : index
%c0 = arith.constant 0 : index
%c1280 = arith.constant 1280 : index
%c968 = arith.constant 968 : index
%c64 = arith.constant 64 : index
%element_type_f16 = hal.element_type<f16> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c64, %c968, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c64, %c1280, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<64x1280x1280xf16> in !stream.resource<external>{%c209715200}
%result, %result_timepoint = stream.resource.alloca uninitialized : !stream.resource<external>{%c158597120} => !stream.timepoint
%c0_0 = arith.constant 0 : index
%2 = stream.cmd.execute await(%result_timepoint) => with(%0 as %arg2: !stream.resource<external>{%c158597120}, %1 as %arg3: !stream.resource<external>{%c209715200}, %result as %arg4: !stream.resource<external>{%c158597120}) {
stream.cmd.dispatch @batch_matmul_dispatch_0::@batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%c0, %c0, %c0 : index, index, index) {
ro %arg2[%c0_0 for %c158597120] : !stream.resource<external>{%c158597120},
ro %arg3[%c0_0 for %c209715200] : !stream.resource<external>{%c209715200},
wo %arg4[%c0_0 for %c158597120] : !stream.resource<external>{%c158597120}
}
} => !stream.timepoint
%3 = stream.timepoint.await %2 => %result : !stream.resource<external>{%c158597120}
%4 = stream.tensor.export %3 : tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120} -> !hal.buffer_view
util.return %4 : !hal.buffer_view
}
}
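// Note: AnnotateDispatchArgumentsPass added {stream.alignment = 64 : index}
// to each binding and {stream.values = [0 : index]} to each offset operand,
// recording the values observed across all dispatch sites so later passes can
// prove the operands uniform and fold them.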
// -----// IR Dump After PackDispatchOperandsPass (iree-stream-pack-dispatch-operands) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mfma_layout<F16_16x16x16_F32>, #iree_gpu.mfma_layout<F16_32x32x8_F32>], target_arch = "gfx942", ukernels = "none"}>
#device_target_rocm = #hal.device.target<"rocm", {legacy_sync}, [#executable_target_rocm_hsaco_fb]>
module attributes {hal.device.targets = [#device_target_rocm]} {
stream.executable private @batch_matmul_dispatch_0 {
stream.executable.export public @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}, %arg3: i32, %arg4: i32, %arg5: i32, %arg6: i32, %arg7: i32, %arg8: i32) {
%0 = arith.extui %arg3 : i32 to i64
%1 = arith.extui %arg4 : i32 to i64
%c32_i64 = arith.constant 32 : i64
%2 = arith.shli %1, %c32_i64 : i64
%3 = arith.ori %0, %2 : i64
%4 = arith.index_castui %3 {stream.values = [0 : index]} : i64 to index
%5 = arith.extui %arg5 : i32 to i64
%6 = arith.extui %arg6 : i32 to i64
%c32_i64_0 = arith.constant 32 : i64
%7 = arith.shli %6, %c32_i64_0 : i64
%8 = arith.ori %5, %7 : i64
%9 = arith.index_castui %8 {stream.values = [0 : index]} : i64 to index
%10 = arith.extui %arg7 : i32 to i64
%11 = arith.extui %arg8 : i32 to i64
%c32_i64_1 = arith.constant 32 : i64
%12 = arith.shli %11, %c32_i64_1 : i64
%13 = arith.ori %10, %12 : i64
%14 = arith.index_castui %13 {stream.values = [0 : index]} : i64 to index
%cst = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%15 = stream.binding.subspan %arg0[%4] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>>
%16 = stream.binding.subspan %arg1[%9] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>>
%17 = stream.binding.subspan %arg2[%14] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
%18 = flow.dispatch.tensor.load %15, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>> -> tensor<64x968x1280xf16>
%19 = flow.dispatch.tensor.load %16, offsets = [0, 0, 0], sizes = [64, 1280, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>> -> tensor<64x1280x1280xf16>
%20 = tensor.empty() : tensor<64x968x1280xf16>
%21 = linalg.fill ins(%cst : f16) outs(%20 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%22 = linalg.batch_matmul ins(%18, %19 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%21 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
flow.dispatch.tensor.store %22, %17, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : tensor<64x968x1280xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
return
}
}
}
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%c209715200 = arith.constant 209715200 : index
%c158597120 = arith.constant 158597120 : index
%c0 = arith.constant 0 : index
%c1280 = arith.constant 1280 : index
%c968 = arith.constant 968 : index
%c64 = arith.constant 64 : index
%element_type_f16 = hal.element_type<f16> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c64, %c968, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c64, %c1280, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<64x1280x1280xf16> in !stream.resource<external>{%c209715200}
%result, %result_timepoint = stream.resource.alloca uninitialized : !stream.resource<external>{%c158597120} => !stream.timepoint
%c0_0 = arith.constant 0 : index
%c0_i64 = arith.constant 0 : i64
%c0_i32 = arith.constant 0 : i32
%c32_i64 = arith.constant 32 : i64
%c0_i64_1 = arith.constant 0 : i64
%c0_i32_2 = arith.constant 0 : i32
%c0_i64_3 = arith.constant 0 : i64
%c0_i32_4 = arith.constant 0 : i32
%c32_i64_5 = arith.constant 32 : i64
%c0_i64_6 = arith.constant 0 : i64
%c0_i32_7 = arith.constant 0 : i32
%c0_i64_8 = arith.constant 0 : i64
%c0_i32_9 = arith.constant 0 : i32
%c32_i64_10 = arith.constant 32 : i64
%c0_i64_11 = arith.constant 0 : i64
%c0_i32_12 = arith.constant 0 : i32
%2 = stream.cmd.execute await(%result_timepoint) => with(%0 as %arg2: !stream.resource<external>{%c158597120}, %1 as %arg3: !stream.resource<external>{%c209715200}, %result as %arg4: !stream.resource<external>{%c158597120}) {
stream.cmd.dispatch @batch_matmul_dispatch_0::@batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%c0_i32, %c0_i32_2, %c0_i32_4, %c0_i32_7, %c0_i32_9, %c0_i32_12 : i32, i32, i32, i32, i32, i32) {
ro %arg2[%c0_0 for %c158597120] : !stream.resource<external>{%c158597120},
ro %arg3[%c0_0 for %c209715200] : !stream.resource<external>{%c209715200},
wo %arg4[%c0_0 for %c158597120] : !stream.resource<external>{%c158597120}
}
} => !stream.timepoint
%3 = stream.timepoint.await %2 => %result : !stream.resource<external>{%c158597120}
%4 = stream.tensor.export %3 : tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120} -> !hal.buffer_view
util.return %4 : !hal.buffer_view
}
}
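// Note: PackDispatchOperandsPass lowered each index operand to a pair of i32
// words (lo, hi), since dispatch operands are transported as 32-bit values
// here; the kernel reassembles each 64-bit offset with
// extui/shli/ori/index_castui. A minimal standalone sketch of that
// recombination (hypothetical %lo/%hi operands, not from this module):
//   func.func @unpack_offset(%lo: i32, %hi: i32) -> index {
//     %c32 = arith.constant 32 : i64
//     %lo64 = arith.extui %lo : i32 to i64  // zero-extend low word
//     %hi64 = arith.extui %hi : i32 to i64  // zero-extend high word
//     %sh = arith.shli %hi64, %c32 : i64    // move high word to bits 32..63
//     %or = arith.ori %lo64, %sh : i64      // or the two halves together
//     %off = arith.index_castui %or : i64 to index
//     return %off : index
//   }
// e.g. lo = 64, hi = 1 recombines to 1 * 2^32 + 64 = 4294967360.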
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%c0_i32 = arith.constant 0 : i32
%c209715200 = arith.constant 209715200 : index
%c158597120 = arith.constant 158597120 : index
%c0 = arith.constant 0 : index
%c1280 = arith.constant 1280 : index
%c968 = arith.constant 968 : index
%c64 = arith.constant 64 : index
%element_type_f16 = hal.element_type<f16> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c64, %c968, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c64, %c1280, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<64x1280x1280xf16> in !stream.resource<external>{%c209715200}
%result, %result_timepoint = stream.resource.alloca uninitialized : !stream.resource<external>{%c158597120} => !stream.timepoint
%2 = stream.cmd.execute await(%result_timepoint) => with(%0 as %arg2: !stream.resource<external>{%c158597120}, %1 as %arg3: !stream.resource<external>{%c209715200}, %result as %arg4: !stream.resource<external>{%c158597120}) {
stream.cmd.dispatch @batch_matmul_dispatch_0::@batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%c0_i32, %c0_i32, %c0_i32, %c0_i32, %c0_i32, %c0_i32 : i32, i32, i32, i32, i32, i32) {
ro %arg2[%c0 for %c158597120] : !stream.resource<external>{%c158597120},
ro %arg3[%c0 for %c209715200] : !stream.resource<external>{%c209715200},
wo %arg4[%c0 for %c158597120] : !stream.resource<external>{%c158597120}
}
} => !stream.timepoint
%3 = stream.timepoint.await %2 => %result : !stream.resource<external>{%c158597120}
%4 = stream.tensor.export %3 : tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120} -> !hal.buffer_view
util.return %4 : !hal.buffer_view
}
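// Note: canonicalization collapsed the dead packing constants on the host side
// (the duplicate i64/i32 zeros and the 32-bit shift amounts) down to the
// single %c0_i32 that feeds all six dispatch operands.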
// -----// IR Dump After CSE (cse) //----- //
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%c0_i32 = arith.constant 0 : i32
%c209715200 = arith.constant 209715200 : index
%c158597120 = arith.constant 158597120 : index
%c0 = arith.constant 0 : index
%c1280 = arith.constant 1280 : index
%c968 = arith.constant 968 : index
%c64 = arith.constant 64 : index
%element_type_f16 = hal.element_type<f16> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c64, %c968, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c64, %c1280, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<64x1280x1280xf16> in !stream.resource<external>{%c209715200}
%result, %result_timepoint = stream.resource.alloca uninitialized : !stream.resource<external>{%c158597120} => !stream.timepoint
%2 = stream.cmd.execute await(%result_timepoint) => with(%0 as %arg2: !stream.resource<external>{%c158597120}, %1 as %arg3: !stream.resource<external>{%c209715200}, %result as %arg4: !stream.resource<external>{%c158597120}) {
stream.cmd.dispatch @batch_matmul_dispatch_0::@batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%c0_i32, %c0_i32, %c0_i32, %c0_i32, %c0_i32, %c0_i32 : i32, i32, i32, i32, i32, i32) {
ro %arg2[%c0 for %c158597120] : !stream.resource<external>{%c158597120},
ro %arg3[%c0 for %c209715200] : !stream.resource<external>{%c209715200},
wo %arg4[%c0 for %c158597120] : !stream.resource<external>{%c158597120}
}
} => !stream.timepoint
%3 = stream.timepoint.await %2 => %result : !stream.resource<external>{%c158597120}
%4 = stream.tensor.export %3 : tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120} -> !hal.buffer_view
util.return %4 : !hal.buffer_view
}
// -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- //
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%c0_i32 = arith.constant 0 : i32
%c209715200 = arith.constant 209715200 : index
%c158597120 = arith.constant 158597120 : index
%c0 = arith.constant 0 : index
%c1280 = arith.constant 1280 : index
%c968 = arith.constant 968 : index
%c64 = arith.constant 64 : index
%element_type_f16 = hal.element_type<f16> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c64, %c968, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c64, %c1280, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<64x1280x1280xf16> in !stream.resource<external>{%c209715200}
%result, %result_timepoint = stream.resource.alloca uninitialized : !stream.resource<external>{%c158597120} => !stream.timepoint
%2 = stream.cmd.execute await(%result_timepoint) => with(%0 as %arg2: !stream.resource<external>{%c158597120}, %1 as %arg3: !stream.resource<external>{%c209715200}, %result as %arg4: !stream.resource<external>{%c158597120}) {
stream.cmd.dispatch @batch_matmul_dispatch_0::@batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%c0_i32, %c0_i32, %c0_i32, %c0_i32, %c0_i32, %c0_i32 : i32, i32, i32, i32, i32, i32) {
ro %arg2[%c0 for %c158597120] : !stream.resource<external>{%c158597120},
ro %arg3[%c0 for %c209715200] : !stream.resource<external>{%c209715200},
wo %arg4[%c0 for %c158597120] : !stream.resource<external>{%c158597120}
}
} => !stream.timepoint
%3 = stream.timepoint.await %2 => %result : !stream.resource<external>{%c158597120}
%4 = stream.tensor.export %3 : tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120} -> !hal.buffer_view
util.return %4 : !hal.buffer_view
}
// -----// IR Dump After ApplyPatterns (iree-util-apply-patterns) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mfma_layout<F16_16x16x16_F32>, #iree_gpu.mfma_layout<F16_32x32x8_F32>], target_arch = "gfx942", ukernels = "none"}>
#device_target_rocm = #hal.device.target<"rocm", {legacy_sync}, [#executable_target_rocm_hsaco_fb]>
module attributes {hal.device.targets = [#device_target_rocm]} {
stream.executable private @batch_matmul_dispatch_0 {
stream.executable.export public @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}, %arg3: i32, %arg4: i32, %arg5: i32, %arg6: i32, %arg7: i32, %arg8: i32) {
%cst = arith.constant 0.000000e+00 : f16
%c32_i64 = arith.constant 32 : i64
%0 = arith.extui %arg3 : i32 to i64
%1 = arith.extui %arg4 : i32 to i64
%2 = arith.shli %1, %c32_i64 : i64
%3 = arith.ori %0, %2 : i64
%4 = arith.index_castui %3 {stream.values = [0 : index]} : i64 to index
%5 = arith.extui %arg5 : i32 to i64
%6 = arith.extui %arg6 : i32 to i64
%7 = arith.shli %6, %c32_i64 : i64
%8 = arith.ori %5, %7 : i64
%9 = arith.index_castui %8 {stream.values = [0 : index]} : i64 to index
%10 = arith.extui %arg7 : i32 to i64
%11 = arith.extui %arg8 : i32 to i64
%12 = arith.shli %11, %c32_i64 : i64
%13 = arith.ori %10, %12 : i64
%14 = arith.index_castui %13 {stream.values = [0 : index]} : i64 to index
%15 = stream.binding.subspan %arg0[%4] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>>
%16 = stream.binding.subspan %arg1[%9] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>>
%17 = stream.binding.subspan %arg2[%14] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
%18 = flow.dispatch.tensor.load %15, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>> -> tensor<64x968x1280xf16>
%19 = flow.dispatch.tensor.load %16, offsets = [0, 0, 0], sizes = [64, 1280, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>> -> tensor<64x1280x1280xf16>
%20 = tensor.empty() : tensor<64x968x1280xf16>
%21 = linalg.fill ins(%cst : f16) outs(%20 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%22 = linalg.batch_matmul ins(%18, %19 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%21 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
flow.dispatch.tensor.store %22, %17, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : tensor<64x968x1280xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
return
}
}
}
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%c0_i32 = arith.constant 0 : i32
%c209715200 = arith.constant 209715200 : index
%c158597120 = arith.constant 158597120 : index
%c0 = arith.constant 0 : index
%c1280 = arith.constant 1280 : index
%c968 = arith.constant 968 : index
%c64 = arith.constant 64 : index
%element_type_f16 = hal.element_type<f16> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c64, %c968, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c64, %c1280, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<64x1280x1280xf16> in !stream.resource<external>{%c209715200}
%result, %result_timepoint = stream.resource.alloca uninitialized : !stream.resource<external>{%c158597120} => !stream.timepoint
%2 = stream.cmd.execute await(%result_timepoint) => with(%0 as %arg2: !stream.resource<external>{%c158597120}, %1 as %arg3: !stream.resource<external>{%c209715200}, %result as %arg4: !stream.resource<external>{%c158597120}) {
stream.cmd.dispatch @batch_matmul_dispatch_0::@batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%c0_i32, %c0_i32, %c0_i32, %c0_i32, %c0_i32, %c0_i32 : i32, i32, i32, i32, i32, i32) {
ro %arg2[%c0 for %c158597120] : !stream.resource<external>{%c158597120},
ro %arg3[%c0 for %c209715200] : !stream.resource<external>{%c209715200},
wo %arg4[%c0 for %c158597120] : !stream.resource<external>{%c158597120}
}
} => !stream.timepoint
%3 = stream.timepoint.await %2 => %result : !stream.resource<external>{%c158597120}
%4 = stream.tensor.export %3 : tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120} -> !hal.buffer_view
util.return %4 : !hal.buffer_view
}
}
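// Note: iree-util-apply-patterns uniqued the repeated `arith.constant 32 : i64`
// into one %c32_i64 and hoisted the constants to the top of the dispatch
// function; the host function is otherwise unchanged.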
// -----// IR Dump After FoldGlobals (iree-util-fold-globals) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mfma_layout<F16_16x16x16_F32>, #iree_gpu.mfma_layout<F16_32x32x8_F32>], target_arch = "gfx942", ukernels = "none"}>
#device_target_rocm = #hal.device.target<"rocm", {legacy_sync}, [#executable_target_rocm_hsaco_fb]>
module attributes {hal.device.targets = [#device_target_rocm]} {
stream.executable private @batch_matmul_dispatch_0 {
stream.executable.export public @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}, %arg3: i32, %arg4: i32, %arg5: i32, %arg6: i32, %arg7: i32, %arg8: i32) {
%cst = arith.constant 0.000000e+00 : f16
%c32_i64 = arith.constant 32 : i64
%0 = arith.extui %arg3 : i32 to i64
%1 = arith.extui %arg4 : i32 to i64
%2 = arith.shli %1, %c32_i64 : i64
%3 = arith.ori %0, %2 : i64
%4 = arith.index_castui %3 {stream.values = [0 : index]} : i64 to index
%5 = arith.extui %arg5 : i32 to i64
%6 = arith.extui %arg6 : i32 to i64
%7 = arith.shli %6, %c32_i64 : i64
%8 = arith.ori %5, %7 : i64
%9 = arith.index_castui %8 {stream.values = [0 : index]} : i64 to index
%10 = arith.extui %arg7 : i32 to i64
%11 = arith.extui %arg8 : i32 to i64
%12 = arith.shli %11, %c32_i64 : i64
%13 = arith.ori %10, %12 : i64
%14 = arith.index_castui %13 {stream.values = [0 : index]} : i64 to index
%15 = stream.binding.subspan %arg0[%4] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>>
%16 = stream.binding.subspan %arg1[%9] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>>
%17 = stream.binding.subspan %arg2[%14] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
%18 = flow.dispatch.tensor.load %15, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>> -> tensor<64x968x1280xf16>
%19 = flow.dispatch.tensor.load %16, offsets = [0, 0, 0], sizes = [64, 1280, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>> -> tensor<64x1280x1280xf16>
%20 = tensor.empty() : tensor<64x968x1280xf16>
%21 = linalg.fill ins(%cst : f16) outs(%20 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%22 = linalg.batch_matmul ins(%18, %19 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%21 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
flow.dispatch.tensor.store %22, %17, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : tensor<64x968x1280xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
return
}
}
}
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%c0_i32 = arith.constant 0 : i32
%c209715200 = arith.constant 209715200 : index
%c158597120 = arith.constant 158597120 : index
%c0 = arith.constant 0 : index
%c1280 = arith.constant 1280 : index
%c968 = arith.constant 968 : index
%c64 = arith.constant 64 : index
%element_type_f16 = hal.element_type<f16> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c64, %c968, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c64, %c1280, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<64x1280x1280xf16> in !stream.resource<external>{%c209715200}
%result, %result_timepoint = stream.resource.alloca uninitialized : !stream.resource<external>{%c158597120} => !stream.timepoint
%2 = stream.cmd.execute await(%result_timepoint) => with(%0 as %arg2: !stream.resource<external>{%c158597120}, %1 as %arg3: !stream.resource<external>{%c209715200}, %result as %arg4: !stream.resource<external>{%c158597120}) {
stream.cmd.dispatch @batch_matmul_dispatch_0::@batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%c0_i32, %c0_i32, %c0_i32, %c0_i32, %c0_i32, %c0_i32 : i32, i32, i32, i32, i32, i32) {
ro %arg2[%c0 for %c158597120] : !stream.resource<external>{%c158597120},
ro %arg3[%c0 for %c209715200] : !stream.resource<external>{%c209715200},
wo %arg4[%c0 for %c158597120] : !stream.resource<external>{%c158597120}
}
} => !stream.timepoint
%3 = stream.timepoint.await %2 => %result : !stream.resource<external>{%c158597120}
%4 = stream.tensor.export %3 : tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120} -> !hal.buffer_view
util.return %4 : !hal.buffer_view
}
}
// -----// IR Dump After FuseGlobals (iree-util-fuse-globals) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mfma_layout<F16_16x16x16_F32>, #iree_gpu.mfma_layout<F16_32x32x8_F32>], target_arch = "gfx942", ukernels = "none"}>
#device_target_rocm = #hal.device.target<"rocm", {legacy_sync}, [#executable_target_rocm_hsaco_fb]>
module attributes {hal.device.targets = [#device_target_rocm]} {
stream.executable private @batch_matmul_dispatch_0 {
stream.executable.export public @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}, %arg3: i32, %arg4: i32, %arg5: i32, %arg6: i32, %arg7: i32, %arg8: i32) {
%cst = arith.constant 0.000000e+00 : f16
%c32_i64 = arith.constant 32 : i64
%0 = arith.extui %arg3 : i32 to i64
%1 = arith.extui %arg4 : i32 to i64
%2 = arith.shli %1, %c32_i64 : i64
%3 = arith.ori %0, %2 : i64
%4 = arith.index_castui %3 {stream.values = [0 : index]} : i64 to index
%5 = arith.extui %arg5 : i32 to i64
%6 = arith.extui %arg6 : i32 to i64
%7 = arith.shli %6, %c32_i64 : i64
%8 = arith.ori %5, %7 : i64
%9 = arith.index_castui %8 {stream.values = [0 : index]} : i64 to index
%10 = arith.extui %arg7 : i32 to i64
%11 = arith.extui %arg8 : i32 to i64
%12 = arith.shli %11, %c32_i64 : i64
%13 = arith.ori %10, %12 : i64
%14 = arith.index_castui %13 {stream.values = [0 : index]} : i64 to index
%15 = stream.binding.subspan %arg0[%4] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>>
%16 = stream.binding.subspan %arg1[%9] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>>
%17 = stream.binding.subspan %arg2[%14] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
%18 = flow.dispatch.tensor.load %15, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>> -> tensor<64x968x1280xf16>
%19 = flow.dispatch.tensor.load %16, offsets = [0, 0, 0], sizes = [64, 1280, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>> -> tensor<64x1280x1280xf16>
%20 = tensor.empty() : tensor<64x968x1280xf16>
%21 = linalg.fill ins(%cst : f16) outs(%20 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%22 = linalg.batch_matmul ins(%18, %19 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%21 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
flow.dispatch.tensor.store %22, %17, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : tensor<64x968x1280xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
return
}
}
}
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%c0_i32 = arith.constant 0 : i32
%c209715200 = arith.constant 209715200 : index
%c158597120 = arith.constant 158597120 : index
%c0 = arith.constant 0 : index
%c1280 = arith.constant 1280 : index
%c968 = arith.constant 968 : index
%c64 = arith.constant 64 : index
%element_type_f16 = hal.element_type<f16> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c64, %c968, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c64, %c1280, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<64x1280x1280xf16> in !stream.resource<external>{%c209715200}
%result, %result_timepoint = stream.resource.alloca uninitialized : !stream.resource<external>{%c158597120} => !stream.timepoint
%2 = stream.cmd.execute await(%result_timepoint) => with(%0 as %arg2: !stream.resource<external>{%c158597120}, %1 as %arg3: !stream.resource<external>{%c209715200}, %result as %arg4: !stream.resource<external>{%c158597120}) {
stream.cmd.dispatch @batch_matmul_dispatch_0::@batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%c0_i32, %c0_i32, %c0_i32, %c0_i32, %c0_i32, %c0_i32 : i32, i32, i32, i32, i32, i32) {
ro %arg2[%c0 for %c158597120] : !stream.resource<external>{%c158597120},
ro %arg3[%c0 for %c209715200] : !stream.resource<external>{%c209715200},
wo %arg4[%c0 for %c158597120] : !stream.resource<external>{%c158597120}
}
} => !stream.timepoint
%3 = stream.timepoint.await %2 => %result : !stream.resource<external>{%c158597120}
%4 = stream.tensor.export %3 : tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120} -> !hal.buffer_view
util.return %4 : !hal.buffer_view
}
}
// -----// IR Dump After IPO (iree-util-ipo) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mfma_layout<F16_16x16x16_F32>, #iree_gpu.mfma_layout<F16_32x32x8_F32>], target_arch = "gfx942", ukernels = "none"}>
#device_target_rocm = #hal.device.target<"rocm", {legacy_sync}, [#executable_target_rocm_hsaco_fb]>
module attributes {hal.device.targets = [#device_target_rocm]} {
stream.executable private @batch_matmul_dispatch_0 {
stream.executable.export public @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}, %arg3: i32, %arg4: i32, %arg5: i32, %arg6: i32, %arg7: i32, %arg8: i32) {
%cst = arith.constant 0.000000e+00 : f16
%c32_i64 = arith.constant 32 : i64
%0 = arith.extui %arg3 : i32 to i64
%1 = arith.extui %arg4 : i32 to i64
%2 = arith.shli %1, %c32_i64 : i64
%3 = arith.ori %0, %2 : i64
%4 = arith.index_castui %3 {stream.values = [0 : index]} : i64 to index
%5 = arith.extui %arg5 : i32 to i64
%6 = arith.extui %arg6 : i32 to i64
%7 = arith.shli %6, %c32_i64 : i64
%8 = arith.ori %5, %7 : i64
%9 = arith.index_castui %8 {stream.values = [0 : index]} : i64 to index
%10 = arith.extui %arg7 : i32 to i64
%11 = arith.extui %arg8 : i32 to i64
%12 = arith.shli %11, %c32_i64 : i64
%13 = arith.ori %10, %12 : i64
%14 = arith.index_castui %13 {stream.values = [0 : index]} : i64 to index
%15 = stream.binding.subspan %arg0[%4] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>>
%16 = stream.binding.subspan %arg1[%9] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>>
%17 = stream.binding.subspan %arg2[%14] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
%18 = flow.dispatch.tensor.load %15, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>> -> tensor<64x968x1280xf16>
%19 = flow.dispatch.tensor.load %16, offsets = [0, 0, 0], sizes = [64, 1280, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>> -> tensor<64x1280x1280xf16>
%20 = tensor.empty() : tensor<64x968x1280xf16>
%21 = linalg.fill ins(%cst : f16) outs(%20 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%22 = linalg.batch_matmul ins(%18, %19 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%21 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
flow.dispatch.tensor.store %22, %17, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : tensor<64x968x1280xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
return
}
}
}
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%c0_i32 = arith.constant 0 : i32
%c209715200 = arith.constant 209715200 : index
%c158597120 = arith.constant 158597120 : index
%c0 = arith.constant 0 : index
%c1280 = arith.constant 1280 : index
%c968 = arith.constant 968 : index
%c64 = arith.constant 64 : index
%element_type_f16 = hal.element_type<f16> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c64, %c968, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c64, %c1280, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<64x1280x1280xf16> in !stream.resource<external>{%c209715200}
%result, %result_timepoint = stream.resource.alloca uninitialized : !stream.resource<external>{%c158597120} => !stream.timepoint
%2 = stream.cmd.execute await(%result_timepoint) => with(%0 as %arg2: !stream.resource<external>{%c158597120}, %1 as %arg3: !stream.resource<external>{%c209715200}, %result as %arg4: !stream.resource<external>{%c158597120}) {
stream.cmd.dispatch @batch_matmul_dispatch_0::@batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%c0_i32, %c0_i32, %c0_i32, %c0_i32, %c0_i32, %c0_i32 : i32, i32, i32, i32, i32, i32) {
ro %arg2[%c0 for %c158597120] : !stream.resource<external>{%c158597120},
ro %arg3[%c0 for %c209715200] : !stream.resource<external>{%c209715200},
wo %arg4[%c0 for %c158597120] : !stream.resource<external>{%c158597120}
}
} => !stream.timepoint
%3 = stream.timepoint.await %2 => %result : !stream.resource<external>{%c158597120}
%4 = stream.tensor.export %3 : tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120} -> !hal.buffer_view
util.return %4 : !hal.buffer_view
}
}
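// Note: the FoldGlobals, FuseGlobals, and IPO dumps above are identical; this
// module has no util.global ops and only the one public function, so none of
// the three passes found anything to rewrite.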
// -----// IR Dump After FoldUniformOperandsPass (iree-stream-fold-uniform-operands) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mfma_layout<F16_16x16x16_F32>, #iree_gpu.mfma_layout<F16_32x32x8_F32>], target_arch = "gfx942", ukernels = "none"}>
#device_target_rocm = #hal.device.target<"rocm", {legacy_sync}, [#executable_target_rocm_hsaco_fb]>
module attributes {hal.device.targets = [#device_target_rocm]} {
stream.executable private @batch_matmul_dispatch_0 {
stream.executable.export public @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}) {
%c0_i32 = arith.constant 0 : i32
%cst = arith.constant 0.000000e+00 : f16
%c32_i64 = arith.constant 32 : i64
%0 = arith.extui %c0_i32 : i32 to i64
%1 = arith.extui %c0_i32 : i32 to i64
%2 = arith.shli %1, %c32_i64 : i64
%3 = arith.ori %0, %2 : i64
%4 = arith.index_castui %3 {stream.values = [0 : index]} : i64 to index
%5 = arith.extui %c0_i32 : i32 to i64
%6 = arith.extui %c0_i32 : i32 to i64
%7 = arith.shli %6, %c32_i64 : i64
%8 = arith.ori %5, %7 : i64
%9 = arith.index_castui %8 {stream.values = [0 : index]} : i64 to index
%10 = arith.extui %c0_i32 : i32 to i64
%11 = arith.extui %c0_i32 : i32 to i64
%12 = arith.shli %11, %c32_i64 : i64
%13 = arith.ori %10, %12 : i64
%14 = arith.index_castui %13 {stream.values = [0 : index]} : i64 to index
%15 = stream.binding.subspan %arg0[%4] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>>
%16 = stream.binding.subspan %arg1[%9] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>>
%17 = stream.binding.subspan %arg2[%14] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
%18 = flow.dispatch.tensor.load %15, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>> -> tensor<64x968x1280xf16>
%19 = flow.dispatch.tensor.load %16, offsets = [0, 0, 0], sizes = [64, 1280, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>> -> tensor<64x1280x1280xf16>
%20 = tensor.empty() : tensor<64x968x1280xf16>
%21 = linalg.fill ins(%cst : f16) outs(%20 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%22 = linalg.batch_matmul ins(%18, %19 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%21 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
flow.dispatch.tensor.store %22, %17, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : tensor<64x968x1280xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
return
}
}
}
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%c0_i32 = arith.constant 0 : i32
%c209715200 = arith.constant 209715200 : index
%c158597120 = arith.constant 158597120 : index
%c0 = arith.constant 0 : index
%c1280 = arith.constant 1280 : index
%c968 = arith.constant 968 : index
%c64 = arith.constant 64 : index
%element_type_f16 = hal.element_type<f16> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c64, %c968, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c64, %c1280, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<64x1280x1280xf16> in !stream.resource<external>{%c209715200}
%result, %result_timepoint = stream.resource.alloca uninitialized : !stream.resource<external>{%c158597120} => !stream.timepoint
%2 = stream.cmd.execute await(%result_timepoint) => with(%0 as %arg2: !stream.resource<external>{%c158597120}, %1 as %arg3: !stream.resource<external>{%c209715200}, %result as %arg4: !stream.resource<external>{%c158597120}) {
stream.cmd.dispatch @batch_matmul_dispatch_0::@batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 {
ro %arg2[%c0 for %c158597120] : !stream.resource<external>{%c158597120},
ro %arg3[%c0 for %c209715200] : !stream.resource<external>{%c209715200},
wo %arg4[%c0 for %c158597120] : !stream.resource<external>{%c158597120}
}
} => !stream.timepoint
%3 = stream.timepoint.await %2 => %result : !stream.resource<external>{%c158597120}
%4 = stream.tensor.export %3 : tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120} -> !hal.buffer_view
util.return %4 : !hal.buffer_view
}
}
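// Note: FoldUniformOperandsPass saw that every dispatch site passes the same
// six zero words, so it inlined %c0_i32 into the executable and stripped the
// operands from both the func signature and the stream.cmd.dispatch site. The
// now-constant extui/shli/ori chains are left for the following
// canonicalize/CSE to fold, after which the subspan offsets reduce to %c0.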
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%c209715200 = arith.constant 209715200 : index
%c158597120 = arith.constant 158597120 : index
%c0 = arith.constant 0 : index
%c1280 = arith.constant 1280 : index
%c968 = arith.constant 968 : index
%c64 = arith.constant 64 : index
%element_type_f16 = hal.element_type<f16> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c64, %c968, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c64, %c1280, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<64x1280x1280xf16> in !stream.resource<external>{%c209715200}
%result, %result_timepoint = stream.resource.alloca uninitialized : !stream.resource<external>{%c158597120} => !stream.timepoint
%2 = stream.cmd.execute await(%result_timepoint) => with(%0 as %arg2: !stream.resource<external>{%c158597120}, %1 as %arg3: !stream.resource<external>{%c209715200}, %result as %arg4: !stream.resource<external>{%c158597120}) {
stream.cmd.dispatch @batch_matmul_dispatch_0::@batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 {
ro %arg2[%c0 for %c158597120] : !stream.resource<external>{%c158597120},
ro %arg3[%c0 for %c209715200] : !stream.resource<external>{%c209715200},
wo %arg4[%c0 for %c158597120] : !stream.resource<external>{%c158597120}
}
} => !stream.timepoint
%3 = stream.timepoint.await %2 => %result : !stream.resource<external>{%c158597120}
%4 = stream.tensor.export %3 : tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120} -> !hal.buffer_view
util.return %4 : !hal.buffer_view
}
// -----// IR Dump After CSE (cse) //----- //
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%c209715200 = arith.constant 209715200 : index
%c158597120 = arith.constant 158597120 : index
%c0 = arith.constant 0 : index
%c1280 = arith.constant 1280 : index
%c968 = arith.constant 968 : index
%c64 = arith.constant 64 : index
%element_type_f16 = hal.element_type<f16> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c64, %c968, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c64, %c1280, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<64x1280x1280xf16> in !stream.resource<external>{%c209715200}
%result, %result_timepoint = stream.resource.alloca uninitialized : !stream.resource<external>{%c158597120} => !stream.timepoint
%2 = stream.cmd.execute await(%result_timepoint) => with(%0 as %arg2: !stream.resource<external>{%c158597120}, %1 as %arg3: !stream.resource<external>{%c209715200}, %result as %arg4: !stream.resource<external>{%c158597120}) {
stream.cmd.dispatch @batch_matmul_dispatch_0::@batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 {
ro %arg2[%c0 for %c158597120] : !stream.resource<external>{%c158597120},
ro %arg3[%c0 for %c209715200] : !stream.resource<external>{%c209715200},
wo %arg4[%c0 for %c158597120] : !stream.resource<external>{%c158597120}
}
} => !stream.timepoint
%3 = stream.timepoint.await %2 => %result : !stream.resource<external>{%c158597120}
%4 = stream.tensor.export %3 : tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120} -> !hal.buffer_view
util.return %4 : !hal.buffer_view
}
// -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- //
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%c209715200 = arith.constant 209715200 : index
%c158597120 = arith.constant 158597120 : index
%c0 = arith.constant 0 : index
%c1280 = arith.constant 1280 : index
%c968 = arith.constant 968 : index
%c64 = arith.constant 64 : index
%element_type_f16 = hal.element_type<f16> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c64, %c968, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c64, %c1280, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<64x1280x1280xf16> in !stream.resource<external>{%c209715200}
%result, %result_timepoint = stream.resource.alloca uninitialized : !stream.resource<external>{%c158597120} => !stream.timepoint
%2 = stream.cmd.execute await(%result_timepoint) => with(%0 as %arg2: !stream.resource<external>{%c158597120}, %1 as %arg3: !stream.resource<external>{%c209715200}, %result as %arg4: !stream.resource<external>{%c158597120}) {
stream.cmd.dispatch @batch_matmul_dispatch_0::@batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 {
ro %arg2[%c0 for %c158597120] : !stream.resource<external>{%c158597120},
ro %arg3[%c0 for %c209715200] : !stream.resource<external>{%c209715200},
wo %arg4[%c0 for %c158597120] : !stream.resource<external>{%c158597120}
}
} => !stream.timepoint
%3 = stream.timepoint.await %2 => %result : !stream.resource<external>{%c158597120}
%4 = stream.tensor.export %3 : tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120} -> !hal.buffer_view
util.return %4 : !hal.buffer_view
}
// -----// IR Dump After ApplyPatterns (iree-util-apply-patterns) //----- //
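// Note: iree-util-apply-patterns greedily applies a fixed set of folding and cleanup
// patterns across the module. From here on the dumps print the whole module
// (module-scoped passes), including the stream.executable formed earlier for the
// batch_matmul dispatch.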
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mfma_layout<F16_16x16x16_F32>, #iree_gpu.mfma_layout<F16_32x32x8_F32>], target_arch = "gfx942", ukernels = "none"}>
#device_target_rocm = #hal.device.target<"rocm", {legacy_sync}, [#executable_target_rocm_hsaco_fb]>
module attributes {hal.device.targets = [#device_target_rocm]} {
stream.executable private @batch_matmul_dispatch_0 {
stream.executable.export public @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}) {
%cst = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>> -> tensor<64x968x1280xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [64, 1280, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>> -> tensor<64x1280x1280xf16>
%5 = tensor.empty() : tensor<64x968x1280xf16>
%6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%7 = linalg.batch_matmul ins(%3, %4 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%6 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : tensor<64x968x1280xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
return
}
}
}
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%c209715200 = arith.constant 209715200 : index
%c158597120 = arith.constant 158597120 : index
%c0 = arith.constant 0 : index
%c1280 = arith.constant 1280 : index
%c968 = arith.constant 968 : index
%c64 = arith.constant 64 : index
%element_type_f16 = hal.element_type<f16> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c64, %c968, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c64, %c1280, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<64x1280x1280xf16> in !stream.resource<external>{%c209715200}
%result, %result_timepoint = stream.resource.alloca uninitialized : !stream.resource<external>{%c158597120} => !stream.timepoint
%2 = stream.cmd.execute await(%result_timepoint) => with(%0 as %arg2: !stream.resource<external>{%c158597120}, %1 as %arg3: !stream.resource<external>{%c209715200}, %result as %arg4: !stream.resource<external>{%c158597120}) {
stream.cmd.dispatch @batch_matmul_dispatch_0::@batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 {
ro %arg2[%c0 for %c158597120] : !stream.resource<external>{%c158597120},
ro %arg3[%c0 for %c209715200] : !stream.resource<external>{%c209715200},
wo %arg4[%c0 for %c158597120] : !stream.resource<external>{%c158597120}
}
} => !stream.timepoint
%3 = stream.timepoint.await %2 => %result : !stream.resource<external>{%c158597120}
%4 = stream.tensor.export %3 : tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120} -> !hal.buffer_view
util.return %4 : !hal.buffer_view
}
}
// -----// IR Dump After FoldGlobals (iree-util-fold-globals) //----- //
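// Note: iree-util-fold-globals inlines constant util.globals into their uses and drops
// dead ones; with no globals present the IR is unchanged.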
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mfma_layout<F16_16x16x16_F32>, #iree_gpu.mfma_layout<F16_32x32x8_F32>], target_arch = "gfx942", ukernels = "none"}>
#device_target_rocm = #hal.device.target<"rocm", {legacy_sync}, [#executable_target_rocm_hsaco_fb]>
module attributes {hal.device.targets = [#device_target_rocm]} {
stream.executable private @batch_matmul_dispatch_0 {
stream.executable.export public @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}) {
%cst = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>> -> tensor<64x968x1280xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [64, 1280, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>> -> tensor<64x1280x1280xf16>
%5 = tensor.empty() : tensor<64x968x1280xf16>
%6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%7 = linalg.batch_matmul ins(%3, %4 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%6 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : tensor<64x968x1280xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
return
}
}
}
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%c209715200 = arith.constant 209715200 : index
%c158597120 = arith.constant 158597120 : index
%c0 = arith.constant 0 : index
%c1280 = arith.constant 1280 : index
%c968 = arith.constant 968 : index
%c64 = arith.constant 64 : index
%element_type_f16 = hal.element_type<f16> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c64, %c968, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c64, %c1280, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<64x1280x1280xf16> in !stream.resource<external>{%c209715200}
%result, %result_timepoint = stream.resource.alloca uninitialized : !stream.resource<external>{%c158597120} => !stream.timepoint
%2 = stream.cmd.execute await(%result_timepoint) => with(%0 as %arg2: !stream.resource<external>{%c158597120}, %1 as %arg3: !stream.resource<external>{%c209715200}, %result as %arg4: !stream.resource<external>{%c158597120}) {
stream.cmd.dispatch @batch_matmul_dispatch_0::@batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 {
ro %arg2[%c0 for %c158597120] : !stream.resource<external>{%c158597120},
ro %arg3[%c0 for %c209715200] : !stream.resource<external>{%c209715200},
wo %arg4[%c0 for %c158597120] : !stream.resource<external>{%c158597120}
}
} => !stream.timepoint
%3 = stream.timepoint.await %2 => %result : !stream.resource<external>{%c158597120}
%4 = stream.tensor.export %3 : tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120} -> !hal.buffer_view
util.return %4 : !hal.buffer_view
}
}
// -----// IR Dump After FuseGlobals (iree-util-fuse-globals) //----- //
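// Note: iree-util-fuse-globals merges util.globals that are known to always hold the
// same value; again a no-op for this global-free module.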
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mfma_layout<F16_16x16x16_F32>, #iree_gpu.mfma_layout<F16_32x32x8_F32>], target_arch = "gfx942", ukernels = "none"}>
#device_target_rocm = #hal.device.target<"rocm", {legacy_sync}, [#executable_target_rocm_hsaco_fb]>
module attributes {hal.device.targets = [#device_target_rocm]} {
stream.executable private @batch_matmul_dispatch_0 {
stream.executable.export public @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}) {
%cst = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>> -> tensor<64x968x1280xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [64, 1280, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>> -> tensor<64x1280x1280xf16>
%5 = tensor.empty() : tensor<64x968x1280xf16>
%6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%7 = linalg.batch_matmul ins(%3, %4 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%6 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : tensor<64x968x1280xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
return
}
}
}
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%c209715200 = arith.constant 209715200 : index
%c158597120 = arith.constant 158597120 : index
%c0 = arith.constant 0 : index
%c1280 = arith.constant 1280 : index
%c968 = arith.constant 968 : index
%c64 = arith.constant 64 : index
%element_type_f16 = hal.element_type<f16> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c64, %c968, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c64, %c1280, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<64x1280x1280xf16> in !stream.resource<external>{%c209715200}
%result, %result_timepoint = stream.resource.alloca uninitialized : !stream.resource<external>{%c158597120} => !stream.timepoint
%2 = stream.cmd.execute await(%result_timepoint) => with(%0 as %arg2: !stream.resource<external>{%c158597120}, %1 as %arg3: !stream.resource<external>{%c209715200}, %result as %arg4: !stream.resource<external>{%c158597120}) {
stream.cmd.dispatch @batch_matmul_dispatch_0::@batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 {
ro %arg2[%c0 for %c158597120] : !stream.resource<external>{%c158597120},
ro %arg3[%c0 for %c209715200] : !stream.resource<external>{%c209715200},
wo %arg4[%c0 for %c158597120] : !stream.resource<external>{%c158597120}
}
} => !stream.timepoint
%3 = stream.timepoint.await %2 => %result : !stream.resource<external>{%c158597120}
%4 = stream.tensor.export %3 : tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120} -> !hal.buffer_view
util.return %4 : !hal.buffer_view
}
}
// -----// IR Dump After IPO (iree-util-ipo) //----- //
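// Note: iree-util-ipo performs interprocedural optimization across util.func calls
// (e.g. propagating constant arguments/results and dropping unused ones). With a single
// public entry point and no internal callees there is nothing to propagate.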
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mfma_layout<F16_16x16x16_F32>, #iree_gpu.mfma_layout<F16_32x32x8_F32>], target_arch = "gfx942", ukernels = "none"}>
#device_target_rocm = #hal.device.target<"rocm", {legacy_sync}, [#executable_target_rocm_hsaco_fb]>
module attributes {hal.device.targets = [#device_target_rocm]} {
stream.executable private @batch_matmul_dispatch_0 {
stream.executable.export public @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}) {
%cst = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>> -> tensor<64x968x1280xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [64, 1280, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>> -> tensor<64x1280x1280xf16>
%5 = tensor.empty() : tensor<64x968x1280xf16>
%6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%7 = linalg.batch_matmul ins(%3, %4 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%6 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : tensor<64x968x1280xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
return
}
}
}
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%c209715200 = arith.constant 209715200 : index
%c158597120 = arith.constant 158597120 : index
%c0 = arith.constant 0 : index
%c1280 = arith.constant 1280 : index
%c968 = arith.constant 968 : index
%c64 = arith.constant 64 : index
%element_type_f16 = hal.element_type<f16> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c64, %c968, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c64, %c1280, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<64x1280x1280xf16> in !stream.resource<external>{%c209715200}
%result, %result_timepoint = stream.resource.alloca uninitialized : !stream.resource<external>{%c158597120} => !stream.timepoint
%2 = stream.cmd.execute await(%result_timepoint) => with(%0 as %arg2: !stream.resource<external>{%c158597120}, %1 as %arg3: !stream.resource<external>{%c209715200}, %result as %arg4: !stream.resource<external>{%c158597120}) {
stream.cmd.dispatch @batch_matmul_dispatch_0::@batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 {
ro %arg2[%c0 for %c158597120] : !stream.resource<external>{%c158597120},
ro %arg3[%c0 for %c209715200] : !stream.resource<external>{%c209715200},
wo %arg4[%c0 for %c158597120] : !stream.resource<external>{%c158597120}
}
} => !stream.timepoint
%3 = stream.timepoint.await %2 => %result : !stream.resource<external>{%c158597120}
%4 = stream.tensor.export %3 : tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120} -> !hal.buffer_view
util.return %4 : !hal.buffer_view
}
}
// -----// IR Dump After SymbolDCE (symbol-dce) //----- //
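// Note: symbol-dce removes symbols not transitively reachable from public symbols.
// @batch_matmul_dispatch_0 is private but survives because the stream.cmd.dispatch in
// @batch_matmul references it.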
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mfma_layout<F16_16x16x16_F32>, #iree_gpu.mfma_layout<F16_32x32x8_F32>], target_arch = "gfx942", ukernels = "none"}>
#device_target_rocm = #hal.device.target<"rocm", {legacy_sync}, [#executable_target_rocm_hsaco_fb]>
module attributes {hal.device.targets = [#device_target_rocm]} {
stream.executable private @batch_matmul_dispatch_0 {
stream.executable.export public @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}) {
%cst = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>> -> tensor<64x968x1280xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [64, 1280, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>> -> tensor<64x1280x1280xf16>
%5 = tensor.empty() : tensor<64x968x1280xf16>
%6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%7 = linalg.batch_matmul ins(%3, %4 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%6 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : tensor<64x968x1280xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
return
}
}
}
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%c209715200 = arith.constant 209715200 : index
%c158597120 = arith.constant 158597120 : index
%c0 = arith.constant 0 : index
%c1280 = arith.constant 1280 : index
%c968 = arith.constant 968 : index
%c64 = arith.constant 64 : index
%element_type_f16 = hal.element_type<f16> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c64, %c968, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c64, %c1280, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<64x1280x1280xf16> in !stream.resource<external>{%c209715200}
%result, %result_timepoint = stream.resource.alloca uninitialized : !stream.resource<external>{%c158597120} => !stream.timepoint
%2 = stream.cmd.execute await(%result_timepoint) => with(%0 as %arg2: !stream.resource<external>{%c158597120}, %1 as %arg3: !stream.resource<external>{%c209715200}, %result as %arg4: !stream.resource<external>{%c158597120}) {
stream.cmd.dispatch @batch_matmul_dispatch_0::@batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 {
ro %arg2[%c0 for %c158597120] : !stream.resource<external>{%c158597120},
ro %arg3[%c0 for %c209715200] : !stream.resource<external>{%c209715200},
wo %arg4[%c0 for %c158597120] : !stream.resource<external>{%c158597120}
}
} => !stream.timepoint
%3 = stream.timepoint.await %2 => %result : !stream.resource<external>{%c158597120}
%4 = stream.tensor.export %3 : tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120} -> !hal.buffer_view
util.return %4 : !hal.buffer_view
}
}
// -----// IR Dump After CSE (cse) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mfma_layout<F16_16x16x16_F32>, #iree_gpu.mfma_layout<F16_32x32x8_F32>], target_arch = "gfx942", ukernels = "none"}>
#device_target_rocm = #hal.device.target<"rocm", {legacy_sync}, [#executable_target_rocm_hsaco_fb]>
module attributes {hal.device.targets = [#device_target_rocm]} {
stream.executable private @batch_matmul_dispatch_0 {
stream.executable.export public @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}) {
%cst = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>> -> tensor<64x968x1280xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [64, 1280, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>> -> tensor<64x1280x1280xf16>
%5 = tensor.empty() : tensor<64x968x1280xf16>
%6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%7 = linalg.batch_matmul ins(%3, %4 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%6 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : tensor<64x968x1280xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
return
}
}
}
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%c209715200 = arith.constant 209715200 : index
%c158597120 = arith.constant 158597120 : index
%c0 = arith.constant 0 : index
%c1280 = arith.constant 1280 : index
%c968 = arith.constant 968 : index
%c64 = arith.constant 64 : index
%element_type_f16 = hal.element_type<f16> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c64, %c968, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c64, %c1280, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<64x1280x1280xf16> in !stream.resource<external>{%c209715200}
%result, %result_timepoint = stream.resource.alloca uninitialized : !stream.resource<external>{%c158597120} => !stream.timepoint
%2 = stream.cmd.execute await(%result_timepoint) => with(%0 as %arg2: !stream.resource<external>{%c158597120}, %1 as %arg3: !stream.resource<external>{%c209715200}, %result as %arg4: !stream.resource<external>{%c158597120}) {
stream.cmd.dispatch @batch_matmul_dispatch_0::@batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 {
ro %arg2[%c0 for %c158597120] : !stream.resource<external>{%c158597120},
ro %arg3[%c0 for %c209715200] : !stream.resource<external>{%c209715200},
wo %arg4[%c0 for %c158597120] : !stream.resource<external>{%c158597120}
}
} => !stream.timepoint
%3 = stream.timepoint.await %2 => %result : !stream.resource<external>{%c158597120}
%4 = stream.tensor.export %3 : tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120} -> !hal.buffer_view
util.return %4 : !hal.buffer_view
}
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
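// Note: canonicalize applies each op's registered canonicalization patterns plus
// constant folding; the IR is already in canonical form, so the dump is identical to
// the previous one.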
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mfma_layout<F16_16x16x16_F32>, #iree_gpu.mfma_layout<F16_32x32x8_F32>], target_arch = "gfx942", ukernels = "none"}>
#device_target_rocm = #hal.device.target<"rocm", {legacy_sync}, [#executable_target_rocm_hsaco_fb]>
module attributes {hal.device.targets = [#device_target_rocm]} {
stream.executable private @batch_matmul_dispatch_0 {
stream.executable.export public @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}) {
%cst = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>> -> tensor<64x968x1280xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [64, 1280, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>> -> tensor<64x1280x1280xf16>
%5 = tensor.empty() : tensor<64x968x1280xf16>
%6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%7 = linalg.batch_matmul ins(%3, %4 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%6 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : tensor<64x968x1280xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
return
}
}
}
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%c209715200 = arith.constant 209715200 : index
%c158597120 = arith.constant 158597120 : index
%c0 = arith.constant 0 : index
%c1280 = arith.constant 1280 : index
%c968 = arith.constant 968 : index
%c64 = arith.constant 64 : index
%element_type_f16 = hal.element_type<f16> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c64, %c968, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c64, %c1280, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<64x1280x1280xf16> in !stream.resource<external>{%c209715200}
%result, %result_timepoint = stream.resource.alloca uninitialized : !stream.resource<external>{%c158597120} => !stream.timepoint
%2 = stream.cmd.execute await(%result_timepoint) => with(%0 as %arg2: !stream.resource<external>{%c158597120}, %1 as %arg3: !stream.resource<external>{%c209715200}, %result as %arg4: !stream.resource<external>{%c158597120}) {
stream.cmd.dispatch @batch_matmul_dispatch_0::@batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 {
ro %arg2[%c0 for %c158597120] : !stream.resource<external>{%c158597120},
ro %arg3[%c0 for %c209715200] : !stream.resource<external>{%c209715200},
wo %arg4[%c0 for %c158597120] : !stream.resource<external>{%c158597120}
}
} => !stream.timepoint
%3 = stream.timepoint.await %2 => %result : !stream.resource<external>{%c158597120}
%4 = stream.tensor.export %3 : tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120} -> !hal.buffer_view
util.return %4 : !hal.buffer_view
}
}
// -----// IR Dump After CSE (cse) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mfma_layout<F16_16x16x16_F32>, #iree_gpu.mfma_layout<F16_32x32x8_F32>], target_arch = "gfx942", ukernels = "none"}>
#device_target_rocm = #hal.device.target<"rocm", {legacy_sync}, [#executable_target_rocm_hsaco_fb]>
module attributes {hal.device.targets = [#device_target_rocm]} {
stream.executable private @batch_matmul_dispatch_0 {
stream.executable.export public @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}) {
%cst = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>> -> tensor<64x968x1280xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [64, 1280, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>> -> tensor<64x1280x1280xf16>
%5 = tensor.empty() : tensor<64x968x1280xf16>
%6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%7 = linalg.batch_matmul ins(%3, %4 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%6 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : tensor<64x968x1280xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
return
}
}
}
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%c209715200 = arith.constant 209715200 : index
%c158597120 = arith.constant 158597120 : index
%c0 = arith.constant 0 : index
%c1280 = arith.constant 1280 : index
%c968 = arith.constant 968 : index
%c64 = arith.constant 64 : index
%element_type_f16 = hal.element_type<f16> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c64, %c968, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c64, %c1280, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<64x1280x1280xf16> in !stream.resource<external>{%c209715200}
%result, %result_timepoint = stream.resource.alloca uninitialized : !stream.resource<external>{%c158597120} => !stream.timepoint
%2 = stream.cmd.execute await(%result_timepoint) => with(%0 as %arg2: !stream.resource<external>{%c158597120}, %1 as %arg3: !stream.resource<external>{%c209715200}, %result as %arg4: !stream.resource<external>{%c158597120}) {
stream.cmd.dispatch @batch_matmul_dispatch_0::@batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 {
ro %arg2[%c0 for %c158597120] : !stream.resource<external>{%c158597120},
ro %arg3[%c0 for %c209715200] : !stream.resource<external>{%c209715200},
wo %arg4[%c0 for %c158597120] : !stream.resource<external>{%c158597120}
}
} => !stream.timepoint
%3 = stream.timepoint.await %2 => %result : !stream.resource<external>{%c158597120}
%4 = stream.tensor.export %3 : tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120} -> !hal.buffer_view
util.return %4 : !hal.buffer_view
}
}
// -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- //
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%c209715200 = arith.constant 209715200 : index
%c158597120 = arith.constant 158597120 : index
%c0 = arith.constant 0 : index
%c1280 = arith.constant 1280 : index
%c968 = arith.constant 968 : index
%c64 = arith.constant 64 : index
%element_type_f16 = hal.element_type<f16> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c64, %c968, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c64, %c1280, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<64x1280x1280xf16> in !stream.resource<external>{%c209715200}
%result, %result_timepoint = stream.resource.alloca uninitialized : !stream.resource<external>{%c158597120} => !stream.timepoint
%2 = stream.cmd.execute await(%result_timepoint) => with(%0 as %arg2: !stream.resource<external>{%c158597120}, %1 as %arg3: !stream.resource<external>{%c209715200}, %result as %arg4: !stream.resource<external>{%c158597120}) {
stream.cmd.dispatch @batch_matmul_dispatch_0::@batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 {
ro %arg2[%c0 for %c158597120] : !stream.resource<external>{%c158597120},
ro %arg3[%c0 for %c209715200] : !stream.resource<external>{%c209715200},
wo %arg4[%c0 for %c158597120] : !stream.resource<external>{%c158597120}
}
} => !stream.timepoint
%3 = stream.timepoint.await %2 => %result : !stream.resource<external>{%c158597120}
%4 = stream.tensor.export %3 : tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120} -> !hal.buffer_view
util.return %4 : !hal.buffer_view
}
// -----// IR Dump After ApplyPatterns (iree-util-apply-patterns) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mfma_layout<F16_16x16x16_F32>, #iree_gpu.mfma_layout<F16_32x32x8_F32>], target_arch = "gfx942", ukernels = "none"}>
#device_target_rocm = #hal.device.target<"rocm", {legacy_sync}, [#executable_target_rocm_hsaco_fb]>
module attributes {hal.device.targets = [#device_target_rocm]} {
stream.executable private @batch_matmul_dispatch_0 {
stream.executable.export public @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}) {
%cst = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>> -> tensor<64x968x1280xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [64, 1280, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>> -> tensor<64x1280x1280xf16>
%5 = tensor.empty() : tensor<64x968x1280xf16>
%6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%7 = linalg.batch_matmul ins(%3, %4 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%6 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : tensor<64x968x1280xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
return
}
}
}
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%c209715200 = arith.constant 209715200 : index
%c158597120 = arith.constant 158597120 : index
%c0 = arith.constant 0 : index
%c1280 = arith.constant 1280 : index
%c968 = arith.constant 968 : index
%c64 = arith.constant 64 : index
%element_type_f16 = hal.element_type<f16> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c64, %c968, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c64, %c1280, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<64x1280x1280xf16> in !stream.resource<external>{%c209715200}
%result, %result_timepoint = stream.resource.alloca uninitialized : !stream.resource<external>{%c158597120} => !stream.timepoint
%2 = stream.cmd.execute await(%result_timepoint) => with(%0 as %arg2: !stream.resource<external>{%c158597120}, %1 as %arg3: !stream.resource<external>{%c209715200}, %result as %arg4: !stream.resource<external>{%c158597120}) {
stream.cmd.dispatch @batch_matmul_dispatch_0::@batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 {
ro %arg2[%c0 for %c158597120] : !stream.resource<external>{%c158597120},
ro %arg3[%c0 for %c209715200] : !stream.resource<external>{%c209715200},
wo %arg4[%c0 for %c158597120] : !stream.resource<external>{%c158597120}
}
} => !stream.timepoint
%3 = stream.timepoint.await %2 => %result : !stream.resource<external>{%c158597120}
%4 = stream.tensor.export %3 : tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120} -> !hal.buffer_view
util.return %4 : !hal.buffer_view
}
}
// -----// IR Dump After FoldGlobals (iree-util-fold-globals) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mfma_layout<F16_16x16x16_F32>, #iree_gpu.mfma_layout<F16_32x32x8_F32>], target_arch = "gfx942", ukernels = "none"}>
#device_target_rocm = #hal.device.target<"rocm", {legacy_sync}, [#executable_target_rocm_hsaco_fb]>
module attributes {hal.device.targets = [#device_target_rocm]} {
stream.executable private @batch_matmul_dispatch_0 {
stream.executable.export public @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}) {
%cst = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>> -> tensor<64x968x1280xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [64, 1280, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>> -> tensor<64x1280x1280xf16>
%5 = tensor.empty() : tensor<64x968x1280xf16>
%6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%7 = linalg.batch_matmul ins(%3, %4 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%6 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : tensor<64x968x1280xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
return
}
}
}
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%c209715200 = arith.constant 209715200 : index
%c158597120 = arith.constant 158597120 : index
%c0 = arith.constant 0 : index
%c1280 = arith.constant 1280 : index
%c968 = arith.constant 968 : index
%c64 = arith.constant 64 : index
%element_type_f16 = hal.element_type<f16> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c64, %c968, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c64, %c1280, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<64x1280x1280xf16> in !stream.resource<external>{%c209715200}
%result, %result_timepoint = stream.resource.alloca uninitialized : !stream.resource<external>{%c158597120} => !stream.timepoint
%2 = stream.cmd.execute await(%result_timepoint) => with(%0 as %arg2: !stream.resource<external>{%c158597120}, %1 as %arg3: !stream.resource<external>{%c209715200}, %result as %arg4: !stream.resource<external>{%c158597120}) {
stream.cmd.dispatch @batch_matmul_dispatch_0::@batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 {
ro %arg2[%c0 for %c158597120] : !stream.resource<external>{%c158597120},
ro %arg3[%c0 for %c209715200] : !stream.resource<external>{%c209715200},
wo %arg4[%c0 for %c158597120] : !stream.resource<external>{%c158597120}
}
} => !stream.timepoint
%3 = stream.timepoint.await %2 => %result : !stream.resource<external>{%c158597120}
%4 = stream.tensor.export %3 : tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120} -> !hal.buffer_view
util.return %4 : !hal.buffer_view
}
}
// -----// IR Dump After FuseGlobals (iree-util-fuse-globals) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mfma_layout<F16_16x16x16_F32>, #iree_gpu.mfma_layout<F16_32x32x8_F32>], target_arch = "gfx942", ukernels = "none"}>
#device_target_rocm = #hal.device.target<"rocm", {legacy_sync}, [#executable_target_rocm_hsaco_fb]>
module attributes {hal.device.targets = [#device_target_rocm]} {
stream.executable private @batch_matmul_dispatch_0 {
stream.executable.export public @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}) {
%cst = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>> -> tensor<64x968x1280xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [64, 1280, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>> -> tensor<64x1280x1280xf16>
%5 = tensor.empty() : tensor<64x968x1280xf16>
%6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%7 = linalg.batch_matmul ins(%3, %4 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%6 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : tensor<64x968x1280xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
return
}
}
}
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%c209715200 = arith.constant 209715200 : index
%c158597120 = arith.constant 158597120 : index
%c0 = arith.constant 0 : index
%c1280 = arith.constant 1280 : index
%c968 = arith.constant 968 : index
%c64 = arith.constant 64 : index
%element_type_f16 = hal.element_type<f16> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c64, %c968, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c64, %c1280, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<64x1280x1280xf16> in !stream.resource<external>{%c209715200}
%result, %result_timepoint = stream.resource.alloca uninitialized : !stream.resource<external>{%c158597120} => !stream.timepoint
%2 = stream.cmd.execute await(%result_timepoint) => with(%0 as %arg2: !stream.resource<external>{%c158597120}, %1 as %arg3: !stream.resource<external>{%c209715200}, %result as %arg4: !stream.resource<external>{%c158597120}) {
stream.cmd.dispatch @batch_matmul_dispatch_0::@batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 {
ro %arg2[%c0 for %c158597120] : !stream.resource<external>{%c158597120},
ro %arg3[%c0 for %c209715200] : !stream.resource<external>{%c209715200},
wo %arg4[%c0 for %c158597120] : !stream.resource<external>{%c158597120}
}
} => !stream.timepoint
%3 = stream.timepoint.await %2 => %result : !stream.resource<external>{%c158597120}
%4 = stream.tensor.export %3 : tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120} -> !hal.buffer_view
util.return %4 : !hal.buffer_view
}
}
// -----// IR Dump After AssignTargetDevicesPass (iree-hal-assign-target-devices) //----- //
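// Note: iree-hal-assign-target-devices records the requested compilation targets as the
// hal.device.targets module attribute (here a single ROCm device targeting gfx942). The
// attribute was already set earlier in the pipeline, so the module is unchanged.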
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mfma_layout<F16_16x16x16_F32>, #iree_gpu.mfma_layout<F16_32x32x8_F32>], target_arch = "gfx942", ukernels = "none"}>
#device_target_rocm = #hal.device.target<"rocm", {legacy_sync}, [#executable_target_rocm_hsaco_fb]>
module attributes {hal.device.targets = [#device_target_rocm]} {
stream.executable private @batch_matmul_dispatch_0 {
stream.executable.export public @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}) {
%cst = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>> -> tensor<64x968x1280xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [64, 1280, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>> -> tensor<64x1280x1280xf16>
%5 = tensor.empty() : tensor<64x968x1280xf16>
%6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%7 = linalg.batch_matmul ins(%3, %4 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%6 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : tensor<64x968x1280xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
return
}
}
}
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%c209715200 = arith.constant 209715200 : index
%c158597120 = arith.constant 158597120 : index
%c0 = arith.constant 0 : index
%c1280 = arith.constant 1280 : index
%c968 = arith.constant 968 : index
%c64 = arith.constant 64 : index
%element_type_f16 = hal.element_type<f16> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c64, %c968, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c64, %c1280, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<64x1280x1280xf16> in !stream.resource<external>{%c209715200}
%result, %result_timepoint = stream.resource.alloca uninitialized : !stream.resource<external>{%c158597120} => !stream.timepoint
%2 = stream.cmd.execute await(%result_timepoint) => with(%0 as %arg2: !stream.resource<external>{%c158597120}, %1 as %arg3: !stream.resource<external>{%c209715200}, %result as %arg4: !stream.resource<external>{%c158597120}) {
stream.cmd.dispatch @batch_matmul_dispatch_0::@batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 {
ro %arg2[%c0 for %c158597120] : !stream.resource<external>{%c158597120},
ro %arg3[%c0 for %c209715200] : !stream.resource<external>{%c209715200},
wo %arg4[%c0 for %c158597120] : !stream.resource<external>{%c158597120}
}
} => !stream.timepoint
%3 = stream.timepoint.await %2 => %result : !stream.resource<external>{%c158597120}
%4 = stream.tensor.export %3 : tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120} -> !hal.buffer_view
util.return %4 : !hal.buffer_view
}
}
// -----// IR Dump After VerifyTargetEnvironmentPass (iree-hal-verify-target-environment) //----- //
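// Note: iree-hal-verify-target-environment checks that every executable target
// (rocm-hsaco-fb) has a registered backend able to compile it; as a verification-only
// pass it should not modify the IR.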
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mfma_layout<F16_16x16x16_F32>, #iree_gpu.mfma_layout<F16_32x32x8_F32>], target_arch = "gfx942", ukernels = "none"}>
#device_target_rocm = #hal.device.target<"rocm", {legacy_sync}, [#executable_target_rocm_hsaco_fb]>
module attributes {hal.device.targets = [#device_target_rocm]} {
stream.executable private @batch_matmul_dispatch_0 {
stream.executable.export public @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}) {
%cst = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>> -> tensor<64x968x1280xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [64, 1280, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>> -> tensor<64x1280x1280xf16>
%5 = tensor.empty() : tensor<64x968x1280xf16>
%6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%7 = linalg.batch_matmul ins(%3, %4 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%6 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : tensor<64x968x1280xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
return
}
}
}
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%c209715200 = arith.constant 209715200 : index
%c158597120 = arith.constant 158597120 : index
%c0 = arith.constant 0 : index
%c1280 = arith.constant 1280 : index
%c968 = arith.constant 968 : index
%c64 = arith.constant 64 : index
%element_type_f16 = hal.element_type<f16> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c64, %c968, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c64, %c1280, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<64x1280x1280xf16> in !stream.resource<external>{%c209715200}
%result, %result_timepoint = stream.resource.alloca uninitialized : !stream.resource<external>{%c158597120} => !stream.timepoint
%2 = stream.cmd.execute await(%result_timepoint) => with(%0 as %arg2: !stream.resource<external>{%c158597120}, %1 as %arg3: !stream.resource<external>{%c209715200}, %result as %arg4: !stream.resource<external>{%c158597120}) {
stream.cmd.dispatch @batch_matmul_dispatch_0::@batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 {
ro %arg2[%c0 for %c158597120] : !stream.resource<external>{%c158597120},
ro %arg3[%c0 for %c209715200] : !stream.resource<external>{%c209715200},
wo %arg4[%c0 for %c158597120] : !stream.resource<external>{%c158597120}
}
} => !stream.timepoint
%3 = stream.timepoint.await %2 => %result : !stream.resource<external>{%c158597120}
%4 = stream.tensor.export %3 : tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120} -> !hal.buffer_view
util.return %4 : !hal.buffer_view
}
}
// -----// IR Dump After MaterializeInterfacesPass (iree-hal-materialize-interfaces) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mfma_layout<F16_16x16x16_F32>, #iree_gpu.mfma_layout<F16_32x32x8_F32>], target_arch = "gfx942", ukernels = "none"}>
#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer, ReadOnly>, <2, storage_buffer>]>]>
#device_target_rocm = #hal.device.target<"rocm", {legacy_sync}, [#executable_target_rocm_hsaco_fb]>
module attributes {hal.device.targets = [#device_target_rocm]} {
hal.executable private @batch_matmul_dispatch_0 {
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) {
hal.executable.export public @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 ordinal(0) layout(#pipeline_layout) attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>, #hal.interface.binding<0, 2>]} {
^bb0(%arg0: !hal.device):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
hal.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16() {
%cst = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>> -> tensor<64x968x1280xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [64, 1280, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>> -> tensor<64x1280x1280xf16>
%5 = tensor.empty() : tensor<64x968x1280xf16>
%6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%7 = linalg.batch_matmul ins(%3, %4 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%6 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : tensor<64x968x1280xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
return
}
}
}
}
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%c209715200 = arith.constant 209715200 : index
%c158597120 = arith.constant 158597120 : index
%c0 = arith.constant 0 : index
%c1280 = arith.constant 1280 : index
%c968 = arith.constant 968 : index
%c64 = arith.constant 64 : index
%element_type_f16 = hal.element_type<f16> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c64, %c968, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c64, %c1280, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<64x1280x1280xf16> in !stream.resource<external>{%c209715200}
%result, %result_timepoint = stream.resource.alloca uninitialized : !stream.resource<external>{%c158597120} => !stream.timepoint
%2 = stream.cmd.execute await(%result_timepoint) => with(%0 as %arg2: !stream.resource<external>{%c158597120}, %1 as %arg3: !stream.resource<external>{%c209715200}, %result as %arg4: !stream.resource<external>{%c158597120}) {
stream.cmd.dispatch @batch_matmul_dispatch_0::@rocm_hsaco_fb::@batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 {
ro %arg2[%c0 for %c158597120] : !stream.resource<external>{%c158597120},
ro %arg3[%c0 for %c209715200] : !stream.resource<external>{%c209715200},
wo %arg4[%c0 for %c158597120] : !stream.resource<external>{%c158597120}
}
} => !stream.timepoint
%3 = stream.timepoint.await %2 => %result : !stream.resource<external>{%c158597120}
%4 = stream.tensor.export %3 : tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120} -> !hal.buffer_view
util.return %4 : !hal.buffer_view
}
}
// -----// IR Dump After PruneExecutablesPass (iree-hal-prune-executables) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mfma_layout<F16_16x16x16_F32>, #iree_gpu.mfma_layout<F16_32x32x8_F32>], target_arch = "gfx942", ukernels = "none"}>
#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer, ReadOnly>, <2, storage_buffer>]>]>
#device_target_rocm = #hal.device.target<"rocm", {legacy_sync}, [#executable_target_rocm_hsaco_fb]>
module attributes {hal.device.targets = [#device_target_rocm]} {
hal.executable private @batch_matmul_dispatch_0 {
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) {
hal.executable.export public @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 ordinal(0) layout(#pipeline_layout) attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>, #hal.interface.binding<0, 2>]} {
^bb0(%arg0: !hal.device):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
hal.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16() {
%cst = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>> -> tensor<64x968x1280xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [64, 1280, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>> -> tensor<64x1280x1280xf16>
%5 = tensor.empty() : tensor<64x968x1280xf16>
%6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%7 = linalg.batch_matmul ins(%3, %4 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%6 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : tensor<64x968x1280xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
return
}
}
}
}
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%c209715200 = arith.constant 209715200 : index
%c158597120 = arith.constant 158597120 : index
%c0 = arith.constant 0 : index
%c1280 = arith.constant 1280 : index
%c968 = arith.constant 968 : index
%c64 = arith.constant 64 : index
%element_type_f16 = hal.element_type<f16> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c64, %c968, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c64, %c1280, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<64x1280x1280xf16> in !stream.resource<external>{%c209715200}
%result, %result_timepoint = stream.resource.alloca uninitialized : !stream.resource<external>{%c158597120} => !stream.timepoint
%2 = stream.cmd.execute await(%result_timepoint) => with(%0 as %arg2: !stream.resource<external>{%c158597120}, %1 as %arg3: !stream.resource<external>{%c209715200}, %result as %arg4: !stream.resource<external>{%c158597120}) {
stream.cmd.dispatch @batch_matmul_dispatch_0::@rocm_hsaco_fb::@batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 {
ro %arg2[%c0 for %c158597120] : !stream.resource<external>{%c158597120},
ro %arg3[%c0 for %c209715200] : !stream.resource<external>{%c209715200},
wo %arg4[%c0 for %c158597120] : !stream.resource<external>{%c158597120}
}
} => !stream.timepoint
%3 = stream.timepoint.await %2 => %result : !stream.resource<external>{%c158597120}
%4 = stream.tensor.export %3 : tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120} -> !hal.buffer_view
util.return %4 : !hal.buffer_view
}
}
// -----// IR Dump After GPUGeneralizeNamedOps (iree-codegen-gpu-generalize-named-ops) //----- //
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16() {
%cst = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>> -> tensor<64x968x1280xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [64, 1280, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>> -> tensor<64x1280x1280xf16>
%5 = tensor.empty() : tensor<64x968x1280xf16>
%6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%7 = linalg.batch_matmul ins(%3, %4 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%6 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : tensor<64x968x1280xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
return
}
// -----// IR Dump After TypePropagation (iree-codegen-type-propagation) //----- //
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16() {
%cst = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>> -> tensor<64x968x1280xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [64, 1280, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>> -> tensor<64x1280x1280xf16>
%5 = tensor.empty() : tensor<64x968x1280xf16>
%6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%7 = linalg.batch_matmul ins(%3, %4 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%6 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : tensor<64x968x1280xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
return
}
// -----// IR Dump After BubbleUpOrdinalOps (iree-codegen-bubble-up-ordinal-ops) //----- //
module {
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16() {
%cst = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>> -> tensor<64x968x1280xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [64, 1280, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>> -> tensor<64x1280x1280xf16>
%5 = tensor.empty() : tensor<64x968x1280xf16>
%6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%7 = linalg.batch_matmul ins(%3, %4 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%6 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : tensor<64x968x1280xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
return
}
}
// -----// IR Dump After BufferizeCopyOnlyDispatches (iree-codegen-bufferize-copy-only-dispatches) //----- //
module {
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16() {
%cst = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>> -> tensor<64x968x1280xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [64, 1280, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>> -> tensor<64x1280x1280xf16>
%5 = tensor.empty() : tensor<64x968x1280xf16>
%6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%7 = linalg.batch_matmul ins(%3, %4 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%6 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : tensor<64x968x1280xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
return
}
}
// -----// IR Dump After DecomposeSoftmax (iree-codegen-decompose-softmax) //----- //
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16() {
%cst = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>> -> tensor<64x968x1280xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [64, 1280, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>> -> tensor<64x1280x1280xf16>
%5 = tensor.empty() : tensor<64x968x1280xf16>
%6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%7 = linalg.batch_matmul ins(%3, %4 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%6 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : tensor<64x968x1280xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
return
}
// -----// IR Dump After MaterializeUserConfigs (iree-codegen-materialize-user-configs) //----- //
hal.executable.variant public @rocm_hsaco_fb target(<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mfma_layout<F16_16x16x16_F32>, #iree_gpu.mfma_layout<F16_32x32x8_F32>], target_arch = "gfx942", ukernels = "none"}>) {
hal.executable.export public @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 ordinal(0) layout(#hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer, ReadOnly>, <2, storage_buffer>]>]>) attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>, #hal.interface.binding<0, 2>]} {
^bb0(%arg0: !hal.device):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
hal.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16() {
%cst = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>> -> tensor<64x968x1280xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [64, 1280, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>> -> tensor<64x1280x1280xf16>
%5 = tensor.empty() : tensor<64x968x1280xf16>
%6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%7 = linalg.batch_matmul ins(%3, %4 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%6 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : tensor<64x968x1280xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
return
}
}
}
chosen MMA schedule:
intrinsic (M, N, K) = (16, 16, 16)
subgroup count (M, N) = (2, 2)
subgroup tile count (M, N, K) = (2, 4, 4)
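
Annotation (not compiler output): the schedule printed above composes directly into the tile sizes and workgroup size that appear in the next dump. A minimal Python sketch re-deriving those numbers, assuming workgroup tile = intrinsic size * subgroup count * per-subgroup tile count (all values copied from this log):

# Chosen MFMA intrinsic and schedule, from the debug lines above.
intrinsic_m, intrinsic_n, intrinsic_k = 16, 16, 16
subgroup_m_count, subgroup_n_count = 2, 2
subgroup_m_tiles, subgroup_n_tiles, subgroup_k_tiles = 2, 4, 4
subgroup_size = 64  # wavefront size on gfx942

# Workgroup-level (M, N, K) tile; matches tile_sizes = [[1, 64, 128, 64]].
tile_m = intrinsic_m * subgroup_m_count * subgroup_m_tiles  # 16 * 2 * 2 = 64
tile_n = intrinsic_n * subgroup_n_count * subgroup_n_tiles  # 16 * 2 * 4 = 128
tile_k = intrinsic_k * subgroup_k_tiles                     # 16 * 4     = 64

# Threads per workgroup; matches workgroup_size = [128, 2, 1].
threads = subgroup_size * subgroup_m_count * subgroup_n_count  # 64 * 4 = 256
assert (tile_m, tile_n, tile_k, threads) == (64, 128, 64, 256)
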
// -----// IR Dump After LLVMGPUSelectLoweringStrategy (iree-llvmgpu-select-lowering-strategy) //----- //
hal.executable.variant public @rocm_hsaco_fb target(<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mfma_layout<F16_16x16x16_F32>, #iree_gpu.mfma_layout<F16_32x32x8_F32>], target_arch = "gfx942", ukernels = "none"}>) {
hal.executable.export public @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 ordinal(0) layout(#hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer, ReadOnly>, <2, storage_buffer>]>]>) attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>, #hal.interface.binding<0, 2>], subgroup_size = 64 : index, translation_info = #iree_codegen.translation_info<LLVMGPUPadAndVectorDistribute, {mma_schedule = #iree_gpu.mma_schedule<intrinsic = #iree_gpu.mfma_layout<F16_16x16x16_F32>, subgroup_m_count = 2, subgroup_n_count = 2, subgroup_m_tile_count = 2, subgroup_n_tile_count = 4, subgroup_k_tile_count = 4>}>, workgroup_size = [128 : index, 2 : index, 1 : index]} {
^bb0(%arg0: !hal.device):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
hal.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16() {
%cst = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>> -> tensor<64x968x1280xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [64, 1280, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>> -> tensor<64x1280x1280xf16>
%5 = tensor.empty() : tensor<64x968x1280xf16>
%6 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 64, 128, 64]]>} ins(%cst : f16) outs(%5 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%7 = linalg.batch_matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 64, 128, 64]]>} ins(%3, %4 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%6 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : tensor<64x968x1280xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
return
}
}
}
// -----// IR Dump After ConfigureTargetExecutableVariantsPass (iree-hal-configure-target-executable-variants) //----- //
hal.executable.variant public @rocm_hsaco_fb target(<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mfma_layout<F16_16x16x16_F32>, #iree_gpu.mfma_layout<F16_32x32x8_F32>], target_arch = "gfx942", ukernels = "none"}>) {
hal.executable.export public @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 ordinal(0) layout(#hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer, ReadOnly>, <2, storage_buffer>]>]>) attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>, #hal.interface.binding<0, 2>], subgroup_size = 64 : index, translation_info = #iree_codegen.translation_info<LLVMGPUPadAndVectorDistribute, {mma_schedule = #iree_gpu.mma_schedule<intrinsic = #iree_gpu.mfma_layout<F16_16x16x16_F32>, subgroup_m_count = 2, subgroup_n_count = 2, subgroup_m_tile_count = 2, subgroup_n_tile_count = 4, subgroup_k_tile_count = 4>}>, workgroup_size = [128 : index, 2 : index, 1 : index]} {
^bb0(%arg0: !hal.device):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
hal.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16() {
%cst = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>> -> tensor<64x968x1280xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [64, 1280, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>> -> tensor<64x1280x1280xf16>
%5 = tensor.empty() : tensor<64x968x1280xf16>
%6 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 64, 128, 64]]>} ins(%cst : f16) outs(%5 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%7 = linalg.batch_matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 64, 128, 64]]>} ins(%3, %4 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%6 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : tensor<64x968x1280xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
return
}
}
}
// -----// IR Dump After ConfigureExecutablesPass (iree-hal-configure-executables) //----- //
hal.executable private @batch_matmul_dispatch_0 {
hal.executable.variant public @rocm_hsaco_fb target(<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mfma_layout<F16_16x16x16_F32>, #iree_gpu.mfma_layout<F16_32x32x8_F32>], target_arch = "gfx942", ukernels = "none"}>) {
hal.executable.export public @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 ordinal(0) layout(#hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer, ReadOnly>, <2, storage_buffer>]>]>) attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>, #hal.interface.binding<0, 2>], subgroup_size = 64 : index, translation_info = #iree_codegen.translation_info<LLVMGPUPadAndVectorDistribute, {mma_schedule = #iree_gpu.mma_schedule<intrinsic = #iree_gpu.mfma_layout<F16_16x16x16_F32>, subgroup_m_count = 2, subgroup_n_count = 2, subgroup_m_tile_count = 2, subgroup_n_tile_count = 4, subgroup_k_tile_count = 4>}>, workgroup_size = [128 : index, 2 : index, 1 : index]} {
^bb0(%arg0: !hal.device):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
hal.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16() {
%cst = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>> -> tensor<64x968x1280xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [64, 1280, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>> -> tensor<64x1280x1280xf16>
%5 = tensor.empty() : tensor<64x968x1280xf16>
%6 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 64, 128, 64]]>} ins(%cst : f16) outs(%5 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%7 = linalg.batch_matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 64, 128, 64]]>} ins(%3, %4 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%6 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : tensor<64x968x1280xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
return
}
}
}
}
// -----// IR Dump After TileAndDistributeToWorkgroups (iree-codegen-tile-and-distribute-to-workgroups) //----- //
hal.executable.variant public @rocm_hsaco_fb target(<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mfma_layout<F16_16x16x16_F32>, #iree_gpu.mfma_layout<F16_32x32x8_F32>], target_arch = "gfx942", ukernels = "none"}>) {
hal.executable.export public @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 ordinal(0) layout(#hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer, ReadOnly>, <2, storage_buffer>]>]>) attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>, #hal.interface.binding<0, 2>], subgroup_size = 64 : index, translation_info = #iree_codegen.translation_info<LLVMGPUPadAndVectorDistribute, {mma_schedule = #iree_gpu.mma_schedule<intrinsic = #iree_gpu.mfma_layout<F16_16x16x16_F32>, subgroup_m_count = 2, subgroup_n_count = 2, subgroup_m_tile_count = 2, subgroup_n_tile_count = 4, subgroup_k_tile_count = 4>}>, workgroup_size = [128 : index, 2 : index, 1 : index]} {
^bb0(%arg0: !hal.device):
%c10 = arith.constant 10 : index
%c16 = arith.constant 16 : index
%c64 = arith.constant 64 : index
hal.return %c10, %c16, %c64 : index, index, index
}
builtin.module {
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16() {
%c128 = arith.constant 128 : index
%cst = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_id_z = hal.interface.workgroup.id[2] : index
%3 = affine.min affine_map<()[s0] -> (s0 * -64 + 968, 64)>()[%workgroup_id_y]
%4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y]
%5 = flow.dispatch.tensor.load %0, offsets = [%workgroup_id_z, %4, 0], sizes = [1, %3, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>> -> tensor<1x?x1280xf16>
%6 = affine.apply affine_map<()[s0] -> (s0 * 128)>()[%workgroup_id_x]
%7 = flow.dispatch.tensor.load %1, offsets = [%workgroup_id_z, 0, %6], sizes = [1, 1280, %c128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>> -> tensor<1x1280x?xf16>
%8 = tensor.empty(%3) : tensor<1x?x128xf16>
%9 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 64, 128, 64]]>} ins(%cst : f16) outs(%8 : tensor<1x?x128xf16>) -> tensor<1x?x128xf16>
%cast = tensor.cast %7 : tensor<1x1280x?xf16> to tensor<1x1280x128xf16>
%10 = linalg.batch_matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 64, 128, 64]]>} ins(%5, %cast : tensor<1x?x1280xf16>, tensor<1x1280x128xf16>) outs(%9 : tensor<1x?x128xf16>) -> tensor<1x?x128xf16>
%cast_0 = tensor.cast %10 : tensor<1x?x128xf16> to tensor<1x?x?xf16>
%11 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y]
%12 = affine.apply affine_map<()[s0] -> (s0 * 128)>()[%workgroup_id_x]
flow.dispatch.tensor.store %cast_0, %2, offsets = [%workgroup_id_z, %11, %12], sizes = [1, %3, %c128], strides = [1, 1, 1] : tensor<1x?x?xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
return
}
}
}
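
Annotation (not compiler output): the export region above now returns static workgroup counts (10, 16, 64). A minimal re-derivation in Python, assuming x tiles N = 1280 by 128, y tiles M = 968 by 64, and z walks the batch, as the load offsets in the dump suggest; since 968 = 15 * 64 + 8, the last y workgroup covers only 8 rows, which is what the affine.min (s0 * -64 + 968, 64) guards:

import math

B, M, N = 64, 968, 1280
tile_m, tile_n = 64, 128

wg_x = math.ceil(N / tile_n)  # 1280 / 128 = 10
wg_y = math.ceil(M / tile_m)  # ceil(968 / 64) = 16
wg_z = B                      # one workgroup per batch
assert (wg_x, wg_y, wg_z) == (10, 16, 64)
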
// -----// IR Dump After ConvertToDestinationPassingStyle (iree-codegen-convert-to-destination-passing-style) //----- //
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16() {
%c128 = arith.constant 128 : index
%cst = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
%workgroup_id_z = hal.interface.workgroup.id[2] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y]
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%4 = affine.apply affine_map<()[s0] -> (s0 * 128)>()[%workgroup_id_x]
%5 = affine.min affine_map<()[s0] -> (s0 * -64 + 968, 64)>()[%workgroup_id_y]
%6 = flow.dispatch.tensor.load %2, offsets = [%workgroup_id_z, %3, %4], sizes = [1, %5, %c128], strides = [1, 1, 1] : !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>> -> tensor<1x?x?xf16>
%cast = tensor.cast %6 : tensor<1x?x?xf16> to tensor<1x?x128xf16>
%workgroup_id_x_0 = hal.interface.workgroup.id[0] : index
%workgroup_id_y_1 = hal.interface.workgroup.id[1] : index
%workgroup_id_z_2 = hal.interface.workgroup.id[2] : index
%7 = affine.min affine_map<()[s0] -> (s0 * -64 + 968, 64)>()[%workgroup_id_y_1]
%8 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y_1]
%9 = flow.dispatch.tensor.load %0, offsets = [%workgroup_id_z_2, %8, 0], sizes = [1, %7, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>> -> tensor<1x?x1280xf16>
%10 = affine.apply affine_map<()[s0] -> (s0 * 128)>()[%workgroup_id_x_0]
%11 = flow.dispatch.tensor.load %1, offsets = [%workgroup_id_z_2, 0, %10], sizes = [1, 1280, %c128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>> -> tensor<1x1280x?xf16>
%12 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 64, 128, 64]]>} ins(%cst : f16) outs(%cast : tensor<1x?x128xf16>) -> tensor<1x?x128xf16>
%cast_3 = tensor.cast %11 : tensor<1x1280x?xf16> to tensor<1x1280x128xf16>
%13 = linalg.batch_matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 64, 128, 64]]>} ins(%9, %cast_3 : tensor<1x?x1280xf16>, tensor<1x1280x128xf16>) outs(%12 : tensor<1x?x128xf16>) -> tensor<1x?x128xf16>
%cast_4 = tensor.cast %13 : tensor<1x?x128xf16> to tensor<1x?x?xf16>
%14 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y_1]
%15 = affine.apply affine_map<()[s0] -> (s0 * 128)>()[%workgroup_id_x_0]
flow.dispatch.tensor.store %cast_4, %2, offsets = [%workgroup_id_z_2, %14, %15], sizes = [1, %7, %c128], strides = [1, 1, 1] : tensor<1x?x?xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
return
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
module {
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16() {
%cst = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
%workgroup_id_z = hal.interface.workgroup.id[2] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y]
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%4 = affine.apply affine_map<()[s0] -> (s0 * 128)>()[%workgroup_id_x]
%5 = affine.min affine_map<()[s0] -> (s0 * -64 + 968, 64)>()[%workgroup_id_y]
%6 = flow.dispatch.tensor.load %2, offsets = [%workgroup_id_z, %3, %4], sizes = [1, %5, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>> -> tensor<1x?x128xf16>
%workgroup_id_x_0 = hal.interface.workgroup.id[0] : index
%workgroup_id_y_1 = hal.interface.workgroup.id[1] : index
%workgroup_id_z_2 = hal.interface.workgroup.id[2] : index
%7 = affine.min affine_map<()[s0] -> (s0 * -64 + 968, 64)>()[%workgroup_id_y_1]
%8 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y_1]
%9 = flow.dispatch.tensor.load %0, offsets = [%workgroup_id_z_2, %8, 0], sizes = [1, %7, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>> -> tensor<1x?x1280xf16>
%10 = affine.apply affine_map<()[s0] -> (s0 * 128)>()[%workgroup_id_x_0]
%11 = flow.dispatch.tensor.load %1, offsets = [%workgroup_id_z_2, 0, %10], sizes = [1, 1280, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>> -> tensor<1x1280x128xf16>
%12 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 64, 128, 64]]>} ins(%cst : f16) outs(%6 : tensor<1x?x128xf16>) -> tensor<1x?x128xf16>
%13 = linalg.batch_matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 64, 128, 64]]>} ins(%9, %11 : tensor<1x?x1280xf16>, tensor<1x1280x128xf16>) outs(%12 : tensor<1x?x128xf16>) -> tensor<1x?x128xf16>
%14 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y_1]
%15 = affine.apply affine_map<()[s0] -> (s0 * 128)>()[%workgroup_id_x_0]
flow.dispatch.tensor.store %13, %2, offsets = [%workgroup_id_z_2, %14, %15], sizes = [1, %7, 128], strides = [1, 1, 1] : tensor<1x?x128xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
return
}
}
// -----// IR Dump After CSE (cse) //----- //
module {
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16() {
%cst = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
%workgroup_id_z = hal.interface.workgroup.id[2] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y]
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%4 = affine.apply affine_map<()[s0] -> (s0 * 128)>()[%workgroup_id_x]
%5 = affine.min affine_map<()[s0] -> (s0 * -64 + 968, 64)>()[%workgroup_id_y]
%6 = flow.dispatch.tensor.load %2, offsets = [%workgroup_id_z, %3, %4], sizes = [1, %5, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>> -> tensor<1x?x128xf16>
%7 = flow.dispatch.tensor.load %0, offsets = [%workgroup_id_z, %3, 0], sizes = [1, %5, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>> -> tensor<1x?x1280xf16>
%8 = flow.dispatch.tensor.load %1, offsets = [%workgroup_id_z, 0, %4], sizes = [1, 1280, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>> -> tensor<1x1280x128xf16>
%9 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 64, 128, 64]]>} ins(%cst : f16) outs(%6 : tensor<1x?x128xf16>) -> tensor<1x?x128xf16>
%10 = linalg.batch_matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 64, 128, 64]]>} ins(%7, %8 : tensor<1x?x1280xf16>, tensor<1x1280x128xf16>) outs(%9 : tensor<1x?x128xf16>) -> tensor<1x?x128xf16>
flow.dispatch.tensor.store %10, %2, offsets = [%workgroup_id_z, %3, %4], sizes = [1, %5, 128], strides = [1, 1, 1] : tensor<1x?x128xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
return
}
}
// -----// IR Dump After ReorderWorkgroups (iree-codegen-reorder-workgroups) //----- //
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16() {
%cst = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
%workgroup_id_z = hal.interface.workgroup.id[2] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y]
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%4 = affine.apply affine_map<()[s0] -> (s0 * 128)>()[%workgroup_id_x]
%5 = affine.min affine_map<()[s0] -> (s0 * -64 + 968, 64)>()[%workgroup_id_y]
%6 = flow.dispatch.tensor.load %2, offsets = [%workgroup_id_z, %3, %4], sizes = [1, %5, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>> -> tensor<1x?x128xf16>
%7 = flow.dispatch.tensor.load %0, offsets = [%workgroup_id_z, %3, 0], sizes = [1, %5, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>> -> tensor<1x?x1280xf16>
%8 = flow.dispatch.tensor.load %1, offsets = [%workgroup_id_z, 0, %4], sizes = [1, 1280, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>> -> tensor<1x1280x128xf16>
%9 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 64, 128, 64]]>} ins(%cst : f16) outs(%6 : tensor<1x?x128xf16>) -> tensor<1x?x128xf16>
%10 = linalg.batch_matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 64, 128, 64]]>} ins(%7, %8 : tensor<1x?x1280xf16>, tensor<1x1280x128xf16>) outs(%9 : tensor<1x?x128xf16>) -> tensor<1x?x128xf16>
flow.dispatch.tensor.store %10, %2, offsets = [%workgroup_id_z, %3, %4], sizes = [1, %5, 128], strides = [1, 1, 1] : tensor<1x?x128xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
return
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
module {
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16() {
%cst = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
%workgroup_id_z = hal.interface.workgroup.id[2] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y]
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%4 = affine.apply affine_map<()[s0] -> (s0 * 128)>()[%workgroup_id_x]
%5 = affine.min affine_map<()[s0] -> (s0 * -64 + 968, 64)>()[%workgroup_id_y]
%6 = flow.dispatch.tensor.load %2, offsets = [%workgroup_id_z, %3, %4], sizes = [1, %5, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>> -> tensor<1x?x128xf16>
%7 = flow.dispatch.tensor.load %0, offsets = [%workgroup_id_z, %3, 0], sizes = [1, %5, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>> -> tensor<1x?x1280xf16>
%8 = flow.dispatch.tensor.load %1, offsets = [%workgroup_id_z, 0, %4], sizes = [1, 1280, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>> -> tensor<1x1280x128xf16>
%9 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 64, 128, 64]]>} ins(%cst : f16) outs(%6 : tensor<1x?x128xf16>) -> tensor<1x?x128xf16>
%10 = linalg.batch_matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 64, 128, 64]]>} ins(%7, %8 : tensor<1x?x1280xf16>, tensor<1x1280x128xf16>) outs(%9 : tensor<1x?x128xf16>) -> tensor<1x?x128xf16>
flow.dispatch.tensor.store %10, %2, offsets = [%workgroup_id_z, %3, %4], sizes = [1, %5, 128], strides = [1, 1, 1] : tensor<1x?x128xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
return
}
}
// -----// IR Dump After CSE (cse) //----- //
module {
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16() {
%cst = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
%workgroup_id_z = hal.interface.workgroup.id[2] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y]
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%4 = affine.apply affine_map<()[s0] -> (s0 * 128)>()[%workgroup_id_x]
%5 = affine.min affine_map<()[s0] -> (s0 * -64 + 968, 64)>()[%workgroup_id_y]
%6 = flow.dispatch.tensor.load %2, offsets = [%workgroup_id_z, %3, %4], sizes = [1, %5, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>> -> tensor<1x?x128xf16>
%7 = flow.dispatch.tensor.load %0, offsets = [%workgroup_id_z, %3, 0], sizes = [1, %5, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>> -> tensor<1x?x1280xf16>
%8 = flow.dispatch.tensor.load %1, offsets = [%workgroup_id_z, 0, %4], sizes = [1, 1280, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>> -> tensor<1x1280x128xf16>
%9 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 64, 128, 64]]>} ins(%cst : f16) outs(%6 : tensor<1x?x128xf16>) -> tensor<1x?x128xf16>
%10 = linalg.batch_matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 64, 128, 64]]>} ins(%7, %8 : tensor<1x?x1280xf16>, tensor<1x1280x128xf16>) outs(%9 : tensor<1x?x128xf16>) -> tensor<1x?x128xf16>
flow.dispatch.tensor.store %10, %2, offsets = [%workgroup_id_z, %3, %4], sizes = [1, %5, 128], strides = [1, 1, 1] : tensor<1x?x128xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
return
}
}
padding parallel dims
candidate: %10 = linalg.batch_matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 64, 128, 64]]>} ins(%7, %8 : tensor<1x?x1280xf16>, tensor<1x1280x128xf16>) outs(%9 : tensor<1x?x128xf16>) -> tensor<1x?x128xf16>
[linalg-padding]: Start rewriteAsPaddedOp : %10 = linalg.batch_matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 64, 128, 64]]>} ins(%7, %8 : tensor<1x?x1280xf16>, tensor<1x1280x128xf16>) outs(%9 : tensor<1x?x128xf16>) -> tensor<1x?x128xf16>
[linalg-padding]: --compute padded size for dim 0
[linalg-padding]: ----new dim size: 1
[linalg-padding]: --compute padded size for dim 1
[linalg-padding]: ----could not compute a bounding box for padding
[linalg-padding]: ----Fallback to use pre-configured smallest static bounds
[linalg-padding]: ----new dim size: 64
[linalg-padding]: --compute padded size for dim 2
[linalg-padding]: ----dim does not require padding, SKIP
[linalg-padding]: --SUCCESS, makeComposedPadHighOp with type: tensor<1x64x1280xf16>
[linalg-padding]: --compute padded size for dim 0
[linalg-padding]: ----new dim size: 1
[linalg-padding]: --compute padded size for dim 1
[linalg-padding]: ----dim does not require padding, SKIP
[linalg-padding]: --compute padded size for dim 2
[linalg-padding]: ----new dim size: 128
[linalg-padding]: --compute padded size for dim 0
[linalg-padding]: ----new dim size: 1
[linalg-padding]: --compute padded size for dim 1
[linalg-padding]: ----could not compute a bounding box for padding
[linalg-padding]: ----Fallback to use pre-configured smallest static bounds
[linalg-padding]: ----new dim size: 64
[linalg-padding]: --compute padded size for dim 2
[linalg-padding]: ----new dim size: 128
[linalg-padding]: --SUCCESS, makeComposedPadHighOp with type: tensor<1x64x128xf16>
[linalg-padding]: --cloned padded op: %13 = linalg.batch_matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 64, 128, 64]]>} ins(%padded, %8 : tensor<1x64x1280xf16>, tensor<1x1280x128xf16>) outs(%padded_4 : tensor<1x64x128xf16>) -> tensor<1x64x128xf16>
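
Annotation (not compiler output): the trace above pads the dynamic M dimension up to the static bound 64 (and the result tile to 1x64x128). Padding with the zero fill value is harmless: the extra rows produce zero output rows, which the extract_slice in the next dump discards. A minimal numpy sketch of that argument, with made-up small sizes (names and shapes illustrative, not from the log):

import numpy as np

# Boundary workgroup: only 8 valid M rows, padded to the static tile of 64.
m_valid, m_tile, k, n = 8, 64, 16, 8
lhs = np.random.rand(m_valid, k).astype(np.float32)
rhs = np.random.rand(k, n).astype(np.float32)

lhs_padded = np.pad(lhs, ((0, m_tile - m_valid), (0, 0)))  # zero rows (tensor.pad)
out_padded = lhs_padded @ rhs                              # static 64xN matmul
out = out_padded[:m_valid]                                 # extract_slice of valid rows

assert np.allclose(out, lhs @ rhs)  # zero-padded rows do not affect valid rows
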
// -----// IR Dump After LLVMGPUPromoteMatmulToFitMMA (iree-llvmgpu-promote-matmul-to-fit-mma) //----- //
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16() {
%c1 = arith.constant 1 : index
%cst = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
%workgroup_id_z = hal.interface.workgroup.id[2] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y]
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%4 = affine.apply affine_map<()[s0] -> (s0 * 128)>()[%workgroup_id_x]
%5 = affine.min affine_map<()[s0] -> (s0 * -64 + 968, 64)>()[%workgroup_id_y]
%6 = flow.dispatch.tensor.load %0, offsets = [%workgroup_id_z, %3, 0], sizes = [1, %5, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>> -> tensor<1x?x1280xf16>
%dim = tensor.dim %6, %c1 : tensor<1x?x1280xf16>
%extracted_slice = tensor.extract_slice %6[0, 0, 0] [1, %dim, 1280] [1, 1, 1] : tensor<1x?x1280xf16> to tensor<1x?x1280xf16>
%7 = flow.dispatch.tensor.load %1, offsets = [%workgroup_id_z, 0, %4], sizes = [1, 1280, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>> -> tensor<1x1280x128xf16>
%8 = affine.apply affine_map<()[s0] -> (-s0 + 64)>()[%5]
%padded = tensor.pad %extracted_slice low[0, 0, 0] high[0, %8, 0] {
^bb0(%arg0: index, %arg1: index, %arg2: index):
tensor.yield %cst : f16
} : tensor<1x?x1280xf16> to tensor<1x64x1280xf16>
%9 = tensor.empty() : tensor<1x64x128xf16>
%10 = linalg.fill ins(%cst : f16) outs(%9 : tensor<1x64x128xf16>) -> tensor<1x64x128xf16>
%11 = linalg.batch_matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 64, 128, 64]]>} ins(%padded, %7 : tensor<1x64x1280xf16>, tensor<1x1280x128xf16>) outs(%10 : tensor<1x64x128xf16>) -> tensor<1x64x128xf16>
%extracted_slice_0 = tensor.extract_slice %11[0, 0, 0] [1, %5, 128] [1, 1, 1] : tensor<1x64x128xf16> to tensor<1x?x128xf16>
flow.dispatch.tensor.store %extracted_slice_0, %2, offsets = [%workgroup_id_z, %3, %4], sizes = [1, %5, 128], strides = [1, 1, 1] : tensor<1x?x128xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
return
}
// -----// IR Dump After GPUTensorTileToSerialLoops (iree-codegen-gpu-tensor-tile-to-serial-loops) //----- //
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16() {
%c20 = arith.constant 20 : index
%c1 = arith.constant 1 : index
%cst = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
%workgroup_id_z = hal.interface.workgroup.id[2] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y]
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%4 = affine.apply affine_map<()[s0] -> (s0 * 128)>()[%workgroup_id_x]
%5 = affine.min affine_map<()[s0] -> (s0 * -64 + 968, 64)>()[%workgroup_id_y]
%6 = flow.dispatch.tensor.load %0, offsets = [%workgroup_id_z, %3, 0], sizes = [1, %5, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>> -> tensor<1x?x1280xf16>
%dim = tensor.dim %6, %c1 : tensor<1x?x1280xf16>
%extracted_slice = tensor.extract_slice %6[0, 0, 0] [1, %dim, 1280] [1, 1, 1] : tensor<1x?x1280xf16> to tensor<1x?x1280xf16>
%7 = flow.dispatch.tensor.load %1, offsets = [%workgroup_id_z, 0, %4], sizes = [1, 1280, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>> -> tensor<1x1280x128xf16>
%8 = affine.apply affine_map<()[s0] -> (-s0 + 64)>()[%5]
%padded = tensor.pad %extracted_slice low[0, 0, 0] high[0, %8, 0] {
^bb0(%arg0: index, %arg1: index, %arg2: index):
tensor.yield %cst : f16
} : tensor<1x?x1280xf16> to tensor<1x64x1280xf16>
%9 = tensor.empty() : tensor<1x64x128xf16>
%10 = linalg.fill ins(%cst : f16) outs(%9 : tensor<1x64x128xf16>) -> tensor<1x64x128xf16>
%11 = scf.for %arg0 = %c0 to %c20 step %c1 iter_args(%arg1 = %10) -> (tensor<1x64x128xf16>) {
%12:2 = affine.delinearize_index %arg0 into (%c1, %c20) : index, index
%13 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%12#1]
%extracted_slice_1 = tensor.extract_slice %padded[%12#0, 0, %13] [1, 64, 64] [1, 1, 1] : tensor<1x64x1280xf16> to tensor<1x64x64xf16>
%extracted_slice_2 = tensor.extract_slice %7[%12#0, %13, 0] [1, 64, 128] [1, 1, 1] : tensor<1x1280x128xf16> to tensor<1x64x128xf16>
%extracted_slice_3 = tensor.extract_slice %arg1[%12#0, 0, 0] [1, 64, 128] [1, 1, 1] : tensor<1x64x128xf16> to tensor<1x64x128xf16>
%14 = linalg.batch_matmul {__internal_linalg_transform__ = "workgroup_k_tiled", lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 64, 128, 64]]>} ins(%extracted_slice_1, %extracted_slice_2 : tensor<1x64x64xf16>, tensor<1x64x128xf16>) outs(%extracted_slice_3 : tensor<1x64x128xf16>) -> tensor<1x64x128xf16>
%inserted_slice = tensor.insert_slice %14 into %arg1[%12#0, 0, 0] [1, 64, 128] [1, 1, 1] : tensor<1x64x128xf16> into tensor<1x64x128xf16>
scf.yield %inserted_slice : tensor<1x64x128xf16>
}
%extracted_slice_0 = tensor.extract_slice %11[0, 0, 0] [1, %5, 128] [1, 1, 1] : tensor<1x64x128xf16> to tensor<1x?x128xf16>
flow.dispatch.tensor.store %extracted_slice_0, %2, offsets = [%workgroup_id_z, %3, %4], sizes = [1, %5, 128], strides = [1, 1, 1] : tensor<1x?x128xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
return
}
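// For context: GPUTensorTileToSerialLoops tiles the reduction dimension
// K = 1280 by the configured tile size 64, giving 1280 / 64 = 20 serial
// iterations; that is where %c20 and the scf.for above come from. The
// affine.delinearize_index recovers (batch-tile, k-tile) coordinates from the
// flat induction variable; with a batch extent of 1 the first result is
// always 0, and later canonicalization drops it. A minimal sketch of the same
// index computation (function name hypothetical):
func.func @k_tile_offset(%iv: index) -> (index, index) {
  %c1 = arith.constant 1 : index
  %c20 = arith.constant 20 : index
  // Split the flat counter into (batch-tile, k-tile) indices.
  %ids:2 = affine.delinearize_index %iv into (%c1, %c20) : index, index
  // Scale the k-tile index by the K tile size to get an element offset.
  %k_off = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%ids#1]
  return %ids#0, %k_off : index, index
}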
padding reduction dims
candidate: %14 = linalg.batch_matmul {__internal_linalg_transform__ = "workgroup_k_tiled", lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 64, 128, 64]]>} ins(%extracted_slice_1, %extracted_slice_2 : tensor<1x64x64xf16>, tensor<1x64x128xf16>) outs(%extracted_slice_3 : tensor<1x64x128xf16>) -> tensor<1x64x128xf16>
[linalg-padding]: Start rewriteAsPaddedOp : %14 = linalg.batch_matmul {__internal_linalg_transform__ = "workgroup_k_tiled", lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 64, 128, 64]]>} ins(%extracted_slice_1, %extracted_slice_2 : tensor<1x64x64xf16>, tensor<1x64x128xf16>) outs(%extracted_slice_3 : tensor<1x64x128xf16>) -> tensor<1x64x128xf16>
[linalg-padding]: --compute padded size for dim 0
[linalg-padding]: ----dim does not require padding, SKIP
[linalg-padding]: --compute padded size for dim 1
[linalg-padding]: ----dim does not require padding, SKIP
[linalg-padding]: --compute padded size for dim 2
[linalg-padding]: ----new dim size: 64
[linalg-padding]: --SUCCESS, makeComposedPadHighOp with type: tensor<1x64x64xf16>
[linalg-padding]: --compute padded size for dim 0
[linalg-padding]: ----dim does not require padding, SKIP
[linalg-padding]: --compute padded size for dim 1
[linalg-padding]: ----new dim size: 64
[linalg-padding]: --compute padded size for dim 2
[linalg-padding]: ----dim does not require padding, SKIP
[linalg-padding]: --SUCCESS, makeComposedPadHighOp with type: tensor<1x64x128xf16>
[linalg-padding]: --compute padded size for dim 0
[linalg-padding]: ----dim does not require padding, SKIP
[linalg-padding]: --compute padded size for dim 1
[linalg-padding]: ----dim does not require padding, SKIP
[linalg-padding]: --compute padded size for dim 2
[linalg-padding]: ----dim does not require padding, SKIP
[linalg-padding]: --cloned padded op: %15 = linalg.batch_matmul {__internal_linalg_transform__ = "workgroup_k_tiled", lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 64, 128, 64]]>} ins(%padded_5, %padded_7 : tensor<1x64x64xf16>, tensor<1x64x128xf16>) outs(%extracted_slice_3 : tensor<1x64x128xf16>) -> tensor<1x64x128xf16>
// -----// IR Dump After LLVMGPUPromoteMatmulToFitMMA (iree-llvmgpu-promote-matmul-to-fit-mma) //----- //
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16() {
%c20 = arith.constant 20 : index
%c1 = arith.constant 1 : index
%cst = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
%workgroup_id_z = hal.interface.workgroup.id[2] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y]
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%4 = affine.apply affine_map<()[s0] -> (s0 * 128)>()[%workgroup_id_x]
%5 = affine.min affine_map<()[s0] -> (s0 * -64 + 968, 64)>()[%workgroup_id_y]
%6 = flow.dispatch.tensor.load %0, offsets = [%workgroup_id_z, %3, 0], sizes = [1, %5, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>> -> tensor<1x?x1280xf16>
%7 = flow.dispatch.tensor.load %1, offsets = [%workgroup_id_z, 0, %4], sizes = [1, 1280, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>> -> tensor<1x1280x128xf16>
%8 = affine.apply affine_map<()[s0] -> (-s0 + 64)>()[%5]
%9 = tensor.empty() : tensor<1x64x128xf16>
%10 = linalg.fill ins(%cst : f16) outs(%9 : tensor<1x64x128xf16>) -> tensor<1x64x128xf16>
%11 = scf.for %arg0 = %c0 to %c20 step %c1 iter_args(%arg1 = %10) -> (tensor<1x64x128xf16>) {
%12:2 = affine.delinearize_index %arg0 into (%c1, %c20) : index, index
%13 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%12#1]
%extracted_slice_0 = tensor.extract_slice %7[%12#0, %13, 0] [1, 64, 128] [1, 1, 1] : tensor<1x1280x128xf16> to tensor<1x64x128xf16>
%extracted_slice_1 = tensor.extract_slice %arg1[%12#0, 0, 0] [1, 64, 128] [1, 1, 1] : tensor<1x64x128xf16> to tensor<1x64x128xf16>
%extracted_slice_2 = tensor.extract_slice %6[%12#0, 0, %13] [1, %5, 64] [1, 1, 1] : tensor<1x?x1280xf16> to tensor<1x?x64xf16>
%padded = tensor.pad %extracted_slice_2 nofold low[0, 0, 0] high[0, %8, 0] {
^bb0(%arg2: index, %arg3: index, %arg4: index):
tensor.yield %cst : f16
} : tensor<1x?x64xf16> to tensor<1x64x64xf16>
%padded_3 = tensor.pad %extracted_slice_0 nofold low[0, 0, 0] high[0, 0, 0] {
^bb0(%arg2: index, %arg3: index, %arg4: index):
tensor.yield %cst : f16
} : tensor<1x64x128xf16> to tensor<1x64x128xf16>
%14 = linalg.batch_matmul {__internal_linalg_transform__ = "workgroup_k_tiled", lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 64, 128, 64]]>} ins(%padded, %padded_3 : tensor<1x64x64xf16>, tensor<1x64x128xf16>) outs(%extracted_slice_1 : tensor<1x64x128xf16>) -> tensor<1x64x128xf16>
%inserted_slice = tensor.insert_slice %14 into %arg1[%12#0, 0, 0] [1, 64, 128] [1, 1, 1] : tensor<1x64x128xf16> into tensor<1x64x128xf16>
scf.yield %inserted_slice : tensor<1x64x128xf16>
}
%extracted_slice = tensor.extract_slice %11[0, 0, 0] [1, %5, 128] [1, 1, 1] : tensor<1x64x128xf16> to tensor<1x?x128xf16>
flow.dispatch.tensor.store %extracted_slice, %2, offsets = [%workgroup_id_z, %3, %4], sizes = [1, %5, 128], strides = [1, 1, 1] : tensor<1x?x128xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
return
}
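// For context: the second run of the pass pads the K-tiled operands inside the
// serial loop and marks both pads nofold. The RHS pad is zero-sized
// (high[0, 0, 0]) and would otherwise canonicalize away; nofold presumably
// keeps the pad in place so each operand retains a static, tile-sized shape
// through vectorization and can be staged to workgroup memory later (see
// GPUVectorAlloc below). A minimal sketch of a nofold pad (function name
// hypothetical):
func.func @nofold_pad(%src: tensor<1x64x128xf16>) -> tensor<1x64x128xf16> {
  %cst = arith.constant 0.000000e+00 : f16
  // A zero-extent pad that survives canonicalization because of nofold.
  %p = tensor.pad %src nofold low[0, 0, 0] high[0, 0, 0] {
  ^bb0(%i: index, %j: index, %k: index):
    tensor.yield %cst : f16
  } : tensor<1x64x128xf16> to tensor<1x64x128xf16>
  return %p : tensor<1x64x128xf16>
}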
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
module {
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16() {
%c20 = arith.constant 20 : index
%c1 = arith.constant 1 : index
%cst = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
%workgroup_id_z = hal.interface.workgroup.id[2] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y]
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%4 = affine.apply affine_map<()[s0] -> (s0 * 128)>()[%workgroup_id_x]
%5 = affine.min affine_map<()[s0] -> (s0 * -64 + 968, 64)>()[%workgroup_id_y]
%6 = flow.dispatch.tensor.load %0, offsets = [%workgroup_id_z, %3, 0], sizes = [1, %5, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>> -> tensor<1x?x1280xf16>
%7 = flow.dispatch.tensor.load %1, offsets = [%workgroup_id_z, 0, %4], sizes = [1, 1280, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>> -> tensor<1x1280x128xf16>
%8 = affine.apply affine_map<()[s0] -> (-s0 + 64)>()[%5]
%9 = tensor.empty() : tensor<1x64x128xf16>
%10 = linalg.fill ins(%cst : f16) outs(%9 : tensor<1x64x128xf16>) -> tensor<1x64x128xf16>
%11 = scf.for %arg0 = %c0 to %c20 step %c1 iter_args(%arg1 = %10) -> (tensor<1x64x128xf16>) {
%12 = affine.delinearize_index %arg0 into (%c20) : index
%13 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%12]
%extracted_slice_0 = tensor.extract_slice %7[0, %13, 0] [1, 64, 128] [1, 1, 1] : tensor<1x1280x128xf16> to tensor<1x64x128xf16>
%extracted_slice_1 = tensor.extract_slice %6[0, 0, %13] [1, %5, 64] [1, 1, 1] : tensor<1x?x1280xf16> to tensor<1x?x64xf16>
%padded = tensor.pad %extracted_slice_1 nofold low[0, 0, 0] high[0, %8, 0] {
^bb0(%arg2: index, %arg3: index, %arg4: index):
tensor.yield %cst : f16
} : tensor<1x?x64xf16> to tensor<1x64x64xf16>
%padded_2 = tensor.pad %extracted_slice_0 nofold low[0, 0, 0] high[0, 0, 0] {
^bb0(%arg2: index, %arg3: index, %arg4: index):
tensor.yield %cst : f16
} : tensor<1x64x128xf16> to tensor<1x64x128xf16>
%14 = linalg.batch_matmul {__internal_linalg_transform__ = "workgroup_k_tiled", lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 64, 128, 64]]>} ins(%padded, %padded_2 : tensor<1x64x64xf16>, tensor<1x64x128xf16>) outs(%arg1 : tensor<1x64x128xf16>) -> tensor<1x64x128xf16>
scf.yield %14 : tensor<1x64x128xf16>
}
%extracted_slice = tensor.extract_slice %11[0, 0, 0] [1, %5, 128] [1, 1, 1] : tensor<1x64x128xf16> to tensor<1x?x128xf16>
flow.dispatch.tensor.store %extracted_slice, %2, offsets = [%workgroup_id_z, %3, %4], sizes = [1, %5, 128], strides = [1, 1, 1] : tensor<1x?x128xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
return
}
}
// -----// IR Dump After CSE (cse) //----- //
module {
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16() {
%c20 = arith.constant 20 : index
%c1 = arith.constant 1 : index
%cst = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
%workgroup_id_z = hal.interface.workgroup.id[2] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y]
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%4 = affine.apply affine_map<()[s0] -> (s0 * 128)>()[%workgroup_id_x]
%5 = affine.min affine_map<()[s0] -> (s0 * -64 + 968, 64)>()[%workgroup_id_y]
%6 = flow.dispatch.tensor.load %0, offsets = [%workgroup_id_z, %3, 0], sizes = [1, %5, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>> -> tensor<1x?x1280xf16>
%7 = flow.dispatch.tensor.load %1, offsets = [%workgroup_id_z, 0, %4], sizes = [1, 1280, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>> -> tensor<1x1280x128xf16>
%8 = affine.apply affine_map<()[s0] -> (-s0 + 64)>()[%5]
%9 = tensor.empty() : tensor<1x64x128xf16>
%10 = linalg.fill ins(%cst : f16) outs(%9 : tensor<1x64x128xf16>) -> tensor<1x64x128xf16>
%11 = scf.for %arg0 = %c0 to %c20 step %c1 iter_args(%arg1 = %10) -> (tensor<1x64x128xf16>) {
%12 = affine.delinearize_index %arg0 into (%c20) : index
%13 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%12]
%extracted_slice_0 = tensor.extract_slice %7[0, %13, 0] [1, 64, 128] [1, 1, 1] : tensor<1x1280x128xf16> to tensor<1x64x128xf16>
%extracted_slice_1 = tensor.extract_slice %6[0, 0, %13] [1, %5, 64] [1, 1, 1] : tensor<1x?x1280xf16> to tensor<1x?x64xf16>
%padded = tensor.pad %extracted_slice_1 nofold low[0, 0, 0] high[0, %8, 0] {
^bb0(%arg2: index, %arg3: index, %arg4: index):
tensor.yield %cst : f16
} : tensor<1x?x64xf16> to tensor<1x64x64xf16>
%padded_2 = tensor.pad %extracted_slice_0 nofold low[0, 0, 0] high[0, 0, 0] {
^bb0(%arg2: index, %arg3: index, %arg4: index):
tensor.yield %cst : f16
} : tensor<1x64x128xf16> to tensor<1x64x128xf16>
%14 = linalg.batch_matmul {__internal_linalg_transform__ = "workgroup_k_tiled", lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 64, 128, 64]]>} ins(%padded, %padded_2 : tensor<1x64x64xf16>, tensor<1x64x128xf16>) outs(%arg1 : tensor<1x64x128xf16>) -> tensor<1x64x128xf16>
scf.yield %14 : tensor<1x64x128xf16>
}
%extracted_slice = tensor.extract_slice %11[0, 0, 0] [1, %5, 128] [1, 1, 1] : tensor<1x64x128xf16> to tensor<1x?x128xf16>
flow.dispatch.tensor.store %extracted_slice, %2, offsets = [%workgroup_id_z, %3, %4], sizes = [1, %5, 128], strides = [1, 1, 1] : tensor<1x?x128xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
return
}
}
// -----// IR Dump After OptimizeTensorInsertExtractSlices (iree-codegen-optimize-tensor-insert-extract-slices) //----- //
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16() {
%c20 = arith.constant 20 : index
%c1 = arith.constant 1 : index
%cst = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
%workgroup_id_z = hal.interface.workgroup.id[2] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y]
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%4 = affine.apply affine_map<()[s0] -> (s0 * 128)>()[%workgroup_id_x]
%5 = affine.min affine_map<()[s0] -> (s0 * -64 + 968, 64)>()[%workgroup_id_y]
%6 = flow.dispatch.tensor.load %0, offsets = [%workgroup_id_z, %3, 0], sizes = [1, %5, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>> -> tensor<1x?x1280xf16>
%7 = flow.dispatch.tensor.load %1, offsets = [%workgroup_id_z, 0, %4], sizes = [1, 1280, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>> -> tensor<1x1280x128xf16>
%8 = affine.apply affine_map<()[s0] -> (-s0 + 64)>()[%5]
%9 = tensor.empty() : tensor<1x64x128xf16>
%10 = linalg.fill ins(%cst : f16) outs(%9 : tensor<1x64x128xf16>) -> tensor<1x64x128xf16>
%11 = scf.for %arg0 = %c0 to %c20 step %c1 iter_args(%arg1 = %10) -> (tensor<1x64x128xf16>) {
%12 = affine.delinearize_index %arg0 into (%c20) : index
%13 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%12]
%extracted_slice_0 = tensor.extract_slice %7[0, %13, 0] [1, 64, 128] [1, 1, 1] : tensor<1x1280x128xf16> to tensor<1x64x128xf16>
%extracted_slice_1 = tensor.extract_slice %6[0, 0, %13] [1, %5, 64] [1, 1, 1] : tensor<1x?x1280xf16> to tensor<1x?x64xf16>
%padded = tensor.pad %extracted_slice_1 nofold low[0, 0, 0] high[0, %8, 0] {
^bb0(%arg2: index, %arg3: index, %arg4: index):
tensor.yield %cst : f16
} : tensor<1x?x64xf16> to tensor<1x64x64xf16>
%padded_2 = tensor.pad %extracted_slice_0 nofold low[0, 0, 0] high[0, 0, 0] {
^bb0(%arg2: index, %arg3: index, %arg4: index):
tensor.yield %cst : f16
} : tensor<1x64x128xf16> to tensor<1x64x128xf16>
%14 = linalg.batch_matmul {__internal_linalg_transform__ = "workgroup_k_tiled", lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 64, 128, 64]]>} ins(%padded, %padded_2 : tensor<1x64x64xf16>, tensor<1x64x128xf16>) outs(%arg1 : tensor<1x64x128xf16>) -> tensor<1x64x128xf16>
scf.yield %14 : tensor<1x64x128xf16>
}
%extracted_slice = tensor.extract_slice %11[0, 0, 0] [1, %5, 128] [1, 1, 1] : tensor<1x64x128xf16> to tensor<1x?x128xf16>
flow.dispatch.tensor.store %extracted_slice, %2, offsets = [%workgroup_id_z, %3, %4], sizes = [1, %5, 128], strides = [1, 1, 1] : tensor<1x?x128xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
return
}
// -----// IR Dump After DecomposeConvolutionToLowerDimOps (iree-codegen-decompose-convolution-to-lower-dim-ops) //----- //
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16() {
%c20 = arith.constant 20 : index
%c1 = arith.constant 1 : index
%cst = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
%workgroup_id_z = hal.interface.workgroup.id[2] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y]
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%4 = affine.apply affine_map<()[s0] -> (s0 * 128)>()[%workgroup_id_x]
%5 = affine.min affine_map<()[s0] -> (s0 * -64 + 968, 64)>()[%workgroup_id_y]
%6 = flow.dispatch.tensor.load %0, offsets = [%workgroup_id_z, %3, 0], sizes = [1, %5, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>> -> tensor<1x?x1280xf16>
%7 = flow.dispatch.tensor.load %1, offsets = [%workgroup_id_z, 0, %4], sizes = [1, 1280, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>> -> tensor<1x1280x128xf16>
%8 = affine.apply affine_map<()[s0] -> (-s0 + 64)>()[%5]
%9 = tensor.empty() : tensor<1x64x128xf16>
%10 = linalg.fill ins(%cst : f16) outs(%9 : tensor<1x64x128xf16>) -> tensor<1x64x128xf16>
%11 = scf.for %arg0 = %c0 to %c20 step %c1 iter_args(%arg1 = %10) -> (tensor<1x64x128xf16>) {
%12 = affine.delinearize_index %arg0 into (%c20) : index
%13 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%12]
%extracted_slice_0 = tensor.extract_slice %7[0, %13, 0] [1, 64, 128] [1, 1, 1] : tensor<1x1280x128xf16> to tensor<1x64x128xf16>
%extracted_slice_1 = tensor.extract_slice %6[0, 0, %13] [1, %5, 64] [1, 1, 1] : tensor<1x?x1280xf16> to tensor<1x?x64xf16>
%padded = tensor.pad %extracted_slice_1 nofold low[0, 0, 0] high[0, %8, 0] {
^bb0(%arg2: index, %arg3: index, %arg4: index):
tensor.yield %cst : f16
} : tensor<1x?x64xf16> to tensor<1x64x64xf16>
%padded_2 = tensor.pad %extracted_slice_0 nofold low[0, 0, 0] high[0, 0, 0] {
^bb0(%arg2: index, %arg3: index, %arg4: index):
tensor.yield %cst : f16
} : tensor<1x64x128xf16> to tensor<1x64x128xf16>
%14 = linalg.batch_matmul {__internal_linalg_transform__ = "workgroup_k_tiled", lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 64, 128, 64]]>} ins(%padded, %padded_2 : tensor<1x64x64xf16>, tensor<1x64x128xf16>) outs(%arg1 : tensor<1x64x128xf16>) -> tensor<1x64x128xf16>
scf.yield %14 : tensor<1x64x128xf16>
}
%extracted_slice = tensor.extract_slice %11[0, 0, 0] [1, %5, 128] [1, 1, 1] : tensor<1x64x128xf16> to tensor<1x?x128xf16>
flow.dispatch.tensor.store %extracted_slice, %2, offsets = [%workgroup_id_z, %3, %4], sizes = [1, %5, 128], strides = [1, 1, 1] : tensor<1x?x128xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
return
}
// -----// IR Dump After GenericVectorization (iree-codegen-generic-vectorization) //----- //
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16() {
%cst = arith.constant dense<0.000000e+00> : vector<1x64x128xf16>
%c20 = arith.constant 20 : index
%c1 = arith.constant 1 : index
%cst_0 = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
%workgroup_id_z = hal.interface.workgroup.id[2] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y]
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%4 = affine.apply affine_map<()[s0] -> (s0 * 128)>()[%workgroup_id_x]
%5 = affine.min affine_map<()[s0] -> (s0 * -64 + 968, 64)>()[%workgroup_id_y]
%6 = flow.dispatch.tensor.load %0, offsets = [%workgroup_id_z, %3, 0], sizes = [1, %5, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>> -> tensor<1x?x1280xf16>
%7 = flow.dispatch.tensor.load %1, offsets = [%workgroup_id_z, 0, %4], sizes = [1, 1280, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>> -> tensor<1x1280x128xf16>
%8 = tensor.empty() : tensor<1x64x128xf16>
%9 = vector.transfer_write %cst, %8[%c0, %c0, %c0] {in_bounds = [true, true, true]} : vector<1x64x128xf16>, tensor<1x64x128xf16>
%10 = scf.for %arg0 = %c0 to %c20 step %c1 iter_args(%arg1 = %9) -> (tensor<1x64x128xf16>) {
%11 = affine.delinearize_index %arg0 into (%c20) : index
%12 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%11]
%extracted_slice_1 = tensor.extract_slice %7[0, %12, 0] [1, 64, 128] [1, 1, 1] : tensor<1x1280x128xf16> to tensor<1x64x128xf16>
%extracted_slice_2 = tensor.extract_slice %6[0, 0, %12] [1, %5, 64] [1, 1, 1] : tensor<1x?x1280xf16> to tensor<1x?x64xf16>
%13 = vector.transfer_read %extracted_slice_2[%c0, %c0, %c0], %cst_0 {in_bounds = [true, false, true]} : tensor<1x?x64xf16>, vector<1x64x64xf16>
%14 = vector.transfer_read %extracted_slice_1[%c0, %c0, %c0], %cst_0 {in_bounds = [true, true, true]} : tensor<1x64x128xf16>, vector<1x64x128xf16>
%15 = vector.transfer_read %arg1[%c0, %c0, %c0], %cst_0 {in_bounds = [true, true, true]} : tensor<1x64x128xf16>, vector<1x64x128xf16>
%16 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %13, %14, %15 : vector<1x64x64xf16>, vector<1x64x128xf16> into vector<1x64x128xf16>
%17 = vector.transfer_write %16, %arg1[%c0, %c0, %c0] {in_bounds = [true, true, true]} : vector<1x64x128xf16>, tensor<1x64x128xf16>
scf.yield %17 : tensor<1x64x128xf16>
}
%extracted_slice = tensor.extract_slice %10[0, 0, 0] [1, %5, 128] [1, 1, 1] : tensor<1x64x128xf16> to tensor<1x?x128xf16>
flow.dispatch.tensor.store %extracted_slice, %2, offsets = [%workgroup_id_z, %3, %4], sizes = [1, %5, 128], strides = [1, 1, 1] : tensor<1x?x128xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
return
}
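// For context: GenericVectorization rewrites the padded batch_matmul into a
// vector.contract with batch-matmul indexing maps, and folds the tensor.pad
// ops into the vector transfers: the dynamic M dimension of the LHS is read
// with in_bounds = [true, false, true], and out-of-bounds lanes take the
// padding value %cst_0, so no explicit pad op survives. A minimal sketch of
// such a padding read (function name hypothetical):
func.func @padded_read(%src: tensor<1x?x64xf16>) -> vector<1x64x64xf16> {
  %c0 = arith.constant 0 : index
  %pad = arith.constant 0.000000e+00 : f16
  // Dim 1 may be shorter than 64; out-of-bounds lanes read %pad.
  %v = vector.transfer_read %src[%c0, %c0, %c0], %pad {in_bounds = [true, false, true]} : tensor<1x?x64xf16>, vector<1x64x64xf16>
  return %v : vector<1x64x64xf16>
}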
// -----// IR Dump After OptimizeTensorInsertExtractSlices (iree-codegen-optimize-tensor-insert-extract-slices) //----- //
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16() {
%cst = arith.constant dense<0.000000e+00> : vector<1x64x128xf16>
%c20 = arith.constant 20 : index
%c1 = arith.constant 1 : index
%cst_0 = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
%workgroup_id_z = hal.interface.workgroup.id[2] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y]
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%4 = affine.apply affine_map<()[s0] -> (s0 * 128)>()[%workgroup_id_x]
%5 = affine.min affine_map<()[s0] -> (s0 * -64 + 968, 64)>()[%workgroup_id_y]
%6 = flow.dispatch.tensor.load %0, offsets = [%workgroup_id_z, %3, 0], sizes = [1, %5, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>> -> tensor<1x?x1280xf16>
%7 = flow.dispatch.tensor.load %1, offsets = [%workgroup_id_z, 0, %4], sizes = [1, 1280, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>> -> tensor<1x1280x128xf16>
%8 = tensor.empty() : tensor<1x64x128xf16>
%9 = vector.transfer_write %cst, %8[%c0, %c0, %c0] {in_bounds = [true, true, true]} : vector<1x64x128xf16>, tensor<1x64x128xf16>
%10:2 = scf.for %arg0 = %c0 to %c20 step %c1 iter_args(%arg1 = %9, %arg2 = %cst) -> (tensor<1x64x128xf16>, vector<1x64x128xf16>) {
%12 = affine.delinearize_index %arg0 into (%c20) : index
%13 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%12]
%extracted_slice_1 = tensor.extract_slice %6[0, 0, %13] [1, %5, 64] [1, 1, 1] : tensor<1x?x1280xf16> to tensor<1x?x64xf16>
%14 = vector.transfer_read %extracted_slice_1[%c0, %c0, %c0], %cst_0 {in_bounds = [true, false, true]} : tensor<1x?x64xf16>, vector<1x64x64xf16>
%15 = vector.transfer_read %7[%c0, %13, %c0], %cst_0 {in_bounds = [true, true, true]} : tensor<1x1280x128xf16>, vector<1x64x128xf16>
%16 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %14, %15, %arg2 : vector<1x64x64xf16>, vector<1x64x128xf16> into vector<1x64x128xf16>
scf.yield %arg1, %16 : tensor<1x64x128xf16>, vector<1x64x128xf16>
}
%11 = vector.transfer_write %10#1, %10#0[%c0, %c0, %c0] {in_bounds = [true, true, true]} : vector<1x64x128xf16>, tensor<1x64x128xf16>
%extracted_slice = tensor.extract_slice %11[0, 0, 0] [1, %5, 128] [1, 1, 1] : tensor<1x64x128xf16> to tensor<1x?x128xf16>
flow.dispatch.tensor.store %extracted_slice, %2, offsets = [%workgroup_id_z, %3, %4], sizes = [1, %5, 128], strides = [1, 1, 1] : tensor<1x?x128xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
return
}
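// For context: OptimizeTensorInsertExtractSlices hoists the loop-invariant
// accumulator transfers out of the scf.for: the accumulator now travels as a
// vector iter_arg and is written back to the tensor once, after the loop (the
// dead tensor iter_arg is then removed by the canonicalizer below). A minimal
// sketch of the hoisted form (function name hypothetical):
func.func @hoisted_acc(%init: tensor<1x64x128xf16>) -> tensor<1x64x128xf16> {
  %c0 = arith.constant 0 : index
  %c1 = arith.constant 1 : index
  %c20 = arith.constant 20 : index
  %zero = arith.constant dense<0.000000e+00> : vector<1x64x128xf16>
  // The accumulator stays in registers across all 20 K-tile iterations.
  %acc = scf.for %i = %c0 to %c20 step %c1 iter_args(%a = %zero) -> (vector<1x64x128xf16>) {
    // ... the body would accumulate into %a via vector.contract ...
    scf.yield %a : vector<1x64x128xf16>
  }
  // Single write-back after the loop instead of one per iteration.
  %out = vector.transfer_write %acc, %init[%c0, %c0, %c0] {in_bounds = [true, true, true]} : vector<1x64x128xf16>, tensor<1x64x128xf16>
  return %out : tensor<1x64x128xf16>
}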
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16() {
%cst = arith.constant dense<0.000000e+00> : vector<1x64x128xf16>
%c20 = arith.constant 20 : index
%c1 = arith.constant 1 : index
%cst_0 = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
%workgroup_id_z = hal.interface.workgroup.id[2] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y]
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%4 = affine.apply affine_map<()[s0] -> (s0 * 128)>()[%workgroup_id_x]
%5 = affine.min affine_map<()[s0] -> (s0 * -64 + 968, 64)>()[%workgroup_id_y]
%6 = flow.dispatch.tensor.load %0, offsets = [%workgroup_id_z, %3, 0], sizes = [1, %5, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>> -> tensor<1x?x1280xf16>
%7 = flow.dispatch.tensor.load %1, offsets = [%workgroup_id_z, 0, %4], sizes = [1, 1280, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>> -> tensor<1x1280x128xf16>
%8 = tensor.empty() : tensor<1x64x128xf16>
%9 = scf.for %arg0 = %c0 to %c20 step %c1 iter_args(%arg1 = %cst) -> (vector<1x64x128xf16>) {
%11 = affine.delinearize_index %arg0 into (%c20) : index
%12 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%11]
%extracted_slice_1 = tensor.extract_slice %6[0, 0, %12] [1, %5, 64] [1, 1, 1] : tensor<1x?x1280xf16> to tensor<1x?x64xf16>
%13 = vector.transfer_read %extracted_slice_1[%c0, %c0, %c0], %cst_0 {in_bounds = [true, false, true]} : tensor<1x?x64xf16>, vector<1x64x64xf16>
%14 = vector.transfer_read %7[%c0, %12, %c0], %cst_0 {in_bounds = [true, true, true]} : tensor<1x1280x128xf16>, vector<1x64x128xf16>
%15 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %13, %14, %arg1 : vector<1x64x64xf16>, vector<1x64x128xf16> into vector<1x64x128xf16>
scf.yield %15 : vector<1x64x128xf16>
}
%10 = vector.transfer_write %9, %8[%c0, %c0, %c0] {in_bounds = [true, true, true]} : vector<1x64x128xf16>, tensor<1x64x128xf16>
%extracted_slice = tensor.extract_slice %10[0, 0, 0] [1, %5, 128] [1, 1, 1] : tensor<1x64x128xf16> to tensor<1x?x128xf16>
flow.dispatch.tensor.store %extracted_slice, %2, offsets = [%workgroup_id_z, %3, %4], sizes = [1, %5, 128], strides = [1, 1, 1] : tensor<1x?x128xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
return
}
// -----// IR Dump After CSE (cse) //----- //
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16() {
%cst = arith.constant dense<0.000000e+00> : vector<1x64x128xf16>
%c20 = arith.constant 20 : index
%c1 = arith.constant 1 : index
%cst_0 = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
%workgroup_id_z = hal.interface.workgroup.id[2] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y]
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%4 = affine.apply affine_map<()[s0] -> (s0 * 128)>()[%workgroup_id_x]
%5 = affine.min affine_map<()[s0] -> (s0 * -64 + 968, 64)>()[%workgroup_id_y]
%6 = flow.dispatch.tensor.load %0, offsets = [%workgroup_id_z, %3, 0], sizes = [1, %5, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>> -> tensor<1x?x1280xf16>
%7 = flow.dispatch.tensor.load %1, offsets = [%workgroup_id_z, 0, %4], sizes = [1, 1280, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>> -> tensor<1x1280x128xf16>
%8 = tensor.empty() : tensor<1x64x128xf16>
%9 = scf.for %arg0 = %c0 to %c20 step %c1 iter_args(%arg1 = %cst) -> (vector<1x64x128xf16>) {
%11 = affine.delinearize_index %arg0 into (%c20) : index
%12 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%11]
%extracted_slice_1 = tensor.extract_slice %6[0, 0, %12] [1, %5, 64] [1, 1, 1] : tensor<1x?x1280xf16> to tensor<1x?x64xf16>
%13 = vector.transfer_read %extracted_slice_1[%c0, %c0, %c0], %cst_0 {in_bounds = [true, false, true]} : tensor<1x?x64xf16>, vector<1x64x64xf16>
%14 = vector.transfer_read %7[%c0, %12, %c0], %cst_0 {in_bounds = [true, true, true]} : tensor<1x1280x128xf16>, vector<1x64x128xf16>
%15 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %13, %14, %arg1 : vector<1x64x64xf16>, vector<1x64x128xf16> into vector<1x64x128xf16>
scf.yield %15 : vector<1x64x128xf16>
}
%10 = vector.transfer_write %9, %8[%c0, %c0, %c0] {in_bounds = [true, true, true]} : vector<1x64x128xf16>, tensor<1x64x128xf16>
%extracted_slice = tensor.extract_slice %10[0, 0, 0] [1, %5, 128] [1, 1, 1] : tensor<1x64x128xf16> to tensor<1x?x128xf16>
flow.dispatch.tensor.store %extracted_slice, %2, offsets = [%workgroup_id_z, %3, %4], sizes = [1, %5, 128], strides = [1, 1, 1] : tensor<1x?x128xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
return
}
// -----// IR Dump After LLVMGPUFoldExtractSliceIntoXferWrite (iree-llvmgpu-fold-extract-slice-into-xfer-write) //----- //
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16() {
%cst = arith.constant dense<0.000000e+00> : vector<1x64x128xf16>
%c20 = arith.constant 20 : index
%c1 = arith.constant 1 : index
%cst_0 = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
%workgroup_id_z = hal.interface.workgroup.id[2] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y]
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%4 = affine.apply affine_map<()[s0] -> (s0 * 128)>()[%workgroup_id_x]
%5 = affine.min affine_map<()[s0] -> (s0 * -64 + 968, 64)>()[%workgroup_id_y]
%6 = flow.dispatch.tensor.load %0, offsets = [%workgroup_id_z, %3, 0], sizes = [1, %5, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>> -> tensor<1x?x1280xf16>
%7 = flow.dispatch.tensor.load %1, offsets = [%workgroup_id_z, 0, %4], sizes = [1, 1280, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>> -> tensor<1x1280x128xf16>
%8 = scf.for %arg0 = %c0 to %c20 step %c1 iter_args(%arg1 = %cst) -> (vector<1x64x128xf16>) {
%11 = affine.delinearize_index %arg0 into (%c20) : index
%12 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%11]
%extracted_slice = tensor.extract_slice %6[0, 0, %12] [1, %5, 64] [1, 1, 1] : tensor<1x?x1280xf16> to tensor<1x?x64xf16>
%13 = vector.transfer_read %extracted_slice[%c0, %c0, %c0], %cst_0 {in_bounds = [true, false, true]} : tensor<1x?x64xf16>, vector<1x64x64xf16>
%14 = vector.transfer_read %7[%c0, %12, %c0], %cst_0 {in_bounds = [true, true, true]} : tensor<1x1280x128xf16>, vector<1x64x128xf16>
%15 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %13, %14, %arg1 : vector<1x64x64xf16>, vector<1x64x128xf16> into vector<1x64x128xf16>
scf.yield %15 : vector<1x64x128xf16>
}
%9 = tensor.empty(%5) : tensor<1x?x128xf16>
%10 = vector.transfer_write %8, %9[%c0, %c0, %c0] {in_bounds = [true, false, true]} : vector<1x64x128xf16>, tensor<1x?x128xf16>
flow.dispatch.tensor.store %10, %2, offsets = [%workgroup_id_z, %3, %4], sizes = [1, %5, 128], strides = [1, 1, 1] : tensor<1x?x128xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
return
}
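// For context: LLVMGPUFoldExtractSliceIntoXferWrite folds the trailing
// tensor.extract_slice into the vector.transfer_write itself: the destination
// tensor is now sized by the dynamic extent %5 and the M dimension is marked
// out-of-bounds (in_bounds = [true, false, true]), so vector lanes past %5 are
// simply not stored. A minimal sketch of the folded write (function name
// hypothetical):
func.func @folded_write(%v: vector<1x64x128xf16>, %m: index) -> tensor<1x?x128xf16> {
  %c0 = arith.constant 0 : index
  // Destination already has the sliced (dynamic) M extent.
  %dest = tensor.empty(%m) : tensor<1x?x128xf16>
  // Lanes beyond %m along dim 1 are dropped instead of sliced afterwards.
  %w = vector.transfer_write %v, %dest[%c0, %c0, %c0] {in_bounds = [true, false, true]} : vector<1x64x128xf16>, tensor<1x?x128xf16>
  return %w : tensor<1x?x128xf16>
}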
// -----// IR Dump After GPUVectorAlloc (iree-codegen-gpu-vector-alloc) //----- //
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16() {
%cst = arith.constant dense<0.000000e+00> : vector<1x64x128xf16>
%c20 = arith.constant 20 : index
%c1 = arith.constant 1 : index
%cst_0 = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
%workgroup_id_z = hal.interface.workgroup.id[2] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y]
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%4 = affine.apply affine_map<()[s0] -> (s0 * 128)>()[%workgroup_id_x]
%5 = affine.min affine_map<()[s0] -> (s0 * -64 + 968, 64)>()[%workgroup_id_y]
%6 = flow.dispatch.tensor.load %0, offsets = [%workgroup_id_z, %3, 0], sizes = [1, %5, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>> -> tensor<1x?x1280xf16>
%7 = flow.dispatch.tensor.load %1, offsets = [%workgroup_id_z, 0, %4], sizes = [1, 1280, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>> -> tensor<1x1280x128xf16>
%8 = scf.for %arg0 = %c0 to %c20 step %c1 iter_args(%arg1 = %cst) -> (vector<1x64x128xf16>) {
%11 = affine.delinearize_index %arg0 into (%c20) : index
%12 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%11]
%extracted_slice = tensor.extract_slice %6[0, 0, %12] [1, %5, 64] [1, 1, 1] : tensor<1x?x1280xf16> to tensor<1x?x64xf16>
%13 = vector.transfer_read %extracted_slice[%c0, %c0, %c0], %cst_0 {in_bounds = [true, false, true]} : tensor<1x?x64xf16>, vector<1x64x64xf16>
%14 = vector.transfer_read %7[%c0, %12, %c0], %cst_0 {in_bounds = [true, true, true]} : tensor<1x1280x128xf16>, vector<1x64x128xf16>
gpu.barrier
%15 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<1x64x64xf16, #gpu.address_space<workgroup>>
%c0_1 = arith.constant 0 : index
%16 = vector.transfer_write %13, %15[%c0_1, %c0_1, %c0_1] {in_bounds = [true, true, true]} : vector<1x64x64xf16>, tensor<1x64x64xf16, #gpu.address_space<workgroup>>
%17 = bufferization.materialize_in_destination %16 in %16 : (tensor<1x64x64xf16, #gpu.address_space<workgroup>>, tensor<1x64x64xf16, #gpu.address_space<workgroup>>) -> tensor<1x64x64xf16, #gpu.address_space<workgroup>>
%18 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<1x64x128xf16, #gpu.address_space<workgroup>>
%c0_2 = arith.constant 0 : index
%19 = vector.transfer_write %14, %18[%c0_2, %c0_2, %c0_2] {in_bounds = [true, true, true]} : vector<1x64x128xf16>, tensor<1x64x128xf16, #gpu.address_space<workgroup>>
%20 = bufferization.materialize_in_destination %19 in %19 : (tensor<1x64x128xf16, #gpu.address_space<workgroup>>, tensor<1x64x128xf16, #gpu.address_space<workgroup>>) -> tensor<1x64x128xf16, #gpu.address_space<workgroup>>
gpu.barrier
%c0_3 = arith.constant 0 : index
%cst_4 = arith.constant 0.000000e+00 : f16
%21 = vector.transfer_read %17[%c0_3, %c0_3, %c0_3], %cst_4 {in_bounds = [true, true, true]} : tensor<1x64x64xf16, #gpu.address_space<workgroup>>, vector<1x64x64xf16>
%c0_5 = arith.constant 0 : index
%cst_6 = arith.constant 0.000000e+00 : f16
%22 = vector.transfer_read %20[%c0_5, %c0_5, %c0_5], %cst_6 {in_bounds = [true, true, true]} : tensor<1x64x128xf16, #gpu.address_space<workgroup>>, vector<1x64x128xf16>
%23 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %21, %22, %arg1 : vector<1x64x64xf16>, vector<1x64x128xf16> into vector<1x64x128xf16>
scf.yield %23 : vector<1x64x128xf16>
}
%9 = tensor.empty(%5) : tensor<1x?x128xf16>
%10 = vector.transfer_write %8, %9[%c0, %c0, %c0] {in_bounds = [true, false, true]} : vector<1x64x128xf16>, tensor<1x?x128xf16>
flow.dispatch.tensor.store %10, %2, offsets = [%workgroup_id_z, %3, %4], sizes = [1, %5, 128], strides = [1, 1, 1] : tensor<1x?x128xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
return
}
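// For context: GPUVectorAlloc stages both contraction operands through
// workgroup (shared) memory: each operand is written into a
// bufferization.alloc_tensor placed in #gpu.address_space<workgroup> and read
// back, with gpu.barrier ops bracketing the copy so every thread observes the
// staged data before the contraction. A minimal sketch of staging one operand
// (function name hypothetical):
func.func @stage_operand(%v: vector<1x64x64xf16>) -> vector<1x64x64xf16> {
  %c0 = arith.constant 0 : index
  %cst = arith.constant 0.000000e+00 : f16
  // Tensor-level placeholder for a shared-memory buffer.
  %buf = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<1x64x64xf16, #gpu.address_space<workgroup>>
  %w = vector.transfer_write %v, %buf[%c0, %c0, %c0] {in_bounds = [true, true, true]} : vector<1x64x64xf16>, tensor<1x64x64xf16, #gpu.address_space<workgroup>>
  // Read back from workgroup memory for the contraction.
  %r = vector.transfer_read %w[%c0, %c0, %c0], %cst {in_bounds = [true, true, true]} : tensor<1x64x64xf16, #gpu.address_space<workgroup>>, vector<1x64x64xf16>
  return %r : vector<1x64x64xf16>
}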
// -----// IR Dump After EliminateEmptyTensors (iree-eliminate-empty-tensors) //----- //
module {
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16() {
%cst = arith.constant dense<0.000000e+00> : vector<1x64x128xf16>
%c20 = arith.constant 20 : index
%c1 = arith.constant 1 : index
%cst_0 = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
%workgroup_id_z = hal.interface.workgroup.id[2] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y]
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%4 = affine.apply affine_map<()[s0] -> (s0 * 128)>()[%workgroup_id_x]
%5 = affine.min affine_map<()[s0] -> (s0 * -64 + 968, 64)>()[%workgroup_id_y]
%6 = flow.dispatch.tensor.load %0, offsets = [%workgroup_id_z, %3, 0], sizes = [1, %5, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>> -> tensor<1x?x1280xf16>
%7 = flow.dispatch.tensor.load %1, offsets = [%workgroup_id_z, 0, %4], sizes = [1, 1280, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>> -> tensor<1x1280x128xf16>
%8 = scf.for %arg0 = %c0 to %c20 step %c1 iter_args(%arg1 = %cst) -> (vector<1x64x128xf16>) {
%11 = affine.delinearize_index %arg0 into (%c20) : index
%12 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%11]
%extracted_slice = tensor.extract_slice %6[0, 0, %12] [1, %5, 64] [1, 1, 1] : tensor<1x?x1280xf16> to tensor<1x?x64xf16>
%13 = vector.transfer_read %extracted_slice[%c0, %c0, %c0], %cst_0 {in_bounds = [true, false, true]} : tensor<1x?x64xf16>, vector<1x64x64xf16>
%14 = vector.transfer_read %7[%c0, %12, %c0], %cst_0 {in_bounds = [true, true, true]} : tensor<1x1280x128xf16>, vector<1x64x128xf16>
gpu.barrier
%15 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<1x64x64xf16, #gpu.address_space<workgroup>>
%16 = vector.transfer_write %13, %15[%c0, %c0, %c0] {in_bounds = [true, true, true]} : vector<1x64x64xf16>, tensor<1x64x64xf16, #gpu.address_space<workgroup>>
%17 = bufferization.materialize_in_destination %16 in %16 : (tensor<1x64x64xf16, #gpu.address_space<workgroup>>, tensor<1x64x64xf16, #gpu.address_space<workgroup>>) -> tensor<1x64x64xf16, #gpu.address_space<workgroup>>
%18 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<1x64x128xf16, #gpu.address_space<workgroup>>
%19 = vector.transfer_write %14, %18[%c0, %c0, %c0] {in_bounds = [true, true, true]} : vector<1x64x128xf16>, tensor<1x64x128xf16, #gpu.address_space<workgroup>>
%20 = bufferization.materialize_in_destination %19 in %19 : (tensor<1x64x128xf16, #gpu.address_space<workgroup>>, tensor<1x64x128xf16, #gpu.address_space<workgroup>>) -> tensor<1x64x128xf16, #gpu.address_space<workgroup>>
gpu.barrier
%21 = vector.transfer_read %17[%c0, %c0, %c0], %cst_0 {in_bounds = [true, true, true]} : tensor<1x64x64xf16, #gpu.address_space<workgroup>>, vector<1x64x64xf16>
%22 = vector.transfer_read %20[%c0, %c0, %c0], %cst_0 {in_bounds = [true, true, true]} : tensor<1x64x128xf16, #gpu.address_space<workgroup>>, vector<1x64x128xf16>
%23 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %21, %22, %arg1 : vector<1x64x64xf16>, vector<1x64x128xf16> into vector<1x64x128xf16>
scf.yield %23 : vector<1x64x128xf16>
}
%9 = flow.dispatch.tensor.load %2, offsets = [%workgroup_id_z, %3, %4], sizes = [1, %5, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>> -> tensor<1x?x128xf16>
%10 = vector.transfer_write %8, %9[%c0, %c0, %c0] {in_bounds = [true, false, true]} : vector<1x64x128xf16>, tensor<1x?x128xf16>
flow.dispatch.tensor.store %10, %2, offsets = [%workgroup_id_z, %3, %4], sizes = [1, %5, 128], strides = [1, 1, 1] : tensor<1x?x128xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
return
}
}
// -----// IR Dump After EmptyTensorToAllocTensor (empty-tensor-to-alloc-tensor) //----- //
module {
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16() {
%cst = arith.constant dense<0.000000e+00> : vector<1x64x128xf16>
%c20 = arith.constant 20 : index
%c1 = arith.constant 1 : index
%cst_0 = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
%workgroup_id_z = hal.interface.workgroup.id[2] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y]
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%4 = affine.apply affine_map<()[s0] -> (s0 * 128)>()[%workgroup_id_x]
%5 = affine.min affine_map<()[s0] -> (s0 * -64 + 968, 64)>()[%workgroup_id_y]
%6 = flow.dispatch.tensor.load %0, offsets = [%workgroup_id_z, %3, 0], sizes = [1, %5, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>> -> tensor<1x?x1280xf16>
%7 = flow.dispatch.tensor.load %1, offsets = [%workgroup_id_z, 0, %4], sizes = [1, 1280, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>> -> tensor<1x1280x128xf16>
%8 = scf.for %arg0 = %c0 to %c20 step %c1 iter_args(%arg1 = %cst) -> (vector<1x64x128xf16>) {
%11 = affine.delinearize_index %arg0 into (%c20) : index
%12 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%11]
%extracted_slice = tensor.extract_slice %6[0, 0, %12] [1, %5, 64] [1, 1, 1] : tensor<1x?x1280xf16> to tensor<1x?x64xf16>
%13 = vector.transfer_read %extracted_slice[%c0, %c0, %c0], %cst_0 {in_bounds = [true, false, true]} : tensor<1x?x64xf16>, vector<1x64x64xf16>
%14 = vector.transfer_read %7[%c0, %12, %c0], %cst_0 {in_bounds = [true, true, true]} : tensor<1x1280x128xf16>, vector<1x64x128xf16>
gpu.barrier
%15 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<1x64x64xf16, #gpu.address_space<workgroup>>
%16 = vector.transfer_write %13, %15[%c0, %c0, %c0] {in_bounds = [true, true, true]} : vector<1x64x64xf16>, tensor<1x64x64xf16, #gpu.address_space<workgroup>>
%17 = bufferization.materialize_in_destination %16 in %16 : (tensor<1x64x64xf16, #gpu.address_space<workgroup>>, tensor<1x64x64xf16, #gpu.address_space<workgroup>>) -> tensor<1x64x64xf16, #gpu.address_space<workgroup>>
%18 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<1x64x128xf16, #gpu.address_space<workgroup>>
%19 = vector.transfer_write %14, %18[%c0, %c0, %c0] {in_bounds = [true, true, true]} : vector<1x64x128xf16>, tensor<1x64x128xf16, #gpu.address_space<workgroup>>
%20 = bufferization.materialize_in_destination %19 in %19 : (tensor<1x64x128xf16, #gpu.address_space<workgroup>>, tensor<1x64x128xf16, #gpu.address_space<workgroup>>) -> tensor<1x64x128xf16, #gpu.address_space<workgroup>>
gpu.barrier
%21 = vector.transfer_read %17[%c0, %c0, %c0], %cst_0 {in_bounds = [true, true, true]} : tensor<1x64x64xf16, #gpu.address_space<workgroup>>, vector<1x64x64xf16>
%22 = vector.transfer_read %20[%c0, %c0, %c0], %cst_0 {in_bounds = [true, true, true]} : tensor<1x64x128xf16, #gpu.address_space<workgroup>>, vector<1x64x128xf16>
%23 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %21, %22, %arg1 : vector<1x64x64xf16>, vector<1x64x128xf16> into vector<1x64x128xf16>
scf.yield %23 : vector<1x64x128xf16>
}
%9 = flow.dispatch.tensor.load %2, offsets = [%workgroup_id_z, %3, %4], sizes = [1, %5, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>> -> tensor<1x?x128xf16>
%10 = vector.transfer_write %8, %9[%c0, %c0, %c0] {in_bounds = [true, false, true]} : vector<1x64x128xf16>, tensor<1x?x128xf16>
flow.dispatch.tensor.store %10, %2, offsets = [%workgroup_id_z, %3, %4], sizes = [1, %5, 128], strides = [1, 1, 1] : tensor<1x?x128xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
return
}
}
// -----// IR Dump After IREEComprehensiveBufferize (iree-codegen-iree-comprehensive-bufferize) //----- //
module {
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16() {
%cst = arith.constant dense<0.000000e+00> : vector<1x64x128xf16>
%c20 = arith.constant 20 : index
%c1 = arith.constant 1 : index
%cst_0 = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<64x968x1280xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 64 : memref<64x968x1280xf16, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<64x1280x1280xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 64 : memref<64x1280x1280xf16, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<64x968x1280xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 64 : memref<64x968x1280xf16, #hal.descriptor_type<storage_buffer>>
%workgroup_id_z = hal.interface.workgroup.id[2] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y]
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%4 = affine.apply affine_map<()[s0] -> (s0 * 128)>()[%workgroup_id_x]
%5 = affine.min affine_map<()[s0] -> (s0 * -64 + 968, 64)>()[%workgroup_id_y]
%subview = memref.subview %0[%workgroup_id_z, %3, 0] [1, %5, 1280] [1, 1, 1] : memref<64x968x1280xf16, #hal.descriptor_type<storage_buffer>> to memref<1x?x1280xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_1 = memref.subview %1[%workgroup_id_z, 0, %4] [1, 1280, 128] [1, 1, 1] : memref<64x1280x1280xf16, #hal.descriptor_type<storage_buffer>> to memref<1x1280x128xf16, strided<[1638400, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%6 = scf.for %arg0 = %c0 to %c20 step %c1 iter_args(%arg1 = %cst) -> (vector<1x64x128xf16>) {
%7 = affine.delinearize_index %arg0 into (%c20) : index
%8 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%7]
%subview_4 = memref.subview %subview[0, 0, %8] [1, %5, 64] [1, 1, 1] : memref<1x?x1280xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x?x64xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%9 = vector.transfer_read %subview_4[%c0, %c0, %c0], %cst_0 {in_bounds = [true, false, true]} : memref<1x?x64xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<1x64x64xf16>
%10 = vector.transfer_read %subview_1[%c0, %8, %c0], %cst_0 {in_bounds = [true, true, true]} : memref<1x1280x128xf16, strided<[1638400, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<1x64x128xf16>
gpu.barrier
%alloc = memref.alloc() : memref<1x64x64xf16, #gpu.address_space<workgroup>>
vector.transfer_write %9, %alloc[%c0, %c0, %c0] {in_bounds = [true, true, true]} : vector<1x64x64xf16>, memref<1x64x64xf16, #gpu.address_space<workgroup>>
gpu.barrier
memref.copy %alloc, %alloc {__internal_linalg_transform__ = "copy_to_workgroup_memory"} : memref<1x64x64xf16, #gpu.address_space<workgroup>> to memref<1x64x64xf16, #gpu.address_space<workgroup>>
gpu.barrier
%alloc_5 = memref.alloc() : memref<1x64x128xf16, #gpu.address_space<workgroup>>
vector.transfer_write %10, %alloc_5[%c0, %c0, %c0] {in_bounds = [true, true, true]} : vector<1x64x128xf16>, memref<1x64x128xf16, #gpu.address_space<workgroup>>
gpu.barrier
memref.copy %alloc_5, %alloc_5 {__internal_linalg_transform__ = "copy_to_workgroup_memory"} : memref<1x64x128xf16, #gpu.address_space<workgroup>> to memref<1x64x128xf16, #gpu.address_space<workgroup>>
gpu.barrier
gpu.barrier
%11 = vector.transfer_read %alloc[%c0, %c0, %c0], %cst_0 {in_bounds = [true, true, true]} : memref<1x64x64xf16, #gpu.address_space<workgroup>>, vector<1x64x64xf16>
%12 = vector.transfer_read %alloc_5[%c0, %c0, %c0], %cst_0 {in_bounds = [true, true, true]} : memref<1x64x128xf16, #gpu.address_space<workgroup>>, vector<1x64x128xf16>
%13 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %11, %12, %arg1 : vector<1x64x64xf16>, vector<1x64x128xf16> into vector<1x64x128xf16>
scf.yield %13 : vector<1x64x128xf16>
}
%subview_2 = memref.subview %2[%workgroup_id_z, %3, %4] [1, %5, 128] [1, 1, 1] : memref<64x968x1280xf16, #hal.descriptor_type<storage_buffer>> to memref<1x?x128xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
vector.transfer_write %6, %subview_2[%c0, %c0, %c0] {in_bounds = [true, false, true]} : vector<1x64x128xf16>, memref<1x?x128xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_3 = memref.subview %2[%workgroup_id_z, %3, %4] [1, %5, 128] [1, 1, 1] : memref<64x968x1280xf16, #hal.descriptor_type<storage_buffer>> to memref<1x?x128xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
memref.copy %subview_2, %subview_3 : memref<1x?x128xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x?x128xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
return
}
}
// -----// IR Dump After ResolveShapedTypeResultDims (resolve-shaped-type-result-dims) //----- //
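// NOTE: this pass resolves tensor.dim/memref.dim ops over op results; none
// appear in this function, so the dump below matches the IR above.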
module {
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16() {
%cst = arith.constant dense<0.000000e+00> : vector<1x64x128xf16>
%c20 = arith.constant 20 : index
%c1 = arith.constant 1 : index
%cst_0 = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<64x968x1280xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 64 : memref<64x968x1280xf16, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<64x1280x1280xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 64 : memref<64x1280x1280xf16, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<64x968x1280xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 64 : memref<64x968x1280xf16, #hal.descriptor_type<storage_buffer>>
%workgroup_id_z = hal.interface.workgroup.id[2] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y]
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%4 = affine.apply affine_map<()[s0] -> (s0 * 128)>()[%workgroup_id_x]
%5 = affine.min affine_map<()[s0] -> (s0 * -64 + 968, 64)>()[%workgroup_id_y]
%subview = memref.subview %0[%workgroup_id_z, %3, 0] [1, %5, 1280] [1, 1, 1] : memref<64x968x1280xf16, #hal.descriptor_type<storage_buffer>> to memref<1x?x1280xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_1 = memref.subview %1[%workgroup_id_z, 0, %4] [1, 1280, 128] [1, 1, 1] : memref<64x1280x1280xf16, #hal.descriptor_type<storage_buffer>> to memref<1x1280x128xf16, strided<[1638400, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%6 = scf.for %arg0 = %c0 to %c20 step %c1 iter_args(%arg1 = %cst) -> (vector<1x64x128xf16>) {
%7 = affine.delinearize_index %arg0 into (%c20) : index
%8 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%7]
%subview_4 = memref.subview %subview[0, 0, %8] [1, %5, 64] [1, 1, 1] : memref<1x?x1280xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x?x64xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%9 = vector.transfer_read %subview_4[%c0, %c0, %c0], %cst_0 {in_bounds = [true, false, true]} : memref<1x?x64xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<1x64x64xf16>
%10 = vector.transfer_read %subview_1[%c0, %8, %c0], %cst_0 {in_bounds = [true, true, true]} : memref<1x1280x128xf16, strided<[1638400, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<1x64x128xf16>
gpu.barrier
%alloc = memref.alloc() : memref<1x64x64xf16, #gpu.address_space<workgroup>>
vector.transfer_write %9, %alloc[%c0, %c0, %c0] {in_bounds = [true, true, true]} : vector<1x64x64xf16>, memref<1x64x64xf16, #gpu.address_space<workgroup>>
gpu.barrier
memref.copy %alloc, %alloc {__internal_linalg_transform__ = "copy_to_workgroup_memory"} : memref<1x64x64xf16, #gpu.address_space<workgroup>> to memref<1x64x64xf16, #gpu.address_space<workgroup>>
gpu.barrier
%alloc_5 = memref.alloc() : memref<1x64x128xf16, #gpu.address_space<workgroup>>
vector.transfer_write %10, %alloc_5[%c0, %c0, %c0] {in_bounds = [true, true, true]} : vector<1x64x128xf16>, memref<1x64x128xf16, #gpu.address_space<workgroup>>
gpu.barrier
memref.copy %alloc_5, %alloc_5 {__internal_linalg_transform__ = "copy_to_workgroup_memory"} : memref<1x64x128xf16, #gpu.address_space<workgroup>> to memref<1x64x128xf16, #gpu.address_space<workgroup>>
gpu.barrier
gpu.barrier
%11 = vector.transfer_read %alloc[%c0, %c0, %c0], %cst_0 {in_bounds = [true, true, true]} : memref<1x64x64xf16, #gpu.address_space<workgroup>>, vector<1x64x64xf16>
%12 = vector.transfer_read %alloc_5[%c0, %c0, %c0], %cst_0 {in_bounds = [true, true, true]} : memref<1x64x128xf16, #gpu.address_space<workgroup>>, vector<1x64x128xf16>
%13 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %11, %12, %arg1 : vector<1x64x64xf16>, vector<1x64x128xf16> into vector<1x64x128xf16>
scf.yield %13 : vector<1x64x128xf16>
}
%subview_2 = memref.subview %2[%workgroup_id_z, %3, %4] [1, %5, 128] [1, 1, 1] : memref<64x968x1280xf16, #hal.descriptor_type<storage_buffer>> to memref<1x?x128xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
vector.transfer_write %6, %subview_2[%c0, %c0, %c0] {in_bounds = [true, false, true]} : vector<1x64x128xf16>, memref<1x?x128xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_3 = memref.subview %2[%workgroup_id_z, %3, %4] [1, %5, 128] [1, 1, 1] : memref<64x968x1280xf16, #hal.descriptor_type<storage_buffer>> to memref<1x?x128xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
memref.copy %subview_2, %subview_3 : memref<1x?x128xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x?x128xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
return
}
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
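// NOTE: canonicalization folds the no-op self-copies on the workgroup
// allocations (memref.copy %alloc, %alloc tagged copy_to_workgroup_memory)
// and merges the back-to-back gpu.barrier ops from the previous dump.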
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16() {
%cst = arith.constant dense<0.000000e+00> : vector<1x64x128xf16>
%c20 = arith.constant 20 : index
%c1 = arith.constant 1 : index
%cst_0 = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<64x968x1280xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 64 : memref<64x968x1280xf16, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<64x1280x1280xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 64 : memref<64x1280x1280xf16, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<64x968x1280xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 64 : memref<64x968x1280xf16, #hal.descriptor_type<storage_buffer>>
%workgroup_id_z = hal.interface.workgroup.id[2] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y]
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%4 = affine.apply affine_map<()[s0] -> (s0 * 128)>()[%workgroup_id_x]
%5 = affine.min affine_map<()[s0] -> (s0 * -64 + 968, 64)>()[%workgroup_id_y]
%subview = memref.subview %0[%workgroup_id_z, %3, 0] [1, %5, 1280] [1, 1, 1] : memref<64x968x1280xf16, #hal.descriptor_type<storage_buffer>> to memref<1x?x1280xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_1 = memref.subview %1[%workgroup_id_z, 0, %4] [1, 1280, 128] [1, 1, 1] : memref<64x1280x1280xf16, #hal.descriptor_type<storage_buffer>> to memref<1x1280x128xf16, strided<[1638400, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%6 = scf.for %arg0 = %c0 to %c20 step %c1 iter_args(%arg1 = %cst) -> (vector<1x64x128xf16>) {
%7 = affine.delinearize_index %arg0 into (%c20) : index
%8 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%7]
%subview_4 = memref.subview %subview[0, 0, %8] [1, %5, 64] [1, 1, 1] : memref<1x?x1280xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x?x64xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%9 = vector.transfer_read %subview_4[%c0, %c0, %c0], %cst_0 {in_bounds = [true, false, true]} : memref<1x?x64xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<1x64x64xf16>
%10 = vector.transfer_read %subview_1[%c0, %8, %c0], %cst_0 {in_bounds = [true, true, true]} : memref<1x1280x128xf16, strided<[1638400, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<1x64x128xf16>
gpu.barrier
%alloc = memref.alloc() : memref<1x64x64xf16, #gpu.address_space<workgroup>>
vector.transfer_write %9, %alloc[%c0, %c0, %c0] {in_bounds = [true, true, true]} : vector<1x64x64xf16>, memref<1x64x64xf16, #gpu.address_space<workgroup>>
gpu.barrier
%alloc_5 = memref.alloc() : memref<1x64x128xf16, #gpu.address_space<workgroup>>
vector.transfer_write %10, %alloc_5[%c0, %c0, %c0] {in_bounds = [true, true, true]} : vector<1x64x128xf16>, memref<1x64x128xf16, #gpu.address_space<workgroup>>
gpu.barrier
%11 = vector.transfer_read %alloc[%c0, %c0, %c0], %cst_0 {in_bounds = [true, true, true]} : memref<1x64x64xf16, #gpu.address_space<workgroup>>, vector<1x64x64xf16>
%12 = vector.transfer_read %alloc_5[%c0, %c0, %c0], %cst_0 {in_bounds = [true, true, true]} : memref<1x64x128xf16, #gpu.address_space<workgroup>>, vector<1x64x128xf16>
%13 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %11, %12, %arg1 : vector<1x64x64xf16>, vector<1x64x128xf16> into vector<1x64x128xf16>
scf.yield %13 : vector<1x64x128xf16>
}
%subview_2 = memref.subview %2[%workgroup_id_z, %3, %4] [1, %5, 128] [1, 1, 1] : memref<64x968x1280xf16, #hal.descriptor_type<storage_buffer>> to memref<1x?x128xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
vector.transfer_write %6, %subview_2[%c0, %c0, %c0] {in_bounds = [true, false, true]} : vector<1x64x128xf16>, memref<1x?x128xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_3 = memref.subview %2[%workgroup_id_z, %3, %4] [1, %5, 128] [1, 1, 1] : memref<64x968x1280xf16, #hal.descriptor_type<storage_buffer>> to memref<1x?x128xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
memref.copy %subview_2, %subview_3 : memref<1x?x128xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x?x128xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
return
}
// -----// IR Dump After CSE (cse) //----- //
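// NOTE: CSE merges the two identical memref.subview ops on the output buffer
// %2, turning the trailing memref.copy into a self-copy that the next
// canonicalize run can fold away.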
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16() {
%cst = arith.constant dense<0.000000e+00> : vector<1x64x128xf16>
%c20 = arith.constant 20 : index
%c1 = arith.constant 1 : index
%cst_0 = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<64x968x1280xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 64 : memref<64x968x1280xf16, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<64x1280x1280xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 64 : memref<64x1280x1280xf16, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<64x968x1280xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 64 : memref<64x968x1280xf16, #hal.descriptor_type<storage_buffer>>
%workgroup_id_z = hal.interface.workgroup.id[2] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y]
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%4 = affine.apply affine_map<()[s0] -> (s0 * 128)>()[%workgroup_id_x]
%5 = affine.min affine_map<()[s0] -> (s0 * -64 + 968, 64)>()[%workgroup_id_y]
%subview = memref.subview %0[%workgroup_id_z, %3, 0] [1, %5, 1280] [1, 1, 1] : memref<64x968x1280xf16, #hal.descriptor_type<storage_buffer>> to memref<1x?x1280xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_1 = memref.subview %1[%workgroup_id_z, 0, %4] [1, 1280, 128] [1, 1, 1] : memref<64x1280x1280xf16, #hal.descriptor_type<storage_buffer>> to memref<1x1280x128xf16, strided<[1638400, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%6 = scf.for %arg0 = %c0 to %c20 step %c1 iter_args(%arg1 = %cst) -> (vector<1x64x128xf16>) {
%7 = affine.delinearize_index %arg0 into (%c20) : index
%8 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%7]
%subview_3 = memref.subview %subview[0, 0, %8] [1, %5, 64] [1, 1, 1] : memref<1x?x1280xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x?x64xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%9 = vector.transfer_read %subview_3[%c0, %c0, %c0], %cst_0 {in_bounds = [true, false, true]} : memref<1x?x64xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<1x64x64xf16>
%10 = vector.transfer_read %subview_1[%c0, %8, %c0], %cst_0 {in_bounds = [true, true, true]} : memref<1x1280x128xf16, strided<[1638400, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<1x64x128xf16>
gpu.barrier
%alloc = memref.alloc() : memref<1x64x64xf16, #gpu.address_space<workgroup>>
vector.transfer_write %9, %alloc[%c0, %c0, %c0] {in_bounds = [true, true, true]} : vector<1x64x64xf16>, memref<1x64x64xf16, #gpu.address_space<workgroup>>
gpu.barrier
%alloc_4 = memref.alloc() : memref<1x64x128xf16, #gpu.address_space<workgroup>>
vector.transfer_write %10, %alloc_4[%c0, %c0, %c0] {in_bounds = [true, true, true]} : vector<1x64x128xf16>, memref<1x64x128xf16, #gpu.address_space<workgroup>>
gpu.barrier
%11 = vector.transfer_read %alloc[%c0, %c0, %c0], %cst_0 {in_bounds = [true, true, true]} : memref<1x64x64xf16, #gpu.address_space<workgroup>>, vector<1x64x64xf16>
%12 = vector.transfer_read %alloc_4[%c0, %c0, %c0], %cst_0 {in_bounds = [true, true, true]} : memref<1x64x128xf16, #gpu.address_space<workgroup>>, vector<1x64x128xf16>
%13 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %11, %12, %arg1 : vector<1x64x64xf16>, vector<1x64x128xf16> into vector<1x64x128xf16>
scf.yield %13 : vector<1x64x128xf16>
}
%subview_2 = memref.subview %2[%workgroup_id_z, %3, %4] [1, %5, 128] [1, 1, 1] : memref<64x968x1280xf16, #hal.descriptor_type<storage_buffer>> to memref<1x?x128xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
vector.transfer_write %6, %subview_2[%c0, %c0, %c0] {in_bounds = [true, false, true]} : vector<1x64x128xf16>, memref<1x?x128xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
memref.copy %subview_2, %subview_2 : memref<1x?x128xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x?x128xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
return
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
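// NOTE: this canonicalize run drops the self-copy left behind by CSE; only
// the final vector.transfer_write to the output subview remains.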
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16() {
%cst = arith.constant dense<0.000000e+00> : vector<1x64x128xf16>
%c20 = arith.constant 20 : index
%c1 = arith.constant 1 : index
%cst_0 = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<64x968x1280xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 64 : memref<64x968x1280xf16, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<64x1280x1280xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 64 : memref<64x1280x1280xf16, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<64x968x1280xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 64 : memref<64x968x1280xf16, #hal.descriptor_type<storage_buffer>>
%workgroup_id_z = hal.interface.workgroup.id[2] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y]
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%4 = affine.apply affine_map<()[s0] -> (s0 * 128)>()[%workgroup_id_x]
%5 = affine.min affine_map<()[s0] -> (s0 * -64 + 968, 64)>()[%workgroup_id_y]
%subview = memref.subview %0[%workgroup_id_z, %3, 0] [1, %5, 1280] [1, 1, 1] : memref<64x968x1280xf16, #hal.descriptor_type<storage_buffer>> to memref<1x?x1280xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_1 = memref.subview %1[%workgroup_id_z, 0, %4] [1, 1280, 128] [1, 1, 1] : memref<64x1280x1280xf16, #hal.descriptor_type<storage_buffer>> to memref<1x1280x128xf16, strided<[1638400, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%6 = scf.for %arg0 = %c0 to %c20 step %c1 iter_args(%arg1 = %cst) -> (vector<1x64x128xf16>) {
%7 = affine.delinearize_index %arg0 into (%c20) : index
%8 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%7]
%subview_3 = memref.subview %subview[0, 0, %8] [1, %5, 64] [1, 1, 1] : memref<1x?x1280xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x?x64xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%9 = vector.transfer_read %subview_3[%c0, %c0, %c0], %cst_0 {in_bounds = [true, false, true]} : memref<1x?x64xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<1x64x64xf16>
%10 = vector.transfer_read %subview_1[%c0, %8, %c0], %cst_0 {in_bounds = [true, true, true]} : memref<1x1280x128xf16, strided<[1638400, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<1x64x128xf16>
gpu.barrier
%alloc = memref.alloc() : memref<1x64x64xf16, #gpu.address_space<workgroup>>
vector.transfer_write %9, %alloc[%c0, %c0, %c0] {in_bounds = [true, true, true]} : vector<1x64x64xf16>, memref<1x64x64xf16, #gpu.address_space<workgroup>>
gpu.barrier
%alloc_4 = memref.alloc() : memref<1x64x128xf16, #gpu.address_space<workgroup>>
vector.transfer_write %10, %alloc_4[%c0, %c0, %c0] {in_bounds = [true, true, true]} : vector<1x64x128xf16>, memref<1x64x128xf16, #gpu.address_space<workgroup>>
gpu.barrier
%11 = vector.transfer_read %alloc[%c0, %c0, %c0], %cst_0 {in_bounds = [true, true, true]} : memref<1x64x64xf16, #gpu.address_space<workgroup>>, vector<1x64x64xf16>
%12 = vector.transfer_read %alloc_4[%c0, %c0, %c0], %cst_0 {in_bounds = [true, true, true]} : memref<1x64x128xf16, #gpu.address_space<workgroup>>, vector<1x64x128xf16>
%13 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %11, %12, %arg1 : vector<1x64x64xf16>, vector<1x64x128xf16> into vector<1x64x128xf16>
scf.yield %13 : vector<1x64x128xf16>
}
%subview_2 = memref.subview %2[%workgroup_id_z, %3, %4] [1, %5, 128] [1, 1, 1] : memref<64x968x1280xf16, #hal.descriptor_type<storage_buffer>> to memref<1x?x128xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
vector.transfer_write %6, %subview_2[%c0, %c0, %c0] {in_bounds = [true, false, true]} : vector<1x64x128xf16>, memref<1x?x128xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
return
}
// -----// IR Dump After CleanupBufferAllocView (iree-codegen-cleanup-buffer-alloc-view) //----- //
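// NOTE: no redundant buffer allocation/view pairs are left to clean up, so
// the dump below is unchanged.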
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16() {
%cst = arith.constant dense<0.000000e+00> : vector<1x64x128xf16>
%c20 = arith.constant 20 : index
%c1 = arith.constant 1 : index
%cst_0 = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<64x968x1280xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 64 : memref<64x968x1280xf16, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<64x1280x1280xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 64 : memref<64x1280x1280xf16, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<64x968x1280xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 64 : memref<64x968x1280xf16, #hal.descriptor_type<storage_buffer>>
%workgroup_id_z = hal.interface.workgroup.id[2] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y]
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%4 = affine.apply affine_map<()[s0] -> (s0 * 128)>()[%workgroup_id_x]
%5 = affine.min affine_map<()[s0] -> (s0 * -64 + 968, 64)>()[%workgroup_id_y]
%subview = memref.subview %0[%workgroup_id_z, %3, 0] [1, %5, 1280] [1, 1, 1] : memref<64x968x1280xf16, #hal.descriptor_type<storage_buffer>> to memref<1x?x1280xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_1 = memref.subview %1[%workgroup_id_z, 0, %4] [1, 1280, 128] [1, 1, 1] : memref<64x1280x1280xf16, #hal.descriptor_type<storage_buffer>> to memref<1x1280x128xf16, strided<[1638400, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%6 = scf.for %arg0 = %c0 to %c20 step %c1 iter_args(%arg1 = %cst) -> (vector<1x64x128xf16>) {
%7 = affine.delinearize_index %arg0 into (%c20) : index
%8 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%7]
%subview_3 = memref.subview %subview[0, 0, %8] [1, %5, 64] [1, 1, 1] : memref<1x?x1280xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x?x64xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%9 = vector.transfer_read %subview_3[%c0, %c0, %c0], %cst_0 {in_bounds = [true, false, true]} : memref<1x?x64xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<1x64x64xf16>
%10 = vector.transfer_read %subview_1[%c0, %8, %c0], %cst_0 {in_bounds = [true, true, true]} : memref<1x1280x128xf16, strided<[1638400, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<1x64x128xf16>
gpu.barrier
%alloc = memref.alloc() : memref<1x64x64xf16, #gpu.address_space<workgroup>>
vector.transfer_write %9, %alloc[%c0, %c0, %c0] {in_bounds = [true, true, true]} : vector<1x64x64xf16>, memref<1x64x64xf16, #gpu.address_space<workgroup>>
gpu.barrier
%alloc_4 = memref.alloc() : memref<1x64x128xf16, #gpu.address_space<workgroup>>
vector.transfer_write %10, %alloc_4[%c0, %c0, %c0] {in_bounds = [true, true, true]} : vector<1x64x128xf16>, memref<1x64x128xf16, #gpu.address_space<workgroup>>
gpu.barrier
%11 = vector.transfer_read %alloc[%c0, %c0, %c0], %cst_0 {in_bounds = [true, true, true]} : memref<1x64x64xf16, #gpu.address_space<workgroup>>, vector<1x64x64xf16>
%12 = vector.transfer_read %alloc_4[%c0, %c0, %c0], %cst_0 {in_bounds = [true, true, true]} : memref<1x64x128xf16, #gpu.address_space<workgroup>>, vector<1x64x128xf16>
%13 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %11, %12, %arg1 : vector<1x64x64xf16>, vector<1x64x128xf16> into vector<1x64x128xf16>
scf.yield %13 : vector<1x64x128xf16>
}
%subview_2 = memref.subview %2[%workgroup_id_z, %3, %4] [1, %5, 128] [1, 1, 1] : memref<64x968x1280xf16, #hal.descriptor_type<storage_buffer>> to memref<1x?x128xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
vector.transfer_write %6, %subview_2[%c0, %c0, %c0] {in_bounds = [true, false, true]} : vector<1x64x128xf16>, memref<1x?x128xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
return
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
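// NOTE: the IR has reached a fixed point for these cleanup passes; this
// canonicalize run and the canonicalize/cse runs that follow make no further
// changes.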
module {
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16() {
%cst = arith.constant dense<0.000000e+00> : vector<1x64x128xf16>
%c20 = arith.constant 20 : index
%c1 = arith.constant 1 : index
%cst_0 = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<64x968x1280xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 64 : memref<64x968x1280xf16, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<64x1280x1280xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 64 : memref<64x1280x1280xf16, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<64x968x1280xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 64 : memref<64x968x1280xf16, #hal.descriptor_type<storage_buffer>>
%workgroup_id_z = hal.interface.workgroup.id[2] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y]
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%4 = affine.apply affine_map<()[s0] -> (s0 * 128)>()[%workgroup_id_x]
%5 = affine.min affine_map<()[s0] -> (s0 * -64 + 968, 64)>()[%workgroup_id_y]
%subview = memref.subview %0[%workgroup_id_z, %3, 0] [1, %5, 1280] [1, 1, 1] : memref<64x968x1280xf16, #hal.descriptor_type<storage_buffer>> to memref<1x?x1280xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_1 = memref.subview %1[%workgroup_id_z, 0, %4] [1, 1280, 128] [1, 1, 1] : memref<64x1280x1280xf16, #hal.descriptor_type<storage_buffer>> to memref<1x1280x128xf16, strided<[1638400, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%6 = scf.for %arg0 = %c0 to %c20 step %c1 iter_args(%arg1 = %cst) -> (vector<1x64x128xf16>) {
%7 = affine.delinearize_index %arg0 into (%c20) : index
%8 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%7]
%subview_3 = memref.subview %subview[0, 0, %8] [1, %5, 64] [1, 1, 1] : memref<1x?x1280xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x?x64xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%9 = vector.transfer_read %subview_3[%c0, %c0, %c0], %cst_0 {in_bounds = [true, false, true]} : memref<1x?x64xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<1x64x64xf16>
%10 = vector.transfer_read %subview_1[%c0, %8, %c0], %cst_0 {in_bounds = [true, true, true]} : memref<1x1280x128xf16, strided<[1638400, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<1x64x128xf16>
gpu.barrier
%alloc = memref.alloc() : memref<1x64x64xf16, #gpu.address_space<workgroup>>
vector.transfer_write %9, %alloc[%c0, %c0, %c0] {in_bounds = [true, true, true]} : vector<1x64x64xf16>, memref<1x64x64xf16, #gpu.address_space<workgroup>>
gpu.barrier
%alloc_4 = memref.alloc() : memref<1x64x128xf16, #gpu.address_space<workgroup>>
vector.transfer_write %10, %alloc_4[%c0, %c0, %c0] {in_bounds = [true, true, true]} : vector<1x64x128xf16>, memref<1x64x128xf16, #gpu.address_space<workgroup>>
gpu.barrier
%11 = vector.transfer_read %alloc[%c0, %c0, %c0], %cst_0 {in_bounds = [true, true, true]} : memref<1x64x64xf16, #gpu.address_space<workgroup>>, vector<1x64x64xf16>
%12 = vector.transfer_read %alloc_4[%c0, %c0, %c0], %cst_0 {in_bounds = [true, true, true]} : memref<1x64x128xf16, #gpu.address_space<workgroup>>, vector<1x64x128xf16>
%13 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %11, %12, %arg1 : vector<1x64x64xf16>, vector<1x64x128xf16> into vector<1x64x128xf16>
scf.yield %13 : vector<1x64x128xf16>
}
%subview_2 = memref.subview %2[%workgroup_id_z, %3, %4] [1, %5, 128] [1, 1, 1] : memref<64x968x1280xf16, #hal.descriptor_type<storage_buffer>> to memref<1x?x128xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
vector.transfer_write %6, %subview_2[%c0, %c0, %c0] {in_bounds = [true, false, true]} : vector<1x64x128xf16>, memref<1x?x128xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
return
}
}
// -----// IR Dump After CSE (cse) //----- //
module {
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16() {
%cst = arith.constant dense<0.000000e+00> : vector<1x64x128xf16>
%c20 = arith.constant 20 : index
%c1 = arith.constant 1 : index
%cst_0 = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<64x968x1280xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 64 : memref<64x968x1280xf16, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<64x1280x1280xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 64 : memref<64x1280x1280xf16, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<64x968x1280xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 64 : memref<64x968x1280xf16, #hal.descriptor_type<storage_buffer>>
%workgroup_id_z = hal.interface.workgroup.id[2] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y]
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%4 = affine.apply affine_map<()[s0] -> (s0 * 128)>()[%workgroup_id_x]
%5 = affine.min affine_map<()[s0] -> (s0 * -64 + 968, 64)>()[%workgroup_id_y]
%subview = memref.subview %0[%workgroup_id_z, %3, 0] [1, %5, 1280] [1, 1, 1] : memref<64x968x1280xf16, #hal.descriptor_type<storage_buffer>> to memref<1x?x1280xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_1 = memref.subview %1[%workgroup_id_z, 0, %4] [1, 1280, 128] [1, 1, 1] : memref<64x1280x1280xf16, #hal.descriptor_type<storage_buffer>> to memref<1x1280x128xf16, strided<[1638400, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%6 = scf.for %arg0 = %c0 to %c20 step %c1 iter_args(%arg1 = %cst) -> (vector<1x64x128xf16>) {
%7 = affine.delinearize_index %arg0 into (%c20) : index
%8 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%7]
%subview_3 = memref.subview %subview[0, 0, %8] [1, %5, 64] [1, 1, 1] : memref<1x?x1280xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x?x64xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%9 = vector.transfer_read %subview_3[%c0, %c0, %c0], %cst_0 {in_bounds = [true, false, true]} : memref<1x?x64xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<1x64x64xf16>
%10 = vector.transfer_read %subview_1[%c0, %8, %c0], %cst_0 {in_bounds = [true, true, true]} : memref<1x1280x128xf16, strided<[1638400, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<1x64x128xf16>
gpu.barrier
%alloc = memref.alloc() : memref<1x64x64xf16, #gpu.address_space<workgroup>>
vector.transfer_write %9, %alloc[%c0, %c0, %c0] {in_bounds = [true, true, true]} : vector<1x64x64xf16>, memref<1x64x64xf16, #gpu.address_space<workgroup>>
gpu.barrier
%alloc_4 = memref.alloc() : memref<1x64x128xf16, #gpu.address_space<workgroup>>
vector.transfer_write %10, %alloc_4[%c0, %c0, %c0] {in_bounds = [true, true, true]} : vector<1x64x128xf16>, memref<1x64x128xf16, #gpu.address_space<workgroup>>
gpu.barrier
%11 = vector.transfer_read %alloc[%c0, %c0, %c0], %cst_0 {in_bounds = [true, true, true]} : memref<1x64x64xf16, #gpu.address_space<workgroup>>, vector<1x64x64xf16>
%12 = vector.transfer_read %alloc_4[%c0, %c0, %c0], %cst_0 {in_bounds = [true, true, true]} : memref<1x64x128xf16, #gpu.address_space<workgroup>>, vector<1x64x128xf16>
%13 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %11, %12, %arg1 : vector<1x64x64xf16>, vector<1x64x128xf16> into vector<1x64x128xf16>
scf.yield %13 : vector<1x64x128xf16>
}
%subview_2 = memref.subview %2[%workgroup_id_z, %3, %4] [1, %5, 128] [1, 1, 1] : memref<64x968x1280xf16, #hal.descriptor_type<storage_buffer>> to memref<1x?x128xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
vector.transfer_write %6, %subview_2[%c0, %c0, %c0] {in_bounds = [true, false, true]} : vector<1x64x128xf16>, memref<1x?x128xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
return
}
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
// (IR unchanged; identical to the Canonicalizer dump above.)
// -----// IR Dump After CSE (cse) //----- //
// (IR unchanged; identical to the CSE dump above.)
// -----// IR Dump After HoistStaticallyBoundAllocations (iree-hoist-statically-bound-allocations) //----- //
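// NOTE: the two workgroup allocations have statically known sizes, so they
// are hoisted out of the scf.for loop to the function entry, with matching
// memref.dealloc ops inserted before the return.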
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16() {
%alloc = memref.alloc() : memref<1x64x128xf16, #gpu.address_space<workgroup>>
%alloc_0 = memref.alloc() : memref<1x64x64xf16, #gpu.address_space<workgroup>>
%cst = arith.constant dense<0.000000e+00> : vector<1x64x128xf16>
%c20 = arith.constant 20 : index
%c1 = arith.constant 1 : index
%cst_1 = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<64x968x1280xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 64 : memref<64x968x1280xf16, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<64x1280x1280xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 64 : memref<64x1280x1280xf16, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<64x968x1280xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 64 : memref<64x968x1280xf16, #hal.descriptor_type<storage_buffer>>
%workgroup_id_z = hal.interface.workgroup.id[2] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y]
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%4 = affine.apply affine_map<()[s0] -> (s0 * 128)>()[%workgroup_id_x]
%5 = affine.min affine_map<()[s0] -> (s0 * -64 + 968, 64)>()[%workgroup_id_y]
%subview = memref.subview %0[%workgroup_id_z, %3, 0] [1, %5, 1280] [1, 1, 1] : memref<64x968x1280xf16, #hal.descriptor_type<storage_buffer>> to memref<1x?x1280xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_2 = memref.subview %1[%workgroup_id_z, 0, %4] [1, 1280, 128] [1, 1, 1] : memref<64x1280x1280xf16, #hal.descriptor_type<storage_buffer>> to memref<1x1280x128xf16, strided<[1638400, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%6 = scf.for %arg0 = %c0 to %c20 step %c1 iter_args(%arg1 = %cst) -> (vector<1x64x128xf16>) {
%7 = affine.delinearize_index %arg0 into (%c20) : index
%8 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%7]
%subview_4 = memref.subview %subview[0, 0, %8] [1, %5, 64] [1, 1, 1] : memref<1x?x1280xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x?x64xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%9 = vector.transfer_read %subview_4[%c0, %c0, %c0], %cst_1 {in_bounds = [true, false, true]} : memref<1x?x64xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<1x64x64xf16>
%10 = vector.transfer_read %subview_2[%c0, %8, %c0], %cst_1 {in_bounds = [true, true, true]} : memref<1x1280x128xf16, strided<[1638400, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<1x64x128xf16>
gpu.barrier
vector.transfer_write %9, %alloc_0[%c0, %c0, %c0] {in_bounds = [true, true, true]} : vector<1x64x64xf16>, memref<1x64x64xf16, #gpu.address_space<workgroup>>
gpu.barrier
vector.transfer_write %10, %alloc[%c0, %c0, %c0] {in_bounds = [true, true, true]} : vector<1x64x128xf16>, memref<1x64x128xf16, #gpu.address_space<workgroup>>
gpu.barrier
%11 = vector.transfer_read %alloc_0[%c0, %c0, %c0], %cst_1 {in_bounds = [true, true, true]} : memref<1x64x64xf16, #gpu.address_space<workgroup>>, vector<1x64x64xf16>
%12 = vector.transfer_read %alloc[%c0, %c0, %c0], %cst_1 {in_bounds = [true, true, true]} : memref<1x64x128xf16, #gpu.address_space<workgroup>>, vector<1x64x128xf16>
%13 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %11, %12, %arg1 : vector<1x64x64xf16>, vector<1x64x128xf16> into vector<1x64x128xf16>
scf.yield %13 : vector<1x64x128xf16>
}
%subview_3 = memref.subview %2[%workgroup_id_z, %3, %4] [1, %5, 128] [1, 1, 1] : memref<64x968x1280xf16, #hal.descriptor_type<storage_buffer>> to memref<1x?x128xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
vector.transfer_write %6, %subview_3[%c0, %c0, %c0] {in_bounds = [true, false, true]} : vector<1x64x128xf16>, memref<1x?x128xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
memref.dealloc %alloc_0 : memref<1x64x64xf16, #gpu.address_space<workgroup>>
memref.dealloc %alloc : memref<1x64x128xf16, #gpu.address_space<workgroup>>
return
}
// -----// IR Dump After LLVMGPUNormalizeContractMaps (iree-llvmgpu-normalize-contract-maps) //----- //
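// NOTE: the vector.contract indexing maps already appear to be in the
// normalized batched-matmul form this pass targets, so the dump below is
// unchanged.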
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16() {
%alloc = memref.alloc() : memref<1x64x128xf16, #gpu.address_space<workgroup>>
%alloc_0 = memref.alloc() : memref<1x64x64xf16, #gpu.address_space<workgroup>>
%cst = arith.constant dense<0.000000e+00> : vector<1x64x128xf16>
%c20 = arith.constant 20 : index
%c1 = arith.constant 1 : index
%cst_1 = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<64x968x1280xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 64 : memref<64x968x1280xf16, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<64x1280x1280xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 64 : memref<64x1280x1280xf16, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<64x968x1280xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 64 : memref<64x968x1280xf16, #hal.descriptor_type<storage_buffer>>
%workgroup_id_z = hal.interface.workgroup.id[2] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y]
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%4 = affine.apply affine_map<()[s0] -> (s0 * 128)>()[%workgroup_id_x]
%5 = affine.min affine_map<()[s0] -> (s0 * -64 + 968, 64)>()[%workgroup_id_y]
%subview = memref.subview %0[%workgroup_id_z, %3, 0] [1, %5, 1280] [1, 1, 1] : memref<64x968x1280xf16, #hal.descriptor_type<storage_buffer>> to memref<1x?x1280xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_2 = memref.subview %1[%workgroup_id_z, 0, %4] [1, 1280, 128] [1, 1, 1] : memref<64x1280x1280xf16, #hal.descriptor_type<storage_buffer>> to memref<1x1280x128xf16, strided<[1638400, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%6 = scf.for %arg0 = %c0 to %c20 step %c1 iter_args(%arg1 = %cst) -> (vector<1x64x128xf16>) {
%7 = affine.delinearize_index %arg0 into (%c20) : index
%8 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%7]
%subview_4 = memref.subview %subview[0, 0, %8] [1, %5, 64] [1, 1, 1] : memref<1x?x1280xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x?x64xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%9 = vector.transfer_read %subview_4[%c0, %c0, %c0], %cst_1 {in_bounds = [true, false, true]} : memref<1x?x64xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<1x64x64xf16>
%10 = vector.transfer_read %subview_2[%c0, %8, %c0], %cst_1 {in_bounds = [true, true, true]} : memref<1x1280x128xf16, strided<[1638400, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<1x64x128xf16>
gpu.barrier
vector.transfer_write %9, %alloc_0[%c0, %c0, %c0] {in_bounds = [true, true, true]} : vector<1x64x64xf16>, memref<1x64x64xf16, #gpu.address_space<workgroup>>
gpu.barrier
vector.transfer_write %10, %alloc[%c0, %c0, %c0] {in_bounds = [true, true, true]} : vector<1x64x128xf16>, memref<1x64x128xf16, #gpu.address_space<workgroup>>
gpu.barrier
%11 = vector.transfer_read %alloc_0[%c0, %c0, %c0], %cst_1 {in_bounds = [true, true, true]} : memref<1x64x64xf16, #gpu.address_space<workgroup>>, vector<1x64x64xf16>
%12 = vector.transfer_read %alloc[%c0, %c0, %c0], %cst_1 {in_bounds = [true, true, true]} : memref<1x64x128xf16, #gpu.address_space<workgroup>>, vector<1x64x128xf16>
%13 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %11, %12, %arg1 : vector<1x64x64xf16>, vector<1x64x128xf16> into vector<1x64x128xf16>
scf.yield %13 : vector<1x64x128xf16>
}
%subview_3 = memref.subview %2[%workgroup_id_z, %3, %4] [1, %5, 128] [1, 1, 1] : memref<64x968x1280xf16, #hal.descriptor_type<storage_buffer>> to memref<1x?x128xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
vector.transfer_write %6, %subview_3[%c0, %c0, %c0] {in_bounds = [true, false, true]} : vector<1x64x128xf16>, memref<1x?x128xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
memref.dealloc %alloc_0 : memref<1x64x64xf16, #gpu.address_space<workgroup>>
memref.dealloc %alloc : memref<1x64x128xf16, #gpu.address_space<workgroup>>
return
}
// -----// IR Dump After LLVMGPUCastTypeToFitMMA (iree-llvmgpu-cast-type-to-fit-mma) //----- //
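// NOTE: to fit the MFMA intrinsics, which accumulate in f32, the f16 loop
// accumulator is widened with arith.extf before the vector.contract and
// narrowed back with arith.truncf after it; the contraction itself now yields
// vector<1x64x128xf32>.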
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16() {
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f16
%c1 = arith.constant 1 : index
%c20 = arith.constant 20 : index
%cst_0 = arith.constant dense<0.000000e+00> : vector<1x64x128xf16>
%alloc = memref.alloc() : memref<1x64x128xf16, #gpu.address_space<workgroup>>
%alloc_1 = memref.alloc() : memref<1x64x64xf16, #gpu.address_space<workgroup>>
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<64x968x1280xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 64 : memref<64x968x1280xf16, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<64x1280x1280xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 64 : memref<64x1280x1280xf16, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<64x968x1280xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 64 : memref<64x968x1280xf16, #hal.descriptor_type<storage_buffer>>
%workgroup_id_z = hal.interface.workgroup.id[2] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y]
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%4 = affine.apply affine_map<()[s0] -> (s0 * 128)>()[%workgroup_id_x]
%5 = affine.min affine_map<()[s0] -> (s0 * -64 + 968, 64)>()[%workgroup_id_y]
%subview = memref.subview %0[%workgroup_id_z, %3, 0] [1, %5, 1280] [1, 1, 1] : memref<64x968x1280xf16, #hal.descriptor_type<storage_buffer>> to memref<1x?x1280xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_2 = memref.subview %1[%workgroup_id_z, 0, %4] [1, 1280, 128] [1, 1, 1] : memref<64x1280x1280xf16, #hal.descriptor_type<storage_buffer>> to memref<1x1280x128xf16, strided<[1638400, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%6 = scf.for %arg0 = %c0 to %c20 step %c1 iter_args(%arg1 = %cst_0) -> (vector<1x64x128xf16>) {
%7 = affine.delinearize_index %arg0 into (%c20) : index
%8 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%7]
%subview_4 = memref.subview %subview[0, 0, %8] [1, %5, 64] [1, 1, 1] : memref<1x?x1280xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x?x64xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%9 = vector.transfer_read %subview_4[%c0, %c0, %c0], %cst {in_bounds = [true, false, true]} : memref<1x?x64xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<1x64x64xf16>
%10 = vector.transfer_read %subview_2[%c0, %8, %c0], %cst {in_bounds = [true, true, true]} : memref<1x1280x128xf16, strided<[1638400, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<1x64x128xf16>
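// annotation: the two tiles just loaded from global memory are staged through
// workgroup memory (%alloc_1 for the 1x64x64 LHS K-slice, %alloc for the
// 1x64x128 RHS slice); the barriers below separate the LDS writes from the
// rereads that feed the contraction.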
gpu.barrier
vector.transfer_write %9, %alloc_1[%c0, %c0, %c0] {in_bounds = [true, true, true]} : vector<1x64x64xf16>, memref<1x64x64xf16, #gpu.address_space<workgroup>>
gpu.barrier
vector.transfer_write %10, %alloc[%c0, %c0, %c0] {in_bounds = [true, true, true]} : vector<1x64x128xf16>, memref<1x64x128xf16, #gpu.address_space<workgroup>>
gpu.barrier
%11 = vector.transfer_read %alloc_1[%c0, %c0, %c0], %cst {in_bounds = [true, true, true]} : memref<1x64x64xf16, #gpu.address_space<workgroup>>, vector<1x64x64xf16>
%12 = vector.transfer_read %alloc[%c0, %c0, %c0], %cst {in_bounds = [true, true, true]} : memref<1x64x128xf16, #gpu.address_space<workgroup>>, vector<1x64x128xf16>
%13 = arith.extf %arg1 : vector<1x64x128xf16> to vector<1x64x128xf32>
%14 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %11, %12, %13 : vector<1x64x64xf16>, vector<1x64x128xf16> into vector<1x64x128xf32>
%15 = arith.truncf %14 : vector<1x64x128xf32> to vector<1x64x128xf16>
scf.yield %15 : vector<1x64x128xf16>
}
%subview_3 = memref.subview %2[%workgroup_id_z, %3, %4] [1, %5, 128] [1, 1, 1] : memref<64x968x1280xf16, #hal.descriptor_type<storage_buffer>> to memref<1x?x128xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
vector.transfer_write %6, %subview_3[%c0, %c0, %c0] {in_bounds = [true, false, true]} : vector<1x64x128xf16>, memref<1x?x128xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
memref.dealloc %alloc_1 : memref<1x64x64xf16, #gpu.address_space<workgroup>>
memref.dealloc %alloc : memref<1x64x128xf16, #gpu.address_space<workgroup>>
return
}
// -----// IR Dump After LLVMGPUVectorDistribute (iree-llvmgpu-vector-distribute) //----- //
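// annotation: this pass distributes the workgroup-level vectors across threads
// according to the nested_layout recorded on the scf.for below; the single
// vector.contract is expanded into per-thread amdgpu.mfma chains operating on
// vector<4xf16> operands with vector<4xf32> accumulators.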
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16() {
%cst = arith.constant dense<0.000000e+00> : vector<1x2x4x1x1x1x1x1x4xf32>
%cst_0 = arith.constant dense<0.000000e+00> : vector<1x4x4x1x1x1x1x1x4xf16>
%c4 = arith.constant 4 : index
%c2 = arith.constant 2 : index
%c16 = arith.constant 16 : index
%c8 = arith.constant 8 : index
%c32 = arith.constant 32 : index
%cst_1 = arith.constant dense<0.000000e+00> : vector<1x2x4x1x1x1x1x1x4xf16>
%c20 = arith.constant 20 : index
%c1 = arith.constant 1 : index
%cst_2 = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%thread_id_x = gpu.thread_id x
%thread_id_y = gpu.thread_id y
%thread_id_z = gpu.thread_id z
%0 = affine.apply affine_map<()[s0, s1, s2] -> (s0 + s1 * 128 + s2 * 256)>()[%thread_id_x, %thread_id_y, %thread_id_z]
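// annotation: %0 is the flattened thread id; the 128/256 multipliers are
// consistent with a 128x2x1 workgroup (256 threads), which the layouts below
// treat as four wave64 subgroups (subgroup_basis = [1, 2, 2, 1], thread_basis
// = [1, 4, 16], and 4 * 64 = 256).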
%alloc = memref.alloc() : memref<1x64x128xf16, #gpu.address_space<workgroup>>
%alloc_3 = memref.alloc() : memref<1x64x64xf16, #gpu.address_space<workgroup>>
%1 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<64x968x1280xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 64 : memref<64x968x1280xf16, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<64x1280x1280xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 64 : memref<64x1280x1280xf16, #hal.descriptor_type<storage_buffer>>
%3 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<64x968x1280xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %3, 64 : memref<64x968x1280xf16, #hal.descriptor_type<storage_buffer>>
%workgroup_id_z = hal.interface.workgroup.id[2] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y]
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%5 = affine.apply affine_map<()[s0] -> (s0 * 128)>()[%workgroup_id_x]
%6 = affine.min affine_map<()[s0] -> (s0 * -64 + 968, 64)>()[%workgroup_id_y]
%subview = memref.subview %1[%workgroup_id_z, %4, 0] [1, %6, 1280] [1, 1, 1] : memref<64x968x1280xf16, #hal.descriptor_type<storage_buffer>> to memref<1x?x1280xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_4 = memref.subview %2[%workgroup_id_z, 0, %5] [1, 1280, 128] [1, 1, 1] : memref<64x1280x1280xf16, #hal.descriptor_type<storage_buffer>> to memref<1x1280x128xf16, strided<[1638400, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%7 = scf.for %arg0 = %c0 to %c20 step %c1 iter_args(%arg1 = %cst_1) -> (vector<1x2x4x1x1x1x1x1x4xf16>) {
%41 = affine.delinearize_index %arg0 into (%c20) : index
%42 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%41]
%subview_6 = memref.subview %subview[0, 0, %42] [1, %6, 64] [1, 1, 1] : memref<1x?x1280xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x?x64xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%43:6 = affine.delinearize_index %0 into (%c1, %c1, %c1, %c1, %c32, %c8) : index, index, index, index, index, index
%44 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%43#5]
%45 = vector.transfer_read %subview_6[%c0, %43#4, %44], %cst_2 {in_bounds = [true, false, true]} : memref<1x?x64xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<1x1x8xf16>
%46 = affine.apply affine_map<()[s0] -> (s0 + 32)>()[%43#4]
%47 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%43#5]
%48 = vector.transfer_read %subview_6[%c0, %46, %47], %cst_2 {in_bounds = [true, false, true]} : memref<1x?x64xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<1x1x8xf16>
%49:6 = affine.delinearize_index %0 into (%c1, %c1, %c1, %c1, %c16, %c16) : index, index, index, index, index, index
%50 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 64)>()[%49#4, %41]
%51 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%49#5]
%52 = vector.transfer_read %subview_4[%c0, %50, %51], %cst_2 {in_bounds = [true, true, true]} : memref<1x1280x128xf16, strided<[1638400, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<1x1x8xf16>
%53 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 64 + 16)>()[%49#4, %41]
%54 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%49#5]
%55 = vector.transfer_read %subview_4[%c0, %53, %54], %cst_2 {in_bounds = [true, true, true]} : memref<1x1280x128xf16, strided<[1638400, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<1x1x8xf16>
%56 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 64 + 32)>()[%49#4, %41]
%57 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%49#5]
%58 = vector.transfer_read %subview_4[%c0, %56, %57], %cst_2 {in_bounds = [true, true, true]} : memref<1x1280x128xf16, strided<[1638400, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<1x1x8xf16>
%59 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 64 + 48)>()[%49#4, %41]
%60 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%49#5]
%61 = vector.transfer_read %subview_4[%c0, %59, %60], %cst_2 {in_bounds = [true, true, true]} : memref<1x1280x128xf16, strided<[1638400, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<1x1x8xf16>
gpu.barrier
%62:6 = affine.delinearize_index %0 into (%c1, %c1, %c1, %c1, %c32, %c8) : index, index, index, index, index, index
%63 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%62#5]
vector.transfer_write %45, %alloc_3[%c0, %62#4, %63] {in_bounds = [true, true, true]} : vector<1x1x8xf16>, memref<1x64x64xf16, #gpu.address_space<workgroup>>
%64 = affine.apply affine_map<()[s0] -> (s0 + 32)>()[%62#4]
%65 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%62#5]
vector.transfer_write %48, %alloc_3[%c0, %64, %65] {in_bounds = [true, true, true]} : vector<1x1x8xf16>, memref<1x64x64xf16, #gpu.address_space<workgroup>>
gpu.barrier
%66:6 = affine.delinearize_index %0 into (%c1, %c1, %c1, %c1, %c16, %c16) : index, index, index, index, index, index
%67 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%66#5]
vector.transfer_write %52, %alloc[%c0, %66#4, %67] {in_bounds = [true, true, true]} : vector<1x1x8xf16>, memref<1x64x128xf16, #gpu.address_space<workgroup>>
%68 = affine.apply affine_map<()[s0] -> (s0 + 16)>()[%66#4]
%69 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%66#5]
vector.transfer_write %55, %alloc[%c0, %68, %69] {in_bounds = [true, true, true]} : vector<1x1x8xf16>, memref<1x64x128xf16, #gpu.address_space<workgroup>>
%70 = affine.apply affine_map<()[s0] -> (s0 + 32)>()[%66#4]
%71 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%66#5]
vector.transfer_write %58, %alloc[%c0, %70, %71] {in_bounds = [true, true, true]} : vector<1x1x8xf16>, memref<1x64x128xf16, #gpu.address_space<workgroup>>
%72 = affine.apply affine_map<()[s0] -> (s0 + 48)>()[%66#4]
%73 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%66#5]
vector.transfer_write %61, %alloc[%c0, %72, %73] {in_bounds = [true, true, true]} : vector<1x1x8xf16>, memref<1x64x128xf16, #gpu.address_space<workgroup>>
gpu.barrier
%74:7 = affine.delinearize_index %0 into (%c1, %c2, %c2, %c1, %c1, %c4, %c16) : index, index, index, index, index, index, index
%75 = affine.apply affine_map<()[s0, s1] -> (s0 * 32 + s1)>()[%74#1, %74#6]
%76 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%74#5]
%77 = vector.transfer_read %alloc_3[%c0, %75, %76], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x64xf16, #gpu.address_space<workgroup>>, vector<1x1x4xf16>
%78 = vector.insert_strided_slice %77, %cst_1 {offsets = [0, 0, 0, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x2x4x1x1x1x1x1x4xf16>
%79 = affine.apply affine_map<()[s0, s1] -> (s0 * 32 + s1)>()[%74#1, %74#6]
%80 = affine.apply affine_map<()[s0] -> (s0 * 4 + 16)>()[%74#5]
%81 = vector.transfer_read %alloc_3[%c0, %79, %80], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x64xf16, #gpu.address_space<workgroup>>, vector<1x1x4xf16>
%82 = vector.insert_strided_slice %81, %78 {offsets = [0, 0, 1, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x2x4x1x1x1x1x1x4xf16>
%83 = affine.apply affine_map<()[s0, s1] -> (s0 * 32 + s1)>()[%74#1, %74#6]
%84 = affine.apply affine_map<()[s0] -> (s0 * 4 + 32)>()[%74#5]
%85 = vector.transfer_read %alloc_3[%c0, %83, %84], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x64xf16, #gpu.address_space<workgroup>>, vector<1x1x4xf16>
%86 = vector.insert_strided_slice %85, %82 {offsets = [0, 0, 2, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x2x4x1x1x1x1x1x4xf16>
%87 = affine.apply affine_map<()[s0, s1] -> (s0 * 32 + s1)>()[%74#1, %74#6]
%88 = affine.apply affine_map<()[s0] -> (s0 * 4 + 48)>()[%74#5]
%89 = vector.transfer_read %alloc_3[%c0, %87, %88], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x64xf16, #gpu.address_space<workgroup>>, vector<1x1x4xf16>
%90 = vector.insert_strided_slice %89, %86 {offsets = [0, 0, 3, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x2x4x1x1x1x1x1x4xf16>
%91 = affine.apply affine_map<()[s0, s1] -> (s0 * 32 + s1 + 16)>()[%74#1, %74#6]
%92 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%74#5]
%93 = vector.transfer_read %alloc_3[%c0, %91, %92], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x64xf16, #gpu.address_space<workgroup>>, vector<1x1x4xf16>
%94 = vector.insert_strided_slice %93, %90 {offsets = [0, 1, 0, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x2x4x1x1x1x1x1x4xf16>
%95 = affine.apply affine_map<()[s0, s1] -> (s0 * 32 + s1 + 16)>()[%74#1, %74#6]
%96 = affine.apply affine_map<()[s0] -> (s0 * 4 + 16)>()[%74#5]
%97 = vector.transfer_read %alloc_3[%c0, %95, %96], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x64xf16, #gpu.address_space<workgroup>>, vector<1x1x4xf16>
%98 = vector.insert_strided_slice %97, %94 {offsets = [0, 1, 1, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x2x4x1x1x1x1x1x4xf16>
%99 = affine.apply affine_map<()[s0, s1] -> (s0 * 32 + s1 + 16)>()[%74#1, %74#6]
%100 = affine.apply affine_map<()[s0] -> (s0 * 4 + 32)>()[%74#5]
%101 = vector.transfer_read %alloc_3[%c0, %99, %100], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x64xf16, #gpu.address_space<workgroup>>, vector<1x1x4xf16>
%102 = vector.insert_strided_slice %101, %98 {offsets = [0, 1, 2, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x2x4x1x1x1x1x1x4xf16>
%103 = affine.apply affine_map<()[s0, s1] -> (s0 * 32 + s1 + 16)>()[%74#1, %74#6]
%104 = affine.apply affine_map<()[s0] -> (s0 * 4 + 48)>()[%74#5]
%105 = vector.transfer_read %alloc_3[%c0, %103, %104], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x64xf16, #gpu.address_space<workgroup>>, vector<1x1x4xf16>
%106 = vector.insert_strided_slice %105, %102 {offsets = [0, 1, 3, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x2x4x1x1x1x1x1x4xf16>
%107:7 = affine.delinearize_index %0 into (%c1, %c2, %c2, %c1, %c1, %c4, %c16) : index, index, index, index, index, index, index
%108 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%107#5]
%109 = affine.apply affine_map<()[s0, s1] -> (s0 * 64 + s1)>()[%107#2, %107#6]
%110 = vector.transfer_read %alloc[%c0, %108, %109], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x128xf16, #gpu.address_space<workgroup>>, vector<1x4x1xf16>
%111 = vector.transpose %110, [0, 2, 1] : vector<1x4x1xf16> to vector<1x1x4xf16>
%112 = vector.insert_strided_slice %111, %cst_0 {offsets = [0, 0, 0, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x4x4x1x1x1x1x1x4xf16>
%113 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%107#5]
%114 = affine.apply affine_map<()[s0, s1] -> (s0 * 64 + s1 + 16)>()[%107#2, %107#6]
%115 = vector.transfer_read %alloc[%c0, %113, %114], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x128xf16, #gpu.address_space<workgroup>>, vector<1x4x1xf16>
%116 = vector.transpose %115, [0, 2, 1] : vector<1x4x1xf16> to vector<1x1x4xf16>
%117 = vector.insert_strided_slice %116, %112 {offsets = [0, 0, 1, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x4x4x1x1x1x1x1x4xf16>
%118 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%107#5]
%119 = affine.apply affine_map<()[s0, s1] -> (s0 * 64 + s1 + 32)>()[%107#2, %107#6]
%120 = vector.transfer_read %alloc[%c0, %118, %119], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x128xf16, #gpu.address_space<workgroup>>, vector<1x4x1xf16>
%121 = vector.transpose %120, [0, 2, 1] : vector<1x4x1xf16> to vector<1x1x4xf16>
%122 = vector.insert_strided_slice %121, %117 {offsets = [0, 0, 2, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x4x4x1x1x1x1x1x4xf16>
%123 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%107#5]
%124 = affine.apply affine_map<()[s0, s1] -> (s0 * 64 + s1 + 48)>()[%107#2, %107#6]
%125 = vector.transfer_read %alloc[%c0, %123, %124], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x128xf16, #gpu.address_space<workgroup>>, vector<1x4x1xf16>
%126 = vector.transpose %125, [0, 2, 1] : vector<1x4x1xf16> to vector<1x1x4xf16>
%127 = vector.insert_strided_slice %126, %122 {offsets = [0, 0, 3, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x4x4x1x1x1x1x1x4xf16>
%128 = affine.apply affine_map<()[s0] -> (s0 * 4 + 16)>()[%107#5]
%129 = affine.apply affine_map<()[s0, s1] -> (s0 * 64 + s1)>()[%107#2, %107#6]
%130 = vector.transfer_read %alloc[%c0, %128, %129], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x128xf16, #gpu.address_space<workgroup>>, vector<1x4x1xf16>
%131 = vector.transpose %130, [0, 2, 1] : vector<1x4x1xf16> to vector<1x1x4xf16>
%132 = vector.insert_strided_slice %131, %127 {offsets = [0, 1, 0, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x4x4x1x1x1x1x1x4xf16>
%133 = affine.apply affine_map<()[s0] -> (s0 * 4 + 16)>()[%107#5]
%134 = affine.apply affine_map<()[s0, s1] -> (s0 * 64 + s1 + 16)>()[%107#2, %107#6]
%135 = vector.transfer_read %alloc[%c0, %133, %134], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x128xf16, #gpu.address_space<workgroup>>, vector<1x4x1xf16>
%136 = vector.transpose %135, [0, 2, 1] : vector<1x4x1xf16> to vector<1x1x4xf16>
%137 = vector.insert_strided_slice %136, %132 {offsets = [0, 1, 1, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x4x4x1x1x1x1x1x4xf16>
%138 = affine.apply affine_map<()[s0] -> (s0 * 4 + 16)>()[%107#5]
%139 = affine.apply affine_map<()[s0, s1] -> (s0 * 64 + s1 + 32)>()[%107#2, %107#6]
%140 = vector.transfer_read %alloc[%c0, %138, %139], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x128xf16, #gpu.address_space<workgroup>>, vector<1x4x1xf16>
%141 = vector.transpose %140, [0, 2, 1] : vector<1x4x1xf16> to vector<1x1x4xf16>
%142 = vector.insert_strided_slice %141, %137 {offsets = [0, 1, 2, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x4x4x1x1x1x1x1x4xf16>
%143 = affine.apply affine_map<()[s0] -> (s0 * 4 + 16)>()[%107#5]
%144 = affine.apply affine_map<()[s0, s1] -> (s0 * 64 + s1 + 48)>()[%107#2, %107#6]
%145 = vector.transfer_read %alloc[%c0, %143, %144], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x128xf16, #gpu.address_space<workgroup>>, vector<1x4x1xf16>
%146 = vector.transpose %145, [0, 2, 1] : vector<1x4x1xf16> to vector<1x1x4xf16>
%147 = vector.insert_strided_slice %146, %142 {offsets = [0, 1, 3, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x4x4x1x1x1x1x1x4xf16>
%148 = affine.apply affine_map<()[s0] -> (s0 * 4 + 32)>()[%107#5]
%149 = affine.apply affine_map<()[s0, s1] -> (s0 * 64 + s1)>()[%107#2, %107#6]
%150 = vector.transfer_read %alloc[%c0, %148, %149], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x128xf16, #gpu.address_space<workgroup>>, vector<1x4x1xf16>
%151 = vector.transpose %150, [0, 2, 1] : vector<1x4x1xf16> to vector<1x1x4xf16>
%152 = vector.insert_strided_slice %151, %147 {offsets = [0, 2, 0, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x4x4x1x1x1x1x1x4xf16>
%153 = affine.apply affine_map<()[s0] -> (s0 * 4 + 32)>()[%107#5]
%154 = affine.apply affine_map<()[s0, s1] -> (s0 * 64 + s1 + 16)>()[%107#2, %107#6]
%155 = vector.transfer_read %alloc[%c0, %153, %154], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x128xf16, #gpu.address_space<workgroup>>, vector<1x4x1xf16>
%156 = vector.transpose %155, [0, 2, 1] : vector<1x4x1xf16> to vector<1x1x4xf16>
%157 = vector.insert_strided_slice %156, %152 {offsets = [0, 2, 1, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x4x4x1x1x1x1x1x4xf16>
%158 = affine.apply affine_map<()[s0] -> (s0 * 4 + 32)>()[%107#5]
%159 = affine.apply affine_map<()[s0, s1] -> (s0 * 64 + s1 + 32)>()[%107#2, %107#6]
%160 = vector.transfer_read %alloc[%c0, %158, %159], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x128xf16, #gpu.address_space<workgroup>>, vector<1x4x1xf16>
%161 = vector.transpose %160, [0, 2, 1] : vector<1x4x1xf16> to vector<1x1x4xf16>
%162 = vector.insert_strided_slice %161, %157 {offsets = [0, 2, 2, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x4x4x1x1x1x1x1x4xf16>
%163 = affine.apply affine_map<()[s0] -> (s0 * 4 + 32)>()[%107#5]
%164 = affine.apply affine_map<()[s0, s1] -> (s0 * 64 + s1 + 48)>()[%107#2, %107#6]
%165 = vector.transfer_read %alloc[%c0, %163, %164], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x128xf16, #gpu.address_space<workgroup>>, vector<1x4x1xf16>
%166 = vector.transpose %165, [0, 2, 1] : vector<1x4x1xf16> to vector<1x1x4xf16>
%167 = vector.insert_strided_slice %166, %162 {offsets = [0, 2, 3, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x4x4x1x1x1x1x1x4xf16>
%168 = affine.apply affine_map<()[s0] -> (s0 * 4 + 48)>()[%107#5]
%169 = affine.apply affine_map<()[s0, s1] -> (s0 * 64 + s1)>()[%107#2, %107#6]
%170 = vector.transfer_read %alloc[%c0, %168, %169], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x128xf16, #gpu.address_space<workgroup>>, vector<1x4x1xf16>
%171 = vector.transpose %170, [0, 2, 1] : vector<1x4x1xf16> to vector<1x1x4xf16>
%172 = vector.insert_strided_slice %171, %167 {offsets = [0, 3, 0, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x4x4x1x1x1x1x1x4xf16>
%173 = affine.apply affine_map<()[s0] -> (s0 * 4 + 48)>()[%107#5]
%174 = affine.apply affine_map<()[s0, s1] -> (s0 * 64 + s1 + 16)>()[%107#2, %107#6]
%175 = vector.transfer_read %alloc[%c0, %173, %174], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x128xf16, #gpu.address_space<workgroup>>, vector<1x4x1xf16>
%176 = vector.transpose %175, [0, 2, 1] : vector<1x4x1xf16> to vector<1x1x4xf16>
%177 = vector.insert_strided_slice %176, %172 {offsets = [0, 3, 1, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x4x4x1x1x1x1x1x4xf16>
%178 = affine.apply affine_map<()[s0] -> (s0 * 4 + 48)>()[%107#5]
%179 = affine.apply affine_map<()[s0, s1] -> (s0 * 64 + s1 + 32)>()[%107#2, %107#6]
%180 = vector.transfer_read %alloc[%c0, %178, %179], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x128xf16, #gpu.address_space<workgroup>>, vector<1x4x1xf16>
%181 = vector.transpose %180, [0, 2, 1] : vector<1x4x1xf16> to vector<1x1x4xf16>
%182 = vector.insert_strided_slice %181, %177 {offsets = [0, 3, 2, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x4x4x1x1x1x1x1x4xf16>
%183 = affine.apply affine_map<()[s0] -> (s0 * 4 + 48)>()[%107#5]
%184 = affine.apply affine_map<()[s0, s1] -> (s0 * 64 + s1 + 48)>()[%107#2, %107#6]
%185 = vector.transfer_read %alloc[%c0, %183, %184], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x128xf16, #gpu.address_space<workgroup>>, vector<1x4x1xf16>
%186 = vector.transpose %185, [0, 2, 1] : vector<1x4x1xf16> to vector<1x1x4xf16>
%187 = vector.insert_strided_slice %186, %182 {offsets = [0, 3, 3, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x4x4x1x1x1x1x1x4xf16>
%188 = arith.extf %arg1 : vector<1x2x4x1x1x1x1x1x4xf16> to vector<1x2x4x1x1x1x1x1x4xf32>
%189 = vector.extract %188[0, 0, 0] : vector<1x1x1x1x1x4xf32> from vector<1x2x4x1x1x1x1x1x4xf32>
%190 = vector.extract %106[0, 0, 0] : vector<1x1x1x1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%191 = vector.extract %187[0, 0, 0] : vector<1x1x1x1x1x4xf16> from vector<1x4x4x1x1x1x1x1x4xf16>
%192 = vector.shape_cast %190 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%193 = vector.shape_cast %191 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%194 = vector.shape_cast %189 : vector<1x1x1x1x1x4xf32> to vector<4xf32>
%195 = amdgpu.mfma %192 * %193 + %194 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%196 = vector.extract %106[0, 0, 1] : vector<1x1x1x1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%197 = vector.extract %187[0, 1, 0] : vector<1x1x1x1x1x4xf16> from vector<1x4x4x1x1x1x1x1x4xf16>
%198 = vector.shape_cast %196 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%199 = vector.shape_cast %197 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%200 = amdgpu.mfma %198 * %199 + %195 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%201 = vector.extract %106[0, 0, 2] : vector<1x1x1x1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%202 = vector.extract %187[0, 2, 0] : vector<1x1x1x1x1x4xf16> from vector<1x4x4x1x1x1x1x1x4xf16>
%203 = vector.shape_cast %201 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%204 = vector.shape_cast %202 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%205 = amdgpu.mfma %203 * %204 + %200 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%206 = vector.extract %106[0, 0, 3] : vector<1x1x1x1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%207 = vector.extract %187[0, 3, 0] : vector<1x1x1x1x1x4xf16> from vector<1x4x4x1x1x1x1x1x4xf16>
%208 = vector.shape_cast %206 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%209 = vector.shape_cast %207 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%210 = amdgpu.mfma %208 * %209 + %205 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%211 = vector.shape_cast %210 : vector<4xf32> to vector<1x1x1x1x1x4xf32>
%212 = vector.insert %211, %cst [0, 0, 0] : vector<1x1x1x1x1x4xf32> into vector<1x2x4x1x1x1x1x1x4xf32>
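// annotation: each amdgpu.mfma above is a single 16x16x16 f16 MFMA with an f32
// accumulator; the chain of four reduces this iteration's K = 64 slice (four
// K-batches of 16). Eight such chains, this one included, cover the thread's
// 2 (M) x 4 (N) accumulator batches per loop iteration.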
%213 = vector.extract %188[0, 0, 1] : vector<1x1x1x1x1x4xf32> from vector<1x2x4x1x1x1x1x1x4xf32>
%214 = vector.extract %106[0, 0, 0] : vector<1x1x1x1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%215 = vector.extract %187[0, 0, 1] : vector<1x1x1x1x1x4xf16> from vector<1x4x4x1x1x1x1x1x4xf16>
%216 = vector.shape_cast %214 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%217 = vector.shape_cast %215 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%218 = vector.shape_cast %213 : vector<1x1x1x1x1x4xf32> to vector<4xf32>
%219 = amdgpu.mfma %216 * %217 + %218 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%220 = vector.extract %106[0, 0, 1] : vector<1x1x1x1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%221 = vector.extract %187[0, 1, 1] : vector<1x1x1x1x1x4xf16> from vector<1x4x4x1x1x1x1x1x4xf16>
%222 = vector.shape_cast %220 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%223 = vector.shape_cast %221 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%224 = amdgpu.mfma %222 * %223 + %219 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%225 = vector.extract %106[0, 0, 2] : vector<1x1x1x1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%226 = vector.extract %187[0, 2, 1] : vector<1x1x1x1x1x4xf16> from vector<1x4x4x1x1x1x1x1x4xf16>
%227 = vector.shape_cast %225 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%228 = vector.shape_cast %226 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%229 = amdgpu.mfma %227 * %228 + %224 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%230 = vector.extract %106[0, 0, 3] : vector<1x1x1x1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%231 = vector.extract %187[0, 3, 1] : vector<1x1x1x1x1x4xf16> from vector<1x4x4x1x1x1x1x1x4xf16>
%232 = vector.shape_cast %230 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%233 = vector.shape_cast %231 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%234 = amdgpu.mfma %232 * %233 + %229 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%235 = vector.shape_cast %234 : vector<4xf32> to vector<1x1x1x1x1x4xf32>
%236 = vector.insert %235, %212 [0, 0, 1] : vector<1x1x1x1x1x4xf32> into vector<1x2x4x1x1x1x1x1x4xf32>
%237 = vector.extract %188[0, 0, 2] : vector<1x1x1x1x1x4xf32> from vector<1x2x4x1x1x1x1x1x4xf32>
%238 = vector.extract %106[0, 0, 0] : vector<1x1x1x1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%239 = vector.extract %187[0, 0, 2] : vector<1x1x1x1x1x4xf16> from vector<1x4x4x1x1x1x1x1x4xf16>
%240 = vector.shape_cast %238 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%241 = vector.shape_cast %239 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%242 = vector.shape_cast %237 : vector<1x1x1x1x1x4xf32> to vector<4xf32>
%243 = amdgpu.mfma %240 * %241 + %242 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%244 = vector.extract %106[0, 0, 1] : vector<1x1x1x1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%245 = vector.extract %187[0, 1, 2] : vector<1x1x1x1x1x4xf16> from vector<1x4x4x1x1x1x1x1x4xf16>
%246 = vector.shape_cast %244 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%247 = vector.shape_cast %245 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%248 = amdgpu.mfma %246 * %247 + %243 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%249 = vector.extract %106[0, 0, 2] : vector<1x1x1x1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%250 = vector.extract %187[0, 2, 2] : vector<1x1x1x1x1x4xf16> from vector<1x4x4x1x1x1x1x1x4xf16>
%251 = vector.shape_cast %249 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%252 = vector.shape_cast %250 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%253 = amdgpu.mfma %251 * %252 + %248 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%254 = vector.extract %106[0, 0, 3] : vector<1x1x1x1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%255 = vector.extract %187[0, 3, 2] : vector<1x1x1x1x1x4xf16> from vector<1x4x4x1x1x1x1x1x4xf16>
%256 = vector.shape_cast %254 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%257 = vector.shape_cast %255 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%258 = amdgpu.mfma %256 * %257 + %253 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%259 = vector.shape_cast %258 : vector<4xf32> to vector<1x1x1x1x1x4xf32>
%260 = vector.insert %259, %236 [0, 0, 2] : vector<1x1x1x1x1x4xf32> into vector<1x2x4x1x1x1x1x1x4xf32>
%261 = vector.extract %188[0, 0, 3] : vector<1x1x1x1x1x4xf32> from vector<1x2x4x1x1x1x1x1x4xf32>
%262 = vector.extract %106[0, 0, 0] : vector<1x1x1x1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%263 = vector.extract %187[0, 0, 3] : vector<1x1x1x1x1x4xf16> from vector<1x4x4x1x1x1x1x1x4xf16>
%264 = vector.shape_cast %262 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%265 = vector.shape_cast %263 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%266 = vector.shape_cast %261 : vector<1x1x1x1x1x4xf32> to vector<4xf32>
%267 = amdgpu.mfma %264 * %265 + %266 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%268 = vector.extract %106[0, 0, 1] : vector<1x1x1x1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%269 = vector.extract %187[0, 1, 3] : vector<1x1x1x1x1x4xf16> from vector<1x4x4x1x1x1x1x1x4xf16>
%270 = vector.shape_cast %268 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%271 = vector.shape_cast %269 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%272 = amdgpu.mfma %270 * %271 + %267 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%273 = vector.extract %106[0, 0, 2] : vector<1x1x1x1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%274 = vector.extract %187[0, 2, 3] : vector<1x1x1x1x1x4xf16> from vector<1x4x4x1x1x1x1x1x4xf16>
%275 = vector.shape_cast %273 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%276 = vector.shape_cast %274 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%277 = amdgpu.mfma %275 * %276 + %272 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%278 = vector.extract %106[0, 0, 3] : vector<1x1x1x1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%279 = vector.extract %187[0, 3, 3] : vector<1x1x1x1x1x4xf16> from vector<1x4x4x1x1x1x1x1x4xf16>
%280 = vector.shape_cast %278 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%281 = vector.shape_cast %279 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%282 = amdgpu.mfma %280 * %281 + %277 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%283 = vector.shape_cast %282 : vector<4xf32> to vector<1x1x1x1x1x4xf32>
%284 = vector.insert %283, %260 [0, 0, 3] : vector<1x1x1x1x1x4xf32> into vector<1x2x4x1x1x1x1x1x4xf32>
%285 = vector.extract %188[0, 1, 0] : vector<1x1x1x1x1x4xf32> from vector<1x2x4x1x1x1x1x1x4xf32>
%286 = vector.extract %106[0, 1, 0] : vector<1x1x1x1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%287 = vector.extract %187[0, 0, 0] : vector<1x1x1x1x1x4xf16> from vector<1x4x4x1x1x1x1x1x4xf16>
%288 = vector.shape_cast %286 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%289 = vector.shape_cast %287 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%290 = vector.shape_cast %285 : vector<1x1x1x1x1x4xf32> to vector<4xf32>
%291 = amdgpu.mfma %288 * %289 + %290 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%292 = vector.extract %106[0, 1, 1] : vector<1x1x1x1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%293 = vector.extract %187[0, 1, 0] : vector<1x1x1x1x1x4xf16> from vector<1x4x4x1x1x1x1x1x4xf16>
%294 = vector.shape_cast %292 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%295 = vector.shape_cast %293 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%296 = amdgpu.mfma %294 * %295 + %291 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%297 = vector.extract %106[0, 1, 2] : vector<1x1x1x1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%298 = vector.extract %187[0, 2, 0] : vector<1x1x1x1x1x4xf16> from vector<1x4x4x1x1x1x1x1x4xf16>
%299 = vector.shape_cast %297 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%300 = vector.shape_cast %298 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%301 = amdgpu.mfma %299 * %300 + %296 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%302 = vector.extract %106[0, 1, 3] : vector<1x1x1x1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%303 = vector.extract %187[0, 3, 0] : vector<1x1x1x1x1x4xf16> from vector<1x4x4x1x1x1x1x1x4xf16>
%304 = vector.shape_cast %302 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%305 = vector.shape_cast %303 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%306 = amdgpu.mfma %304 * %305 + %301 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%307 = vector.shape_cast %306 : vector<4xf32> to vector<1x1x1x1x1x4xf32>
%308 = vector.insert %307, %284 [0, 1, 0] : vector<1x1x1x1x1x4xf32> into vector<1x2x4x1x1x1x1x1x4xf32>
%309 = vector.extract %188[0, 1, 1] : vector<1x1x1x1x1x4xf32> from vector<1x2x4x1x1x1x1x1x4xf32>
%310 = vector.extract %106[0, 1, 0] : vector<1x1x1x1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%311 = vector.extract %187[0, 0, 1] : vector<1x1x1x1x1x4xf16> from vector<1x4x4x1x1x1x1x1x4xf16>
%312 = vector.shape_cast %310 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%313 = vector.shape_cast %311 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%314 = vector.shape_cast %309 : vector<1x1x1x1x1x4xf32> to vector<4xf32>
%315 = amdgpu.mfma %312 * %313 + %314 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%316 = vector.extract %106[0, 1, 1] : vector<1x1x1x1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%317 = vector.extract %187[0, 1, 1] : vector<1x1x1x1x1x4xf16> from vector<1x4x4x1x1x1x1x1x4xf16>
%318 = vector.shape_cast %316 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%319 = vector.shape_cast %317 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%320 = amdgpu.mfma %318 * %319 + %315 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%321 = vector.extract %106[0, 1, 2] : vector<1x1x1x1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%322 = vector.extract %187[0, 2, 1] : vector<1x1x1x1x1x4xf16> from vector<1x4x4x1x1x1x1x1x4xf16>
%323 = vector.shape_cast %321 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%324 = vector.shape_cast %322 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%325 = amdgpu.mfma %323 * %324 + %320 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%326 = vector.extract %106[0, 1, 3] : vector<1x1x1x1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%327 = vector.extract %187[0, 3, 1] : vector<1x1x1x1x1x4xf16> from vector<1x4x4x1x1x1x1x1x4xf16>
%328 = vector.shape_cast %326 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%329 = vector.shape_cast %327 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%330 = amdgpu.mfma %328 * %329 + %325 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%331 = vector.shape_cast %330 : vector<4xf32> to vector<1x1x1x1x1x4xf32>
%332 = vector.insert %331, %308 [0, 1, 1] : vector<1x1x1x1x1x4xf32> into vector<1x2x4x1x1x1x1x1x4xf32>
%333 = vector.extract %188[0, 1, 2] : vector<1x1x1x1x1x4xf32> from vector<1x2x4x1x1x1x1x1x4xf32>
%334 = vector.extract %106[0, 1, 0] : vector<1x1x1x1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%335 = vector.extract %187[0, 0, 2] : vector<1x1x1x1x1x4xf16> from vector<1x4x4x1x1x1x1x1x4xf16>
%336 = vector.shape_cast %334 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%337 = vector.shape_cast %335 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%338 = vector.shape_cast %333 : vector<1x1x1x1x1x4xf32> to vector<4xf32>
%339 = amdgpu.mfma %336 * %337 + %338 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%340 = vector.extract %106[0, 1, 1] : vector<1x1x1x1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%341 = vector.extract %187[0, 1, 2] : vector<1x1x1x1x1x4xf16> from vector<1x4x4x1x1x1x1x1x4xf16>
%342 = vector.shape_cast %340 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%343 = vector.shape_cast %341 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%344 = amdgpu.mfma %342 * %343 + %339 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%345 = vector.extract %106[0, 1, 2] : vector<1x1x1x1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%346 = vector.extract %187[0, 2, 2] : vector<1x1x1x1x1x4xf16> from vector<1x4x4x1x1x1x1x1x4xf16>
%347 = vector.shape_cast %345 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%348 = vector.shape_cast %346 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%349 = amdgpu.mfma %347 * %348 + %344 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%350 = vector.extract %106[0, 1, 3] : vector<1x1x1x1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%351 = vector.extract %187[0, 3, 2] : vector<1x1x1x1x1x4xf16> from vector<1x4x4x1x1x1x1x1x4xf16>
%352 = vector.shape_cast %350 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%353 = vector.shape_cast %351 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%354 = amdgpu.mfma %352 * %353 + %349 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%355 = vector.shape_cast %354 : vector<4xf32> to vector<1x1x1x1x1x4xf32>
%356 = vector.insert %355, %332 [0, 1, 2] : vector<1x1x1x1x1x4xf32> into vector<1x2x4x1x1x1x1x1x4xf32>
%357 = vector.extract %188[0, 1, 3] : vector<1x1x1x1x1x4xf32> from vector<1x2x4x1x1x1x1x1x4xf32>
%358 = vector.extract %106[0, 1, 0] : vector<1x1x1x1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%359 = vector.extract %187[0, 0, 3] : vector<1x1x1x1x1x4xf16> from vector<1x4x4x1x1x1x1x1x4xf16>
%360 = vector.shape_cast %358 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%361 = vector.shape_cast %359 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%362 = vector.shape_cast %357 : vector<1x1x1x1x1x4xf32> to vector<4xf32>
%363 = amdgpu.mfma %360 * %361 + %362 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%364 = vector.extract %106[0, 1, 1] : vector<1x1x1x1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%365 = vector.extract %187[0, 1, 3] : vector<1x1x1x1x1x4xf16> from vector<1x4x4x1x1x1x1x1x4xf16>
%366 = vector.shape_cast %364 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%367 = vector.shape_cast %365 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%368 = amdgpu.mfma %366 * %367 + %363 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%369 = vector.extract %106[0, 1, 2] : vector<1x1x1x1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%370 = vector.extract %187[0, 2, 3] : vector<1x1x1x1x1x4xf16> from vector<1x4x4x1x1x1x1x1x4xf16>
%371 = vector.shape_cast %369 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%372 = vector.shape_cast %370 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%373 = amdgpu.mfma %371 * %372 + %368 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%374 = vector.extract %106[0, 1, 3] : vector<1x1x1x1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%375 = vector.extract %187[0, 3, 3] : vector<1x1x1x1x1x4xf16> from vector<1x4x4x1x1x1x1x1x4xf16>
%376 = vector.shape_cast %374 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%377 = vector.shape_cast %375 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%378 = amdgpu.mfma %376 * %377 + %373 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%379 = vector.shape_cast %378 : vector<4xf32> to vector<1x1x1x1x1x4xf32>
%380 = vector.insert %379, %356 [0, 1, 3] : vector<1x1x1x1x1x4xf32> into vector<1x2x4x1x1x1x1x1x4xf32>
%381 = arith.truncf %380 : vector<1x2x4x1x1x1x1x1x4xf32> to vector<1x2x4x1x1x1x1x1x4xf16>
scf.yield %381 : vector<1x2x4x1x1x1x1x1x4xf16>
} {__vector_layout_fetcher_storage = [[unit, unit, unit, #iree_vector_ext.nested_layout<subgroups_per_workgroup = [1, 2, 2], batches_per_subgroup = [1, 2, 4], outers_per_batch = [1, 1, 1], threads_per_outer = [1, 4, 16], elements_per_thread = [1, 4, 1], element_order = [0, 2, 1], subgroup_basis = [1, 2, 2, 1], subgroup_active_ids = [true, true, true, false], thread_basis = [1, 4, 16]>], [#iree_vector_ext.nested_layout<subgroups_per_workgroup = [1, 2, 2], batches_per_subgroup = [1, 2, 4], outers_per_batch = [1, 1, 1], threads_per_outer = [1, 4, 16], elements_per_thread = [1, 4, 1], element_order = [0, 2, 1], subgroup_basis = [1, 2, 2, 1], subgroup_active_ids = [true, true, true, false], thread_basis = [1, 4, 16]>]]}
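// annotation: decoding the accumulator nested_layout for the 64x128 workgroup
// tile: M = 2 subgroups * 2 batches * 4 threads * 4 elements = 64 and
// N = 2 subgroups * 4 batches * 16 threads * 1 element = 128; element_order =
// [0, 2, 1] matches the vector<1x4x1> <-> vector<1x1x4> transposes around the
// shared-memory accesses.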
%subview_5 = memref.subview %3[%workgroup_id_z, %4, %5] [1, %6, 128] [1, 1, 1] : memref<64x968x1280xf16, #hal.descriptor_type<storage_buffer>> to memref<1x?x128xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%8:7 = affine.delinearize_index %0 into (%c1, %c2, %c2, %c1, %c1, %c4, %c16) : index, index, index, index, index, index, index
%9 = affine.apply affine_map<()[s0, s1] -> (s0 * 32 + s1 * 4)>()[%8#1, %8#5]
%10 = affine.apply affine_map<()[s0, s1] -> (s0 * 64 + s1)>()[%8#2, %8#6]
%11 = vector.extract %7[0, 0, 0, 0, 0, 0] : vector<1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%12 = vector.transpose %11, [0, 2, 1] : vector<1x1x4xf16> to vector<1x4x1xf16>
vector.transfer_write %12, %subview_5[%c0, %9, %10] {in_bounds = [true, false, true]} : vector<1x4x1xf16>, memref<1x?x128xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%13 = affine.apply affine_map<()[s0, s1] -> (s0 * 32 + s1 * 4)>()[%8#1, %8#5]
%14 = affine.apply affine_map<()[s0, s1] -> (s0 * 64 + s1 + 16)>()[%8#2, %8#6]
%15 = vector.extract %7[0, 0, 1, 0, 0, 0] : vector<1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%16 = vector.transpose %15, [0, 2, 1] : vector<1x1x4xf16> to vector<1x4x1xf16>
vector.transfer_write %16, %subview_5[%c0, %13, %14] {in_bounds = [true, false, true]} : vector<1x4x1xf16>, memref<1x?x128xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%17 = affine.apply affine_map<()[s0, s1] -> (s0 * 32 + s1 * 4)>()[%8#1, %8#5]
%18 = affine.apply affine_map<()[s0, s1] -> (s0 * 64 + s1 + 32)>()[%8#2, %8#6]
%19 = vector.extract %7[0, 0, 2, 0, 0, 0] : vector<1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%20 = vector.transpose %19, [0, 2, 1] : vector<1x1x4xf16> to vector<1x4x1xf16>
vector.transfer_write %20, %subview_5[%c0, %17, %18] {in_bounds = [true, false, true]} : vector<1x4x1xf16>, memref<1x?x128xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%21 = affine.apply affine_map<()[s0, s1] -> (s0 * 32 + s1 * 4)>()[%8#1, %8#5]
%22 = affine.apply affine_map<()[s0, s1] -> (s0 * 64 + s1 + 48)>()[%8#2, %8#6]
%23 = vector.extract %7[0, 0, 3, 0, 0, 0] : vector<1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%24 = vector.transpose %23, [0, 2, 1] : vector<1x1x4xf16> to vector<1x4x1xf16>
vector.transfer_write %24, %subview_5[%c0, %21, %22] {in_bounds = [true, false, true]} : vector<1x4x1xf16>, memref<1x?x128xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%25 = affine.apply affine_map<()[s0, s1] -> (s0 * 32 + s1 * 4 + 16)>()[%8#1, %8#5]
%26 = affine.apply affine_map<()[s0, s1] -> (s0 * 64 + s1)>()[%8#2, %8#6]
%27 = vector.extract %7[0, 1, 0, 0, 0, 0] : vector<1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%28 = vector.transpose %27, [0, 2, 1] : vector<1x1x4xf16> to vector<1x4x1xf16>
vector.transfer_write %28, %subview_5[%c0, %25, %26] {in_bounds = [true, false, true]} : vector<1x4x1xf16>, memref<1x?x128xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%29 = affine.apply affine_map<()[s0, s1] -> (s0 * 32 + s1 * 4 + 16)>()[%8#1, %8#5]
%30 = affine.apply affine_map<()[s0, s1] -> (s0 * 64 + s1 + 16)>()[%8#2, %8#6]
%31 = vector.extract %7[0, 1, 1, 0, 0, 0] : vector<1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%32 = vector.transpose %31, [0, 2, 1] : vector<1x1x4xf16> to vector<1x4x1xf16>
vector.transfer_write %32, %subview_5[%c0, %29, %30] {in_bounds = [true, false, true]} : vector<1x4x1xf16>, memref<1x?x128xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%33 = affine.apply affine_map<()[s0, s1] -> (s0 * 32 + s1 * 4 + 16)>()[%8#1, %8#5]
%34 = affine.apply affine_map<()[s0, s1] -> (s0 * 64 + s1 + 32)>()[%8#2, %8#6]
%35 = vector.extract %7[0, 1, 2, 0, 0, 0] : vector<1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%36 = vector.transpose %35, [0, 2, 1] : vector<1x1x4xf16> to vector<1x4x1xf16>
vector.transfer_write %36, %subview_5[%c0, %33, %34] {in_bounds = [true, false, true]} : vector<1x4x1xf16>, memref<1x?x128xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%37 = affine.apply affine_map<()[s0, s1] -> (s0 * 32 + s1 * 4 + 16)>()[%8#1, %8#5]
%38 = affine.apply affine_map<()[s0, s1] -> (s0 * 64 + s1 + 48)>()[%8#2, %8#6]
%39 = vector.extract %7[0, 1, 3, 0, 0, 0] : vector<1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%40 = vector.transpose %39, [0, 2, 1] : vector<1x1x4xf16> to vector<1x4x1xf16>
vector.transfer_write %40, %subview_5[%c0, %37, %38] {in_bounds = [true, false, true]} : vector<1x4x1xf16>, memref<1x?x128xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
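// annotation: distributed epilogue: each thread transposes and writes its eight
// vector<1x4x1> accumulator slices (2 M-batches x 4 N-batches) to the output
// subview; in_bounds = [true, false, true] guards the possibly partial M tile
// (968 = 15 * 64 + 8, so the last row of workgroups covers only 8 rows).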
memref.dealloc %alloc_3 : memref<1x64x64xf16, #gpu.address_space<workgroup>>
memref.dealloc %alloc : memref<1x64x128xf16, #gpu.address_space<workgroup>>
return
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
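// annotation: canonicalization here mostly folds the unit dimensions out of the
// affine.delinearize_index bases (e.g. (1, 1, 1, 1, 32, 8) becomes (32, 8) and
// the 7-way split becomes a 6-way split); the mfma structure of the loop body
// is unchanged.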
module {
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16() {
%cst = arith.constant dense<0.000000e+00> : vector<1x2x4x1x1x1x1x1x4xf32>
%cst_0 = arith.constant dense<0.000000e+00> : vector<1x4x4x1x1x1x1x1x4xf16>
%c4 = arith.constant 4 : index
%c2 = arith.constant 2 : index
%c16 = arith.constant 16 : index
%c8 = arith.constant 8 : index
%c32 = arith.constant 32 : index
%cst_1 = arith.constant dense<0.000000e+00> : vector<1x2x4x1x1x1x1x1x4xf16>
%c20 = arith.constant 20 : index
%c1 = arith.constant 1 : index
%cst_2 = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%thread_id_x = gpu.thread_id x
%thread_id_y = gpu.thread_id y
%thread_id_z = gpu.thread_id z
%0 = affine.apply affine_map<()[s0, s1, s2] -> (s0 + s1 * 128 + s2 * 256)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%alloc = memref.alloc() : memref<1x64x128xf16, #gpu.address_space<workgroup>>
%alloc_3 = memref.alloc() : memref<1x64x64xf16, #gpu.address_space<workgroup>>
%1 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<64x968x1280xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 64 : memref<64x968x1280xf16, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<64x1280x1280xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 64 : memref<64x1280x1280xf16, #hal.descriptor_type<storage_buffer>>
%3 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<64x968x1280xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %3, 64 : memref<64x968x1280xf16, #hal.descriptor_type<storage_buffer>>
%workgroup_id_z = hal.interface.workgroup.id[2] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y]
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%5 = affine.apply affine_map<()[s0] -> (s0 * 128)>()[%workgroup_id_x]
%6 = affine.min affine_map<()[s0] -> (s0 * -64 + 968, 64)>()[%workgroup_id_y]
%subview = memref.subview %1[%workgroup_id_z, %4, 0] [1, %6, 1280] [1, 1, 1] : memref<64x968x1280xf16, #hal.descriptor_type<storage_buffer>> to memref<1x?x1280xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_4 = memref.subview %2[%workgroup_id_z, 0, %5] [1, 1280, 128] [1, 1, 1] : memref<64x1280x1280xf16, #hal.descriptor_type<storage_buffer>> to memref<1x1280x128xf16, strided<[1638400, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%7 = scf.for %arg0 = %c0 to %c20 step %c1 iter_args(%arg1 = %cst_1) -> (vector<1x2x4x1x1x1x1x1x4xf16>) {
%41 = affine.delinearize_index %arg0 into (%c20) : index
%42 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%41]
%subview_6 = memref.subview %subview[0, 0, %42] [1, %6, 64] [1, 1, 1] : memref<1x?x1280xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x?x64xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%43:2 = affine.delinearize_index %0 into (%c32, %c8) : index, index
%44 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%43#1]
%45 = vector.transfer_read %subview_6[%c0, %43#0, %44], %cst_2 {in_bounds = [true, false, true]} : memref<1x?x64xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<1x1x8xf16>
%46 = affine.apply affine_map<()[s0] -> (s0 + 32)>()[%43#0]
%47 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%43#1]
%48 = vector.transfer_read %subview_6[%c0, %46, %47], %cst_2 {in_bounds = [true, false, true]} : memref<1x?x64xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<1x1x8xf16>
%49:2 = affine.delinearize_index %0 into (%c16, %c16) : index, index
%50 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 64)>()[%49#0, %41]
%51 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%49#1]
%52 = vector.transfer_read %subview_4[%c0, %50, %51], %cst_2 {in_bounds = [true, true, true]} : memref<1x1280x128xf16, strided<[1638400, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<1x1x8xf16>
%53 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 64 + 16)>()[%49#0, %41]
%54 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%49#1]
%55 = vector.transfer_read %subview_4[%c0, %53, %54], %cst_2 {in_bounds = [true, true, true]} : memref<1x1280x128xf16, strided<[1638400, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<1x1x8xf16>
%56 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 64 + 32)>()[%49#0, %41]
%57 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%49#1]
%58 = vector.transfer_read %subview_4[%c0, %56, %57], %cst_2 {in_bounds = [true, true, true]} : memref<1x1280x128xf16, strided<[1638400, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<1x1x8xf16>
%59 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 64 + 48)>()[%49#0, %41]
%60 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%49#1]
%61 = vector.transfer_read %subview_4[%c0, %59, %60], %cst_2 {in_bounds = [true, true, true]} : memref<1x1280x128xf16, strided<[1638400, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<1x1x8xf16>
gpu.barrier
%62:2 = affine.delinearize_index %0 into (%c32, %c8) : index, index
%63 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%62#1]
vector.transfer_write %45, %alloc_3[%c0, %62#0, %63] {in_bounds = [true, true, true]} : vector<1x1x8xf16>, memref<1x64x64xf16, #gpu.address_space<workgroup>>
%64 = affine.apply affine_map<()[s0] -> (s0 + 32)>()[%62#0]
%65 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%62#1]
vector.transfer_write %48, %alloc_3[%c0, %64, %65] {in_bounds = [true, true, true]} : vector<1x1x8xf16>, memref<1x64x64xf16, #gpu.address_space<workgroup>>
gpu.barrier
%66:2 = affine.delinearize_index %0 into (%c16, %c16) : index, index
%67 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%66#1]
vector.transfer_write %52, %alloc[%c0, %66#0, %67] {in_bounds = [true, true, true]} : vector<1x1x8xf16>, memref<1x64x128xf16, #gpu.address_space<workgroup>>
%68 = affine.apply affine_map<()[s0] -> (s0 + 16)>()[%66#0]
%69 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%66#1]
vector.transfer_write %55, %alloc[%c0, %68, %69] {in_bounds = [true, true, true]} : vector<1x1x8xf16>, memref<1x64x128xf16, #gpu.address_space<workgroup>>
%70 = affine.apply affine_map<()[s0] -> (s0 + 32)>()[%66#0]
%71 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%66#1]
vector.transfer_write %58, %alloc[%c0, %70, %71] {in_bounds = [true, true, true]} : vector<1x1x8xf16>, memref<1x64x128xf16, #gpu.address_space<workgroup>>
%72 = affine.apply affine_map<()[s0] -> (s0 + 48)>()[%66#0]
%73 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%66#1]
vector.transfer_write %61, %alloc[%c0, %72, %73] {in_bounds = [true, true, true]} : vector<1x1x8xf16>, memref<1x64x128xf16, #gpu.address_space<workgroup>>
gpu.barrier
%74:6 = affine.delinearize_index %0 into (%c2, %c2, %c1, %c1, %c4, %c16) : index, index, index, index, index, index
%75 = affine.apply affine_map<()[s0, s1] -> (s0 * 32 + s1)>()[%74#0, %74#5]
%76 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%74#4]
%77 = vector.transfer_read %alloc_3[%c0, %75, %76], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x64xf16, #gpu.address_space<workgroup>>, vector<1x1x4xf16>
%78 = vector.insert_strided_slice %77, %cst_1 {offsets = [0, 0, 0, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x2x4x1x1x1x1x1x4xf16>
%79 = affine.apply affine_map<()[s0, s1] -> (s0 * 32 + s1)>()[%74#0, %74#5]
%80 = affine.apply affine_map<()[s0] -> (s0 * 4 + 16)>()[%74#4]
%81 = vector.transfer_read %alloc_3[%c0, %79, %80], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x64xf16, #gpu.address_space<workgroup>>, vector<1x1x4xf16>
%82 = vector.insert_strided_slice %81, %78 {offsets = [0, 0, 1, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x2x4x1x1x1x1x1x4xf16>
%83 = affine.apply affine_map<()[s0, s1] -> (s0 * 32 + s1)>()[%74#0, %74#5]
%84 = affine.apply affine_map<()[s0] -> (s0 * 4 + 32)>()[%74#4]
%85 = vector.transfer_read %alloc_3[%c0, %83, %84], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x64xf16, #gpu.address_space<workgroup>>, vector<1x1x4xf16>
%86 = vector.insert_strided_slice %85, %82 {offsets = [0, 0, 2, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x2x4x1x1x1x1x1x4xf16>
%87 = affine.apply affine_map<()[s0, s1] -> (s0 * 32 + s1)>()[%74#0, %74#5]
%88 = affine.apply affine_map<()[s0] -> (s0 * 4 + 48)>()[%74#4]
%89 = vector.transfer_read %alloc_3[%c0, %87, %88], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x64xf16, #gpu.address_space<workgroup>>, vector<1x1x4xf16>
%90 = vector.insert_strided_slice %89, %86 {offsets = [0, 0, 3, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x2x4x1x1x1x1x1x4xf16>
%91 = affine.apply affine_map<()[s0, s1] -> (s0 * 32 + s1 + 16)>()[%74#0, %74#5]
%92 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%74#4]
%93 = vector.transfer_read %alloc_3[%c0, %91, %92], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x64xf16, #gpu.address_space<workgroup>>, vector<1x1x4xf16>
%94 = vector.insert_strided_slice %93, %90 {offsets = [0, 1, 0, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x2x4x1x1x1x1x1x4xf16>
%95 = affine.apply affine_map<()[s0, s1] -> (s0 * 32 + s1 + 16)>()[%74#0, %74#5]
%96 = affine.apply affine_map<()[s0] -> (s0 * 4 + 16)>()[%74#4]
%97 = vector.transfer_read %alloc_3[%c0, %95, %96], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x64xf16, #gpu.address_space<workgroup>>, vector<1x1x4xf16>
%98 = vector.insert_strided_slice %97, %94 {offsets = [0, 1, 1, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x2x4x1x1x1x1x1x4xf16>
%99 = affine.apply affine_map<()[s0, s1] -> (s0 * 32 + s1 + 16)>()[%74#0, %74#5]
%100 = affine.apply affine_map<()[s0] -> (s0 * 4 + 32)>()[%74#4]
%101 = vector.transfer_read %alloc_3[%c0, %99, %100], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x64xf16, #gpu.address_space<workgroup>>, vector<1x1x4xf16>
%102 = vector.insert_strided_slice %101, %98 {offsets = [0, 1, 2, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x2x4x1x1x1x1x1x4xf16>
%103 = affine.apply affine_map<()[s0, s1] -> (s0 * 32 + s1 + 16)>()[%74#0, %74#5]
%104 = affine.apply affine_map<()[s0] -> (s0 * 4 + 48)>()[%74#4]
%105 = vector.transfer_read %alloc_3[%c0, %103, %104], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x64xf16, #gpu.address_space<workgroup>>, vector<1x1x4xf16>
%106 = vector.insert_strided_slice %105, %102 {offsets = [0, 1, 3, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x2x4x1x1x1x1x1x4xf16>
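// Load the RHS (B) tile from workgroup memory %alloc into registers. Each
// read is a vector<1x4x1xf16> column (4 consecutive K rows, one N column) at
// row %107#4 * 4 + {0, 16, 32, 48} and column %107#1 * 64 + %107#5 +
// {0, 16, 32, 48}; the vector.transpose to 1x1x4 realizes the layout's
// element_order = [0, 2, 1], giving a 4 K-batch x 4 N-batch
// vector<1x4x4x1x1x1x1x1x4xf16> tile.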
%107:6 = affine.delinearize_index %0 into (%c2, %c2, %c1, %c1, %c4, %c16) : index, index, index, index, index, index
%108 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%107#4]
%109 = affine.apply affine_map<()[s0, s1] -> (s0 * 64 + s1)>()[%107#1, %107#5]
%110 = vector.transfer_read %alloc[%c0, %108, %109], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x128xf16, #gpu.address_space<workgroup>>, vector<1x4x1xf16>
%111 = vector.transpose %110, [0, 2, 1] : vector<1x4x1xf16> to vector<1x1x4xf16>
%112 = vector.insert_strided_slice %111, %cst_0 {offsets = [0, 0, 0, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x4x4x1x1x1x1x1x4xf16>
%113 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%107#4]
%114 = affine.apply affine_map<()[s0, s1] -> (s0 * 64 + s1 + 16)>()[%107#1, %107#5]
%115 = vector.transfer_read %alloc[%c0, %113, %114], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x128xf16, #gpu.address_space<workgroup>>, vector<1x4x1xf16>
%116 = vector.transpose %115, [0, 2, 1] : vector<1x4x1xf16> to vector<1x1x4xf16>
%117 = vector.insert_strided_slice %116, %112 {offsets = [0, 0, 1, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x4x4x1x1x1x1x1x4xf16>
%118 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%107#4]
%119 = affine.apply affine_map<()[s0, s1] -> (s0 * 64 + s1 + 32)>()[%107#1, %107#5]
%120 = vector.transfer_read %alloc[%c0, %118, %119], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x128xf16, #gpu.address_space<workgroup>>, vector<1x4x1xf16>
%121 = vector.transpose %120, [0, 2, 1] : vector<1x4x1xf16> to vector<1x1x4xf16>
%122 = vector.insert_strided_slice %121, %117 {offsets = [0, 0, 2, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x4x4x1x1x1x1x1x4xf16>
%123 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%107#4]
%124 = affine.apply affine_map<()[s0, s1] -> (s0 * 64 + s1 + 48)>()[%107#1, %107#5]
%125 = vector.transfer_read %alloc[%c0, %123, %124], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x128xf16, #gpu.address_space<workgroup>>, vector<1x4x1xf16>
%126 = vector.transpose %125, [0, 2, 1] : vector<1x4x1xf16> to vector<1x1x4xf16>
%127 = vector.insert_strided_slice %126, %122 {offsets = [0, 0, 3, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x4x4x1x1x1x1x1x4xf16>
%128 = affine.apply affine_map<()[s0] -> (s0 * 4 + 16)>()[%107#4]
%129 = affine.apply affine_map<()[s0, s1] -> (s0 * 64 + s1)>()[%107#1, %107#5]
%130 = vector.transfer_read %alloc[%c0, %128, %129], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x128xf16, #gpu.address_space<workgroup>>, vector<1x4x1xf16>
%131 = vector.transpose %130, [0, 2, 1] : vector<1x4x1xf16> to vector<1x1x4xf16>
%132 = vector.insert_strided_slice %131, %127 {offsets = [0, 1, 0, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x4x4x1x1x1x1x1x4xf16>
%133 = affine.apply affine_map<()[s0] -> (s0 * 4 + 16)>()[%107#4]
%134 = affine.apply affine_map<()[s0, s1] -> (s0 * 64 + s1 + 16)>()[%107#1, %107#5]
%135 = vector.transfer_read %alloc[%c0, %133, %134], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x128xf16, #gpu.address_space<workgroup>>, vector<1x4x1xf16>
%136 = vector.transpose %135, [0, 2, 1] : vector<1x4x1xf16> to vector<1x1x4xf16>
%137 = vector.insert_strided_slice %136, %132 {offsets = [0, 1, 1, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x4x4x1x1x1x1x1x4xf16>
%138 = affine.apply affine_map<()[s0] -> (s0 * 4 + 16)>()[%107#4]
%139 = affine.apply affine_map<()[s0, s1] -> (s0 * 64 + s1 + 32)>()[%107#1, %107#5]
%140 = vector.transfer_read %alloc[%c0, %138, %139], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x128xf16, #gpu.address_space<workgroup>>, vector<1x4x1xf16>
%141 = vector.transpose %140, [0, 2, 1] : vector<1x4x1xf16> to vector<1x1x4xf16>
%142 = vector.insert_strided_slice %141, %137 {offsets = [0, 1, 2, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x4x4x1x1x1x1x1x4xf16>
%143 = affine.apply affine_map<()[s0] -> (s0 * 4 + 16)>()[%107#4]
%144 = affine.apply affine_map<()[s0, s1] -> (s0 * 64 + s1 + 48)>()[%107#1, %107#5]
%145 = vector.transfer_read %alloc[%c0, %143, %144], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x128xf16, #gpu.address_space<workgroup>>, vector<1x4x1xf16>
%146 = vector.transpose %145, [0, 2, 1] : vector<1x4x1xf16> to vector<1x1x4xf16>
%147 = vector.insert_strided_slice %146, %142 {offsets = [0, 1, 3, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x4x4x1x1x1x1x1x4xf16>
%148 = affine.apply affine_map<()[s0] -> (s0 * 4 + 32)>()[%107#4]
%149 = affine.apply affine_map<()[s0, s1] -> (s0 * 64 + s1)>()[%107#1, %107#5]
%150 = vector.transfer_read %alloc[%c0, %148, %149], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x128xf16, #gpu.address_space<workgroup>>, vector<1x4x1xf16>
%151 = vector.transpose %150, [0, 2, 1] : vector<1x4x1xf16> to vector<1x1x4xf16>
%152 = vector.insert_strided_slice %151, %147 {offsets = [0, 2, 0, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x4x4x1x1x1x1x1x4xf16>
%153 = affine.apply affine_map<()[s0] -> (s0 * 4 + 32)>()[%107#4]
%154 = affine.apply affine_map<()[s0, s1] -> (s0 * 64 + s1 + 16)>()[%107#1, %107#5]
%155 = vector.transfer_read %alloc[%c0, %153, %154], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x128xf16, #gpu.address_space<workgroup>>, vector<1x4x1xf16>
%156 = vector.transpose %155, [0, 2, 1] : vector<1x4x1xf16> to vector<1x1x4xf16>
%157 = vector.insert_strided_slice %156, %152 {offsets = [0, 2, 1, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x4x4x1x1x1x1x1x4xf16>
%158 = affine.apply affine_map<()[s0] -> (s0 * 4 + 32)>()[%107#4]
%159 = affine.apply affine_map<()[s0, s1] -> (s0 * 64 + s1 + 32)>()[%107#1, %107#5]
%160 = vector.transfer_read %alloc[%c0, %158, %159], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x128xf16, #gpu.address_space<workgroup>>, vector<1x4x1xf16>
%161 = vector.transpose %160, [0, 2, 1] : vector<1x4x1xf16> to vector<1x1x4xf16>
%162 = vector.insert_strided_slice %161, %157 {offsets = [0, 2, 2, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x4x4x1x1x1x1x1x4xf16>
%163 = affine.apply affine_map<()[s0] -> (s0 * 4 + 32)>()[%107#4]
%164 = affine.apply affine_map<()[s0, s1] -> (s0 * 64 + s1 + 48)>()[%107#1, %107#5]
%165 = vector.transfer_read %alloc[%c0, %163, %164], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x128xf16, #gpu.address_space<workgroup>>, vector<1x4x1xf16>
%166 = vector.transpose %165, [0, 2, 1] : vector<1x4x1xf16> to vector<1x1x4xf16>
%167 = vector.insert_strided_slice %166, %162 {offsets = [0, 2, 3, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x4x4x1x1x1x1x1x4xf16>
%168 = affine.apply affine_map<()[s0] -> (s0 * 4 + 48)>()[%107#4]
%169 = affine.apply affine_map<()[s0, s1] -> (s0 * 64 + s1)>()[%107#1, %107#5]
%170 = vector.transfer_read %alloc[%c0, %168, %169], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x128xf16, #gpu.address_space<workgroup>>, vector<1x4x1xf16>
%171 = vector.transpose %170, [0, 2, 1] : vector<1x4x1xf16> to vector<1x1x4xf16>
%172 = vector.insert_strided_slice %171, %167 {offsets = [0, 3, 0, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x4x4x1x1x1x1x1x4xf16>
%173 = affine.apply affine_map<()[s0] -> (s0 * 4 + 48)>()[%107#4]
%174 = affine.apply affine_map<()[s0, s1] -> (s0 * 64 + s1 + 16)>()[%107#1, %107#5]
%175 = vector.transfer_read %alloc[%c0, %173, %174], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x128xf16, #gpu.address_space<workgroup>>, vector<1x4x1xf16>
%176 = vector.transpose %175, [0, 2, 1] : vector<1x4x1xf16> to vector<1x1x4xf16>
%177 = vector.insert_strided_slice %176, %172 {offsets = [0, 3, 1, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x4x4x1x1x1x1x1x4xf16>
%178 = affine.apply affine_map<()[s0] -> (s0 * 4 + 48)>()[%107#4]
%179 = affine.apply affine_map<()[s0, s1] -> (s0 * 64 + s1 + 32)>()[%107#1, %107#5]
%180 = vector.transfer_read %alloc[%c0, %178, %179], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x128xf16, #gpu.address_space<workgroup>>, vector<1x4x1xf16>
%181 = vector.transpose %180, [0, 2, 1] : vector<1x4x1xf16> to vector<1x1x4xf16>
%182 = vector.insert_strided_slice %181, %177 {offsets = [0, 3, 2, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x4x4x1x1x1x1x1x4xf16>
%183 = affine.apply affine_map<()[s0] -> (s0 * 4 + 48)>()[%107#4]
%184 = affine.apply affine_map<()[s0, s1] -> (s0 * 64 + s1 + 48)>()[%107#1, %107#5]
%185 = vector.transfer_read %alloc[%c0, %183, %184], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x128xf16, #gpu.address_space<workgroup>>, vector<1x4x1xf16>
%186 = vector.transpose %185, [0, 2, 1] : vector<1x4x1xf16> to vector<1x1x4xf16>
%187 = vector.insert_strided_slice %186, %182 {offsets = [0, 3, 3, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x4x4x1x1x1x1x1x4xf16>
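// Multiply-accumulate: the carried f16 accumulator %arg1 is extended to f32
// (%188), and for each of the 2 x 4 (M x N) accumulator fragments a chain of
// four amdgpu.mfma 16x16x16 f16->f32 ops reduces over the 4 K-batches,
// seeded with the corresponding f32 fragment.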
%188 = arith.extf %arg1 : vector<1x2x4x1x1x1x1x1x4xf16> to vector<1x2x4x1x1x1x1x1x4xf32>
%189 = vector.extract %188[0, 0, 0] : vector<1x1x1x1x1x4xf32> from vector<1x2x4x1x1x1x1x1x4xf32>
%190 = vector.extract %106[0, 0, 0] : vector<1x1x1x1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%191 = vector.extract %187[0, 0, 0] : vector<1x1x1x1x1x4xf16> from vector<1x4x4x1x1x1x1x1x4xf16>
%192 = vector.shape_cast %190 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%193 = vector.shape_cast %191 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%194 = vector.shape_cast %189 : vector<1x1x1x1x1x4xf32> to vector<4xf32>
%195 = amdgpu.mfma %192 * %193 + %194 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%196 = vector.extract %106[0, 0, 1] : vector<1x1x1x1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%197 = vector.extract %187[0, 1, 0] : vector<1x1x1x1x1x4xf16> from vector<1x4x4x1x1x1x1x1x4xf16>
%198 = vector.shape_cast %196 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%199 = vector.shape_cast %197 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%200 = amdgpu.mfma %198 * %199 + %195 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%201 = vector.extract %106[0, 0, 2] : vector<1x1x1x1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%202 = vector.extract %187[0, 2, 0] : vector<1x1x1x1x1x4xf16> from vector<1x4x4x1x1x1x1x1x4xf16>
%203 = vector.shape_cast %201 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%204 = vector.shape_cast %202 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%205 = amdgpu.mfma %203 * %204 + %200 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%206 = vector.extract %106[0, 0, 3] : vector<1x1x1x1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%207 = vector.extract %187[0, 3, 0] : vector<1x1x1x1x1x4xf16> from vector<1x4x4x1x1x1x1x1x4xf16>
%208 = vector.shape_cast %206 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%209 = vector.shape_cast %207 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%210 = amdgpu.mfma %208 * %209 + %205 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%211 = vector.shape_cast %210 : vector<4xf32> to vector<1x1x1x1x1x4xf32>
%212 = vector.insert %211, %cst [0, 0, 0] : vector<1x1x1x1x1x4xf32> into vector<1x2x4x1x1x1x1x1x4xf32>
%213 = vector.extract %188[0, 0, 1] : vector<1x1x1x1x1x4xf32> from vector<1x2x4x1x1x1x1x1x4xf32>
%214 = vector.extract %106[0, 0, 0] : vector<1x1x1x1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%215 = vector.extract %187[0, 0, 1] : vector<1x1x1x1x1x4xf16> from vector<1x4x4x1x1x1x1x1x4xf16>
%216 = vector.shape_cast %214 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%217 = vector.shape_cast %215 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%218 = vector.shape_cast %213 : vector<1x1x1x1x1x4xf32> to vector<4xf32>
%219 = amdgpu.mfma %216 * %217 + %218 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%220 = vector.extract %106[0, 0, 1] : vector<1x1x1x1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%221 = vector.extract %187[0, 1, 1] : vector<1x1x1x1x1x4xf16> from vector<1x4x4x1x1x1x1x1x4xf16>
%222 = vector.shape_cast %220 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%223 = vector.shape_cast %221 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%224 = amdgpu.mfma %222 * %223 + %219 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%225 = vector.extract %106[0, 0, 2] : vector<1x1x1x1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%226 = vector.extract %187[0, 2, 1] : vector<1x1x1x1x1x4xf16> from vector<1x4x4x1x1x1x1x1x4xf16>
%227 = vector.shape_cast %225 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%228 = vector.shape_cast %226 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%229 = amdgpu.mfma %227 * %228 + %224 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%230 = vector.extract %106[0, 0, 3] : vector<1x1x1x1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%231 = vector.extract %187[0, 3, 1] : vector<1x1x1x1x1x4xf16> from vector<1x4x4x1x1x1x1x1x4xf16>
%232 = vector.shape_cast %230 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%233 = vector.shape_cast %231 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%234 = amdgpu.mfma %232 * %233 + %229 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%235 = vector.shape_cast %234 : vector<4xf32> to vector<1x1x1x1x1x4xf32>
%236 = vector.insert %235, %212 [0, 0, 1] : vector<1x1x1x1x1x4xf32> into vector<1x2x4x1x1x1x1x1x4xf32>
%237 = vector.extract %188[0, 0, 2] : vector<1x1x1x1x1x4xf32> from vector<1x2x4x1x1x1x1x1x4xf32>
%238 = vector.extract %106[0, 0, 0] : vector<1x1x1x1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%239 = vector.extract %187[0, 0, 2] : vector<1x1x1x1x1x4xf16> from vector<1x4x4x1x1x1x1x1x4xf16>
%240 = vector.shape_cast %238 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%241 = vector.shape_cast %239 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%242 = vector.shape_cast %237 : vector<1x1x1x1x1x4xf32> to vector<4xf32>
%243 = amdgpu.mfma %240 * %241 + %242 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%244 = vector.extract %106[0, 0, 1] : vector<1x1x1x1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%245 = vector.extract %187[0, 1, 2] : vector<1x1x1x1x1x4xf16> from vector<1x4x4x1x1x1x1x1x4xf16>
%246 = vector.shape_cast %244 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%247 = vector.shape_cast %245 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%248 = amdgpu.mfma %246 * %247 + %243 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%249 = vector.extract %106[0, 0, 2] : vector<1x1x1x1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%250 = vector.extract %187[0, 2, 2] : vector<1x1x1x1x1x4xf16> from vector<1x4x4x1x1x1x1x1x4xf16>
%251 = vector.shape_cast %249 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%252 = vector.shape_cast %250 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%253 = amdgpu.mfma %251 * %252 + %248 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%254 = vector.extract %106[0, 0, 3] : vector<1x1x1x1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%255 = vector.extract %187[0, 3, 2] : vector<1x1x1x1x1x4xf16> from vector<1x4x4x1x1x1x1x1x4xf16>
%256 = vector.shape_cast %254 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%257 = vector.shape_cast %255 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%258 = amdgpu.mfma %256 * %257 + %253 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%259 = vector.shape_cast %258 : vector<4xf32> to vector<1x1x1x1x1x4xf32>
%260 = vector.insert %259, %236 [0, 0, 2] : vector<1x1x1x1x1x4xf32> into vector<1x2x4x1x1x1x1x1x4xf32>
%261 = vector.extract %188[0, 0, 3] : vector<1x1x1x1x1x4xf32> from vector<1x2x4x1x1x1x1x1x4xf32>
%262 = vector.extract %106[0, 0, 0] : vector<1x1x1x1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%263 = vector.extract %187[0, 0, 3] : vector<1x1x1x1x1x4xf16> from vector<1x4x4x1x1x1x1x1x4xf16>
%264 = vector.shape_cast %262 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%265 = vector.shape_cast %263 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%266 = vector.shape_cast %261 : vector<1x1x1x1x1x4xf32> to vector<4xf32>
%267 = amdgpu.mfma %264 * %265 + %266 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%268 = vector.extract %106[0, 0, 1] : vector<1x1x1x1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%269 = vector.extract %187[0, 1, 3] : vector<1x1x1x1x1x4xf16> from vector<1x4x4x1x1x1x1x1x4xf16>
%270 = vector.shape_cast %268 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%271 = vector.shape_cast %269 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%272 = amdgpu.mfma %270 * %271 + %267 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%273 = vector.extract %106[0, 0, 2] : vector<1x1x1x1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%274 = vector.extract %187[0, 2, 3] : vector<1x1x1x1x1x4xf16> from vector<1x4x4x1x1x1x1x1x4xf16>
%275 = vector.shape_cast %273 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%276 = vector.shape_cast %274 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%277 = amdgpu.mfma %275 * %276 + %272 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%278 = vector.extract %106[0, 0, 3] : vector<1x1x1x1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%279 = vector.extract %187[0, 3, 3] : vector<1x1x1x1x1x4xf16> from vector<1x4x4x1x1x1x1x1x4xf16>
%280 = vector.shape_cast %278 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%281 = vector.shape_cast %279 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%282 = amdgpu.mfma %280 * %281 + %277 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%283 = vector.shape_cast %282 : vector<4xf32> to vector<1x1x1x1x1x4xf32>
%284 = vector.insert %283, %260 [0, 0, 3] : vector<1x1x1x1x1x4xf32> into vector<1x2x4x1x1x1x1x1x4xf32>
%285 = vector.extract %188[0, 1, 0] : vector<1x1x1x1x1x4xf32> from vector<1x2x4x1x1x1x1x1x4xf32>
%286 = vector.extract %106[0, 1, 0] : vector<1x1x1x1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%287 = vector.extract %187[0, 0, 0] : vector<1x1x1x1x1x4xf16> from vector<1x4x4x1x1x1x1x1x4xf16>
%288 = vector.shape_cast %286 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%289 = vector.shape_cast %287 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%290 = vector.shape_cast %285 : vector<1x1x1x1x1x4xf32> to vector<4xf32>
%291 = amdgpu.mfma %288 * %289 + %290 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%292 = vector.extract %106[0, 1, 1] : vector<1x1x1x1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%293 = vector.extract %187[0, 1, 0] : vector<1x1x1x1x1x4xf16> from vector<1x4x4x1x1x1x1x1x4xf16>
%294 = vector.shape_cast %292 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%295 = vector.shape_cast %293 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%296 = amdgpu.mfma %294 * %295 + %291 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%297 = vector.extract %106[0, 1, 2] : vector<1x1x1x1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%298 = vector.extract %187[0, 2, 0] : vector<1x1x1x1x1x4xf16> from vector<1x4x4x1x1x1x1x1x4xf16>
%299 = vector.shape_cast %297 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%300 = vector.shape_cast %298 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%301 = amdgpu.mfma %299 * %300 + %296 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%302 = vector.extract %106[0, 1, 3] : vector<1x1x1x1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%303 = vector.extract %187[0, 3, 0] : vector<1x1x1x1x1x4xf16> from vector<1x4x4x1x1x1x1x1x4xf16>
%304 = vector.shape_cast %302 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%305 = vector.shape_cast %303 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%306 = amdgpu.mfma %304 * %305 + %301 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%307 = vector.shape_cast %306 : vector<4xf32> to vector<1x1x1x1x1x4xf32>
%308 = vector.insert %307, %284 [0, 1, 0] : vector<1x1x1x1x1x4xf32> into vector<1x2x4x1x1x1x1x1x4xf32>
%309 = vector.extract %188[0, 1, 1] : vector<1x1x1x1x1x4xf32> from vector<1x2x4x1x1x1x1x1x4xf32>
%310 = vector.extract %106[0, 1, 0] : vector<1x1x1x1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%311 = vector.extract %187[0, 0, 1] : vector<1x1x1x1x1x4xf16> from vector<1x4x4x1x1x1x1x1x4xf16>
%312 = vector.shape_cast %310 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%313 = vector.shape_cast %311 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%314 = vector.shape_cast %309 : vector<1x1x1x1x1x4xf32> to vector<4xf32>
%315 = amdgpu.mfma %312 * %313 + %314 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%316 = vector.extract %106[0, 1, 1] : vector<1x1x1x1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%317 = vector.extract %187[0, 1, 1] : vector<1x1x1x1x1x4xf16> from vector<1x4x4x1x1x1x1x1x4xf16>
%318 = vector.shape_cast %316 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%319 = vector.shape_cast %317 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%320 = amdgpu.mfma %318 * %319 + %315 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%321 = vector.extract %106[0, 1, 2] : vector<1x1x1x1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%322 = vector.extract %187[0, 2, 1] : vector<1x1x1x1x1x4xf16> from vector<1x4x4x1x1x1x1x1x4xf16>
%323 = vector.shape_cast %321 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%324 = vector.shape_cast %322 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%325 = amdgpu.mfma %323 * %324 + %320 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%326 = vector.extract %106[0, 1, 3] : vector<1x1x1x1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%327 = vector.extract %187[0, 3, 1] : vector<1x1x1x1x1x4xf16> from vector<1x4x4x1x1x1x1x1x4xf16>
%328 = vector.shape_cast %326 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%329 = vector.shape_cast %327 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%330 = amdgpu.mfma %328 * %329 + %325 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%331 = vector.shape_cast %330 : vector<4xf32> to vector<1x1x1x1x1x4xf32>
%332 = vector.insert %331, %308 [0, 1, 1] : vector<1x1x1x1x1x4xf32> into vector<1x2x4x1x1x1x1x1x4xf32>
%333 = vector.extract %188[0, 1, 2] : vector<1x1x1x1x1x4xf32> from vector<1x2x4x1x1x1x1x1x4xf32>
%334 = vector.extract %106[0, 1, 0] : vector<1x1x1x1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%335 = vector.extract %187[0, 0, 2] : vector<1x1x1x1x1x4xf16> from vector<1x4x4x1x1x1x1x1x4xf16>
%336 = vector.shape_cast %334 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%337 = vector.shape_cast %335 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%338 = vector.shape_cast %333 : vector<1x1x1x1x1x4xf32> to vector<4xf32>
%339 = amdgpu.mfma %336 * %337 + %338 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%340 = vector.extract %106[0, 1, 1] : vector<1x1x1x1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%341 = vector.extract %187[0, 1, 2] : vector<1x1x1x1x1x4xf16> from vector<1x4x4x1x1x1x1x1x4xf16>
%342 = vector.shape_cast %340 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%343 = vector.shape_cast %341 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%344 = amdgpu.mfma %342 * %343 + %339 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%345 = vector.extract %106[0, 1, 2] : vector<1x1x1x1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%346 = vector.extract %187[0, 2, 2] : vector<1x1x1x1x1x4xf16> from vector<1x4x4x1x1x1x1x1x4xf16>
%347 = vector.shape_cast %345 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%348 = vector.shape_cast %346 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%349 = amdgpu.mfma %347 * %348 + %344 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%350 = vector.extract %106[0, 1, 3] : vector<1x1x1x1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%351 = vector.extract %187[0, 3, 2] : vector<1x1x1x1x1x4xf16> from vector<1x4x4x1x1x1x1x1x4xf16>
%352 = vector.shape_cast %350 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%353 = vector.shape_cast %351 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%354 = amdgpu.mfma %352 * %353 + %349 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%355 = vector.shape_cast %354 : vector<4xf32> to vector<1x1x1x1x1x4xf32>
%356 = vector.insert %355, %332 [0, 1, 2] : vector<1x1x1x1x1x4xf32> into vector<1x2x4x1x1x1x1x1x4xf32>
%357 = vector.extract %188[0, 1, 3] : vector<1x1x1x1x1x4xf32> from vector<1x2x4x1x1x1x1x1x4xf32>
%358 = vector.extract %106[0, 1, 0] : vector<1x1x1x1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%359 = vector.extract %187[0, 0, 3] : vector<1x1x1x1x1x4xf16> from vector<1x4x4x1x1x1x1x1x4xf16>
%360 = vector.shape_cast %358 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%361 = vector.shape_cast %359 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%362 = vector.shape_cast %357 : vector<1x1x1x1x1x4xf32> to vector<4xf32>
%363 = amdgpu.mfma %360 * %361 + %362 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%364 = vector.extract %106[0, 1, 1] : vector<1x1x1x1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%365 = vector.extract %187[0, 1, 3] : vector<1x1x1x1x1x4xf16> from vector<1x4x4x1x1x1x1x1x4xf16>
%366 = vector.shape_cast %364 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%367 = vector.shape_cast %365 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%368 = amdgpu.mfma %366 * %367 + %363 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%369 = vector.extract %106[0, 1, 2] : vector<1x1x1x1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%370 = vector.extract %187[0, 2, 3] : vector<1x1x1x1x1x4xf16> from vector<1x4x4x1x1x1x1x1x4xf16>
%371 = vector.shape_cast %369 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%372 = vector.shape_cast %370 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%373 = amdgpu.mfma %371 * %372 + %368 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%374 = vector.extract %106[0, 1, 3] : vector<1x1x1x1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%375 = vector.extract %187[0, 3, 3] : vector<1x1x1x1x1x4xf16> from vector<1x4x4x1x1x1x1x1x4xf16>
%376 = vector.shape_cast %374 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%377 = vector.shape_cast %375 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%378 = amdgpu.mfma %376 * %377 + %373 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%379 = vector.shape_cast %378 : vector<4xf32> to vector<1x1x1x1x1x4xf32>
%380 = vector.insert %379, %356 [0, 1, 3] : vector<1x1x1x1x1x4xf32> into vector<1x2x4x1x1x1x1x1x4xf32>
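// The f32 results are truncated back to f16 and yielded: the loop carries
// the accumulator in f16 and re-extends it to f32 at the top of each step.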
%381 = arith.truncf %380 : vector<1x2x4x1x1x1x1x1x4xf32> to vector<1x2x4x1x1x1x1x1x4xf16>
scf.yield %381 : vector<1x2x4x1x1x1x1x1x4xf16>
} {__vector_layout_fetcher_storage = [[unit, unit, unit, #iree_vector_ext.nested_layout<subgroups_per_workgroup = [1, 2, 2], batches_per_subgroup = [1, 2, 4], outers_per_batch = [1, 1, 1], threads_per_outer = [1, 4, 16], elements_per_thread = [1, 4, 1], element_order = [0, 2, 1], subgroup_basis = [1, 2, 2, 1], subgroup_active_ids = [true, true, true, false], thread_basis = [1, 4, 16]>], [#iree_vector_ext.nested_layout<subgroups_per_workgroup = [1, 2, 2], batches_per_subgroup = [1, 2, 4], outers_per_batch = [1, 1, 1], threads_per_outer = [1, 4, 16], elements_per_thread = [1, 4, 1], element_order = [0, 2, 1], subgroup_basis = [1, 2, 2, 1], subgroup_active_ids = [true, true, true, false], thread_basis = [1, 4, 16]>]]}
%subview_5 = memref.subview %3[%workgroup_id_z, %4, %5] [1, %6, 128] [1, 1, 1] : memref<64x968x1280xf16, #hal.descriptor_type<storage_buffer>> to memref<1x?x128xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
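// Write-back of the loop result %7: each of the 2 x 4 accumulator fragments
// (vector<1x1x4xf16>) is transposed to 1x4x1 and stored to the output
// subview at row %8#0 * 32 + %8#4 * 4 (+16) and column %8#1 * 64 + %8#5 +
// {0, 16, 32, 48}. The row dimension is masked (in_bounds = [true, false,
// true]) because the 64-row M tile may be clipped at the 968-row edge.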
%8:6 = affine.delinearize_index %0 into (%c2, %c2, %c1, %c1, %c4, %c16) : index, index, index, index, index, index
%9 = affine.apply affine_map<()[s0, s1] -> (s0 * 32 + s1 * 4)>()[%8#0, %8#4]
%10 = affine.apply affine_map<()[s0, s1] -> (s0 * 64 + s1)>()[%8#1, %8#5]
%11 = vector.extract %7[0, 0, 0, 0, 0, 0] : vector<1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%12 = vector.transpose %11, [0, 2, 1] : vector<1x1x4xf16> to vector<1x4x1xf16>
vector.transfer_write %12, %subview_5[%c0, %9, %10] {in_bounds = [true, false, true]} : vector<1x4x1xf16>, memref<1x?x128xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%13 = affine.apply affine_map<()[s0, s1] -> (s0 * 32 + s1 * 4)>()[%8#0, %8#4]
%14 = affine.apply affine_map<()[s0, s1] -> (s0 * 64 + s1 + 16)>()[%8#1, %8#5]
%15 = vector.extract %7[0, 0, 1, 0, 0, 0] : vector<1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%16 = vector.transpose %15, [0, 2, 1] : vector<1x1x4xf16> to vector<1x4x1xf16>
vector.transfer_write %16, %subview_5[%c0, %13, %14] {in_bounds = [true, false, true]} : vector<1x4x1xf16>, memref<1x?x128xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%17 = affine.apply affine_map<()[s0, s1] -> (s0 * 32 + s1 * 4)>()[%8#0, %8#4]
%18 = affine.apply affine_map<()[s0, s1] -> (s0 * 64 + s1 + 32)>()[%8#1, %8#5]
%19 = vector.extract %7[0, 0, 2, 0, 0, 0] : vector<1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%20 = vector.transpose %19, [0, 2, 1] : vector<1x1x4xf16> to vector<1x4x1xf16>
vector.transfer_write %20, %subview_5[%c0, %17, %18] {in_bounds = [true, false, true]} : vector<1x4x1xf16>, memref<1x?x128xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%21 = affine.apply affine_map<()[s0, s1] -> (s0 * 32 + s1 * 4)>()[%8#0, %8#4]
%22 = affine.apply affine_map<()[s0, s1] -> (s0 * 64 + s1 + 48)>()[%8#1, %8#5]
%23 = vector.extract %7[0, 0, 3, 0, 0, 0] : vector<1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%24 = vector.transpose %23, [0, 2, 1] : vector<1x1x4xf16> to vector<1x4x1xf16>
vector.transfer_write %24, %subview_5[%c0, %21, %22] {in_bounds = [true, false, true]} : vector<1x4x1xf16>, memref<1x?x128xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%25 = affine.apply affine_map<()[s0, s1] -> (s0 * 32 + s1 * 4 + 16)>()[%8#0, %8#4]
%26 = affine.apply affine_map<()[s0, s1] -> (s0 * 64 + s1)>()[%8#1, %8#5]
%27 = vector.extract %7[0, 1, 0, 0, 0, 0] : vector<1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%28 = vector.transpose %27, [0, 2, 1] : vector<1x1x4xf16> to vector<1x4x1xf16>
vector.transfer_write %28, %subview_5[%c0, %25, %26] {in_bounds = [true, false, true]} : vector<1x4x1xf16>, memref<1x?x128xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%29 = affine.apply affine_map<()[s0, s1] -> (s0 * 32 + s1 * 4 + 16)>()[%8#0, %8#4]
%30 = affine.apply affine_map<()[s0, s1] -> (s0 * 64 + s1 + 16)>()[%8#1, %8#5]
%31 = vector.extract %7[0, 1, 1, 0, 0, 0] : vector<1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%32 = vector.transpose %31, [0, 2, 1] : vector<1x1x4xf16> to vector<1x4x1xf16>
vector.transfer_write %32, %subview_5[%c0, %29, %30] {in_bounds = [true, false, true]} : vector<1x4x1xf16>, memref<1x?x128xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%33 = affine.apply affine_map<()[s0, s1] -> (s0 * 32 + s1 * 4 + 16)>()[%8#0, %8#4]
%34 = affine.apply affine_map<()[s0, s1] -> (s0 * 64 + s1 + 32)>()[%8#1, %8#5]
%35 = vector.extract %7[0, 1, 2, 0, 0, 0] : vector<1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%36 = vector.transpose %35, [0, 2, 1] : vector<1x1x4xf16> to vector<1x4x1xf16>
vector.transfer_write %36, %subview_5[%c0, %33, %34] {in_bounds = [true, false, true]} : vector<1x4x1xf16>, memref<1x?x128xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%37 = affine.apply affine_map<()[s0, s1] -> (s0 * 32 + s1 * 4 + 16)>()[%8#0, %8#4]
%38 = affine.apply affine_map<()[s0, s1] -> (s0 * 64 + s1 + 48)>()[%8#1, %8#5]
%39 = vector.extract %7[0, 1, 3, 0, 0, 0] : vector<1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%40 = vector.transpose %39, [0, 2, 1] : vector<1x1x4xf16> to vector<1x4x1xf16>
vector.transfer_write %40, %subview_5[%c0, %37, %38] {in_bounds = [true, false, true]} : vector<1x4x1xf16>, memref<1x?x128xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
memref.dealloc %alloc_3 : memref<1x64x64xf16, #gpu.address_space<workgroup>>
memref.dealloc %alloc : memref<1x64x128xf16, #gpu.address_space<workgroup>>
return
}
}
// -----// IR Dump After CSE (cse) //----- //
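// Same kernel after CSE: the duplicated affine.delinearize_index /
// affine.apply index chains and the repeated vector.extract /
// vector.shape_cast operands feeding the mfma chains have been folded into
// single SSA values.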
module {
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16() {
%cst = arith.constant dense<0.000000e+00> : vector<1x2x4x1x1x1x1x1x4xf32>
%cst_0 = arith.constant dense<0.000000e+00> : vector<1x4x4x1x1x1x1x1x4xf16>
%c4 = arith.constant 4 : index
%c2 = arith.constant 2 : index
%c16 = arith.constant 16 : index
%c8 = arith.constant 8 : index
%c32 = arith.constant 32 : index
%cst_1 = arith.constant dense<0.000000e+00> : vector<1x2x4x1x1x1x1x1x4xf16>
%c20 = arith.constant 20 : index
%c1 = arith.constant 1 : index
%cst_2 = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%thread_id_x = gpu.thread_id x
%thread_id_y = gpu.thread_id y
%thread_id_z = gpu.thread_id z
%0 = affine.apply affine_map<()[s0, s1, s2] -> (s0 + s1 * 128 + s2 * 256)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%alloc = memref.alloc() : memref<1x64x128xf16, #gpu.address_space<workgroup>>
%alloc_3 = memref.alloc() : memref<1x64x64xf16, #gpu.address_space<workgroup>>
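// %0 is the linearized thread id over the 256 workgroup threads (apparently
// 4 waves of 64 lanes). %alloc stages the 64x128 RHS (B) K-slice and
// %alloc_3 the 64x64 LHS (A) K-slice in workgroup memory.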
%1 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<64x968x1280xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 64 : memref<64x968x1280xf16, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<64x1280x1280xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 64 : memref<64x1280x1280xf16, #hal.descriptor_type<storage_buffer>>
%3 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<64x968x1280xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %3, 64 : memref<64x968x1280xf16, #hal.descriptor_type<storage_buffer>>
%workgroup_id_z = hal.interface.workgroup.id[2] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y]
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%5 = affine.apply affine_map<()[s0] -> (s0 * 128)>()[%workgroup_id_x]
%6 = affine.min affine_map<()[s0] -> (s0 * -64 + 968, 64)>()[%workgroup_id_y]
%subview = memref.subview %1[%workgroup_id_z, %4, 0] [1, %6, 1280] [1, 1, 1] : memref<64x968x1280xf16, #hal.descriptor_type<storage_buffer>> to memref<1x?x1280xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_4 = memref.subview %2[%workgroup_id_z, 0, %5] [1, 1280, 128] [1, 1, 1] : memref<64x1280x1280xf16, #hal.descriptor_type<storage_buffer>> to memref<1x1280x128xf16, strided<[1638400, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
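// Main reduction loop: the K dimension (1280) is tiled by 64 and walked in
// 20 iterations, with the f16 accumulator tile carried as %arg1.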
%7 = scf.for %arg0 = %c0 to %c20 step %c1 iter_args(%arg1 = %cst_1) -> (vector<1x2x4x1x1x1x1x1x4xf16>) {
%31 = affine.delinearize_index %arg0 into (%c20) : index
%32 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%31]
%subview_6 = memref.subview %subview[0, 0, %32] [1, %6, 64] [1, 1, 1] : memref<1x?x1280xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x?x64xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
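// Global -> LDS staging for this K step: each thread reads vector<1x1x8xf16>
// chunks of the 64x64 LHS slice (two rows per thread over a (32, 8) basis)
// and the 64x128 RHS slice (four rows per thread over a (16, 16) basis),
// then writes them to %alloc_3 and %alloc between gpu.barriers. LHS reads
// and nothing else are row-masked (in_bounds = [true, false, true]) for the
// partial tile at the 968-row edge.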
%33:2 = affine.delinearize_index %0 into (%c32, %c8) : index, index
%34 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%33#1]
%35 = vector.transfer_read %subview_6[%c0, %33#0, %34], %cst_2 {in_bounds = [true, false, true]} : memref<1x?x64xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<1x1x8xf16>
%36 = affine.apply affine_map<()[s0] -> (s0 + 32)>()[%33#0]
%37 = vector.transfer_read %subview_6[%c0, %36, %34], %cst_2 {in_bounds = [true, false, true]} : memref<1x?x64xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<1x1x8xf16>
%38:2 = affine.delinearize_index %0 into (%c16, %c16) : index, index
%39 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 64)>()[%38#0, %31]
%40 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%38#1]
%41 = vector.transfer_read %subview_4[%c0, %39, %40], %cst_2 {in_bounds = [true, true, true]} : memref<1x1280x128xf16, strided<[1638400, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<1x1x8xf16>
%42 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 64 + 16)>()[%38#0, %31]
%43 = vector.transfer_read %subview_4[%c0, %42, %40], %cst_2 {in_bounds = [true, true, true]} : memref<1x1280x128xf16, strided<[1638400, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<1x1x8xf16>
%44 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 64 + 32)>()[%38#0, %31]
%45 = vector.transfer_read %subview_4[%c0, %44, %40], %cst_2 {in_bounds = [true, true, true]} : memref<1x1280x128xf16, strided<[1638400, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<1x1x8xf16>
%46 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 64 + 48)>()[%38#0, %31]
%47 = vector.transfer_read %subview_4[%c0, %46, %40], %cst_2 {in_bounds = [true, true, true]} : memref<1x1280x128xf16, strided<[1638400, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<1x1x8xf16>
gpu.barrier
vector.transfer_write %35, %alloc_3[%c0, %33#0, %34] {in_bounds = [true, true, true]} : vector<1x1x8xf16>, memref<1x64x64xf16, #gpu.address_space<workgroup>>
vector.transfer_write %37, %alloc_3[%c0, %36, %34] {in_bounds = [true, true, true]} : vector<1x1x8xf16>, memref<1x64x64xf16, #gpu.address_space<workgroup>>
gpu.barrier
vector.transfer_write %41, %alloc[%c0, %38#0, %40] {in_bounds = [true, true, true]} : vector<1x1x8xf16>, memref<1x64x128xf16, #gpu.address_space<workgroup>>
%48 = affine.apply affine_map<()[s0] -> (s0 + 16)>()[%38#0]
vector.transfer_write %43, %alloc[%c0, %48, %40] {in_bounds = [true, true, true]} : vector<1x1x8xf16>, memref<1x64x128xf16, #gpu.address_space<workgroup>>
%49 = affine.apply affine_map<()[s0] -> (s0 + 32)>()[%38#0]
vector.transfer_write %45, %alloc[%c0, %49, %40] {in_bounds = [true, true, true]} : vector<1x1x8xf16>, memref<1x64x128xf16, #gpu.address_space<workgroup>>
%50 = affine.apply affine_map<()[s0] -> (s0 + 48)>()[%38#0]
vector.transfer_write %47, %alloc[%c0, %50, %40] {in_bounds = [true, true, true]} : vector<1x1x8xf16>, memref<1x64x128xf16, #gpu.address_space<workgroup>>
gpu.barrier
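// Register loads as in the previous dump; after CSE the row/column offsets
// (%52-%62, %65, %74-%86) are computed once and shared between the LHS and
// RHS fragment reads.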
%51:6 = affine.delinearize_index %0 into (%c2, %c2, %c1, %c1, %c4, %c16) : index, index, index, index, index, index
%52 = affine.apply affine_map<()[s0, s1] -> (s0 * 32 + s1)>()[%51#0, %51#5]
%53 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%51#4]
%54 = vector.transfer_read %alloc_3[%c0, %52, %53], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x64xf16, #gpu.address_space<workgroup>>, vector<1x1x4xf16>
%55 = vector.insert_strided_slice %54, %cst_1 {offsets = [0, 0, 0, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x2x4x1x1x1x1x1x4xf16>
%56 = affine.apply affine_map<()[s0] -> (s0 * 4 + 16)>()[%51#4]
%57 = vector.transfer_read %alloc_3[%c0, %52, %56], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x64xf16, #gpu.address_space<workgroup>>, vector<1x1x4xf16>
%58 = vector.insert_strided_slice %57, %55 {offsets = [0, 0, 1, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x2x4x1x1x1x1x1x4xf16>
%59 = affine.apply affine_map<()[s0] -> (s0 * 4 + 32)>()[%51#4]
%60 = vector.transfer_read %alloc_3[%c0, %52, %59], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x64xf16, #gpu.address_space<workgroup>>, vector<1x1x4xf16>
%61 = vector.insert_strided_slice %60, %58 {offsets = [0, 0, 2, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x2x4x1x1x1x1x1x4xf16>
%62 = affine.apply affine_map<()[s0] -> (s0 * 4 + 48)>()[%51#4]
%63 = vector.transfer_read %alloc_3[%c0, %52, %62], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x64xf16, #gpu.address_space<workgroup>>, vector<1x1x4xf16>
%64 = vector.insert_strided_slice %63, %61 {offsets = [0, 0, 3, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x2x4x1x1x1x1x1x4xf16>
%65 = affine.apply affine_map<()[s0, s1] -> (s0 * 32 + s1 + 16)>()[%51#0, %51#5]
%66 = vector.transfer_read %alloc_3[%c0, %65, %53], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x64xf16, #gpu.address_space<workgroup>>, vector<1x1x4xf16>
%67 = vector.insert_strided_slice %66, %64 {offsets = [0, 1, 0, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x2x4x1x1x1x1x1x4xf16>
%68 = vector.transfer_read %alloc_3[%c0, %65, %56], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x64xf16, #gpu.address_space<workgroup>>, vector<1x1x4xf16>
%69 = vector.insert_strided_slice %68, %67 {offsets = [0, 1, 1, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x2x4x1x1x1x1x1x4xf16>
%70 = vector.transfer_read %alloc_3[%c0, %65, %59], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x64xf16, #gpu.address_space<workgroup>>, vector<1x1x4xf16>
%71 = vector.insert_strided_slice %70, %69 {offsets = [0, 1, 2, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x2x4x1x1x1x1x1x4xf16>
%72 = vector.transfer_read %alloc_3[%c0, %65, %62], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x64xf16, #gpu.address_space<workgroup>>, vector<1x1x4xf16>
%73 = vector.insert_strided_slice %72, %71 {offsets = [0, 1, 3, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x2x4x1x1x1x1x1x4xf16>
%74 = affine.apply affine_map<()[s0, s1] -> (s0 * 64 + s1)>()[%51#1, %51#5]
%75 = vector.transfer_read %alloc[%c0, %53, %74], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x128xf16, #gpu.address_space<workgroup>>, vector<1x4x1xf16>
%76 = vector.transpose %75, [0, 2, 1] : vector<1x4x1xf16> to vector<1x1x4xf16>
%77 = vector.insert_strided_slice %76, %cst_0 {offsets = [0, 0, 0, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x4x4x1x1x1x1x1x4xf16>
%78 = affine.apply affine_map<()[s0, s1] -> (s0 * 64 + s1 + 16)>()[%51#1, %51#5]
%79 = vector.transfer_read %alloc[%c0, %53, %78], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x128xf16, #gpu.address_space<workgroup>>, vector<1x4x1xf16>
%80 = vector.transpose %79, [0, 2, 1] : vector<1x4x1xf16> to vector<1x1x4xf16>
%81 = vector.insert_strided_slice %80, %77 {offsets = [0, 0, 1, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x4x4x1x1x1x1x1x4xf16>
%82 = affine.apply affine_map<()[s0, s1] -> (s0 * 64 + s1 + 32)>()[%51#1, %51#5]
%83 = vector.transfer_read %alloc[%c0, %53, %82], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x128xf16, #gpu.address_space<workgroup>>, vector<1x4x1xf16>
%84 = vector.transpose %83, [0, 2, 1] : vector<1x4x1xf16> to vector<1x1x4xf16>
%85 = vector.insert_strided_slice %84, %81 {offsets = [0, 0, 2, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x4x4x1x1x1x1x1x4xf16>
%86 = affine.apply affine_map<()[s0, s1] -> (s0 * 64 + s1 + 48)>()[%51#1, %51#5]
%87 = vector.transfer_read %alloc[%c0, %53, %86], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x128xf16, #gpu.address_space<workgroup>>, vector<1x4x1xf16>
%88 = vector.transpose %87, [0, 2, 1] : vector<1x4x1xf16> to vector<1x1x4xf16>
%89 = vector.insert_strided_slice %88, %85 {offsets = [0, 0, 3, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x4x4x1x1x1x1x1x4xf16>
%90 = vector.transfer_read %alloc[%c0, %56, %74], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x128xf16, #gpu.address_space<workgroup>>, vector<1x4x1xf16>
%91 = vector.transpose %90, [0, 2, 1] : vector<1x4x1xf16> to vector<1x1x4xf16>
%92 = vector.insert_strided_slice %91, %89 {offsets = [0, 1, 0, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x4x4x1x1x1x1x1x4xf16>
%93 = vector.transfer_read %alloc[%c0, %56, %78], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x128xf16, #gpu.address_space<workgroup>>, vector<1x4x1xf16>
%94 = vector.transpose %93, [0, 2, 1] : vector<1x4x1xf16> to vector<1x1x4xf16>
%95 = vector.insert_strided_slice %94, %92 {offsets = [0, 1, 1, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x4x4x1x1x1x1x1x4xf16>
%96 = vector.transfer_read %alloc[%c0, %56, %82], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x128xf16, #gpu.address_space<workgroup>>, vector<1x4x1xf16>
%97 = vector.transpose %96, [0, 2, 1] : vector<1x4x1xf16> to vector<1x1x4xf16>
%98 = vector.insert_strided_slice %97, %95 {offsets = [0, 1, 2, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x4x4x1x1x1x1x1x4xf16>
%99 = vector.transfer_read %alloc[%c0, %56, %86], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x128xf16, #gpu.address_space<workgroup>>, vector<1x4x1xf16>
%100 = vector.transpose %99, [0, 2, 1] : vector<1x4x1xf16> to vector<1x1x4xf16>
%101 = vector.insert_strided_slice %100, %98 {offsets = [0, 1, 3, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x4x4x1x1x1x1x1x4xf16>
%102 = vector.transfer_read %alloc[%c0, %59, %74], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x128xf16, #gpu.address_space<workgroup>>, vector<1x4x1xf16>
%103 = vector.transpose %102, [0, 2, 1] : vector<1x4x1xf16> to vector<1x1x4xf16>
%104 = vector.insert_strided_slice %103, %101 {offsets = [0, 2, 0, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x4x4x1x1x1x1x1x4xf16>
%105 = vector.transfer_read %alloc[%c0, %59, %78], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x128xf16, #gpu.address_space<workgroup>>, vector<1x4x1xf16>
%106 = vector.transpose %105, [0, 2, 1] : vector<1x4x1xf16> to vector<1x1x4xf16>
%107 = vector.insert_strided_slice %106, %104 {offsets = [0, 2, 1, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x4x4x1x1x1x1x1x4xf16>
%108 = vector.transfer_read %alloc[%c0, %59, %82], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x128xf16, #gpu.address_space<workgroup>>, vector<1x4x1xf16>
%109 = vector.transpose %108, [0, 2, 1] : vector<1x4x1xf16> to vector<1x1x4xf16>
%110 = vector.insert_strided_slice %109, %107 {offsets = [0, 2, 2, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x4x4x1x1x1x1x1x4xf16>
%111 = vector.transfer_read %alloc[%c0, %59, %86], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x128xf16, #gpu.address_space<workgroup>>, vector<1x4x1xf16>
%112 = vector.transpose %111, [0, 2, 1] : vector<1x4x1xf16> to vector<1x1x4xf16>
%113 = vector.insert_strided_slice %112, %110 {offsets = [0, 2, 3, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x4x4x1x1x1x1x1x4xf16>
%114 = vector.transfer_read %alloc[%c0, %62, %74], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x128xf16, #gpu.address_space<workgroup>>, vector<1x4x1xf16>
%115 = vector.transpose %114, [0, 2, 1] : vector<1x4x1xf16> to vector<1x1x4xf16>
%116 = vector.insert_strided_slice %115, %113 {offsets = [0, 3, 0, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x4x4x1x1x1x1x1x4xf16>
%117 = vector.transfer_read %alloc[%c0, %62, %78], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x128xf16, #gpu.address_space<workgroup>>, vector<1x4x1xf16>
%118 = vector.transpose %117, [0, 2, 1] : vector<1x4x1xf16> to vector<1x1x4xf16>
%119 = vector.insert_strided_slice %118, %116 {offsets = [0, 3, 1, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x4x4x1x1x1x1x1x4xf16>
%120 = vector.transfer_read %alloc[%c0, %62, %82], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x128xf16, #gpu.address_space<workgroup>>, vector<1x4x1xf16>
%121 = vector.transpose %120, [0, 2, 1] : vector<1x4x1xf16> to vector<1x1x4xf16>
%122 = vector.insert_strided_slice %121, %119 {offsets = [0, 3, 2, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x4x4x1x1x1x1x1x4xf16>
%123 = vector.transfer_read %alloc[%c0, %62, %86], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x128xf16, #gpu.address_space<workgroup>>, vector<1x4x1xf16>
%124 = vector.transpose %123, [0, 2, 1] : vector<1x4x1xf16> to vector<1x1x4xf16>
%125 = vector.insert_strided_slice %124, %122 {offsets = [0, 3, 3, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x4x4x1x1x1x1x1x4xf16>
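// After CSE the A-fragment shape_casts (%130, %136, %141, %146 for M-batch 0
// and %201, %205, %208, %211 for M-batch 1) and the B-fragment shape_casts
// are materialized once and reused across all eight mfma accumulation
// chains below.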
%126 = arith.extf %arg1 : vector<1x2x4x1x1x1x1x1x4xf16> to vector<1x2x4x1x1x1x1x1x4xf32>
%127 = vector.extract %126[0, 0, 0] : vector<1x1x1x1x1x4xf32> from vector<1x2x4x1x1x1x1x1x4xf32>
%128 = vector.extract %73[0, 0, 0] : vector<1x1x1x1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%129 = vector.extract %125[0, 0, 0] : vector<1x1x1x1x1x4xf16> from vector<1x4x4x1x1x1x1x1x4xf16>
%130 = vector.shape_cast %128 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%131 = vector.shape_cast %129 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%132 = vector.shape_cast %127 : vector<1x1x1x1x1x4xf32> to vector<4xf32>
%133 = amdgpu.mfma %130 * %131 + %132 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%134 = vector.extract %73[0, 0, 1] : vector<1x1x1x1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%135 = vector.extract %125[0, 1, 0] : vector<1x1x1x1x1x4xf16> from vector<1x4x4x1x1x1x1x1x4xf16>
%136 = vector.shape_cast %134 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%137 = vector.shape_cast %135 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%138 = amdgpu.mfma %136 * %137 + %133 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%139 = vector.extract %73[0, 0, 2] : vector<1x1x1x1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%140 = vector.extract %125[0, 2, 0] : vector<1x1x1x1x1x4xf16> from vector<1x4x4x1x1x1x1x1x4xf16>
%141 = vector.shape_cast %139 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%142 = vector.shape_cast %140 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%143 = amdgpu.mfma %141 * %142 + %138 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%144 = vector.extract %73[0, 0, 3] : vector<1x1x1x1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%145 = vector.extract %125[0, 3, 0] : vector<1x1x1x1x1x4xf16> from vector<1x4x4x1x1x1x1x1x4xf16>
%146 = vector.shape_cast %144 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%147 = vector.shape_cast %145 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%148 = amdgpu.mfma %146 * %147 + %143 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%149 = vector.shape_cast %148 : vector<4xf32> to vector<1x1x1x1x1x4xf32>
%150 = vector.insert %149, %cst [0, 0, 0] : vector<1x1x1x1x1x4xf32> into vector<1x2x4x1x1x1x1x1x4xf32>
%151 = vector.extract %126[0, 0, 1] : vector<1x1x1x1x1x4xf32> from vector<1x2x4x1x1x1x1x1x4xf32>
%152 = vector.extract %125[0, 0, 1] : vector<1x1x1x1x1x4xf16> from vector<1x4x4x1x1x1x1x1x4xf16>
%153 = vector.shape_cast %152 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%154 = vector.shape_cast %151 : vector<1x1x1x1x1x4xf32> to vector<4xf32>
%155 = amdgpu.mfma %130 * %153 + %154 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%156 = vector.extract %125[0, 1, 1] : vector<1x1x1x1x1x4xf16> from vector<1x4x4x1x1x1x1x1x4xf16>
%157 = vector.shape_cast %156 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%158 = amdgpu.mfma %136 * %157 + %155 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%159 = vector.extract %125[0, 2, 1] : vector<1x1x1x1x1x4xf16> from vector<1x4x4x1x1x1x1x1x4xf16>
%160 = vector.shape_cast %159 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%161 = amdgpu.mfma %141 * %160 + %158 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%162 = vector.extract %125[0, 3, 1] : vector<1x1x1x1x1x4xf16> from vector<1x4x4x1x1x1x1x1x4xf16>
%163 = vector.shape_cast %162 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%164 = amdgpu.mfma %146 * %163 + %161 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%165 = vector.shape_cast %164 : vector<4xf32> to vector<1x1x1x1x1x4xf32>
%166 = vector.insert %165, %150 [0, 0, 1] : vector<1x1x1x1x1x4xf32> into vector<1x2x4x1x1x1x1x1x4xf32>
%167 = vector.extract %126[0, 0, 2] : vector<1x1x1x1x1x4xf32> from vector<1x2x4x1x1x1x1x1x4xf32>
%168 = vector.extract %125[0, 0, 2] : vector<1x1x1x1x1x4xf16> from vector<1x4x4x1x1x1x1x1x4xf16>
%169 = vector.shape_cast %168 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%170 = vector.shape_cast %167 : vector<1x1x1x1x1x4xf32> to vector<4xf32>
%171 = amdgpu.mfma %130 * %169 + %170 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%172 = vector.extract %125[0, 1, 2] : vector<1x1x1x1x1x4xf16> from vector<1x4x4x1x1x1x1x1x4xf16>
%173 = vector.shape_cast %172 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%174 = amdgpu.mfma %136 * %173 + %171 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%175 = vector.extract %125[0, 2, 2] : vector<1x1x1x1x1x4xf16> from vector<1x4x4x1x1x1x1x1x4xf16>
%176 = vector.shape_cast %175 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%177 = amdgpu.mfma %141 * %176 + %174 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%178 = vector.extract %125[0, 3, 2] : vector<1x1x1x1x1x4xf16> from vector<1x4x4x1x1x1x1x1x4xf16>
%179 = vector.shape_cast %178 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%180 = amdgpu.mfma %146 * %179 + %177 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%181 = vector.shape_cast %180 : vector<4xf32> to vector<1x1x1x1x1x4xf32>
%182 = vector.insert %181, %166 [0, 0, 2] : vector<1x1x1x1x1x4xf32> into vector<1x2x4x1x1x1x1x1x4xf32>
%183 = vector.extract %126[0, 0, 3] : vector<1x1x1x1x1x4xf32> from vector<1x2x4x1x1x1x1x1x4xf32>
%184 = vector.extract %125[0, 0, 3] : vector<1x1x1x1x1x4xf16> from vector<1x4x4x1x1x1x1x1x4xf16>
%185 = vector.shape_cast %184 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%186 = vector.shape_cast %183 : vector<1x1x1x1x1x4xf32> to vector<4xf32>
%187 = amdgpu.mfma %130 * %185 + %186 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%188 = vector.extract %125[0, 1, 3] : vector<1x1x1x1x1x4xf16> from vector<1x4x4x1x1x1x1x1x4xf16>
%189 = vector.shape_cast %188 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%190 = amdgpu.mfma %136 * %189 + %187 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%191 = vector.extract %125[0, 2, 3] : vector<1x1x1x1x1x4xf16> from vector<1x4x4x1x1x1x1x1x4xf16>
%192 = vector.shape_cast %191 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%193 = amdgpu.mfma %141 * %192 + %190 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%194 = vector.extract %125[0, 3, 3] : vector<1x1x1x1x1x4xf16> from vector<1x4x4x1x1x1x1x1x4xf16>
%195 = vector.shape_cast %194 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%196 = amdgpu.mfma %146 * %195 + %193 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%197 = vector.shape_cast %196 : vector<4xf32> to vector<1x1x1x1x1x4xf32>
%198 = vector.insert %197, %182 [0, 0, 3] : vector<1x1x1x1x1x4xf32> into vector<1x2x4x1x1x1x1x1x4xf32>
%199 = vector.extract %126[0, 1, 0] : vector<1x1x1x1x1x4xf32> from vector<1x2x4x1x1x1x1x1x4xf32>
%200 = vector.extract %73[0, 1, 0] : vector<1x1x1x1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%201 = vector.shape_cast %200 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%202 = vector.shape_cast %199 : vector<1x1x1x1x1x4xf32> to vector<4xf32>
%203 = amdgpu.mfma %201 * %131 + %202 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%204 = vector.extract %73[0, 1, 1] : vector<1x1x1x1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%205 = vector.shape_cast %204 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%206 = amdgpu.mfma %205 * %137 + %203 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%207 = vector.extract %73[0, 1, 2] : vector<1x1x1x1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%208 = vector.shape_cast %207 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%209 = amdgpu.mfma %208 * %142 + %206 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%210 = vector.extract %73[0, 1, 3] : vector<1x1x1x1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%211 = vector.shape_cast %210 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%212 = amdgpu.mfma %211 * %147 + %209 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%213 = vector.shape_cast %212 : vector<4xf32> to vector<1x1x1x1x1x4xf32>
%214 = vector.insert %213, %198 [0, 1, 0] : vector<1x1x1x1x1x4xf32> into vector<1x2x4x1x1x1x1x1x4xf32>
%215 = vector.extract %126[0, 1, 1] : vector<1x1x1x1x1x4xf32> from vector<1x2x4x1x1x1x1x1x4xf32>
%216 = vector.shape_cast %215 : vector<1x1x1x1x1x4xf32> to vector<4xf32>
%217 = amdgpu.mfma %201 * %153 + %216 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%218 = amdgpu.mfma %205 * %157 + %217 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%219 = amdgpu.mfma %208 * %160 + %218 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%220 = amdgpu.mfma %211 * %163 + %219 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%221 = vector.shape_cast %220 : vector<4xf32> to vector<1x1x1x1x1x4xf32>
%222 = vector.insert %221, %214 [0, 1, 1] : vector<1x1x1x1x1x4xf32> into vector<1x2x4x1x1x1x1x1x4xf32>
%223 = vector.extract %126[0, 1, 2] : vector<1x1x1x1x1x4xf32> from vector<1x2x4x1x1x1x1x1x4xf32>
%224 = vector.shape_cast %223 : vector<1x1x1x1x1x4xf32> to vector<4xf32>
%225 = amdgpu.mfma %201 * %169 + %224 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%226 = amdgpu.mfma %205 * %173 + %225 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%227 = amdgpu.mfma %208 * %176 + %226 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%228 = amdgpu.mfma %211 * %179 + %227 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%229 = vector.shape_cast %228 : vector<4xf32> to vector<1x1x1x1x1x4xf32>
%230 = vector.insert %229, %222 [0, 1, 2] : vector<1x1x1x1x1x4xf32> into vector<1x2x4x1x1x1x1x1x4xf32>
%231 = vector.extract %126[0, 1, 3] : vector<1x1x1x1x1x4xf32> from vector<1x2x4x1x1x1x1x1x4xf32>
%232 = vector.shape_cast %231 : vector<1x1x1x1x1x4xf32> to vector<4xf32>
%233 = amdgpu.mfma %201 * %185 + %232 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%234 = amdgpu.mfma %205 * %189 + %233 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%235 = amdgpu.mfma %208 * %192 + %234 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%236 = amdgpu.mfma %211 * %195 + %235 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%237 = vector.shape_cast %236 : vector<4xf32> to vector<1x1x1x1x1x4xf32>
%238 = vector.insert %237, %230 [0, 1, 3] : vector<1x1x1x1x1x4xf32> into vector<1x2x4x1x1x1x1x1x4xf32>
%239 = arith.truncf %238 : vector<1x2x4x1x1x1x1x1x4xf32> to vector<1x2x4x1x1x1x1x1x4xf16>
scf.yield %239 : vector<1x2x4x1x1x1x1x1x4xf16>
} {__vector_layout_fetcher_storage = [[unit, unit, unit, #iree_vector_ext.nested_layout<subgroups_per_workgroup = [1, 2, 2], batches_per_subgroup = [1, 2, 4], outers_per_batch = [1, 1, 1], threads_per_outer = [1, 4, 16], elements_per_thread = [1, 4, 1], element_order = [0, 2, 1], subgroup_basis = [1, 2, 2, 1], subgroup_active_ids = [true, true, true, false], thread_basis = [1, 4, 16]>], [#iree_vector_ext.nested_layout<subgroups_per_workgroup = [1, 2, 2], batches_per_subgroup = [1, 2, 4], outers_per_batch = [1, 1, 1], threads_per_outer = [1, 4, 16], elements_per_thread = [1, 4, 1], element_order = [0, 2, 1], subgroup_basis = [1, 2, 2, 1], subgroup_active_ids = [true, true, true, false], thread_basis = [1, 4, 16]>]]}
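// Editor's annotation: the loop yields the accumulator as f16, and the
// write-back epilogue follows. Each thread recovers its tile coordinates
// from the flat id %0 via affine.delinearize_index, then stores its eight
// 1x4x1 result fragments (2 row offsets x 4 column offsets of the 32x64
// per-subgroup tile) into the output subview with vector.transfer_write.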
%subview_5 = memref.subview %3[%workgroup_id_z, %4, %5] [1, %6, 128] [1, 1, 1] : memref<64x968x1280xf16, #hal.descriptor_type<storage_buffer>> to memref<1x?x128xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%8:6 = affine.delinearize_index %0 into (%c2, %c2, %c1, %c1, %c4, %c16) : index, index, index, index, index, index
%9 = affine.apply affine_map<()[s0, s1] -> (s0 * 32 + s1 * 4)>()[%8#0, %8#4]
%10 = affine.apply affine_map<()[s0, s1] -> (s0 * 64 + s1)>()[%8#1, %8#5]
%11 = vector.extract %7[0, 0, 0, 0, 0, 0] : vector<1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%12 = vector.transpose %11, [0, 2, 1] : vector<1x1x4xf16> to vector<1x4x1xf16>
vector.transfer_write %12, %subview_5[%c0, %9, %10] {in_bounds = [true, false, true]} : vector<1x4x1xf16>, memref<1x?x128xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%13 = affine.apply affine_map<()[s0, s1] -> (s0 * 64 + s1 + 16)>()[%8#1, %8#5]
%14 = vector.extract %7[0, 0, 1, 0, 0, 0] : vector<1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%15 = vector.transpose %14, [0, 2, 1] : vector<1x1x4xf16> to vector<1x4x1xf16>
vector.transfer_write %15, %subview_5[%c0, %9, %13] {in_bounds = [true, false, true]} : vector<1x4x1xf16>, memref<1x?x128xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%16 = affine.apply affine_map<()[s0, s1] -> (s0 * 64 + s1 + 32)>()[%8#1, %8#5]
%17 = vector.extract %7[0, 0, 2, 0, 0, 0] : vector<1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%18 = vector.transpose %17, [0, 2, 1] : vector<1x1x4xf16> to vector<1x4x1xf16>
vector.transfer_write %18, %subview_5[%c0, %9, %16] {in_bounds = [true, false, true]} : vector<1x4x1xf16>, memref<1x?x128xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%19 = affine.apply affine_map<()[s0, s1] -> (s0 * 64 + s1 + 48)>()[%8#1, %8#5]
%20 = vector.extract %7[0, 0, 3, 0, 0, 0] : vector<1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%21 = vector.transpose %20, [0, 2, 1] : vector<1x1x4xf16> to vector<1x4x1xf16>
vector.transfer_write %21, %subview_5[%c0, %9, %19] {in_bounds = [true, false, true]} : vector<1x4x1xf16>, memref<1x?x128xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%22 = affine.apply affine_map<()[s0, s1] -> (s0 * 32 + s1 * 4 + 16)>()[%8#0, %8#4]
%23 = vector.extract %7[0, 1, 0, 0, 0, 0] : vector<1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%24 = vector.transpose %23, [0, 2, 1] : vector<1x1x4xf16> to vector<1x4x1xf16>
vector.transfer_write %24, %subview_5[%c0, %22, %10] {in_bounds = [true, false, true]} : vector<1x4x1xf16>, memref<1x?x128xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%25 = vector.extract %7[0, 1, 1, 0, 0, 0] : vector<1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%26 = vector.transpose %25, [0, 2, 1] : vector<1x1x4xf16> to vector<1x4x1xf16>
vector.transfer_write %26, %subview_5[%c0, %22, %13] {in_bounds = [true, false, true]} : vector<1x4x1xf16>, memref<1x?x128xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%27 = vector.extract %7[0, 1, 2, 0, 0, 0] : vector<1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%28 = vector.transpose %27, [0, 2, 1] : vector<1x1x4xf16> to vector<1x4x1xf16>
vector.transfer_write %28, %subview_5[%c0, %22, %16] {in_bounds = [true, false, true]} : vector<1x4x1xf16>, memref<1x?x128xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%29 = vector.extract %7[0, 1, 3, 0, 0, 0] : vector<1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%30 = vector.transpose %29, [0, 2, 1] : vector<1x1x4xf16> to vector<1x4x1xf16>
vector.transfer_write %30, %subview_5[%c0, %22, %19] {in_bounds = [true, false, true]} : vector<1x4x1xf16>, memref<1x?x128xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
memref.dealloc %alloc_3 : memref<1x64x64xf16, #gpu.address_space<workgroup>>
memref.dealloc %alloc : memref<1x64x128xf16, #gpu.address_space<workgroup>>
return
}
}
// -----// IR Dump After FoldMemRefAliasOps (fold-memref-alias-ops) //----- //
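// Editor's annotation: fold-memref-alias-ops is the upstream MLIR pass that
// folds memref view ops (subview, expand_shape, collapse_shape) into the
// load/store-like ops that use them by rewriting indices. The dynamically
// sized source and destination subviews survive here, so the function below
// appears essentially unchanged from the previous dump.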
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16() {
%cst = arith.constant dense<0.000000e+00> : vector<1x2x4x1x1x1x1x1x4xf32>
%cst_0 = arith.constant dense<0.000000e+00> : vector<1x4x4x1x1x1x1x1x4xf16>
%c4 = arith.constant 4 : index
%c2 = arith.constant 2 : index
%c16 = arith.constant 16 : index
%c8 = arith.constant 8 : index
%c32 = arith.constant 32 : index
%cst_1 = arith.constant dense<0.000000e+00> : vector<1x2x4x1x1x1x1x1x4xf16>
%c20 = arith.constant 20 : index
%c1 = arith.constant 1 : index
%cst_2 = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%thread_id_x = gpu.thread_id x
%thread_id_y = gpu.thread_id y
%thread_id_z = gpu.thread_id z
%0 = affine.apply affine_map<()[s0, s1, s2] -> (s0 + s1 * 128 + s2 * 256)>()[%thread_id_x, %thread_id_y, %thread_id_z]
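// Editor's annotation: %0 is the linearized thread id; the map
// s0 + s1 * 128 + s2 * 256 is consistent with a 128x2x1 workgroup, i.e.
// 256 threads or four 64-lane waves on gfx942 (an assumption from the
// layout, not stated in the dump).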
%alloc = memref.alloc() : memref<1x64x128xf16, #gpu.address_space<workgroup>>
%alloc_3 = memref.alloc() : memref<1x64x64xf16, #gpu.address_space<workgroup>>
%1 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<64x968x1280xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 64 : memref<64x968x1280xf16, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<64x1280x1280xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 64 : memref<64x1280x1280xf16, #hal.descriptor_type<storage_buffer>>
%3 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<64x968x1280xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %3, 64 : memref<64x968x1280xf16, #hal.descriptor_type<storage_buffer>>
%workgroup_id_z = hal.interface.workgroup.id[2] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y]
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%5 = affine.apply affine_map<()[s0] -> (s0 * 128)>()[%workgroup_id_x]
%6 = affine.min affine_map<()[s0] -> (s0 * -64 + 968, 64)>()[%workgroup_id_y]
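// Editor's annotation: main reduction loop. 20 steps cover K = 1280 in
// 64-element tiles (1280 / 64 = 20). %6 clamps the M tile at the 968 edge
// (968 = 15 * 64 + 8), so the last row of workgroups handles a partial
// 8-row tile. Each iteration stages a 64x64 LHS tile and a 64x128 RHS tile
// through workgroup memory before the MFMA chain.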
%7 = scf.for %arg0 = %c0 to %c20 step %c1 iter_args(%arg1 = %cst_1) -> (vector<1x2x4x1x1x1x1x1x4xf16>) {
%31 = affine.delinearize_index %arg0 into (%c20) : index
%32 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y]
%33 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%31]
%subview_4 = memref.subview %1[%workgroup_id_z, %32, %33] [1, %6, 64] [1, 1, 1] : memref<64x968x1280xf16, #hal.descriptor_type<storage_buffer>> to memref<1x?x64xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%34:2 = affine.delinearize_index %0 into (%c32, %c8) : index, index
%35 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%34#1]
%36 = vector.transfer_read %subview_4[%c0, %34#0, %35], %cst_2 {in_bounds = [true, false, true]} : memref<1x?x64xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<1x1x8xf16>
%37 = affine.apply affine_map<()[s0] -> (s0 + 32)>()[%34#0]
%38 = vector.transfer_read %subview_4[%c0, %37, %35], %cst_2 {in_bounds = [true, false, true]} : memref<1x?x64xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<1x1x8xf16>
%39:2 = affine.delinearize_index %0 into (%c16, %c16) : index, index
%40 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%39#1]
%41 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 64)>()[%39#0, %31]
%42 = affine.apply affine_map<()[s0, s1] -> (s0 * 128 + s1 * 8)>()[%workgroup_id_x, %39#1]
%43 = vector.transfer_read %2[%workgroup_id_z, %41, %42], %cst_2 {in_bounds = [true, true, true]} : memref<64x1280x1280xf16, #hal.descriptor_type<storage_buffer>>, vector<1x1x8xf16>
%44 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 64 + 16)>()[%39#0, %31]
%45 = affine.apply affine_map<()[s0, s1] -> (s0 * 128 + s1 * 8)>()[%workgroup_id_x, %39#1]
%46 = vector.transfer_read %2[%workgroup_id_z, %44, %45], %cst_2 {in_bounds = [true, true, true]} : memref<64x1280x1280xf16, #hal.descriptor_type<storage_buffer>>, vector<1x1x8xf16>
%47 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 64 + 32)>()[%39#0, %31]
%48 = affine.apply affine_map<()[s0, s1] -> (s0 * 128 + s1 * 8)>()[%workgroup_id_x, %39#1]
%49 = vector.transfer_read %2[%workgroup_id_z, %47, %48], %cst_2 {in_bounds = [true, true, true]} : memref<64x1280x1280xf16, #hal.descriptor_type<storage_buffer>>, vector<1x1x8xf16>
%50 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 64 + 48)>()[%39#0, %31]
%51 = affine.apply affine_map<()[s0, s1] -> (s0 * 128 + s1 * 8)>()[%workgroup_id_x, %39#1]
%52 = vector.transfer_read %2[%workgroup_id_z, %50, %51], %cst_2 {in_bounds = [true, true, true]} : memref<64x1280x1280xf16, #hal.descriptor_type<storage_buffer>>, vector<1x1x8xf16>
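// Editor's annotation: stage both tiles into LDS. The gpu.barrier ops order
// this iteration's writes to %alloc_3 (64x64 LHS tile) and %alloc (64x128
// RHS tile) against the previous iteration's reads and against the fragment
// reads below.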
gpu.barrier
vector.transfer_write %36, %alloc_3[%c0, %34#0, %35] {in_bounds = [true, true, true]} : vector<1x1x8xf16>, memref<1x64x64xf16, #gpu.address_space<workgroup>>
vector.transfer_write %38, %alloc_3[%c0, %37, %35] {in_bounds = [true, true, true]} : vector<1x1x8xf16>, memref<1x64x64xf16, #gpu.address_space<workgroup>>
gpu.barrier
vector.transfer_write %43, %alloc[%c0, %39#0, %40] {in_bounds = [true, true, true]} : vector<1x1x8xf16>, memref<1x64x128xf16, #gpu.address_space<workgroup>>
%53 = affine.apply affine_map<()[s0] -> (s0 + 16)>()[%39#0]
vector.transfer_write %46, %alloc[%c0, %53, %40] {in_bounds = [true, true, true]} : vector<1x1x8xf16>, memref<1x64x128xf16, #gpu.address_space<workgroup>>
%54 = affine.apply affine_map<()[s0] -> (s0 + 32)>()[%39#0]
vector.transfer_write %49, %alloc[%c0, %54, %40] {in_bounds = [true, true, true]} : vector<1x1x8xf16>, memref<1x64x128xf16, #gpu.address_space<workgroup>>
%55 = affine.apply affine_map<()[s0] -> (s0 + 48)>()[%39#0]
vector.transfer_write %52, %alloc[%c0, %55, %40] {in_bounds = [true, true, true]} : vector<1x1x8xf16>, memref<1x64x128xf16, #gpu.address_space<workgroup>>
gpu.barrier
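// Editor's annotation: gather per-lane MFMA operands from LDS. The 6-way
// delinearization of %0 mirrors the nested layout: %56#0 and %56#1 pick the
// subgroup's M/N tile, %56#5 the lane's position within a 16-wide MFMA
// fragment (row of A, column of B), and %56#4 the 4-element k-slice each
// lane loads as a vector<1x1x4xf16>.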
%56:6 = affine.delinearize_index %0 into (%c2, %c2, %c1, %c1, %c4, %c16) : index, index, index, index, index, index
%57 = affine.apply affine_map<()[s0, s1] -> (s0 * 32 + s1)>()[%56#0, %56#5]
%58 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%56#4]
%59 = vector.transfer_read %alloc_3[%c0, %57, %58], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x64xf16, #gpu.address_space<workgroup>>, vector<1x1x4xf16>
%60 = vector.insert_strided_slice %59, %cst_1 {offsets = [0, 0, 0, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x2x4x1x1x1x1x1x4xf16>
%61 = affine.apply affine_map<()[s0] -> (s0 * 4 + 16)>()[%56#4]
%62 = vector.transfer_read %alloc_3[%c0, %57, %61], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x64xf16, #gpu.address_space<workgroup>>, vector<1x1x4xf16>
%63 = vector.insert_strided_slice %62, %60 {offsets = [0, 0, 1, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x2x4x1x1x1x1x1x4xf16>
%64 = affine.apply affine_map<()[s0] -> (s0 * 4 + 32)>()[%56#4]
%65 = vector.transfer_read %alloc_3[%c0, %57, %64], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x64xf16, #gpu.address_space<workgroup>>, vector<1x1x4xf16>
%66 = vector.insert_strided_slice %65, %63 {offsets = [0, 0, 2, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x2x4x1x1x1x1x1x4xf16>
%67 = affine.apply affine_map<()[s0] -> (s0 * 4 + 48)>()[%56#4]
%68 = vector.transfer_read %alloc_3[%c0, %57, %67], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x64xf16, #gpu.address_space<workgroup>>, vector<1x1x4xf16>
%69 = vector.insert_strided_slice %68, %66 {offsets = [0, 0, 3, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x2x4x1x1x1x1x1x4xf16>
%70 = affine.apply affine_map<()[s0, s1] -> (s0 * 32 + s1 + 16)>()[%56#0, %56#5]
%71 = vector.transfer_read %alloc_3[%c0, %70, %58], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x64xf16, #gpu.address_space<workgroup>>, vector<1x1x4xf16>
%72 = vector.insert_strided_slice %71, %69 {offsets = [0, 1, 0, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x2x4x1x1x1x1x1x4xf16>
%73 = vector.transfer_read %alloc_3[%c0, %70, %61], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x64xf16, #gpu.address_space<workgroup>>, vector<1x1x4xf16>
%74 = vector.insert_strided_slice %73, %72 {offsets = [0, 1, 1, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x2x4x1x1x1x1x1x4xf16>
%75 = vector.transfer_read %alloc_3[%c0, %70, %64], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x64xf16, #gpu.address_space<workgroup>>, vector<1x1x4xf16>
%76 = vector.insert_strided_slice %75, %74 {offsets = [0, 1, 2, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x2x4x1x1x1x1x1x4xf16>
%77 = vector.transfer_read %alloc_3[%c0, %70, %67], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x64xf16, #gpu.address_space<workgroup>>, vector<1x1x4xf16>
%78 = vector.insert_strided_slice %77, %76 {offsets = [0, 1, 3, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x2x4x1x1x1x1x1x4xf16>
%79 = affine.apply affine_map<()[s0, s1] -> (s0 * 64 + s1)>()[%56#1, %56#5]
%80 = vector.transfer_read %alloc[%c0, %58, %79], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x128xf16, #gpu.address_space<workgroup>>, vector<1x4x1xf16>
%81 = vector.transpose %80, [0, 2, 1] : vector<1x4x1xf16> to vector<1x1x4xf16>
%82 = vector.insert_strided_slice %81, %cst_0 {offsets = [0, 0, 0, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x4x4x1x1x1x1x1x4xf16>
%83 = affine.apply affine_map<()[s0, s1] -> (s0 * 64 + s1 + 16)>()[%56#1, %56#5]
%84 = vector.transfer_read %alloc[%c0, %58, %83], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x128xf16, #gpu.address_space<workgroup>>, vector<1x4x1xf16>
%85 = vector.transpose %84, [0, 2, 1] : vector<1x4x1xf16> to vector<1x1x4xf16>
%86 = vector.insert_strided_slice %85, %82 {offsets = [0, 0, 1, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x4x4x1x1x1x1x1x4xf16>
%87 = affine.apply affine_map<()[s0, s1] -> (s0 * 64 + s1 + 32)>()[%56#1, %56#5]
%88 = vector.transfer_read %alloc[%c0, %58, %87], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x128xf16, #gpu.address_space<workgroup>>, vector<1x4x1xf16>
%89 = vector.transpose %88, [0, 2, 1] : vector<1x4x1xf16> to vector<1x1x4xf16>
%90 = vector.insert_strided_slice %89, %86 {offsets = [0, 0, 2, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x4x4x1x1x1x1x1x4xf16>
%91 = affine.apply affine_map<()[s0, s1] -> (s0 * 64 + s1 + 48)>()[%56#1, %56#5]
%92 = vector.transfer_read %alloc[%c0, %58, %91], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x128xf16, #gpu.address_space<workgroup>>, vector<1x4x1xf16>
%93 = vector.transpose %92, [0, 2, 1] : vector<1x4x1xf16> to vector<1x1x4xf16>
%94 = vector.insert_strided_slice %93, %90 {offsets = [0, 0, 3, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x4x4x1x1x1x1x1x4xf16>
%95 = vector.transfer_read %alloc[%c0, %61, %79], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x128xf16, #gpu.address_space<workgroup>>, vector<1x4x1xf16>
%96 = vector.transpose %95, [0, 2, 1] : vector<1x4x1xf16> to vector<1x1x4xf16>
%97 = vector.insert_strided_slice %96, %94 {offsets = [0, 1, 0, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x4x4x1x1x1x1x1x4xf16>
%98 = vector.transfer_read %alloc[%c0, %61, %83], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x128xf16, #gpu.address_space<workgroup>>, vector<1x4x1xf16>
%99 = vector.transpose %98, [0, 2, 1] : vector<1x4x1xf16> to vector<1x1x4xf16>
%100 = vector.insert_strided_slice %99, %97 {offsets = [0, 1, 1, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x4x4x1x1x1x1x1x4xf16>
%101 = vector.transfer_read %alloc[%c0, %61, %87], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x128xf16, #gpu.address_space<workgroup>>, vector<1x4x1xf16>
%102 = vector.transpose %101, [0, 2, 1] : vector<1x4x1xf16> to vector<1x1x4xf16>
%103 = vector.insert_strided_slice %102, %100 {offsets = [0, 1, 2, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x4x4x1x1x1x1x1x4xf16>
%104 = vector.transfer_read %alloc[%c0, %61, %91], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x128xf16, #gpu.address_space<workgroup>>, vector<1x4x1xf16>
%105 = vector.transpose %104, [0, 2, 1] : vector<1x4x1xf16> to vector<1x1x4xf16>
%106 = vector.insert_strided_slice %105, %103 {offsets = [0, 1, 3, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x4x4x1x1x1x1x1x4xf16>
%107 = vector.transfer_read %alloc[%c0, %64, %79], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x128xf16, #gpu.address_space<workgroup>>, vector<1x4x1xf16>
%108 = vector.transpose %107, [0, 2, 1] : vector<1x4x1xf16> to vector<1x1x4xf16>
%109 = vector.insert_strided_slice %108, %106 {offsets = [0, 2, 0, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x4x4x1x1x1x1x1x4xf16>
%110 = vector.transfer_read %alloc[%c0, %64, %83], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x128xf16, #gpu.address_space<workgroup>>, vector<1x4x1xf16>
%111 = vector.transpose %110, [0, 2, 1] : vector<1x4x1xf16> to vector<1x1x4xf16>
%112 = vector.insert_strided_slice %111, %109 {offsets = [0, 2, 1, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x4x4x1x1x1x1x1x4xf16>
%113 = vector.transfer_read %alloc[%c0, %64, %87], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x128xf16, #gpu.address_space<workgroup>>, vector<1x4x1xf16>
%114 = vector.transpose %113, [0, 2, 1] : vector<1x4x1xf16> to vector<1x1x4xf16>
%115 = vector.insert_strided_slice %114, %112 {offsets = [0, 2, 2, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x4x4x1x1x1x1x1x4xf16>
%116 = vector.transfer_read %alloc[%c0, %64, %91], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x128xf16, #gpu.address_space<workgroup>>, vector<1x4x1xf16>
%117 = vector.transpose %116, [0, 2, 1] : vector<1x4x1xf16> to vector<1x1x4xf16>
%118 = vector.insert_strided_slice %117, %115 {offsets = [0, 2, 3, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x4x4x1x1x1x1x1x4xf16>
%119 = vector.transfer_read %alloc[%c0, %67, %79], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x128xf16, #gpu.address_space<workgroup>>, vector<1x4x1xf16>
%120 = vector.transpose %119, [0, 2, 1] : vector<1x4x1xf16> to vector<1x1x4xf16>
%121 = vector.insert_strided_slice %120, %118 {offsets = [0, 3, 0, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x4x4x1x1x1x1x1x4xf16>
%122 = vector.transfer_read %alloc[%c0, %67, %83], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x128xf16, #gpu.address_space<workgroup>>, vector<1x4x1xf16>
%123 = vector.transpose %122, [0, 2, 1] : vector<1x4x1xf16> to vector<1x1x4xf16>
%124 = vector.insert_strided_slice %123, %121 {offsets = [0, 3, 1, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x4x4x1x1x1x1x1x4xf16>
%125 = vector.transfer_read %alloc[%c0, %67, %87], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x128xf16, #gpu.address_space<workgroup>>, vector<1x4x1xf16>
%126 = vector.transpose %125, [0, 2, 1] : vector<1x4x1xf16> to vector<1x1x4xf16>
%127 = vector.insert_strided_slice %126, %124 {offsets = [0, 3, 2, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x4x4x1x1x1x1x1x4xf16>
%128 = vector.transfer_read %alloc[%c0, %67, %91], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x128xf16, #gpu.address_space<workgroup>>, vector<1x4x1xf16>
%129 = vector.transpose %128, [0, 2, 1] : vector<1x4x1xf16> to vector<1x1x4xf16>
%130 = vector.insert_strided_slice %129, %127 {offsets = [0, 3, 3, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x4x4x1x1x1x1x1x4xf16>
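// Editor's annotation: the per-iteration contraction follows. The f16
// accumulator is extended to f32, and for each of the 2x4 accumulator
// fragments four chained amdgpu.mfma ops (one per 16-deep k-slice of the
// 64-element k-tile) accumulate the A and B fragments. With m = n = k = 16,
// f16 operands, and an f32 accumulator, each op should lower to gfx942's
// v_mfma_f32_16x16x16_f16, consuming vector<4xf16> per lane and producing
// vector<4xf32>.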
%131 = arith.extf %arg1 : vector<1x2x4x1x1x1x1x1x4xf16> to vector<1x2x4x1x1x1x1x1x4xf32>
%132 = vector.extract %131[0, 0, 0] : vector<1x1x1x1x1x4xf32> from vector<1x2x4x1x1x1x1x1x4xf32>
%133 = vector.extract %78[0, 0, 0] : vector<1x1x1x1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%134 = vector.extract %130[0, 0, 0] : vector<1x1x1x1x1x4xf16> from vector<1x4x4x1x1x1x1x1x4xf16>
%135 = vector.shape_cast %133 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%136 = vector.shape_cast %134 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%137 = vector.shape_cast %132 : vector<1x1x1x1x1x4xf32> to vector<4xf32>
%138 = amdgpu.mfma %135 * %136 + %137 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%139 = vector.extract %78[0, 0, 1] : vector<1x1x1x1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%140 = vector.extract %130[0, 1, 0] : vector<1x1x1x1x1x4xf16> from vector<1x4x4x1x1x1x1x1x4xf16>
%141 = vector.shape_cast %139 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%142 = vector.shape_cast %140 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%143 = amdgpu.mfma %141 * %142 + %138 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%144 = vector.extract %78[0, 0, 2] : vector<1x1x1x1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%145 = vector.extract %130[0, 2, 0] : vector<1x1x1x1x1x4xf16> from vector<1x4x4x1x1x1x1x1x4xf16>
%146 = vector.shape_cast %144 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%147 = vector.shape_cast %145 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%148 = amdgpu.mfma %146 * %147 + %143 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%149 = vector.extract %78[0, 0, 3] : vector<1x1x1x1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%150 = vector.extract %130[0, 3, 0] : vector<1x1x1x1x1x4xf16> from vector<1x4x4x1x1x1x1x1x4xf16>
%151 = vector.shape_cast %149 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%152 = vector.shape_cast %150 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%153 = amdgpu.mfma %151 * %152 + %148 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%154 = vector.shape_cast %153 : vector<4xf32> to vector<1x1x1x1x1x4xf32>
%155 = vector.insert %154, %cst [0, 0, 0] : vector<1x1x1x1x1x4xf32> into vector<1x2x4x1x1x1x1x1x4xf32>
%156 = vector.extract %131[0, 0, 1] : vector<1x1x1x1x1x4xf32> from vector<1x2x4x1x1x1x1x1x4xf32>
%157 = vector.extract %130[0, 0, 1] : vector<1x1x1x1x1x4xf16> from vector<1x4x4x1x1x1x1x1x4xf16>
%158 = vector.shape_cast %157 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%159 = vector.shape_cast %156 : vector<1x1x1x1x1x4xf32> to vector<4xf32>
%160 = amdgpu.mfma %135 * %158 + %159 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%161 = vector.extract %130[0, 1, 1] : vector<1x1x1x1x1x4xf16> from vector<1x4x4x1x1x1x1x1x4xf16>
%162 = vector.shape_cast %161 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%163 = amdgpu.mfma %141 * %162 + %160 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%164 = vector.extract %130[0, 2, 1] : vector<1x1x1x1x1x4xf16> from vector<1x4x4x1x1x1x1x1x4xf16>
%165 = vector.shape_cast %164 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%166 = amdgpu.mfma %146 * %165 + %163 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%167 = vector.extract %130[0, 3, 1] : vector<1x1x1x1x1x4xf16> from vector<1x4x4x1x1x1x1x1x4xf16>
%168 = vector.shape_cast %167 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%169 = amdgpu.mfma %151 * %168 + %166 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%170 = vector.shape_cast %169 : vector<4xf32> to vector<1x1x1x1x1x4xf32>
%171 = vector.insert %170, %155 [0, 0, 1] : vector<1x1x1x1x1x4xf32> into vector<1x2x4x1x1x1x1x1x4xf32>
%172 = vector.extract %131[0, 0, 2] : vector<1x1x1x1x1x4xf32> from vector<1x2x4x1x1x1x1x1x4xf32>
%173 = vector.extract %130[0, 0, 2] : vector<1x1x1x1x1x4xf16> from vector<1x4x4x1x1x1x1x1x4xf16>
%174 = vector.shape_cast %173 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%175 = vector.shape_cast %172 : vector<1x1x1x1x1x4xf32> to vector<4xf32>
%176 = amdgpu.mfma %135 * %174 + %175 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%177 = vector.extract %130[0, 1, 2] : vector<1x1x1x1x1x4xf16> from vector<1x4x4x1x1x1x1x1x4xf16>
%178 = vector.shape_cast %177 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%179 = amdgpu.mfma %141 * %178 + %176 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%180 = vector.extract %130[0, 2, 2] : vector<1x1x1x1x1x4xf16> from vector<1x4x4x1x1x1x1x1x4xf16>
%181 = vector.shape_cast %180 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%182 = amdgpu.mfma %146 * %181 + %179 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%183 = vector.extract %130[0, 3, 2] : vector<1x1x1x1x1x4xf16> from vector<1x4x4x1x1x1x1x1x4xf16>
%184 = vector.shape_cast %183 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%185 = amdgpu.mfma %151 * %184 + %182 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%186 = vector.shape_cast %185 : vector<4xf32> to vector<1x1x1x1x1x4xf32>
%187 = vector.insert %186, %171 [0, 0, 2] : vector<1x1x1x1x1x4xf32> into vector<1x2x4x1x1x1x1x1x4xf32>
%188 = vector.extract %131[0, 0, 3] : vector<1x1x1x1x1x4xf32> from vector<1x2x4x1x1x1x1x1x4xf32>
%189 = vector.extract %130[0, 0, 3] : vector<1x1x1x1x1x4xf16> from vector<1x4x4x1x1x1x1x1x4xf16>
%190 = vector.shape_cast %189 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%191 = vector.shape_cast %188 : vector<1x1x1x1x1x4xf32> to vector<4xf32>
%192 = amdgpu.mfma %135 * %190 + %191 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%193 = vector.extract %130[0, 1, 3] : vector<1x1x1x1x1x4xf16> from vector<1x4x4x1x1x1x1x1x4xf16>
%194 = vector.shape_cast %193 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%195 = amdgpu.mfma %141 * %194 + %192 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%196 = vector.extract %130[0, 2, 3] : vector<1x1x1x1x1x4xf16> from vector<1x4x4x1x1x1x1x1x4xf16>
%197 = vector.shape_cast %196 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%198 = amdgpu.mfma %146 * %197 + %195 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%199 = vector.extract %130[0, 3, 3] : vector<1x1x1x1x1x4xf16> from vector<1x4x4x1x1x1x1x1x4xf16>
%200 = vector.shape_cast %199 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%201 = amdgpu.mfma %151 * %200 + %198 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%202 = vector.shape_cast %201 : vector<4xf32> to vector<1x1x1x1x1x4xf32>
%203 = vector.insert %202, %187 [0, 0, 3] : vector<1x1x1x1x1x4xf32> into vector<1x2x4x1x1x1x1x1x4xf32>
%204 = vector.extract %131[0, 1, 0] : vector<1x1x1x1x1x4xf32> from vector<1x2x4x1x1x1x1x1x4xf32>
%205 = vector.extract %78[0, 1, 0] : vector<1x1x1x1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%206 = vector.shape_cast %205 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%207 = vector.shape_cast %204 : vector<1x1x1x1x1x4xf32> to vector<4xf32>
%208 = amdgpu.mfma %206 * %136 + %207 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%209 = vector.extract %78[0, 1, 1] : vector<1x1x1x1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%210 = vector.shape_cast %209 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%211 = amdgpu.mfma %210 * %142 + %208 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%212 = vector.extract %78[0, 1, 2] : vector<1x1x1x1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%213 = vector.shape_cast %212 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%214 = amdgpu.mfma %213 * %147 + %211 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%215 = vector.extract %78[0, 1, 3] : vector<1x1x1x1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%216 = vector.shape_cast %215 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%217 = amdgpu.mfma %216 * %152 + %214 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%218 = vector.shape_cast %217 : vector<4xf32> to vector<1x1x1x1x1x4xf32>
%219 = vector.insert %218, %203 [0, 1, 0] : vector<1x1x1x1x1x4xf32> into vector<1x2x4x1x1x1x1x1x4xf32>
%220 = vector.extract %131[0, 1, 1] : vector<1x1x1x1x1x4xf32> from vector<1x2x4x1x1x1x1x1x4xf32>
%221 = vector.shape_cast %220 : vector<1x1x1x1x1x4xf32> to vector<4xf32>
%222 = amdgpu.mfma %206 * %158 + %221 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%223 = amdgpu.mfma %210 * %162 + %222 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%224 = amdgpu.mfma %213 * %165 + %223 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%225 = amdgpu.mfma %216 * %168 + %224 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%226 = vector.shape_cast %225 : vector<4xf32> to vector<1x1x1x1x1x4xf32>
%227 = vector.insert %226, %219 [0, 1, 1] : vector<1x1x1x1x1x4xf32> into vector<1x2x4x1x1x1x1x1x4xf32>
%228 = vector.extract %131[0, 1, 2] : vector<1x1x1x1x1x4xf32> from vector<1x2x4x1x1x1x1x1x4xf32>
%229 = vector.shape_cast %228 : vector<1x1x1x1x1x4xf32> to vector<4xf32>
%230 = amdgpu.mfma %206 * %174 + %229 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%231 = amdgpu.mfma %210 * %178 + %230 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%232 = amdgpu.mfma %213 * %181 + %231 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%233 = amdgpu.mfma %216 * %184 + %232 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%234 = vector.shape_cast %233 : vector<4xf32> to vector<1x1x1x1x1x4xf32>
%235 = vector.insert %234, %227 [0, 1, 2] : vector<1x1x1x1x1x4xf32> into vector<1x2x4x1x1x1x1x1x4xf32>
%236 = vector.extract %131[0, 1, 3] : vector<1x1x1x1x1x4xf32> from vector<1x2x4x1x1x1x1x1x4xf32>
%237 = vector.shape_cast %236 : vector<1x1x1x1x1x4xf32> to vector<4xf32>
%238 = amdgpu.mfma %206 * %190 + %237 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%239 = amdgpu.mfma %210 * %194 + %238 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%240 = amdgpu.mfma %213 * %197 + %239 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%241 = amdgpu.mfma %216 * %200 + %240 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%242 = vector.shape_cast %241 : vector<4xf32> to vector<1x1x1x1x1x4xf32>
%243 = vector.insert %242, %235 [0, 1, 3] : vector<1x1x1x1x1x4xf32> into vector<1x2x4x1x1x1x1x1x4xf32>
%244 = arith.truncf %243 : vector<1x2x4x1x1x1x1x1x4xf32> to vector<1x2x4x1x1x1x1x1x4xf16>
scf.yield %244 : vector<1x2x4x1x1x1x1x1x4xf16>
} {__vector_layout_fetcher_storage = [[unit, unit, unit, #iree_vector_ext.nested_layout<subgroups_per_workgroup = [1, 2, 2], batches_per_subgroup = [1, 2, 4], outers_per_batch = [1, 1, 1], threads_per_outer = [1, 4, 16], elements_per_thread = [1, 4, 1], element_order = [0, 2, 1], subgroup_basis = [1, 2, 2, 1], subgroup_active_ids = [true, true, true, false], thread_basis = [1, 4, 16]>], [#iree_vector_ext.nested_layout<subgroups_per_workgroup = [1, 2, 2], batches_per_subgroup = [1, 2, 4], outers_per_batch = [1, 1, 1], threads_per_outer = [1, 4, 16], elements_per_thread = [1, 4, 1], element_order = [0, 2, 1], subgroup_basis = [1, 2, 2, 1], subgroup_active_ids = [true, true, true, false], thread_basis = [1, 4, 16]>]]}
%subview = memref.subview %3[%workgroup_id_z, %4, %5] [1, %6, 128] [1, 1, 1] : memref<64x968x1280xf16, #hal.descriptor_type<storage_buffer>> to memref<1x?x128xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%8:6 = affine.delinearize_index %0 into (%c2, %c2, %c1, %c1, %c4, %c16) : index, index, index, index, index, index
%9 = affine.apply affine_map<()[s0, s1] -> (s0 * 32 + s1 * 4)>()[%8#0, %8#4]
%10 = affine.apply affine_map<()[s0, s1] -> (s0 * 64 + s1)>()[%8#1, %8#5]
%11 = vector.extract %7[0, 0, 0, 0, 0, 0] : vector<1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%12 = vector.transpose %11, [0, 2, 1] : vector<1x1x4xf16> to vector<1x4x1xf16>
vector.transfer_write %12, %subview[%c0, %9, %10] {in_bounds = [true, false, true]} : vector<1x4x1xf16>, memref<1x?x128xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%13 = affine.apply affine_map<()[s0, s1] -> (s0 * 64 + s1 + 16)>()[%8#1, %8#5]
%14 = vector.extract %7[0, 0, 1, 0, 0, 0] : vector<1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%15 = vector.transpose %14, [0, 2, 1] : vector<1x1x4xf16> to vector<1x4x1xf16>
vector.transfer_write %15, %subview[%c0, %9, %13] {in_bounds = [true, false, true]} : vector<1x4x1xf16>, memref<1x?x128xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%16 = affine.apply affine_map<()[s0, s1] -> (s0 * 64 + s1 + 32)>()[%8#1, %8#5]
%17 = vector.extract %7[0, 0, 2, 0, 0, 0] : vector<1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%18 = vector.transpose %17, [0, 2, 1] : vector<1x1x4xf16> to vector<1x4x1xf16>
vector.transfer_write %18, %subview[%c0, %9, %16] {in_bounds = [true, false, true]} : vector<1x4x1xf16>, memref<1x?x128xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%19 = affine.apply affine_map<()[s0, s1] -> (s0 * 64 + s1 + 48)>()[%8#1, %8#5]
%20 = vector.extract %7[0, 0, 3, 0, 0, 0] : vector<1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%21 = vector.transpose %20, [0, 2, 1] : vector<1x1x4xf16> to vector<1x4x1xf16>
vector.transfer_write %21, %subview[%c0, %9, %19] {in_bounds = [true, false, true]} : vector<1x4x1xf16>, memref<1x?x128xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%22 = affine.apply affine_map<()[s0, s1] -> (s0 * 32 + s1 * 4 + 16)>()[%8#0, %8#4]
%23 = vector.extract %7[0, 1, 0, 0, 0, 0] : vector<1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%24 = vector.transpose %23, [0, 2, 1] : vector<1x1x4xf16> to vector<1x4x1xf16>
vector.transfer_write %24, %subview[%c0, %22, %10] {in_bounds = [true, false, true]} : vector<1x4x1xf16>, memref<1x?x128xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%25 = vector.extract %7[0, 1, 1, 0, 0, 0] : vector<1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%26 = vector.transpose %25, [0, 2, 1] : vector<1x1x4xf16> to vector<1x4x1xf16>
vector.transfer_write %26, %subview[%c0, %22, %13] {in_bounds = [true, false, true]} : vector<1x4x1xf16>, memref<1x?x128xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%27 = vector.extract %7[0, 1, 2, 0, 0, 0] : vector<1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%28 = vector.transpose %27, [0, 2, 1] : vector<1x1x4xf16> to vector<1x4x1xf16>
vector.transfer_write %28, %subview[%c0, %22, %16] {in_bounds = [true, false, true]} : vector<1x4x1xf16>, memref<1x?x128xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%29 = vector.extract %7[0, 1, 3, 0, 0, 0] : vector<1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%30 = vector.transpose %29, [0, 2, 1] : vector<1x1x4xf16> to vector<1x4x1xf16>
vector.transfer_write %30, %subview[%c0, %22, %19] {in_bounds = [true, false, true]} : vector<1x4x1xf16>, memref<1x?x128xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
memref.dealloc %alloc_3 : memref<1x64x64xf16, #gpu.address_space<workgroup>>
memref.dealloc %alloc : memref<1x64x128xf16, #gpu.address_space<workgroup>>
return
}
// -----// IR Dump After MemrefCopyToLinalgPass (iree-codegen-memrefcopy-to-linalg) //----- //
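// Editor's annotation: iree-codegen-memrefcopy-to-linalg rewrites
// memref.copy ops into linalg equivalents for uniform later lowering. This
// function contains no memref.copy, so the dump below appears to repeat the
// previous IR verbatim.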
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16() {
%cst = arith.constant dense<0.000000e+00> : vector<1x2x4x1x1x1x1x1x4xf32>
%cst_0 = arith.constant dense<0.000000e+00> : vector<1x4x4x1x1x1x1x1x4xf16>
%c4 = arith.constant 4 : index
%c2 = arith.constant 2 : index
%c16 = arith.constant 16 : index
%c8 = arith.constant 8 : index
%c32 = arith.constant 32 : index
%cst_1 = arith.constant dense<0.000000e+00> : vector<1x2x4x1x1x1x1x1x4xf16>
%c20 = arith.constant 20 : index
%c1 = arith.constant 1 : index
%cst_2 = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%thread_id_x = gpu.thread_id x
%thread_id_y = gpu.thread_id y
%thread_id_z = gpu.thread_id z
%0 = affine.apply affine_map<()[s0, s1, s2] -> (s0 + s1 * 128 + s2 * 256)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%alloc = memref.alloc() : memref<1x64x128xf16, #gpu.address_space<workgroup>>
%alloc_3 = memref.alloc() : memref<1x64x64xf16, #gpu.address_space<workgroup>>
%1 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<64x968x1280xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 64 : memref<64x968x1280xf16, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<64x1280x1280xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 64 : memref<64x1280x1280xf16, #hal.descriptor_type<storage_buffer>>
%3 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<64x968x1280xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %3, 64 : memref<64x968x1280xf16, #hal.descriptor_type<storage_buffer>>
%workgroup_id_z = hal.interface.workgroup.id[2] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y]
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%5 = affine.apply affine_map<()[s0] -> (s0 * 128)>()[%workgroup_id_x]
%6 = affine.min affine_map<()[s0] -> (s0 * -64 + 968, 64)>()[%workgroup_id_y]
%7 = scf.for %arg0 = %c0 to %c20 step %c1 iter_args(%arg1 = %cst_1) -> (vector<1x2x4x1x1x1x1x1x4xf16>) {
%31 = affine.delinearize_index %arg0 into (%c20) : index
%32 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y]
%33 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%31]
%subview_4 = memref.subview %1[%workgroup_id_z, %32, %33] [1, %6, 64] [1, 1, 1] : memref<64x968x1280xf16, #hal.descriptor_type<storage_buffer>> to memref<1x?x64xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%34:2 = affine.delinearize_index %0 into (%c32, %c8) : index, index
%35 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%34#1]
%36 = vector.transfer_read %subview_4[%c0, %34#0, %35], %cst_2 {in_bounds = [true, false, true]} : memref<1x?x64xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<1x1x8xf16>
%37 = affine.apply affine_map<()[s0] -> (s0 + 32)>()[%34#0]
%38 = vector.transfer_read %subview_4[%c0, %37, %35], %cst_2 {in_bounds = [true, false, true]} : memref<1x?x64xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<1x1x8xf16>
%39:2 = affine.delinearize_index %0 into (%c16, %c16) : index, index
%40 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%39#1]
%41 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 64)>()[%39#0, %31]
%42 = affine.apply affine_map<()[s0, s1] -> (s0 * 128 + s1 * 8)>()[%workgroup_id_x, %39#1]
%43 = vector.transfer_read %2[%workgroup_id_z, %41, %42], %cst_2 {in_bounds = [true, true, true]} : memref<64x1280x1280xf16, #hal.descriptor_type<storage_buffer>>, vector<1x1x8xf16>
%44 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 64 + 16)>()[%39#0, %31]
%45 = affine.apply affine_map<()[s0, s1] -> (s0 * 128 + s1 * 8)>()[%workgroup_id_x, %39#1]
%46 = vector.transfer_read %2[%workgroup_id_z, %44, %45], %cst_2 {in_bounds = [true, true, true]} : memref<64x1280x1280xf16, #hal.descriptor_type<storage_buffer>>, vector<1x1x8xf16>
%47 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 64 + 32)>()[%39#0, %31]
%48 = affine.apply affine_map<()[s0, s1] -> (s0 * 128 + s1 * 8)>()[%workgroup_id_x, %39#1]
%49 = vector.transfer_read %2[%workgroup_id_z, %47, %48], %cst_2 {in_bounds = [true, true, true]} : memref<64x1280x1280xf16, #hal.descriptor_type<storage_buffer>>, vector<1x1x8xf16>
%50 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 64 + 48)>()[%39#0, %31]
%51 = affine.apply affine_map<()[s0, s1] -> (s0 * 128 + s1 * 8)>()[%workgroup_id_x, %39#1]
%52 = vector.transfer_read %2[%workgroup_id_z, %50, %51], %cst_2 {in_bounds = [true, true, true]} : memref<64x1280x1280xf16, #hal.descriptor_type<storage_buffer>>, vector<1x1x8xf16>
gpu.barrier
vector.transfer_write %36, %alloc_3[%c0, %34#0, %35] {in_bounds = [true, true, true]} : vector<1x1x8xf16>, memref<1x64x64xf16, #gpu.address_space<workgroup>>
vector.transfer_write %38, %alloc_3[%c0, %37, %35] {in_bounds = [true, true, true]} : vector<1x1x8xf16>, memref<1x64x64xf16, #gpu.address_space<workgroup>>
gpu.barrier
vector.transfer_write %43, %alloc[%c0, %39#0, %40] {in_bounds = [true, true, true]} : vector<1x1x8xf16>, memref<1x64x128xf16, #gpu.address_space<workgroup>>
%53 = affine.apply affine_map<()[s0] -> (s0 + 16)>()[%39#0]
vector.transfer_write %46, %alloc[%c0, %53, %40] {in_bounds = [true, true, true]} : vector<1x1x8xf16>, memref<1x64x128xf16, #gpu.address_space<workgroup>>
%54 = affine.apply affine_map<()[s0] -> (s0 + 32)>()[%39#0]
vector.transfer_write %49, %alloc[%c0, %54, %40] {in_bounds = [true, true, true]} : vector<1x1x8xf16>, memref<1x64x128xf16, #gpu.address_space<workgroup>>
%55 = affine.apply affine_map<()[s0] -> (s0 + 48)>()[%39#0]
vector.transfer_write %52, %alloc[%c0, %55, %40] {in_bounds = [true, true, true]} : vector<1x1x8xf16>, memref<1x64x128xf16, #gpu.address_space<workgroup>>
gpu.barrier
%56:6 = affine.delinearize_index %0 into (%c2, %c2, %c1, %c1, %c4, %c16) : index, index, index, index, index, index
%57 = affine.apply affine_map<()[s0, s1] -> (s0 * 32 + s1)>()[%56#0, %56#5]
%58 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%56#4]
%59 = vector.transfer_read %alloc_3[%c0, %57, %58], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x64xf16, #gpu.address_space<workgroup>>, vector<1x1x4xf16>
%60 = vector.insert_strided_slice %59, %cst_1 {offsets = [0, 0, 0, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x2x4x1x1x1x1x1x4xf16>
%61 = affine.apply affine_map<()[s0] -> (s0 * 4 + 16)>()[%56#4]
%62 = vector.transfer_read %alloc_3[%c0, %57, %61], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x64xf16, #gpu.address_space<workgroup>>, vector<1x1x4xf16>
%63 = vector.insert_strided_slice %62, %60 {offsets = [0, 0, 1, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x2x4x1x1x1x1x1x4xf16>
%64 = affine.apply affine_map<()[s0] -> (s0 * 4 + 32)>()[%56#4]
%65 = vector.transfer_read %alloc_3[%c0, %57, %64], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x64xf16, #gpu.address_space<workgroup>>, vector<1x1x4xf16>
%66 = vector.insert_strided_slice %65, %63 {offsets = [0, 0, 2, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x2x4x1x1x1x1x1x4xf16>
%67 = affine.apply affine_map<()[s0] -> (s0 * 4 + 48)>()[%56#4]
%68 = vector.transfer_read %alloc_3[%c0, %57, %67], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x64xf16, #gpu.address_space<workgroup>>, vector<1x1x4xf16>
%69 = vector.insert_strided_slice %68, %66 {offsets = [0, 0, 3, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x2x4x1x1x1x1x1x4xf16>
%70 = affine.apply affine_map<()[s0, s1] -> (s0 * 32 + s1 + 16)>()[%56#0, %56#5]
%71 = vector.transfer_read %alloc_3[%c0, %70, %58], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x64xf16, #gpu.address_space<workgroup>>, vector<1x1x4xf16>
%72 = vector.insert_strided_slice %71, %69 {offsets = [0, 1, 0, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x2x4x1x1x1x1x1x4xf16>
%73 = vector.transfer_read %alloc_3[%c0, %70, %61], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x64xf16, #gpu.address_space<workgroup>>, vector<1x1x4xf16>
%74 = vector.insert_strided_slice %73, %72 {offsets = [0, 1, 1, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x2x4x1x1x1x1x1x4xf16>
%75 = vector.transfer_read %alloc_3[%c0, %70, %64], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x64xf16, #gpu.address_space<workgroup>>, vector<1x1x4xf16>
%76 = vector.insert_strided_slice %75, %74 {offsets = [0, 1, 2, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x2x4x1x1x1x1x1x4xf16>
%77 = vector.transfer_read %alloc_3[%c0, %70, %67], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x64xf16, #gpu.address_space<workgroup>>, vector<1x1x4xf16>
%78 = vector.insert_strided_slice %77, %76 {offsets = [0, 1, 3, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x2x4x1x1x1x1x1x4xf16>
%79 = affine.apply affine_map<()[s0, s1] -> (s0 * 64 + s1)>()[%56#1, %56#5]
%80 = vector.transfer_read %alloc[%c0, %58, %79], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x128xf16, #gpu.address_space<workgroup>>, vector<1x4x1xf16>
%81 = vector.transpose %80, [0, 2, 1] : vector<1x4x1xf16> to vector<1x1x4xf16>
%82 = vector.insert_strided_slice %81, %cst_0 {offsets = [0, 0, 0, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x4x4x1x1x1x1x1x4xf16>
%83 = affine.apply affine_map<()[s0, s1] -> (s0 * 64 + s1 + 16)>()[%56#1, %56#5]
%84 = vector.transfer_read %alloc[%c0, %58, %83], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x128xf16, #gpu.address_space<workgroup>>, vector<1x4x1xf16>
%85 = vector.transpose %84, [0, 2, 1] : vector<1x4x1xf16> to vector<1x1x4xf16>
%86 = vector.insert_strided_slice %85, %82 {offsets = [0, 0, 1, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x4x4x1x1x1x1x1x4xf16>
%87 = affine.apply affine_map<()[s0, s1] -> (s0 * 64 + s1 + 32)>()[%56#1, %56#5]
%88 = vector.transfer_read %alloc[%c0, %58, %87], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x128xf16, #gpu.address_space<workgroup>>, vector<1x4x1xf16>
%89 = vector.transpose %88, [0, 2, 1] : vector<1x4x1xf16> to vector<1x1x4xf16>
%90 = vector.insert_strided_slice %89, %86 {offsets = [0, 0, 2, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x4x4x1x1x1x1x1x4xf16>
%91 = affine.apply affine_map<()[s0, s1] -> (s0 * 64 + s1 + 48)>()[%56#1, %56#5]
%92 = vector.transfer_read %alloc[%c0, %58, %91], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x128xf16, #gpu.address_space<workgroup>>, vector<1x4x1xf16>
%93 = vector.transpose %92, [0, 2, 1] : vector<1x4x1xf16> to vector<1x1x4xf16>
%94 = vector.insert_strided_slice %93, %90 {offsets = [0, 0, 3, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x4x4x1x1x1x1x1x4xf16>
%95 = vector.transfer_read %alloc[%c0, %61, %79], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x128xf16, #gpu.address_space<workgroup>>, vector<1x4x1xf16>
%96 = vector.transpose %95, [0, 2, 1] : vector<1x4x1xf16> to vector<1x1x4xf16>
%97 = vector.insert_strided_slice %96, %94 {offsets = [0, 1, 0, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x4x4x1x1x1x1x1x4xf16>
%98 = vector.transfer_read %alloc[%c0, %61, %83], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x128xf16, #gpu.address_space<workgroup>>, vector<1x4x1xf16>
%99 = vector.transpose %98, [0, 2, 1] : vector<1x4x1xf16> to vector<1x1x4xf16>
%100 = vector.insert_strided_slice %99, %97 {offsets = [0, 1, 1, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x4x4x1x1x1x1x1x4xf16>
%101 = vector.transfer_read %alloc[%c0, %61, %87], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x128xf16, #gpu.address_space<workgroup>>, vector<1x4x1xf16>
%102 = vector.transpose %101, [0, 2, 1] : vector<1x4x1xf16> to vector<1x1x4xf16>
%103 = vector.insert_strided_slice %102, %100 {offsets = [0, 1, 2, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x4x4x1x1x1x1x1x4xf16>
%104 = vector.transfer_read %alloc[%c0, %61, %91], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x128xf16, #gpu.address_space<workgroup>>, vector<1x4x1xf16>
%105 = vector.transpose %104, [0, 2, 1] : vector<1x4x1xf16> to vector<1x1x4xf16>
%106 = vector.insert_strided_slice %105, %103 {offsets = [0, 1, 3, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x4x4x1x1x1x1x1x4xf16>
%107 = vector.transfer_read %alloc[%c0, %64, %79], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x128xf16, #gpu.address_space<workgroup>>, vector<1x4x1xf16>
%108 = vector.transpose %107, [0, 2, 1] : vector<1x4x1xf16> to vector<1x1x4xf16>
%109 = vector.insert_strided_slice %108, %106 {offsets = [0, 2, 0, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x4x4x1x1x1x1x1x4xf16>
%110 = vector.transfer_read %alloc[%c0, %64, %83], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x128xf16, #gpu.address_space<workgroup>>, vector<1x4x1xf16>
%111 = vector.transpose %110, [0, 2, 1] : vector<1x4x1xf16> to vector<1x1x4xf16>
%112 = vector.insert_strided_slice %111, %109 {offsets = [0, 2, 1, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x4x4x1x1x1x1x1x4xf16>
%113 = vector.transfer_read %alloc[%c0, %64, %87], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x128xf16, #gpu.address_space<workgroup>>, vector<1x4x1xf16>
%114 = vector.transpose %113, [0, 2, 1] : vector<1x4x1xf16> to vector<1x1x4xf16>
%115 = vector.insert_strided_slice %114, %112 {offsets = [0, 2, 2, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x4x4x1x1x1x1x1x4xf16>
%116 = vector.transfer_read %alloc[%c0, %64, %91], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x128xf16, #gpu.address_space<workgroup>>, vector<1x4x1xf16>
%117 = vector.transpose %116, [0, 2, 1] : vector<1x4x1xf16> to vector<1x1x4xf16>
%118 = vector.insert_strided_slice %117, %115 {offsets = [0, 2, 3, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x4x4x1x1x1x1x1x4xf16>
%119 = vector.transfer_read %alloc[%c0, %67, %79], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x128xf16, #gpu.address_space<workgroup>>, vector<1x4x1xf16>
%120 = vector.transpose %119, [0, 2, 1] : vector<1x4x1xf16> to vector<1x1x4xf16>
%121 = vector.insert_strided_slice %120, %118 {offsets = [0, 3, 0, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x4x4x1x1x1x1x1x4xf16>
%122 = vector.transfer_read %alloc[%c0, %67, %83], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x128xf16, #gpu.address_space<workgroup>>, vector<1x4x1xf16>
%123 = vector.transpose %122, [0, 2, 1] : vector<1x4x1xf16> to vector<1x1x4xf16>
%124 = vector.insert_strided_slice %123, %121 {offsets = [0, 3, 1, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x4x4x1x1x1x1x1x4xf16>
%125 = vector.transfer_read %alloc[%c0, %67, %87], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x128xf16, #gpu.address_space<workgroup>>, vector<1x4x1xf16>
%126 = vector.transpose %125, [0, 2, 1] : vector<1x4x1xf16> to vector<1x1x4xf16>
%127 = vector.insert_strided_slice %126, %124 {offsets = [0, 3, 2, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x4x4x1x1x1x1x1x4xf16>
%128 = vector.transfer_read %alloc[%c0, %67, %91], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x128xf16, #gpu.address_space<workgroup>>, vector<1x4x1xf16>
%129 = vector.transpose %128, [0, 2, 1] : vector<1x4x1xf16> to vector<1x1x4xf16>
%130 = vector.insert_strided_slice %129, %127 {offsets = [0, 3, 3, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x4x4x1x1x1x1x1x4xf16>
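    // Note (annotation, not part of the original dump): the sixteen
    // transfer_read / transpose / insert_strided_slice triples above gather the
    // RHS tile for this k-step from workgroup memory. Each lane reads a 1x4x1
    // column of %alloc (stored batch x K x N) and transposes it into a 1x1x4
    // k-contiguous fragment, packing a 4x4 grid of fragments (4 k-batches x
    // 4 n-batches) into vector<1x4x4x1x1x1x1x1x4xf16>.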
%131 = arith.extf %arg1 : vector<1x2x4x1x1x1x1x1x4xf16> to vector<1x2x4x1x1x1x1x1x4xf32>
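    // Note: the loop-carried accumulator is kept in f16 between k-steps; it is
    // widened to f32 here so the MFMAs below accumulate in f32, then narrowed
    // back with arith.truncf just before scf.yield. Partial sums are therefore
    // rounded to f16 once per iteration of the k-loop.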
%132 = vector.extract %131[0, 0, 0] : vector<1x1x1x1x1x4xf32> from vector<1x2x4x1x1x1x1x1x4xf32>
%133 = vector.extract %78[0, 0, 0] : vector<1x1x1x1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%134 = vector.extract %130[0, 0, 0] : vector<1x1x1x1x1x4xf16> from vector<1x4x4x1x1x1x1x1x4xf16>
%135 = vector.shape_cast %133 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%136 = vector.shape_cast %134 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%137 = vector.shape_cast %132 : vector<1x1x1x1x1x4xf32> to vector<4xf32>
%138 = amdgpu.mfma %135 * %136 + %137 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%139 = vector.extract %78[0, 0, 1] : vector<1x1x1x1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%140 = vector.extract %130[0, 1, 0] : vector<1x1x1x1x1x4xf16> from vector<1x4x4x1x1x1x1x1x4xf16>
%141 = vector.shape_cast %139 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%142 = vector.shape_cast %140 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%143 = amdgpu.mfma %141 * %142 + %138 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%144 = vector.extract %78[0, 0, 2] : vector<1x1x1x1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%145 = vector.extract %130[0, 2, 0] : vector<1x1x1x1x1x4xf16> from vector<1x4x4x1x1x1x1x1x4xf16>
%146 = vector.shape_cast %144 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%147 = vector.shape_cast %145 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%148 = amdgpu.mfma %146 * %147 + %143 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%149 = vector.extract %78[0, 0, 3] : vector<1x1x1x1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%150 = vector.extract %130[0, 3, 0] : vector<1x1x1x1x1x4xf16> from vector<1x4x4x1x1x1x1x1x4xf16>
%151 = vector.shape_cast %149 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%152 = vector.shape_cast %150 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%153 = amdgpu.mfma %151 * %152 + %148 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
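    // Note: each amdgpu.mfma with m = n = k = 16 corresponds to the CDNA
    // v_mfma_f32_16x16x16_f16 intrinsic: per lane of a 64-lane wavefront it
    // takes a vector<4xf16> A fragment, a vector<4xf16> B fragment, and a
    // vector<4xf32> accumulator. The chain of four mfmas above reduces this
    // iteration's full 64-wide k-slice (4 batches of k = 16) into a single
    // 16x16 accumulator tile.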
%154 = vector.shape_cast %153 : vector<4xf32> to vector<1x1x1x1x1x4xf32>
%155 = vector.insert %154, %cst [0, 0, 0] : vector<1x1x1x1x1x4xf32> into vector<1x2x4x1x1x1x1x1x4xf32>
%156 = vector.extract %131[0, 0, 1] : vector<1x1x1x1x1x4xf32> from vector<1x2x4x1x1x1x1x1x4xf32>
%157 = vector.extract %130[0, 0, 1] : vector<1x1x1x1x1x4xf16> from vector<1x4x4x1x1x1x1x1x4xf16>
%158 = vector.shape_cast %157 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%159 = vector.shape_cast %156 : vector<1x1x1x1x1x4xf32> to vector<4xf32>
%160 = amdgpu.mfma %135 * %158 + %159 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%161 = vector.extract %130[0, 1, 1] : vector<1x1x1x1x1x4xf16> from vector<1x4x4x1x1x1x1x1x4xf16>
%162 = vector.shape_cast %161 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%163 = amdgpu.mfma %141 * %162 + %160 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%164 = vector.extract %130[0, 2, 1] : vector<1x1x1x1x1x4xf16> from vector<1x4x4x1x1x1x1x1x4xf16>
%165 = vector.shape_cast %164 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%166 = amdgpu.mfma %146 * %165 + %163 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%167 = vector.extract %130[0, 3, 1] : vector<1x1x1x1x1x4xf16> from vector<1x4x4x1x1x1x1x1x4xf16>
%168 = vector.shape_cast %167 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%169 = amdgpu.mfma %151 * %168 + %166 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%170 = vector.shape_cast %169 : vector<4xf32> to vector<1x1x1x1x1x4xf32>
%171 = vector.insert %170, %155 [0, 0, 1] : vector<1x1x1x1x1x4xf32> into vector<1x2x4x1x1x1x1x1x4xf32>
%172 = vector.extract %131[0, 0, 2] : vector<1x1x1x1x1x4xf32> from vector<1x2x4x1x1x1x1x1x4xf32>
%173 = vector.extract %130[0, 0, 2] : vector<1x1x1x1x1x4xf16> from vector<1x4x4x1x1x1x1x1x4xf16>
%174 = vector.shape_cast %173 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%175 = vector.shape_cast %172 : vector<1x1x1x1x1x4xf32> to vector<4xf32>
%176 = amdgpu.mfma %135 * %174 + %175 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%177 = vector.extract %130[0, 1, 2] : vector<1x1x1x1x1x4xf16> from vector<1x4x4x1x1x1x1x1x4xf16>
%178 = vector.shape_cast %177 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%179 = amdgpu.mfma %141 * %178 + %176 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%180 = vector.extract %130[0, 2, 2] : vector<1x1x1x1x1x4xf16> from vector<1x4x4x1x1x1x1x1x4xf16>
%181 = vector.shape_cast %180 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%182 = amdgpu.mfma %146 * %181 + %179 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%183 = vector.extract %130[0, 3, 2] : vector<1x1x1x1x1x4xf16> from vector<1x4x4x1x1x1x1x1x4xf16>
%184 = vector.shape_cast %183 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%185 = amdgpu.mfma %151 * %184 + %182 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%186 = vector.shape_cast %185 : vector<4xf32> to vector<1x1x1x1x1x4xf32>
%187 = vector.insert %186, %171 [0, 0, 2] : vector<1x1x1x1x1x4xf32> into vector<1x2x4x1x1x1x1x1x4xf32>
%188 = vector.extract %131[0, 0, 3] : vector<1x1x1x1x1x4xf32> from vector<1x2x4x1x1x1x1x1x4xf32>
%189 = vector.extract %130[0, 0, 3] : vector<1x1x1x1x1x4xf16> from vector<1x4x4x1x1x1x1x1x4xf16>
%190 = vector.shape_cast %189 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%191 = vector.shape_cast %188 : vector<1x1x1x1x1x4xf32> to vector<4xf32>
%192 = amdgpu.mfma %135 * %190 + %191 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%193 = vector.extract %130[0, 1, 3] : vector<1x1x1x1x1x4xf16> from vector<1x4x4x1x1x1x1x1x4xf16>
%194 = vector.shape_cast %193 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%195 = amdgpu.mfma %141 * %194 + %192 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%196 = vector.extract %130[0, 2, 3] : vector<1x1x1x1x1x4xf16> from vector<1x4x4x1x1x1x1x1x4xf16>
%197 = vector.shape_cast %196 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%198 = amdgpu.mfma %146 * %197 + %195 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%199 = vector.extract %130[0, 3, 3] : vector<1x1x1x1x1x4xf16> from vector<1x4x4x1x1x1x1x1x4xf16>
%200 = vector.shape_cast %199 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%201 = amdgpu.mfma %151 * %200 + %198 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%202 = vector.shape_cast %201 : vector<4xf32> to vector<1x1x1x1x1x4xf32>
%203 = vector.insert %202, %187 [0, 0, 3] : vector<1x1x1x1x1x4xf32> into vector<1x2x4x1x1x1x1x1x4xf32>
%204 = vector.extract %131[0, 1, 0] : vector<1x1x1x1x1x4xf32> from vector<1x2x4x1x1x1x1x1x4xf32>
%205 = vector.extract %78[0, 1, 0] : vector<1x1x1x1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%206 = vector.shape_cast %205 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%207 = vector.shape_cast %204 : vector<1x1x1x1x1x4xf32> to vector<4xf32>
%208 = amdgpu.mfma %206 * %136 + %207 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%209 = vector.extract %78[0, 1, 1] : vector<1x1x1x1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%210 = vector.shape_cast %209 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%211 = amdgpu.mfma %210 * %142 + %208 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%212 = vector.extract %78[0, 1, 2] : vector<1x1x1x1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%213 = vector.shape_cast %212 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%214 = amdgpu.mfma %213 * %147 + %211 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%215 = vector.extract %78[0, 1, 3] : vector<1x1x1x1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%216 = vector.shape_cast %215 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%217 = amdgpu.mfma %216 * %152 + %214 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%218 = vector.shape_cast %217 : vector<4xf32> to vector<1x1x1x1x1x4xf32>
%219 = vector.insert %218, %203 [0, 1, 0] : vector<1x1x1x1x1x4xf32> into vector<1x2x4x1x1x1x1x1x4xf32>
%220 = vector.extract %131[0, 1, 1] : vector<1x1x1x1x1x4xf32> from vector<1x2x4x1x1x1x1x1x4xf32>
%221 = vector.shape_cast %220 : vector<1x1x1x1x1x4xf32> to vector<4xf32>
%222 = amdgpu.mfma %206 * %158 + %221 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%223 = amdgpu.mfma %210 * %162 + %222 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%224 = amdgpu.mfma %213 * %165 + %223 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%225 = amdgpu.mfma %216 * %168 + %224 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%226 = vector.shape_cast %225 : vector<4xf32> to vector<1x1x1x1x1x4xf32>
%227 = vector.insert %226, %219 [0, 1, 1] : vector<1x1x1x1x1x4xf32> into vector<1x2x4x1x1x1x1x1x4xf32>
%228 = vector.extract %131[0, 1, 2] : vector<1x1x1x1x1x4xf32> from vector<1x2x4x1x1x1x1x1x4xf32>
%229 = vector.shape_cast %228 : vector<1x1x1x1x1x4xf32> to vector<4xf32>
%230 = amdgpu.mfma %206 * %174 + %229 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%231 = amdgpu.mfma %210 * %178 + %230 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%232 = amdgpu.mfma %213 * %181 + %231 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%233 = amdgpu.mfma %216 * %184 + %232 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%234 = vector.shape_cast %233 : vector<4xf32> to vector<1x1x1x1x1x4xf32>
%235 = vector.insert %234, %227 [0, 1, 2] : vector<1x1x1x1x1x4xf32> into vector<1x2x4x1x1x1x1x1x4xf32>
%236 = vector.extract %131[0, 1, 3] : vector<1x1x1x1x1x4xf32> from vector<1x2x4x1x1x1x1x1x4xf32>
%237 = vector.shape_cast %236 : vector<1x1x1x1x1x4xf32> to vector<4xf32>
%238 = amdgpu.mfma %206 * %190 + %237 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%239 = amdgpu.mfma %210 * %194 + %238 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%240 = amdgpu.mfma %213 * %197 + %239 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%241 = amdgpu.mfma %216 * %200 + %240 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%242 = vector.shape_cast %241 : vector<4xf32> to vector<1x1x1x1x1x4xf32>
%243 = vector.insert %242, %235 [0, 1, 3] : vector<1x1x1x1x1x4xf32> into vector<1x2x4x1x1x1x1x1x4xf32>
%244 = arith.truncf %243 : vector<1x2x4x1x1x1x1x1x4xf32> to vector<1x2x4x1x1x1x1x1x4xf16>
scf.yield %244 : vector<1x2x4x1x1x1x1x1x4xf16>
} {__vector_layout_fetcher_storage = [[unit, unit, unit, #iree_vector_ext.nested_layout<subgroups_per_workgroup = [1, 2, 2], batches_per_subgroup = [1, 2, 4], outers_per_batch = [1, 1, 1], threads_per_outer = [1, 4, 16], elements_per_thread = [1, 4, 1], element_order = [0, 2, 1], subgroup_basis = [1, 2, 2, 1], subgroup_active_ids= [true, true, true, false], thread_basis = [1, 4, 16]>], [#iree_vector_ext.nested_layout<subgroups_per_workgroup = [1, 2, 2], batches_per_subgroup = [1, 2, 4], outers_per_batch = [1, 1, 1], threads_per_outer = [1, 4, 16], elements_per_thread = [1, 4, 1], element_order = [0, 2, 1], subgroup_basis = [1, 2, 2, 1], subgroup_active_ids= [true, true, true, false], thread_basis = [1, 4, 16]>]]}
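  // Note: the __vector_layout_fetcher_storage attribute above appears to record
  // how the loop-carried vectors are distributed: per workgroup, 1x2x2
  // subgroups, each holding 1x2x4 batches of 1x1x1 outers, with 1x4x16 threads
  // per outer and 1x4x1 elements per thread; element_order = [0, 2, 1] keeps
  // the k-contiguous elements innermost in registers.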
%subview = memref.subview %3[%workgroup_id_z, %4, %5] [1, %6, 128] [1, 1, 1] : memref<64x968x1280xf16, #hal.descriptor_type<storage_buffer>> to memref<1x?x128xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%8:6 = affine.delinearize_index %0 into (%c2, %c2, %c1, %c1, %c4, %c16) : index, index, index, index, index, index
%9 = affine.apply affine_map<()[s0, s1] -> (s0 * 32 + s1 * 4)>()[%8#0, %8#4]
%10 = affine.apply affine_map<()[s0, s1] -> (s0 * 64 + s1)>()[%8#1, %8#5]
%11 = vector.extract %7[0, 0, 0, 0, 0, 0] : vector<1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%12 = vector.transpose %11, [0, 2, 1] : vector<1x1x4xf16> to vector<1x4x1xf16>
vector.transfer_write %12, %subview[%c0, %9, %10] {in_bounds = [true, false, true]} : vector<1x4x1xf16>, memref<1x?x128xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%13 = affine.apply affine_map<()[s0, s1] -> (s0 * 64 + s1 + 16)>()[%8#1, %8#5]
%14 = vector.extract %7[0, 0, 1, 0, 0, 0] : vector<1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%15 = vector.transpose %14, [0, 2, 1] : vector<1x1x4xf16> to vector<1x4x1xf16>
vector.transfer_write %15, %subview[%c0, %9, %13] {in_bounds = [true, false, true]} : vector<1x4x1xf16>, memref<1x?x128xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%16 = affine.apply affine_map<()[s0, s1] -> (s0 * 64 + s1 + 32)>()[%8#1, %8#5]
%17 = vector.extract %7[0, 0, 2, 0, 0, 0] : vector<1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%18 = vector.transpose %17, [0, 2, 1] : vector<1x1x4xf16> to vector<1x4x1xf16>
vector.transfer_write %18, %subview[%c0, %9, %16] {in_bounds = [true, false, true]} : vector<1x4x1xf16>, memref<1x?x128xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%19 = affine.apply affine_map<()[s0, s1] -> (s0 * 64 + s1 + 48)>()[%8#1, %8#5]
%20 = vector.extract %7[0, 0, 3, 0, 0, 0] : vector<1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%21 = vector.transpose %20, [0, 2, 1] : vector<1x1x4xf16> to vector<1x4x1xf16>
vector.transfer_write %21, %subview[%c0, %9, %19] {in_bounds = [true, false, true]} : vector<1x4x1xf16>, memref<1x?x128xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%22 = affine.apply affine_map<()[s0, s1] -> (s0 * 32 + s1 * 4 + 16)>()[%8#0, %8#4]
%23 = vector.extract %7[0, 1, 0, 0, 0, 0] : vector<1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%24 = vector.transpose %23, [0, 2, 1] : vector<1x1x4xf16> to vector<1x4x1xf16>
vector.transfer_write %24, %subview[%c0, %22, %10] {in_bounds = [true, false, true]} : vector<1x4x1xf16>, memref<1x?x128xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%25 = vector.extract %7[0, 1, 1, 0, 0, 0] : vector<1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%26 = vector.transpose %25, [0, 2, 1] : vector<1x1x4xf16> to vector<1x4x1xf16>
vector.transfer_write %26, %subview[%c0, %22, %13] {in_bounds = [true, false, true]} : vector<1x4x1xf16>, memref<1x?x128xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%27 = vector.extract %7[0, 1, 2, 0, 0, 0] : vector<1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%28 = vector.transpose %27, [0, 2, 1] : vector<1x1x4xf16> to vector<1x4x1xf16>
vector.transfer_write %28, %subview[%c0, %22, %16] {in_bounds = [true, false, true]} : vector<1x4x1xf16>, memref<1x?x128xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%29 = vector.extract %7[0, 1, 3, 0, 0, 0] : vector<1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%30 = vector.transpose %29, [0, 2, 1] : vector<1x1x4xf16> to vector<1x4x1xf16>
vector.transfer_write %30, %subview[%c0, %22, %19] {in_bounds = [true, false, true]} : vector<1x4x1xf16>, memref<1x?x128xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
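  // Note: writeback transposes each of the thread's eight 1x1x4 accumulator
  // fragments back to 1x4x1 and stores them to the output subview. The M
  // dimension is marked potentially out of bounds (in_bounds = [true, false,
  // true]) because 968 = 15 * 64 + 8: the last workgroup row covers only a
  // partial 64-row tile, clamped by %6.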
memref.dealloc %alloc_3 : memref<1x64x64xf16, #gpu.address_space<workgroup>>
memref.dealloc %alloc : memref<1x64x128xf16, #gpu.address_space<workgroup>>
return
}
// -----// IR Dump After GPUDistributeSharedMemoryCopy (iree-codegen-gpu-distribute-shared-memory-copy) //----- //
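// Note: this pass distributes copies to and from workgroup memory across the
// threads of the workgroup; in the portion of the dump visible below, the
// global-to-LDS staging already appears as per-thread vector transfers.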
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16() {
%cst = arith.constant dense<0.000000e+00> : vector<1x2x4x1x1x1x1x1x4xf32>
%cst_0 = arith.constant dense<0.000000e+00> : vector<1x4x4x1x1x1x1x1x4xf16>
%c4 = arith.constant 4 : index
%c2 = arith.constant 2 : index
%c16 = arith.constant 16 : index
%c8 = arith.constant 8 : index
%c32 = arith.constant 32 : index
%cst_1 = arith.constant dense<0.000000e+00> : vector<1x2x4x1x1x1x1x1x4xf16>
%c20 = arith.constant 20 : index
%c1 = arith.constant 1 : index
%cst_2 = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%thread_id_x = gpu.thread_id x
%thread_id_y = gpu.thread_id y
%thread_id_z = gpu.thread_id z
%0 = affine.apply affine_map<()[s0, s1, s2] -> (s0 + s1 * 128 + s2 * 256)>()[%thread_id_x, %thread_id_y, %thread_id_z]
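  // Note: %0 linearizes the 3-D thread id into a flat id; the map implies a
  // workgroup of 128 threads in x and 2 in y, i.e. 256 threads total, which is
  // four 64-lane waves on gfx942.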
%alloc = memref.alloc() : memref<1x64x128xf16, #gpu.address_space<workgroup>>
%alloc_3 = memref.alloc() : memref<1x64x64xf16, #gpu.address_space<workgroup>>
%1 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<64x968x1280xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 64 : memref<64x968x1280xf16, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<64x1280x1280xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 64 : memref<64x1280x1280xf16, #hal.descriptor_type<storage_buffer>>
%3 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<64x968x1280xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %3, 64 : memref<64x968x1280xf16, #hal.descriptor_type<storage_buffer>>
%workgroup_id_z = hal.interface.workgroup.id[2] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y]
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%5 = affine.apply affine_map<()[s0] -> (s0 * 128)>()[%workgroup_id_x]
%6 = affine.min affine_map<()[s0] -> (s0 * -64 + 968, 64)>()[%workgroup_id_y]
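  // Note: each workgroup owns a 64x128 (M x N) output tile of one batch, and
  // the loop below consumes K = 1280 in 20 steps of 64. %6 clamps the M extent
  // for the ragged final tile, since 968 is not a multiple of 64.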
%7 = scf.for %arg0 = %c0 to %c20 step %c1 iter_args(%arg1 = %cst_1) -> (vector<1x2x4x1x1x1x1x1x4xf16>) {
%31 = affine.delinearize_index %arg0 into (%c20) : index
%32 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y]
%33 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%31]
%subview_4 = memref.subview %1[%workgroup_id_z, %32, %33] [1, %6, 64] [1, 1, 1] : memref<64x968x1280xf16, #hal.descriptor_type<storage_buffer>> to memref<1x?x64xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%34:2 = affine.delinearize_index %0 into (%c32, %c8) : index, index
%35 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%34#1]
%36 = vector.transfer_read %subview_4[%c0, %34#0, %35], %cst_2 {in_bounds = [true, false, true]} : memref<1x?x64xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<1x1x8xf16>
%37 = affine.apply affine_map<()[s0] -> (s0 + 32)>()[%34#0]
%38 = vector.transfer_read %subview_4[%c0, %37, %35], %cst_2 {in_bounds = [true, false, true]} : memref<1x?x64xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<1x1x8xf16>
%39:2 = affine.delinearize_index %0 into (%c16, %c16) : index, index
%40 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%39#1]
%41 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 64)>()[%39#0, %31]
%42 = affine.apply affine_map<()[s0, s1] -> (s0 * 128 + s1 * 8)>()[%workgroup_id_x, %39#1]
%43 = vector.transfer_read %2[%workgroup_id_z, %41, %42], %cst_2 {in_bounds = [true, true, true]} : memref<64x1280x1280xf16, #hal.descriptor_type<storage_buffer>>, vector<1x1x8xf16>
%44 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 64 + 16)>()[%39#0, %31]
%45 = affine.apply affine_map<()[s0, s1] -> (s0 * 128 + s1 * 8)>()[%workgroup_id_x, %39#1]
%46 = vector.transfer_read %2[%workgroup_id_z, %44, %45], %cst_2 {in_bounds = [true, true, true]} : memref<64x1280x1280xf16, #hal.descriptor_type<storage_buffer>>, vector<1x1x8xf16>
%47 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 64 + 32)>()[%39#0, %31]
%48 = affine.apply affine_map<()[s0, s1] -> (s0 * 128 + s1 * 8)>()[%workgroup_id_x, %39#1]
%49 = vector.transfer_read %2[%workgroup_id_z, %47, %48], %cst_2 {in_bounds = [true, true, true]} : memref<64x1280x1280xf16, #hal.descriptor_type<storage_buffer>>, vector<1x1x8xf16>
%50 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 64 + 48)>()[%39#0, %31]
%51 = affine.apply affine_map<()[s0, s1] -> (s0 * 128 + s1 * 8)>()[%workgroup_id_x, %39#1]
%52 = vector.transfer_read %2[%workgroup_id_z, %50, %51], %cst_2 {in_bounds = [true, true, true]} : memref<64x1280x1280xf16, #hal.descriptor_type<storage_buffer>>, vector<1x1x8xf16>
gpu.barrier
vector.transfer_write %36, %alloc_3[%c0, %34#0, %35] {in_bounds = [true, true, true]} : vector<1x1x8xf16>, memref<1x64x64xf16, #gpu.address_space<workgroup>>
vector.transfer_write %38, %alloc_3[%c0, %37, %35] {in_bounds = [true, true, true]} : vector<1x1x8xf16>, memref<1x64x64xf16, #gpu.address_space<workgroup>>
gpu.barrier
vector.transfer_write %43, %alloc[%c0, %39#0, %40] {in_bounds = [true, true, true]} : vector<1x1x8xf16>, memref<1x64x128xf16, #gpu.address_space<workgroup>>
%53 = affine.apply affine_map<()[s0] -> (s0 + 16)>()[%39#0]
vector.transfer_write %46, %alloc[%c0, %53, %40] {in_bounds = [true, true, true]} : vector<1x1x8xf16>, memref<1x64x128xf16, #gpu.address_space<workgroup>>
%54 = affine.apply affine_map<()[s0] -> (s0 + 32)>()[%39#0]
vector.transfer_write %49, %alloc[%c0, %54, %40] {in_bounds = [true, true, true]} : vector<1x1x8xf16>, memref<1x64x128xf16, #gpu.address_space<workgroup>>
%55 = affine.apply affine_map<()[s0] -> (s0 + 48)>()[%39#0]
vector.transfer_write %52, %alloc[%c0, %55, %40] {in_bounds = [true, true, true]} : vector<1x1x8xf16>, memref<1x64x128xf16, #gpu.address_space<workgroup>>
gpu.barrier
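    // Note: staging for this k-step copies the 64x64 LHS tile into %alloc_3
    // and the 64x128 RHS tile into %alloc, with each thread moving
    // vector<1x1x8xf16> rows; the gpu.barriers fence the previous step's LDS
    // reads from the writes above and the writes from the reads below.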
%56:6 = affine.delinearize_index %0 into (%c2, %c2, %c1, %c1, %c4, %c16) : index, index, index, index, index, index
%57 = affine.apply affine_map<()[s0, s1] -> (s0 * 32 + s1)>()[%56#0, %56#5]
%58 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%56#4]
%59 = vector.transfer_read %alloc_3[%c0, %57, %58], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x64xf16, #gpu.address_space<workgroup>>, vector<1x1x4xf16>
%60 = vector.insert_strided_slice %59, %cst_1 {offsets = [0, 0, 0, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x2x4x1x1x1x1x1x4xf16>
%61 = affine.apply affine_map<()[s0] -> (s0 * 4 + 16)>()[%56#4]
%62 = vector.transfer_read %alloc_3[%c0, %57, %61], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x64xf16, #gpu.address_space<workgroup>>, vector<1x1x4xf16>
%63 = vector.insert_strided_slice %62, %60 {offsets = [0, 0, 1, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x2x4x1x1x1x1x1x4xf16>
%64 = affine.apply affine_map<()[s0] -> (s0 * 4 + 32)>()[%56#4]
%65 = vector.transfer_read %alloc_3[%c0, %57, %64], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x64xf16, #gpu.address_space<workgroup>>, vector<1x1x4xf16>
%66 = vector.insert_strided_slice %65, %63 {offsets = [0, 0, 2, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x2x4x1x1x1x1x1x4xf16>
%67 = affine.apply affine_map<()[s0] -> (s0 * 4 + 48)>()[%56#4]
%68 = vector.transfer_read %alloc_3[%c0, %57, %67], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x64xf16, #gpu.address_space<workgroup>>, vector<1x1x4xf16>
%69 = vector.insert_strided_slice %68, %66 {offsets = [0, 0, 3, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x2x4x1x1x1x1x1x4xf16>
%70 = affine.apply affine_map<()[s0, s1] -> (s0 * 32 + s1 + 16)>()[%56#0, %56#5]
%71 = vector.transfer_read %alloc_3[%c0, %70, %58], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x64xf16, #gpu.address_space<workgroup>>, vector<1x1x4xf16>
%72 = vector.insert_strided_slice %71, %69 {offsets = [0, 1, 0, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x2x4x1x1x1x1x1x4xf16>
%73 = vector.transfer_read %alloc_3[%c0, %70, %61], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x64xf16, #gpu.address_space<workgroup>>, vector<1x1x4xf16>
%74 = vector.insert_strided_slice %73, %72 {offsets = [0, 1, 1, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x2x4x1x1x1x1x1x4xf16>
%75 = vector.transfer_read %alloc_3[%c0, %70, %64], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x64xf16, #gpu.address_space<workgroup>>, vector<1x1x4xf16>
%76 = vector.insert_strided_slice %75, %74 {offsets = [0, 1, 2, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x2x4x1x1x1x1x1x4xf16>
%77 = vector.transfer_read %alloc_3[%c0, %70, %67], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x64xf16, #gpu.address_space<workgroup>>, vector<1x1x4xf16>
%78 = vector.insert_strided_slice %77, %76 {offsets = [0, 1, 3, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x2x4x1x1x1x1x1x4xf16>
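    // Note: the LHS fragments are eight 1x1x4 k-contiguous reads from %alloc_3
    // (2 m-batches x 4 k-batches) packed into vector<1x2x4x1x1x1x1x1x4xf16>;
    // unlike the RHS reads that follow, no transpose is needed because the A
    // tile is stored batch x M x K with K innermost.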
%79 = affine.apply affine_map<()[s0, s1] -> (s0 * 64 + s1)>()[%56#1, %56#5]
%80 = vector.transfer_read %alloc[%c0, %58, %79], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x128xf16, #gpu.address_space<workgroup>>, vector<1x4x1xf16>
%81 = vector.transpose %80, [0, 2, 1] : vector<1x4x1xf16> to vector<1x1x4xf16>
%82 = vector.insert_strided_slice %81, %cst_0 {offsets = [0, 0, 0, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x4x4x1x1x1x1x1x4xf16>
%83 = affine.apply affine_map<()[s0, s1] -> (s0 * 64 + s1 + 16)>()[%56#1, %56#5]
%84 = vector.transfer_read %alloc[%c0, %58, %83], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x128xf16, #gpu.address_space<workgroup>>, vector<1x4x1xf16>
%85 = vector.transpose %84, [0, 2, 1] : vector<1x4x1xf16> to vector<1x1x4xf16>
%86 = vector.insert_strided_slice %85, %82 {offsets = [0, 0, 1, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x4x4x1x1x1x1x1x4xf16>
%87 = affine.apply affine_map<()[s0, s1] -> (s0 * 64 + s1 + 32)>()[%56#1, %56#5]
%88 = vector.transfer_read %alloc[%c0, %58, %87], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x128xf16, #gpu.address_space<workgroup>>, vector<1x4x1xf16>
%89 = vector.transpose %88, [0, 2, 1] : vector<1x4x1xf16> to vector<1x1x4xf16>
%90 = vector.insert_strided_slice %89, %86 {offsets = [0, 0, 2, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x4x4x1x1x1x1x1x4xf16>
%91 = affine.apply affine_map<()[s0, s1] -> (s0 * 64 + s1 + 48)>()[%56#1, %56#5]
%92 = vector.transfer_read %alloc[%c0, %58, %91], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x128xf16, #gpu.address_space<workgroup>>, vector<1x4x1xf16>
%93 = vector.transpose %92, [0, 2, 1] : vector<1x4x1xf16> to vector<1x1x4xf16>
%94 = vector.insert_strided_slice %93, %90 {offsets = [0, 0, 3, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x4x4x1x1x1x1x1x4xf16>
%95 = vector.transfer_read %alloc[%c0, %61, %79], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x128xf16, #gpu.address_space<workgroup>>, vector<1x4x1xf16>
%96 = vector.transpose %95, [0, 2, 1] : vector<1x4x1xf16> to vector<1x1x4xf16>
%97 = vector.insert_strided_slice %96, %94 {offsets = [0, 1, 0, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x4x4x1x1x1x1x1x4xf16>
    %98 = vector.transfer_read %alloc[%c0, %61, %83], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x128xf16, #gpu.address_space<workgroup>>, vector<1x4x1xf16>
// ... (remainder of the dump truncated in this gist; the line above is completed from the identical read sequence in the preceding dump)