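// A dump like this can be reproduced by running iree-compile with MLIR's
// per-pass IR printing enabled. A plausible invocation (input path and output
// name hypothetical; flags as of IREE circa March 2024; the dumps go to
// stderr):
//
//   iree-compile batch_matmul.mlir \
//     --iree-hal-target-backends=rocm \
//     --iree-rocm-target-chip=gfx942 \
//     --mlir-print-ir-after-all \
//     -o batch_matmul.vmfb 2> log2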
// -----// IR Dump After AssignTargetDevicesPass (iree-hal-assign-target-devices) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mfma_layout<F16_16x16x16_F32>, #iree_gpu.mfma_layout<F16_32x32x8_F32>], target_arch = "gfx942", ukernels = "none"}>
#device_target_rocm = #hal.device.target<"rocm", {legacy_sync}, [#executable_target_rocm_hsaco_fb]>
module attributes {hal.device.targets = [#device_target_rocm]} {
func.func @batch_matmul(%arg0: tensor<64x968x1280xf16>, %arg1: tensor<64x1280x1280xf16>) -> tensor<64x968x1280xf16> {
%cst = arith.constant 0.000000e+00 : f16
%0 = tensor.empty() : tensor<64x968x1280xf16>
%1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%2 = linalg.batch_matmul ins(%arg0, %arg1 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%1 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
return %2 : tensor<64x968x1280xf16>
}
}
// -----// IR Dump After AutoInputConversionPipeline (iree-auto-input-conversion) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mfma_layout<F16_16x16x16_F32>, #iree_gpu.mfma_layout<F16_32x32x8_F32>], target_arch = "gfx942", ukernels = "none"}>
#device_target_rocm = #hal.device.target<"rocm", {legacy_sync}, [#executable_target_rocm_hsaco_fb]>
module attributes {hal.device.targets = [#device_target_rocm]} {
func.func @batch_matmul(%arg0: tensor<64x968x1280xf16>, %arg1: tensor<64x1280x1280xf16>) -> tensor<64x968x1280xf16> {
%cst = arith.constant 0.000000e+00 : f16
%0 = tensor.empty() : tensor<64x968x1280xf16>
%1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%2 = linalg.batch_matmul ins(%arg0, %arg1 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%1 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
return %2 : tensor<64x968x1280xf16>
}
}
// -----// IR Dump After IREEImportPublic (iree-import-public) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mfma_layout<F16_16x16x16_F32>, #iree_gpu.mfma_layout<F16_32x32x8_F32>], target_arch = "gfx942", ukernels = "none"}>
#device_target_rocm = #hal.device.target<"rocm", {legacy_sync}, [#executable_target_rocm_hsaco_fb]>
module attributes {hal.device.targets = [#device_target_rocm]} {
util.func public @batch_matmul(%arg0: tensor<64x968x1280xf16>, %arg1: tensor<64x1280x1280xf16>) -> tensor<64x968x1280xf16> {
%cst = arith.constant 0.000000e+00 : f16
%0 = tensor.empty() : tensor<64x968x1280xf16>
%1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%2 = linalg.batch_matmul ins(%arg0, %arg1 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%1 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
util.return %2 : tensor<64x968x1280xf16>
}
}
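// Note: iree-import-public only rewrites the entry point into IREE's util
// dialect (func.func -> util.func, return -> util.return); the linalg body is
// unchanged from the previous dump.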
// -----// IR Dump After ImportMLProgram (iree-import-ml-program) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mfma_layout<F16_16x16x16_F32>, #iree_gpu.mfma_layout<F16_32x32x8_F32>], target_arch = "gfx942", ukernels = "none"}>
#device_target_rocm = #hal.device.target<"rocm", {legacy_sync}, [#executable_target_rocm_hsaco_fb]>
module attributes {hal.device.targets = [#device_target_rocm]} {
util.func public @batch_matmul(%arg0: tensor<64x968x1280xf16>, %arg1: tensor<64x1280x1280xf16>) -> tensor<64x968x1280xf16> {
%cst = arith.constant 0.000000e+00 : f16
%0 = tensor.empty() : tensor<64x968x1280xf16>
%1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%2 = linalg.batch_matmul ins(%arg0, %arg1 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%1 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
util.return %2 : tensor<64x968x1280xf16>
}
}
// -----// IR Dump After SanitizeModuleNames (iree-sanitize-module-names) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mfma_layout<F16_16x16x16_F32>, #iree_gpu.mfma_layout<F16_32x32x8_F32>], target_arch = "gfx942", ukernels = "none"}>
#device_target_rocm = #hal.device.target<"rocm", {legacy_sync}, [#executable_target_rocm_hsaco_fb]>
module attributes {hal.device.targets = [#device_target_rocm]} {
util.func public @batch_matmul(%arg0: tensor<64x968x1280xf16>, %arg1: tensor<64x1280x1280xf16>) -> tensor<64x968x1280xf16> {
%cst = arith.constant 0.000000e+00 : f16
%0 = tensor.empty() : tensor<64x968x1280xf16>
%1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%2 = linalg.batch_matmul ins(%arg0, %arg1 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%1 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
util.return %2 : tensor<64x968x1280xf16>
}
}
// -----// IR Dump After ConvertMeshToFlow (iree-convert-mesh-to-flow) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mfma_layout<F16_16x16x16_F32>, #iree_gpu.mfma_layout<F16_32x32x8_F32>], target_arch = "gfx942", ukernels = "none"}>
#device_target_rocm = #hal.device.target<"rocm", {legacy_sync}, [#executable_target_rocm_hsaco_fb]>
module attributes {hal.device.targets = [#device_target_rocm]} {
util.func public @batch_matmul(%arg0: tensor<64x968x1280xf16>, %arg1: tensor<64x1280x1280xf16>) -> tensor<64x968x1280xf16> {
%cst = arith.constant 0.000000e+00 : f16
%0 = tensor.empty() : tensor<64x968x1280xf16>
%1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%2 = linalg.batch_matmul ins(%arg0, %arg1 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%1 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
util.return %2 : tensor<64x968x1280xf16>
}
}
// -----// IR Dump After mlir::iree_compiler::IREE::ABI::ConvertStreamableOpsPass (iree-abi-convert-streamable-ops) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mfma_layout<F16_16x16x16_F32>, #iree_gpu.mfma_layout<F16_32x32x8_F32>], target_arch = "gfx942", ukernels = "none"}>
#device_target_rocm = #hal.device.target<"rocm", {legacy_sync}, [#executable_target_rocm_hsaco_fb]>
module attributes {hal.device.targets = [#device_target_rocm]} {
util.func public @batch_matmul(%arg0: tensor<64x968x1280xf16>, %arg1: tensor<64x1280x1280xf16>) -> tensor<64x968x1280xf16> {
%cst = arith.constant 0.000000e+00 : f16
%0 = tensor.empty() : tensor<64x968x1280xf16>
%1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%2 = linalg.batch_matmul ins(%arg0, %arg1 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%1 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
util.return %2 : tensor<64x968x1280xf16>
}
}
// -----// IR Dump After mlir::iree_compiler::IREE::ABI::WrapEntryPointsPass (iree-abi-wrap-entry-points) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mfma_layout<F16_16x16x16_F32>, #iree_gpu.mfma_layout<F16_32x32x8_F32>], target_arch = "gfx942", ukernels = "none"}>
#device_target_rocm = #hal.device.target<"rocm", {legacy_sync}, [#executable_target_rocm_hsaco_fb]>
module attributes {hal.device.targets = [#device_target_rocm]} {
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<64x968x1280xf16>
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<64x1280x1280xf16>
%2 = util.call @_batch_matmul(%0, %1) : (tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) -> tensor<64x968x1280xf16>
%3 = hal.tensor.export %2 "output0" : tensor<64x968x1280xf16> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
util.func private @_batch_matmul(%arg0: tensor<64x968x1280xf16>, %arg1: tensor<64x1280x1280xf16>) -> tensor<64x968x1280xf16> {
%cst = arith.constant 0.000000e+00 : f16
%0 = tensor.empty() : tensor<64x968x1280xf16>
%1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%2 = linalg.batch_matmul ins(%arg0, %arg1 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%1 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
util.return %2 : tensor<64x968x1280xf16>
}
}
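// Note: iree-abi-wrap-entry-points split the entry point in two: a public
// @batch_matmul that imports/exports !hal.buffer_view at the ABI boundary,
// and a private @_batch_matmul holding the original tensor-level body. The
// canonicalizer and inliner dumps that follow fold the private function back
// into the wrapper.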
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
util.func private @_batch_matmul(%arg0: tensor<64x968x1280xf16>, %arg1: tensor<64x1280x1280xf16>) -> tensor<64x968x1280xf16> {
%cst = arith.constant 0.000000e+00 : f16
%0 = tensor.empty() : tensor<64x968x1280xf16>
%1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%2 = linalg.batch_matmul ins(%arg0, %arg1 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%1 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
util.return %2 : tensor<64x968x1280xf16>
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<64x968x1280xf16>
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<64x1280x1280xf16>
%2 = util.call @_batch_matmul(%0, %1) : (tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) -> tensor<64x968x1280xf16>
%3 = hal.tensor.export %2 "output0" : tensor<64x968x1280xf16> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%cst = arith.constant 0.000000e+00 : f16
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<64x968x1280xf16>
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<64x1280x1280xf16>
%2 = tensor.empty() : tensor<64x968x1280xf16>
%3 = linalg.fill ins(%cst : f16) outs(%2 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%4 = linalg.batch_matmul ins(%0, %1 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%3 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%5 = hal.tensor.export %4 "output0" : tensor<64x968x1280xf16> -> !hal.buffer_view
util.return %5 : !hal.buffer_view
}
// -----// IR Dump After Inliner (inline) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mfma_layout<F16_16x16x16_F32>, #iree_gpu.mfma_layout<F16_32x32x8_F32>], target_arch = "gfx942", ukernels = "none"}>
#device_target_rocm = #hal.device.target<"rocm", {legacy_sync}, [#executable_target_rocm_hsaco_fb]>
module attributes {hal.device.targets = [#device_target_rocm]} {
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%cst = arith.constant 0.000000e+00 : f16
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<64x968x1280xf16>
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<64x1280x1280xf16>
%2 = tensor.empty() : tensor<64x968x1280xf16>
%3 = linalg.fill ins(%cst : f16) outs(%2 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%4 = linalg.batch_matmul ins(%0, %1 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%3 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%5 = hal.tensor.export %4 "output0" : tensor<64x968x1280xf16> -> !hal.buffer_view
util.return %5 : !hal.buffer_view
}
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%cst = arith.constant 0.000000e+00 : f16
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<64x968x1280xf16>
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<64x1280x1280xf16>
%2 = tensor.empty() : tensor<64x968x1280xf16>
%3 = linalg.fill ins(%cst : f16) outs(%2 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%4 = linalg.batch_matmul ins(%0, %1 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%3 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%5 = hal.tensor.export %4 "output0" : tensor<64x968x1280xf16> -> !hal.buffer_view
util.return %5 : !hal.buffer_view
}
// -----// IR Dump After CSE (cse) //----- //
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%cst = arith.constant 0.000000e+00 : f16
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<64x968x1280xf16>
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<64x1280x1280xf16>
%2 = tensor.empty() : tensor<64x968x1280xf16>
%3 = linalg.fill ins(%cst : f16) outs(%2 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%4 = linalg.batch_matmul ins(%0, %1 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%3 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%5 = hal.tensor.export %4 "output0" : tensor<64x968x1280xf16> -> !hal.buffer_view
util.return %5 : !hal.buffer_view
}
// -----// IR Dump After SymbolDCE (symbol-dce) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mfma_layout<F16_16x16x16_F32>, #iree_gpu.mfma_layout<F16_32x32x8_F32>], target_arch = "gfx942", ukernels = "none"}>
#device_target_rocm = #hal.device.target<"rocm", {legacy_sync}, [#executable_target_rocm_hsaco_fb]>
module attributes {hal.device.targets = [#device_target_rocm]} {
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%cst = arith.constant 0.000000e+00 : f16
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<64x968x1280xf16>
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<64x1280x1280xf16>
%2 = tensor.empty() : tensor<64x968x1280xf16>
%3 = linalg.fill ins(%cst : f16) outs(%2 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%4 = linalg.batch_matmul ins(%0, %1 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%3 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%5 = hal.tensor.export %4 "output0" : tensor<64x968x1280xf16> -> !hal.buffer_view
util.return %5 : !hal.buffer_view
}
}
// -----// IR Dump After DemoteF64ToF32 (iree-util-demote-f64-to-f32) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mfma_layout<F16_16x16x16_F32>, #iree_gpu.mfma_layout<F16_32x32x8_F32>], target_arch = "gfx942", ukernels = "none"}>
#device_target_rocm = #hal.device.target<"rocm", {legacy_sync}, [#executable_target_rocm_hsaco_fb]>
module attributes {hal.device.targets = [#device_target_rocm]} {
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%cst = arith.constant 0.000000e+00 : f16
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<64x968x1280xf16>
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<64x1280x1280xf16>
%2 = tensor.empty() : tensor<64x968x1280xf16>
%3 = linalg.fill ins(%cst : f16) outs(%2 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%4 = linalg.batch_matmul ins(%0, %1 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%3 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%5 = hal.tensor.export %4 "output0" : tensor<64x968x1280xf16> -> !hal.buffer_view
util.return %5 : !hal.buffer_view
}
}
// -----// IR Dump After RemoveZeroExtentTensors (iree-global-opt-remove-zero-extent-tensors) //----- //
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%cst = arith.constant 0.000000e+00 : f16
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<64x968x1280xf16>
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<64x1280x1280xf16>
%2 = tensor.empty() : tensor<64x968x1280xf16>
%3 = linalg.fill ins(%cst : f16) outs(%2 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%4 = linalg.batch_matmul ins(%0, %1 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%3 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%5 = hal.tensor.export %4 "output0" : tensor<64x968x1280xf16> -> !hal.buffer_view
util.return %5 : !hal.buffer_view
}
// -----// IR Dump After DetachElementwiseFromNamedOps (iree-global-opt-detach-elementwise-from-named-ops) //----- //
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%cst = arith.constant 0.000000e+00 : f16
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<64x968x1280xf16>
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<64x1280x1280xf16>
%2 = tensor.empty() : tensor<64x968x1280xf16>
%3 = linalg.fill ins(%cst : f16) outs(%2 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%4 = linalg.batch_matmul ins(%0, %1 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%3 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%5 = hal.tensor.export %4 "output0" : tensor<64x968x1280xf16> -> !hal.buffer_view
util.return %5 : !hal.buffer_view
}
// -----// IR Dump After LinalgNamedOpConversionPass (linalg-named-op-conversion) //----- //
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%cst = arith.constant 0.000000e+00 : f16
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<64x968x1280xf16>
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<64x1280x1280xf16>
%2 = tensor.empty() : tensor<64x968x1280xf16>
%3 = linalg.fill ins(%cst : f16) outs(%2 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%4 = linalg.batch_matmul ins(%0, %1 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%3 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%5 = hal.tensor.export %4 "output0" : tensor<64x968x1280xf16> -> !hal.buffer_view
util.return %5 : !hal.buffer_view
}
// -----// IR Dump After Convert1X1FilterConv2DToMatmul (iree-global-opt-convert-1x1-filter-conv2d-to-matmul) //----- //
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%cst = arith.constant 0.000000e+00 : f16
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<64x968x1280xf16>
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<64x1280x1280xf16>
%2 = tensor.empty() : tensor<64x968x1280xf16>
%3 = linalg.fill ins(%cst : f16) outs(%2 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%4 = linalg.batch_matmul ins(%0, %1 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%3 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%5 = hal.tensor.export %4 "output0" : tensor<64x968x1280xf16> -> !hal.buffer_view
util.return %5 : !hal.buffer_view
}
// -----// IR Dump After EraseUnusedLinalgOperands (iree-global-opt-erase-unused-linalg-operands) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mfma_layout<F16_16x16x16_F32>, #iree_gpu.mfma_layout<F16_32x32x8_F32>], target_arch = "gfx942", ukernels = "none"}>
#device_target_rocm = #hal.device.target<"rocm", {legacy_sync}, [#executable_target_rocm_hsaco_fb]>
module attributes {hal.device.targets = [#device_target_rocm]} {
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%cst = arith.constant 0.000000e+00 : f16
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<64x968x1280xf16>
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<64x1280x1280xf16>
%2 = tensor.empty() : tensor<64x968x1280xf16>
%3 = linalg.fill ins(%cst : f16) outs(%2 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%4 = linalg.batch_matmul ins(%0, %1 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%3 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%5 = hal.tensor.export %4 "output0" : tensor<64x968x1280xf16> -> !hal.buffer_view
util.return %5 : !hal.buffer_view
}
}
// -----// IR Dump After ExpandTensorShapes (iree-global-opt-expand-tensor-shapes) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mfma_layout<F16_16x16x16_F32>, #iree_gpu.mfma_layout<F16_32x32x8_F32>], target_arch = "gfx942", ukernels = "none"}>
#device_target_rocm = #hal.device.target<"rocm", {legacy_sync}, [#executable_target_rocm_hsaco_fb]>
module attributes {hal.device.targets = [#device_target_rocm]} {
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%cst = arith.constant 0.000000e+00 : f16
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<64x968x1280xf16>
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<64x1280x1280xf16>
%2 = tensor.empty() : tensor<64x968x1280xf16>
%3 = linalg.fill ins(%cst : f16) outs(%2 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%4 = linalg.batch_matmul ins(%0, %1 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%3 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%5 = hal.tensor.export %4 "output0" : tensor<64x968x1280xf16> -> !hal.buffer_view
util.return %5 : !hal.buffer_view
}
}
// -----// IR Dump After ConvertElementwiseToLinalgPass (convert-elementwise-to-linalg) //----- //
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%cst = arith.constant 0.000000e+00 : f16
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<64x968x1280xf16>
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<64x1280x1280xf16>
%2 = tensor.empty() : tensor<64x968x1280xf16>
%3 = linalg.fill ins(%cst : f16) outs(%2 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%4 = linalg.batch_matmul ins(%0, %1 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%3 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%5 = hal.tensor.export %4 "output0" : tensor<64x968x1280xf16> -> !hal.buffer_view
util.return %5 : !hal.buffer_view
}
// -----// IR Dump After RaiseSpecialOps (iree-global-opt-raise-special-ops) //----- //
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%cst = arith.constant 0.000000e+00 : f16
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<64x968x1280xf16>
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<64x1280x1280xf16>
%2 = tensor.empty() : tensor<64x968x1280xf16>
%3 = linalg.fill ins(%cst : f16) outs(%2 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%4 = linalg.batch_matmul ins(%0, %1 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%3 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%5 = hal.tensor.export %4 "output0" : tensor<64x968x1280xf16> -> !hal.buffer_view
util.return %5 : !hal.buffer_view
}
// -----// IR Dump After DecomposeConcat (iree-global-opt-decompose-concat) //----- //
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%cst = arith.constant 0.000000e+00 : f16
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<64x968x1280xf16>
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<64x1280x1280xf16>
%2 = tensor.empty() : tensor<64x968x1280xf16>
%3 = linalg.fill ins(%cst : f16) outs(%2 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%4 = linalg.batch_matmul ins(%0, %1 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%3 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%5 = hal.tensor.export %4 "output0" : tensor<64x968x1280xf16> -> !hal.buffer_view
util.return %5 : !hal.buffer_view
}
// -----// IR Dump After GeneralizeLinalgNamedOps (iree-global-opt-generalize-linalg-named-ops) //----- //
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%cst = arith.constant 0.000000e+00 : f16
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<64x968x1280xf16>
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<64x1280x1280xf16>
%2 = tensor.empty() : tensor<64x968x1280xf16>
%3 = linalg.fill ins(%cst : f16) outs(%2 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%4 = linalg.batch_matmul ins(%0, %1 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%3 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%5 = hal.tensor.export %4 "output0" : tensor<64x968x1280xf16> -> !hal.buffer_view
util.return %5 : !hal.buffer_view
}
// -----// IR Dump After FoldUnitExtentDims (iree-flow-fold-unit-extent-dims) //----- //
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%cst = arith.constant 0.000000e+00 : f16
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<64x968x1280xf16>
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<64x1280x1280xf16>
%2 = tensor.empty() : tensor<64x968x1280xf16>
%3 = linalg.fill ins(%cst : f16) outs(%2 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%4 = linalg.batch_matmul ins(%0, %1 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%3 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%5 = hal.tensor.export %4 "output0" : tensor<64x968x1280xf16> -> !hal.buffer_view
util.return %5 : !hal.buffer_view
}
// -----// IR Dump After FuseDequantizationMatmul (iree-global-opt-fuse-dequantization-matmul) //----- //
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%cst = arith.constant 0.000000e+00 : f16
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<64x968x1280xf16>
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<64x1280x1280xf16>
%2 = tensor.empty() : tensor<64x968x1280xf16>
%3 = linalg.fill ins(%cst : f16) outs(%2 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%4 = linalg.batch_matmul ins(%0, %1 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%3 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%5 = hal.tensor.export %4 "output0" : tensor<64x968x1280xf16> -> !hal.buffer_view
util.return %5 : !hal.buffer_view
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%cst = arith.constant 0.000000e+00 : f16
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<64x968x1280xf16>
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<64x1280x1280xf16>
%2 = tensor.empty() : tensor<64x968x1280xf16>
%3 = linalg.fill ins(%cst : f16) outs(%2 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%4 = linalg.batch_matmul ins(%0, %1 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%3 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%5 = hal.tensor.export %4 "output0" : tensor<64x968x1280xf16> -> !hal.buffer_view
util.return %5 : !hal.buffer_view
}
// -----// IR Dump After CSE (cse) //----- //
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%cst = arith.constant 0.000000e+00 : f16
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<64x968x1280xf16>
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<64x1280x1280xf16>
%2 = tensor.empty() : tensor<64x968x1280xf16>
%3 = linalg.fill ins(%cst : f16) outs(%2 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%4 = linalg.batch_matmul ins(%0, %1 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%3 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%5 = hal.tensor.export %4 "output0" : tensor<64x968x1280xf16> -> !hal.buffer_view
util.return %5 : !hal.buffer_view
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%cst = arith.constant 0.000000e+00 : f16
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<64x968x1280xf16>
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<64x1280x1280xf16>
%2 = tensor.empty() : tensor<64x968x1280xf16>
%3 = linalg.fill ins(%cst : f16) outs(%2 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%4 = linalg.batch_matmul ins(%0, %1 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%3 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%5 = hal.tensor.export %4 "output0" : tensor<64x968x1280xf16> -> !hal.buffer_view
util.return %5 : !hal.buffer_view
}
// -----// IR Dump After CSE (cse) //----- //
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%cst = arith.constant 0.000000e+00 : f16
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<64x968x1280xf16>
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<64x1280x1280xf16>
%2 = tensor.empty() : tensor<64x968x1280xf16>
%3 = linalg.fill ins(%cst : f16) outs(%2 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%4 = linalg.batch_matmul ins(%0, %1 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%3 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%5 = hal.tensor.export %4 "output0" : tensor<64x968x1280xf16> -> !hal.buffer_view
util.return %5 : !hal.buffer_view
}
// -----// IR Dump After FuseHorizontalContractions (iree-global-opt-fuse-horizontal-contractions) //----- //
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%cst = arith.constant 0.000000e+00 : f16
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<64x968x1280xf16>
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<64x1280x1280xf16>
%2 = tensor.empty() : tensor<64x968x1280xf16>
%3 = linalg.fill ins(%cst : f16) outs(%2 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%4 = linalg.batch_matmul ins(%0, %1 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%3 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%5 = hal.tensor.export %4 "output0" : tensor<64x968x1280xf16> -> !hal.buffer_view
util.return %5 : !hal.buffer_view
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%cst = arith.constant 0.000000e+00 : f16
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<64x968x1280xf16>
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<64x1280x1280xf16>
%2 = tensor.empty() : tensor<64x968x1280xf16>
%3 = linalg.fill ins(%cst : f16) outs(%2 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%4 = linalg.batch_matmul ins(%0, %1 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%3 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%5 = hal.tensor.export %4 "output0" : tensor<64x968x1280xf16> -> !hal.buffer_view
util.return %5 : !hal.buffer_view
}
// -----// IR Dump After CSE (cse) //----- //
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%cst = arith.constant 0.000000e+00 : f16
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<64x968x1280xf16>
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<64x1280x1280xf16>
%2 = tensor.empty() : tensor<64x968x1280xf16>
%3 = linalg.fill ins(%cst : f16) outs(%2 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%4 = linalg.batch_matmul ins(%0, %1 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%3 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%5 = hal.tensor.export %4 "output0" : tensor<64x968x1280xf16> -> !hal.buffer_view
util.return %5 : !hal.buffer_view
}
// -----// IR Dump After SetEncoding (iree-global-opt-set-encoding) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mfma_layout<F16_16x16x16_F32>, #iree_gpu.mfma_layout<F16_32x32x8_F32>], target_arch = "gfx942", ukernels = "none"}>
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)>
#map2 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>
#map3 = affine_map<()[s0, s1] -> (-s1 + (s1 ceildiv s0) * s0)>
#map4 = affine_map<()[s0] -> ((64 ceildiv s0) * s0)>
#map5 = affine_map<()[s0] -> ((968 ceildiv s0) * s0)>
#map6 = affine_map<()[s0] -> ((1280 ceildiv s0) * s0)>
#device_target_rocm = #hal.device.target<"rocm", {legacy_sync}, [#executable_target_rocm_hsaco_fb]>
module attributes {hal.device.targets = [#device_target_rocm]} {
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%c1280 = arith.constant 1280 : index
%c968 = arith.constant 968 : index
%c64 = arith.constant 64 : index
%cst = arith.constant 0.000000e+00 : f16
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<64x968x1280xf16>
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<64x1280x1280xf16>
%2:3 = iree_linalg_ext.upper_bound_tile_size tensor<64x968x1280xf16, #iree_linalg_ext.encoding<role = LHS, element_types = [f16, f16, f16], user_indexing_maps = [#map, #map1, #map2]>> -> index, index, index
%3 = affine.apply #map3()[%2#0, %c64]
%4 = affine.apply #map3()[%2#1, %c968]
%5 = affine.apply #map3()[%2#2, %c1280]
%padded = tensor.pad %0 low[0, 0, 0] high[%3, %4, %5] {
^bb0(%arg2: index, %arg3: index, %arg4: index):
tensor.yield %cst : f16
} : tensor<64x968x1280xf16> to tensor<?x?x?xf16>
%6 = iree_linalg_ext.set_encoding %padded : tensor<?x?x?xf16> -> tensor<?x?x?xf16, #iree_linalg_ext.encoding<role = LHS, element_types = [f16, f16, f16], original_type = tensor<64x968x1280xf16>, user_indexing_maps = [#map, #map1, #map2]>>
%7:3 = iree_linalg_ext.upper_bound_tile_size tensor<64x1280x1280xf16, #iree_linalg_ext.encoding<role = RHS, element_types = [f16, f16, f16], user_indexing_maps = [#map, #map1, #map2]>> -> index, index, index
%8 = affine.apply #map3()[%7#0, %c64]
%9 = affine.apply #map3()[%7#1, %c1280]
%10 = affine.apply #map3()[%7#2, %c1280]
%padded_0 = tensor.pad %1 low[0, 0, 0] high[%8, %9, %10] {
^bb0(%arg2: index, %arg3: index, %arg4: index):
tensor.yield %cst : f16
} : tensor<64x1280x1280xf16> to tensor<?x?x?xf16>
%11 = iree_linalg_ext.set_encoding %padded_0 : tensor<?x?x?xf16> -> tensor<?x?x?xf16, #iree_linalg_ext.encoding<role = RHS, element_types = [f16, f16, f16], original_type = tensor<64x1280x1280xf16>, user_indexing_maps = [#map, #map1, #map2]>>
%12:3 = iree_linalg_ext.upper_bound_tile_size tensor<64x968x1280xf16, #iree_linalg_ext.encoding<role = RESULT, element_types = [f16, f16, f16], user_indexing_maps = [#map, #map1, #map2]>> -> index, index, index
%13 = affine.apply #map4()[%12#0]
%14 = affine.apply #map5()[%12#1]
%15 = affine.apply #map6()[%12#2]
%16 = tensor.empty(%13, %14, %15) : tensor<?x?x?xf16, #iree_linalg_ext.encoding<role = RESULT, element_types = [f16, f16, f16], original_type = tensor<64x968x1280xf16>, user_indexing_maps = [#map, #map1, #map2]>>
%17 = linalg.fill ins(%cst : f16) outs(%16 : tensor<?x?x?xf16, #iree_linalg_ext.encoding<role = RESULT, element_types = [f16, f16, f16], original_type = tensor<64x968x1280xf16>, user_indexing_maps = [#map, #map1, #map2]>>) -> tensor<?x?x?xf16, #iree_linalg_ext.encoding<role = RESULT, element_types = [f16, f16, f16], original_type = tensor<64x968x1280xf16>, user_indexing_maps = [#map, #map1, #map2]>>
%18 = linalg.batch_matmul ins(%6, %11 : tensor<?x?x?xf16, #iree_linalg_ext.encoding<role = LHS, element_types = [f16, f16, f16], original_type = tensor<64x968x1280xf16>, user_indexing_maps = [#map, #map1, #map2]>>, tensor<?x?x?xf16, #iree_linalg_ext.encoding<role = RHS, element_types = [f16, f16, f16], original_type = tensor<64x1280x1280xf16>, user_indexing_maps = [#map, #map1, #map2]>>) outs(%17 : tensor<?x?x?xf16, #iree_linalg_ext.encoding<role = RESULT, element_types = [f16, f16, f16], original_type = tensor<64x968x1280xf16>, user_indexing_maps = [#map, #map1, #map2]>>) -> tensor<?x?x?xf16, #iree_linalg_ext.encoding<role = RESULT, element_types = [f16, f16, f16], original_type = tensor<64x968x1280xf16>, user_indexing_maps = [#map, #map1, #map2]>>
%19 = iree_linalg_ext.unset_encoding %18 : tensor<?x?x?xf16, #iree_linalg_ext.encoding<role = RESULT, element_types = [f16, f16, f16], original_type = tensor<64x968x1280xf16>, user_indexing_maps = [#map, #map1, #map2]>> -> tensor<?x?x?xf16>
%extracted_slice = tensor.extract_slice %19[0, 0, 0] [64, 968, 1280] [1, 1, 1] : tensor<?x?x?xf16> to tensor<64x968x1280xf16>
%20 = hal.tensor.export %extracted_slice "output0" : tensor<64x968x1280xf16> -> !hal.buffer_view
util.return %20 : !hal.buffer_view
}
}
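// Note: #map3()[s0, s1] above computes the high padding that rounds a dim of
// size s1 up to a multiple of a tile size s0, i.e. ceildiv(s1, s0) * s0 - s1.
// As a hand-worked check, assuming a hypothetical tile size s0 = 16 for the
// M dim s1 = 968: ceildiv(968, 16) = 61 and 61 * 16 = 976, so 8 rows of high
// padding. #map4..#map6 compute the corresponding rounded-up sizes for the
// static dims 64, 968, and 1280.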
// -----// IR Dump After MaterializeEncodingIntoNop (iree-codegen-materialize-encoding-into-nop) //----- //
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%c1280 = arith.constant 1280 : index
%c968 = arith.constant 968 : index
%c64 = arith.constant 64 : index
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f16
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<64x968x1280xf16>
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<64x1280x1280xf16>
%padded = tensor.pad %0 low[0, 0, 0] high[%c0, %c0, %c0] {
^bb0(%arg2: index, %arg3: index, %arg4: index):
tensor.yield %cst : f16
} : tensor<64x968x1280xf16> to tensor<?x?x?xf16>
%padded_0 = tensor.pad %1 low[0, 0, 0] high[%c0, %c0, %c0] {
^bb0(%arg2: index, %arg3: index, %arg4: index):
tensor.yield %cst : f16
} : tensor<64x1280x1280xf16> to tensor<?x?x?xf16>
%2 = tensor.empty(%c64, %c968, %c1280) : tensor<?x?x?xf16>
%3 = linalg.fill ins(%cst : f16) outs(%2 : tensor<?x?x?xf16>) -> tensor<?x?x?xf16>
%4 = linalg.batch_matmul ins(%padded, %padded_0 : tensor<?x?x?xf16>, tensor<?x?x?xf16>) outs(%3 : tensor<?x?x?xf16>) -> tensor<?x?x?xf16>
%extracted_slice = tensor.extract_slice %4[0, 0, 0] [64, 968, 1280] [1, 1, 1] : tensor<?x?x?xf16> to tensor<64x968x1280xf16>
%5 = hal.tensor.export %extracted_slice "output0" : tensor<64x968x1280xf16> -> !hal.buffer_view
util.return %5 : !hal.buffer_view
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%cst = arith.constant 0.000000e+00 : f16
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<64x968x1280xf16>
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<64x1280x1280xf16>
%2 = tensor.empty() : tensor<64x968x1280xf16>
%3 = linalg.fill ins(%cst : f16) outs(%2 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%4 = linalg.batch_matmul ins(%0, %1 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%3 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%5 = hal.tensor.export %4 "output0" : tensor<64x968x1280xf16> -> !hal.buffer_view
util.return %5 : !hal.buffer_view
}
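// Note: iree-codegen-materialize-encoding-into-nop discards the encodings:
// set_encoding/unset_encoding disappear and the upper_bound_tile_size queries
// resolve so that the tensor.pad ops carry all-zero high padding. The
// canonicalizer dump above then folds the zero-extent pads and the
// extract_slice away, restoring the original static 64x968x1280 batch_matmul.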
// -----// IR Dump After MaterializeHomogeneousEncodings (iree-global-opt-materialize-homogeneous-encodings) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mfma_layout<F16_16x16x16_F32>, #iree_gpu.mfma_layout<F16_32x32x8_F32>], target_arch = "gfx942", ukernels = "none"}>
#device_target_rocm = #hal.device.target<"rocm", {legacy_sync}, [#executable_target_rocm_hsaco_fb]>
module attributes {hal.device.targets = [#device_target_rocm]} {
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%cst = arith.constant 0.000000e+00 : f16
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<64x968x1280xf16>
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<64x1280x1280xf16>
%2 = tensor.empty() : tensor<64x968x1280xf16>
%3 = linalg.fill ins(%cst : f16) outs(%2 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%4 = linalg.batch_matmul ins(%0, %1 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%3 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%5 = hal.tensor.export %4 "output0" : tensor<64x968x1280xf16> -> !hal.buffer_view
util.return %5 : !hal.buffer_view
}
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mfma_layout<F16_16x16x16_F32>, #iree_gpu.mfma_layout<F16_32x32x8_F32>], target_arch = "gfx942", ukernels = "none"}>
#device_target_rocm = #hal.device.target<"rocm", {legacy_sync}, [#executable_target_rocm_hsaco_fb]>
module attributes {hal.device.targets = [#device_target_rocm]} {
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%cst = arith.constant 0.000000e+00 : f16
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<64x968x1280xf16>
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<64x1280x1280xf16>
%2 = tensor.empty() : tensor<64x968x1280xf16>
%3 = linalg.fill ins(%cst : f16) outs(%2 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%4 = linalg.batch_matmul ins(%0, %1 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%3 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%5 = hal.tensor.export %4 "output0" : tensor<64x968x1280xf16> -> !hal.buffer_view
util.return %5 : !hal.buffer_view
}
}
// -----// IR Dump After CSE (cse) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mfma_layout<F16_16x16x16_F32>, #iree_gpu.mfma_layout<F16_32x32x8_F32>], target_arch = "gfx942", ukernels = "none"}>
#device_target_rocm = #hal.device.target<"rocm", {legacy_sync}, [#executable_target_rocm_hsaco_fb]>
module attributes {hal.device.targets = [#device_target_rocm]} {
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%cst = arith.constant 0.000000e+00 : f16
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<64x968x1280xf16>
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<64x1280x1280xf16>
%2 = tensor.empty() : tensor<64x968x1280xf16>
%3 = linalg.fill ins(%cst : f16) outs(%2 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%4 = linalg.batch_matmul ins(%0, %1 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%3 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%5 = hal.tensor.export %4 "output0" : tensor<64x968x1280xf16> -> !hal.buffer_view
util.return %5 : !hal.buffer_view
}
}
// -----// IR Dump After SimplifyPackUnpack (iree-global-opt-simplify-pack-unpack) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mfma_layout<F16_16x16x16_F32>, #iree_gpu.mfma_layout<F16_32x32x8_F32>], target_arch = "gfx942", ukernels = "none"}>
#device_target_rocm = #hal.device.target<"rocm", {legacy_sync}, [#executable_target_rocm_hsaco_fb]>
module attributes {hal.device.targets = [#device_target_rocm]} {
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%cst = arith.constant 0.000000e+00 : f16
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<64x968x1280xf16>
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<64x1280x1280xf16>
%2 = tensor.empty() : tensor<64x968x1280xf16>
%3 = linalg.fill ins(%cst : f16) outs(%2 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%4 = linalg.batch_matmul ins(%0, %1 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%3 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%5 = hal.tensor.export %4 "output0" : tensor<64x968x1280xf16> -> !hal.buffer_view
util.return %5 : !hal.buffer_view
}
}
// -----// IR Dump After GeneralizeLinalgNamedOps (iree-global-opt-generalize-linalg-named-ops) //----- //
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%cst = arith.constant 0.000000e+00 : f16
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<64x968x1280xf16>
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<64x1280x1280xf16>
%2 = tensor.empty() : tensor<64x968x1280xf16>
%3 = linalg.fill ins(%cst : f16) outs(%2 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%4 = linalg.batch_matmul ins(%0, %1 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%3 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%5 = hal.tensor.export %4 "output0" : tensor<64x968x1280xf16> -> !hal.buffer_view
util.return %5 : !hal.buffer_view
}
// -----// IR Dump After GlobalLoopInvariantCodeMotion (iree-global-opt-loop-invariant-code-motion) //----- //
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%cst = arith.constant 0.000000e+00 : f16
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<64x968x1280xf16>
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<64x1280x1280xf16>
%2 = tensor.empty() : tensor<64x968x1280xf16>
%3 = linalg.fill ins(%cst : f16) outs(%2 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%4 = linalg.batch_matmul ins(%0, %1 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%3 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%5 = hal.tensor.export %4 "output0" : tensor<64x968x1280xf16> -> !hal.buffer_view
util.return %5 : !hal.buffer_view
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%cst = arith.constant 0.000000e+00 : f16
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<64x968x1280xf16>
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<64x1280x1280xf16>
%2 = tensor.empty() : tensor<64x968x1280xf16>
%3 = linalg.fill ins(%cst : f16) outs(%2 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%4 = linalg.batch_matmul ins(%0, %1 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%3 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%5 = hal.tensor.export %4 "output0" : tensor<64x968x1280xf16> -> !hal.buffer_view
util.return %5 : !hal.buffer_view
}
// -----// IR Dump After CSE (cse) //----- //
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%cst = arith.constant 0.000000e+00 : f16
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<64x968x1280xf16>
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<64x1280x1280xf16>
%2 = tensor.empty() : tensor<64x968x1280xf16>
%3 = linalg.fill ins(%cst : f16) outs(%2 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%4 = linalg.batch_matmul ins(%0, %1 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%3 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%5 = hal.tensor.export %4 "output0" : tensor<64x968x1280xf16> -> !hal.buffer_view
util.return %5 : !hal.buffer_view
}
// -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- //
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%cst = arith.constant 0.000000e+00 : f16
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<64x968x1280xf16>
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<64x1280x1280xf16>
%2 = tensor.empty() : tensor<64x968x1280xf16>
%3 = linalg.fill ins(%cst : f16) outs(%2 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%4 = linalg.batch_matmul ins(%0, %1 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%3 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%5 = hal.tensor.export %4 "output0" : tensor<64x968x1280xf16> -> !hal.buffer_view
util.return %5 : !hal.buffer_view
}
// -----// IR Dump After ApplyPatterns (iree-util-apply-patterns) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mfma_layout<F16_16x16x16_F32>, #iree_gpu.mfma_layout<F16_32x32x8_F32>], target_arch = "gfx942", ukernels = "none"}>
#device_target_rocm = #hal.device.target<"rocm", {legacy_sync}, [#executable_target_rocm_hsaco_fb]>
module attributes {hal.device.targets = [#device_target_rocm], iree.fixedpoint.iteration = 0 : index} {
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%cst = arith.constant 0.000000e+00 : f16
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<64x968x1280xf16>
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<64x1280x1280xf16>
%2 = tensor.empty() : tensor<64x968x1280xf16>
%3 = linalg.fill ins(%cst : f16) outs(%2 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%4 = linalg.batch_matmul ins(%0, %1 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%3 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%5 = hal.tensor.export %4 "output0" : tensor<64x968x1280xf16> -> !hal.buffer_view
util.return %5 : !hal.buffer_view
}
}
// -----// IR Dump After FoldGlobals (iree-util-fold-globals) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mfma_layout<F16_16x16x16_F32>, #iree_gpu.mfma_layout<F16_32x32x8_F32>], target_arch = "gfx942", ukernels = "none"}>
#device_target_rocm = #hal.device.target<"rocm", {legacy_sync}, [#executable_target_rocm_hsaco_fb]>
module attributes {hal.device.targets = [#device_target_rocm], iree.fixedpoint.iteration = 0 : index} {
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%cst = arith.constant 0.000000e+00 : f16
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<64x968x1280xf16>
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<64x1280x1280xf16>
%2 = tensor.empty() : tensor<64x968x1280xf16>
%3 = linalg.fill ins(%cst : f16) outs(%2 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%4 = linalg.batch_matmul ins(%0, %1 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%3 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%5 = hal.tensor.export %4 "output0" : tensor<64x968x1280xf16> -> !hal.buffer_view
util.return %5 : !hal.buffer_view
}
}
// -----// IR Dump After IPO (iree-util-ipo) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mfma_layout<F16_16x16x16_F32>, #iree_gpu.mfma_layout<F16_32x32x8_F32>], target_arch = "gfx942", ukernels = "none"}>
#device_target_rocm = #hal.device.target<"rocm", {legacy_sync}, [#executable_target_rocm_hsaco_fb]>
module attributes {hal.device.targets = [#device_target_rocm], iree.fixedpoint.iteration = 0 : index} {
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%cst = arith.constant 0.000000e+00 : f16
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<64x968x1280xf16>
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<64x1280x1280xf16>
%2 = tensor.empty() : tensor<64x968x1280xf16>
%3 = linalg.fill ins(%cst : f16) outs(%2 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%4 = linalg.batch_matmul ins(%0, %1 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%3 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%5 = hal.tensor.export %4 "output0" : tensor<64x968x1280xf16> -> !hal.buffer_view
util.return %5 : !hal.buffer_view
}
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mfma_layout<F16_16x16x16_F32>, #iree_gpu.mfma_layout<F16_32x32x8_F32>], target_arch = "gfx942", ukernels = "none"}>
#device_target_rocm = #hal.device.target<"rocm", {legacy_sync}, [#executable_target_rocm_hsaco_fb]>
module attributes {hal.device.targets = [#device_target_rocm], iree.fixedpoint.iteration = 0 : index} {
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%cst = arith.constant 0.000000e+00 : f16
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<64x968x1280xf16>
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<64x1280x1280xf16>
%2 = tensor.empty() : tensor<64x968x1280xf16>
%3 = linalg.fill ins(%cst : f16) outs(%2 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%4 = linalg.batch_matmul ins(%0, %1 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%3 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%5 = hal.tensor.export %4 "output0" : tensor<64x968x1280xf16> -> !hal.buffer_view
util.return %5 : !hal.buffer_view
}
}
// -----// IR Dump After CSE (cse) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mfma_layout<F16_16x16x16_F32>, #iree_gpu.mfma_layout<F16_32x32x8_F32>], target_arch = "gfx942", ukernels = "none"}>
#device_target_rocm = #hal.device.target<"rocm", {legacy_sync}, [#executable_target_rocm_hsaco_fb]>
module attributes {hal.device.targets = [#device_target_rocm], iree.fixedpoint.iteration = 0 : index} {
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%cst = arith.constant 0.000000e+00 : f16
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<64x968x1280xf16>
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<64x1280x1280xf16>
%2 = tensor.empty() : tensor<64x968x1280xf16>
%3 = linalg.fill ins(%cst : f16) outs(%2 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%4 = linalg.batch_matmul ins(%0, %1 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%3 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%5 = hal.tensor.export %4 "output0" : tensor<64x968x1280xf16> -> !hal.buffer_view
util.return %5 : !hal.buffer_view
}
}
// -----// IR Dump After HoistIntoGlobals (iree-util-hoist-into-globals) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mfma_layout<F16_16x16x16_F32>, #iree_gpu.mfma_layout<F16_32x32x8_F32>], target_arch = "gfx942", ukernels = "none"}>
#device_target_rocm = #hal.device.target<"rocm", {legacy_sync}, [#executable_target_rocm_hsaco_fb]>
module attributes {hal.device.targets = [#device_target_rocm], iree.fixedpoint.iteration = 0 : index} {
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%cst = arith.constant 0.000000e+00 : f16
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<64x968x1280xf16>
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<64x1280x1280xf16>
%2 = tensor.empty() : tensor<64x968x1280xf16>
%3 = linalg.fill ins(%cst : f16) outs(%2 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%4 = linalg.batch_matmul ins(%0, %1 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%3 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%5 = hal.tensor.export %4 "output0" : tensor<64x968x1280xf16> -> !hal.buffer_view
util.return %5 : !hal.buffer_view
}
}
// -----// IR Dump After JitGlobals (iree-consteval-jit-globals) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mfma_layout<F16_16x16x16_F32>, #iree_gpu.mfma_layout<F16_32x32x8_F32>], target_arch = "gfx942", ukernels = "none"}>
#device_target_rocm = #hal.device.target<"rocm", {legacy_sync}, [#executable_target_rocm_hsaco_fb]>
module attributes {hal.device.targets = [#device_target_rocm], iree.fixedpoint.iteration = 0 : index} {
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%cst = arith.constant 0.000000e+00 : f16
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<64x968x1280xf16>
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<64x1280x1280xf16>
%2 = tensor.empty() : tensor<64x968x1280xf16>
%3 = linalg.fill ins(%cst : f16) outs(%2 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%4 = linalg.batch_matmul ins(%0, %1 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%3 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%5 = hal.tensor.export %4 "output0" : tensor<64x968x1280xf16> -> !hal.buffer_view
util.return %5 : !hal.buffer_view
}
}
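// Note: HoistIntoGlobals did not create any util.global ops above (there are no hoistable
// constant expressions in this module), so JitGlobals has nothing to evaluate at compile
// time and this dump is unchanged.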
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%cst = arith.constant 0.000000e+00 : f16
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<64x968x1280xf16>
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<64x1280x1280xf16>
%2 = tensor.empty() : tensor<64x968x1280xf16>
%3 = linalg.fill ins(%cst : f16) outs(%2 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%4 = linalg.batch_matmul ins(%0, %1 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%3 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%5 = hal.tensor.export %4 "output0" : tensor<64x968x1280xf16> -> !hal.buffer_view
util.return %5 : !hal.buffer_view
}
// -----// IR Dump After CSE (cse) //----- //
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%cst = arith.constant 0.000000e+00 : f16
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<64x968x1280xf16>
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<64x1280x1280xf16>
%2 = tensor.empty() : tensor<64x968x1280xf16>
%3 = linalg.fill ins(%cst : f16) outs(%2 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%4 = linalg.batch_matmul ins(%0, %1 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%3 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%5 = hal.tensor.export %4 "output0" : tensor<64x968x1280xf16> -> !hal.buffer_view
util.return %5 : !hal.buffer_view
}
// -----// IR Dump After FixedPointIterator (iree-util-fixed-point-iterator) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mfma_layout<F16_16x16x16_F32>, #iree_gpu.mfma_layout<F16_32x32x8_F32>], target_arch = "gfx942", ukernels = "none"}>
#device_target_rocm = #hal.device.target<"rocm", {legacy_sync}, [#executable_target_rocm_hsaco_fb]>
module attributes {hal.device.targets = [#device_target_rocm]} {
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%cst = arith.constant 0.000000e+00 : f16
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<64x968x1280xf16>
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<64x1280x1280xf16>
%2 = tensor.empty() : tensor<64x968x1280xf16>
%3 = linalg.fill ins(%cst : f16) outs(%2 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%4 = linalg.batch_matmul ins(%0, %1 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%3 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%5 = hal.tensor.export %4 "output0" : tensor<64x968x1280xf16> -> !hal.buffer_view
util.return %5 : !hal.buffer_view
}
}
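// Note: the iree.fixedpoint.iteration = 0 module attribute carried through the preceding
// dumps is gone here, which suggests the fixed-point driver converged after a single
// iteration with no further IR changes.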
// -----// IR Dump After RaiseSpecialOps (iree-global-opt-raise-special-ops) //----- //
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%cst = arith.constant 0.000000e+00 : f16
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<64x968x1280xf16>
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<64x1280x1280xf16>
%2 = tensor.empty() : tensor<64x968x1280xf16>
%3 = linalg.fill ins(%cst : f16) outs(%2 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%4 = linalg.batch_matmul ins(%0, %1 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%3 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%5 = hal.tensor.export %4 "output0" : tensor<64x968x1280xf16> -> !hal.buffer_view
util.return %5 : !hal.buffer_view
}
// -----// IR Dump After VerifyInputLegality (iree-verify-input-legality) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mfma_layout<F16_16x16x16_F32>, #iree_gpu.mfma_layout<F16_32x32x8_F32>], target_arch = "gfx942", ukernels = "none"}>
#device_target_rocm = #hal.device.target<"rocm", {legacy_sync}, [#executable_target_rocm_hsaco_fb]>
module attributes {hal.device.targets = [#device_target_rocm]} {
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%cst = arith.constant 0.000000e+00 : f16
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<64x968x1280xf16>
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<64x1280x1280xf16>
%2 = tensor.empty() : tensor<64x968x1280xf16>
%3 = linalg.fill ins(%cst : f16) outs(%2 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%4 = linalg.batch_matmul ins(%0, %1 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%3 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%5 = hal.tensor.export %4 "output0" : tensor<64x968x1280xf16> -> !hal.buffer_view
util.return %5 : !hal.buffer_view
}
}
// -----// IR Dump After InjectTensorTracing (iree-flow-inject-tensor-tracing) //----- //
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%cst = arith.constant 0.000000e+00 : f16
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<64x968x1280xf16>
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<64x1280x1280xf16>
%2 = tensor.empty() : tensor<64x968x1280xf16>
%3 = linalg.fill ins(%cst : f16) outs(%2 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%4 = linalg.batch_matmul ins(%0, %1 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%3 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%5 = hal.tensor.export %4 "output0" : tensor<64x968x1280xf16> -> !hal.buffer_view
util.return %5 : !hal.buffer_view
}
// -----// IR Dump After TensorPadToTensorInsertSlice (iree-flow-tensor-pad-to-tensor-insert-slice) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mfma_layout<F16_16x16x16_F32>, #iree_gpu.mfma_layout<F16_32x32x8_F32>], target_arch = "gfx942", ukernels = "none"}>
#device_target_rocm = #hal.device.target<"rocm", {legacy_sync}, [#executable_target_rocm_hsaco_fb]>
module attributes {hal.device.targets = [#device_target_rocm]} {
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%cst = arith.constant 0.000000e+00 : f16
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<64x968x1280xf16>
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<64x1280x1280xf16>
%2 = tensor.empty() : tensor<64x968x1280xf16>
%3 = linalg.fill ins(%cst : f16) outs(%2 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%4 = linalg.batch_matmul ins(%0, %1 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%3 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%5 = hal.tensor.export %4 "output0" : tensor<64x968x1280xf16> -> !hal.buffer_view
util.return %5 : !hal.buffer_view
}
}
// -----// IR Dump After InterchangeGenericOps (iree-flow-interchange-generic-ops) //----- //
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%cst = arith.constant 0.000000e+00 : f16
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<64x968x1280xf16>
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<64x1280x1280xf16>
%2 = tensor.empty() : tensor<64x968x1280xf16>
%3 = linalg.fill ins(%cst : f16) outs(%2 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%4 = linalg.batch_matmul ins(%0, %1 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%3 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%5 = hal.tensor.export %4 "output0" : tensor<64x968x1280xf16> -> !hal.buffer_view
util.return %5 : !hal.buffer_view
}
// -----// IR Dump After ResolveShapedTypeResultDims (resolve-shaped-type-result-dims) //----- //
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%cst = arith.constant 0.000000e+00 : f16
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<64x968x1280xf16>
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<64x1280x1280xf16>
%2 = tensor.empty() : tensor<64x968x1280xf16>
%3 = linalg.fill ins(%cst : f16) outs(%2 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%4 = linalg.batch_matmul ins(%0, %1 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%3 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%5 = hal.tensor.export %4 "output0" : tensor<64x968x1280xf16> -> !hal.buffer_view
util.return %5 : !hal.buffer_view
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%cst = arith.constant 0.000000e+00 : f16
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<64x968x1280xf16>
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<64x1280x1280xf16>
%2 = tensor.empty() : tensor<64x968x1280xf16>
%3 = linalg.fill ins(%cst : f16) outs(%2 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%4 = linalg.batch_matmul ins(%0, %1 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%3 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%5 = hal.tensor.export %4 "output0" : tensor<64x968x1280xf16> -> !hal.buffer_view
util.return %5 : !hal.buffer_view
}
// -----// IR Dump After CSE (cse) //----- //
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%cst = arith.constant 0.000000e+00 : f16
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<64x968x1280xf16>
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<64x1280x1280xf16>
%2 = tensor.empty() : tensor<64x968x1280xf16>
%3 = linalg.fill ins(%cst : f16) outs(%2 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%4 = linalg.batch_matmul ins(%0, %1 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%3 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%5 = hal.tensor.export %4 "output0" : tensor<64x968x1280xf16> -> !hal.buffer_view
util.return %5 : !hal.buffer_view
}
// -----// IR Dump After FusionOfTensorOps (iree-flow-fusion-of-tensor-ops) //----- //
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%cst = arith.constant 0.000000e+00 : f16
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<64x968x1280xf16>
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<64x1280x1280xf16>
%2 = tensor.empty() : tensor<64x968x1280xf16>
%3 = linalg.fill ins(%cst : f16) outs(%2 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%4 = linalg.batch_matmul ins(%0, %1 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%3 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%5 = hal.tensor.export %4 "output0" : tensor<64x968x1280xf16> -> !hal.buffer_view
util.return %5 : !hal.buffer_view
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%cst = arith.constant 0.000000e+00 : f16
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<64x968x1280xf16>
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<64x1280x1280xf16>
%2 = tensor.empty() : tensor<64x968x1280xf16>
%3 = linalg.fill ins(%cst : f16) outs(%2 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%4 = linalg.batch_matmul ins(%0, %1 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%3 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%5 = hal.tensor.export %4 "output0" : tensor<64x968x1280xf16> -> !hal.buffer_view
util.return %5 : !hal.buffer_view
}
// -----// IR Dump After CSE (cse) //----- //
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%cst = arith.constant 0.000000e+00 : f16
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<64x968x1280xf16>
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<64x1280x1280xf16>
%2 = tensor.empty() : tensor<64x968x1280xf16>
%3 = linalg.fill ins(%cst : f16) outs(%2 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%4 = linalg.batch_matmul ins(%0, %1 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%3 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%5 = hal.tensor.export %4 "output0" : tensor<64x968x1280xf16> -> !hal.buffer_view
util.return %5 : !hal.buffer_view
}
// -----// IR Dump After SplitReduction (iree-flow-split-reduction-ops) //----- //
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%cst = arith.constant 0.000000e+00 : f16
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<64x968x1280xf16>
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<64x1280x1280xf16>
%2 = tensor.empty() : tensor<64x968x1280xf16>
%3 = linalg.fill ins(%cst : f16) outs(%2 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%4 = linalg.batch_matmul ins(%0, %1 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%3 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%5 = hal.tensor.export %4 "output0" : tensor<64x968x1280xf16> -> !hal.buffer_view
util.return %5 : !hal.buffer_view
}
// -----// IR Dump After InterchangeGenericOps (iree-flow-interchange-generic-ops) //----- //
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%cst = arith.constant 0.000000e+00 : f16
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<64x968x1280xf16>
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<64x1280x1280xf16>
%2 = tensor.empty() : tensor<64x968x1280xf16>
%3 = linalg.fill ins(%cst : f16) outs(%2 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%4 = linalg.batch_matmul ins(%0, %1 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%3 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%5 = hal.tensor.export %4 "output0" : tensor<64x968x1280xf16> -> !hal.buffer_view
util.return %5 : !hal.buffer_view
}
// -----// IR Dump After InterchangeTransposeGenericOps (iree-flow-interchange-transpose-generic-ops) //----- //
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%cst = arith.constant 0.000000e+00 : f16
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<64x968x1280xf16>
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<64x1280x1280xf16>
%2 = tensor.empty() : tensor<64x968x1280xf16>
%3 = linalg.fill ins(%cst : f16) outs(%2 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%4 = linalg.batch_matmul ins(%0, %1 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%3 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%5 = hal.tensor.export %4 "output0" : tensor<64x968x1280xf16> -> !hal.buffer_view
util.return %5 : !hal.buffer_view
}
// -----// IR Dump After FormScalarDispatches (iree-flow-form-scalar-dispatches) //----- //
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%cst = arith.constant 0.000000e+00 : f16
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<64x968x1280xf16>
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<64x1280x1280xf16>
%2 = tensor.empty() : tensor<64x968x1280xf16>
%3 = linalg.fill ins(%cst : f16) outs(%2 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%4 = linalg.batch_matmul ins(%0, %1 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%3 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%5 = hal.tensor.export %4 "output0" : tensor<64x968x1280xf16> -> !hal.buffer_view
util.return %5 : !hal.buffer_view
}
// -----// IR Dump After FormDispatchRegions (iree-flow-form-dispatch-regions) //----- //
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%cst = arith.constant 0.000000e+00 : f16
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<64x968x1280xf16>
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<64x1280x1280xf16>
%2 = tensor.empty() : tensor<64x968x1280xf16>
%3 = linalg.fill ins(%cst : f16) outs(%2 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%4 = flow.dispatch.region -> (tensor<64x968x1280xf16>) {
%6 = linalg.batch_matmul ins(%0, %1 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%3 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
flow.return %6 : tensor<64x968x1280xf16>
}
%5 = hal.tensor.export %4 "output0" : tensor<64x968x1280xf16> -> !hal.buffer_view
util.return %5 : !hal.buffer_view
}
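// Note: FormDispatchRegions is the first pass in this section that changes the IR: the
// linalg.batch_matmul is wrapped in a flow.dispatch.region, while its tensor.empty and
// linalg.fill producers still live outside the region at this point.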
// -----// IR Dump After CloneProducersIntoDispatchRegions (iree-flow-clone-producers-into-dispatch-regions) //----- //
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%cst = arith.constant 0.000000e+00 : f16
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<64x968x1280xf16>
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<64x1280x1280xf16>
%2 = tensor.empty() : tensor<64x968x1280xf16>
%3 = linalg.fill ins(%cst : f16) outs(%2 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%4 = flow.dispatch.region -> (tensor<64x968x1280xf16>) {
%6 = tensor.empty() : tensor<64x968x1280xf16>
%cst_0 = arith.constant 0.000000e+00 : f16
%7 = linalg.fill ins(%cst_0 : f16) outs(%6 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%8 = linalg.batch_matmul ins(%0, %1 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%7 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
flow.return %8 : tensor<64x968x1280xf16>
}
%5 = hal.tensor.export %4 "output0" : tensor<64x968x1280xf16> -> !hal.buffer_view
util.return %5 : !hal.buffer_view
}
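// Note: CloneProducersIntoDispatchRegions has cloned the constant, tensor.empty, and
// linalg.fill into the region (%6, %cst_0, %7 above), making the region self-contained
// apart from its tensor operands; the now-redundant copies outside the region are cleaned
// up by later passes.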
// -----// IR Dump After CollapseDimensions (iree-flow-collapse-dimensions) //----- //
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%cst = arith.constant 0.000000e+00 : f16
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<64x968x1280xf16>
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<64x1280x1280xf16>
%2 = tensor.empty() : tensor<64x968x1280xf16>
%3 = linalg.fill ins(%cst : f16) outs(%2 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%4 = flow.dispatch.region -> (tensor<64x968x1280xf16>) {
%6 = tensor.empty() : tensor<64x968x1280xf16>
%cst_0 = arith.constant 0.000000e+00 : f16
%7 = linalg.fill ins(%cst_0 : f16) outs(%6 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%8 = linalg.batch_matmul ins(%0, %1 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%7 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
flow.return %8 : tensor<64x968x1280xf16>
}
%5 = hal.tensor.export %4 "output0" : tensor<64x968x1280xf16> -> !hal.buffer_view
util.return %5 : !hal.buffer_view
}
// -----// IR Dump After FormDispatchWorkgroups (iree-flow-form-dispatch-workgroups) //----- //
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<64x968x1280xf16>
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<64x1280x1280xf16>
%2 = flow.dispatch.workgroups(%0, %1) : (tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) -> tensor<64x968x1280xf16> =
(%arg2: !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>>, %arg3: !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>>, %arg4: !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>) {
%cst = arith.constant 0.000000e+00 : f16
%4 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>> -> tensor<64x968x1280xf16>
%5 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0], sizes = [64, 1280, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>> -> tensor<64x1280x1280xf16>
%6 = tensor.empty() : tensor<64x968x1280xf16>
%7 = linalg.fill ins(%cst : f16) outs(%6 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%8 = linalg.batch_matmul ins(%4, %5 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%7 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
flow.dispatch.tensor.store %8, %arg4, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : tensor<64x968x1280xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
flow.return
} count() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
flow.return %x, %y, %z : index, index, index
}
%3 = hal.tensor.export %2 "output0" : tensor<64x968x1280xf16> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
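// Note: FormDispatchWorkgroups rewrites the region into flow.dispatch.workgroups with an
// explicit !flow.dispatch.tensor ABI (tensor.load/store with offsets, sizes, and strides)
// plus a count() region. flow.dispatch.workgroup_count_from_slice leaves the workgroup
// count symbolic; it is expected to be materialized later during backend code generation
// once tile sizes are chosen.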
// -----// IR Dump After CaptureDispatchDynamicDims (iree-flow-capture-dispatch-dynamic-dims) //----- //
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<64x968x1280xf16>
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<64x1280x1280xf16>
%2 = flow.dispatch.workgroups(%0, %1) : (tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) -> tensor<64x968x1280xf16> =
(%arg2: !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>>, %arg3: !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>>, %arg4: !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>) {
%cst = arith.constant 0.000000e+00 : f16
%4 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>> -> tensor<64x968x1280xf16>
%5 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0], sizes = [64, 1280, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>> -> tensor<64x1280x1280xf16>
%6 = tensor.empty() : tensor<64x968x1280xf16>
%7 = linalg.fill ins(%cst : f16) outs(%6 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%8 = linalg.batch_matmul ins(%4, %5 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%7 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
flow.dispatch.tensor.store %8, %arg4, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : tensor<64x968x1280xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
flow.return
} count() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
flow.return %x, %y, %z : index, index, index
}
%3 = hal.tensor.export %2 "output0" : tensor<64x968x1280xf16> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<64x968x1280xf16>
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<64x1280x1280xf16>
%2 = flow.dispatch.workgroups(%0, %1) : (tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) -> tensor<64x968x1280xf16> =
(%arg2: !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>>, %arg3: !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>>, %arg4: !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>) {
%cst = arith.constant 0.000000e+00 : f16
%4 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>> -> tensor<64x968x1280xf16>
%5 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0], sizes = [64, 1280, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>> -> tensor<64x1280x1280xf16>
%6 = tensor.empty() : tensor<64x968x1280xf16>
%7 = linalg.fill ins(%cst : f16) outs(%6 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%8 = linalg.batch_matmul ins(%4, %5 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%7 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
flow.dispatch.tensor.store %8, %arg4, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : tensor<64x968x1280xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
flow.return
} count() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
flow.return %x, %y, %z : index, index, index
}
%3 = hal.tensor.export %2 "output0" : tensor<64x968x1280xf16> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
// -----// IR Dump After CSE (cse) //----- //
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<64x968x1280xf16>
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<64x1280x1280xf16>
%2 = flow.dispatch.workgroups(%0, %1) : (tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) -> tensor<64x968x1280xf16> =
(%arg2: !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>>, %arg3: !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>>, %arg4: !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>) {
%cst = arith.constant 0.000000e+00 : f16
%4 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>> -> tensor<64x968x1280xf16>
%5 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0], sizes = [64, 1280, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>> -> tensor<64x1280x1280xf16>
%6 = tensor.empty() : tensor<64x968x1280xf16>
%7 = linalg.fill ins(%cst : f16) outs(%6 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%8 = linalg.batch_matmul ins(%4, %5 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%7 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
flow.dispatch.tensor.store %8, %arg4, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : tensor<64x968x1280xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
flow.return
} count() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
flow.return %x, %y, %z : index, index, index
}
%3 = hal.tensor.export %2 "output0" : tensor<64x968x1280xf16> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
// -----// IR Dump After InitializeEmptyTensors (iree-flow-initialize-empty-tensors) //----- //
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<64x968x1280xf16>
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<64x1280x1280xf16>
%2 = flow.dispatch.workgroups(%0, %1) : (tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) -> tensor<64x968x1280xf16> =
(%arg2: !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>>, %arg3: !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>>, %arg4: !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>) {
%cst = arith.constant 0.000000e+00 : f16
%4 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>> -> tensor<64x968x1280xf16>
%5 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0], sizes = [64, 1280, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>> -> tensor<64x1280x1280xf16>
%6 = tensor.empty() : tensor<64x968x1280xf16>
%7 = linalg.fill ins(%cst : f16) outs(%6 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%8 = linalg.batch_matmul ins(%4, %5 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%7 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
flow.dispatch.tensor.store %8, %arg4, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : tensor<64x968x1280xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
flow.return
} count() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
flow.return %x, %y, %z : index, index, index
}
%3 = hal.tensor.export %2 "output0" : tensor<64x968x1280xf16> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
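// Note: InitializeEmptyTensors leaves the IR unchanged, likely because the only
// tensor.empty is already fully overwritten by the linalg.fill inside the dispatch, so
// there are no empty tensors escaping into the program that would need initialization.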
// -----// IR Dump After OutlineDispatchExterns (iree-flow-outline-dispatch-externs) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mfma_layout<F16_16x16x16_F32>, #iree_gpu.mfma_layout<F16_32x32x8_F32>], target_arch = "gfx942", ukernels = "none"}>
#device_target_rocm = #hal.device.target<"rocm", {legacy_sync}, [#executable_target_rocm_hsaco_fb]>
module attributes {hal.device.targets = [#device_target_rocm]} {
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<64x968x1280xf16>
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<64x1280x1280xf16>
%2 = flow.dispatch.workgroups(%0, %1) : (tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) -> tensor<64x968x1280xf16> =
(%arg2: !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>>, %arg3: !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>>, %arg4: !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>) {
%cst = arith.constant 0.000000e+00 : f16
%4 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>> -> tensor<64x968x1280xf16>
%5 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0], sizes = [64, 1280, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>> -> tensor<64x1280x1280xf16>
%6 = tensor.empty() : tensor<64x968x1280xf16>
%7 = linalg.fill ins(%cst : f16) outs(%6 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%8 = linalg.batch_matmul ins(%4, %5 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%7 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
flow.dispatch.tensor.store %8, %arg4, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : tensor<64x968x1280xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
flow.return
} count() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
flow.return %x, %y, %z : index, index, index
}
%3 = hal.tensor.export %2 "output0" : tensor<64x968x1280xf16> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
}
// -----// IR Dump After OutlineDispatchRegions (iree-flow-outline-dispatch-regions) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mfma_layout<F16_16x16x16_F32>, #iree_gpu.mfma_layout<F16_32x32x8_F32>], target_arch = "gfx942", ukernels = "none"}>
#device_target_rocm = #hal.device.target<"rocm", {legacy_sync}, [#executable_target_rocm_hsaco_fb]>
module attributes {hal.device.targets = [#device_target_rocm]} {
flow.executable private @batch_matmul_dispatch_0 {
flow.executable.export public @batch_matmul_dispatch_0 workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
flow.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @batch_matmul_dispatch_0(%arg0: !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>>, %arg1: !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>>, %arg2: !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>) {
%cst = arith.constant 0.000000e+00 : f16
%0 = flow.dispatch.tensor.load %arg0, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>> -> tensor<64x968x1280xf16>
%1 = flow.dispatch.tensor.load %arg1, offsets = [0, 0, 0], sizes = [64, 1280, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>> -> tensor<64x1280x1280xf16>
%2 = tensor.empty() : tensor<64x968x1280xf16>
%3 = linalg.fill ins(%cst : f16) outs(%2 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%4 = linalg.batch_matmul ins(%0, %1 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%3 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
flow.dispatch.tensor.store %4, %arg2, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : tensor<64x968x1280xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
return
}
}
}
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<64x968x1280xf16>
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<64x1280x1280xf16>
%2 = flow.dispatch @batch_matmul_dispatch_0::@batch_matmul_dispatch_0(%0, %1) : (tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) -> tensor<64x968x1280xf16>
%3 = hal.tensor.export %2 "output0" : tensor<64x968x1280xf16> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
}
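// Note: OutlineDispatchRegions hoists the workgroup body into the standalone
// flow.executable @batch_matmul_dispatch_0 with a flow.executable.export for the
// workgroup count, and the caller is reduced to a single flow.dispatch op between the
// hal.tensor.import and hal.tensor.export.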
// -----// IR Dump After AnnotateDispatches (iree-flow-annotate-dispatches) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mfma_layout<F16_16x16x16_F32>, #iree_gpu.mfma_layout<F16_32x32x8_F32>], target_arch = "gfx942", ukernels = "none"}>
#device_target_rocm = #hal.device.target<"rocm", {legacy_sync}, [#executable_target_rocm_hsaco_fb]>
module attributes {hal.device.targets = [#device_target_rocm]} {
flow.executable private @batch_matmul_dispatch_0 {
flow.executable.export public @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
flow.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%arg0: !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>>, %arg1: !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>>, %arg2: !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>) {
%cst = arith.constant 0.000000e+00 : f16
%0 = flow.dispatch.tensor.load %arg0, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>> -> tensor<64x968x1280xf16>
%1 = flow.dispatch.tensor.load %arg1, offsets = [0, 0, 0], sizes = [64, 1280, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>> -> tensor<64x1280x1280xf16>
%2 = tensor.empty() : tensor<64x968x1280xf16>
%3 = linalg.fill ins(%cst : f16) outs(%2 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%4 = linalg.batch_matmul ins(%0, %1 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%3 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
flow.dispatch.tensor.store %4, %arg2, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : tensor<64x968x1280xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
return
}
}
}
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<64x968x1280xf16>
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<64x1280x1280xf16>
%2 = flow.dispatch @batch_matmul_dispatch_0::@batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%0, %1) : (tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) -> tensor<64x968x1280xf16>
%3 = hal.tensor.export %2 "output0" : tensor<64x968x1280xf16> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
}
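// Note: AnnotateDispatches renames the export to encode the root op and problem size,
// batch_matmul_64x968x1280x1280_f16 (apparently the loop ranges B=64, M=968, N=1280,
// K=1280 with f16 operands), presumably to make later profiling and kernel dumps easier
// to attribute.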
// -----// IR Dump After StripDebugOps (iree-util-strip-debug-ops) //----- //
flow.executable private @batch_matmul_dispatch_0 {
flow.executable.export public @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
flow.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%arg0: !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>>, %arg1: !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>>, %arg2: !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>) {
%cst = arith.constant 0.000000e+00 : f16
%0 = flow.dispatch.tensor.load %arg0, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>> -> tensor<64x968x1280xf16>
%1 = flow.dispatch.tensor.load %arg1, offsets = [0, 0, 0], sizes = [64, 1280, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>> -> tensor<64x1280x1280xf16>
%2 = tensor.empty() : tensor<64x968x1280xf16>
%3 = linalg.fill ins(%cst : f16) outs(%2 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%4 = linalg.batch_matmul ins(%0, %1 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%3 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
flow.dispatch.tensor.store %4, %arg2, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : tensor<64x968x1280xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
return
}
}
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<64x968x1280xf16>
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<64x1280x1280xf16>
%2 = flow.dispatch @batch_matmul_dispatch_0::@batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%0, %1) : (tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) -> tensor<64x968x1280xf16>
%3 = hal.tensor.export %2 "output0" : tensor<64x968x1280xf16> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
// -----// IR Dump After DeduplicateExecutables (iree-flow-deduplicate-executables) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mfma_layout<F16_16x16x16_F32>, #iree_gpu.mfma_layout<F16_32x32x8_F32>], target_arch = "gfx942", ukernels = "none"}>
#device_target_rocm = #hal.device.target<"rocm", {legacy_sync}, [#executable_target_rocm_hsaco_fb]>
module attributes {hal.device.targets = [#device_target_rocm]} {
flow.executable private @batch_matmul_dispatch_0 {
flow.executable.export public @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
flow.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%arg0: !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>>, %arg1: !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>>, %arg2: !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>) {
%cst = arith.constant 0.000000e+00 : f16
%0 = flow.dispatch.tensor.load %arg0, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>> -> tensor<64x968x1280xf16>
%1 = flow.dispatch.tensor.load %arg1, offsets = [0, 0, 0], sizes = [64, 1280, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>> -> tensor<64x1280x1280xf16>
%2 = tensor.empty() : tensor<64x968x1280xf16>
%3 = linalg.fill ins(%cst : f16) outs(%2 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%4 = linalg.batch_matmul ins(%0, %1 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%3 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
flow.dispatch.tensor.store %4, %arg2, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : tensor<64x968x1280xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
return
}
}
}
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<64x968x1280xf16>
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<64x1280x1280xf16>
%2 = flow.dispatch @batch_matmul_dispatch_0::@batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%0, %1) : (tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) -> tensor<64x968x1280xf16>
%3 = hal.tensor.export %2 "output0" : tensor<64x968x1280xf16> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
}
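// Note: this dump is identical to the previous one; with a single executable in the
// module, DeduplicateExecutables has nothing to merge.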
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
flow.executable private @batch_matmul_dispatch_0 {
flow.executable.export public @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
flow.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%arg0: !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>>, %arg1: !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>>, %arg2: !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>) {
%cst = arith.constant 0.000000e+00 : f16
%0 = flow.dispatch.tensor.load %arg0, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>> -> tensor<64x968x1280xf16>
%1 = flow.dispatch.tensor.load %arg1, offsets = [0, 0, 0], sizes = [64, 1280, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>> -> tensor<64x1280x1280xf16>
%2 = tensor.empty() : tensor<64x968x1280xf16>
%3 = linalg.fill ins(%cst : f16) outs(%2 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%4 = linalg.batch_matmul ins(%0, %1 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%3 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
flow.dispatch.tensor.store %4, %arg2, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : tensor<64x968x1280xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
return
}
}
}
// -----// IR Dump After CSE (cse) //----- //
flow.executable private @batch_matmul_dispatch_0 {
flow.executable.export public @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
flow.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%arg0: !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>>, %arg1: !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>>, %arg2: !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>) {
%cst = arith.constant 0.000000e+00 : f16
%0 = flow.dispatch.tensor.load %arg0, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>> -> tensor<64x968x1280xf16>
%1 = flow.dispatch.tensor.load %arg1, offsets = [0, 0, 0], sizes = [64, 1280, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>> -> tensor<64x1280x1280xf16>
%2 = tensor.empty() : tensor<64x968x1280xf16>
%3 = linalg.fill ins(%cst : f16) outs(%2 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%4 = linalg.batch_matmul ins(%0, %1 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%3 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
flow.dispatch.tensor.store %4, %arg2, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : tensor<64x968x1280xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
return
}
}
}
// -----// IR Dump After InjectTensorTracing (iree-flow-inject-tensor-tracing) //----- //
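// NOTE: no tensors appear to be marked for tracing in this module, so the
// entry function below is unchanged.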
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<64x968x1280xf16>
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<64x1280x1280xf16>
%2 = flow.dispatch @batch_matmul_dispatch_0::@batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%0, %1) : (tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) -> tensor<64x968x1280xf16>
%3 = hal.tensor.export %2 "output0" : tensor<64x968x1280xf16> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
// -----// IR Dump After CleanupTensorShapes (iree-flow-cleanup-tensor-shapes) //----- //
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<64x968x1280xf16>
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<64x1280x1280xf16>
%2 = flow.dispatch @batch_matmul_dispatch_0::@batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%0, %1) : (tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) -> tensor<64x968x1280xf16>
%3 = hal.tensor.export %2 "output0" : tensor<64x968x1280xf16> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<64x968x1280xf16>
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<64x1280x1280xf16>
%2 = flow.dispatch @batch_matmul_dispatch_0::@batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%0, %1) : (tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) -> tensor<64x968x1280xf16>
%3 = hal.tensor.export %2 "output0" : tensor<64x968x1280xf16> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
// -----// IR Dump After CSE (cse) //----- //
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<64x968x1280xf16>
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<64x1280x1280xf16>
%2 = flow.dispatch @batch_matmul_dispatch_0::@batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%0, %1) : (tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) -> tensor<64x968x1280xf16>
%3 = hal.tensor.export %2 "output0" : tensor<64x968x1280xf16> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
// -----// IR Dump After SymbolDCE (symbol-dce) //----- //
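// NOTE: both symbols (@batch_matmul_dispatch_0 and the public @batch_matmul)
// are still referenced, so symbol-dce removes nothing; the full module,
// including the #hal.executable.target attributes, is simply reprinted.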
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mfma_layout<F16_16x16x16_F32>, #iree_gpu.mfma_layout<F16_32x32x8_F32>], target_arch = "gfx942", ukernels = "none"}>
#device_target_rocm = #hal.device.target<"rocm", {legacy_sync}, [#executable_target_rocm_hsaco_fb]>
module attributes {hal.device.targets = [#device_target_rocm]} {
flow.executable private @batch_matmul_dispatch_0 {
flow.executable.export public @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
flow.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%arg0: !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>>, %arg1: !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>>, %arg2: !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>) {
%cst = arith.constant 0.000000e+00 : f16
%0 = flow.dispatch.tensor.load %arg0, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>> -> tensor<64x968x1280xf16>
%1 = flow.dispatch.tensor.load %arg1, offsets = [0, 0, 0], sizes = [64, 1280, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>> -> tensor<64x1280x1280xf16>
%2 = tensor.empty() : tensor<64x968x1280xf16>
%3 = linalg.fill ins(%cst : f16) outs(%2 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%4 = linalg.batch_matmul ins(%0, %1 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%3 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
flow.dispatch.tensor.store %4, %arg2, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : tensor<64x968x1280xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
return
}
}
}
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<64x968x1280xf16>
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<64x1280x1280xf16>
%2 = flow.dispatch @batch_matmul_dispatch_0::@batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%0, %1) : (tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) -> tensor<64x968x1280xf16>
%3 = hal.tensor.export %2 "output0" : tensor<64x968x1280xf16> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
}
// -----// IR Dump After VerifyInputPass (iree-stream-verify-input) //----- //
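// NOTE: first pass of the Stream phase; verification only. The module below is
// unchanged from the SymbolDCE dump.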
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mfma_layout<F16_16x16x16_F32>, #iree_gpu.mfma_layout<F16_32x32x8_F32>], target_arch = "gfx942", ukernels = "none"}>
#device_target_rocm = #hal.device.target<"rocm", {legacy_sync}, [#executable_target_rocm_hsaco_fb]>
module attributes {hal.device.targets = [#device_target_rocm]} {
flow.executable private @batch_matmul_dispatch_0 {
flow.executable.export public @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
flow.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%arg0: !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>>, %arg1: !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>>, %arg2: !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>) {
%cst = arith.constant 0.000000e+00 : f16
%0 = flow.dispatch.tensor.load %arg0, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>> -> tensor<64x968x1280xf16>
%1 = flow.dispatch.tensor.load %arg1, offsets = [0, 0, 0], sizes = [64, 1280, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>> -> tensor<64x1280x1280xf16>
%2 = tensor.empty() : tensor<64x968x1280xf16>
%3 = linalg.fill ins(%cst : f16) outs(%2 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%4 = linalg.batch_matmul ins(%0, %1 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%3 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
flow.dispatch.tensor.store %4, %arg2, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : tensor<64x968x1280xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
return
}
}
}
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<64x968x1280xf16>
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<64x1280x1280xf16>
%2 = flow.dispatch @batch_matmul_dispatch_0::@batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%0, %1) : (tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) -> tensor<64x968x1280xf16>
%3 = hal.tensor.export %2 "output0" : tensor<64x968x1280xf16> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<64x968x1280xf16>
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<64x1280x1280xf16>
%2 = flow.dispatch @batch_matmul_dispatch_0::@batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%0, %1) : (tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) -> tensor<64x968x1280xf16>
%3 = hal.tensor.export %2 "output0" : tensor<64x968x1280xf16> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
// -----// IR Dump After CSE (cse) //----- //
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<64x968x1280xf16>
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<64x1280x1280xf16>
%2 = flow.dispatch @batch_matmul_dispatch_0::@batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%0, %1) : (tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) -> tensor<64x968x1280xf16>
%3 = hal.tensor.export %2 "output0" : tensor<64x968x1280xf16> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
// -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- //
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<64x968x1280xf16>
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<64x1280x1280xf16>
%2 = flow.dispatch @batch_matmul_dispatch_0::@batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%0, %1) : (tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) -> tensor<64x968x1280xf16>
%3 = hal.tensor.export %2 "output0" : tensor<64x968x1280xf16> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
// -----// IR Dump After ApplyPatterns (iree-util-apply-patterns) //----- //
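// NOTE: ApplyPatterns / FoldGlobals / FuseGlobals / IPO below appear to be
// IREE's standard util-dialect cleanup ladder (run twice here). With no
// util.global ops and a single dispatch in this module, each of those dumps is
// identical to this one.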
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mfma_layout<F16_16x16x16_F32>, #iree_gpu.mfma_layout<F16_32x32x8_F32>], target_arch = "gfx942", ukernels = "none"}>
#device_target_rocm = #hal.device.target<"rocm", {legacy_sync}, [#executable_target_rocm_hsaco_fb]>
module attributes {hal.device.targets = [#device_target_rocm]} {
flow.executable private @batch_matmul_dispatch_0 {
flow.executable.export public @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
flow.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%arg0: !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>>, %arg1: !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>>, %arg2: !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>) {
%cst = arith.constant 0.000000e+00 : f16
%0 = flow.dispatch.tensor.load %arg0, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>> -> tensor<64x968x1280xf16>
%1 = flow.dispatch.tensor.load %arg1, offsets = [0, 0, 0], sizes = [64, 1280, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>> -> tensor<64x1280x1280xf16>
%2 = tensor.empty() : tensor<64x968x1280xf16>
%3 = linalg.fill ins(%cst : f16) outs(%2 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%4 = linalg.batch_matmul ins(%0, %1 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%3 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
flow.dispatch.tensor.store %4, %arg2, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : tensor<64x968x1280xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
return
}
}
}
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<64x968x1280xf16>
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<64x1280x1280xf16>
%2 = flow.dispatch @batch_matmul_dispatch_0::@batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%0, %1) : (tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) -> tensor<64x968x1280xf16>
%3 = hal.tensor.export %2 "output0" : tensor<64x968x1280xf16> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
}
// -----// IR Dump After FoldGlobals (iree-util-fold-globals) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mfma_layout<F16_16x16x16_F32>, #iree_gpu.mfma_layout<F16_32x32x8_F32>], target_arch = "gfx942", ukernels = "none"}>
#device_target_rocm = #hal.device.target<"rocm", {legacy_sync}, [#executable_target_rocm_hsaco_fb]>
module attributes {hal.device.targets = [#device_target_rocm]} {
flow.executable private @batch_matmul_dispatch_0 {
flow.executable.export public @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
flow.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%arg0: !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>>, %arg1: !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>>, %arg2: !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>) {
%cst = arith.constant 0.000000e+00 : f16
%0 = flow.dispatch.tensor.load %arg0, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>> -> tensor<64x968x1280xf16>
%1 = flow.dispatch.tensor.load %arg1, offsets = [0, 0, 0], sizes = [64, 1280, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>> -> tensor<64x1280x1280xf16>
%2 = tensor.empty() : tensor<64x968x1280xf16>
%3 = linalg.fill ins(%cst : f16) outs(%2 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%4 = linalg.batch_matmul ins(%0, %1 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%3 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
flow.dispatch.tensor.store %4, %arg2, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : tensor<64x968x1280xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
return
}
}
}
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<64x968x1280xf16>
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<64x1280x1280xf16>
%2 = flow.dispatch @batch_matmul_dispatch_0::@batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%0, %1) : (tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) -> tensor<64x968x1280xf16>
%3 = hal.tensor.export %2 "output0" : tensor<64x968x1280xf16> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
}
// -----// IR Dump After FuseGlobals (iree-util-fuse-globals) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mfma_layout<F16_16x16x16_F32>, #iree_gpu.mfma_layout<F16_32x32x8_F32>], target_arch = "gfx942", ukernels = "none"}>
#device_target_rocm = #hal.device.target<"rocm", {legacy_sync}, [#executable_target_rocm_hsaco_fb]>
module attributes {hal.device.targets = [#device_target_rocm]} {
flow.executable private @batch_matmul_dispatch_0 {
flow.executable.export public @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
flow.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%arg0: !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>>, %arg1: !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>>, %arg2: !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>) {
%cst = arith.constant 0.000000e+00 : f16
%0 = flow.dispatch.tensor.load %arg0, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>> -> tensor<64x968x1280xf16>
%1 = flow.dispatch.tensor.load %arg1, offsets = [0, 0, 0], sizes = [64, 1280, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>> -> tensor<64x1280x1280xf16>
%2 = tensor.empty() : tensor<64x968x1280xf16>
%3 = linalg.fill ins(%cst : f16) outs(%2 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%4 = linalg.batch_matmul ins(%0, %1 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%3 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
flow.dispatch.tensor.store %4, %arg2, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : tensor<64x968x1280xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
return
}
}
}
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<64x968x1280xf16>
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<64x1280x1280xf16>
%2 = flow.dispatch @batch_matmul_dispatch_0::@batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%0, %1) : (tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) -> tensor<64x968x1280xf16>
%3 = hal.tensor.export %2 "output0" : tensor<64x968x1280xf16> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
}
// -----// IR Dump After IPO (iree-util-ipo) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mfma_layout<F16_16x16x16_F32>, #iree_gpu.mfma_layout<F16_32x32x8_F32>], target_arch = "gfx942", ukernels = "none"}>
#device_target_rocm = #hal.device.target<"rocm", {legacy_sync}, [#executable_target_rocm_hsaco_fb]>
module attributes {hal.device.targets = [#device_target_rocm]} {
flow.executable private @batch_matmul_dispatch_0 {
flow.executable.export public @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
flow.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%arg0: !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>>, %arg1: !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>>, %arg2: !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>) {
%cst = arith.constant 0.000000e+00 : f16
%0 = flow.dispatch.tensor.load %arg0, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>> -> tensor<64x968x1280xf16>
%1 = flow.dispatch.tensor.load %arg1, offsets = [0, 0, 0], sizes = [64, 1280, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>> -> tensor<64x1280x1280xf16>
%2 = tensor.empty() : tensor<64x968x1280xf16>
%3 = linalg.fill ins(%cst : f16) outs(%2 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%4 = linalg.batch_matmul ins(%0, %1 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%3 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
flow.dispatch.tensor.store %4, %arg2, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : tensor<64x968x1280xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
return
}
}
}
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<64x968x1280xf16>
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<64x1280x1280xf16>
%2 = flow.dispatch @batch_matmul_dispatch_0::@batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%0, %1) : (tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) -> tensor<64x968x1280xf16>
%3 = hal.tensor.export %2 "output0" : tensor<64x968x1280xf16> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
}
// -----// IR Dump After OutlineConstants (iree-util-outline-constants) //----- //
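// NOTE: the only constants in the module are scalars (the f16 zero and index
// values), so there is presumably nothing worth outlining into globals; the
// dump is unchanged.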
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mfma_layout<F16_16x16x16_F32>, #iree_gpu.mfma_layout<F16_32x32x8_F32>], target_arch = "gfx942", ukernels = "none"}>
#device_target_rocm = #hal.device.target<"rocm", {legacy_sync}, [#executable_target_rocm_hsaco_fb]>
module attributes {hal.device.targets = [#device_target_rocm]} {
flow.executable private @batch_matmul_dispatch_0 {
flow.executable.export public @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
flow.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%arg0: !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>>, %arg1: !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>>, %arg2: !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>) {
%cst = arith.constant 0.000000e+00 : f16
%0 = flow.dispatch.tensor.load %arg0, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>> -> tensor<64x968x1280xf16>
%1 = flow.dispatch.tensor.load %arg1, offsets = [0, 0, 0], sizes = [64, 1280, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>> -> tensor<64x1280x1280xf16>
%2 = tensor.empty() : tensor<64x968x1280xf16>
%3 = linalg.fill ins(%cst : f16) outs(%2 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%4 = linalg.batch_matmul ins(%0, %1 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%3 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
flow.dispatch.tensor.store %4, %arg2, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : tensor<64x968x1280xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
return
}
}
}
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<64x968x1280xf16>
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<64x1280x1280xf16>
%2 = flow.dispatch @batch_matmul_dispatch_0::@batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%0, %1) : (tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) -> tensor<64x968x1280xf16>
%3 = hal.tensor.export %2 "output0" : tensor<64x968x1280xf16> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<64x968x1280xf16>
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<64x1280x1280xf16>
%2 = flow.dispatch @batch_matmul_dispatch_0::@batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%0, %1) : (tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) -> tensor<64x968x1280xf16>
%3 = hal.tensor.export %2 "output0" : tensor<64x968x1280xf16> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
// -----// IR Dump After CSE (cse) //----- //
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<64x968x1280xf16>
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<64x1280x1280xf16>
%2 = flow.dispatch @batch_matmul_dispatch_0::@batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%0, %1) : (tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) -> tensor<64x968x1280xf16>
%3 = hal.tensor.export %2 "output0" : tensor<64x968x1280xf16> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
// -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- //
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<64x968x1280xf16>
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<64x1280x1280xf16>
%2 = flow.dispatch @batch_matmul_dispatch_0::@batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%0, %1) : (tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) -> tensor<64x968x1280xf16>
%3 = hal.tensor.export %2 "output0" : tensor<64x968x1280xf16> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
// -----// IR Dump After ApplyPatterns (iree-util-apply-patterns) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mfma_layout<F16_16x16x16_F32>, #iree_gpu.mfma_layout<F16_32x32x8_F32>], target_arch = "gfx942", ukernels = "none"}>
#device_target_rocm = #hal.device.target<"rocm", {legacy_sync}, [#executable_target_rocm_hsaco_fb]>
module attributes {hal.device.targets = [#device_target_rocm]} {
flow.executable private @batch_matmul_dispatch_0 {
flow.executable.export public @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
flow.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%arg0: !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>>, %arg1: !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>>, %arg2: !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>) {
%cst = arith.constant 0.000000e+00 : f16
%0 = flow.dispatch.tensor.load %arg0, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>> -> tensor<64x968x1280xf16>
%1 = flow.dispatch.tensor.load %arg1, offsets = [0, 0, 0], sizes = [64, 1280, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>> -> tensor<64x1280x1280xf16>
%2 = tensor.empty() : tensor<64x968x1280xf16>
%3 = linalg.fill ins(%cst : f16) outs(%2 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%4 = linalg.batch_matmul ins(%0, %1 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%3 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
flow.dispatch.tensor.store %4, %arg2, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : tensor<64x968x1280xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
return
}
}
}
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<64x968x1280xf16>
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<64x1280x1280xf16>
%2 = flow.dispatch @batch_matmul_dispatch_0::@batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%0, %1) : (tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) -> tensor<64x968x1280xf16>
%3 = hal.tensor.export %2 "output0" : tensor<64x968x1280xf16> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
}
// -----// IR Dump After FoldGlobals (iree-util-fold-globals) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mfma_layout<F16_16x16x16_F32>, #iree_gpu.mfma_layout<F16_32x32x8_F32>], target_arch = "gfx942", ukernels = "none"}>
#device_target_rocm = #hal.device.target<"rocm", {legacy_sync}, [#executable_target_rocm_hsaco_fb]>
module attributes {hal.device.targets = [#device_target_rocm]} {
flow.executable private @batch_matmul_dispatch_0 {
flow.executable.export public @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
flow.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%arg0: !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>>, %arg1: !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>>, %arg2: !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>) {
%cst = arith.constant 0.000000e+00 : f16
%0 = flow.dispatch.tensor.load %arg0, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>> -> tensor<64x968x1280xf16>
%1 = flow.dispatch.tensor.load %arg1, offsets = [0, 0, 0], sizes = [64, 1280, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>> -> tensor<64x1280x1280xf16>
%2 = tensor.empty() : tensor<64x968x1280xf16>
%3 = linalg.fill ins(%cst : f16) outs(%2 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%4 = linalg.batch_matmul ins(%0, %1 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%3 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
flow.dispatch.tensor.store %4, %arg2, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : tensor<64x968x1280xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
return
}
}
}
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<64x968x1280xf16>
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<64x1280x1280xf16>
%2 = flow.dispatch @batch_matmul_dispatch_0::@batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%0, %1) : (tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) -> tensor<64x968x1280xf16>
%3 = hal.tensor.export %2 "output0" : tensor<64x968x1280xf16> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
}
// -----// IR Dump After FuseGlobals (iree-util-fuse-globals) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mfma_layout<F16_16x16x16_F32>, #iree_gpu.mfma_layout<F16_32x32x8_F32>], target_arch = "gfx942", ukernels = "none"}>
#device_target_rocm = #hal.device.target<"rocm", {legacy_sync}, [#executable_target_rocm_hsaco_fb]>
module attributes {hal.device.targets = [#device_target_rocm]} {
flow.executable private @batch_matmul_dispatch_0 {
flow.executable.export public @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
flow.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%arg0: !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>>, %arg1: !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>>, %arg2: !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>) {
%cst = arith.constant 0.000000e+00 : f16
%0 = flow.dispatch.tensor.load %arg0, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>> -> tensor<64x968x1280xf16>
%1 = flow.dispatch.tensor.load %arg1, offsets = [0, 0, 0], sizes = [64, 1280, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>> -> tensor<64x1280x1280xf16>
%2 = tensor.empty() : tensor<64x968x1280xf16>
%3 = linalg.fill ins(%cst : f16) outs(%2 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%4 = linalg.batch_matmul ins(%0, %1 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%3 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
flow.dispatch.tensor.store %4, %arg2, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : tensor<64x968x1280xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
return
}
}
}
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<64x968x1280xf16>
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<64x1280x1280xf16>
%2 = flow.dispatch @batch_matmul_dispatch_0::@batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%0, %1) : (tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) -> tensor<64x968x1280xf16>
%3 = hal.tensor.export %2 "output0" : tensor<64x968x1280xf16> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
}
// -----// IR Dump After IPO (iree-util-ipo) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mfma_layout<F16_16x16x16_F32>, #iree_gpu.mfma_layout<F16_32x32x8_F32>], target_arch = "gfx942", ukernels = "none"}>
#device_target_rocm = #hal.device.target<"rocm", {legacy_sync}, [#executable_target_rocm_hsaco_fb]>
module attributes {hal.device.targets = [#device_target_rocm]} {
flow.executable private @batch_matmul_dispatch_0 {
flow.executable.export public @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
flow.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%arg0: !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>>, %arg1: !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>>, %arg2: !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>) {
%cst = arith.constant 0.000000e+00 : f16
%0 = flow.dispatch.tensor.load %arg0, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>> -> tensor<64x968x1280xf16>
%1 = flow.dispatch.tensor.load %arg1, offsets = [0, 0, 0], sizes = [64, 1280, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>> -> tensor<64x1280x1280xf16>
%2 = tensor.empty() : tensor<64x968x1280xf16>
%3 = linalg.fill ins(%cst : f16) outs(%2 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%4 = linalg.batch_matmul ins(%0, %1 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%3 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
flow.dispatch.tensor.store %4, %arg2, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : tensor<64x968x1280xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
return
}
}
}
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<64x968x1280xf16>
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<64x1280x1280xf16>
%2 = flow.dispatch @batch_matmul_dispatch_0::@batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%0, %1) : (tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) -> tensor<64x968x1280xf16>
%3 = hal.tensor.export %2 "output0" : tensor<64x968x1280xf16> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
}
// -----// IR Dump After ConvertToStreamPass (iree-stream-conversion) //----- //
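// NOTE: the first structural change in a while. Conversion to the stream
// dialect rewrites the module as visible below:
//   * flow.executable becomes stream.executable; the dispatch function now
//     takes opaque !stream.binding arguments and recovers its typed
//     !flow.dispatch.tensor views via stream.binding.subspan at offset %c0.
//   * each hal.tensor.import becomes a hal.buffer_view.assert (shape/dtype
//     check) plus a stream.tensor.import into !stream.resource<external>,
//     sized explicitly with stream.tensor.sizeof, followed by a
//     stream.async.transfer into the placeholder lifetime !stream.resource<*>.
//   * flow.dispatch becomes stream.async.dispatch with explicit resource
//     ranges of the form %res[%c0 to %size for %size].
//   * the result is transferred back to !stream.resource<external> and
//     exported with stream.tensor.export.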
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mfma_layout<F16_16x16x16_F32>, #iree_gpu.mfma_layout<F16_32x32x8_F32>], target_arch = "gfx942", ukernels = "none"}>
#device_target_rocm = #hal.device.target<"rocm", {legacy_sync}, [#executable_target_rocm_hsaco_fb]>
module attributes {hal.device.targets = [#device_target_rocm]} {
stream.executable private @batch_matmul_dispatch_0 {
stream.executable.export public @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
%cst = arith.constant 0.000000e+00 : f16
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>> -> tensor<64x968x1280xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [64, 1280, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>> -> tensor<64x1280x1280xf16>
%5 = tensor.empty() : tensor<64x968x1280xf16>
%6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%7 = linalg.batch_matmul ins(%3, %4 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%6 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : tensor<64x968x1280xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
return
}
}
}
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%element_type_f16 = hal.element_type<f16> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32
%c64 = arith.constant 64 : index
%c968 = arith.constant 968 : index
%c1280 = arith.constant 1280 : index
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c64, %c968, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%0 = stream.tensor.sizeof tensor<64x968x1280xf16> : index
%1 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<64x968x1280xf16> in !stream.resource<external>{%0}
%2 = stream.async.transfer %1 : !stream.resource<external>{%0} -> !stream.resource<*>{%0}
%element_type_f16_0 = hal.element_type<f16> : i32
%dense_row_major_1 = hal.encoding_type<dense_row_major> : i32
%c64_2 = arith.constant 64 : index
%c1280_3 = arith.constant 1280 : index
%c1280_4 = arith.constant 1280 : index
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c64_2, %c1280_3, %c1280_4]) type(%element_type_f16_0) encoding(%dense_row_major_1)
%3 = stream.tensor.sizeof tensor<64x1280x1280xf16> : index
%4 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<64x1280x1280xf16> in !stream.resource<external>{%3}
%5 = stream.async.transfer %4 : !stream.resource<external>{%3} -> !stream.resource<*>{%3}
%c0 = arith.constant 0 : index
%6 = stream.tensor.sizeof tensor<64x968x1280xf16> : index
%7 = stream.async.dispatch @batch_matmul_dispatch_0::@batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%2[%c0 to %0 for %0], %5[%c0 to %3 for %3]) : (!stream.resource<*>{%0}, !stream.resource<*>{%3}) -> !stream.resource<*>{%6}
%8 = stream.async.transfer %7 : !stream.resource<*>{%6} -> !stream.resource<external>{%6}
%9 = stream.tensor.export %8 : tensor<64x968x1280xf16> in !stream.resource<external>{%6} -> !hal.buffer_view
util.return %9 : !hal.buffer_view
}
}
// -----// IR Dump After VerifyLoweringToTensorsPass (iree-stream-verify-lowering-to-tensors) //----- //
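// NOTE: verification only; the module below matches the ConvertToStreamPass
// dump exactly.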
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mfma_layout<F16_16x16x16_F32>, #iree_gpu.mfma_layout<F16_32x32x8_F32>], target_arch = "gfx942", ukernels = "none"}>
#device_target_rocm = #hal.device.target<"rocm", {legacy_sync}, [#executable_target_rocm_hsaco_fb]>
module attributes {hal.device.targets = [#device_target_rocm]} {
stream.executable private @batch_matmul_dispatch_0 {
stream.executable.export public @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
%cst = arith.constant 0.000000e+00 : f16
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>> -> tensor<64x968x1280xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [64, 1280, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>> -> tensor<64x1280x1280xf16>
%5 = tensor.empty() : tensor<64x968x1280xf16>
%6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%7 = linalg.batch_matmul ins(%3, %4 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%6 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : tensor<64x968x1280xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
return
}
}
}
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%element_type_f16 = hal.element_type<f16> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32
%c64 = arith.constant 64 : index
%c968 = arith.constant 968 : index
%c1280 = arith.constant 1280 : index
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c64, %c968, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%0 = stream.tensor.sizeof tensor<64x968x1280xf16> : index
%1 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<64x968x1280xf16> in !stream.resource<external>{%0}
%2 = stream.async.transfer %1 : !stream.resource<external>{%0} -> !stream.resource<*>{%0}
%element_type_f16_0 = hal.element_type<f16> : i32
%dense_row_major_1 = hal.encoding_type<dense_row_major> : i32
%c64_2 = arith.constant 64 : index
%c1280_3 = arith.constant 1280 : index
%c1280_4 = arith.constant 1280 : index
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c64_2, %c1280_3, %c1280_4]) type(%element_type_f16_0) encoding(%dense_row_major_1)
%3 = stream.tensor.sizeof tensor<64x1280x1280xf16> : index
%4 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<64x1280x1280xf16> in !stream.resource<external>{%3}
%5 = stream.async.transfer %4 : !stream.resource<external>{%3} -> !stream.resource<*>{%3}
%c0 = arith.constant 0 : index
%6 = stream.tensor.sizeof tensor<64x968x1280xf16> : index
%7 = stream.async.dispatch @batch_matmul_dispatch_0::@batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%2[%c0 to %0 for %0], %5[%c0 to %3 for %3]) : (!stream.resource<*>{%0}, !stream.resource<*>{%3}) -> !stream.resource<*>{%6}
%8 = stream.async.transfer %7 : !stream.resource<*>{%6} -> !stream.resource<external>{%6}
%9 = stream.tensor.export %8 : tensor<64x968x1280xf16> in !stream.resource<external>{%6} -> !hal.buffer_view
util.return %9 : !hal.buffer_view
}
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
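// NOTE: canonicalization hoists the index constants to the top of the entry
// function and folds the duplicates (%c64_2, %c1280_3, %c1280_4 collapse into
// %c64/%c1280). The duplicated element-type/encoding constants for input1
// survive until the cse dump below.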
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%c0 = arith.constant 0 : index
%c1280 = arith.constant 1280 : index
%c968 = arith.constant 968 : index
%c64 = arith.constant 64 : index
%element_type_f16 = hal.element_type<f16> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c64, %c968, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%0 = stream.tensor.sizeof tensor<64x968x1280xf16> : index
%1 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<64x968x1280xf16> in !stream.resource<external>{%0}
%2 = stream.async.transfer %1 : !stream.resource<external>{%0} -> !stream.resource<*>{%0}
%element_type_f16_0 = hal.element_type<f16> : i32
%dense_row_major_1 = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c64, %c1280, %c1280]) type(%element_type_f16_0) encoding(%dense_row_major_1)
%3 = stream.tensor.sizeof tensor<64x1280x1280xf16> : index
%4 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<64x1280x1280xf16> in !stream.resource<external>{%3}
%5 = stream.async.transfer %4 : !stream.resource<external>{%3} -> !stream.resource<*>{%3}
%6 = stream.tensor.sizeof tensor<64x968x1280xf16> : index
%7 = stream.async.dispatch @batch_matmul_dispatch_0::@batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%2[%c0 to %0 for %0], %5[%c0 to %3 for %3]) : (!stream.resource<*>{%0}, !stream.resource<*>{%3}) -> !stream.resource<*>{%6}
%8 = stream.async.transfer %7 : !stream.resource<*>{%6} -> !stream.resource<external>{%6}
%9 = stream.tensor.export %8 : tensor<64x968x1280xf16> in !stream.resource<external>{%6} -> !hal.buffer_view
util.return %9 : !hal.buffer_view
}
// -----// IR Dump After CSE (cse) //----- //
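// NOTE: cse merges the repeated hal.element_type/hal.encoding_type constants
// and, more usefully, the two stream.tensor.sizeof tensor<64x968x1280xf16>
// values, so the dispatch result is now sized by %0 rather than a separate SSA
// value.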
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%c0 = arith.constant 0 : index
%c1280 = arith.constant 1280 : index
%c968 = arith.constant 968 : index
%c64 = arith.constant 64 : index
%element_type_f16 = hal.element_type<f16> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c64, %c968, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%0 = stream.tensor.sizeof tensor<64x968x1280xf16> : index
%1 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<64x968x1280xf16> in !stream.resource<external>{%0}
%2 = stream.async.transfer %1 : !stream.resource<external>{%0} -> !stream.resource<*>{%0}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c64, %c1280, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%3 = stream.tensor.sizeof tensor<64x1280x1280xf16> : index
%4 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<64x1280x1280xf16> in !stream.resource<external>{%3}
%5 = stream.async.transfer %4 : !stream.resource<external>{%3} -> !stream.resource<*>{%3}
%6 = stream.async.dispatch @batch_matmul_dispatch_0::@batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%2[%c0 to %0 for %0], %5[%c0 to %3 for %3]) : (!stream.resource<*>{%0}, !stream.resource<*>{%3}) -> !stream.resource<*>{%0}
%7 = stream.async.transfer %6 : !stream.resource<*>{%0} -> !stream.resource<external>{%0}
%8 = stream.tensor.export %7 : tensor<64x968x1280xf16> in !stream.resource<external>{%0} -> !hal.buffer_view
util.return %8 : !hal.buffer_view
}
// -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- //
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%c0 = arith.constant 0 : index
%c1280 = arith.constant 1280 : index
%c968 = arith.constant 968 : index
%c64 = arith.constant 64 : index
%element_type_f16 = hal.element_type<f16> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c64, %c968, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%0 = stream.tensor.sizeof tensor<64x968x1280xf16> : index
%1 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<64x968x1280xf16> in !stream.resource<external>{%0}
%2 = stream.async.transfer %1 : !stream.resource<external>{%0} -> !stream.resource<*>{%0}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c64, %c1280, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%3 = stream.tensor.sizeof tensor<64x1280x1280xf16> : index
%4 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<64x1280x1280xf16> in !stream.resource<external>{%3}
%5 = stream.async.transfer %4 : !stream.resource<external>{%3} -> !stream.resource<*>{%3}
%6 = stream.async.dispatch @batch_matmul_dispatch_0::@batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%2[%c0 to %0 for %0], %5[%c0 to %3 for %3]) : (!stream.resource<*>{%0}, !stream.resource<*>{%3}) -> !stream.resource<*>{%0}
%7 = stream.async.transfer %6 : !stream.resource<*>{%0} -> !stream.resource<external>{%0}
%8 = stream.tensor.export %7 : tensor<64x968x1280xf16> in !stream.resource<external>{%0} -> !hal.buffer_view
util.return %8 : !hal.buffer_view
}
// -----// IR Dump After ApplyPatterns (iree-util-apply-patterns) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mfma_layout<F16_16x16x16_F32>, #iree_gpu.mfma_layout<F16_32x32x8_F32>], target_arch = "gfx942", ukernels = "none"}>
#device_target_rocm = #hal.device.target<"rocm", {legacy_sync}, [#executable_target_rocm_hsaco_fb]>
module attributes {hal.device.targets = [#device_target_rocm]} {
stream.executable private @batch_matmul_dispatch_0 {
stream.executable.export public @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
%cst = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>> -> tensor<64x968x1280xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [64, 1280, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>> -> tensor<64x1280x1280xf16>
%5 = tensor.empty() : tensor<64x968x1280xf16>
%6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%7 = linalg.batch_matmul ins(%3, %4 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%6 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : tensor<64x968x1280xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
return
}
}
}
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%c0 = arith.constant 0 : index
%c1280 = arith.constant 1280 : index
%c968 = arith.constant 968 : index
%c64 = arith.constant 64 : index
%element_type_f16 = hal.element_type<f16> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c64, %c968, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%0 = stream.tensor.sizeof tensor<64x968x1280xf16> : index
%1 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<64x968x1280xf16> in !stream.resource<external>{%0}
%2 = stream.async.transfer %1 : !stream.resource<external>{%0} -> !stream.resource<*>{%0}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c64, %c1280, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%3 = stream.tensor.sizeof tensor<64x1280x1280xf16> : index
%4 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<64x1280x1280xf16> in !stream.resource<external>{%3}
%5 = stream.async.transfer %4 : !stream.resource<external>{%3} -> !stream.resource<*>{%3}
%6 = stream.async.dispatch @batch_matmul_dispatch_0::@batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%2[%c0 to %0 for %0], %5[%c0 to %3 for %3]) : (!stream.resource<*>{%0}, !stream.resource<*>{%3}) -> !stream.resource<*>{%0}
%7 = stream.async.transfer %6 : !stream.resource<*>{%0} -> !stream.resource<external>{%0}
%8 = stream.tensor.export %7 : tensor<64x968x1280xf16> in !stream.resource<external>{%0} -> !hal.buffer_view
util.return %8 : !hal.buffer_view
}
}
// -----// IR Dump After FoldGlobals (iree-util-fold-globals) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mfma_layout<F16_16x16x16_F32>, #iree_gpu.mfma_layout<F16_32x32x8_F32>], target_arch = "gfx942", ukernels = "none"}>
#device_target_rocm = #hal.device.target<"rocm", {legacy_sync}, [#executable_target_rocm_hsaco_fb]>
module attributes {hal.device.targets = [#device_target_rocm]} {
stream.executable private @batch_matmul_dispatch_0 {
stream.executable.export public @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
%cst = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>> -> tensor<64x968x1280xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [64, 1280, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>> -> tensor<64x1280x1280xf16>
%5 = tensor.empty() : tensor<64x968x1280xf16>
%6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%7 = linalg.batch_matmul ins(%3, %4 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%6 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : tensor<64x968x1280xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
return
}
}
}
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%c0 = arith.constant 0 : index
%c1280 = arith.constant 1280 : index
%c968 = arith.constant 968 : index
%c64 = arith.constant 64 : index
%element_type_f16 = hal.element_type<f16> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c64, %c968, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%0 = stream.tensor.sizeof tensor<64x968x1280xf16> : index
%1 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<64x968x1280xf16> in !stream.resource<external>{%0}
%2 = stream.async.transfer %1 : !stream.resource<external>{%0} -> !stream.resource<*>{%0}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c64, %c1280, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%3 = stream.tensor.sizeof tensor<64x1280x1280xf16> : index
%4 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<64x1280x1280xf16> in !stream.resource<external>{%3}
%5 = stream.async.transfer %4 : !stream.resource<external>{%3} -> !stream.resource<*>{%3}
%6 = stream.async.dispatch @batch_matmul_dispatch_0::@batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%2[%c0 to %0 for %0], %5[%c0 to %3 for %3]) : (!stream.resource<*>{%0}, !stream.resource<*>{%3}) -> !stream.resource<*>{%0}
%7 = stream.async.transfer %6 : !stream.resource<*>{%0} -> !stream.resource<external>{%0}
%8 = stream.tensor.export %7 : tensor<64x968x1280xf16> in !stream.resource<external>{%0} -> !hal.buffer_view
util.return %8 : !hal.buffer_view
}
}
// -----// IR Dump After FuseGlobals (iree-util-fuse-globals) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mfma_layout<F16_16x16x16_F32>, #iree_gpu.mfma_layout<F16_32x32x8_F32>], target_arch = "gfx942", ukernels = "none"}>
#device_target_rocm = #hal.device.target<"rocm", {legacy_sync}, [#executable_target_rocm_hsaco_fb]>
module attributes {hal.device.targets = [#device_target_rocm]} {
stream.executable private @batch_matmul_dispatch_0 {
stream.executable.export public @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
%cst = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>> -> tensor<64x968x1280xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [64, 1280, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>> -> tensor<64x1280x1280xf16>
%5 = tensor.empty() : tensor<64x968x1280xf16>
%6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%7 = linalg.batch_matmul ins(%3, %4 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%6 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : tensor<64x968x1280xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
return
}
}
}
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%c0 = arith.constant 0 : index
%c1280 = arith.constant 1280 : index
%c968 = arith.constant 968 : index
%c64 = arith.constant 64 : index
%element_type_f16 = hal.element_type<f16> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c64, %c968, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%0 = stream.tensor.sizeof tensor<64x968x1280xf16> : index
%1 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<64x968x1280xf16> in !stream.resource<external>{%0}
%2 = stream.async.transfer %1 : !stream.resource<external>{%0} -> !stream.resource<*>{%0}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c64, %c1280, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%3 = stream.tensor.sizeof tensor<64x1280x1280xf16> : index
%4 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<64x1280x1280xf16> in !stream.resource<external>{%3}
%5 = stream.async.transfer %4 : !stream.resource<external>{%3} -> !stream.resource<*>{%3}
%6 = stream.async.dispatch @batch_matmul_dispatch_0::@batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%2[%c0 to %0 for %0], %5[%c0 to %3 for %3]) : (!stream.resource<*>{%0}, !stream.resource<*>{%3}) -> !stream.resource<*>{%0}
%7 = stream.async.transfer %6 : !stream.resource<*>{%0} -> !stream.resource<external>{%0}
%8 = stream.tensor.export %7 : tensor<64x968x1280xf16> in !stream.resource<external>{%0} -> !hal.buffer_view
util.return %8 : !hal.buffer_view
}
}
// -----// IR Dump After IPO (iree-util-ipo) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mfma_layout<F16_16x16x16_F32>, #iree_gpu.mfma_layout<F16_32x32x8_F32>], target_arch = "gfx942", ukernels = "none"}>
#device_target_rocm = #hal.device.target<"rocm", {legacy_sync}, [#executable_target_rocm_hsaco_fb]>
module attributes {hal.device.targets = [#device_target_rocm]} {
stream.executable private @batch_matmul_dispatch_0 {
stream.executable.export public @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
%cst = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>> -> tensor<64x968x1280xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [64, 1280, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>> -> tensor<64x1280x1280xf16>
%5 = tensor.empty() : tensor<64x968x1280xf16>
%6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%7 = linalg.batch_matmul ins(%3, %4 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%6 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : tensor<64x968x1280xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
return
}
}
}
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%c0 = arith.constant 0 : index
%c1280 = arith.constant 1280 : index
%c968 = arith.constant 968 : index
%c64 = arith.constant 64 : index
%element_type_f16 = hal.element_type<f16> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c64, %c968, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%0 = stream.tensor.sizeof tensor<64x968x1280xf16> : index
%1 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<64x968x1280xf16> in !stream.resource<external>{%0}
%2 = stream.async.transfer %1 : !stream.resource<external>{%0} -> !stream.resource<*>{%0}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c64, %c1280, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%3 = stream.tensor.sizeof tensor<64x1280x1280xf16> : index
%4 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<64x1280x1280xf16> in !stream.resource<external>{%3}
%5 = stream.async.transfer %4 : !stream.resource<external>{%3} -> !stream.resource<*>{%3}
%6 = stream.async.dispatch @batch_matmul_dispatch_0::@batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%2[%c0 to %0 for %0], %5[%c0 to %3 for %3]) : (!stream.resource<*>{%0}, !stream.resource<*>{%3}) -> !stream.resource<*>{%0}
%7 = stream.async.transfer %6 : !stream.resource<*>{%0} -> !stream.resource<external>{%0}
%8 = stream.tensor.export %7 : tensor<64x968x1280xf16> in !stream.resource<external>{%0} -> !hal.buffer_view
util.return %8 : !hal.buffer_view
}
}
// -----// IR Dump After CombineInitializers (iree-util-combine-initializers) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mfma_layout<F16_16x16x16_F32>, #iree_gpu.mfma_layout<F16_32x32x8_F32>], target_arch = "gfx942", ukernels = "none"}>
#device_target_rocm = #hal.device.target<"rocm", {legacy_sync}, [#executable_target_rocm_hsaco_fb]>
module attributes {hal.device.targets = [#device_target_rocm]} {
stream.executable private @batch_matmul_dispatch_0 {
stream.executable.export public @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
%cst = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>> -> tensor<64x968x1280xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [64, 1280, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>> -> tensor<64x1280x1280xf16>
%5 = tensor.empty() : tensor<64x968x1280xf16>
%6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%7 = linalg.batch_matmul ins(%3, %4 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%6 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : tensor<64x968x1280xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
return
}
}
}
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%c0 = arith.constant 0 : index
%c1280 = arith.constant 1280 : index
%c968 = arith.constant 968 : index
%c64 = arith.constant 64 : index
%element_type_f16 = hal.element_type<f16> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c64, %c968, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%0 = stream.tensor.sizeof tensor<64x968x1280xf16> : index
%1 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<64x968x1280xf16> in !stream.resource<external>{%0}
%2 = stream.async.transfer %1 : !stream.resource<external>{%0} -> !stream.resource<*>{%0}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c64, %c1280, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%3 = stream.tensor.sizeof tensor<64x1280x1280xf16> : index
%4 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<64x1280x1280xf16> in !stream.resource<external>{%3}
%5 = stream.async.transfer %4 : !stream.resource<external>{%3} -> !stream.resource<*>{%3}
%6 = stream.async.dispatch @batch_matmul_dispatch_0::@batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%2[%c0 to %0 for %0], %5[%c0 to %3 for %3]) : (!stream.resource<*>{%0}, !stream.resource<*>{%3}) -> !stream.resource<*>{%0}
%7 = stream.async.transfer %6 : !stream.resource<*>{%0} -> !stream.resource<external>{%0}
%8 = stream.tensor.export %7 : tensor<64x968x1280xf16> in !stream.resource<external>{%0} -> !hal.buffer_view
util.return %8 : !hal.buffer_view
}
}
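// The ApplyPatterns, FoldGlobals, FuseGlobals, IPO, and CombineInitializers
// dumps above are identical to one another: with no globals, no initializers,
// and a single public function, these cleanup passes have nothing to rewrite
// in this module.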
// -----// IR Dump After EncodeDeviceTensorsPass (iree-stream-encode-device-tensors) //----- //
stream.executable private @batch_matmul_dispatch_0 {
stream.executable.export public @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
%cst = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>> -> tensor<64x968x1280xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [64, 1280, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>> -> tensor<64x1280x1280xf16>
%5 = tensor.empty() : tensor<64x968x1280xf16>
%6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%7 = linalg.batch_matmul ins(%3, %4 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%6 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : tensor<64x968x1280xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
return
}
}
}
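// EncodeDeviceTensorsPass re-prints only the stream.executable. No encoding
// ops are introduced, presumably because dense row-major f16 is already a
// directly storable layout for this target.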
// -----// IR Dump After EncodeHostTensorsPass (iree-stream-encode-host-tensors) //----- //
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%c209715200 = arith.constant 209715200 : index
%c158597120 = arith.constant 158597120 : index
%c0 = arith.constant 0 : index
%c1280 = arith.constant 1280 : index
%c968 = arith.constant 968 : index
%c64 = arith.constant 64 : index
%element_type_f16 = hal.element_type<f16> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c64, %c968, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120}
%1 = stream.async.transfer %0 : !stream.resource<external>{%c158597120} -> !stream.resource<*>{%c158597120}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c64, %c1280, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%2 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<64x1280x1280xf16> in !stream.resource<external>{%c209715200}
%3 = stream.async.transfer %2 : !stream.resource<external>{%c209715200} -> !stream.resource<*>{%c209715200}
%4 = stream.async.dispatch @batch_matmul_dispatch_0::@batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%1[%c0 to %c158597120 for %c158597120], %3[%c0 to %c209715200 for %c209715200]) : (!stream.resource<*>{%c158597120}, !stream.resource<*>{%c209715200}) -> !stream.resource<*>{%c158597120}
%5 = stream.async.transfer %4 : !stream.resource<*>{%c158597120} -> !stream.resource<external>{%c158597120}
%6 = stream.tensor.export %5 : tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120} -> !hal.buffer_view
util.return %6 : !hal.buffer_view
}
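// EncodeHostTensorsPass folds the symbolic stream.tensor.sizeof ops into byte
// constants. A quick check of the arithmetic (f16 = 2 bytes per element):
//   64 * 968  * 1280 * 2 = 158597120  (each 64x968x1280xf16 operand/result)
//   64 * 1280 * 1280 * 2 = 209715200  (the 64x1280x1280xf16 operand)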
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%c209715200 = arith.constant 209715200 : index
%c158597120 = arith.constant 158597120 : index
%c0 = arith.constant 0 : index
%c1280 = arith.constant 1280 : index
%c968 = arith.constant 968 : index
%c64 = arith.constant 64 : index
%element_type_f16 = hal.element_type<f16> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c64, %c968, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120}
%1 = stream.async.transfer %0 : !stream.resource<external>{%c158597120} -> !stream.resource<*>{%c158597120}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c64, %c1280, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%2 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<64x1280x1280xf16> in !stream.resource<external>{%c209715200}
%3 = stream.async.transfer %2 : !stream.resource<external>{%c209715200} -> !stream.resource<*>{%c209715200}
%4 = stream.async.dispatch @batch_matmul_dispatch_0::@batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%1[%c0 to %c158597120 for %c158597120], %3[%c0 to %c209715200 for %c209715200]) : (!stream.resource<*>{%c158597120}, !stream.resource<*>{%c209715200}) -> !stream.resource<*>{%c158597120}
%5 = stream.async.transfer %4 : !stream.resource<*>{%c158597120} -> !stream.resource<external>{%c158597120}
%6 = stream.tensor.export %5 : tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120} -> !hal.buffer_view
util.return %6 : !hal.buffer_view
}
// -----// IR Dump After CSE (cse) //----- //
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%c209715200 = arith.constant 209715200 : index
%c158597120 = arith.constant 158597120 : index
%c0 = arith.constant 0 : index
%c1280 = arith.constant 1280 : index
%c968 = arith.constant 968 : index
%c64 = arith.constant 64 : index
%element_type_f16 = hal.element_type<f16> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c64, %c968, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120}
%1 = stream.async.transfer %0 : !stream.resource<external>{%c158597120} -> !stream.resource<*>{%c158597120}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c64, %c1280, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%2 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<64x1280x1280xf16> in !stream.resource<external>{%c209715200}
%3 = stream.async.transfer %2 : !stream.resource<external>{%c209715200} -> !stream.resource<*>{%c209715200}
%4 = stream.async.dispatch @batch_matmul_dispatch_0::@batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%1[%c0 to %c158597120 for %c158597120], %3[%c0 to %c209715200 for %c209715200]) : (!stream.resource<*>{%c158597120}, !stream.resource<*>{%c209715200}) -> !stream.resource<*>{%c158597120}
%5 = stream.async.transfer %4 : !stream.resource<*>{%c158597120} -> !stream.resource<external>{%c158597120}
%6 = stream.tensor.export %5 : tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120} -> !hal.buffer_view
util.return %6 : !hal.buffer_view
}
// -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- //
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%c209715200 = arith.constant 209715200 : index
%c158597120 = arith.constant 158597120 : index
%c0 = arith.constant 0 : index
%c1280 = arith.constant 1280 : index
%c968 = arith.constant 968 : index
%c64 = arith.constant 64 : index
%element_type_f16 = hal.element_type<f16> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c64, %c968, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120}
%1 = stream.async.transfer %0 : !stream.resource<external>{%c158597120} -> !stream.resource<*>{%c158597120}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c64, %c1280, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%2 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<64x1280x1280xf16> in !stream.resource<external>{%c209715200}
%3 = stream.async.transfer %2 : !stream.resource<external>{%c209715200} -> !stream.resource<*>{%c209715200}
%4 = stream.async.dispatch @batch_matmul_dispatch_0::@batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%1[%c0 to %c158597120 for %c158597120], %3[%c0 to %c209715200 for %c209715200]) : (!stream.resource<*>{%c158597120}, !stream.resource<*>{%c209715200}) -> !stream.resource<*>{%c158597120}
%5 = stream.async.transfer %4 : !stream.resource<*>{%c158597120} -> !stream.resource<external>{%c158597120}
%6 = stream.tensor.export %5 : tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120} -> !hal.buffer_view
util.return %6 : !hal.buffer_view
}
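// The canonicalize, cse, and iree-util-simplify-global-accesses dumps above
// match the EncodeHostTensorsPass output exactly: the constants were already
// deduplicated, and there are no global accesses to simplify.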
// -----// IR Dump After ApplyPatterns (iree-util-apply-patterns) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mfma_layout<F16_16x16x16_F32>, #iree_gpu.mfma_layout<F16_32x32x8_F32>], target_arch = "gfx942", ukernels = "none"}>
#device_target_rocm = #hal.device.target<"rocm", {legacy_sync}, [#executable_target_rocm_hsaco_fb]>
module attributes {hal.device.targets = [#device_target_rocm]} {
stream.executable private @batch_matmul_dispatch_0 {
stream.executable.export public @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
%cst = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>> -> tensor<64x968x1280xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [64, 1280, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>> -> tensor<64x1280x1280xf16>
%5 = tensor.empty() : tensor<64x968x1280xf16>
%6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%7 = linalg.batch_matmul ins(%3, %4 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%6 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : tensor<64x968x1280xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
return
}
}
}
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%c209715200 = arith.constant 209715200 : index
%c158597120 = arith.constant 158597120 : index
%c0 = arith.constant 0 : index
%c1280 = arith.constant 1280 : index
%c968 = arith.constant 968 : index
%c64 = arith.constant 64 : index
%element_type_f16 = hal.element_type<f16> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c64, %c968, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120}
%1 = stream.async.transfer %0 : !stream.resource<external>{%c158597120} -> !stream.resource<*>{%c158597120}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c64, %c1280, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%2 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<64x1280x1280xf16> in !stream.resource<external>{%c209715200}
%3 = stream.async.transfer %2 : !stream.resource<external>{%c209715200} -> !stream.resource<*>{%c209715200}
%4 = stream.async.dispatch @batch_matmul_dispatch_0::@batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%1[%c0 to %c158597120 for %c158597120], %3[%c0 to %c209715200 for %c209715200]) : (!stream.resource<*>{%c158597120}, !stream.resource<*>{%c209715200}) -> !stream.resource<*>{%c158597120}
%5 = stream.async.transfer %4 : !stream.resource<*>{%c158597120} -> !stream.resource<external>{%c158597120}
%6 = stream.tensor.export %5 : tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120} -> !hal.buffer_view
util.return %6 : !hal.buffer_view
}
}
// -----// IR Dump After FoldGlobals (iree-util-fold-globals) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mfma_layout<F16_16x16x16_F32>, #iree_gpu.mfma_layout<F16_32x32x8_F32>], target_arch = "gfx942", ukernels = "none"}>
#device_target_rocm = #hal.device.target<"rocm", {legacy_sync}, [#executable_target_rocm_hsaco_fb]>
module attributes {hal.device.targets = [#device_target_rocm]} {
stream.executable private @batch_matmul_dispatch_0 {
stream.executable.export public @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
%cst = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>> -> tensor<64x968x1280xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [64, 1280, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>> -> tensor<64x1280x1280xf16>
%5 = tensor.empty() : tensor<64x968x1280xf16>
%6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%7 = linalg.batch_matmul ins(%3, %4 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%6 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : tensor<64x968x1280xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
return
}
}
}
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%c209715200 = arith.constant 209715200 : index
%c158597120 = arith.constant 158597120 : index
%c0 = arith.constant 0 : index
%c1280 = arith.constant 1280 : index
%c968 = arith.constant 968 : index
%c64 = arith.constant 64 : index
%element_type_f16 = hal.element_type<f16> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c64, %c968, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120}
%1 = stream.async.transfer %0 : !stream.resource<external>{%c158597120} -> !stream.resource<*>{%c158597120}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c64, %c1280, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%2 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<64x1280x1280xf16> in !stream.resource<external>{%c209715200}
%3 = stream.async.transfer %2 : !stream.resource<external>{%c209715200} -> !stream.resource<*>{%c209715200}
%4 = stream.async.dispatch @batch_matmul_dispatch_0::@batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%1[%c0 to %c158597120 for %c158597120], %3[%c0 to %c209715200 for %c209715200]) : (!stream.resource<*>{%c158597120}, !stream.resource<*>{%c209715200}) -> !stream.resource<*>{%c158597120}
%5 = stream.async.transfer %4 : !stream.resource<*>{%c158597120} -> !stream.resource<external>{%c158597120}
%6 = stream.tensor.export %5 : tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120} -> !hal.buffer_view
util.return %6 : !hal.buffer_view
}
}
// -----// IR Dump After FuseGlobals (iree-util-fuse-globals) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mfma_layout<F16_16x16x16_F32>, #iree_gpu.mfma_layout<F16_32x32x8_F32>], target_arch = "gfx942", ukernels = "none"}>
#device_target_rocm = #hal.device.target<"rocm", {legacy_sync}, [#executable_target_rocm_hsaco_fb]>
module attributes {hal.device.targets = [#device_target_rocm]} {
stream.executable private @batch_matmul_dispatch_0 {
stream.executable.export public @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
%cst = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>> -> tensor<64x968x1280xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [64, 1280, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>> -> tensor<64x1280x1280xf16>
%5 = tensor.empty() : tensor<64x968x1280xf16>
%6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%7 = linalg.batch_matmul ins(%3, %4 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%6 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : tensor<64x968x1280xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
return
}
}
}
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%c209715200 = arith.constant 209715200 : index
%c158597120 = arith.constant 158597120 : index
%c0 = arith.constant 0 : index
%c1280 = arith.constant 1280 : index
%c968 = arith.constant 968 : index
%c64 = arith.constant 64 : index
%element_type_f16 = hal.element_type<f16> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c64, %c968, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120}
%1 = stream.async.transfer %0 : !stream.resource<external>{%c158597120} -> !stream.resource<*>{%c158597120}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c64, %c1280, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%2 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<64x1280x1280xf16> in !stream.resource<external>{%c209715200}
%3 = stream.async.transfer %2 : !stream.resource<external>{%c209715200} -> !stream.resource<*>{%c209715200}
%4 = stream.async.dispatch @batch_matmul_dispatch_0::@batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%1[%c0 to %c158597120 for %c158597120], %3[%c0 to %c209715200 for %c209715200]) : (!stream.resource<*>{%c158597120}, !stream.resource<*>{%c209715200}) -> !stream.resource<*>{%c158597120}
%5 = stream.async.transfer %4 : !stream.resource<*>{%c158597120} -> !stream.resource<external>{%c158597120}
%6 = stream.tensor.export %5 : tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120} -> !hal.buffer_view
util.return %6 : !hal.buffer_view
}
}
// -----// IR Dump After IPO (iree-util-ipo) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mfma_layout<F16_16x16x16_F32>, #iree_gpu.mfma_layout<F16_32x32x8_F32>], target_arch = "gfx942", ukernels = "none"}>
#device_target_rocm = #hal.device.target<"rocm", {legacy_sync}, [#executable_target_rocm_hsaco_fb]>
module attributes {hal.device.targets = [#device_target_rocm]} {
stream.executable private @batch_matmul_dispatch_0 {
stream.executable.export public @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
%cst = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>> -> tensor<64x968x1280xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [64, 1280, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>> -> tensor<64x1280x1280xf16>
%5 = tensor.empty() : tensor<64x968x1280xf16>
%6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%7 = linalg.batch_matmul ins(%3, %4 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%6 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : tensor<64x968x1280xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
return
}
}
}
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%c209715200 = arith.constant 209715200 : index
%c158597120 = arith.constant 158597120 : index
%c0 = arith.constant 0 : index
%c1280 = arith.constant 1280 : index
%c968 = arith.constant 968 : index
%c64 = arith.constant 64 : index
%element_type_f16 = hal.element_type<f16> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c64, %c968, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120}
%1 = stream.async.transfer %0 : !stream.resource<external>{%c158597120} -> !stream.resource<*>{%c158597120}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c64, %c1280, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%2 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<64x1280x1280xf16> in !stream.resource<external>{%c209715200}
%3 = stream.async.transfer %2 : !stream.resource<external>{%c209715200} -> !stream.resource<*>{%c209715200}
%4 = stream.async.dispatch @batch_matmul_dispatch_0::@batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%1[%c0 to %c158597120 for %c158597120], %3[%c0 to %c209715200 for %c209715200]) : (!stream.resource<*>{%c158597120}, !stream.resource<*>{%c209715200}) -> !stream.resource<*>{%c158597120}
%5 = stream.async.transfer %4 : !stream.resource<*>{%c158597120} -> !stream.resource<external>{%c158597120}
%6 = stream.tensor.export %5 : tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120} -> !hal.buffer_view
util.return %6 : !hal.buffer_view
}
}
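// This second ApplyPatterns / FoldGlobals / FuseGlobals / IPO round is
// likewise a fixed point: the module is unchanged from its post-encoding form.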
// -----// IR Dump After VerifyLoweringToAsyncResourcesPass (iree-stream-verify-lowering-to-async-resources) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mfma_layout<F16_16x16x16_F32>, #iree_gpu.mfma_layout<F16_32x32x8_F32>], target_arch = "gfx942", ukernels = "none"}>
#device_target_rocm = #hal.device.target<"rocm", {legacy_sync}, [#executable_target_rocm_hsaco_fb]>
module attributes {hal.device.targets = [#device_target_rocm]} {
stream.executable private @batch_matmul_dispatch_0 {
stream.executable.export public @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
%cst = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>> -> tensor<64x968x1280xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [64, 1280, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>> -> tensor<64x1280x1280xf16>
%5 = tensor.empty() : tensor<64x968x1280xf16>
%6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%7 = linalg.batch_matmul ins(%3, %4 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%6 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : tensor<64x968x1280xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
return
}
}
}
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%c209715200 = arith.constant 209715200 : index
%c158597120 = arith.constant 158597120 : index
%c0 = arith.constant 0 : index
%c1280 = arith.constant 1280 : index
%c968 = arith.constant 968 : index
%c64 = arith.constant 64 : index
%element_type_f16 = hal.element_type<f16> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c64, %c968, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120}
%1 = stream.async.transfer %0 : !stream.resource<external>{%c158597120} -> !stream.resource<*>{%c158597120}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c64, %c1280, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%2 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<64x1280x1280xf16> in !stream.resource<external>{%c209715200}
%3 = stream.async.transfer %2 : !stream.resource<external>{%c209715200} -> !stream.resource<*>{%c209715200}
%4 = stream.async.dispatch @batch_matmul_dispatch_0::@batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%1[%c0 to %c158597120 for %c158597120], %3[%c0 to %c209715200 for %c209715200]) : (!stream.resource<*>{%c158597120}, !stream.resource<*>{%c209715200}) -> !stream.resource<*>{%c158597120}
%5 = stream.async.transfer %4 : !stream.resource<*>{%c158597120} -> !stream.resource<external>{%c158597120}
%6 = stream.tensor.export %5 : tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120} -> !hal.buffer_view
util.return %6 : !hal.buffer_view
}
}
// -----// IR Dump After MaterializeCopyOnWritePass (iree-stream-materialize-copy-on-write) //----- //
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%c209715200 = arith.constant 209715200 : index
%c158597120 = arith.constant 158597120 : index
%c0 = arith.constant 0 : index
%c1280 = arith.constant 1280 : index
%c968 = arith.constant 968 : index
%c64 = arith.constant 64 : index
%element_type_f16 = hal.element_type<f16> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c64, %c968, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120}
%1 = stream.async.transfer %0 : !stream.resource<external>{%c158597120} -> !stream.resource<*>{%c158597120}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c64, %c1280, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%2 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<64x1280x1280xf16> in !stream.resource<external>{%c209715200}
%3 = stream.async.transfer %2 : !stream.resource<external>{%c209715200} -> !stream.resource<*>{%c209715200}
%4 = stream.async.dispatch @batch_matmul_dispatch_0::@batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%1[%c0 to %c158597120 for %c158597120], %3[%c0 to %c209715200 for %c209715200]) : (!stream.resource<*>{%c158597120}, !stream.resource<*>{%c209715200}) -> !stream.resource<*>{%c158597120}
%5 = stream.async.transfer %4 : !stream.resource<*>{%c158597120} -> !stream.resource<external>{%c158597120}
%6 = stream.tensor.export %5 : tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120} -> !hal.buffer_view
util.return %6 : !hal.buffer_view
}
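// MaterializeCopyOnWritePass inserts no clones here: the dispatch writes a
// freshly allocated result and no resource is mutated in place, so there is
// nothing that needs copy-on-write treatment.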
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%c209715200 = arith.constant 209715200 : index
%c158597120 = arith.constant 158597120 : index
%c0 = arith.constant 0 : index
%c1280 = arith.constant 1280 : index
%c968 = arith.constant 968 : index
%c64 = arith.constant 64 : index
%element_type_f16 = hal.element_type<f16> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c64, %c968, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120}
%1 = stream.async.transfer %0 : !stream.resource<external>{%c158597120} -> !stream.resource<*>{%c158597120}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c64, %c1280, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%2 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<64x1280x1280xf16> in !stream.resource<external>{%c209715200}
%3 = stream.async.transfer %2 : !stream.resource<external>{%c209715200} -> !stream.resource<*>{%c209715200}
%4 = stream.async.dispatch @batch_matmul_dispatch_0::@batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%1[%c0 to %c158597120 for %c158597120], %3[%c0 to %c209715200 for %c209715200]) : (!stream.resource<*>{%c158597120}, !stream.resource<*>{%c209715200}) -> !stream.resource<*>{%c158597120}
%5 = stream.async.transfer %4 : !stream.resource<*>{%c158597120} -> !stream.resource<external>{%c158597120}
%6 = stream.tensor.export %5 : tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120} -> !hal.buffer_view
util.return %6 : !hal.buffer_view
}
// -----// IR Dump After ElideAsyncCopiesPass (iree-stream-elide-async-copies) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mfma_layout<F16_16x16x16_F32>, #iree_gpu.mfma_layout<F16_32x32x8_F32>], target_arch = "gfx942", ukernels = "none"}>
#device_target_rocm = #hal.device.target<"rocm", {legacy_sync}, [#executable_target_rocm_hsaco_fb]>
module attributes {hal.device.targets = [#device_target_rocm]} {
stream.executable private @batch_matmul_dispatch_0 {
stream.executable.export public @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
%cst = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>> -> tensor<64x968x1280xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [64, 1280, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>> -> tensor<64x1280x1280xf16>
%5 = tensor.empty() : tensor<64x968x1280xf16>
%6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%7 = linalg.batch_matmul ins(%3, %4 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%6 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : tensor<64x968x1280xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
return
}
}
}
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%c209715200 = arith.constant 209715200 : index
%c158597120 = arith.constant 158597120 : index
%c0 = arith.constant 0 : index
%c1280 = arith.constant 1280 : index
%c968 = arith.constant 968 : index
%c64 = arith.constant 64 : index
%element_type_f16 = hal.element_type<f16> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c64, %c968, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120}
%1 = stream.async.transfer %0 : !stream.resource<external>{%c158597120} -> !stream.resource<*>{%c158597120}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c64, %c1280, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%2 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<64x1280x1280xf16> in !stream.resource<external>{%c209715200}
%3 = stream.async.transfer %2 : !stream.resource<external>{%c209715200} -> !stream.resource<*>{%c209715200}
%4 = stream.async.dispatch @batch_matmul_dispatch_0::@batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%1[%c0 to %c158597120 for %c158597120], %3[%c0 to %c209715200 for %c209715200]) : (!stream.resource<*>{%c158597120}, !stream.resource<*>{%c209715200}) -> !stream.resource<*>{%c158597120}
%5 = stream.async.transfer %4 : !stream.resource<*>{%c158597120} -> !stream.resource<external>{%c158597120}
%6 = stream.tensor.export %5 : tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120} -> !hal.buffer_view
util.return %6 : !hal.buffer_view
}
}
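// Note: the %c158597120 and %c209715200 index constants threading through every dump
// below are simply the operand byte sizes (f16 = 2 bytes per element), and the export
// name batch_matmul_64x968x1280x1280_f16 reads as BxMxNxK with the element type.
// A quick sanity check in Python (mine, not part of the log):

def f16_bytes(*shape):
    n = 1
    for d in shape:
        n *= d
    return n * 2  # sizeof(f16)

assert f16_bytes(64, 968, 1280) == 158597120    # %c158597120: LHS and result buffers
assert f16_bytes(64, 1280, 1280) == 209715200   # %c209715200: RHS buffer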
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%c209715200 = arith.constant 209715200 : index
%c158597120 = arith.constant 158597120 : index
%c0 = arith.constant 0 : index
%c1280 = arith.constant 1280 : index
%c968 = arith.constant 968 : index
%c64 = arith.constant 64 : index
%element_type_f16 = hal.element_type<f16> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c64, %c968, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120}
%1 = stream.async.transfer %0 : !stream.resource<external>{%c158597120} -> !stream.resource<*>{%c158597120}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c64, %c1280, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%2 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<64x1280x1280xf16> in !stream.resource<external>{%c209715200}
%3 = stream.async.transfer %2 : !stream.resource<external>{%c209715200} -> !stream.resource<*>{%c209715200}
%4 = stream.async.dispatch @batch_matmul_dispatch_0::@batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%1[%c0 to %c158597120 for %c158597120], %3[%c0 to %c209715200 for %c209715200]) : (!stream.resource<*>{%c158597120}, !stream.resource<*>{%c209715200}) -> !stream.resource<*>{%c158597120}
%5 = stream.async.transfer %4 : !stream.resource<*>{%c158597120} -> !stream.resource<external>{%c158597120}
%6 = stream.tensor.export %5 : tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120} -> !hal.buffer_view
util.return %6 : !hal.buffer_view
}
// -----// IR Dump After EmplaceAllocationsPass (iree-stream-emplace-allocations) //----- //
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%c209715200 = arith.constant 209715200 : index
%c158597120 = arith.constant 158597120 : index
%c0 = arith.constant 0 : index
%c1280 = arith.constant 1280 : index
%c968 = arith.constant 968 : index
%c64 = arith.constant 64 : index
%element_type_f16 = hal.element_type<f16> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c64, %c968, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120}
%1 = stream.async.transfer %0 : !stream.resource<external>{%c158597120} -> !stream.resource<*>{%c158597120}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c64, %c1280, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%2 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<64x1280x1280xf16> in !stream.resource<external>{%c209715200}
%3 = stream.async.transfer %2 : !stream.resource<external>{%c209715200} -> !stream.resource<*>{%c209715200}
%4 = stream.async.dispatch @batch_matmul_dispatch_0::@batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%1[%c0 to %c158597120 for %c158597120], %3[%c0 to %c209715200 for %c209715200]) : (!stream.resource<*>{%c158597120}, !stream.resource<*>{%c209715200}) -> !stream.resource<*>{%c158597120}
%5 = stream.async.transfer %4 : !stream.resource<*>{%c158597120} -> !stream.resource<external>{%c158597120}
%6 = stream.tensor.export %5 : tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120} -> !hal.buffer_view
util.return %6 : !hal.buffer_view
}
// -----// IR Dump After RefineUsagePass (iree-stream-refine-usage) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mfma_layout<F16_16x16x16_F32>, #iree_gpu.mfma_layout<F16_32x32x8_F32>], target_arch = "gfx942", ukernels = "none"}>
#device_target_rocm = #hal.device.target<"rocm", {legacy_sync}, [#executable_target_rocm_hsaco_fb]>
module attributes {hal.device.targets = [#device_target_rocm]} {
stream.executable private @batch_matmul_dispatch_0 {
stream.executable.export public @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
%cst = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>> -> tensor<64x968x1280xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [64, 1280, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>> -> tensor<64x1280x1280xf16>
%5 = tensor.empty() : tensor<64x968x1280xf16>
%6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%7 = linalg.batch_matmul ins(%3, %4 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%6 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : tensor<64x968x1280xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
return
}
}
}
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%c209715200 = arith.constant 209715200 : index
%c158597120 = arith.constant 158597120 : index
%c0 = arith.constant 0 : index
%c1280 = arith.constant 1280 : index
%c968 = arith.constant 968 : index
%c64 = arith.constant 64 : index
%element_type_f16 = hal.element_type<f16> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c64, %c968, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c64, %c1280, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<64x1280x1280xf16> in !stream.resource<external>{%c209715200}
%2 = stream.async.dispatch @batch_matmul_dispatch_0::@batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%0[%c0 to %c158597120 for %c158597120], %1[%c0 to %c209715200 for %c209715200]) : (!stream.resource<external>{%c158597120}, !stream.resource<external>{%c209715200}) -> !stream.resource<external>{%c158597120}
%3 = stream.tensor.export %2 : tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120} -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
}
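// Note: compared to the dump above, iree-stream-refine-usage has deleted both
// stream.async.transfer ops by refining the placeholder !stream.resource<*> types to
// !stream.resource<external>, so the dispatch now reads the imported buffers directly.
// For reference, the dispatch body (linalg.fill + linalg.batch_matmul) computes
// out[b,i,j] = sum_k lhs[b,i,k] * rhs[b,k,j] over a zero-initialized accumulator.
// A NumPy sketch of the same semantics (illustrative only; f16 rounding may differ
// from the GPU's MFMA path):

import numpy as np

def batch_matmul_ref(lhs, rhs):
    # lhs: (64, 968, 1280) f16, rhs: (64, 1280, 1280) f16
    out = np.zeros((lhs.shape[0], lhs.shape[1], rhs.shape[2]), dtype=np.float16)  # linalg.fill
    for b in range(lhs.shape[0]):  # linalg.batch_matmul, one matmul per batch
        out[b] += lhs[b] @ rhs[b]
    return out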
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%c209715200 = arith.constant 209715200 : index
%c158597120 = arith.constant 158597120 : index
%c0 = arith.constant 0 : index
%c1280 = arith.constant 1280 : index
%c968 = arith.constant 968 : index
%c64 = arith.constant 64 : index
%element_type_f16 = hal.element_type<f16> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c64, %c968, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c64, %c1280, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<64x1280x1280xf16> in !stream.resource<external>{%c209715200}
%2 = stream.async.dispatch @batch_matmul_dispatch_0::@batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%0[%c0 to %c158597120 for %c158597120], %1[%c0 to %c209715200 for %c209715200]) : (!stream.resource<external>{%c158597120}, !stream.resource<external>{%c209715200}) -> !stream.resource<external>{%c158597120}
%3 = stream.tensor.export %2 : tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120} -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
// -----// IR Dump After CSE (cse) //----- //
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%c209715200 = arith.constant 209715200 : index
%c158597120 = arith.constant 158597120 : index
%c0 = arith.constant 0 : index
%c1280 = arith.constant 1280 : index
%c968 = arith.constant 968 : index
%c64 = arith.constant 64 : index
%element_type_f16 = hal.element_type<f16> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c64, %c968, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c64, %c1280, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<64x1280x1280xf16> in !stream.resource<external>{%c209715200}
%2 = stream.async.dispatch @batch_matmul_dispatch_0::@batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%0[%c0 to %c158597120 for %c158597120], %1[%c0 to %c209715200 for %c209715200]) : (!stream.resource<external>{%c158597120}, !stream.resource<external>{%c209715200}) -> !stream.resource<external>{%c158597120}
%3 = stream.tensor.export %2 : tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120} -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
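// Note: canonicalize and cse leave this function untouched; every constant is already
// unique and there are no duplicate subexpressions. As a loose illustration of what CSE
// would catch, a hash-based value-numbering sketch (my own, not MLIR's implementation):

def cse(ops):
    # ops: list of (result, opname, operands) tuples in SSA order
    seen, replaced = {}, {}
    kept = []
    for result, opname, operands in ops:
        key = (opname, tuple(replaced.get(o, o) for o in operands))
        if key in seen:
            replaced[result] = seen[key]  # identical op seen before: reuse its result
        else:
            seen[key] = result
            kept.append((result, opname, operands))
    return kept, replaced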
// -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- //
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%c209715200 = arith.constant 209715200 : index
%c158597120 = arith.constant 158597120 : index
%c0 = arith.constant 0 : index
%c1280 = arith.constant 1280 : index
%c968 = arith.constant 968 : index
%c64 = arith.constant 64 : index
%element_type_f16 = hal.element_type<f16> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c64, %c968, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c64, %c1280, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<64x1280x1280xf16> in !stream.resource<external>{%c209715200}
%2 = stream.async.dispatch @batch_matmul_dispatch_0::@batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%0[%c0 to %c158597120 for %c158597120], %1[%c0 to %c209715200 for %c209715200]) : (!stream.resource<external>{%c158597120}, !stream.resource<external>{%c209715200}) -> !stream.resource<external>{%c158597120}
%3 = stream.tensor.export %2 : tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120} -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
// -----// IR Dump After ApplyPatterns (iree-util-apply-patterns) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mfma_layout<F16_16x16x16_F32>, #iree_gpu.mfma_layout<F16_32x32x8_F32>], target_arch = "gfx942", ukernels = "none"}>
#device_target_rocm = #hal.device.target<"rocm", {legacy_sync}, [#executable_target_rocm_hsaco_fb]>
module attributes {hal.device.targets = [#device_target_rocm]} {
stream.executable private @batch_matmul_dispatch_0 {
stream.executable.export public @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
%cst = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>> -> tensor<64x968x1280xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [64, 1280, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>> -> tensor<64x1280x1280xf16>
%5 = tensor.empty() : tensor<64x968x1280xf16>
%6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%7 = linalg.batch_matmul ins(%3, %4 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%6 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : tensor<64x968x1280xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
return
}
}
}
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%c209715200 = arith.constant 209715200 : index
%c158597120 = arith.constant 158597120 : index
%c0 = arith.constant 0 : index
%c1280 = arith.constant 1280 : index
%c968 = arith.constant 968 : index
%c64 = arith.constant 64 : index
%element_type_f16 = hal.element_type<f16> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c64, %c968, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c64, %c1280, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<64x1280x1280xf16> in !stream.resource<external>{%c209715200}
%2 = stream.async.dispatch @batch_matmul_dispatch_0::@batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%0[%c0 to %c158597120 for %c158597120], %1[%c0 to %c209715200 for %c209715200]) : (!stream.resource<external>{%c158597120}, !stream.resource<external>{%c209715200}) -> !stream.resource<external>{%c158597120}
%3 = stream.tensor.export %2 : tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120} -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
}
// -----// IR Dump After FoldGlobals (iree-util-fold-globals) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mfma_layout<F16_16x16x16_F32>, #iree_gpu.mfma_layout<F16_32x32x8_F32>], target_arch = "gfx942", ukernels = "none"}>
#device_target_rocm = #hal.device.target<"rocm", {legacy_sync}, [#executable_target_rocm_hsaco_fb]>
module attributes {hal.device.targets = [#device_target_rocm]} {
stream.executable private @batch_matmul_dispatch_0 {
stream.executable.export public @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
%cst = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>> -> tensor<64x968x1280xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [64, 1280, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>> -> tensor<64x1280x1280xf16>
%5 = tensor.empty() : tensor<64x968x1280xf16>
%6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%7 = linalg.batch_matmul ins(%3, %4 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%6 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : tensor<64x968x1280xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
return
}
}
}
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%c209715200 = arith.constant 209715200 : index
%c158597120 = arith.constant 158597120 : index
%c0 = arith.constant 0 : index
%c1280 = arith.constant 1280 : index
%c968 = arith.constant 968 : index
%c64 = arith.constant 64 : index
%element_type_f16 = hal.element_type<f16> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c64, %c968, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c64, %c1280, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<64x1280x1280xf16> in !stream.resource<external>{%c209715200}
%2 = stream.async.dispatch @batch_matmul_dispatch_0::@batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%0[%c0 to %c158597120 for %c158597120], %1[%c0 to %c209715200 for %c209715200]) : (!stream.resource<external>{%c158597120}, !stream.resource<external>{%c209715200}) -> !stream.resource<external>{%c158597120}
%3 = stream.tensor.export %2 : tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120} -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
}
// -----// IR Dump After FuseGlobals (iree-util-fuse-globals) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mfma_layout<F16_16x16x16_F32>, #iree_gpu.mfma_layout<F16_32x32x8_F32>], target_arch = "gfx942", ukernels = "none"}>
#device_target_rocm = #hal.device.target<"rocm", {legacy_sync}, [#executable_target_rocm_hsaco_fb]>
module attributes {hal.device.targets = [#device_target_rocm]} {
stream.executable private @batch_matmul_dispatch_0 {
stream.executable.export public @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
%cst = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>> -> tensor<64x968x1280xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [64, 1280, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>> -> tensor<64x1280x1280xf16>
%5 = tensor.empty() : tensor<64x968x1280xf16>
%6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%7 = linalg.batch_matmul ins(%3, %4 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%6 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : tensor<64x968x1280xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
return
}
}
}
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%c209715200 = arith.constant 209715200 : index
%c158597120 = arith.constant 158597120 : index
%c0 = arith.constant 0 : index
%c1280 = arith.constant 1280 : index
%c968 = arith.constant 968 : index
%c64 = arith.constant 64 : index
%element_type_f16 = hal.element_type<f16> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c64, %c968, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c64, %c1280, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<64x1280x1280xf16> in !stream.resource<external>{%c209715200}
%2 = stream.async.dispatch @batch_matmul_dispatch_0::@batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%0[%c0 to %c158597120 for %c158597120], %1[%c0 to %c209715200 for %c209715200]) : (!stream.resource<external>{%c158597120}, !stream.resource<external>{%c209715200}) -> !stream.resource<external>{%c158597120}
%3 = stream.tensor.export %2 : tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120} -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
}
// -----// IR Dump After IPO (iree-util-ipo) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mfma_layout<F16_16x16x16_F32>, #iree_gpu.mfma_layout<F16_32x32x8_F32>], target_arch = "gfx942", ukernels = "none"}>
#device_target_rocm = #hal.device.target<"rocm", {legacy_sync}, [#executable_target_rocm_hsaco_fb]>
module attributes {hal.device.targets = [#device_target_rocm]} {
stream.executable private @batch_matmul_dispatch_0 {
stream.executable.export public @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
%cst = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>> -> tensor<64x968x1280xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [64, 1280, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>> -> tensor<64x1280x1280xf16>
%5 = tensor.empty() : tensor<64x968x1280xf16>
%6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%7 = linalg.batch_matmul ins(%3, %4 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%6 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : tensor<64x968x1280xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
return
}
}
}
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%c209715200 = arith.constant 209715200 : index
%c158597120 = arith.constant 158597120 : index
%c0 = arith.constant 0 : index
%c1280 = arith.constant 1280 : index
%c968 = arith.constant 968 : index
%c64 = arith.constant 64 : index
%element_type_f16 = hal.element_type<f16> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c64, %c968, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c64, %c1280, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<64x1280x1280xf16> in !stream.resource<external>{%c209715200}
%2 = stream.async.dispatch @batch_matmul_dispatch_0::@batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%0[%c0 to %c158597120 for %c158597120], %1[%c0 to %c209715200 for %c209715200]) : (!stream.resource<external>{%c158597120}, !stream.resource<external>{%c209715200}) -> !stream.resource<external>{%c158597120}
%3 = stream.tensor.export %2 : tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120} -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
}
// -----// IR Dump After VerifyAsyncAccessRangesPass (iree-stream-verify-async-access-ranges) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mfma_layout<F16_16x16x16_F32>, #iree_gpu.mfma_layout<F16_32x32x8_F32>], target_arch = "gfx942", ukernels = "none"}>
#device_target_rocm = #hal.device.target<"rocm", {legacy_sync}, [#executable_target_rocm_hsaco_fb]>
module attributes {hal.device.targets = [#device_target_rocm]} {
stream.executable private @batch_matmul_dispatch_0 {
stream.executable.export public @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
%cst = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>> -> tensor<64x968x1280xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [64, 1280, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>> -> tensor<64x1280x1280xf16>
%5 = tensor.empty() : tensor<64x968x1280xf16>
%6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%7 = linalg.batch_matmul ins(%3, %4 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%6 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : tensor<64x968x1280xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
return
}
}
}
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%c209715200 = arith.constant 209715200 : index
%c158597120 = arith.constant 158597120 : index
%c0 = arith.constant 0 : index
%c1280 = arith.constant 1280 : index
%c968 = arith.constant 968 : index
%c64 = arith.constant 64 : index
%element_type_f16 = hal.element_type<f16> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c64, %c968, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c64, %c1280, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<64x1280x1280xf16> in !stream.resource<external>{%c209715200}
%2 = stream.async.dispatch @batch_matmul_dispatch_0::@batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%0[%c0 to %c158597120 for %c158597120], %1[%c0 to %c209715200 for %c209715200]) : (!stream.resource<external>{%c158597120}, !stream.resource<external>{%c209715200}) -> !stream.resource<external>{%c158597120}
%3 = stream.tensor.export %2 : tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120} -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
}
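// Note: iree-stream-verify-async-access-ranges checks that each
// %resource[%offset to %end for %length] access on stream.async ops stays within the
// declared resource size; the dispatch above touches [%c0 to %c158597120 for %c158597120]
// of a {%c158597120}-byte resource, which trivially passes. A sketch of the invariant as
// I read the op syntax (not the pass's actual code):

def check_range(offset, end, length, resource_size):
    # mirrors the [%offset to %end for %length] triple on stream.async.dispatch
    assert end == offset + length
    assert 0 <= offset and end <= resource_size

check_range(0, 158597120, 158597120, 158597120)  # LHS operand
check_range(0, 209715200, 209715200, 209715200)  # RHS operand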
// -----// IR Dump After ScheduleExecutionPass (iree-stream-schedule-execution) //----- //
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%c209715200 = arith.constant 209715200 : index
%c158597120 = arith.constant 158597120 : index
%c0 = arith.constant 0 : index
%c1280 = arith.constant 1280 : index
%c968 = arith.constant 968 : index
%c64 = arith.constant 64 : index
%element_type_f16 = hal.element_type<f16> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c64, %c968, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c64, %c1280, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<64x1280x1280xf16> in !stream.resource<external>{%c209715200}
%results, %result_timepoint = stream.async.execute with(%0 as %arg2: !stream.resource<external>{%c158597120}, %1 as %arg3: !stream.resource<external>{%c209715200}) -> !stream.resource<external>{%c158597120} {
%4 = stream.async.dispatch @batch_matmul_dispatch_0::@batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%arg2[%c0 to %c158597120 for %c158597120], %arg3[%c0 to %c209715200 for %c209715200]) : (!stream.resource<external>{%c158597120}, !stream.resource<external>{%c209715200}) -> !stream.resource<external>{%c158597120}
stream.yield %4 : !stream.resource<external>{%c158597120}
} => !stream.timepoint
%2 = stream.timepoint.await %result_timepoint => %results : !stream.resource<external>{%c158597120}
%3 = stream.tensor.export %2 : tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120} -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
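// Note: iree-stream-schedule-execution has wrapped the dispatch in a
// stream.async.execute region whose completion is signaled by a !stream.timepoint; the
// host blocks on stream.timepoint.await before exporting the result. Loosely analogous
// to submitting work and waiting on a future (an analogy only, not IREE's runtime API):

from concurrent.futures import ThreadPoolExecutor

executor = ThreadPoolExecutor()

def async_execute(region, *resources):
    # stream.async.execute: record the work, get back a completion token
    return executor.submit(region, *resources)

# timepoint = async_execute(dispatch_region, lhs, rhs)
# results = timepoint.result()   # stream.timepoint.await %result_timepoint => %results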
// -----// IR Dump After ScheduleConcurrencyPass (iree-stream-schedule-concurrency) //----- //
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%c209715200 = arith.constant 209715200 : index
%c158597120 = arith.constant 158597120 : index
%c0 = arith.constant 0 : index
%c1280 = arith.constant 1280 : index
%c968 = arith.constant 968 : index
%c64 = arith.constant 64 : index
%element_type_f16 = hal.element_type<f16> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c64, %c968, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c64, %c1280, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<64x1280x1280xf16> in !stream.resource<external>{%c209715200}
%results, %result_timepoint = stream.async.execute with(%0 as %arg2: !stream.resource<external>{%c158597120}, %1 as %arg3: !stream.resource<external>{%c209715200}) -> !stream.resource<external>{%c158597120} {
%4 = stream.async.dispatch @batch_matmul_dispatch_0::@batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%arg2[%c0 to %c158597120 for %c158597120], %arg3[%c0 to %c209715200 for %c209715200]) : (!stream.resource<external>{%c158597120}, !stream.resource<external>{%c209715200}) -> !stream.resource<external>{%c158597120}
stream.yield %4 : !stream.resource<external>{%c158597120}
} => !stream.timepoint
%2 = stream.timepoint.await %result_timepoint => %results : !stream.resource<external>{%c158597120}
%3 = stream.tensor.export %2 : tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120} -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
// -----// IR Dump After PropagateTimepointsPass (iree-stream-propagate-timepoints) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mfma_layout<F16_16x16x16_F32>, #iree_gpu.mfma_layout<F16_32x32x8_F32>], target_arch = "gfx942", ukernels = "none"}>
#device_target_rocm = #hal.device.target<"rocm", {legacy_sync}, [#executable_target_rocm_hsaco_fb]>
module attributes {hal.device.targets = [#device_target_rocm]} {
stream.executable private @batch_matmul_dispatch_0 {
stream.executable.export public @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
%cst = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>> -> tensor<64x968x1280xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [64, 1280, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>> -> tensor<64x1280x1280xf16>
%5 = tensor.empty() : tensor<64x968x1280xf16>
%6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%7 = linalg.batch_matmul ins(%3, %4 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%6 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : tensor<64x968x1280xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
return
}
}
}
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%c209715200 = arith.constant 209715200 : index
%c158597120 = arith.constant 158597120 : index
%c0 = arith.constant 0 : index
%c1280 = arith.constant 1280 : index
%c968 = arith.constant 968 : index
%c64 = arith.constant 64 : index
%element_type_f16 = hal.element_type<f16> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c64, %c968, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c64, %c1280, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<64x1280x1280xf16> in !stream.resource<external>{%c209715200}
%2 = stream.timepoint.immediate => !stream.timepoint
%3 = stream.timepoint.immediate => !stream.timepoint
%4 = stream.timepoint.join max(%2, %3) => !stream.timepoint
%results, %result_timepoint = stream.async.execute await(%4) => with(%0 as %arg2: !stream.resource<external>{%c158597120}, %1 as %arg3: !stream.resource<external>{%c209715200}) -> !stream.resource<external>{%c158597120} {
%7 = stream.async.dispatch @batch_matmul_dispatch_0::@batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%arg2[%c0 to %c158597120 for %c158597120], %arg3[%c0 to %c209715200 for %c209715200]) : (!stream.resource<external>{%c158597120}, !stream.resource<external>{%c209715200}) -> !stream.resource<external>{%c158597120}
stream.yield %7 : !stream.resource<external>{%c158597120}
} => !stream.timepoint
%5 = stream.timepoint.await %result_timepoint => %results : !stream.resource<external>{%c158597120}
%6 = stream.tensor.export %5 : tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120} -> !hal.buffer_view
util.return %6 : !hal.buffer_view
}
}
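// Note: iree-stream-propagate-timepoints threads explicit timepoints through the
// program; since both imported buffers are already available it materializes a
// stream.timepoint.immediate for each and joins them ahead of the execute. The
// canonicalize dump a little further down (after iree-stream-materialize-builtins,
// which changes nothing here) folds all of that back out, because awaiting
// already-signaled timepoints is a no-op. The fold rules at work, as I paraphrase them:

IMMEDIATE = "immediate"

def fold_join(timepoints):
    # stream.timepoint.join max(...) over only-immediate timepoints is itself immediate
    pending = [t for t in timepoints if t is not IMMEDIATE]
    if not pending:
        return IMMEDIATE
    return pending[0] if len(pending) == 1 else ("join", pending)

def fold_await_clause(tp, execute_op):
    # stream.async.execute await(immediate) => drop the await clause entirely
    return execute_op if tp is IMMEDIATE else ("await", tp, execute_op)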
// -----// IR Dump After MaterializeBuiltinsPass (iree-stream-materialize-builtins) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mfma_layout<F16_16x16x16_F32>, #iree_gpu.mfma_layout<F16_32x32x8_F32>], target_arch = "gfx942", ukernels = "none"}>
#device_target_rocm = #hal.device.target<"rocm", {legacy_sync}, [#executable_target_rocm_hsaco_fb]>
module attributes {hal.device.targets = [#device_target_rocm]} {
stream.executable private @batch_matmul_dispatch_0 {
stream.executable.export public @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
%cst = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>> -> tensor<64x968x1280xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [64, 1280, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>> -> tensor<64x1280x1280xf16>
%5 = tensor.empty() : tensor<64x968x1280xf16>
%6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%7 = linalg.batch_matmul ins(%3, %4 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%6 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : tensor<64x968x1280xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
return
}
}
}
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%c209715200 = arith.constant 209715200 : index
%c158597120 = arith.constant 158597120 : index
%c0 = arith.constant 0 : index
%c1280 = arith.constant 1280 : index
%c968 = arith.constant 968 : index
%c64 = arith.constant 64 : index
%element_type_f16 = hal.element_type<f16> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c64, %c968, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c64, %c1280, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<64x1280x1280xf16> in !stream.resource<external>{%c209715200}
%2 = stream.timepoint.immediate => !stream.timepoint
%3 = stream.timepoint.immediate => !stream.timepoint
%4 = stream.timepoint.join max(%2, %3) => !stream.timepoint
%results, %result_timepoint = stream.async.execute await(%4) => with(%0 as %arg2: !stream.resource<external>{%c158597120}, %1 as %arg3: !stream.resource<external>{%c209715200}) -> !stream.resource<external>{%c158597120} {
%7 = stream.async.dispatch @batch_matmul_dispatch_0::@batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%arg2[%c0 to %c158597120 for %c158597120], %arg3[%c0 to %c209715200 for %c209715200]) : (!stream.resource<external>{%c158597120}, !stream.resource<external>{%c209715200}) -> !stream.resource<external>{%c158597120}
stream.yield %7 : !stream.resource<external>{%c158597120}
} => !stream.timepoint
%5 = stream.timepoint.await %result_timepoint => %results : !stream.resource<external>{%c158597120}
%6 = stream.tensor.export %5 : tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120} -> !hal.buffer_view
util.return %6 : !hal.buffer_view
}
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%c209715200 = arith.constant 209715200 : index
%c158597120 = arith.constant 158597120 : index
%c0 = arith.constant 0 : index
%c1280 = arith.constant 1280 : index
%c968 = arith.constant 968 : index
%c64 = arith.constant 64 : index
%element_type_f16 = hal.element_type<f16> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c64, %c968, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c64, %c1280, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<64x1280x1280xf16> in !stream.resource<external>{%c209715200}
%results, %result_timepoint = stream.async.execute with(%0 as %arg2: !stream.resource<external>{%c158597120}, %1 as %arg3: !stream.resource<external>{%c209715200}) -> !stream.resource<external>{%c158597120} {
%4 = stream.async.dispatch @batch_matmul_dispatch_0::@batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%arg2[%c0 to %c158597120 for %c158597120], %arg3[%c0 to %c209715200 for %c209715200]) : (!stream.resource<external>{%c158597120}, !stream.resource<external>{%c209715200}) -> !stream.resource<external>{%c158597120}
stream.yield %4 : !stream.resource<external>{%c158597120}
} => !stream.timepoint
%2 = stream.timepoint.await %result_timepoint => %results : !stream.resource<external>{%c158597120}
%3 = stream.tensor.export %2 : tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120} -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
// -----// IR Dump After CSE (cse) //----- //
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%c209715200 = arith.constant 209715200 : index
%c158597120 = arith.constant 158597120 : index
%c0 = arith.constant 0 : index
%c1280 = arith.constant 1280 : index
%c968 = arith.constant 968 : index
%c64 = arith.constant 64 : index
%element_type_f16 = hal.element_type<f16> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c64, %c968, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c64, %c1280, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<64x1280x1280xf16> in !stream.resource<external>{%c209715200}
%results, %result_timepoint = stream.async.execute with(%0 as %arg2: !stream.resource<external>{%c158597120}, %1 as %arg3: !stream.resource<external>{%c209715200}) -> !stream.resource<external>{%c158597120} {
%4 = stream.async.dispatch @batch_matmul_dispatch_0::@batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%arg2[%c0 to %c158597120 for %c158597120], %arg3[%c0 to %c209715200 for %c209715200]) : (!stream.resource<external>{%c158597120}, !stream.resource<external>{%c209715200}) -> !stream.resource<external>{%c158597120}
stream.yield %4 : !stream.resource<external>{%c158597120}
} => !stream.timepoint
%2 = stream.timepoint.await %result_timepoint => %results : !stream.resource<external>{%c158597120}
%3 = stream.tensor.export %2 : tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120} -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
// -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- //
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%c209715200 = arith.constant 209715200 : index
%c158597120 = arith.constant 158597120 : index
%c0 = arith.constant 0 : index
%c1280 = arith.constant 1280 : index
%c968 = arith.constant 968 : index
%c64 = arith.constant 64 : index
%element_type_f16 = hal.element_type<f16> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c64, %c968, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c64, %c1280, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<64x1280x1280xf16> in !stream.resource<external>{%c209715200}
%results, %result_timepoint = stream.async.execute with(%0 as %arg2: !stream.resource<external>{%c158597120}, %1 as %arg3: !stream.resource<external>{%c209715200}) -> !stream.resource<external>{%c158597120} {
%4 = stream.async.dispatch @batch_matmul_dispatch_0::@batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%arg2[%c0 to %c158597120 for %c158597120], %arg3[%c0 to %c209715200 for %c209715200]) : (!stream.resource<external>{%c158597120}, !stream.resource<external>{%c209715200}) -> !stream.resource<external>{%c158597120}
stream.yield %4 : !stream.resource<external>{%c158597120}
} => !stream.timepoint
%2 = stream.timepoint.await %result_timepoint => %results : !stream.resource<external>{%c158597120}
%3 = stream.tensor.export %2 : tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120} -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
// -----// IR Dump After ApplyPatterns (iree-util-apply-patterns) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mfma_layout<F16_16x16x16_F32>, #iree_gpu.mfma_layout<F16_32x32x8_F32>], target_arch = "gfx942", ukernels = "none"}>
#device_target_rocm = #hal.device.target<"rocm", {legacy_sync}, [#executable_target_rocm_hsaco_fb]>
module attributes {hal.device.targets = [#device_target_rocm]} {
stream.executable private @batch_matmul_dispatch_0 {
stream.executable.export public @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
%cst = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>> -> tensor<64x968x1280xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [64, 1280, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>> -> tensor<64x1280x1280xf16>
%5 = tensor.empty() : tensor<64x968x1280xf16>
%6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%7 = linalg.batch_matmul ins(%3, %4 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%6 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : tensor<64x968x1280xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
return
}
}
}
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%c209715200 = arith.constant 209715200 : index
%c158597120 = arith.constant 158597120 : index
%c0 = arith.constant 0 : index
%c1280 = arith.constant 1280 : index
%c968 = arith.constant 968 : index
%c64 = arith.constant 64 : index
%element_type_f16 = hal.element_type<f16> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c64, %c968, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c64, %c1280, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<64x1280x1280xf16> in !stream.resource<external>{%c209715200}
%results, %result_timepoint = stream.async.execute with(%0 as %arg2: !stream.resource<external>{%c158597120}, %1 as %arg3: !stream.resource<external>{%c209715200}) -> !stream.resource<external>{%c158597120} {
%4 = stream.async.dispatch @batch_matmul_dispatch_0::@batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%arg2[%c0 to %c158597120 for %c158597120], %arg3[%c0 to %c209715200 for %c209715200]) : (!stream.resource<external>{%c158597120}, !stream.resource<external>{%c209715200}) -> !stream.resource<external>{%c158597120}
stream.yield %4 : !stream.resource<external>{%c158597120}
} => !stream.timepoint
%2 = stream.timepoint.await %result_timepoint => %results : !stream.resource<external>{%c158597120}
%3 = stream.tensor.export %2 : tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120} -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
}
// -----// IR Dump After FoldGlobals (iree-util-fold-globals) //----- //
// (IR unchanged from previous dump.)
// -----// IR Dump After FuseGlobals (iree-util-fuse-globals) //----- //
// (IR unchanged from previous dump.)
// -----// IR Dump After IPO (iree-util-ipo) //----- //
// (IR unchanged from previous dump.)
// -----// IR Dump After VerifyLoweringToAsyncPass (iree-stream-verify-lowering-to-async) //----- //
// (IR unchanged from previous dump.)
// -----// IR Dump After ScheduleAllocationPass (iree-stream-schedule-allocation) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mfma_layout<F16_16x16x16_F32>, #iree_gpu.mfma_layout<F16_32x32x8_F32>], target_arch = "gfx942", ukernels = "none"}>
#device_target_rocm = #hal.device.target<"rocm", {legacy_sync}, [#executable_target_rocm_hsaco_fb]>
module attributes {hal.device.targets = [#device_target_rocm]} {
stream.executable private @batch_matmul_dispatch_0 {
stream.executable.export public @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
%cst = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>> -> tensor<64x968x1280xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [64, 1280, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>> -> tensor<64x1280x1280xf16>
%5 = tensor.empty() : tensor<64x968x1280xf16>
%6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%7 = linalg.batch_matmul ins(%3, %4 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%6 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : tensor<64x968x1280xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
return
}
}
}
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%c209715200 = arith.constant 209715200 : index
%c158597120 = arith.constant 158597120 : index
%c0 = arith.constant 0 : index
%c1280 = arith.constant 1280 : index
%c968 = arith.constant 968 : index
%c64 = arith.constant 64 : index
%element_type_f16 = hal.element_type<f16> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c64, %c968, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c64, %c1280, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<64x1280x1280xf16> in !stream.resource<external>{%c209715200}
%c0_0 = arith.constant 0 : index
%result, %result_timepoint = stream.resource.alloca uninitialized : !stream.resource<external>{%c158597120} => !stream.timepoint
%2 = stream.cmd.execute await(%result_timepoint) => with(%0 as %arg2: !stream.resource<external>{%c158597120}, %1 as %arg3: !stream.resource<external>{%c209715200}, %result as %arg4: !stream.resource<external>{%c158597120}) {
stream.cmd.dispatch @batch_matmul_dispatch_0::@batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 {
ro %arg2[%c0 for %c158597120] : !stream.resource<external>{%c158597120},
ro %arg3[%c0 for %c209715200] : !stream.resource<external>{%c209715200},
wo %arg4[%c0_0 for %c158597120] : !stream.resource<external>{%c158597120}
}
} => !stream.timepoint
%3 = stream.timepoint.await %2 => %result : !stream.resource<external>{%c158597120}
%4 = stream.tensor.export %3 : tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120} -> !hal.buffer_view
util.return %4 : !hal.buffer_view
}
}
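// NOTE: ScheduleAllocationPass makes the first structural change in this
// section: the stream.async.execute region becomes an explicit
// stream.cmd.execute, the 158597120-byte result is materialized up front via
// stream.resource.alloca uninitialized (gated by its allocation timepoint),
// and the dispatch operands gain explicit ro/ro/wo access annotations. The
// pass also introduces a duplicate zero constant %c0_0 for the write-only
// binding offset.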
// -----// IR Dump After PackConstantsPass (iree-stream-pack-constants) //----- //
// (Function IR unchanged from the previous dump; only @batch_matmul is printed here.)
// -----// IR Dump After LayoutSlicesPass (iree-stream-layout-slices) //----- //
// (Function IR unchanged from the previous dump; only @batch_matmul is printed here.)
// -----// IR Dump After PropagateSubranges (iree-util-propagate-subranges) //----- //
// (Module IR unchanged since the ScheduleAllocationPass dump above.)
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%c209715200 = arith.constant 209715200 : index
%c158597120 = arith.constant 158597120 : index
%c0 = arith.constant 0 : index
%c1280 = arith.constant 1280 : index
%c968 = arith.constant 968 : index
%c64 = arith.constant 64 : index
%element_type_f16 = hal.element_type<f16> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c64, %c968, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c64, %c1280, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<64x1280x1280xf16> in !stream.resource<external>{%c209715200}
%result, %result_timepoint = stream.resource.alloca uninitialized : !stream.resource<external>{%c158597120} => !stream.timepoint
%2 = stream.cmd.execute await(%result_timepoint) => with(%0 as %arg2: !stream.resource<external>{%c158597120}, %1 as %arg3: !stream.resource<external>{%c209715200}, %result as %arg4: !stream.resource<external>{%c158597120}) {
stream.cmd.dispatch @batch_matmul_dispatch_0::@batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 {
ro %arg2[%c0 for %c158597120] : !stream.resource<external>{%c158597120},
ro %arg3[%c0 for %c209715200] : !stream.resource<external>{%c209715200},
wo %arg4[%c0 for %c158597120] : !stream.resource<external>{%c158597120}
}
} => !stream.timepoint
%3 = stream.timepoint.await %2 => %result : !stream.resource<external>{%c158597120}
%4 = stream.tensor.export %3 : tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120} -> !hal.buffer_view
util.return %4 : !hal.buffer_view
}
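// NOTE: canonicalization folds the redundant %c0_0 introduced by allocation
// scheduling into %c0, so the write-only binding offset now reuses the same
// zero-index constant as the read-only bindings.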
// -----// IR Dump After CSE (cse) //----- //
// (Function IR unchanged from the previous dump.)
// -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- //
// (Function IR unchanged from the previous dump.)
// -----// IR Dump After ApplyPatterns (iree-util-apply-patterns) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mfma_layout<F16_16x16x16_F32>, #iree_gpu.mfma_layout<F16_32x32x8_F32>], target_arch = "gfx942", ukernels = "none"}>
#device_target_rocm = #hal.device.target<"rocm", {legacy_sync}, [#executable_target_rocm_hsaco_fb]>
module attributes {hal.device.targets = [#device_target_rocm]} {
stream.executable private @batch_matmul_dispatch_0 {
stream.executable.export public @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
%cst = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>> -> tensor<64x968x1280xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [64, 1280, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>> -> tensor<64x1280x1280xf16>
%5 = tensor.empty() : tensor<64x968x1280xf16>
%6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%7 = linalg.batch_matmul ins(%3, %4 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%6 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : tensor<64x968x1280xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
return
}
}
}
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%c209715200 = arith.constant 209715200 : index
%c158597120 = arith.constant 158597120 : index
%c0 = arith.constant 0 : index
%c1280 = arith.constant 1280 : index
%c968 = arith.constant 968 : index
%c64 = arith.constant 64 : index
%element_type_f16 = hal.element_type<f16> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c64, %c968, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c64, %c1280, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<64x1280x1280xf16> in !stream.resource<external>{%c209715200}
%result, %result_timepoint = stream.resource.alloca uninitialized : !stream.resource<external>{%c158597120} => !stream.timepoint
%2 = stream.cmd.execute await(%result_timepoint) => with(%0 as %arg2: !stream.resource<external>{%c158597120}, %1 as %arg3: !stream.resource<external>{%c209715200}, %result as %arg4: !stream.resource<external>{%c158597120}) {
stream.cmd.dispatch @batch_matmul_dispatch_0::@batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 {
ro %arg2[%c0 for %c158597120] : !stream.resource<external>{%c158597120},
ro %arg3[%c0 for %c209715200] : !stream.resource<external>{%c209715200},
wo %arg4[%c0 for %c158597120] : !stream.resource<external>{%c158597120}
}
} => !stream.timepoint
%3 = stream.timepoint.await %2 => %result : !stream.resource<external>{%c158597120}
%4 = stream.tensor.export %3 : tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120} -> !hal.buffer_view
util.return %4 : !hal.buffer_view
}
}
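// NOTE: this module-scope dump picks up the canonicalized function body; the
// stream.executable itself is byte-identical in every module-scope dump in
// this section. The global-folding, global-fusion, IPO, and verification
// passes below are again no-ops on this module.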
// -----// IR Dump After FoldGlobals (iree-util-fold-globals) //----- //
// (IR unchanged from previous dump.)
// -----// IR Dump After FuseGlobals (iree-util-fuse-globals) //----- //
// (IR unchanged from previous dump.)
// -----// IR Dump After IPO (iree-util-ipo) //----- //
// (IR unchanged from previous dump.)
// -----// IR Dump After VerifyLoweringToCmdPass (iree-stream-verify-lowering-to-cmd) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mfma_layout<F16_16x16x16_F32>, #iree_gpu.mfma_layout<F16_32x32x8_F32>], target_arch = "gfx942", ukernels = "none"}>
#device_target_rocm = #hal.device.target<"rocm", {legacy_sync}, [#executable_target_rocm_hsaco_fb]>
module attributes {hal.device.targets = [#device_target_rocm]} {
stream.executable private @batch_matmul_dispatch_0 {
stream.executable.export public @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
%cst = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>> -> tensor<64x968x1280xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [64, 1280, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>> -> tensor<64x1280x1280xf16>
%5 = tensor.empty() : tensor<64x968x1280xf16>
%6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%7 = linalg.batch_matmul ins(%3, %4 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%6 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : tensor<64x968x1280xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
return
}
}
}
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%c209715200 = arith.constant 209715200 : index
%c158597120 = arith.constant 158597120 : index
%c0 = arith.constant 0 : index
%c1280 = arith.constant 1280 : index
%c968 = arith.constant 968 : index
%c64 = arith.constant 64 : index
%element_type_f16 = hal.element_type<f16> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c64, %c968, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c64, %c1280, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<64x1280x1280xf16> in !stream.resource<external>{%c209715200}
%result, %result_timepoint = stream.resource.alloca uninitialized : !stream.resource<external>{%c158597120} => !stream.timepoint
%2 = stream.cmd.execute await(%result_timepoint) => with(%0 as %arg2: !stream.resource<external>{%c158597120}, %1 as %arg3: !stream.resource<external>{%c209715200}, %result as %arg4: !stream.resource<external>{%c158597120}) {
stream.cmd.dispatch @batch_matmul_dispatch_0::@batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 {
ro %arg2[%c0 for %c158597120] : !stream.resource<external>{%c158597120},
ro %arg3[%c0 for %c209715200] : !stream.resource<external>{%c209715200},
wo %arg4[%c0 for %c158597120] : !stream.resource<external>{%c158597120}
}
} => !stream.timepoint
%3 = stream.timepoint.await %2 => %result : !stream.resource<external>{%c158597120}
%4 = stream.tensor.export %3 : tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120} -> !hal.buffer_view
util.return %4 : !hal.buffer_view
}
}
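// The two constants threaded through every resource type above are simply the packed byte sizes of
// the tensors: f16 occupies two bytes per element, so 64x968x1280 elements give 158597120 bytes and
// 64x1280x1280 give 209715200. A quick check (plain Python, nothing IREE-specific):

import math

def packed_byte_size(shape, bytes_per_element=2):  # f16 = 2 bytes
    return math.prod(shape) * bytes_per_element

assert packed_byte_size([64, 968, 1280]) == 158597120    # input0 / output, %c158597120
assert packed_byte_size([64, 1280, 1280]) == 209715200   # input1, %c209715200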
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%c209715200 = arith.constant 209715200 : index
%c158597120 = arith.constant 158597120 : index
%c0 = arith.constant 0 : index
%c1280 = arith.constant 1280 : index
%c968 = arith.constant 968 : index
%c64 = arith.constant 64 : index
%element_type_f16 = hal.element_type<f16> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c64, %c968, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c64, %c1280, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<64x1280x1280xf16> in !stream.resource<external>{%c209715200}
%result, %result_timepoint = stream.resource.alloca uninitialized : !stream.resource<external>{%c158597120} => !stream.timepoint
%2 = stream.cmd.execute await(%result_timepoint) => with(%0 as %arg2: !stream.resource<external>{%c158597120}, %1 as %arg3: !stream.resource<external>{%c209715200}, %result as %arg4: !stream.resource<external>{%c158597120}) {
stream.cmd.dispatch @batch_matmul_dispatch_0::@batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 {
ro %arg2[%c0 for %c158597120] : !stream.resource<external>{%c158597120},
ro %arg3[%c0 for %c209715200] : !stream.resource<external>{%c209715200},
wo %arg4[%c0 for %c158597120] : !stream.resource<external>{%c158597120}
}
} => !stream.timepoint
%3 = stream.timepoint.await %2 => %result : !stream.resource<external>{%c158597120}
%4 = stream.tensor.export %3 : tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120} -> !hal.buffer_view
util.return %4 : !hal.buffer_view
}
// -----// IR Dump After CSE (cse) //----- //
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%c209715200 = arith.constant 209715200 : index
%c158597120 = arith.constant 158597120 : index
%c0 = arith.constant 0 : index
%c1280 = arith.constant 1280 : index
%c968 = arith.constant 968 : index
%c64 = arith.constant 64 : index
%element_type_f16 = hal.element_type<f16> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c64, %c968, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c64, %c1280, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<64x1280x1280xf16> in !stream.resource<external>{%c209715200}
%result, %result_timepoint = stream.resource.alloca uninitialized : !stream.resource<external>{%c158597120} => !stream.timepoint
%2 = stream.cmd.execute await(%result_timepoint) => with(%0 as %arg2: !stream.resource<external>{%c158597120}, %1 as %arg3: !stream.resource<external>{%c209715200}, %result as %arg4: !stream.resource<external>{%c158597120}) {
stream.cmd.dispatch @batch_matmul_dispatch_0::@batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 {
ro %arg2[%c0 for %c158597120] : !stream.resource<external>{%c158597120},
ro %arg3[%c0 for %c209715200] : !stream.resource<external>{%c209715200},
wo %arg4[%c0 for %c158597120] : !stream.resource<external>{%c158597120}
}
} => !stream.timepoint
%3 = stream.timepoint.await %2 => %result : !stream.resource<external>{%c158597120}
%4 = stream.tensor.export %3 : tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120} -> !hal.buffer_view
util.return %4 : !hal.buffer_view
}
// -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- //
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%c209715200 = arith.constant 209715200 : index
%c158597120 = arith.constant 158597120 : index
%c0 = arith.constant 0 : index
%c1280 = arith.constant 1280 : index
%c968 = arith.constant 968 : index
%c64 = arith.constant 64 : index
%element_type_f16 = hal.element_type<f16> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c64, %c968, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c64, %c1280, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<64x1280x1280xf16> in !stream.resource<external>{%c209715200}
%result, %result_timepoint = stream.resource.alloca uninitialized : !stream.resource<external>{%c158597120} => !stream.timepoint
%2 = stream.cmd.execute await(%result_timepoint) => with(%0 as %arg2: !stream.resource<external>{%c158597120}, %1 as %arg3: !stream.resource<external>{%c209715200}, %result as %arg4: !stream.resource<external>{%c158597120}) {
stream.cmd.dispatch @batch_matmul_dispatch_0::@batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 {
ro %arg2[%c0 for %c158597120] : !stream.resource<external>{%c158597120},
ro %arg3[%c0 for %c209715200] : !stream.resource<external>{%c209715200},
wo %arg4[%c0 for %c158597120] : !stream.resource<external>{%c158597120}
}
} => !stream.timepoint
%3 = stream.timepoint.await %2 => %result : !stream.resource<external>{%c158597120}
%4 = stream.tensor.export %3 : tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120} -> !hal.buffer_view
util.return %4 : !hal.buffer_view
}
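// The three dumps above (Canonicalizer, CSE, SimplifyGlobalAccesses) print only the util.func rather
// than the whole module because they run as function-scoped passes at this point in the pipeline;
// none of them changes the IR here, so the function body is identical in each dump.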
// -----// IR Dump After ApplyPatterns (iree-util-apply-patterns) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mfma_layout<F16_16x16x16_F32>, #iree_gpu.mfma_layout<F16_32x32x8_F32>], target_arch = "gfx942", ukernels = "none"}>
#device_target_rocm = #hal.device.target<"rocm", {legacy_sync}, [#executable_target_rocm_hsaco_fb]>
module attributes {hal.device.targets = [#device_target_rocm]} {
stream.executable private @batch_matmul_dispatch_0 {
stream.executable.export public @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
%cst = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>> -> tensor<64x968x1280xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [64, 1280, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>> -> tensor<64x1280x1280xf16>
%5 = tensor.empty() : tensor<64x968x1280xf16>
%6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%7 = linalg.batch_matmul ins(%3, %4 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%6 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : tensor<64x968x1280xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
return
}
}
}
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%c209715200 = arith.constant 209715200 : index
%c158597120 = arith.constant 158597120 : index
%c0 = arith.constant 0 : index
%c1280 = arith.constant 1280 : index
%c968 = arith.constant 968 : index
%c64 = arith.constant 64 : index
%element_type_f16 = hal.element_type<f16> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c64, %c968, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c64, %c1280, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<64x1280x1280xf16> in !stream.resource<external>{%c209715200}
%result, %result_timepoint = stream.resource.alloca uninitialized : !stream.resource<external>{%c158597120} => !stream.timepoint
%2 = stream.cmd.execute await(%result_timepoint) => with(%0 as %arg2: !stream.resource<external>{%c158597120}, %1 as %arg3: !stream.resource<external>{%c209715200}, %result as %arg4: !stream.resource<external>{%c158597120}) {
stream.cmd.dispatch @batch_matmul_dispatch_0::@batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 {
ro %arg2[%c0 for %c158597120] : !stream.resource<external>{%c158597120},
ro %arg3[%c0 for %c209715200] : !stream.resource<external>{%c209715200},
wo %arg4[%c0 for %c158597120] : !stream.resource<external>{%c158597120}
}
} => !stream.timepoint
%3 = stream.timepoint.await %2 => %result : !stream.resource<external>{%c158597120}
%4 = stream.tensor.export %3 : tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120} -> !hal.buffer_view
util.return %4 : !hal.buffer_view
}
}
// -----// IR Dump After FoldGlobals (iree-util-fold-globals) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mfma_layout<F16_16x16x16_F32>, #iree_gpu.mfma_layout<F16_32x32x8_F32>], target_arch = "gfx942", ukernels = "none"}>
#device_target_rocm = #hal.device.target<"rocm", {legacy_sync}, [#executable_target_rocm_hsaco_fb]>
module attributes {hal.device.targets = [#device_target_rocm]} {
stream.executable private @batch_matmul_dispatch_0 {
stream.executable.export public @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
%cst = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>> -> tensor<64x968x1280xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [64, 1280, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>> -> tensor<64x1280x1280xf16>
%5 = tensor.empty() : tensor<64x968x1280xf16>
%6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%7 = linalg.batch_matmul ins(%3, %4 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%6 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : tensor<64x968x1280xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
return
}
}
}
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%c209715200 = arith.constant 209715200 : index
%c158597120 = arith.constant 158597120 : index
%c0 = arith.constant 0 : index
%c1280 = arith.constant 1280 : index
%c968 = arith.constant 968 : index
%c64 = arith.constant 64 : index
%element_type_f16 = hal.element_type<f16> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c64, %c968, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c64, %c1280, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<64x1280x1280xf16> in !stream.resource<external>{%c209715200}
%result, %result_timepoint = stream.resource.alloca uninitialized : !stream.resource<external>{%c158597120} => !stream.timepoint
%2 = stream.cmd.execute await(%result_timepoint) => with(%0 as %arg2: !stream.resource<external>{%c158597120}, %1 as %arg3: !stream.resource<external>{%c209715200}, %result as %arg4: !stream.resource<external>{%c158597120}) {
stream.cmd.dispatch @batch_matmul_dispatch_0::@batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 {
ro %arg2[%c0 for %c158597120] : !stream.resource<external>{%c158597120},
ro %arg3[%c0 for %c209715200] : !stream.resource<external>{%c209715200},
wo %arg4[%c0 for %c158597120] : !stream.resource<external>{%c158597120}
}
} => !stream.timepoint
%3 = stream.timepoint.await %2 => %result : !stream.resource<external>{%c158597120}
%4 = stream.tensor.export %3 : tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120} -> !hal.buffer_view
util.return %4 : !hal.buffer_view
}
}
// -----// IR Dump After FuseGlobals (iree-util-fuse-globals) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mfma_layout<F16_16x16x16_F32>, #iree_gpu.mfma_layout<F16_32x32x8_F32>], target_arch = "gfx942", ukernels = "none"}>
#device_target_rocm = #hal.device.target<"rocm", {legacy_sync}, [#executable_target_rocm_hsaco_fb]>
module attributes {hal.device.targets = [#device_target_rocm]} {
stream.executable private @batch_matmul_dispatch_0 {
stream.executable.export public @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
%cst = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>> -> tensor<64x968x1280xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [64, 1280, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>> -> tensor<64x1280x1280xf16>
%5 = tensor.empty() : tensor<64x968x1280xf16>
%6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%7 = linalg.batch_matmul ins(%3, %4 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%6 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : tensor<64x968x1280xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
return
}
}
}
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%c209715200 = arith.constant 209715200 : index
%c158597120 = arith.constant 158597120 : index
%c0 = arith.constant 0 : index
%c1280 = arith.constant 1280 : index
%c968 = arith.constant 968 : index
%c64 = arith.constant 64 : index
%element_type_f16 = hal.element_type<f16> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c64, %c968, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c64, %c1280, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<64x1280x1280xf16> in !stream.resource<external>{%c209715200}
%result, %result_timepoint = stream.resource.alloca uninitialized : !stream.resource<external>{%c158597120} => !stream.timepoint
%2 = stream.cmd.execute await(%result_timepoint) => with(%0 as %arg2: !stream.resource<external>{%c158597120}, %1 as %arg3: !stream.resource<external>{%c209715200}, %result as %arg4: !stream.resource<external>{%c158597120}) {
stream.cmd.dispatch @batch_matmul_dispatch_0::@batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 {
ro %arg2[%c0 for %c158597120] : !stream.resource<external>{%c158597120},
ro %arg3[%c0 for %c209715200] : !stream.resource<external>{%c209715200},
wo %arg4[%c0 for %c158597120] : !stream.resource<external>{%c158597120}
}
} => !stream.timepoint
%3 = stream.timepoint.await %2 => %result : !stream.resource<external>{%c158597120}
%4 = stream.tensor.export %3 : tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120} -> !hal.buffer_view
util.return %4 : !hal.buffer_view
}
}
// -----// IR Dump After IPO (iree-util-ipo) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mfma_layout<F16_16x16x16_F32>, #iree_gpu.mfma_layout<F16_32x32x8_F32>], target_arch = "gfx942", ukernels = "none"}>
#device_target_rocm = #hal.device.target<"rocm", {legacy_sync}, [#executable_target_rocm_hsaco_fb]>
module attributes {hal.device.targets = [#device_target_rocm]} {
stream.executable private @batch_matmul_dispatch_0 {
stream.executable.export public @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
%cst = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>> -> tensor<64x968x1280xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [64, 1280, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>> -> tensor<64x1280x1280xf16>
%5 = tensor.empty() : tensor<64x968x1280xf16>
%6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%7 = linalg.batch_matmul ins(%3, %4 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%6 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : tensor<64x968x1280xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
return
}
}
}
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%c209715200 = arith.constant 209715200 : index
%c158597120 = arith.constant 158597120 : index
%c0 = arith.constant 0 : index
%c1280 = arith.constant 1280 : index
%c968 = arith.constant 968 : index
%c64 = arith.constant 64 : index
%element_type_f16 = hal.element_type<f16> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c64, %c968, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c64, %c1280, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<64x1280x1280xf16> in !stream.resource<external>{%c209715200}
%result, %result_timepoint = stream.resource.alloca uninitialized : !stream.resource<external>{%c158597120} => !stream.timepoint
%2 = stream.cmd.execute await(%result_timepoint) => with(%0 as %arg2: !stream.resource<external>{%c158597120}, %1 as %arg3: !stream.resource<external>{%c209715200}, %result as %arg4: !stream.resource<external>{%c158597120}) {
stream.cmd.dispatch @batch_matmul_dispatch_0::@batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 {
ro %arg2[%c0 for %c158597120] : !stream.resource<external>{%c158597120},
ro %arg3[%c0 for %c209715200] : !stream.resource<external>{%c209715200},
wo %arg4[%c0 for %c158597120] : !stream.resource<external>{%c158597120}
}
} => !stream.timepoint
%3 = stream.timepoint.await %2 => %result : !stream.resource<external>{%c158597120}
%4 = stream.tensor.export %3 : tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120} -> !hal.buffer_view
util.return %4 : !hal.buffer_view
}
}
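// ApplyPatterns, FoldGlobals, FuseGlobals, and IPO likewise leave the module unchanged: there are no
// util.global ops to fold or fuse, and the only function is the public entry point, so the
// interprocedural pass has nothing to propagate.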
// -----// IR Dump After SCFToControlFlow (convert-scf-to-cf) //----- //
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%c209715200 = arith.constant 209715200 : index
%c158597120 = arith.constant 158597120 : index
%c0 = arith.constant 0 : index
%c1280 = arith.constant 1280 : index
%c968 = arith.constant 968 : index
%c64 = arith.constant 64 : index
%element_type_f16 = hal.element_type<f16> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c64, %c968, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c64, %c1280, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<64x1280x1280xf16> in !stream.resource<external>{%c209715200}
%result, %result_timepoint = stream.resource.alloca uninitialized : !stream.resource<external>{%c158597120} => !stream.timepoint
%2 = stream.cmd.execute await(%result_timepoint) => with(%0 as %arg2: !stream.resource<external>{%c158597120}, %1 as %arg3: !stream.resource<external>{%c209715200}, %result as %arg4: !stream.resource<external>{%c158597120}) {
stream.cmd.dispatch @batch_matmul_dispatch_0::@batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 {
ro %arg2[%c0 for %c158597120] : !stream.resource<external>{%c158597120},
ro %arg3[%c0 for %c209715200] : !stream.resource<external>{%c209715200},
wo %arg4[%c0 for %c158597120] : !stream.resource<external>{%c158597120}
}
} => !stream.timepoint
%3 = stream.timepoint.await %2 => %result : !stream.resource<external>{%c158597120}
%4 = stream.tensor.export %3 : tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120} -> !hal.buffer_view
util.return %4 : !hal.buffer_view
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%c209715200 = arith.constant 209715200 : index
%c158597120 = arith.constant 158597120 : index
%c0 = arith.constant 0 : index
%c1280 = arith.constant 1280 : index
%c968 = arith.constant 968 : index
%c64 = arith.constant 64 : index
%element_type_f16 = hal.element_type<f16> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c64, %c968, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c64, %c1280, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<64x1280x1280xf16> in !stream.resource<external>{%c209715200}
%result, %result_timepoint = stream.resource.alloca uninitialized : !stream.resource<external>{%c158597120} => !stream.timepoint
%2 = stream.cmd.execute await(%result_timepoint) => with(%0 as %arg2: !stream.resource<external>{%c158597120}, %1 as %arg3: !stream.resource<external>{%c209715200}, %result as %arg4: !stream.resource<external>{%c158597120}) {
stream.cmd.dispatch @batch_matmul_dispatch_0::@batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 {
ro %arg2[%c0 for %c158597120] : !stream.resource<external>{%c158597120},
ro %arg3[%c0 for %c209715200] : !stream.resource<external>{%c209715200},
wo %arg4[%c0 for %c158597120] : !stream.resource<external>{%c158597120}
}
} => !stream.timepoint
%3 = stream.timepoint.await %2 => %result : !stream.resource<external>{%c158597120}
%4 = stream.tensor.export %3 : tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120} -> !hal.buffer_view
util.return %4 : !hal.buffer_view
}
// -----// IR Dump After CSE (cse) //----- //
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%c209715200 = arith.constant 209715200 : index
%c158597120 = arith.constant 158597120 : index
%c0 = arith.constant 0 : index
%c1280 = arith.constant 1280 : index
%c968 = arith.constant 968 : index
%c64 = arith.constant 64 : index
%element_type_f16 = hal.element_type<f16> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c64, %c968, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c64, %c1280, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<64x1280x1280xf16> in !stream.resource<external>{%c209715200}
%result, %result_timepoint = stream.resource.alloca uninitialized : !stream.resource<external>{%c158597120} => !stream.timepoint
%2 = stream.cmd.execute await(%result_timepoint) => with(%0 as %arg2: !stream.resource<external>{%c158597120}, %1 as %arg3: !stream.resource<external>{%c209715200}, %result as %arg4: !stream.resource<external>{%c158597120}) {
stream.cmd.dispatch @batch_matmul_dispatch_0::@batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 {
ro %arg2[%c0 for %c158597120] : !stream.resource<external>{%c158597120},
ro %arg3[%c0 for %c209715200] : !stream.resource<external>{%c209715200},
wo %arg4[%c0 for %c158597120] : !stream.resource<external>{%c158597120}
}
} => !stream.timepoint
%3 = stream.timepoint.await %2 => %result : !stream.resource<external>{%c158597120}
%4 = stream.tensor.export %3 : tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120} -> !hal.buffer_view
util.return %4 : !hal.buffer_view
}
// -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- //
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%c209715200 = arith.constant 209715200 : index
%c158597120 = arith.constant 158597120 : index
%c0 = arith.constant 0 : index
%c1280 = arith.constant 1280 : index
%c968 = arith.constant 968 : index
%c64 = arith.constant 64 : index
%element_type_f16 = hal.element_type<f16> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c64, %c968, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c64, %c1280, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<64x1280x1280xf16> in !stream.resource<external>{%c209715200}
%result, %result_timepoint = stream.resource.alloca uninitialized : !stream.resource<external>{%c158597120} => !stream.timepoint
%2 = stream.cmd.execute await(%result_timepoint) => with(%0 as %arg2: !stream.resource<external>{%c158597120}, %1 as %arg3: !stream.resource<external>{%c209715200}, %result as %arg4: !stream.resource<external>{%c158597120}) {
stream.cmd.dispatch @batch_matmul_dispatch_0::@batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 {
ro %arg2[%c0 for %c158597120] : !stream.resource<external>{%c158597120},
ro %arg3[%c0 for %c209715200] : !stream.resource<external>{%c209715200},
wo %arg4[%c0 for %c158597120] : !stream.resource<external>{%c158597120}
}
} => !stream.timepoint
%3 = stream.timepoint.await %2 => %result : !stream.resource<external>{%c158597120}
%4 = stream.tensor.export %3 : tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120} -> !hal.buffer_view
util.return %4 : !hal.buffer_view
}
// -----// IR Dump After ApplyPatterns (iree-util-apply-patterns) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mfma_layout<F16_16x16x16_F32>, #iree_gpu.mfma_layout<F16_32x32x8_F32>], target_arch = "gfx942", ukernels = "none"}>
#device_target_rocm = #hal.device.target<"rocm", {legacy_sync}, [#executable_target_rocm_hsaco_fb]>
module attributes {hal.device.targets = [#device_target_rocm], iree.fixedpoint.iteration = 0 : index} {
stream.executable private @batch_matmul_dispatch_0 {
stream.executable.export public @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
%cst = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>> -> tensor<64x968x1280xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [64, 1280, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>> -> tensor<64x1280x1280xf16>
%5 = tensor.empty() : tensor<64x968x1280xf16>
%6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%7 = linalg.batch_matmul ins(%3, %4 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%6 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : tensor<64x968x1280xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
return
}
}
}
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%c209715200 = arith.constant 209715200 : index
%c158597120 = arith.constant 158597120 : index
%c0 = arith.constant 0 : index
%c1280 = arith.constant 1280 : index
%c968 = arith.constant 968 : index
%c64 = arith.constant 64 : index
%element_type_f16 = hal.element_type<f16> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c64, %c968, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c64, %c1280, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<64x1280x1280xf16> in !stream.resource<external>{%c209715200}
%result, %result_timepoint = stream.resource.alloca uninitialized : !stream.resource<external>{%c158597120} => !stream.timepoint
%2 = stream.cmd.execute await(%result_timepoint) => with(%0 as %arg2: !stream.resource<external>{%c158597120}, %1 as %arg3: !stream.resource<external>{%c209715200}, %result as %arg4: !stream.resource<external>{%c158597120}) {
stream.cmd.dispatch @batch_matmul_dispatch_0::@batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 {
ro %arg2[%c0 for %c158597120] : !stream.resource<external>{%c158597120},
ro %arg3[%c0 for %c209715200] : !stream.resource<external>{%c209715200},
wo %arg4[%c0 for %c158597120] : !stream.resource<external>{%c158597120}
}
} => !stream.timepoint
%3 = stream.timepoint.await %2 => %result : !stream.resource<external>{%c158597120}
%4 = stream.tensor.export %3 : tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120} -> !hal.buffer_view
util.return %4 : !hal.buffer_view
}
}
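// This ApplyPatterns dump is the first taken inside the stream optimization fixed-point loop: the
// module now carries iree.fixedpoint.iteration = 0 : index, evidently used by the driver to track
// how many rounds of the pass set have run. Conceptually the loop looks like the sketch below (a
// hypothetical driver for illustration, not IREE's implementation):

def run_to_fixed_point(module, passes, max_iterations=8):
    for iteration in range(max_iterations):
        before = str(module)
        for run_pass in passes:     # ApplyPatterns, FoldGlobals, FuseGlobals, IPO, ...
            run_pass(module)
        if str(module) == before:   # no pass changed anything: converged
            return module
    raise RuntimeError("fixed point not reached")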
// -----// IR Dump After FoldGlobals (iree-util-fold-globals) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mfma_layout<F16_16x16x16_F32>, #iree_gpu.mfma_layout<F16_32x32x8_F32>], target_arch = "gfx942", ukernels = "none"}>
#device_target_rocm = #hal.device.target<"rocm", {legacy_sync}, [#executable_target_rocm_hsaco_fb]>
module attributes {hal.device.targets = [#device_target_rocm], iree.fixedpoint.iteration = 0 : index} {
stream.executable private @batch_matmul_dispatch_0 {
stream.executable.export public @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
%cst = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>> -> tensor<64x968x1280xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [64, 1280, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>> -> tensor<64x1280x1280xf16>
%5 = tensor.empty() : tensor<64x968x1280xf16>
%6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%7 = linalg.batch_matmul ins(%3, %4 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%6 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : tensor<64x968x1280xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
return
}
}
}
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%c209715200 = arith.constant 209715200 : index
%c158597120 = arith.constant 158597120 : index
%c0 = arith.constant 0 : index
%c1280 = arith.constant 1280 : index
%c968 = arith.constant 968 : index
%c64 = arith.constant 64 : index
%element_type_f16 = hal.element_type<f16> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c64, %c968, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c64, %c1280, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<64x1280x1280xf16> in !stream.resource<external>{%c209715200}
%result, %result_timepoint = stream.resource.alloca uninitialized : !stream.resource<external>{%c158597120} => !stream.timepoint
%2 = stream.cmd.execute await(%result_timepoint) => with(%0 as %arg2: !stream.resource<external>{%c158597120}, %1 as %arg3: !stream.resource<external>{%c209715200}, %result as %arg4: !stream.resource<external>{%c158597120}) {
stream.cmd.dispatch @batch_matmul_dispatch_0::@batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 {
ro %arg2[%c0 for %c158597120] : !stream.resource<external>{%c158597120},
ro %arg3[%c0 for %c209715200] : !stream.resource<external>{%c209715200},
wo %arg4[%c0 for %c158597120] : !stream.resource<external>{%c158597120}
}
} => !stream.timepoint
%3 = stream.timepoint.await %2 => %result : !stream.resource<external>{%c158597120}
%4 = stream.tensor.export %3 : tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120} -> !hal.buffer_view
util.return %4 : !hal.buffer_view
}
}
// -----// IR Dump After FuseGlobals (iree-util-fuse-globals) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mfma_layout<F16_16x16x16_F32>, #iree_gpu.mfma_layout<F16_32x32x8_F32>], target_arch = "gfx942", ukernels = "none"}>
#device_target_rocm = #hal.device.target<"rocm", {legacy_sync}, [#executable_target_rocm_hsaco_fb]>
module attributes {hal.device.targets = [#device_target_rocm], iree.fixedpoint.iteration = 0 : index} {
stream.executable private @batch_matmul_dispatch_0 {
stream.executable.export public @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
%cst = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>> -> tensor<64x968x1280xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [64, 1280, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>> -> tensor<64x1280x1280xf16>
%5 = tensor.empty() : tensor<64x968x1280xf16>
%6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%7 = linalg.batch_matmul ins(%3, %4 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%6 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : tensor<64x968x1280xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
return
}
}
}
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%c209715200 = arith.constant 209715200 : index
%c158597120 = arith.constant 158597120 : index
%c0 = arith.constant 0 : index
%c1280 = arith.constant 1280 : index
%c968 = arith.constant 968 : index
%c64 = arith.constant 64 : index
%element_type_f16 = hal.element_type<f16> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c64, %c968, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c64, %c1280, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<64x1280x1280xf16> in !stream.resource<external>{%c209715200}
%result, %result_timepoint = stream.resource.alloca uninitialized : !stream.resource<external>{%c158597120} => !stream.timepoint
%2 = stream.cmd.execute await(%result_timepoint) => with(%0 as %arg2: !stream.resource<external>{%c158597120}, %1 as %arg3: !stream.resource<external>{%c209715200}, %result as %arg4: !stream.resource<external>{%c158597120}) {
stream.cmd.dispatch @batch_matmul_dispatch_0::@batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 {
ro %arg2[%c0 for %c158597120] : !stream.resource<external>{%c158597120},
ro %arg3[%c0 for %c209715200] : !stream.resource<external>{%c209715200},
wo %arg4[%c0 for %c158597120] : !stream.resource<external>{%c158597120}
}
} => !stream.timepoint
%3 = stream.timepoint.await %2 => %result : !stream.resource<external>{%c158597120}
%4 = stream.tensor.export %3 : tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120} -> !hal.buffer_view
util.return %4 : !hal.buffer_view
}
}
// -----// IR Dump After IPO (iree-util-ipo) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mfma_layout<F16_16x16x16_F32>, #iree_gpu.mfma_layout<F16_32x32x8_F32>], target_arch = "gfx942", ukernels = "none"}>
#device_target_rocm = #hal.device.target<"rocm", {legacy_sync}, [#executable_target_rocm_hsaco_fb]>
module attributes {hal.device.targets = [#device_target_rocm], iree.fixedpoint.iteration = 0 : index} {
stream.executable private @batch_matmul_dispatch_0 {
stream.executable.export public @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
%cst = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>> -> tensor<64x968x1280xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [64, 1280, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>> -> tensor<64x1280x1280xf16>
%5 = tensor.empty() : tensor<64x968x1280xf16>
%6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%7 = linalg.batch_matmul ins(%3, %4 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%6 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : tensor<64x968x1280xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
return
}
}
}
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%c209715200 = arith.constant 209715200 : index
%c158597120 = arith.constant 158597120 : index
%c0 = arith.constant 0 : index
%c1280 = arith.constant 1280 : index
%c968 = arith.constant 968 : index
%c64 = arith.constant 64 : index
%element_type_f16 = hal.element_type<f16> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c64, %c968, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c64, %c1280, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<64x1280x1280xf16> in !stream.resource<external>{%c209715200}
%result, %result_timepoint = stream.resource.alloca uninitialized : !stream.resource<external>{%c158597120} => !stream.timepoint
%2 = stream.cmd.execute await(%result_timepoint) => with(%0 as %arg2: !stream.resource<external>{%c158597120}, %1 as %arg3: !stream.resource<external>{%c209715200}, %result as %arg4: !stream.resource<external>{%c158597120}) {
stream.cmd.dispatch @batch_matmul_dispatch_0::@batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 {
ro %arg2[%c0 for %c158597120] : !stream.resource<external>{%c158597120},
ro %arg3[%c0 for %c209715200] : !stream.resource<external>{%c209715200},
wo %arg4[%c0 for %c158597120] : !stream.resource<external>{%c158597120}
}
} => !stream.timepoint
%3 = stream.timepoint.await %2 => %result : !stream.resource<external>{%c158597120}
%4 = stream.tensor.export %3 : tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120} -> !hal.buffer_view
util.return %4 : !hal.buffer_view
}
}
// -----// IR Dump After ElideTimepointsPass (iree-stream-elide-timepoints) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mfma_layout<F16_16x16x16_F32>, #iree_gpu.mfma_layout<F16_32x32x8_F32>], target_arch = "gfx942", ukernels = "none"}>
#device_target_rocm = #hal.device.target<"rocm", {legacy_sync}, [#executable_target_rocm_hsaco_fb]>
module attributes {hal.device.targets = [#device_target_rocm], iree.fixedpoint.iteration = 0 : index} {
stream.executable private @batch_matmul_dispatch_0 {
stream.executable.export public @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
%cst = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>> -> tensor<64x968x1280xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [64, 1280, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>> -> tensor<64x1280x1280xf16>
%5 = tensor.empty() : tensor<64x968x1280xf16>
%6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%7 = linalg.batch_matmul ins(%3, %4 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%6 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : tensor<64x968x1280xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
return
}
}
}
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%c209715200 = arith.constant 209715200 : index
%c158597120 = arith.constant 158597120 : index
%c0 = arith.constant 0 : index
%c1280 = arith.constant 1280 : index
%c968 = arith.constant 968 : index
%c64 = arith.constant 64 : index
%element_type_f16 = hal.element_type<f16> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c64, %c968, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c64, %c1280, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<64x1280x1280xf16> in !stream.resource<external>{%c209715200}
%result, %result_timepoint = stream.resource.alloca uninitialized : !stream.resource<external>{%c158597120} => !stream.timepoint
%2 = stream.cmd.execute await(%result_timepoint) => with(%0 as %arg2: !stream.resource<external>{%c158597120}, %1 as %arg3: !stream.resource<external>{%c209715200}, %result as %arg4: !stream.resource<external>{%c158597120}) {
stream.cmd.dispatch @batch_matmul_dispatch_0::@batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 {
ro %arg2[%c0 for %c158597120] : !stream.resource<external>{%c158597120},
ro %arg3[%c0 for %c209715200] : !stream.resource<external>{%c209715200},
wo %arg4[%c0 for %c158597120] : !stream.resource<external>{%c158597120}
}
} => !stream.timepoint
%3 = stream.timepoint.await %2 => %result : !stream.resource<external>{%c158597120}
%4 = stream.tensor.export %3 : tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120} -> !hal.buffer_view
util.return %4 : !hal.buffer_view
}
}
// -----// IR Dump After FixedPointIterator (iree-util-fixed-point-iterator) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mfma_layout<F16_16x16x16_F32>, #iree_gpu.mfma_layout<F16_32x32x8_F32>], target_arch = "gfx942", ukernels = "none"}>
#device_target_rocm = #hal.device.target<"rocm", {legacy_sync}, [#executable_target_rocm_hsaco_fb]>
module attributes {hal.device.targets = [#device_target_rocm]} {
stream.executable private @batch_matmul_dispatch_0 {
stream.executable.export public @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
%cst = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>> -> tensor<64x968x1280xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [64, 1280, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>> -> tensor<64x1280x1280xf16>
%5 = tensor.empty() : tensor<64x968x1280xf16>
%6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%7 = linalg.batch_matmul ins(%3, %4 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%6 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : tensor<64x968x1280xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
return
}
}
}
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%c209715200 = arith.constant 209715200 : index
%c158597120 = arith.constant 158597120 : index
%c0 = arith.constant 0 : index
%c1280 = arith.constant 1280 : index
%c968 = arith.constant 968 : index
%c64 = arith.constant 64 : index
%element_type_f16 = hal.element_type<f16> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c64, %c968, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c64, %c1280, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<64x1280x1280xf16> in !stream.resource<external>{%c209715200}
%result, %result_timepoint = stream.resource.alloca uninitialized : !stream.resource<external>{%c158597120} => !stream.timepoint
%2 = stream.cmd.execute await(%result_timepoint) => with(%0 as %arg2: !stream.resource<external>{%c158597120}, %1 as %arg3: !stream.resource<external>{%c209715200}, %result as %arg4: !stream.resource<external>{%c158597120}) {
stream.cmd.dispatch @batch_matmul_dispatch_0::@batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 {
ro %arg2[%c0 for %c158597120] : !stream.resource<external>{%c158597120},
ro %arg3[%c0 for %c209715200] : !stream.resource<external>{%c209715200},
wo %arg4[%c0 for %c158597120] : !stream.resource<external>{%c158597120}
}
} => !stream.timepoint
%3 = stream.timepoint.await %2 => %result : !stream.resource<external>{%c158597120}
%4 = stream.tensor.export %3 : tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120} -> !hal.buffer_view
util.return %4 : !hal.buffer_view
}
}
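// Note: this fixed-point iteration dump is unchanged from the preceding dump;
// iree-util-fixed-point-iterator reruns its nested pass set until the IR stops
// changing, and it has converged here.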
// -----// IR Dump After FuseDispatchBindingsPass (iree-stream-fuse-dispatch-bindings) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mfma_layout<F16_16x16x16_F32>, #iree_gpu.mfma_layout<F16_32x32x8_F32>], target_arch = "gfx942", ukernels = "none"}>
#device_target_rocm = #hal.device.target<"rocm", {legacy_sync}, [#executable_target_rocm_hsaco_fb]>
module attributes {hal.device.targets = [#device_target_rocm]} {
stream.executable private @batch_matmul_dispatch_0 {
stream.executable.export public @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: index, %arg4: index, %arg5: index) {
%cst = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%arg3] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>>
%1 = stream.binding.subspan %arg1[%arg4] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>>
%2 = stream.binding.subspan %arg2[%arg5] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>> -> tensor<64x968x1280xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [64, 1280, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>> -> tensor<64x1280x1280xf16>
%5 = tensor.empty() : tensor<64x968x1280xf16>
%6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%7 = linalg.batch_matmul ins(%3, %4 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%6 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : tensor<64x968x1280xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
return
}
}
}
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%c209715200 = arith.constant 209715200 : index
%c158597120 = arith.constant 158597120 : index
%c0 = arith.constant 0 : index
%c1280 = arith.constant 1280 : index
%c968 = arith.constant 968 : index
%c64 = arith.constant 64 : index
%element_type_f16 = hal.element_type<f16> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c64, %c968, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c64, %c1280, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<64x1280x1280xf16> in !stream.resource<external>{%c209715200}
%result, %result_timepoint = stream.resource.alloca uninitialized : !stream.resource<external>{%c158597120} => !stream.timepoint
%c0_0 = arith.constant 0 : index
%2 = stream.cmd.execute await(%result_timepoint) => with(%0 as %arg2: !stream.resource<external>{%c158597120}, %1 as %arg3: !stream.resource<external>{%c209715200}, %result as %arg4: !stream.resource<external>{%c158597120}) {
stream.cmd.dispatch @batch_matmul_dispatch_0::@batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%c0, %c0, %c0 : index, index, index) {
ro %arg2[%c0_0 for %c158597120] : !stream.resource<external>{%c158597120},
ro %arg3[%c0_0 for %c209715200] : !stream.resource<external>{%c209715200},
wo %arg4[%c0_0 for %c158597120] : !stream.resource<external>{%c158597120}
}
} => !stream.timepoint
%3 = stream.timepoint.await %2 => %result : !stream.resource<external>{%c158597120}
%4 = stream.tensor.export %3 : tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120} -> !hal.buffer_view
util.return %4 : !hal.buffer_view
}
}
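// Note: FuseDispatchBindingsPass rewrote the dispatch interface so that each
// binding's byte offset is passed as an index operand (%arg3-%arg5) and the
// stream.binding.subspan ops index with those operands instead of a constant.
// This lets dispatch sites with different base offsets share one executable;
// here every site passes %c0, so the operands are uniformly zero and get
// folded away again later in the pipeline.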
// -----// IR Dump After AnnotateDispatchArgumentsPass (iree-stream-annotate-dispatch-arguments) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mfma_layout<F16_16x16x16_F32>, #iree_gpu.mfma_layout<F16_32x32x8_F32>], target_arch = "gfx942", ukernels = "none"}>
#device_target_rocm = #hal.device.target<"rocm", {legacy_sync}, [#executable_target_rocm_hsaco_fb]>
module attributes {hal.device.targets = [#device_target_rocm]} {
stream.executable private @batch_matmul_dispatch_0 {
stream.executable.export public @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}, %arg3: index {stream.values = [0 : index]}, %arg4: index {stream.values = [0 : index]}, %arg5: index {stream.values = [0 : index]}) {
%cst = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%arg3] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>>
%1 = stream.binding.subspan %arg1[%arg4] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>>
%2 = stream.binding.subspan %arg2[%arg5] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>> -> tensor<64x968x1280xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [64, 1280, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>> -> tensor<64x1280x1280xf16>
%5 = tensor.empty() : tensor<64x968x1280xf16>
%6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%7 = linalg.batch_matmul ins(%3, %4 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%6 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : tensor<64x968x1280xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
return
}
}
}
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%c209715200 = arith.constant 209715200 : index
%c158597120 = arith.constant 158597120 : index
%c0 = arith.constant 0 : index
%c1280 = arith.constant 1280 : index
%c968 = arith.constant 968 : index
%c64 = arith.constant 64 : index
%element_type_f16 = hal.element_type<f16> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c64, %c968, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c64, %c1280, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<64x1280x1280xf16> in !stream.resource<external>{%c209715200}
%result, %result_timepoint = stream.resource.alloca uninitialized : !stream.resource<external>{%c158597120} => !stream.timepoint
%c0_0 = arith.constant 0 : index
%2 = stream.cmd.execute await(%result_timepoint) => with(%0 as %arg2: !stream.resource<external>{%c158597120}, %1 as %arg3: !stream.resource<external>{%c209715200}, %result as %arg4: !stream.resource<external>{%c158597120}) {
stream.cmd.dispatch @batch_matmul_dispatch_0::@batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%c0, %c0, %c0 : index, index, index) {
ro %arg2[%c0_0 for %c158597120] : !stream.resource<external>{%c158597120},
ro %arg3[%c0_0 for %c209715200] : !stream.resource<external>{%c209715200},
wo %arg4[%c0_0 for %c158597120] : !stream.resource<external>{%c158597120}
}
} => !stream.timepoint
%3 = stream.timepoint.await %2 => %result : !stream.resource<external>{%c158597120}
%4 = stream.tensor.export %3 : tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120} -> !hal.buffer_view
util.return %4 : !hal.buffer_view
}
}
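// Note: AnnotateDispatchArgumentsPass added {stream.alignment = 64 : index}
// to each binding and {stream.values = [0 : index]} to each offset operand,
// recording the values observed across all dispatch sites so later passes can
// prove the operands uniform and fold them.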
// -----// IR Dump After PackDispatchOperandsPass (iree-stream-pack-dispatch-operands) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mfma_layout<F16_16x16x16_F32>, #iree_gpu.mfma_layout<F16_32x32x8_F32>], target_arch = "gfx942", ukernels = "none"}>
#device_target_rocm = #hal.device.target<"rocm", {legacy_sync}, [#executable_target_rocm_hsaco_fb]>
module attributes {hal.device.targets = [#device_target_rocm]} {
stream.executable private @batch_matmul_dispatch_0 {
stream.executable.export public @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}, %arg3: i32, %arg4: i32, %arg5: i32, %arg6: i32, %arg7: i32, %arg8: i32) {
%0 = arith.extui %arg3 : i32 to i64
%1 = arith.extui %arg4 : i32 to i64
%c32_i64 = arith.constant 32 : i64
%2 = arith.shli %1, %c32_i64 : i64
%3 = arith.ori %0, %2 : i64
%4 = arith.index_castui %3 {stream.values = [0 : index]} : i64 to index
%5 = arith.extui %arg5 : i32 to i64
%6 = arith.extui %arg6 : i32 to i64
%c32_i64_0 = arith.constant 32 : i64
%7 = arith.shli %6, %c32_i64_0 : i64
%8 = arith.ori %5, %7 : i64
%9 = arith.index_castui %8 {stream.values = [0 : index]} : i64 to index
%10 = arith.extui %arg7 : i32 to i64
%11 = arith.extui %arg8 : i32 to i64
%c32_i64_1 = arith.constant 32 : i64
%12 = arith.shli %11, %c32_i64_1 : i64
%13 = arith.ori %10, %12 : i64
%14 = arith.index_castui %13 {stream.values = [0 : index]} : i64 to index
%cst = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%15 = stream.binding.subspan %arg0[%4] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>>
%16 = stream.binding.subspan %arg1[%9] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>>
%17 = stream.binding.subspan %arg2[%14] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
%18 = flow.dispatch.tensor.load %15, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>> -> tensor<64x968x1280xf16>
%19 = flow.dispatch.tensor.load %16, offsets = [0, 0, 0], sizes = [64, 1280, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>> -> tensor<64x1280x1280xf16>
%20 = tensor.empty() : tensor<64x968x1280xf16>
%21 = linalg.fill ins(%cst : f16) outs(%20 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%22 = linalg.batch_matmul ins(%18, %19 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%21 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
flow.dispatch.tensor.store %22, %17, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : tensor<64x968x1280xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
return
}
}
}
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%c209715200 = arith.constant 209715200 : index
%c158597120 = arith.constant 158597120 : index
%c0 = arith.constant 0 : index
%c1280 = arith.constant 1280 : index
%c968 = arith.constant 968 : index
%c64 = arith.constant 64 : index
%element_type_f16 = hal.element_type<f16> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c64, %c968, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c64, %c1280, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<64x1280x1280xf16> in !stream.resource<external>{%c209715200}
%result, %result_timepoint = stream.resource.alloca uninitialized : !stream.resource<external>{%c158597120} => !stream.timepoint
%c0_0 = arith.constant 0 : index
%c0_i64 = arith.constant 0 : i64
%c0_i32 = arith.constant 0 : i32
%c32_i64 = arith.constant 32 : i64
%c0_i64_1 = arith.constant 0 : i64
%c0_i32_2 = arith.constant 0 : i32
%c0_i64_3 = arith.constant 0 : i64
%c0_i32_4 = arith.constant 0 : i32
%c32_i64_5 = arith.constant 32 : i64
%c0_i64_6 = arith.constant 0 : i64
%c0_i32_7 = arith.constant 0 : i32
%c0_i64_8 = arith.constant 0 : i64
%c0_i32_9 = arith.constant 0 : i32
%c32_i64_10 = arith.constant 32 : i64
%c0_i64_11 = arith.constant 0 : i64
%c0_i32_12 = arith.constant 0 : i32
%2 = stream.cmd.execute await(%result_timepoint) => with(%0 as %arg2: !stream.resource<external>{%c158597120}, %1 as %arg3: !stream.resource<external>{%c209715200}, %result as %arg4: !stream.resource<external>{%c158597120}) {
stream.cmd.dispatch @batch_matmul_dispatch_0::@batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%c0_i32, %c0_i32_2, %c0_i32_4, %c0_i32_7, %c0_i32_9, %c0_i32_12 : i32, i32, i32, i32, i32, i32) {
ro %arg2[%c0_0 for %c158597120] : !stream.resource<external>{%c158597120},
ro %arg3[%c0_0 for %c209715200] : !stream.resource<external>{%c209715200},
wo %arg4[%c0_0 for %c158597120] : !stream.resource<external>{%c158597120}
}
} => !stream.timepoint
%3 = stream.timepoint.await %2 => %result : !stream.resource<external>{%c158597120}
%4 = stream.tensor.export %3 : tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120} -> !hal.buffer_view
util.return %4 : !hal.buffer_view
}
}
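// Note: PackDispatchOperandsPass lowered each index operand to a pair of i32
// words (lo, hi), since dispatch operands are transported as 32-bit values
// here; the kernel reassembles each 64-bit offset with
// extui/shli/ori/index_castui. A minimal standalone sketch of that
// recombination (hypothetical %lo/%hi operands, not from this module):
//   func.func @unpack_offset(%lo: i32, %hi: i32) -> index {
//     %c32 = arith.constant 32 : i64
//     %lo64 = arith.extui %lo : i32 to i64  // zero-extend low word
//     %hi64 = arith.extui %hi : i32 to i64  // zero-extend high word
//     %sh = arith.shli %hi64, %c32 : i64    // move high word to bits 32..63
//     %or = arith.ori %lo64, %sh : i64      // or the two halves together
//     %off = arith.index_castui %or : i64 to index
//     return %off : index
//   }
// e.g. lo = 64, hi = 1 recombines to 1 * 2^32 + 64 = 4294967360.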
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%c0_i32 = arith.constant 0 : i32
%c209715200 = arith.constant 209715200 : index
%c158597120 = arith.constant 158597120 : index
%c0 = arith.constant 0 : index
%c1280 = arith.constant 1280 : index
%c968 = arith.constant 968 : index
%c64 = arith.constant 64 : index
%element_type_f16 = hal.element_type<f16> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c64, %c968, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c64, %c1280, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<64x1280x1280xf16> in !stream.resource<external>{%c209715200}
%result, %result_timepoint = stream.resource.alloca uninitialized : !stream.resource<external>{%c158597120} => !stream.timepoint
%2 = stream.cmd.execute await(%result_timepoint) => with(%0 as %arg2: !stream.resource<external>{%c158597120}, %1 as %arg3: !stream.resource<external>{%c209715200}, %result as %arg4: !stream.resource<external>{%c158597120}) {
stream.cmd.dispatch @batch_matmul_dispatch_0::@batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%c0_i32, %c0_i32, %c0_i32, %c0_i32, %c0_i32, %c0_i32 : i32, i32, i32, i32, i32, i32) {
ro %arg2[%c0 for %c158597120] : !stream.resource<external>{%c158597120},
ro %arg3[%c0 for %c209715200] : !stream.resource<external>{%c209715200},
wo %arg4[%c0 for %c158597120] : !stream.resource<external>{%c158597120}
}
} => !stream.timepoint
%3 = stream.timepoint.await %2 => %result : !stream.resource<external>{%c158597120}
%4 = stream.tensor.export %3 : tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120} -> !hal.buffer_view
util.return %4 : !hal.buffer_view
}
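// Note: canonicalization collapsed the dead packing constants on the host side
// (the duplicate i64/i32 zeros and the 32-bit shift amounts) down to the
// single %c0_i32 that feeds all six dispatch operands.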
// -----// IR Dump After CSE (cse) //----- //
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%c0_i32 = arith.constant 0 : i32
%c209715200 = arith.constant 209715200 : index
%c158597120 = arith.constant 158597120 : index
%c0 = arith.constant 0 : index
%c1280 = arith.constant 1280 : index
%c968 = arith.constant 968 : index
%c64 = arith.constant 64 : index
%element_type_f16 = hal.element_type<f16> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c64, %c968, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c64, %c1280, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<64x1280x1280xf16> in !stream.resource<external>{%c209715200}
%result, %result_timepoint = stream.resource.alloca uninitialized : !stream.resource<external>{%c158597120} => !stream.timepoint
%2 = stream.cmd.execute await(%result_timepoint) => with(%0 as %arg2: !stream.resource<external>{%c158597120}, %1 as %arg3: !stream.resource<external>{%c209715200}, %result as %arg4: !stream.resource<external>{%c158597120}) {
stream.cmd.dispatch @batch_matmul_dispatch_0::@batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%c0_i32, %c0_i32, %c0_i32, %c0_i32, %c0_i32, %c0_i32 : i32, i32, i32, i32, i32, i32) {
ro %arg2[%c0 for %c158597120] : !stream.resource<external>{%c158597120},
ro %arg3[%c0 for %c209715200] : !stream.resource<external>{%c209715200},
wo %arg4[%c0 for %c158597120] : !stream.resource<external>{%c158597120}
}
} => !stream.timepoint
%3 = stream.timepoint.await %2 => %result : !stream.resource<external>{%c158597120}
%4 = stream.tensor.export %3 : tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120} -> !hal.buffer_view
util.return %4 : !hal.buffer_view
}
// -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- //
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%c0_i32 = arith.constant 0 : i32
%c209715200 = arith.constant 209715200 : index
%c158597120 = arith.constant 158597120 : index
%c0 = arith.constant 0 : index
%c1280 = arith.constant 1280 : index
%c968 = arith.constant 968 : index
%c64 = arith.constant 64 : index
%element_type_f16 = hal.element_type<f16> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c64, %c968, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c64, %c1280, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<64x1280x1280xf16> in !stream.resource<external>{%c209715200}
%result, %result_timepoint = stream.resource.alloca uninitialized : !stream.resource<external>{%c158597120} => !stream.timepoint
%2 = stream.cmd.execute await(%result_timepoint) => with(%0 as %arg2: !stream.resource<external>{%c158597120}, %1 as %arg3: !stream.resource<external>{%c209715200}, %result as %arg4: !stream.resource<external>{%c158597120}) {
stream.cmd.dispatch @batch_matmul_dispatch_0::@batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%c0_i32, %c0_i32, %c0_i32, %c0_i32, %c0_i32, %c0_i32 : i32, i32, i32, i32, i32, i32) {
ro %arg2[%c0 for %c158597120] : !stream.resource<external>{%c158597120},
ro %arg3[%c0 for %c209715200] : !stream.resource<external>{%c209715200},
wo %arg4[%c0 for %c158597120] : !stream.resource<external>{%c158597120}
}
} => !stream.timepoint
%3 = stream.timepoint.await %2 => %result : !stream.resource<external>{%c158597120}
%4 = stream.tensor.export %3 : tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120} -> !hal.buffer_view
util.return %4 : !hal.buffer_view
}
// -----// IR Dump After ApplyPatterns (iree-util-apply-patterns) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mfma_layout<F16_16x16x16_F32>, #iree_gpu.mfma_layout<F16_32x32x8_F32>], target_arch = "gfx942", ukernels = "none"}>
#device_target_rocm = #hal.device.target<"rocm", {legacy_sync}, [#executable_target_rocm_hsaco_fb]>
module attributes {hal.device.targets = [#device_target_rocm]} {
stream.executable private @batch_matmul_dispatch_0 {
stream.executable.export public @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}, %arg3: i32, %arg4: i32, %arg5: i32, %arg6: i32, %arg7: i32, %arg8: i32) {
%cst = arith.constant 0.000000e+00 : f16
%c32_i64 = arith.constant 32 : i64
%0 = arith.extui %arg3 : i32 to i64
%1 = arith.extui %arg4 : i32 to i64
%2 = arith.shli %1, %c32_i64 : i64
%3 = arith.ori %0, %2 : i64
%4 = arith.index_castui %3 {stream.values = [0 : index]} : i64 to index
%5 = arith.extui %arg5 : i32 to i64
%6 = arith.extui %arg6 : i32 to i64
%7 = arith.shli %6, %c32_i64 : i64
%8 = arith.ori %5, %7 : i64
%9 = arith.index_castui %8 {stream.values = [0 : index]} : i64 to index
%10 = arith.extui %arg7 : i32 to i64
%11 = arith.extui %arg8 : i32 to i64
%12 = arith.shli %11, %c32_i64 : i64
%13 = arith.ori %10, %12 : i64
%14 = arith.index_castui %13 {stream.values = [0 : index]} : i64 to index
%15 = stream.binding.subspan %arg0[%4] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>>
%16 = stream.binding.subspan %arg1[%9] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>>
%17 = stream.binding.subspan %arg2[%14] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
%18 = flow.dispatch.tensor.load %15, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>> -> tensor<64x968x1280xf16>
%19 = flow.dispatch.tensor.load %16, offsets = [0, 0, 0], sizes = [64, 1280, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>> -> tensor<64x1280x1280xf16>
%20 = tensor.empty() : tensor<64x968x1280xf16>
%21 = linalg.fill ins(%cst : f16) outs(%20 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%22 = linalg.batch_matmul ins(%18, %19 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%21 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
flow.dispatch.tensor.store %22, %17, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : tensor<64x968x1280xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
return
}
}
}
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%c0_i32 = arith.constant 0 : i32
%c209715200 = arith.constant 209715200 : index
%c158597120 = arith.constant 158597120 : index
%c0 = arith.constant 0 : index
%c1280 = arith.constant 1280 : index
%c968 = arith.constant 968 : index
%c64 = arith.constant 64 : index
%element_type_f16 = hal.element_type<f16> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c64, %c968, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c64, %c1280, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<64x1280x1280xf16> in !stream.resource<external>{%c209715200}
%result, %result_timepoint = stream.resource.alloca uninitialized : !stream.resource<external>{%c158597120} => !stream.timepoint
%2 = stream.cmd.execute await(%result_timepoint) => with(%0 as %arg2: !stream.resource<external>{%c158597120}, %1 as %arg3: !stream.resource<external>{%c209715200}, %result as %arg4: !stream.resource<external>{%c158597120}) {
stream.cmd.dispatch @batch_matmul_dispatch_0::@batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%c0_i32, %c0_i32, %c0_i32, %c0_i32, %c0_i32, %c0_i32 : i32, i32, i32, i32, i32, i32) {
ro %arg2[%c0 for %c158597120] : !stream.resource<external>{%c158597120},
ro %arg3[%c0 for %c209715200] : !stream.resource<external>{%c209715200},
wo %arg4[%c0 for %c158597120] : !stream.resource<external>{%c158597120}
}
} => !stream.timepoint
%3 = stream.timepoint.await %2 => %result : !stream.resource<external>{%c158597120}
%4 = stream.tensor.export %3 : tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120} -> !hal.buffer_view
util.return %4 : !hal.buffer_view
}
}
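// Note: iree-util-apply-patterns uniqued the repeated `arith.constant 32 : i64`
// into one %c32_i64 and hoisted the constants to the top of the dispatch
// function; the host function is otherwise unchanged.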
// -----// IR Dump After FoldGlobals (iree-util-fold-globals) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mfma_layout<F16_16x16x16_F32>, #iree_gpu.mfma_layout<F16_32x32x8_F32>], target_arch = "gfx942", ukernels = "none"}>
#device_target_rocm = #hal.device.target<"rocm", {legacy_sync}, [#executable_target_rocm_hsaco_fb]>
module attributes {hal.device.targets = [#device_target_rocm]} {
stream.executable private @batch_matmul_dispatch_0 {
stream.executable.export public @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}, %arg3: i32, %arg4: i32, %arg5: i32, %arg6: i32, %arg7: i32, %arg8: i32) {
%cst = arith.constant 0.000000e+00 : f16
%c32_i64 = arith.constant 32 : i64
%0 = arith.extui %arg3 : i32 to i64
%1 = arith.extui %arg4 : i32 to i64
%2 = arith.shli %1, %c32_i64 : i64
%3 = arith.ori %0, %2 : i64
%4 = arith.index_castui %3 {stream.values = [0 : index]} : i64 to index
%5 = arith.extui %arg5 : i32 to i64
%6 = arith.extui %arg6 : i32 to i64
%7 = arith.shli %6, %c32_i64 : i64
%8 = arith.ori %5, %7 : i64
%9 = arith.index_castui %8 {stream.values = [0 : index]} : i64 to index
%10 = arith.extui %arg7 : i32 to i64
%11 = arith.extui %arg8 : i32 to i64
%12 = arith.shli %11, %c32_i64 : i64
%13 = arith.ori %10, %12 : i64
%14 = arith.index_castui %13 {stream.values = [0 : index]} : i64 to index
%15 = stream.binding.subspan %arg0[%4] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>>
%16 = stream.binding.subspan %arg1[%9] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>>
%17 = stream.binding.subspan %arg2[%14] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
%18 = flow.dispatch.tensor.load %15, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>> -> tensor<64x968x1280xf16>
%19 = flow.dispatch.tensor.load %16, offsets = [0, 0, 0], sizes = [64, 1280, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>> -> tensor<64x1280x1280xf16>
%20 = tensor.empty() : tensor<64x968x1280xf16>
%21 = linalg.fill ins(%cst : f16) outs(%20 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%22 = linalg.batch_matmul ins(%18, %19 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%21 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
flow.dispatch.tensor.store %22, %17, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : tensor<64x968x1280xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
return
}
}
}
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%c0_i32 = arith.constant 0 : i32
%c209715200 = arith.constant 209715200 : index
%c158597120 = arith.constant 158597120 : index
%c0 = arith.constant 0 : index
%c1280 = arith.constant 1280 : index
%c968 = arith.constant 968 : index
%c64 = arith.constant 64 : index
%element_type_f16 = hal.element_type<f16> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c64, %c968, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c64, %c1280, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<64x1280x1280xf16> in !stream.resource<external>{%c209715200}
%result, %result_timepoint = stream.resource.alloca uninitialized : !stream.resource<external>{%c158597120} => !stream.timepoint
%2 = stream.cmd.execute await(%result_timepoint) => with(%0 as %arg2: !stream.resource<external>{%c158597120}, %1 as %arg3: !stream.resource<external>{%c209715200}, %result as %arg4: !stream.resource<external>{%c158597120}) {
stream.cmd.dispatch @batch_matmul_dispatch_0::@batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%c0_i32, %c0_i32, %c0_i32, %c0_i32, %c0_i32, %c0_i32 : i32, i32, i32, i32, i32, i32) {
ro %arg2[%c0 for %c158597120] : !stream.resource<external>{%c158597120},
ro %arg3[%c0 for %c209715200] : !stream.resource<external>{%c209715200},
wo %arg4[%c0 for %c158597120] : !stream.resource<external>{%c158597120}
}
} => !stream.timepoint
%3 = stream.timepoint.await %2 => %result : !stream.resource<external>{%c158597120}
%4 = stream.tensor.export %3 : tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120} -> !hal.buffer_view
util.return %4 : !hal.buffer_view
}
}
// -----// IR Dump After FuseGlobals (iree-util-fuse-globals) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mfma_layout<F16_16x16x16_F32>, #iree_gpu.mfma_layout<F16_32x32x8_F32>], target_arch = "gfx942", ukernels = "none"}>
#device_target_rocm = #hal.device.target<"rocm", {legacy_sync}, [#executable_target_rocm_hsaco_fb]>
module attributes {hal.device.targets = [#device_target_rocm]} {
stream.executable private @batch_matmul_dispatch_0 {
stream.executable.export public @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}, %arg3: i32, %arg4: i32, %arg5: i32, %arg6: i32, %arg7: i32, %arg8: i32) {
%cst = arith.constant 0.000000e+00 : f16
%c32_i64 = arith.constant 32 : i64
%0 = arith.extui %arg3 : i32 to i64
%1 = arith.extui %arg4 : i32 to i64
%2 = arith.shli %1, %c32_i64 : i64
%3 = arith.ori %0, %2 : i64
%4 = arith.index_castui %3 {stream.values = [0 : index]} : i64 to index
%5 = arith.extui %arg5 : i32 to i64
%6 = arith.extui %arg6 : i32 to i64
%7 = arith.shli %6, %c32_i64 : i64
%8 = arith.ori %5, %7 : i64
%9 = arith.index_castui %8 {stream.values = [0 : index]} : i64 to index
%10 = arith.extui %arg7 : i32 to i64
%11 = arith.extui %arg8 : i32 to i64
%12 = arith.shli %11, %c32_i64 : i64
%13 = arith.ori %10, %12 : i64
%14 = arith.index_castui %13 {stream.values = [0 : index]} : i64 to index
%15 = stream.binding.subspan %arg0[%4] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>>
%16 = stream.binding.subspan %arg1[%9] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>>
%17 = stream.binding.subspan %arg2[%14] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
%18 = flow.dispatch.tensor.load %15, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>> -> tensor<64x968x1280xf16>
%19 = flow.dispatch.tensor.load %16, offsets = [0, 0, 0], sizes = [64, 1280, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>> -> tensor<64x1280x1280xf16>
%20 = tensor.empty() : tensor<64x968x1280xf16>
%21 = linalg.fill ins(%cst : f16) outs(%20 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%22 = linalg.batch_matmul ins(%18, %19 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%21 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
flow.dispatch.tensor.store %22, %17, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : tensor<64x968x1280xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
return
}
}
}
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%c0_i32 = arith.constant 0 : i32
%c209715200 = arith.constant 209715200 : index
%c158597120 = arith.constant 158597120 : index
%c0 = arith.constant 0 : index
%c1280 = arith.constant 1280 : index
%c968 = arith.constant 968 : index
%c64 = arith.constant 64 : index
%element_type_f16 = hal.element_type<f16> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c64, %c968, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c64, %c1280, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<64x1280x1280xf16> in !stream.resource<external>{%c209715200}
%result, %result_timepoint = stream.resource.alloca uninitialized : !stream.resource<external>{%c158597120} => !stream.timepoint
%2 = stream.cmd.execute await(%result_timepoint) => with(%0 as %arg2: !stream.resource<external>{%c158597120}, %1 as %arg3: !stream.resource<external>{%c209715200}, %result as %arg4: !stream.resource<external>{%c158597120}) {
stream.cmd.dispatch @batch_matmul_dispatch_0::@batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%c0_i32, %c0_i32, %c0_i32, %c0_i32, %c0_i32, %c0_i32 : i32, i32, i32, i32, i32, i32) {
ro %arg2[%c0 for %c158597120] : !stream.resource<external>{%c158597120},
ro %arg3[%c0 for %c209715200] : !stream.resource<external>{%c209715200},
wo %arg4[%c0 for %c158597120] : !stream.resource<external>{%c158597120}
}
} => !stream.timepoint
%3 = stream.timepoint.await %2 => %result : !stream.resource<external>{%c158597120}
%4 = stream.tensor.export %3 : tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120} -> !hal.buffer_view
util.return %4 : !hal.buffer_view
}
}
// -----// IR Dump After IPO (iree-util-ipo) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mfma_layout<F16_16x16x16_F32>, #iree_gpu.mfma_layout<F16_32x32x8_F32>], target_arch = "gfx942", ukernels = "none"}>
#device_target_rocm = #hal.device.target<"rocm", {legacy_sync}, [#executable_target_rocm_hsaco_fb]>
module attributes {hal.device.targets = [#device_target_rocm]} {
stream.executable private @batch_matmul_dispatch_0 {
stream.executable.export public @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}, %arg3: i32, %arg4: i32, %arg5: i32, %arg6: i32, %arg7: i32, %arg8: i32) {
%cst = arith.constant 0.000000e+00 : f16
%c32_i64 = arith.constant 32 : i64
%0 = arith.extui %arg3 : i32 to i64
%1 = arith.extui %arg4 : i32 to i64
%2 = arith.shli %1, %c32_i64 : i64
%3 = arith.ori %0, %2 : i64
%4 = arith.index_castui %3 {stream.values = [0 : index]} : i64 to index
%5 = arith.extui %arg5 : i32 to i64
%6 = arith.extui %arg6 : i32 to i64
%7 = arith.shli %6, %c32_i64 : i64
%8 = arith.ori %5, %7 : i64
%9 = arith.index_castui %8 {stream.values = [0 : index]} : i64 to index
%10 = arith.extui %arg7 : i32 to i64
%11 = arith.extui %arg8 : i32 to i64
%12 = arith.shli %11, %c32_i64 : i64
%13 = arith.ori %10, %12 : i64
%14 = arith.index_castui %13 {stream.values = [0 : index]} : i64 to index
%15 = stream.binding.subspan %arg0[%4] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>>
%16 = stream.binding.subspan %arg1[%9] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>>
%17 = stream.binding.subspan %arg2[%14] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
%18 = flow.dispatch.tensor.load %15, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>> -> tensor<64x968x1280xf16>
%19 = flow.dispatch.tensor.load %16, offsets = [0, 0, 0], sizes = [64, 1280, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>> -> tensor<64x1280x1280xf16>
%20 = tensor.empty() : tensor<64x968x1280xf16>
%21 = linalg.fill ins(%cst : f16) outs(%20 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%22 = linalg.batch_matmul ins(%18, %19 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%21 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
flow.dispatch.tensor.store %22, %17, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : tensor<64x968x1280xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
return
}
}
}
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%c0_i32 = arith.constant 0 : i32
%c209715200 = arith.constant 209715200 : index
%c158597120 = arith.constant 158597120 : index
%c0 = arith.constant 0 : index
%c1280 = arith.constant 1280 : index
%c968 = arith.constant 968 : index
%c64 = arith.constant 64 : index
%element_type_f16 = hal.element_type<f16> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c64, %c968, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c64, %c1280, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<64x1280x1280xf16> in !stream.resource<external>{%c209715200}
%result, %result_timepoint = stream.resource.alloca uninitialized : !stream.resource<external>{%c158597120} => !stream.timepoint
%2 = stream.cmd.execute await(%result_timepoint) => with(%0 as %arg2: !stream.resource<external>{%c158597120}, %1 as %arg3: !stream.resource<external>{%c209715200}, %result as %arg4: !stream.resource<external>{%c158597120}) {
stream.cmd.dispatch @batch_matmul_dispatch_0::@batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%c0_i32, %c0_i32, %c0_i32, %c0_i32, %c0_i32, %c0_i32 : i32, i32, i32, i32, i32, i32) {
ro %arg2[%c0 for %c158597120] : !stream.resource<external>{%c158597120},
ro %arg3[%c0 for %c209715200] : !stream.resource<external>{%c209715200},
wo %arg4[%c0 for %c158597120] : !stream.resource<external>{%c158597120}
}
} => !stream.timepoint
%3 = stream.timepoint.await %2 => %result : !stream.resource<external>{%c158597120}
%4 = stream.tensor.export %3 : tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120} -> !hal.buffer_view
util.return %4 : !hal.buffer_view
}
}
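// Note: the FoldGlobals, FuseGlobals, and IPO dumps above are identical; this
// module has no util.global ops and only the one public function, so none of
// the three passes found anything to rewrite.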
// -----// IR Dump After FoldUniformOperandsPass (iree-stream-fold-uniform-operands) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mfma_layout<F16_16x16x16_F32>, #iree_gpu.mfma_layout<F16_32x32x8_F32>], target_arch = "gfx942", ukernels = "none"}>
#device_target_rocm = #hal.device.target<"rocm", {legacy_sync}, [#executable_target_rocm_hsaco_fb]>
module attributes {hal.device.targets = [#device_target_rocm]} {
stream.executable private @batch_matmul_dispatch_0 {
stream.executable.export public @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}) {
%c0_i32 = arith.constant 0 : i32
%cst = arith.constant 0.000000e+00 : f16
%c32_i64 = arith.constant 32 : i64
%0 = arith.extui %c0_i32 : i32 to i64
%1 = arith.extui %c0_i32 : i32 to i64
%2 = arith.shli %1, %c32_i64 : i64
%3 = arith.ori %0, %2 : i64
%4 = arith.index_castui %3 {stream.values = [0 : index]} : i64 to index
%5 = arith.extui %c0_i32 : i32 to i64
%6 = arith.extui %c0_i32 : i32 to i64
%7 = arith.shli %6, %c32_i64 : i64
%8 = arith.ori %5, %7 : i64
%9 = arith.index_castui %8 {stream.values = [0 : index]} : i64 to index
%10 = arith.extui %c0_i32 : i32 to i64
%11 = arith.extui %c0_i32 : i32 to i64
%12 = arith.shli %11, %c32_i64 : i64
%13 = arith.ori %10, %12 : i64
%14 = arith.index_castui %13 {stream.values = [0 : index]} : i64 to index
%15 = stream.binding.subspan %arg0[%4] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>>
%16 = stream.binding.subspan %arg1[%9] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>>
%17 = stream.binding.subspan %arg2[%14] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
%18 = flow.dispatch.tensor.load %15, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>> -> tensor<64x968x1280xf16>
%19 = flow.dispatch.tensor.load %16, offsets = [0, 0, 0], sizes = [64, 1280, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>> -> tensor<64x1280x1280xf16>
%20 = tensor.empty() : tensor<64x968x1280xf16>
%21 = linalg.fill ins(%cst : f16) outs(%20 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%22 = linalg.batch_matmul ins(%18, %19 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%21 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
flow.dispatch.tensor.store %22, %17, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : tensor<64x968x1280xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
return
}
}
}
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%c0_i32 = arith.constant 0 : i32
%c209715200 = arith.constant 209715200 : index
%c158597120 = arith.constant 158597120 : index
%c0 = arith.constant 0 : index
%c1280 = arith.constant 1280 : index
%c968 = arith.constant 968 : index
%c64 = arith.constant 64 : index
%element_type_f16 = hal.element_type<f16> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c64, %c968, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c64, %c1280, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<64x1280x1280xf16> in !stream.resource<external>{%c209715200}
%result, %result_timepoint = stream.resource.alloca uninitialized : !stream.resource<external>{%c158597120} => !stream.timepoint
%2 = stream.cmd.execute await(%result_timepoint) => with(%0 as %arg2: !stream.resource<external>{%c158597120}, %1 as %arg3: !stream.resource<external>{%c209715200}, %result as %arg4: !stream.resource<external>{%c158597120}) {
stream.cmd.dispatch @batch_matmul_dispatch_0::@batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 {
ro %arg2[%c0 for %c158597120] : !stream.resource<external>{%c158597120},
ro %arg3[%c0 for %c209715200] : !stream.resource<external>{%c209715200},
wo %arg4[%c0 for %c158597120] : !stream.resource<external>{%c158597120}
}
} => !stream.timepoint
%3 = stream.timepoint.await %2 => %result : !stream.resource<external>{%c158597120}
%4 = stream.tensor.export %3 : tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120} -> !hal.buffer_view
util.return %4 : !hal.buffer_view
}
}
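// Note: FoldUniformOperandsPass saw that every dispatch site passes the same
// six zero words, so it inlined %c0_i32 into the executable and stripped the
// operands from both the func signature and the stream.cmd.dispatch site. The
// now-constant extui/shli/ori chains are left for the following
// canonicalize/CSE to fold, after which the subspan offsets reduce to %c0.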
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%c209715200 = arith.constant 209715200 : index
%c158597120 = arith.constant 158597120 : index
%c0 = arith.constant 0 : index
%c1280 = arith.constant 1280 : index
%c968 = arith.constant 968 : index
%c64 = arith.constant 64 : index
%element_type_f16 = hal.element_type<f16> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c64, %c968, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c64, %c1280, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<64x1280x1280xf16> in !stream.resource<external>{%c209715200}
%result, %result_timepoint = stream.resource.alloca uninitialized : !stream.resource<external>{%c158597120} => !stream.timepoint
%2 = stream.cmd.execute await(%result_timepoint) => with(%0 as %arg2: !stream.resource<external>{%c158597120}, %1 as %arg3: !stream.resource<external>{%c209715200}, %result as %arg4: !stream.resource<external>{%c158597120}) {
stream.cmd.dispatch @batch_matmul_dispatch_0::@batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 {
ro %arg2[%c0 for %c158597120] : !stream.resource<external>{%c158597120},
ro %arg3[%c0 for %c209715200] : !stream.resource<external>{%c209715200},
wo %arg4[%c0 for %c158597120] : !stream.resource<external>{%c158597120}
}
} => !stream.timepoint
%3 = stream.timepoint.await %2 => %result : !stream.resource<external>{%c158597120}
%4 = stream.tensor.export %3 : tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120} -> !hal.buffer_view
util.return %4 : !hal.buffer_view
}
// -----// IR Dump After CSE (cse) //----- //
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%c209715200 = arith.constant 209715200 : index
%c158597120 = arith.constant 158597120 : index
%c0 = arith.constant 0 : index
%c1280 = arith.constant 1280 : index
%c968 = arith.constant 968 : index
%c64 = arith.constant 64 : index
%element_type_f16 = hal.element_type<f16> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c64, %c968, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c64, %c1280, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<64x1280x1280xf16> in !stream.resource<external>{%c209715200}
%result, %result_timepoint = stream.resource.alloca uninitialized : !stream.resource<external>{%c158597120} => !stream.timepoint
%2 = stream.cmd.execute await(%result_timepoint) => with(%0 as %arg2: !stream.resource<external>{%c158597120}, %1 as %arg3: !stream.resource<external>{%c209715200}, %result as %arg4: !stream.resource<external>{%c158597120}) {
stream.cmd.dispatch @batch_matmul_dispatch_0::@batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 {
ro %arg2[%c0 for %c158597120] : !stream.resource<external>{%c158597120},
ro %arg3[%c0 for %c209715200] : !stream.resource<external>{%c209715200},
wo %arg4[%c0 for %c158597120] : !stream.resource<external>{%c158597120}
}
} => !stream.timepoint
%3 = stream.timepoint.await %2 => %result : !stream.resource<external>{%c158597120}
%4 = stream.tensor.export %3 : tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120} -> !hal.buffer_view
util.return %4 : !hal.buffer_view
}
// -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- //
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%c209715200 = arith.constant 209715200 : index
%c158597120 = arith.constant 158597120 : index
%c0 = arith.constant 0 : index
%c1280 = arith.constant 1280 : index
%c968 = arith.constant 968 : index
%c64 = arith.constant 64 : index
%element_type_f16 = hal.element_type<f16> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c64, %c968, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c64, %c1280, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<64x1280x1280xf16> in !stream.resource<external>{%c209715200}
%result, %result_timepoint = stream.resource.alloca uninitialized : !stream.resource<external>{%c158597120} => !stream.timepoint
%2 = stream.cmd.execute await(%result_timepoint) => with(%0 as %arg2: !stream.resource<external>{%c158597120}, %1 as %arg3: !stream.resource<external>{%c209715200}, %result as %arg4: !stream.resource<external>{%c158597120}) {
stream.cmd.dispatch @batch_matmul_dispatch_0::@batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 {
ro %arg2[%c0 for %c158597120] : !stream.resource<external>{%c158597120},
ro %arg3[%c0 for %c209715200] : !stream.resource<external>{%c209715200},
wo %arg4[%c0 for %c158597120] : !stream.resource<external>{%c158597120}
}
} => !stream.timepoint
%3 = stream.timepoint.await %2 => %result : !stream.resource<external>{%c158597120}
%4 = stream.tensor.export %3 : tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120} -> !hal.buffer_view
util.return %4 : !hal.buffer_view
}
// -----// IR Dump After ApplyPatterns (iree-util-apply-patterns) //----- //
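// Note: iree-util-apply-patterns greedily applies a fixed set of folding and cleanup
// patterns across the module. From here on the dumps print the whole module
// (module-scoped passes), including the stream.executable formed earlier for the
// batch_matmul dispatch.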
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mfma_layout<F16_16x16x16_F32>, #iree_gpu.mfma_layout<F16_32x32x8_F32>], target_arch = "gfx942", ukernels = "none"}>
#device_target_rocm = #hal.device.target<"rocm", {legacy_sync}, [#executable_target_rocm_hsaco_fb]>
module attributes {hal.device.targets = [#device_target_rocm]} {
stream.executable private @batch_matmul_dispatch_0 {
stream.executable.export public @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}) {
%cst = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>> -> tensor<64x968x1280xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [64, 1280, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>> -> tensor<64x1280x1280xf16>
%5 = tensor.empty() : tensor<64x968x1280xf16>
%6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%7 = linalg.batch_matmul ins(%3, %4 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%6 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : tensor<64x968x1280xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
return
}
}
}
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%c209715200 = arith.constant 209715200 : index
%c158597120 = arith.constant 158597120 : index
%c0 = arith.constant 0 : index
%c1280 = arith.constant 1280 : index
%c968 = arith.constant 968 : index
%c64 = arith.constant 64 : index
%element_type_f16 = hal.element_type<f16> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c64, %c968, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c64, %c1280, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<64x1280x1280xf16> in !stream.resource<external>{%c209715200}
%result, %result_timepoint = stream.resource.alloca uninitialized : !stream.resource<external>{%c158597120} => !stream.timepoint
%2 = stream.cmd.execute await(%result_timepoint) => with(%0 as %arg2: !stream.resource<external>{%c158597120}, %1 as %arg3: !stream.resource<external>{%c209715200}, %result as %arg4: !stream.resource<external>{%c158597120}) {
stream.cmd.dispatch @batch_matmul_dispatch_0::@batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 {
ro %arg2[%c0 for %c158597120] : !stream.resource<external>{%c158597120},
ro %arg3[%c0 for %c209715200] : !stream.resource<external>{%c209715200},
wo %arg4[%c0 for %c158597120] : !stream.resource<external>{%c158597120}
}
} => !stream.timepoint
%3 = stream.timepoint.await %2 => %result : !stream.resource<external>{%c158597120}
%4 = stream.tensor.export %3 : tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120} -> !hal.buffer_view
util.return %4 : !hal.buffer_view
}
}
// -----// IR Dump After FoldGlobals (iree-util-fold-globals) //----- //
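// Note: iree-util-fold-globals inlines constant util.globals into their uses and drops
// dead ones; with no globals present the IR is unchanged.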
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mfma_layout<F16_16x16x16_F32>, #iree_gpu.mfma_layout<F16_32x32x8_F32>], target_arch = "gfx942", ukernels = "none"}>
#device_target_rocm = #hal.device.target<"rocm", {legacy_sync}, [#executable_target_rocm_hsaco_fb]>
module attributes {hal.device.targets = [#device_target_rocm]} {
stream.executable private @batch_matmul_dispatch_0 {
stream.executable.export public @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}) {
%cst = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>> -> tensor<64x968x1280xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [64, 1280, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>> -> tensor<64x1280x1280xf16>
%5 = tensor.empty() : tensor<64x968x1280xf16>
%6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%7 = linalg.batch_matmul ins(%3, %4 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%6 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : tensor<64x968x1280xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
return
}
}
}
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%c209715200 = arith.constant 209715200 : index
%c158597120 = arith.constant 158597120 : index
%c0 = arith.constant 0 : index
%c1280 = arith.constant 1280 : index
%c968 = arith.constant 968 : index
%c64 = arith.constant 64 : index
%element_type_f16 = hal.element_type<f16> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c64, %c968, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c64, %c1280, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<64x1280x1280xf16> in !stream.resource<external>{%c209715200}
%result, %result_timepoint = stream.resource.alloca uninitialized : !stream.resource<external>{%c158597120} => !stream.timepoint
%2 = stream.cmd.execute await(%result_timepoint) => with(%0 as %arg2: !stream.resource<external>{%c158597120}, %1 as %arg3: !stream.resource<external>{%c209715200}, %result as %arg4: !stream.resource<external>{%c158597120}) {
stream.cmd.dispatch @batch_matmul_dispatch_0::@batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 {
ro %arg2[%c0 for %c158597120] : !stream.resource<external>{%c158597120},
ro %arg3[%c0 for %c209715200] : !stream.resource<external>{%c209715200},
wo %arg4[%c0 for %c158597120] : !stream.resource<external>{%c158597120}
}
} => !stream.timepoint
%3 = stream.timepoint.await %2 => %result : !stream.resource<external>{%c158597120}
%4 = stream.tensor.export %3 : tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120} -> !hal.buffer_view
util.return %4 : !hal.buffer_view
}
}
// -----// IR Dump After FuseGlobals (iree-util-fuse-globals) //----- //
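// Note: iree-util-fuse-globals merges util.globals that are known to always hold the
// same value; again a no-op for this global-free module.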
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mfma_layout<F16_16x16x16_F32>, #iree_gpu.mfma_layout<F16_32x32x8_F32>], target_arch = "gfx942", ukernels = "none"}>
#device_target_rocm = #hal.device.target<"rocm", {legacy_sync}, [#executable_target_rocm_hsaco_fb]>
module attributes {hal.device.targets = [#device_target_rocm]} {
stream.executable private @batch_matmul_dispatch_0 {
stream.executable.export public @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}) {
%cst = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>> -> tensor<64x968x1280xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [64, 1280, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>> -> tensor<64x1280x1280xf16>
%5 = tensor.empty() : tensor<64x968x1280xf16>
%6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%7 = linalg.batch_matmul ins(%3, %4 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%6 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : tensor<64x968x1280xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
return
}
}
}
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%c209715200 = arith.constant 209715200 : index
%c158597120 = arith.constant 158597120 : index
%c0 = arith.constant 0 : index
%c1280 = arith.constant 1280 : index
%c968 = arith.constant 968 : index
%c64 = arith.constant 64 : index
%element_type_f16 = hal.element_type<f16> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c64, %c968, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c64, %c1280, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<64x1280x1280xf16> in !stream.resource<external>{%c209715200}
%result, %result_timepoint = stream.resource.alloca uninitialized : !stream.resource<external>{%c158597120} => !stream.timepoint
%2 = stream.cmd.execute await(%result_timepoint) => with(%0 as %arg2: !stream.resource<external>{%c158597120}, %1 as %arg3: !stream.resource<external>{%c209715200}, %result as %arg4: !stream.resource<external>{%c158597120}) {
stream.cmd.dispatch @batch_matmul_dispatch_0::@batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 {
ro %arg2[%c0 for %c158597120] : !stream.resource<external>{%c158597120},
ro %arg3[%c0 for %c209715200] : !stream.resource<external>{%c209715200},
wo %arg4[%c0 for %c158597120] : !stream.resource<external>{%c158597120}
}
} => !stream.timepoint
%3 = stream.timepoint.await %2 => %result : !stream.resource<external>{%c158597120}
%4 = stream.tensor.export %3 : tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120} -> !hal.buffer_view
util.return %4 : !hal.buffer_view
}
}
// -----// IR Dump After IPO (iree-util-ipo) //----- //
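// Note: iree-util-ipo performs interprocedural optimization across util.func calls
// (e.g. propagating constant arguments/results and dropping unused ones). With a single
// public entry point and no internal callees there is nothing to propagate.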
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mfma_layout<F16_16x16x16_F32>, #iree_gpu.mfma_layout<F16_32x32x8_F32>], target_arch = "gfx942", ukernels = "none"}>
#device_target_rocm = #hal.device.target<"rocm", {legacy_sync}, [#executable_target_rocm_hsaco_fb]>
module attributes {hal.device.targets = [#device_target_rocm]} {
stream.executable private @batch_matmul_dispatch_0 {
stream.executable.export public @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}) {
%cst = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>> -> tensor<64x968x1280xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [64, 1280, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>> -> tensor<64x1280x1280xf16>
%5 = tensor.empty() : tensor<64x968x1280xf16>
%6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%7 = linalg.batch_matmul ins(%3, %4 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%6 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : tensor<64x968x1280xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
return
}
}
}
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%c209715200 = arith.constant 209715200 : index
%c158597120 = arith.constant 158597120 : index
%c0 = arith.constant 0 : index
%c1280 = arith.constant 1280 : index
%c968 = arith.constant 968 : index
%c64 = arith.constant 64 : index
%element_type_f16 = hal.element_type<f16> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c64, %c968, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c64, %c1280, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<64x1280x1280xf16> in !stream.resource<external>{%c209715200}
%result, %result_timepoint = stream.resource.alloca uninitialized : !stream.resource<external>{%c158597120} => !stream.timepoint
%2 = stream.cmd.execute await(%result_timepoint) => with(%0 as %arg2: !stream.resource<external>{%c158597120}, %1 as %arg3: !stream.resource<external>{%c209715200}, %result as %arg4: !stream.resource<external>{%c158597120}) {
stream.cmd.dispatch @batch_matmul_dispatch_0::@batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 {
ro %arg2[%c0 for %c158597120] : !stream.resource<external>{%c158597120},
ro %arg3[%c0 for %c209715200] : !stream.resource<external>{%c209715200},
wo %arg4[%c0 for %c158597120] : !stream.resource<external>{%c158597120}
}
} => !stream.timepoint
%3 = stream.timepoint.await %2 => %result : !stream.resource<external>{%c158597120}
%4 = stream.tensor.export %3 : tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120} -> !hal.buffer_view
util.return %4 : !hal.buffer_view
}
}
// -----// IR Dump After SymbolDCE (symbol-dce) //----- //
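// Note: symbol-dce removes symbols not transitively reachable from public symbols.
// @batch_matmul_dispatch_0 is private but survives because the stream.cmd.dispatch in
// @batch_matmul references it.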
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mfma_layout<F16_16x16x16_F32>, #iree_gpu.mfma_layout<F16_32x32x8_F32>], target_arch = "gfx942", ukernels = "none"}>
#device_target_rocm = #hal.device.target<"rocm", {legacy_sync}, [#executable_target_rocm_hsaco_fb]>
module attributes {hal.device.targets = [#device_target_rocm]} {
stream.executable private @batch_matmul_dispatch_0 {
stream.executable.export public @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}) {
%cst = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>> -> tensor<64x968x1280xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [64, 1280, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>> -> tensor<64x1280x1280xf16>
%5 = tensor.empty() : tensor<64x968x1280xf16>
%6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%7 = linalg.batch_matmul ins(%3, %4 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%6 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : tensor<64x968x1280xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
return
}
}
}
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%c209715200 = arith.constant 209715200 : index
%c158597120 = arith.constant 158597120 : index
%c0 = arith.constant 0 : index
%c1280 = arith.constant 1280 : index
%c968 = arith.constant 968 : index
%c64 = arith.constant 64 : index
%element_type_f16 = hal.element_type<f16> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c64, %c968, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c64, %c1280, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<64x1280x1280xf16> in !stream.resource<external>{%c209715200}
%result, %result_timepoint = stream.resource.alloca uninitialized : !stream.resource<external>{%c158597120} => !stream.timepoint
%2 = stream.cmd.execute await(%result_timepoint) => with(%0 as %arg2: !stream.resource<external>{%c158597120}, %1 as %arg3: !stream.resource<external>{%c209715200}, %result as %arg4: !stream.resource<external>{%c158597120}) {
stream.cmd.dispatch @batch_matmul_dispatch_0::@batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 {
ro %arg2[%c0 for %c158597120] : !stream.resource<external>{%c158597120},
ro %arg3[%c0 for %c209715200] : !stream.resource<external>{%c209715200},
wo %arg4[%c0 for %c158597120] : !stream.resource<external>{%c158597120}
}
} => !stream.timepoint
%3 = stream.timepoint.await %2 => %result : !stream.resource<external>{%c158597120}
%4 = stream.tensor.export %3 : tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120} -> !hal.buffer_view
util.return %4 : !hal.buffer_view
}
}
// -----// IR Dump After CSE (cse) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mfma_layout<F16_16x16x16_F32>, #iree_gpu.mfma_layout<F16_32x32x8_F32>], target_arch = "gfx942", ukernels = "none"}>
#device_target_rocm = #hal.device.target<"rocm", {legacy_sync}, [#executable_target_rocm_hsaco_fb]>
module attributes {hal.device.targets = [#device_target_rocm]} {
stream.executable private @batch_matmul_dispatch_0 {
stream.executable.export public @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}) {
%cst = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>> -> tensor<64x968x1280xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [64, 1280, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>> -> tensor<64x1280x1280xf16>
%5 = tensor.empty() : tensor<64x968x1280xf16>
%6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%7 = linalg.batch_matmul ins(%3, %4 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%6 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : tensor<64x968x1280xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
return
}
}
}
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%c209715200 = arith.constant 209715200 : index
%c158597120 = arith.constant 158597120 : index
%c0 = arith.constant 0 : index
%c1280 = arith.constant 1280 : index
%c968 = arith.constant 968 : index
%c64 = arith.constant 64 : index
%element_type_f16 = hal.element_type<f16> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c64, %c968, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c64, %c1280, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<64x1280x1280xf16> in !stream.resource<external>{%c209715200}
%result, %result_timepoint = stream.resource.alloca uninitialized : !stream.resource<external>{%c158597120} => !stream.timepoint
%2 = stream.cmd.execute await(%result_timepoint) => with(%0 as %arg2: !stream.resource<external>{%c158597120}, %1 as %arg3: !stream.resource<external>{%c209715200}, %result as %arg4: !stream.resource<external>{%c158597120}) {
stream.cmd.dispatch @batch_matmul_dispatch_0::@batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 {
ro %arg2[%c0 for %c158597120] : !stream.resource<external>{%c158597120},
ro %arg3[%c0 for %c209715200] : !stream.resource<external>{%c209715200},
wo %arg4[%c0 for %c158597120] : !stream.resource<external>{%c158597120}
}
} => !stream.timepoint
%3 = stream.timepoint.await %2 => %result : !stream.resource<external>{%c158597120}
%4 = stream.tensor.export %3 : tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120} -> !hal.buffer_view
util.return %4 : !hal.buffer_view
}
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
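// Note: canonicalize applies each op's registered canonicalization patterns plus
// constant folding; the IR is already in canonical form, so the dump is identical to
// the previous one.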
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mfma_layout<F16_16x16x16_F32>, #iree_gpu.mfma_layout<F16_32x32x8_F32>], target_arch = "gfx942", ukernels = "none"}>
#device_target_rocm = #hal.device.target<"rocm", {legacy_sync}, [#executable_target_rocm_hsaco_fb]>
module attributes {hal.device.targets = [#device_target_rocm]} {
stream.executable private @batch_matmul_dispatch_0 {
stream.executable.export public @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}) {
%cst = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>> -> tensor<64x968x1280xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [64, 1280, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>> -> tensor<64x1280x1280xf16>
%5 = tensor.empty() : tensor<64x968x1280xf16>
%6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%7 = linalg.batch_matmul ins(%3, %4 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%6 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : tensor<64x968x1280xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
return
}
}
}
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%c209715200 = arith.constant 209715200 : index
%c158597120 = arith.constant 158597120 : index
%c0 = arith.constant 0 : index
%c1280 = arith.constant 1280 : index
%c968 = arith.constant 968 : index
%c64 = arith.constant 64 : index
%element_type_f16 = hal.element_type<f16> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c64, %c968, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c64, %c1280, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<64x1280x1280xf16> in !stream.resource<external>{%c209715200}
%result, %result_timepoint = stream.resource.alloca uninitialized : !stream.resource<external>{%c158597120} => !stream.timepoint
%2 = stream.cmd.execute await(%result_timepoint) => with(%0 as %arg2: !stream.resource<external>{%c158597120}, %1 as %arg3: !stream.resource<external>{%c209715200}, %result as %arg4: !stream.resource<external>{%c158597120}) {
stream.cmd.dispatch @batch_matmul_dispatch_0::@batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 {
ro %arg2[%c0 for %c158597120] : !stream.resource<external>{%c158597120},
ro %arg3[%c0 for %c209715200] : !stream.resource<external>{%c209715200},
wo %arg4[%c0 for %c158597120] : !stream.resource<external>{%c158597120}
}
} => !stream.timepoint
%3 = stream.timepoint.await %2 => %result : !stream.resource<external>{%c158597120}
%4 = stream.tensor.export %3 : tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120} -> !hal.buffer_view
util.return %4 : !hal.buffer_view
}
}
// -----// IR Dump After CSE (cse) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mfma_layout<F16_16x16x16_F32>, #iree_gpu.mfma_layout<F16_32x32x8_F32>], target_arch = "gfx942", ukernels = "none"}>
#device_target_rocm = #hal.device.target<"rocm", {legacy_sync}, [#executable_target_rocm_hsaco_fb]>
module attributes {hal.device.targets = [#device_target_rocm]} {
stream.executable private @batch_matmul_dispatch_0 {
stream.executable.export public @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}) {
%cst = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>> -> tensor<64x968x1280xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [64, 1280, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>> -> tensor<64x1280x1280xf16>
%5 = tensor.empty() : tensor<64x968x1280xf16>
%6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%7 = linalg.batch_matmul ins(%3, %4 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%6 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : tensor<64x968x1280xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
return
}
}
}
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%c209715200 = arith.constant 209715200 : index
%c158597120 = arith.constant 158597120 : index
%c0 = arith.constant 0 : index
%c1280 = arith.constant 1280 : index
%c968 = arith.constant 968 : index
%c64 = arith.constant 64 : index
%element_type_f16 = hal.element_type<f16> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c64, %c968, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c64, %c1280, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<64x1280x1280xf16> in !stream.resource<external>{%c209715200}
%result, %result_timepoint = stream.resource.alloca uninitialized : !stream.resource<external>{%c158597120} => !stream.timepoint
%2 = stream.cmd.execute await(%result_timepoint) => with(%0 as %arg2: !stream.resource<external>{%c158597120}, %1 as %arg3: !stream.resource<external>{%c209715200}, %result as %arg4: !stream.resource<external>{%c158597120}) {
stream.cmd.dispatch @batch_matmul_dispatch_0::@batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 {
ro %arg2[%c0 for %c158597120] : !stream.resource<external>{%c158597120},
ro %arg3[%c0 for %c209715200] : !stream.resource<external>{%c209715200},
wo %arg4[%c0 for %c158597120] : !stream.resource<external>{%c158597120}
}
} => !stream.timepoint
%3 = stream.timepoint.await %2 => %result : !stream.resource<external>{%c158597120}
%4 = stream.tensor.export %3 : tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120} -> !hal.buffer_view
util.return %4 : !hal.buffer_view
}
}
// -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- //
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%c209715200 = arith.constant 209715200 : index
%c158597120 = arith.constant 158597120 : index
%c0 = arith.constant 0 : index
%c1280 = arith.constant 1280 : index
%c968 = arith.constant 968 : index
%c64 = arith.constant 64 : index
%element_type_f16 = hal.element_type<f16> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c64, %c968, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c64, %c1280, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<64x1280x1280xf16> in !stream.resource<external>{%c209715200}
%result, %result_timepoint = stream.resource.alloca uninitialized : !stream.resource<external>{%c158597120} => !stream.timepoint
%2 = stream.cmd.execute await(%result_timepoint) => with(%0 as %arg2: !stream.resource<external>{%c158597120}, %1 as %arg3: !stream.resource<external>{%c209715200}, %result as %arg4: !stream.resource<external>{%c158597120}) {
stream.cmd.dispatch @batch_matmul_dispatch_0::@batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 {
ro %arg2[%c0 for %c158597120] : !stream.resource<external>{%c158597120},
ro %arg3[%c0 for %c209715200] : !stream.resource<external>{%c209715200},
wo %arg4[%c0 for %c158597120] : !stream.resource<external>{%c158597120}
}
} => !stream.timepoint
%3 = stream.timepoint.await %2 => %result : !stream.resource<external>{%c158597120}
%4 = stream.tensor.export %3 : tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120} -> !hal.buffer_view
util.return %4 : !hal.buffer_view
}
// -----// IR Dump After ApplyPatterns (iree-util-apply-patterns) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mfma_layout<F16_16x16x16_F32>, #iree_gpu.mfma_layout<F16_32x32x8_F32>], target_arch = "gfx942", ukernels = "none"}>
#device_target_rocm = #hal.device.target<"rocm", {legacy_sync}, [#executable_target_rocm_hsaco_fb]>
module attributes {hal.device.targets = [#device_target_rocm]} {
stream.executable private @batch_matmul_dispatch_0 {
stream.executable.export public @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}) {
%cst = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>> -> tensor<64x968x1280xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [64, 1280, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>> -> tensor<64x1280x1280xf16>
%5 = tensor.empty() : tensor<64x968x1280xf16>
%6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%7 = linalg.batch_matmul ins(%3, %4 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%6 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : tensor<64x968x1280xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
return
}
}
}
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%c209715200 = arith.constant 209715200 : index
%c158597120 = arith.constant 158597120 : index
%c0 = arith.constant 0 : index
%c1280 = arith.constant 1280 : index
%c968 = arith.constant 968 : index
%c64 = arith.constant 64 : index
%element_type_f16 = hal.element_type<f16> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c64, %c968, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c64, %c1280, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<64x1280x1280xf16> in !stream.resource<external>{%c209715200}
%result, %result_timepoint = stream.resource.alloca uninitialized : !stream.resource<external>{%c158597120} => !stream.timepoint
%2 = stream.cmd.execute await(%result_timepoint) => with(%0 as %arg2: !stream.resource<external>{%c158597120}, %1 as %arg3: !stream.resource<external>{%c209715200}, %result as %arg4: !stream.resource<external>{%c158597120}) {
stream.cmd.dispatch @batch_matmul_dispatch_0::@batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 {
ro %arg2[%c0 for %c158597120] : !stream.resource<external>{%c158597120},
ro %arg3[%c0 for %c209715200] : !stream.resource<external>{%c209715200},
wo %arg4[%c0 for %c158597120] : !stream.resource<external>{%c158597120}
}
} => !stream.timepoint
%3 = stream.timepoint.await %2 => %result : !stream.resource<external>{%c158597120}
%4 = stream.tensor.export %3 : tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120} -> !hal.buffer_view
util.return %4 : !hal.buffer_view
}
}
// -----// IR Dump After FoldGlobals (iree-util-fold-globals) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mfma_layout<F16_16x16x16_F32>, #iree_gpu.mfma_layout<F16_32x32x8_F32>], target_arch = "gfx942", ukernels = "none"}>
#device_target_rocm = #hal.device.target<"rocm", {legacy_sync}, [#executable_target_rocm_hsaco_fb]>
module attributes {hal.device.targets = [#device_target_rocm]} {
stream.executable private @batch_matmul_dispatch_0 {
stream.executable.export public @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}) {
%cst = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>> -> tensor<64x968x1280xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [64, 1280, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>> -> tensor<64x1280x1280xf16>
%5 = tensor.empty() : tensor<64x968x1280xf16>
%6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%7 = linalg.batch_matmul ins(%3, %4 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%6 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : tensor<64x968x1280xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
return
}
}
}
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%c209715200 = arith.constant 209715200 : index
%c158597120 = arith.constant 158597120 : index
%c0 = arith.constant 0 : index
%c1280 = arith.constant 1280 : index
%c968 = arith.constant 968 : index
%c64 = arith.constant 64 : index
%element_type_f16 = hal.element_type<f16> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c64, %c968, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c64, %c1280, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<64x1280x1280xf16> in !stream.resource<external>{%c209715200}
%result, %result_timepoint = stream.resource.alloca uninitialized : !stream.resource<external>{%c158597120} => !stream.timepoint
%2 = stream.cmd.execute await(%result_timepoint) => with(%0 as %arg2: !stream.resource<external>{%c158597120}, %1 as %arg3: !stream.resource<external>{%c209715200}, %result as %arg4: !stream.resource<external>{%c158597120}) {
stream.cmd.dispatch @batch_matmul_dispatch_0::@batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 {
ro %arg2[%c0 for %c158597120] : !stream.resource<external>{%c158597120},
ro %arg3[%c0 for %c209715200] : !stream.resource<external>{%c209715200},
wo %arg4[%c0 for %c158597120] : !stream.resource<external>{%c158597120}
}
} => !stream.timepoint
%3 = stream.timepoint.await %2 => %result : !stream.resource<external>{%c158597120}
%4 = stream.tensor.export %3 : tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120} -> !hal.buffer_view
util.return %4 : !hal.buffer_view
}
}
// -----// IR Dump After FuseGlobals (iree-util-fuse-globals) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mfma_layout<F16_16x16x16_F32>, #iree_gpu.mfma_layout<F16_32x32x8_F32>], target_arch = "gfx942", ukernels = "none"}>
#device_target_rocm = #hal.device.target<"rocm", {legacy_sync}, [#executable_target_rocm_hsaco_fb]>
module attributes {hal.device.targets = [#device_target_rocm]} {
stream.executable private @batch_matmul_dispatch_0 {
stream.executable.export public @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}) {
%cst = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>> -> tensor<64x968x1280xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [64, 1280, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>> -> tensor<64x1280x1280xf16>
%5 = tensor.empty() : tensor<64x968x1280xf16>
%6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%7 = linalg.batch_matmul ins(%3, %4 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%6 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : tensor<64x968x1280xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
return
}
}
}
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%c209715200 = arith.constant 209715200 : index
%c158597120 = arith.constant 158597120 : index
%c0 = arith.constant 0 : index
%c1280 = arith.constant 1280 : index
%c968 = arith.constant 968 : index
%c64 = arith.constant 64 : index
%element_type_f16 = hal.element_type<f16> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c64, %c968, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c64, %c1280, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<64x1280x1280xf16> in !stream.resource<external>{%c209715200}
%result, %result_timepoint = stream.resource.alloca uninitialized : !stream.resource<external>{%c158597120} => !stream.timepoint
%2 = stream.cmd.execute await(%result_timepoint) => with(%0 as %arg2: !stream.resource<external>{%c158597120}, %1 as %arg3: !stream.resource<external>{%c209715200}, %result as %arg4: !stream.resource<external>{%c158597120}) {
stream.cmd.dispatch @batch_matmul_dispatch_0::@batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 {
ro %arg2[%c0 for %c158597120] : !stream.resource<external>{%c158597120},
ro %arg3[%c0 for %c209715200] : !stream.resource<external>{%c209715200},
wo %arg4[%c0 for %c158597120] : !stream.resource<external>{%c158597120}
}
} => !stream.timepoint
%3 = stream.timepoint.await %2 => %result : !stream.resource<external>{%c158597120}
%4 = stream.tensor.export %3 : tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120} -> !hal.buffer_view
util.return %4 : !hal.buffer_view
}
}
// -----// IR Dump After AssignTargetDevicesPass (iree-hal-assign-target-devices) //----- //
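// Note: iree-hal-assign-target-devices records the requested compilation targets as the
// hal.device.targets module attribute (here a single ROCm device targeting gfx942). The
// attribute was already set earlier in the pipeline, so the module is unchanged.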
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mfma_layout<F16_16x16x16_F32>, #iree_gpu.mfma_layout<F16_32x32x8_F32>], target_arch = "gfx942", ukernels = "none"}>
#device_target_rocm = #hal.device.target<"rocm", {legacy_sync}, [#executable_target_rocm_hsaco_fb]>
module attributes {hal.device.targets = [#device_target_rocm]} {
stream.executable private @batch_matmul_dispatch_0 {
stream.executable.export public @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}) {
%cst = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>> -> tensor<64x968x1280xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [64, 1280, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>> -> tensor<64x1280x1280xf16>
%5 = tensor.empty() : tensor<64x968x1280xf16>
%6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%7 = linalg.batch_matmul ins(%3, %4 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%6 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : tensor<64x968x1280xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
return
}
}
}
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%c209715200 = arith.constant 209715200 : index
%c158597120 = arith.constant 158597120 : index
%c0 = arith.constant 0 : index
%c1280 = arith.constant 1280 : index
%c968 = arith.constant 968 : index
%c64 = arith.constant 64 : index
%element_type_f16 = hal.element_type<f16> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c64, %c968, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c64, %c1280, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<64x1280x1280xf16> in !stream.resource<external>{%c209715200}
%result, %result_timepoint = stream.resource.alloca uninitialized : !stream.resource<external>{%c158597120} => !stream.timepoint
%2 = stream.cmd.execute await(%result_timepoint) => with(%0 as %arg2: !stream.resource<external>{%c158597120}, %1 as %arg3: !stream.resource<external>{%c209715200}, %result as %arg4: !stream.resource<external>{%c158597120}) {
stream.cmd.dispatch @batch_matmul_dispatch_0::@batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 {
ro %arg2[%c0 for %c158597120] : !stream.resource<external>{%c158597120},
ro %arg3[%c0 for %c209715200] : !stream.resource<external>{%c209715200},
wo %arg4[%c0 for %c158597120] : !stream.resource<external>{%c158597120}
}
} => !stream.timepoint
%3 = stream.timepoint.await %2 => %result : !stream.resource<external>{%c158597120}
%4 = stream.tensor.export %3 : tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120} -> !hal.buffer_view
util.return %4 : !hal.buffer_view
}
}
// -----// IR Dump After VerifyTargetEnvironmentPass (iree-hal-verify-target-environment) //----- //
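// Note: iree-hal-verify-target-environment checks that every executable target
// (rocm-hsaco-fb) has a registered backend able to compile it; as a verification-only
// pass it should not modify the IR.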
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mfma_layout<F16_16x16x16_F32>, #iree_gpu.mfma_layout<F16_32x32x8_F32>], target_arch = "gfx942", ukernels = "none"}>
#device_target_rocm = #hal.device.target<"rocm", {legacy_sync}, [#executable_target_rocm_hsaco_fb]>
module attributes {hal.device.targets = [#device_target_rocm]} {
stream.executable private @batch_matmul_dispatch_0 {
stream.executable.export public @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}) {
%cst = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>> -> tensor<64x968x1280xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [64, 1280, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>> -> tensor<64x1280x1280xf16>
%5 = tensor.empty() : tensor<64x968x1280xf16>
%6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%7 = linalg.batch_matmul ins(%3, %4 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%6 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : tensor<64x968x1280xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
return
}
}
}
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%c209715200 = arith.constant 209715200 : index
%c158597120 = arith.constant 158597120 : index
%c0 = arith.constant 0 : index
%c1280 = arith.constant 1280 : index
%c968 = arith.constant 968 : index
%c64 = arith.constant 64 : index
%element_type_f16 = hal.element_type<f16> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c64, %c968, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c64, %c1280, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<64x1280x1280xf16> in !stream.resource<external>{%c209715200}
%result, %result_timepoint = stream.resource.alloca uninitialized : !stream.resource<external>{%c158597120} => !stream.timepoint
%2 = stream.cmd.execute await(%result_timepoint) => with(%0 as %arg2: !stream.resource<external>{%c158597120}, %1 as %arg3: !stream.resource<external>{%c209715200}, %result as %arg4: !stream.resource<external>{%c158597120}) {
stream.cmd.dispatch @batch_matmul_dispatch_0::@batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 {
ro %arg2[%c0 for %c158597120] : !stream.resource<external>{%c158597120},
ro %arg3[%c0 for %c209715200] : !stream.resource<external>{%c209715200},
wo %arg4[%c0 for %c158597120] : !stream.resource<external>{%c158597120}
}
} => !stream.timepoint
%3 = stream.timepoint.await %2 => %result : !stream.resource<external>{%c158597120}
%4 = stream.tensor.export %3 : tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120} -> !hal.buffer_view
util.return %4 : !hal.buffer_view
}
}
// -----// IR Dump After MaterializeInterfacesPass (iree-hal-materialize-interfaces) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mfma_layout<F16_16x16x16_F32>, #iree_gpu.mfma_layout<F16_32x32x8_F32>], target_arch = "gfx942", ukernels = "none"}>
#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer, ReadOnly>, <2, storage_buffer>]>]>
#device_target_rocm = #hal.device.target<"rocm", {legacy_sync}, [#executable_target_rocm_hsaco_fb]>
module attributes {hal.device.targets = [#device_target_rocm]} {
hal.executable private @batch_matmul_dispatch_0 {
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) {
hal.executable.export public @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 ordinal(0) layout(#pipeline_layout) attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>, #hal.interface.binding<0, 2>]} {
^bb0(%arg0: !hal.device):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
hal.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16() {
%cst = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>> -> tensor<64x968x1280xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [64, 1280, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>> -> tensor<64x1280x1280xf16>
%5 = tensor.empty() : tensor<64x968x1280xf16>
%6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%7 = linalg.batch_matmul ins(%3, %4 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%6 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : tensor<64x968x1280xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
return
}
}
}
}
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%c209715200 = arith.constant 209715200 : index
%c158597120 = arith.constant 158597120 : index
%c0 = arith.constant 0 : index
%c1280 = arith.constant 1280 : index
%c968 = arith.constant 968 : index
%c64 = arith.constant 64 : index
%element_type_f16 = hal.element_type<f16> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c64, %c968, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c64, %c1280, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<64x1280x1280xf16> in !stream.resource<external>{%c209715200}
%result, %result_timepoint = stream.resource.alloca uninitialized : !stream.resource<external>{%c158597120} => !stream.timepoint
%2 = stream.cmd.execute await(%result_timepoint) => with(%0 as %arg2: !stream.resource<external>{%c158597120}, %1 as %arg3: !stream.resource<external>{%c209715200}, %result as %arg4: !stream.resource<external>{%c158597120}) {
stream.cmd.dispatch @batch_matmul_dispatch_0::@rocm_hsaco_fb::@batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 {
ro %arg2[%c0 for %c158597120] : !stream.resource<external>{%c158597120},
ro %arg3[%c0 for %c209715200] : !stream.resource<external>{%c209715200},
wo %arg4[%c0 for %c158597120] : !stream.resource<external>{%c158597120}
}
} => !stream.timepoint
%3 = stream.timepoint.await %2 => %result : !stream.resource<external>{%c158597120}
%4 = stream.tensor.export %3 : tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120} -> !hal.buffer_view
util.return %4 : !hal.buffer_view
}
}
// -----// IR Dump After PruneExecutablesPass (iree-hal-prune-executables) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mfma_layout<F16_16x16x16_F32>, #iree_gpu.mfma_layout<F16_32x32x8_F32>], target_arch = "gfx942", ukernels = "none"}>
#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer, ReadOnly>, <2, storage_buffer>]>]>
#device_target_rocm = #hal.device.target<"rocm", {legacy_sync}, [#executable_target_rocm_hsaco_fb]>
module attributes {hal.device.targets = [#device_target_rocm]} {
hal.executable private @batch_matmul_dispatch_0 {
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) {
hal.executable.export public @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 ordinal(0) layout(#pipeline_layout) attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>, #hal.interface.binding<0, 2>]} {
^bb0(%arg0: !hal.device):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
hal.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16() {
%cst = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>> -> tensor<64x968x1280xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [64, 1280, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>> -> tensor<64x1280x1280xf16>
%5 = tensor.empty() : tensor<64x968x1280xf16>
%6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%7 = linalg.batch_matmul ins(%3, %4 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%6 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : tensor<64x968x1280xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
return
}
}
}
}
util.func public @batch_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @batch_matmul(%input0: tensor<64x968x1280xf16>, %input1: tensor<64x1280x1280xf16>) -> (%output0: tensor<64x968x1280xf16>)"}} {
%c209715200 = arith.constant 209715200 : index
%c158597120 = arith.constant 158597120 : index
%c0 = arith.constant 0 : index
%c1280 = arith.constant 1280 : index
%c968 = arith.constant 968 : index
%c64 = arith.constant 64 : index
%element_type_f16 = hal.element_type<f16> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c64, %c968, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c64, %c1280, %c1280]) type(%element_type_f16) encoding(%dense_row_major)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<64x1280x1280xf16> in !stream.resource<external>{%c209715200}
%result, %result_timepoint = stream.resource.alloca uninitialized : !stream.resource<external>{%c158597120} => !stream.timepoint
%2 = stream.cmd.execute await(%result_timepoint) => with(%0 as %arg2: !stream.resource<external>{%c158597120}, %1 as %arg3: !stream.resource<external>{%c209715200}, %result as %arg4: !stream.resource<external>{%c158597120}) {
stream.cmd.dispatch @batch_matmul_dispatch_0::@rocm_hsaco_fb::@batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 {
ro %arg2[%c0 for %c158597120] : !stream.resource<external>{%c158597120},
ro %arg3[%c0 for %c209715200] : !stream.resource<external>{%c209715200},
wo %arg4[%c0 for %c158597120] : !stream.resource<external>{%c158597120}
}
} => !stream.timepoint
%3 = stream.timepoint.await %2 => %result : !stream.resource<external>{%c158597120}
%4 = stream.tensor.export %3 : tensor<64x968x1280xf16> in !stream.resource<external>{%c158597120} -> !hal.buffer_view
util.return %4 : !hal.buffer_view
}
}
// -----// IR Dump After GPUGeneralizeNamedOps (iree-codegen-gpu-generalize-named-ops) //----- //
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16() {
%cst = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>> -> tensor<64x968x1280xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [64, 1280, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>> -> tensor<64x1280x1280xf16>
%5 = tensor.empty() : tensor<64x968x1280xf16>
%6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%7 = linalg.batch_matmul ins(%3, %4 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%6 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : tensor<64x968x1280xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
return
}
// -----// IR Dump After TypePropagation (iree-codegen-type-propagation) //----- //
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16() {
%cst = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>> -> tensor<64x968x1280xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [64, 1280, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>> -> tensor<64x1280x1280xf16>
%5 = tensor.empty() : tensor<64x968x1280xf16>
%6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%7 = linalg.batch_matmul ins(%3, %4 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%6 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : tensor<64x968x1280xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
return
}
// -----// IR Dump After BubbleUpOrdinalOps (iree-codegen-bubble-up-ordinal-ops) //----- //
module {
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16() {
%cst = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>> -> tensor<64x968x1280xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [64, 1280, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>> -> tensor<64x1280x1280xf16>
%5 = tensor.empty() : tensor<64x968x1280xf16>
%6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%7 = linalg.batch_matmul ins(%3, %4 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%6 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : tensor<64x968x1280xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
return
}
}
// -----// IR Dump After BufferizeCopyOnlyDispatches (iree-codegen-bufferize-copy-only-dispatches) //----- //
module {
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16() {
%cst = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>> -> tensor<64x968x1280xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [64, 1280, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>> -> tensor<64x1280x1280xf16>
%5 = tensor.empty() : tensor<64x968x1280xf16>
%6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%7 = linalg.batch_matmul ins(%3, %4 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%6 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : tensor<64x968x1280xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
return
}
}
// -----// IR Dump After DecomposeSoftmax (iree-codegen-decompose-softmax) //----- //
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16() {
%cst = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>> -> tensor<64x968x1280xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [64, 1280, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>> -> tensor<64x1280x1280xf16>
%5 = tensor.empty() : tensor<64x968x1280xf16>
%6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%7 = linalg.batch_matmul ins(%3, %4 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%6 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : tensor<64x968x1280xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
return
}
// -----// IR Dump After MaterializeUserConfigs (iree-codegen-materialize-user-configs) //----- //
hal.executable.variant public @rocm_hsaco_fb target(<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mfma_layout<F16_16x16x16_F32>, #iree_gpu.mfma_layout<F16_32x32x8_F32>], target_arch = "gfx942", ukernels = "none"}>) {
hal.executable.export public @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 ordinal(0) layout(#hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer, ReadOnly>, <2, storage_buffer>]>]>) attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>, #hal.interface.binding<0, 2>]} {
^bb0(%arg0: !hal.device):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
hal.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16() {
%cst = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>> -> tensor<64x968x1280xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [64, 1280, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>> -> tensor<64x1280x1280xf16>
%5 = tensor.empty() : tensor<64x968x1280xf16>
%6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%7 = linalg.batch_matmul ins(%3, %4 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%6 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : tensor<64x968x1280xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
return
}
}
}
chosen MMA schedule:
intrinsic (M, N, K) = (16, 16, 16)
subgroup count (M, N) = (2, 2)
subgroup tile count (M, N, K) = (2, 4, 4)
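
Annotation (not compiler output): the schedule printed above composes directly into the tile sizes and workgroup size that appear in the next dump. A minimal Python sketch re-deriving those numbers, assuming workgroup tile = intrinsic size * subgroup count * per-subgroup tile count (all values copied from this log):

# Chosen MFMA intrinsic and schedule, from the debug lines above.
intrinsic_m, intrinsic_n, intrinsic_k = 16, 16, 16
subgroup_m_count, subgroup_n_count = 2, 2
subgroup_m_tiles, subgroup_n_tiles, subgroup_k_tiles = 2, 4, 4
subgroup_size = 64  # wavefront size on gfx942

# Workgroup-level (M, N, K) tile; matches tile_sizes = [[1, 64, 128, 64]].
tile_m = intrinsic_m * subgroup_m_count * subgroup_m_tiles  # 16 * 2 * 2 = 64
tile_n = intrinsic_n * subgroup_n_count * subgroup_n_tiles  # 16 * 2 * 4 = 128
tile_k = intrinsic_k * subgroup_k_tiles                     # 16 * 4     = 64

# Threads per workgroup; matches workgroup_size = [128, 2, 1].
threads = subgroup_size * subgroup_m_count * subgroup_n_count  # 64 * 4 = 256
assert (tile_m, tile_n, tile_k, threads) == (64, 128, 64, 256)
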
// -----// IR Dump After LLVMGPUSelectLoweringStrategy (iree-llvmgpu-select-lowering-strategy) //----- //
hal.executable.variant public @rocm_hsaco_fb target(<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mfma_layout<F16_16x16x16_F32>, #iree_gpu.mfma_layout<F16_32x32x8_F32>], target_arch = "gfx942", ukernels = "none"}>) {
hal.executable.export public @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 ordinal(0) layout(#hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer, ReadOnly>, <2, storage_buffer>]>]>) attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>, #hal.interface.binding<0, 2>], subgroup_size = 64 : index, translation_info = #iree_codegen.translation_info<LLVMGPUPadAndVectorDistribute, {mma_schedule = #iree_gpu.mma_schedule<intrinsic = #iree_gpu.mfma_layout<F16_16x16x16_F32>, subgroup_m_count = 2, subgroup_n_count = 2, subgroup_m_tile_count = 2, subgroup_n_tile_count = 4, subgroup_k_tile_count = 4>}>, workgroup_size = [128 : index, 2 : index, 1 : index]} {
^bb0(%arg0: !hal.device):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
hal.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16() {
%cst = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>> -> tensor<64x968x1280xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [64, 1280, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>> -> tensor<64x1280x1280xf16>
%5 = tensor.empty() : tensor<64x968x1280xf16>
%6 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 64, 128, 64]]>} ins(%cst : f16) outs(%5 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%7 = linalg.batch_matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 64, 128, 64]]>} ins(%3, %4 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%6 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : tensor<64x968x1280xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
return
}
}
}
// -----// IR Dump After ConfigureTargetExecutableVariantsPass (iree-hal-configure-target-executable-variants) //----- //
hal.executable.variant public @rocm_hsaco_fb target(<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mfma_layout<F16_16x16x16_F32>, #iree_gpu.mfma_layout<F16_32x32x8_F32>], target_arch = "gfx942", ukernels = "none"}>) {
hal.executable.export public @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 ordinal(0) layout(#hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer, ReadOnly>, <2, storage_buffer>]>]>) attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>, #hal.interface.binding<0, 2>], subgroup_size = 64 : index, translation_info = #iree_codegen.translation_info<LLVMGPUPadAndVectorDistribute, {mma_schedule = #iree_gpu.mma_schedule<intrinsic = #iree_gpu.mfma_layout<F16_16x16x16_F32>, subgroup_m_count = 2, subgroup_n_count = 2, subgroup_m_tile_count = 2, subgroup_n_tile_count = 4, subgroup_k_tile_count = 4>}>, workgroup_size = [128 : index, 2 : index, 1 : index]} {
^bb0(%arg0: !hal.device):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
hal.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16() {
%cst = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>> -> tensor<64x968x1280xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [64, 1280, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>> -> tensor<64x1280x1280xf16>
%5 = tensor.empty() : tensor<64x968x1280xf16>
%6 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 64, 128, 64]]>} ins(%cst : f16) outs(%5 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%7 = linalg.batch_matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 64, 128, 64]]>} ins(%3, %4 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%6 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : tensor<64x968x1280xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
return
}
}
}
// -----// IR Dump After ConfigureExecutablesPass (iree-hal-configure-executables) //----- //
hal.executable private @batch_matmul_dispatch_0 {
hal.executable.variant public @rocm_hsaco_fb target(<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mfma_layout<F16_16x16x16_F32>, #iree_gpu.mfma_layout<F16_32x32x8_F32>], target_arch = "gfx942", ukernels = "none"}>) {
hal.executable.export public @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 ordinal(0) layout(#hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer, ReadOnly>, <2, storage_buffer>]>]>) attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>, #hal.interface.binding<0, 2>], subgroup_size = 64 : index, translation_info = #iree_codegen.translation_info<LLVMGPUPadAndVectorDistribute, {mma_schedule = #iree_gpu.mma_schedule<intrinsic = #iree_gpu.mfma_layout<F16_16x16x16_F32>, subgroup_m_count = 2, subgroup_n_count = 2, subgroup_m_tile_count = 2, subgroup_n_tile_count = 4, subgroup_k_tile_count = 4>}>, workgroup_size = [128 : index, 2 : index, 1 : index]} {
^bb0(%arg0: !hal.device):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
hal.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16() {
%cst = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>> -> tensor<64x968x1280xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [64, 1280, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>> -> tensor<64x1280x1280xf16>
%5 = tensor.empty() : tensor<64x968x1280xf16>
%6 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 64, 128, 64]]>} ins(%cst : f16) outs(%5 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
%7 = linalg.batch_matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 64, 128, 64]]>} ins(%3, %4 : tensor<64x968x1280xf16>, tensor<64x1280x1280xf16>) outs(%6 : tensor<64x968x1280xf16>) -> tensor<64x968x1280xf16>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0], sizes = [64, 968, 1280], strides = [1, 1, 1] : tensor<64x968x1280xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
return
}
}
}
}
// -----// IR Dump After TileAndDistributeToWorkgroups (iree-codegen-tile-and-distribute-to-workgroups) //----- //
hal.executable.variant public @rocm_hsaco_fb target(<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mfma_layout<F16_16x16x16_F32>, #iree_gpu.mfma_layout<F16_32x32x8_F32>], target_arch = "gfx942", ukernels = "none"}>) {
hal.executable.export public @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16 ordinal(0) layout(#hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer, ReadOnly>, <2, storage_buffer>]>]>) attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>, #hal.interface.binding<0, 2>], subgroup_size = 64 : index, translation_info = #iree_codegen.translation_info<LLVMGPUPadAndVectorDistribute, {mma_schedule = #iree_gpu.mma_schedule<intrinsic = #iree_gpu.mfma_layout<F16_16x16x16_F32>, subgroup_m_count = 2, subgroup_n_count = 2, subgroup_m_tile_count = 2, subgroup_n_tile_count = 4, subgroup_k_tile_count = 4>}>, workgroup_size = [128 : index, 2 : index, 1 : index]} {
^bb0(%arg0: !hal.device):
%c10 = arith.constant 10 : index
%c16 = arith.constant 16 : index
%c64 = arith.constant 64 : index
hal.return %c10, %c16, %c64 : index, index, index
}
builtin.module {
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16() {
%c128 = arith.constant 128 : index
%cst = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_id_z = hal.interface.workgroup.id[2] : index
%3 = affine.min affine_map<()[s0] -> (s0 * -64 + 968, 64)>()[%workgroup_id_y]
%4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y]
%5 = flow.dispatch.tensor.load %0, offsets = [%workgroup_id_z, %4, 0], sizes = [1, %3, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>> -> tensor<1x?x1280xf16>
%6 = affine.apply affine_map<()[s0] -> (s0 * 128)>()[%workgroup_id_x]
%7 = flow.dispatch.tensor.load %1, offsets = [%workgroup_id_z, 0, %6], sizes = [1, 1280, %c128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>> -> tensor<1x1280x?xf16>
%8 = tensor.empty(%3) : tensor<1x?x128xf16>
%9 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 64, 128, 64]]>} ins(%cst : f16) outs(%8 : tensor<1x?x128xf16>) -> tensor<1x?x128xf16>
%cast = tensor.cast %7 : tensor<1x1280x?xf16> to tensor<1x1280x128xf16>
%10 = linalg.batch_matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 64, 128, 64]]>} ins(%5, %cast : tensor<1x?x1280xf16>, tensor<1x1280x128xf16>) outs(%9 : tensor<1x?x128xf16>) -> tensor<1x?x128xf16>
%cast_0 = tensor.cast %10 : tensor<1x?x128xf16> to tensor<1x?x?xf16>
%11 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y]
%12 = affine.apply affine_map<()[s0] -> (s0 * 128)>()[%workgroup_id_x]
flow.dispatch.tensor.store %cast_0, %2, offsets = [%workgroup_id_z, %11, %12], sizes = [1, %3, %c128], strides = [1, 1, 1] : tensor<1x?x?xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
return
}
}
}
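
Annotation (not compiler output): the export region above now returns static workgroup counts (10, 16, 64). A minimal re-derivation in Python, assuming x tiles N = 1280 by 128, y tiles M = 968 by 64, and z walks the batch, as the load offsets in the dump suggest; since 968 = 15 * 64 + 8, the last y workgroup covers only 8 rows, which is what the affine.min (s0 * -64 + 968, 64) guards:

import math

B, M, N = 64, 968, 1280
tile_m, tile_n = 64, 128

wg_x = math.ceil(N / tile_n)  # 1280 / 128 = 10
wg_y = math.ceil(M / tile_m)  # ceil(968 / 64) = 16
wg_z = B                      # one workgroup per batch
assert (wg_x, wg_y, wg_z) == (10, 16, 64)
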
// -----// IR Dump After ConvertToDestinationPassingStyle (iree-codegen-convert-to-destination-passing-style) //----- //
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16() {
%c128 = arith.constant 128 : index
%cst = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
%workgroup_id_z = hal.interface.workgroup.id[2] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y]
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%4 = affine.apply affine_map<()[s0] -> (s0 * 128)>()[%workgroup_id_x]
%5 = affine.min affine_map<()[s0] -> (s0 * -64 + 968, 64)>()[%workgroup_id_y]
%6 = flow.dispatch.tensor.load %2, offsets = [%workgroup_id_z, %3, %4], sizes = [1, %5, %c128], strides = [1, 1, 1] : !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>> -> tensor<1x?x?xf16>
%cast = tensor.cast %6 : tensor<1x?x?xf16> to tensor<1x?x128xf16>
%workgroup_id_x_0 = hal.interface.workgroup.id[0] : index
%workgroup_id_y_1 = hal.interface.workgroup.id[1] : index
%workgroup_id_z_2 = hal.interface.workgroup.id[2] : index
%7 = affine.min affine_map<()[s0] -> (s0 * -64 + 968, 64)>()[%workgroup_id_y_1]
%8 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y_1]
%9 = flow.dispatch.tensor.load %0, offsets = [%workgroup_id_z_2, %8, 0], sizes = [1, %7, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>> -> tensor<1x?x1280xf16>
%10 = affine.apply affine_map<()[s0] -> (s0 * 128)>()[%workgroup_id_x_0]
%11 = flow.dispatch.tensor.load %1, offsets = [%workgroup_id_z_2, 0, %10], sizes = [1, 1280, %c128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>> -> tensor<1x1280x?xf16>
%12 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 64, 128, 64]]>} ins(%cst : f16) outs(%cast : tensor<1x?x128xf16>) -> tensor<1x?x128xf16>
%cast_3 = tensor.cast %11 : tensor<1x1280x?xf16> to tensor<1x1280x128xf16>
%13 = linalg.batch_matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 64, 128, 64]]>} ins(%9, %cast_3 : tensor<1x?x1280xf16>, tensor<1x1280x128xf16>) outs(%12 : tensor<1x?x128xf16>) -> tensor<1x?x128xf16>
%cast_4 = tensor.cast %13 : tensor<1x?x128xf16> to tensor<1x?x?xf16>
%14 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y_1]
%15 = affine.apply affine_map<()[s0] -> (s0 * 128)>()[%workgroup_id_x_0]
flow.dispatch.tensor.store %cast_4, %2, offsets = [%workgroup_id_z_2, %14, %15], sizes = [1, %7, %c128], strides = [1, 1, 1] : tensor<1x?x?xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
return
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
module {
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16() {
%cst = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
%workgroup_id_z = hal.interface.workgroup.id[2] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y]
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%4 = affine.apply affine_map<()[s0] -> (s0 * 128)>()[%workgroup_id_x]
%5 = affine.min affine_map<()[s0] -> (s0 * -64 + 968, 64)>()[%workgroup_id_y]
%6 = flow.dispatch.tensor.load %2, offsets = [%workgroup_id_z, %3, %4], sizes = [1, %5, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>> -> tensor<1x?x128xf16>
%workgroup_id_x_0 = hal.interface.workgroup.id[0] : index
%workgroup_id_y_1 = hal.interface.workgroup.id[1] : index
%workgroup_id_z_2 = hal.interface.workgroup.id[2] : index
%7 = affine.min affine_map<()[s0] -> (s0 * -64 + 968, 64)>()[%workgroup_id_y_1]
%8 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y_1]
%9 = flow.dispatch.tensor.load %0, offsets = [%workgroup_id_z_2, %8, 0], sizes = [1, %7, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>> -> tensor<1x?x1280xf16>
%10 = affine.apply affine_map<()[s0] -> (s0 * 128)>()[%workgroup_id_x_0]
%11 = flow.dispatch.tensor.load %1, offsets = [%workgroup_id_z_2, 0, %10], sizes = [1, 1280, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>> -> tensor<1x1280x128xf16>
%12 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 64, 128, 64]]>} ins(%cst : f16) outs(%6 : tensor<1x?x128xf16>) -> tensor<1x?x128xf16>
%13 = linalg.batch_matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 64, 128, 64]]>} ins(%9, %11 : tensor<1x?x1280xf16>, tensor<1x1280x128xf16>) outs(%12 : tensor<1x?x128xf16>) -> tensor<1x?x128xf16>
%14 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y_1]
%15 = affine.apply affine_map<()[s0] -> (s0 * 128)>()[%workgroup_id_x_0]
flow.dispatch.tensor.store %13, %2, offsets = [%workgroup_id_z_2, %14, %15], sizes = [1, %7, 128], strides = [1, 1, 1] : tensor<1x?x128xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
return
}
}
// -----// IR Dump After CSE (cse) //----- //
module {
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16() {
%cst = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
%workgroup_id_z = hal.interface.workgroup.id[2] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y]
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%4 = affine.apply affine_map<()[s0] -> (s0 * 128)>()[%workgroup_id_x]
%5 = affine.min affine_map<()[s0] -> (s0 * -64 + 968, 64)>()[%workgroup_id_y]
%6 = flow.dispatch.tensor.load %2, offsets = [%workgroup_id_z, %3, %4], sizes = [1, %5, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>> -> tensor<1x?x128xf16>
%7 = flow.dispatch.tensor.load %0, offsets = [%workgroup_id_z, %3, 0], sizes = [1, %5, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>> -> tensor<1x?x1280xf16>
%8 = flow.dispatch.tensor.load %1, offsets = [%workgroup_id_z, 0, %4], sizes = [1, 1280, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>> -> tensor<1x1280x128xf16>
%9 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 64, 128, 64]]>} ins(%cst : f16) outs(%6 : tensor<1x?x128xf16>) -> tensor<1x?x128xf16>
%10 = linalg.batch_matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 64, 128, 64]]>} ins(%7, %8 : tensor<1x?x1280xf16>, tensor<1x1280x128xf16>) outs(%9 : tensor<1x?x128xf16>) -> tensor<1x?x128xf16>
flow.dispatch.tensor.store %10, %2, offsets = [%workgroup_id_z, %3, %4], sizes = [1, %5, 128], strides = [1, 1, 1] : tensor<1x?x128xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
return
}
}
// -----// IR Dump After ReorderWorkgroups (iree-codegen-reorder-workgroups) //----- //
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16() {
%cst = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
%workgroup_id_z = hal.interface.workgroup.id[2] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y]
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%4 = affine.apply affine_map<()[s0] -> (s0 * 128)>()[%workgroup_id_x]
%5 = affine.min affine_map<()[s0] -> (s0 * -64 + 968, 64)>()[%workgroup_id_y]
%6 = flow.dispatch.tensor.load %2, offsets = [%workgroup_id_z, %3, %4], sizes = [1, %5, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>> -> tensor<1x?x128xf16>
%7 = flow.dispatch.tensor.load %0, offsets = [%workgroup_id_z, %3, 0], sizes = [1, %5, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>> -> tensor<1x?x1280xf16>
%8 = flow.dispatch.tensor.load %1, offsets = [%workgroup_id_z, 0, %4], sizes = [1, 1280, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>> -> tensor<1x1280x128xf16>
%9 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 64, 128, 64]]>} ins(%cst : f16) outs(%6 : tensor<1x?x128xf16>) -> tensor<1x?x128xf16>
%10 = linalg.batch_matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 64, 128, 64]]>} ins(%7, %8 : tensor<1x?x1280xf16>, tensor<1x1280x128xf16>) outs(%9 : tensor<1x?x128xf16>) -> tensor<1x?x128xf16>
flow.dispatch.tensor.store %10, %2, offsets = [%workgroup_id_z, %3, %4], sizes = [1, %5, 128], strides = [1, 1, 1] : tensor<1x?x128xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
return
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
module {
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16() {
%cst = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
%workgroup_id_z = hal.interface.workgroup.id[2] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y]
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%4 = affine.apply affine_map<()[s0] -> (s0 * 128)>()[%workgroup_id_x]
%5 = affine.min affine_map<()[s0] -> (s0 * -64 + 968, 64)>()[%workgroup_id_y]
%6 = flow.dispatch.tensor.load %2, offsets = [%workgroup_id_z, %3, %4], sizes = [1, %5, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>> -> tensor<1x?x128xf16>
%7 = flow.dispatch.tensor.load %0, offsets = [%workgroup_id_z, %3, 0], sizes = [1, %5, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>> -> tensor<1x?x1280xf16>
%8 = flow.dispatch.tensor.load %1, offsets = [%workgroup_id_z, 0, %4], sizes = [1, 1280, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>> -> tensor<1x1280x128xf16>
%9 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 64, 128, 64]]>} ins(%cst : f16) outs(%6 : tensor<1x?x128xf16>) -> tensor<1x?x128xf16>
%10 = linalg.batch_matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 64, 128, 64]]>} ins(%7, %8 : tensor<1x?x1280xf16>, tensor<1x1280x128xf16>) outs(%9 : tensor<1x?x128xf16>) -> tensor<1x?x128xf16>
flow.dispatch.tensor.store %10, %2, offsets = [%workgroup_id_z, %3, %4], sizes = [1, %5, 128], strides = [1, 1, 1] : tensor<1x?x128xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
return
}
}
// -----// IR Dump After CSE (cse) //----- //
module {
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16() {
%cst = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
%workgroup_id_z = hal.interface.workgroup.id[2] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y]
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%4 = affine.apply affine_map<()[s0] -> (s0 * 128)>()[%workgroup_id_x]
%5 = affine.min affine_map<()[s0] -> (s0 * -64 + 968, 64)>()[%workgroup_id_y]
%6 = flow.dispatch.tensor.load %2, offsets = [%workgroup_id_z, %3, %4], sizes = [1, %5, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>> -> tensor<1x?x128xf16>
%7 = flow.dispatch.tensor.load %0, offsets = [%workgroup_id_z, %3, 0], sizes = [1, %5, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>> -> tensor<1x?x1280xf16>
%8 = flow.dispatch.tensor.load %1, offsets = [%workgroup_id_z, 0, %4], sizes = [1, 1280, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>> -> tensor<1x1280x128xf16>
%9 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 64, 128, 64]]>} ins(%cst : f16) outs(%6 : tensor<1x?x128xf16>) -> tensor<1x?x128xf16>
%10 = linalg.batch_matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 64, 128, 64]]>} ins(%7, %8 : tensor<1x?x1280xf16>, tensor<1x1280x128xf16>) outs(%9 : tensor<1x?x128xf16>) -> tensor<1x?x128xf16>
flow.dispatch.tensor.store %10, %2, offsets = [%workgroup_id_z, %3, %4], sizes = [1, %5, 128], strides = [1, 1, 1] : tensor<1x?x128xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
return
}
}
padding parallel dims
candidate: %10 = linalg.batch_matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 64, 128, 64]]>} ins(%7, %8 : tensor<1x?x1280xf16>, tensor<1x1280x128xf16>) outs(%9 : tensor<1x?x128xf16>) -> tensor<1x?x128xf16>
[linalg-padding]: Start rewriteAsPaddedOp : %10 = linalg.batch_matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 64, 128, 64]]>} ins(%7, %8 : tensor<1x?x1280xf16>, tensor<1x1280x128xf16>) outs(%9 : tensor<1x?x128xf16>) -> tensor<1x?x128xf16>
[linalg-padding]: --compute padded size for dim 0
[linalg-padding]: ----new dim size: 1
[linalg-padding]: --compute padded size for dim 1
[linalg-padding]: ----could not compute a bounding box for padding
[linalg-padding]: ----Fallback to use pre-configured smallest static bounds
[linalg-padding]: ----new dim size: 64
[linalg-padding]: --compute padded size for dim 2
[linalg-padding]: ----dim does not require padding, SKIP
[linalg-padding]: --SUCCESS, makeComposedPadHighOp with type: tensor<1x64x1280xf16>
[linalg-padding]: --compute padded size for dim 0
[linalg-padding]: ----new dim size: 1
[linalg-padding]: --compute padded size for dim 1
[linalg-padding]: ----dim does not require padding, SKIP
[linalg-padding]: --compute padded size for dim 2
[linalg-padding]: ----new dim size: 128
[linalg-padding]: --compute padded size for dim 0
[linalg-padding]: ----new dim size: 1
[linalg-padding]: --compute padded size for dim 1
[linalg-padding]: ----could not compute a bounding box for padding
[linalg-padding]: ----Fallback to use pre-configured smallest static bounds
[linalg-padding]: ----new dim size: 64
[linalg-padding]: --compute padded size for dim 2
[linalg-padding]: ----new dim size: 128
[linalg-padding]: --SUCCESS, makeComposedPadHighOp with type: tensor<1x64x128xf16>
[linalg-padding]: --cloned padded op: %13 = linalg.batch_matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 64, 128, 64]]>} ins(%padded, %8 : tensor<1x64x1280xf16>, tensor<1x1280x128xf16>) outs(%padded_4 : tensor<1x64x128xf16>) -> tensor<1x64x128xf16>
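
Annotation (not compiler output): the trace above pads the dynamic M dimension up to the static bound 64 (and the result tile to 1x64x128). Padding with the zero fill value is harmless: the extra rows produce zero output rows, which the extract_slice in the next dump discards. A minimal numpy sketch of that argument, with made-up small sizes (names and shapes illustrative, not from the log):

import numpy as np

# Boundary workgroup: only 8 valid M rows, padded to the static tile of 64.
m_valid, m_tile, k, n = 8, 64, 16, 8
lhs = np.random.rand(m_valid, k).astype(np.float32)
rhs = np.random.rand(k, n).astype(np.float32)

lhs_padded = np.pad(lhs, ((0, m_tile - m_valid), (0, 0)))  # zero rows (tensor.pad)
out_padded = lhs_padded @ rhs                              # static 64xN matmul
out = out_padded[:m_valid]                                 # extract_slice of valid rows

assert np.allclose(out, lhs @ rhs)  # zero-padded rows do not affect valid rows
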
// -----// IR Dump After LLVMGPUPromoteMatmulToFitMMA (iree-llvmgpu-promote-matmul-to-fit-mma) //----- //
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16() {
%c1 = arith.constant 1 : index
%cst = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
%workgroup_id_z = hal.interface.workgroup.id[2] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y]
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%4 = affine.apply affine_map<()[s0] -> (s0 * 128)>()[%workgroup_id_x]
%5 = affine.min affine_map<()[s0] -> (s0 * -64 + 968, 64)>()[%workgroup_id_y]
%6 = flow.dispatch.tensor.load %0, offsets = [%workgroup_id_z, %3, 0], sizes = [1, %5, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>> -> tensor<1x?x1280xf16>
%dim = tensor.dim %6, %c1 : tensor<1x?x1280xf16>
%extracted_slice = tensor.extract_slice %6[0, 0, 0] [1, %dim, 1280] [1, 1, 1] : tensor<1x?x1280xf16> to tensor<1x?x1280xf16>
%7 = flow.dispatch.tensor.load %1, offsets = [%workgroup_id_z, 0, %4], sizes = [1, 1280, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>> -> tensor<1x1280x128xf16>
%8 = affine.apply affine_map<()[s0] -> (-s0 + 64)>()[%5]
%padded = tensor.pad %extracted_slice low[0, 0, 0] high[0, %8, 0] {
^bb0(%arg0: index, %arg1: index, %arg2: index):
tensor.yield %cst : f16
} : tensor<1x?x1280xf16> to tensor<1x64x1280xf16>
%9 = tensor.empty() : tensor<1x64x128xf16>
%10 = linalg.fill ins(%cst : f16) outs(%9 : tensor<1x64x128xf16>) -> tensor<1x64x128xf16>
%11 = linalg.batch_matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 64, 128, 64]]>} ins(%padded, %7 : tensor<1x64x1280xf16>, tensor<1x1280x128xf16>) outs(%10 : tensor<1x64x128xf16>) -> tensor<1x64x128xf16>
%extracted_slice_0 = tensor.extract_slice %11[0, 0, 0] [1, %5, 128] [1, 1, 1] : tensor<1x64x128xf16> to tensor<1x?x128xf16>
flow.dispatch.tensor.store %extracted_slice_0, %2, offsets = [%workgroup_id_z, %3, %4], sizes = [1, %5, 128], strides = [1, 1, 1] : tensor<1x?x128xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
return
}
// -----// IR Dump After GPUTensorTileToSerialLoops (iree-codegen-gpu-tensor-tile-to-serial-loops) //----- //
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16() {
%c20 = arith.constant 20 : index
%c1 = arith.constant 1 : index
%cst = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
%workgroup_id_z = hal.interface.workgroup.id[2] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y]
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%4 = affine.apply affine_map<()[s0] -> (s0 * 128)>()[%workgroup_id_x]
%5 = affine.min affine_map<()[s0] -> (s0 * -64 + 968, 64)>()[%workgroup_id_y]
%6 = flow.dispatch.tensor.load %0, offsets = [%workgroup_id_z, %3, 0], sizes = [1, %5, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>> -> tensor<1x?x1280xf16>
%dim = tensor.dim %6, %c1 : tensor<1x?x1280xf16>
%extracted_slice = tensor.extract_slice %6[0, 0, 0] [1, %dim, 1280] [1, 1, 1] : tensor<1x?x1280xf16> to tensor<1x?x1280xf16>
%7 = flow.dispatch.tensor.load %1, offsets = [%workgroup_id_z, 0, %4], sizes = [1, 1280, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>> -> tensor<1x1280x128xf16>
%8 = affine.apply affine_map<()[s0] -> (-s0 + 64)>()[%5]
%padded = tensor.pad %extracted_slice low[0, 0, 0] high[0, %8, 0] {
^bb0(%arg0: index, %arg1: index, %arg2: index):
tensor.yield %cst : f16
} : tensor<1x?x1280xf16> to tensor<1x64x1280xf16>
%9 = tensor.empty() : tensor<1x64x128xf16>
%10 = linalg.fill ins(%cst : f16) outs(%9 : tensor<1x64x128xf16>) -> tensor<1x64x128xf16>
%11 = scf.for %arg0 = %c0 to %c20 step %c1 iter_args(%arg1 = %10) -> (tensor<1x64x128xf16>) {
%12:2 = affine.delinearize_index %arg0 into (%c1, %c20) : index, index
%13 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%12#1]
%extracted_slice_1 = tensor.extract_slice %padded[%12#0, 0, %13] [1, 64, 64] [1, 1, 1] : tensor<1x64x1280xf16> to tensor<1x64x64xf16>
%extracted_slice_2 = tensor.extract_slice %7[%12#0, %13, 0] [1, 64, 128] [1, 1, 1] : tensor<1x1280x128xf16> to tensor<1x64x128xf16>
%extracted_slice_3 = tensor.extract_slice %arg1[%12#0, 0, 0] [1, 64, 128] [1, 1, 1] : tensor<1x64x128xf16> to tensor<1x64x128xf16>
%14 = linalg.batch_matmul {__internal_linalg_transform__ = "workgroup_k_tiled", lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 64, 128, 64]]>} ins(%extracted_slice_1, %extracted_slice_2 : tensor<1x64x64xf16>, tensor<1x64x128xf16>) outs(%extracted_slice_3 : tensor<1x64x128xf16>) -> tensor<1x64x128xf16>
%inserted_slice = tensor.insert_slice %14 into %arg1[%12#0, 0, 0] [1, 64, 128] [1, 1, 1] : tensor<1x64x128xf16> into tensor<1x64x128xf16>
scf.yield %inserted_slice : tensor<1x64x128xf16>
}
%extracted_slice_0 = tensor.extract_slice %11[0, 0, 0] [1, %5, 128] [1, 1, 1] : tensor<1x64x128xf16> to tensor<1x?x128xf16>
flow.dispatch.tensor.store %extracted_slice_0, %2, offsets = [%workgroup_id_z, %3, %4], sizes = [1, %5, 128], strides = [1, 1, 1] : tensor<1x?x128xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
return
}
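// For context: GPUTensorTileToSerialLoops tiles the reduction dimension
// K = 1280 by the configured tile size 64, giving 1280 / 64 = 20 serial
// iterations; that is where %c20 and the scf.for above come from. The
// affine.delinearize_index recovers (batch-tile, k-tile) coordinates from the
// flat induction variable; with a batch extent of 1 the first result is
// always 0, and later canonicalization drops it. A minimal sketch of the same
// index computation (function name hypothetical):
func.func @k_tile_offset(%iv: index) -> (index, index) {
  %c1 = arith.constant 1 : index
  %c20 = arith.constant 20 : index
  // Split the flat counter into (batch-tile, k-tile) indices.
  %ids:2 = affine.delinearize_index %iv into (%c1, %c20) : index, index
  // Scale the k-tile index by the K tile size to get an element offset.
  %k_off = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%ids#1]
  return %ids#0, %k_off : index, index
}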
padding reduction dims
candidate: %14 = linalg.batch_matmul {__internal_linalg_transform__ = "workgroup_k_tiled", lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 64, 128, 64]]>} ins(%extracted_slice_1, %extracted_slice_2 : tensor<1x64x64xf16>, tensor<1x64x128xf16>) outs(%extracted_slice_3 : tensor<1x64x128xf16>) -> tensor<1x64x128xf16>
[linalg-padding]: Start rewriteAsPaddedOp : %14 = linalg.batch_matmul {__internal_linalg_transform__ = "workgroup_k_tiled", lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 64, 128, 64]]>} ins(%extracted_slice_1, %extracted_slice_2 : tensor<1x64x64xf16>, tensor<1x64x128xf16>) outs(%extracted_slice_3 : tensor<1x64x128xf16>) -> tensor<1x64x128xf16>
[linalg-padding]: --compute padded size for dim 0
[linalg-padding]: ----dim does not require padding, SKIP
[linalg-padding]: --compute padded size for dim 1
[linalg-padding]: ----dim does not require padding, SKIP
[linalg-padding]: --compute padded size for dim 2
[linalg-padding]: ----new dim size: 64
[linalg-padding]: --SUCCESS, makeComposedPadHighOp with type: tensor<1x64x64xf16>
[linalg-padding]: --compute padded size for dim 0
[linalg-padding]: ----dim does not require padding, SKIP
[linalg-padding]: --compute padded size for dim 1
[linalg-padding]: ----new dim size: 64
[linalg-padding]: --compute padded size for dim 2
[linalg-padding]: ----dim does not require padding, SKIP
[linalg-padding]: --SUCCESS, makeComposedPadHighOp with type: tensor<1x64x128xf16>
[linalg-padding]: --compute padded size for dim 0
[linalg-padding]: ----dim does not require padding, SKIP
[linalg-padding]: --compute padded size for dim 1
[linalg-padding]: ----dim does not require padding, SKIP
[linalg-padding]: --compute padded size for dim 2
[linalg-padding]: ----dim does not require padding, SKIP
[linalg-padding]: --cloned padded op: %15 = linalg.batch_matmul {__internal_linalg_transform__ = "workgroup_k_tiled", lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 64, 128, 64]]>} ins(%padded_5, %padded_7 : tensor<1x64x64xf16>, tensor<1x64x128xf16>) outs(%extracted_slice_3 : tensor<1x64x128xf16>) -> tensor<1x64x128xf16>
// -----// IR Dump After LLVMGPUPromoteMatmulToFitMMA (iree-llvmgpu-promote-matmul-to-fit-mma) //----- //
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16() {
%c20 = arith.constant 20 : index
%c1 = arith.constant 1 : index
%cst = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
%workgroup_id_z = hal.interface.workgroup.id[2] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y]
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%4 = affine.apply affine_map<()[s0] -> (s0 * 128)>()[%workgroup_id_x]
%5 = affine.min affine_map<()[s0] -> (s0 * -64 + 968, 64)>()[%workgroup_id_y]
%6 = flow.dispatch.tensor.load %0, offsets = [%workgroup_id_z, %3, 0], sizes = [1, %5, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>> -> tensor<1x?x1280xf16>
%7 = flow.dispatch.tensor.load %1, offsets = [%workgroup_id_z, 0, %4], sizes = [1, 1280, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>> -> tensor<1x1280x128xf16>
%8 = affine.apply affine_map<()[s0] -> (-s0 + 64)>()[%5]
%9 = tensor.empty() : tensor<1x64x128xf16>
%10 = linalg.fill ins(%cst : f16) outs(%9 : tensor<1x64x128xf16>) -> tensor<1x64x128xf16>
%11 = scf.for %arg0 = %c0 to %c20 step %c1 iter_args(%arg1 = %10) -> (tensor<1x64x128xf16>) {
%12:2 = affine.delinearize_index %arg0 into (%c1, %c20) : index, index
%13 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%12#1]
%extracted_slice_0 = tensor.extract_slice %7[%12#0, %13, 0] [1, 64, 128] [1, 1, 1] : tensor<1x1280x128xf16> to tensor<1x64x128xf16>
%extracted_slice_1 = tensor.extract_slice %arg1[%12#0, 0, 0] [1, 64, 128] [1, 1, 1] : tensor<1x64x128xf16> to tensor<1x64x128xf16>
%extracted_slice_2 = tensor.extract_slice %6[%12#0, 0, %13] [1, %5, 64] [1, 1, 1] : tensor<1x?x1280xf16> to tensor<1x?x64xf16>
%padded = tensor.pad %extracted_slice_2 nofold low[0, 0, 0] high[0, %8, 0] {
^bb0(%arg2: index, %arg3: index, %arg4: index):
tensor.yield %cst : f16
} : tensor<1x?x64xf16> to tensor<1x64x64xf16>
%padded_3 = tensor.pad %extracted_slice_0 nofold low[0, 0, 0] high[0, 0, 0] {
^bb0(%arg2: index, %arg3: index, %arg4: index):
tensor.yield %cst : f16
} : tensor<1x64x128xf16> to tensor<1x64x128xf16>
%14 = linalg.batch_matmul {__internal_linalg_transform__ = "workgroup_k_tiled", lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 64, 128, 64]]>} ins(%padded, %padded_3 : tensor<1x64x64xf16>, tensor<1x64x128xf16>) outs(%extracted_slice_1 : tensor<1x64x128xf16>) -> tensor<1x64x128xf16>
%inserted_slice = tensor.insert_slice %14 into %arg1[%12#0, 0, 0] [1, 64, 128] [1, 1, 1] : tensor<1x64x128xf16> into tensor<1x64x128xf16>
scf.yield %inserted_slice : tensor<1x64x128xf16>
}
%extracted_slice = tensor.extract_slice %11[0, 0, 0] [1, %5, 128] [1, 1, 1] : tensor<1x64x128xf16> to tensor<1x?x128xf16>
flow.dispatch.tensor.store %extracted_slice, %2, offsets = [%workgroup_id_z, %3, %4], sizes = [1, %5, 128], strides = [1, 1, 1] : tensor<1x?x128xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
return
}
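// For context: the second run of the pass pads the K-tiled operands inside the
// serial loop and marks both pads nofold. The RHS pad is zero-sized
// (high[0, 0, 0]) and would otherwise canonicalize away; nofold presumably
// keeps the pad in place so each operand retains a static, tile-sized shape
// through vectorization and can be staged to workgroup memory later (see
// GPUVectorAlloc below). A minimal sketch of a nofold pad (function name
// hypothetical):
func.func @nofold_pad(%src: tensor<1x64x128xf16>) -> tensor<1x64x128xf16> {
  %cst = arith.constant 0.000000e+00 : f16
  // A zero-extent pad that survives canonicalization because of nofold.
  %p = tensor.pad %src nofold low[0, 0, 0] high[0, 0, 0] {
  ^bb0(%i: index, %j: index, %k: index):
    tensor.yield %cst : f16
  } : tensor<1x64x128xf16> to tensor<1x64x128xf16>
  return %p : tensor<1x64x128xf16>
}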
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
module {
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16() {
%c20 = arith.constant 20 : index
%c1 = arith.constant 1 : index
%cst = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
%workgroup_id_z = hal.interface.workgroup.id[2] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y]
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%4 = affine.apply affine_map<()[s0] -> (s0 * 128)>()[%workgroup_id_x]
%5 = affine.min affine_map<()[s0] -> (s0 * -64 + 968, 64)>()[%workgroup_id_y]
%6 = flow.dispatch.tensor.load %0, offsets = [%workgroup_id_z, %3, 0], sizes = [1, %5, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>> -> tensor<1x?x1280xf16>
%7 = flow.dispatch.tensor.load %1, offsets = [%workgroup_id_z, 0, %4], sizes = [1, 1280, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>> -> tensor<1x1280x128xf16>
%8 = affine.apply affine_map<()[s0] -> (-s0 + 64)>()[%5]
%9 = tensor.empty() : tensor<1x64x128xf16>
%10 = linalg.fill ins(%cst : f16) outs(%9 : tensor<1x64x128xf16>) -> tensor<1x64x128xf16>
%11 = scf.for %arg0 = %c0 to %c20 step %c1 iter_args(%arg1 = %10) -> (tensor<1x64x128xf16>) {
%12 = affine.delinearize_index %arg0 into (%c20) : index
%13 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%12]
%extracted_slice_0 = tensor.extract_slice %7[0, %13, 0] [1, 64, 128] [1, 1, 1] : tensor<1x1280x128xf16> to tensor<1x64x128xf16>
%extracted_slice_1 = tensor.extract_slice %6[0, 0, %13] [1, %5, 64] [1, 1, 1] : tensor<1x?x1280xf16> to tensor<1x?x64xf16>
%padded = tensor.pad %extracted_slice_1 nofold low[0, 0, 0] high[0, %8, 0] {
^bb0(%arg2: index, %arg3: index, %arg4: index):
tensor.yield %cst : f16
} : tensor<1x?x64xf16> to tensor<1x64x64xf16>
%padded_2 = tensor.pad %extracted_slice_0 nofold low[0, 0, 0] high[0, 0, 0] {
^bb0(%arg2: index, %arg3: index, %arg4: index):
tensor.yield %cst : f16
} : tensor<1x64x128xf16> to tensor<1x64x128xf16>
%14 = linalg.batch_matmul {__internal_linalg_transform__ = "workgroup_k_tiled", lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 64, 128, 64]]>} ins(%padded, %padded_2 : tensor<1x64x64xf16>, tensor<1x64x128xf16>) outs(%arg1 : tensor<1x64x128xf16>) -> tensor<1x64x128xf16>
scf.yield %14 : tensor<1x64x128xf16>
}
%extracted_slice = tensor.extract_slice %11[0, 0, 0] [1, %5, 128] [1, 1, 1] : tensor<1x64x128xf16> to tensor<1x?x128xf16>
flow.dispatch.tensor.store %extracted_slice, %2, offsets = [%workgroup_id_z, %3, %4], sizes = [1, %5, 128], strides = [1, 1, 1] : tensor<1x?x128xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
return
}
}
// -----// IR Dump After CSE (cse) //----- //
module {
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16() {
%c20 = arith.constant 20 : index
%c1 = arith.constant 1 : index
%cst = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
%workgroup_id_z = hal.interface.workgroup.id[2] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y]
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%4 = affine.apply affine_map<()[s0] -> (s0 * 128)>()[%workgroup_id_x]
%5 = affine.min affine_map<()[s0] -> (s0 * -64 + 968, 64)>()[%workgroup_id_y]
%6 = flow.dispatch.tensor.load %0, offsets = [%workgroup_id_z, %3, 0], sizes = [1, %5, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>> -> tensor<1x?x1280xf16>
%7 = flow.dispatch.tensor.load %1, offsets = [%workgroup_id_z, 0, %4], sizes = [1, 1280, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>> -> tensor<1x1280x128xf16>
%8 = affine.apply affine_map<()[s0] -> (-s0 + 64)>()[%5]
%9 = tensor.empty() : tensor<1x64x128xf16>
%10 = linalg.fill ins(%cst : f16) outs(%9 : tensor<1x64x128xf16>) -> tensor<1x64x128xf16>
%11 = scf.for %arg0 = %c0 to %c20 step %c1 iter_args(%arg1 = %10) -> (tensor<1x64x128xf16>) {
%12 = affine.delinearize_index %arg0 into (%c20) : index
%13 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%12]
%extracted_slice_0 = tensor.extract_slice %7[0, %13, 0] [1, 64, 128] [1, 1, 1] : tensor<1x1280x128xf16> to tensor<1x64x128xf16>
%extracted_slice_1 = tensor.extract_slice %6[0, 0, %13] [1, %5, 64] [1, 1, 1] : tensor<1x?x1280xf16> to tensor<1x?x64xf16>
%padded = tensor.pad %extracted_slice_1 nofold low[0, 0, 0] high[0, %8, 0] {
^bb0(%arg2: index, %arg3: index, %arg4: index):
tensor.yield %cst : f16
} : tensor<1x?x64xf16> to tensor<1x64x64xf16>
%padded_2 = tensor.pad %extracted_slice_0 nofold low[0, 0, 0] high[0, 0, 0] {
^bb0(%arg2: index, %arg3: index, %arg4: index):
tensor.yield %cst : f16
} : tensor<1x64x128xf16> to tensor<1x64x128xf16>
%14 = linalg.batch_matmul {__internal_linalg_transform__ = "workgroup_k_tiled", lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 64, 128, 64]]>} ins(%padded, %padded_2 : tensor<1x64x64xf16>, tensor<1x64x128xf16>) outs(%arg1 : tensor<1x64x128xf16>) -> tensor<1x64x128xf16>
scf.yield %14 : tensor<1x64x128xf16>
}
%extracted_slice = tensor.extract_slice %11[0, 0, 0] [1, %5, 128] [1, 1, 1] : tensor<1x64x128xf16> to tensor<1x?x128xf16>
flow.dispatch.tensor.store %extracted_slice, %2, offsets = [%workgroup_id_z, %3, %4], sizes = [1, %5, 128], strides = [1, 1, 1] : tensor<1x?x128xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
return
}
}
// -----// IR Dump After OptimizeTensorInsertExtractSlices (iree-codegen-optimize-tensor-insert-extract-slices) //----- //
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16() {
%c20 = arith.constant 20 : index
%c1 = arith.constant 1 : index
%cst = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
%workgroup_id_z = hal.interface.workgroup.id[2] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y]
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%4 = affine.apply affine_map<()[s0] -> (s0 * 128)>()[%workgroup_id_x]
%5 = affine.min affine_map<()[s0] -> (s0 * -64 + 968, 64)>()[%workgroup_id_y]
%6 = flow.dispatch.tensor.load %0, offsets = [%workgroup_id_z, %3, 0], sizes = [1, %5, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>> -> tensor<1x?x1280xf16>
%7 = flow.dispatch.tensor.load %1, offsets = [%workgroup_id_z, 0, %4], sizes = [1, 1280, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>> -> tensor<1x1280x128xf16>
%8 = affine.apply affine_map<()[s0] -> (-s0 + 64)>()[%5]
%9 = tensor.empty() : tensor<1x64x128xf16>
%10 = linalg.fill ins(%cst : f16) outs(%9 : tensor<1x64x128xf16>) -> tensor<1x64x128xf16>
%11 = scf.for %arg0 = %c0 to %c20 step %c1 iter_args(%arg1 = %10) -> (tensor<1x64x128xf16>) {
%12 = affine.delinearize_index %arg0 into (%c20) : index
%13 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%12]
%extracted_slice_0 = tensor.extract_slice %7[0, %13, 0] [1, 64, 128] [1, 1, 1] : tensor<1x1280x128xf16> to tensor<1x64x128xf16>
%extracted_slice_1 = tensor.extract_slice %6[0, 0, %13] [1, %5, 64] [1, 1, 1] : tensor<1x?x1280xf16> to tensor<1x?x64xf16>
%padded = tensor.pad %extracted_slice_1 nofold low[0, 0, 0] high[0, %8, 0] {
^bb0(%arg2: index, %arg3: index, %arg4: index):
tensor.yield %cst : f16
} : tensor<1x?x64xf16> to tensor<1x64x64xf16>
%padded_2 = tensor.pad %extracted_slice_0 nofold low[0, 0, 0] high[0, 0, 0] {
^bb0(%arg2: index, %arg3: index, %arg4: index):
tensor.yield %cst : f16
} : tensor<1x64x128xf16> to tensor<1x64x128xf16>
%14 = linalg.batch_matmul {__internal_linalg_transform__ = "workgroup_k_tiled", lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 64, 128, 64]]>} ins(%padded, %padded_2 : tensor<1x64x64xf16>, tensor<1x64x128xf16>) outs(%arg1 : tensor<1x64x128xf16>) -> tensor<1x64x128xf16>
scf.yield %14 : tensor<1x64x128xf16>
}
%extracted_slice = tensor.extract_slice %11[0, 0, 0] [1, %5, 128] [1, 1, 1] : tensor<1x64x128xf16> to tensor<1x?x128xf16>
flow.dispatch.tensor.store %extracted_slice, %2, offsets = [%workgroup_id_z, %3, %4], sizes = [1, %5, 128], strides = [1, 1, 1] : tensor<1x?x128xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
return
}
// -----// IR Dump After DecomposeConvolutionToLowerDimOps (iree-codegen-decompose-convolution-to-lower-dim-ops) //----- //
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16() {
%c20 = arith.constant 20 : index
%c1 = arith.constant 1 : index
%cst = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
%workgroup_id_z = hal.interface.workgroup.id[2] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y]
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%4 = affine.apply affine_map<()[s0] -> (s0 * 128)>()[%workgroup_id_x]
%5 = affine.min affine_map<()[s0] -> (s0 * -64 + 968, 64)>()[%workgroup_id_y]
%6 = flow.dispatch.tensor.load %0, offsets = [%workgroup_id_z, %3, 0], sizes = [1, %5, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>> -> tensor<1x?x1280xf16>
%7 = flow.dispatch.tensor.load %1, offsets = [%workgroup_id_z, 0, %4], sizes = [1, 1280, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>> -> tensor<1x1280x128xf16>
%8 = affine.apply affine_map<()[s0] -> (-s0 + 64)>()[%5]
%9 = tensor.empty() : tensor<1x64x128xf16>
%10 = linalg.fill ins(%cst : f16) outs(%9 : tensor<1x64x128xf16>) -> tensor<1x64x128xf16>
%11 = scf.for %arg0 = %c0 to %c20 step %c1 iter_args(%arg1 = %10) -> (tensor<1x64x128xf16>) {
%12 = affine.delinearize_index %arg0 into (%c20) : index
%13 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%12]
%extracted_slice_0 = tensor.extract_slice %7[0, %13, 0] [1, 64, 128] [1, 1, 1] : tensor<1x1280x128xf16> to tensor<1x64x128xf16>
%extracted_slice_1 = tensor.extract_slice %6[0, 0, %13] [1, %5, 64] [1, 1, 1] : tensor<1x?x1280xf16> to tensor<1x?x64xf16>
%padded = tensor.pad %extracted_slice_1 nofold low[0, 0, 0] high[0, %8, 0] {
^bb0(%arg2: index, %arg3: index, %arg4: index):
tensor.yield %cst : f16
} : tensor<1x?x64xf16> to tensor<1x64x64xf16>
%padded_2 = tensor.pad %extracted_slice_0 nofold low[0, 0, 0] high[0, 0, 0] {
^bb0(%arg2: index, %arg3: index, %arg4: index):
tensor.yield %cst : f16
} : tensor<1x64x128xf16> to tensor<1x64x128xf16>
%14 = linalg.batch_matmul {__internal_linalg_transform__ = "workgroup_k_tiled", lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 64, 128, 64]]>} ins(%padded, %padded_2 : tensor<1x64x64xf16>, tensor<1x64x128xf16>) outs(%arg1 : tensor<1x64x128xf16>) -> tensor<1x64x128xf16>
scf.yield %14 : tensor<1x64x128xf16>
}
%extracted_slice = tensor.extract_slice %11[0, 0, 0] [1, %5, 128] [1, 1, 1] : tensor<1x64x128xf16> to tensor<1x?x128xf16>
flow.dispatch.tensor.store %extracted_slice, %2, offsets = [%workgroup_id_z, %3, %4], sizes = [1, %5, 128], strides = [1, 1, 1] : tensor<1x?x128xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
return
}
// -----// IR Dump After GenericVectorization (iree-codegen-generic-vectorization) //----- //
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16() {
%cst = arith.constant dense<0.000000e+00> : vector<1x64x128xf16>
%c20 = arith.constant 20 : index
%c1 = arith.constant 1 : index
%cst_0 = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
%workgroup_id_z = hal.interface.workgroup.id[2] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y]
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%4 = affine.apply affine_map<()[s0] -> (s0 * 128)>()[%workgroup_id_x]
%5 = affine.min affine_map<()[s0] -> (s0 * -64 + 968, 64)>()[%workgroup_id_y]
%6 = flow.dispatch.tensor.load %0, offsets = [%workgroup_id_z, %3, 0], sizes = [1, %5, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>> -> tensor<1x?x1280xf16>
%7 = flow.dispatch.tensor.load %1, offsets = [%workgroup_id_z, 0, %4], sizes = [1, 1280, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>> -> tensor<1x1280x128xf16>
%8 = tensor.empty() : tensor<1x64x128xf16>
%9 = vector.transfer_write %cst, %8[%c0, %c0, %c0] {in_bounds = [true, true, true]} : vector<1x64x128xf16>, tensor<1x64x128xf16>
%10 = scf.for %arg0 = %c0 to %c20 step %c1 iter_args(%arg1 = %9) -> (tensor<1x64x128xf16>) {
%11 = affine.delinearize_index %arg0 into (%c20) : index
%12 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%11]
%extracted_slice_1 = tensor.extract_slice %7[0, %12, 0] [1, 64, 128] [1, 1, 1] : tensor<1x1280x128xf16> to tensor<1x64x128xf16>
%extracted_slice_2 = tensor.extract_slice %6[0, 0, %12] [1, %5, 64] [1, 1, 1] : tensor<1x?x1280xf16> to tensor<1x?x64xf16>
%13 = vector.transfer_read %extracted_slice_2[%c0, %c0, %c0], %cst_0 {in_bounds = [true, false, true]} : tensor<1x?x64xf16>, vector<1x64x64xf16>
%14 = vector.transfer_read %extracted_slice_1[%c0, %c0, %c0], %cst_0 {in_bounds = [true, true, true]} : tensor<1x64x128xf16>, vector<1x64x128xf16>
%15 = vector.transfer_read %arg1[%c0, %c0, %c0], %cst_0 {in_bounds = [true, true, true]} : tensor<1x64x128xf16>, vector<1x64x128xf16>
%16 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %13, %14, %15 : vector<1x64x64xf16>, vector<1x64x128xf16> into vector<1x64x128xf16>
%17 = vector.transfer_write %16, %arg1[%c0, %c0, %c0] {in_bounds = [true, true, true]} : vector<1x64x128xf16>, tensor<1x64x128xf16>
scf.yield %17 : tensor<1x64x128xf16>
}
%extracted_slice = tensor.extract_slice %10[0, 0, 0] [1, %5, 128] [1, 1, 1] : tensor<1x64x128xf16> to tensor<1x?x128xf16>
flow.dispatch.tensor.store %extracted_slice, %2, offsets = [%workgroup_id_z, %3, %4], sizes = [1, %5, 128], strides = [1, 1, 1] : tensor<1x?x128xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
return
}
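// For context: GenericVectorization rewrites the padded batch_matmul into a
// vector.contract with batch-matmul indexing maps, and folds the tensor.pad
// ops into the vector transfers: the dynamic M dimension of the LHS is read
// with in_bounds = [true, false, true], and out-of-bounds lanes take the
// padding value %cst_0, so no explicit pad op survives. A minimal sketch of
// such a padding read (function name hypothetical):
func.func @padded_read(%src: tensor<1x?x64xf16>) -> vector<1x64x64xf16> {
  %c0 = arith.constant 0 : index
  %pad = arith.constant 0.000000e+00 : f16
  // Dim 1 may be shorter than 64; out-of-bounds lanes read %pad.
  %v = vector.transfer_read %src[%c0, %c0, %c0], %pad {in_bounds = [true, false, true]} : tensor<1x?x64xf16>, vector<1x64x64xf16>
  return %v : vector<1x64x64xf16>
}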
// -----// IR Dump After OptimizeTensorInsertExtractSlices (iree-codegen-optimize-tensor-insert-extract-slices) //----- //
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16() {
%cst = arith.constant dense<0.000000e+00> : vector<1x64x128xf16>
%c20 = arith.constant 20 : index
%c1 = arith.constant 1 : index
%cst_0 = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
%workgroup_id_z = hal.interface.workgroup.id[2] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y]
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%4 = affine.apply affine_map<()[s0] -> (s0 * 128)>()[%workgroup_id_x]
%5 = affine.min affine_map<()[s0] -> (s0 * -64 + 968, 64)>()[%workgroup_id_y]
%6 = flow.dispatch.tensor.load %0, offsets = [%workgroup_id_z, %3, 0], sizes = [1, %5, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>> -> tensor<1x?x1280xf16>
%7 = flow.dispatch.tensor.load %1, offsets = [%workgroup_id_z, 0, %4], sizes = [1, 1280, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>> -> tensor<1x1280x128xf16>
%8 = tensor.empty() : tensor<1x64x128xf16>
%9 = vector.transfer_write %cst, %8[%c0, %c0, %c0] {in_bounds = [true, true, true]} : vector<1x64x128xf16>, tensor<1x64x128xf16>
%10:2 = scf.for %arg0 = %c0 to %c20 step %c1 iter_args(%arg1 = %9, %arg2 = %cst) -> (tensor<1x64x128xf16>, vector<1x64x128xf16>) {
%12 = affine.delinearize_index %arg0 into (%c20) : index
%13 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%12]
%extracted_slice_1 = tensor.extract_slice %6[0, 0, %13] [1, %5, 64] [1, 1, 1] : tensor<1x?x1280xf16> to tensor<1x?x64xf16>
%14 = vector.transfer_read %extracted_slice_1[%c0, %c0, %c0], %cst_0 {in_bounds = [true, false, true]} : tensor<1x?x64xf16>, vector<1x64x64xf16>
%15 = vector.transfer_read %7[%c0, %13, %c0], %cst_0 {in_bounds = [true, true, true]} : tensor<1x1280x128xf16>, vector<1x64x128xf16>
%16 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %14, %15, %arg2 : vector<1x64x64xf16>, vector<1x64x128xf16> into vector<1x64x128xf16>
scf.yield %arg1, %16 : tensor<1x64x128xf16>, vector<1x64x128xf16>
}
%11 = vector.transfer_write %10#1, %10#0[%c0, %c0, %c0] {in_bounds = [true, true, true]} : vector<1x64x128xf16>, tensor<1x64x128xf16>
%extracted_slice = tensor.extract_slice %11[0, 0, 0] [1, %5, 128] [1, 1, 1] : tensor<1x64x128xf16> to tensor<1x?x128xf16>
flow.dispatch.tensor.store %extracted_slice, %2, offsets = [%workgroup_id_z, %3, %4], sizes = [1, %5, 128], strides = [1, 1, 1] : tensor<1x?x128xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
return
}
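// For context: OptimizeTensorInsertExtractSlices hoists the loop-invariant
// accumulator transfers out of the scf.for: the accumulator now travels as a
// vector iter_arg and is written back to the tensor once, after the loop (the
// dead tensor iter_arg is then removed by the canonicalizer below). A minimal
// sketch of the hoisted form (function name hypothetical):
func.func @hoisted_acc(%init: tensor<1x64x128xf16>) -> tensor<1x64x128xf16> {
  %c0 = arith.constant 0 : index
  %c1 = arith.constant 1 : index
  %c20 = arith.constant 20 : index
  %zero = arith.constant dense<0.000000e+00> : vector<1x64x128xf16>
  // The accumulator stays in registers across all 20 K-tile iterations.
  %acc = scf.for %i = %c0 to %c20 step %c1 iter_args(%a = %zero) -> (vector<1x64x128xf16>) {
    // ... the body would accumulate into %a via vector.contract ...
    scf.yield %a : vector<1x64x128xf16>
  }
  // Single write-back after the loop instead of one per iteration.
  %out = vector.transfer_write %acc, %init[%c0, %c0, %c0] {in_bounds = [true, true, true]} : vector<1x64x128xf16>, tensor<1x64x128xf16>
  return %out : tensor<1x64x128xf16>
}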
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16() {
%cst = arith.constant dense<0.000000e+00> : vector<1x64x128xf16>
%c20 = arith.constant 20 : index
%c1 = arith.constant 1 : index
%cst_0 = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
%workgroup_id_z = hal.interface.workgroup.id[2] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y]
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%4 = affine.apply affine_map<()[s0] -> (s0 * 128)>()[%workgroup_id_x]
%5 = affine.min affine_map<()[s0] -> (s0 * -64 + 968, 64)>()[%workgroup_id_y]
%6 = flow.dispatch.tensor.load %0, offsets = [%workgroup_id_z, %3, 0], sizes = [1, %5, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>> -> tensor<1x?x1280xf16>
%7 = flow.dispatch.tensor.load %1, offsets = [%workgroup_id_z, 0, %4], sizes = [1, 1280, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>> -> tensor<1x1280x128xf16>
%8 = tensor.empty() : tensor<1x64x128xf16>
%9 = scf.for %arg0 = %c0 to %c20 step %c1 iter_args(%arg1 = %cst) -> (vector<1x64x128xf16>) {
%11 = affine.delinearize_index %arg0 into (%c20) : index
%12 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%11]
%extracted_slice_1 = tensor.extract_slice %6[0, 0, %12] [1, %5, 64] [1, 1, 1] : tensor<1x?x1280xf16> to tensor<1x?x64xf16>
%13 = vector.transfer_read %extracted_slice_1[%c0, %c0, %c0], %cst_0 {in_bounds = [true, false, true]} : tensor<1x?x64xf16>, vector<1x64x64xf16>
%14 = vector.transfer_read %7[%c0, %12, %c0], %cst_0 {in_bounds = [true, true, true]} : tensor<1x1280x128xf16>, vector<1x64x128xf16>
%15 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %13, %14, %arg1 : vector<1x64x64xf16>, vector<1x64x128xf16> into vector<1x64x128xf16>
scf.yield %15 : vector<1x64x128xf16>
}
%10 = vector.transfer_write %9, %8[%c0, %c0, %c0] {in_bounds = [true, true, true]} : vector<1x64x128xf16>, tensor<1x64x128xf16>
%extracted_slice = tensor.extract_slice %10[0, 0, 0] [1, %5, 128] [1, 1, 1] : tensor<1x64x128xf16> to tensor<1x?x128xf16>
flow.dispatch.tensor.store %extracted_slice, %2, offsets = [%workgroup_id_z, %3, %4], sizes = [1, %5, 128], strides = [1, 1, 1] : tensor<1x?x128xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
return
}
// -----// IR Dump After CSE (cse) //----- //
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16() {
%cst = arith.constant dense<0.000000e+00> : vector<1x64x128xf16>
%c20 = arith.constant 20 : index
%c1 = arith.constant 1 : index
%cst_0 = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
%workgroup_id_z = hal.interface.workgroup.id[2] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y]
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%4 = affine.apply affine_map<()[s0] -> (s0 * 128)>()[%workgroup_id_x]
%5 = affine.min affine_map<()[s0] -> (s0 * -64 + 968, 64)>()[%workgroup_id_y]
%6 = flow.dispatch.tensor.load %0, offsets = [%workgroup_id_z, %3, 0], sizes = [1, %5, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>> -> tensor<1x?x1280xf16>
%7 = flow.dispatch.tensor.load %1, offsets = [%workgroup_id_z, 0, %4], sizes = [1, 1280, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>> -> tensor<1x1280x128xf16>
%8 = tensor.empty() : tensor<1x64x128xf16>
%9 = scf.for %arg0 = %c0 to %c20 step %c1 iter_args(%arg1 = %cst) -> (vector<1x64x128xf16>) {
%11 = affine.delinearize_index %arg0 into (%c20) : index
%12 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%11]
%extracted_slice_1 = tensor.extract_slice %6[0, 0, %12] [1, %5, 64] [1, 1, 1] : tensor<1x?x1280xf16> to tensor<1x?x64xf16>
%13 = vector.transfer_read %extracted_slice_1[%c0, %c0, %c0], %cst_0 {in_bounds = [true, false, true]} : tensor<1x?x64xf16>, vector<1x64x64xf16>
%14 = vector.transfer_read %7[%c0, %12, %c0], %cst_0 {in_bounds = [true, true, true]} : tensor<1x1280x128xf16>, vector<1x64x128xf16>
%15 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %13, %14, %arg1 : vector<1x64x64xf16>, vector<1x64x128xf16> into vector<1x64x128xf16>
scf.yield %15 : vector<1x64x128xf16>
}
%10 = vector.transfer_write %9, %8[%c0, %c0, %c0] {in_bounds = [true, true, true]} : vector<1x64x128xf16>, tensor<1x64x128xf16>
%extracted_slice = tensor.extract_slice %10[0, 0, 0] [1, %5, 128] [1, 1, 1] : tensor<1x64x128xf16> to tensor<1x?x128xf16>
flow.dispatch.tensor.store %extracted_slice, %2, offsets = [%workgroup_id_z, %3, %4], sizes = [1, %5, 128], strides = [1, 1, 1] : tensor<1x?x128xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
return
}
// -----// IR Dump After LLVMGPUFoldExtractSliceIntoXferWrite (iree-llvmgpu-fold-extract-slice-into-xfer-write) //----- //
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16() {
%cst = arith.constant dense<0.000000e+00> : vector<1x64x128xf16>
%c20 = arith.constant 20 : index
%c1 = arith.constant 1 : index
%cst_0 = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
%workgroup_id_z = hal.interface.workgroup.id[2] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y]
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%4 = affine.apply affine_map<()[s0] -> (s0 * 128)>()[%workgroup_id_x]
%5 = affine.min affine_map<()[s0] -> (s0 * -64 + 968, 64)>()[%workgroup_id_y]
%6 = flow.dispatch.tensor.load %0, offsets = [%workgroup_id_z, %3, 0], sizes = [1, %5, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>> -> tensor<1x?x1280xf16>
%7 = flow.dispatch.tensor.load %1, offsets = [%workgroup_id_z, 0, %4], sizes = [1, 1280, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>> -> tensor<1x1280x128xf16>
%8 = scf.for %arg0 = %c0 to %c20 step %c1 iter_args(%arg1 = %cst) -> (vector<1x64x128xf16>) {
%11 = affine.delinearize_index %arg0 into (%c20) : index
%12 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%11]
%extracted_slice = tensor.extract_slice %6[0, 0, %12] [1, %5, 64] [1, 1, 1] : tensor<1x?x1280xf16> to tensor<1x?x64xf16>
%13 = vector.transfer_read %extracted_slice[%c0, %c0, %c0], %cst_0 {in_bounds = [true, false, true]} : tensor<1x?x64xf16>, vector<1x64x64xf16>
%14 = vector.transfer_read %7[%c0, %12, %c0], %cst_0 {in_bounds = [true, true, true]} : tensor<1x1280x128xf16>, vector<1x64x128xf16>
%15 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %13, %14, %arg1 : vector<1x64x64xf16>, vector<1x64x128xf16> into vector<1x64x128xf16>
scf.yield %15 : vector<1x64x128xf16>
}
%9 = tensor.empty(%5) : tensor<1x?x128xf16>
%10 = vector.transfer_write %8, %9[%c0, %c0, %c0] {in_bounds = [true, false, true]} : vector<1x64x128xf16>, tensor<1x?x128xf16>
flow.dispatch.tensor.store %10, %2, offsets = [%workgroup_id_z, %3, %4], sizes = [1, %5, 128], strides = [1, 1, 1] : tensor<1x?x128xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
return
}
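// For context: LLVMGPUFoldExtractSliceIntoXferWrite folds the trailing
// tensor.extract_slice into the vector.transfer_write itself: the destination
// tensor is now sized by the dynamic extent %5 and the M dimension is marked
// out-of-bounds (in_bounds = [true, false, true]), so vector lanes past %5 are
// simply not stored. A minimal sketch of the folded write (function name
// hypothetical):
func.func @folded_write(%v: vector<1x64x128xf16>, %m: index) -> tensor<1x?x128xf16> {
  %c0 = arith.constant 0 : index
  // Destination already has the sliced (dynamic) M extent.
  %dest = tensor.empty(%m) : tensor<1x?x128xf16>
  // Lanes beyond %m along dim 1 are dropped instead of sliced afterwards.
  %w = vector.transfer_write %v, %dest[%c0, %c0, %c0] {in_bounds = [true, false, true]} : vector<1x64x128xf16>, tensor<1x?x128xf16>
  return %w : tensor<1x?x128xf16>
}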
// -----// IR Dump After GPUVectorAlloc (iree-codegen-gpu-vector-alloc) //----- //
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16() {
%cst = arith.constant dense<0.000000e+00> : vector<1x64x128xf16>
%c20 = arith.constant 20 : index
%c1 = arith.constant 1 : index
%cst_0 = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
%workgroup_id_z = hal.interface.workgroup.id[2] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y]
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%4 = affine.apply affine_map<()[s0] -> (s0 * 128)>()[%workgroup_id_x]
%5 = affine.min affine_map<()[s0] -> (s0 * -64 + 968, 64)>()[%workgroup_id_y]
%6 = flow.dispatch.tensor.load %0, offsets = [%workgroup_id_z, %3, 0], sizes = [1, %5, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>> -> tensor<1x?x1280xf16>
%7 = flow.dispatch.tensor.load %1, offsets = [%workgroup_id_z, 0, %4], sizes = [1, 1280, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>> -> tensor<1x1280x128xf16>
%8 = scf.for %arg0 = %c0 to %c20 step %c1 iter_args(%arg1 = %cst) -> (vector<1x64x128xf16>) {
%11 = affine.delinearize_index %arg0 into (%c20) : index
%12 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%11]
%extracted_slice = tensor.extract_slice %6[0, 0, %12] [1, %5, 64] [1, 1, 1] : tensor<1x?x1280xf16> to tensor<1x?x64xf16>
%13 = vector.transfer_read %extracted_slice[%c0, %c0, %c0], %cst_0 {in_bounds = [true, false, true]} : tensor<1x?x64xf16>, vector<1x64x64xf16>
%14 = vector.transfer_read %7[%c0, %12, %c0], %cst_0 {in_bounds = [true, true, true]} : tensor<1x1280x128xf16>, vector<1x64x128xf16>
gpu.barrier
%15 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<1x64x64xf16, #gpu.address_space<workgroup>>
%c0_1 = arith.constant 0 : index
%16 = vector.transfer_write %13, %15[%c0_1, %c0_1, %c0_1] {in_bounds = [true, true, true]} : vector<1x64x64xf16>, tensor<1x64x64xf16, #gpu.address_space<workgroup>>
%17 = bufferization.materialize_in_destination %16 in %16 : (tensor<1x64x64xf16, #gpu.address_space<workgroup>>, tensor<1x64x64xf16, #gpu.address_space<workgroup>>) -> tensor<1x64x64xf16, #gpu.address_space<workgroup>>
%18 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<1x64x128xf16, #gpu.address_space<workgroup>>
%c0_2 = arith.constant 0 : index
%19 = vector.transfer_write %14, %18[%c0_2, %c0_2, %c0_2] {in_bounds = [true, true, true]} : vector<1x64x128xf16>, tensor<1x64x128xf16, #gpu.address_space<workgroup>>
%20 = bufferization.materialize_in_destination %19 in %19 : (tensor<1x64x128xf16, #gpu.address_space<workgroup>>, tensor<1x64x128xf16, #gpu.address_space<workgroup>>) -> tensor<1x64x128xf16, #gpu.address_space<workgroup>>
gpu.barrier
%c0_3 = arith.constant 0 : index
%cst_4 = arith.constant 0.000000e+00 : f16
%21 = vector.transfer_read %17[%c0_3, %c0_3, %c0_3], %cst_4 {in_bounds = [true, true, true]} : tensor<1x64x64xf16, #gpu.address_space<workgroup>>, vector<1x64x64xf16>
%c0_5 = arith.constant 0 : index
%cst_6 = arith.constant 0.000000e+00 : f16
%22 = vector.transfer_read %20[%c0_5, %c0_5, %c0_5], %cst_6 {in_bounds = [true, true, true]} : tensor<1x64x128xf16, #gpu.address_space<workgroup>>, vector<1x64x128xf16>
%23 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %21, %22, %arg1 : vector<1x64x64xf16>, vector<1x64x128xf16> into vector<1x64x128xf16>
scf.yield %23 : vector<1x64x128xf16>
}
%9 = tensor.empty(%5) : tensor<1x?x128xf16>
%10 = vector.transfer_write %8, %9[%c0, %c0, %c0] {in_bounds = [true, false, true]} : vector<1x64x128xf16>, tensor<1x?x128xf16>
flow.dispatch.tensor.store %10, %2, offsets = [%workgroup_id_z, %3, %4], sizes = [1, %5, 128], strides = [1, 1, 1] : tensor<1x?x128xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
return
}
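// For context: GPUVectorAlloc stages both contraction operands through
// workgroup (shared) memory: each operand is written into a
// bufferization.alloc_tensor placed in #gpu.address_space<workgroup> and read
// back, with gpu.barrier ops bracketing the copy so every thread observes the
// staged data before the contraction. A minimal sketch of staging one operand
// (function name hypothetical):
func.func @stage_operand(%v: vector<1x64x64xf16>) -> vector<1x64x64xf16> {
  %c0 = arith.constant 0 : index
  %cst = arith.constant 0.000000e+00 : f16
  // Tensor-level placeholder for a shared-memory buffer.
  %buf = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<1x64x64xf16, #gpu.address_space<workgroup>>
  %w = vector.transfer_write %v, %buf[%c0, %c0, %c0] {in_bounds = [true, true, true]} : vector<1x64x64xf16>, tensor<1x64x64xf16, #gpu.address_space<workgroup>>
  // Read back from workgroup memory for the contraction.
  %r = vector.transfer_read %w[%c0, %c0, %c0], %cst {in_bounds = [true, true, true]} : tensor<1x64x64xf16, #gpu.address_space<workgroup>>, vector<1x64x64xf16>
  return %r : vector<1x64x64xf16>
}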
// -----// IR Dump After EliminateEmptyTensors (iree-eliminate-empty-tensors) //----- //
module {
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16() {
%cst = arith.constant dense<0.000000e+00> : vector<1x64x128xf16>
%c20 = arith.constant 20 : index
%c1 = arith.constant 1 : index
%cst_0 = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
%workgroup_id_z = hal.interface.workgroup.id[2] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y]
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%4 = affine.apply affine_map<()[s0] -> (s0 * 128)>()[%workgroup_id_x]
%5 = affine.min affine_map<()[s0] -> (s0 * -64 + 968, 64)>()[%workgroup_id_y]
%6 = flow.dispatch.tensor.load %0, offsets = [%workgroup_id_z, %3, 0], sizes = [1, %5, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>> -> tensor<1x?x1280xf16>
%7 = flow.dispatch.tensor.load %1, offsets = [%workgroup_id_z, 0, %4], sizes = [1, 1280, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>> -> tensor<1x1280x128xf16>
%8 = scf.for %arg0 = %c0 to %c20 step %c1 iter_args(%arg1 = %cst) -> (vector<1x64x128xf16>) {
%11 = affine.delinearize_index %arg0 into (%c20) : index
%12 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%11]
%extracted_slice = tensor.extract_slice %6[0, 0, %12] [1, %5, 64] [1, 1, 1] : tensor<1x?x1280xf16> to tensor<1x?x64xf16>
%13 = vector.transfer_read %extracted_slice[%c0, %c0, %c0], %cst_0 {in_bounds = [true, false, true]} : tensor<1x?x64xf16>, vector<1x64x64xf16>
%14 = vector.transfer_read %7[%c0, %12, %c0], %cst_0 {in_bounds = [true, true, true]} : tensor<1x1280x128xf16>, vector<1x64x128xf16>
gpu.barrier
%15 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<1x64x64xf16, #gpu.address_space<workgroup>>
%16 = vector.transfer_write %13, %15[%c0, %c0, %c0] {in_bounds = [true, true, true]} : vector<1x64x64xf16>, tensor<1x64x64xf16, #gpu.address_space<workgroup>>
%17 = bufferization.materialize_in_destination %16 in %16 : (tensor<1x64x64xf16, #gpu.address_space<workgroup>>, tensor<1x64x64xf16, #gpu.address_space<workgroup>>) -> tensor<1x64x64xf16, #gpu.address_space<workgroup>>
%18 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<1x64x128xf16, #gpu.address_space<workgroup>>
%19 = vector.transfer_write %14, %18[%c0, %c0, %c0] {in_bounds = [true, true, true]} : vector<1x64x128xf16>, tensor<1x64x128xf16, #gpu.address_space<workgroup>>
%20 = bufferization.materialize_in_destination %19 in %19 : (tensor<1x64x128xf16, #gpu.address_space<workgroup>>, tensor<1x64x128xf16, #gpu.address_space<workgroup>>) -> tensor<1x64x128xf16, #gpu.address_space<workgroup>>
gpu.barrier
%21 = vector.transfer_read %17[%c0, %c0, %c0], %cst_0 {in_bounds = [true, true, true]} : tensor<1x64x64xf16, #gpu.address_space<workgroup>>, vector<1x64x64xf16>
%22 = vector.transfer_read %20[%c0, %c0, %c0], %cst_0 {in_bounds = [true, true, true]} : tensor<1x64x128xf16, #gpu.address_space<workgroup>>, vector<1x64x128xf16>
%23 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %21, %22, %arg1 : vector<1x64x64xf16>, vector<1x64x128xf16> into vector<1x64x128xf16>
scf.yield %23 : vector<1x64x128xf16>
}
%9 = flow.dispatch.tensor.load %2, offsets = [%workgroup_id_z, %3, %4], sizes = [1, %5, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>> -> tensor<1x?x128xf16>
%10 = vector.transfer_write %8, %9[%c0, %c0, %c0] {in_bounds = [true, false, true]} : vector<1x64x128xf16>, tensor<1x?x128xf16>
flow.dispatch.tensor.store %10, %2, offsets = [%workgroup_id_z, %3, %4], sizes = [1, %5, 128], strides = [1, 1, 1] : tensor<1x?x128xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
return
}
}
// -----// IR Dump After EmptyTensorToAllocTensor (empty-tensor-to-alloc-tensor) //----- //
module {
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16() {
%cst = arith.constant dense<0.000000e+00> : vector<1x64x128xf16>
%c20 = arith.constant 20 : index
%c1 = arith.constant 1 : index
%cst_0 = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
%workgroup_id_z = hal.interface.workgroup.id[2] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y]
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%4 = affine.apply affine_map<()[s0] -> (s0 * 128)>()[%workgroup_id_x]
%5 = affine.min affine_map<()[s0] -> (s0 * -64 + 968, 64)>()[%workgroup_id_y]
%6 = flow.dispatch.tensor.load %0, offsets = [%workgroup_id_z, %3, 0], sizes = [1, %5, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x968x1280xf16>> -> tensor<1x?x1280xf16>
%7 = flow.dispatch.tensor.load %1, offsets = [%workgroup_id_z, 0, %4], sizes = [1, 1280, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>> -> tensor<1x1280x128xf16>
%8 = scf.for %arg0 = %c0 to %c20 step %c1 iter_args(%arg1 = %cst) -> (vector<1x64x128xf16>) {
%11 = affine.delinearize_index %arg0 into (%c20) : index
%12 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%11]
%extracted_slice = tensor.extract_slice %6[0, 0, %12] [1, %5, 64] [1, 1, 1] : tensor<1x?x1280xf16> to tensor<1x?x64xf16>
%13 = vector.transfer_read %extracted_slice[%c0, %c0, %c0], %cst_0 {in_bounds = [true, false, true]} : tensor<1x?x64xf16>, vector<1x64x64xf16>
%14 = vector.transfer_read %7[%c0, %12, %c0], %cst_0 {in_bounds = [true, true, true]} : tensor<1x1280x128xf16>, vector<1x64x128xf16>
gpu.barrier
%15 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<1x64x64xf16, #gpu.address_space<workgroup>>
%16 = vector.transfer_write %13, %15[%c0, %c0, %c0] {in_bounds = [true, true, true]} : vector<1x64x64xf16>, tensor<1x64x64xf16, #gpu.address_space<workgroup>>
%17 = bufferization.materialize_in_destination %16 in %16 : (tensor<1x64x64xf16, #gpu.address_space<workgroup>>, tensor<1x64x64xf16, #gpu.address_space<workgroup>>) -> tensor<1x64x64xf16, #gpu.address_space<workgroup>>
%18 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<1x64x128xf16, #gpu.address_space<workgroup>>
%19 = vector.transfer_write %14, %18[%c0, %c0, %c0] {in_bounds = [true, true, true]} : vector<1x64x128xf16>, tensor<1x64x128xf16, #gpu.address_space<workgroup>>
%20 = bufferization.materialize_in_destination %19 in %19 : (tensor<1x64x128xf16, #gpu.address_space<workgroup>>, tensor<1x64x128xf16, #gpu.address_space<workgroup>>) -> tensor<1x64x128xf16, #gpu.address_space<workgroup>>
gpu.barrier
%21 = vector.transfer_read %17[%c0, %c0, %c0], %cst_0 {in_bounds = [true, true, true]} : tensor<1x64x64xf16, #gpu.address_space<workgroup>>, vector<1x64x64xf16>
%22 = vector.transfer_read %20[%c0, %c0, %c0], %cst_0 {in_bounds = [true, true, true]} : tensor<1x64x128xf16, #gpu.address_space<workgroup>>, vector<1x64x128xf16>
%23 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %21, %22, %arg1 : vector<1x64x64xf16>, vector<1x64x128xf16> into vector<1x64x128xf16>
scf.yield %23 : vector<1x64x128xf16>
}
%9 = flow.dispatch.tensor.load %2, offsets = [%workgroup_id_z, %3, %4], sizes = [1, %5, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>> -> tensor<1x?x128xf16>
%10 = vector.transfer_write %8, %9[%c0, %c0, %c0] {in_bounds = [true, false, true]} : vector<1x64x128xf16>, tensor<1x?x128xf16>
flow.dispatch.tensor.store %10, %2, offsets = [%workgroup_id_z, %3, %4], sizes = [1, %5, 128], strides = [1, 1, 1] : tensor<1x?x128xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x968x1280xf16>>
return
}
}
// -----// IR Dump After IREEComprehensiveBufferize (iree-codegen-iree-comprehensive-bufferize) //----- //
module {
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16() {
%cst = arith.constant dense<0.000000e+00> : vector<1x64x128xf16>
%c20 = arith.constant 20 : index
%c1 = arith.constant 1 : index
%cst_0 = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<64x968x1280xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 64 : memref<64x968x1280xf16, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<64x1280x1280xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 64 : memref<64x1280x1280xf16, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<64x968x1280xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 64 : memref<64x968x1280xf16, #hal.descriptor_type<storage_buffer>>
%workgroup_id_z = hal.interface.workgroup.id[2] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y]
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%4 = affine.apply affine_map<()[s0] -> (s0 * 128)>()[%workgroup_id_x]
%5 = affine.min affine_map<()[s0] -> (s0 * -64 + 968, 64)>()[%workgroup_id_y]
%subview = memref.subview %0[%workgroup_id_z, %3, 0] [1, %5, 1280] [1, 1, 1] : memref<64x968x1280xf16, #hal.descriptor_type<storage_buffer>> to memref<1x?x1280xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_1 = memref.subview %1[%workgroup_id_z, 0, %4] [1, 1280, 128] [1, 1, 1] : memref<64x1280x1280xf16, #hal.descriptor_type<storage_buffer>> to memref<1x1280x128xf16, strided<[1638400, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%6 = scf.for %arg0 = %c0 to %c20 step %c1 iter_args(%arg1 = %cst) -> (vector<1x64x128xf16>) {
%7 = affine.delinearize_index %arg0 into (%c20) : index
%8 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%7]
%subview_4 = memref.subview %subview[0, 0, %8] [1, %5, 64] [1, 1, 1] : memref<1x?x1280xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x?x64xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%9 = vector.transfer_read %subview_4[%c0, %c0, %c0], %cst_0 {in_bounds = [true, false, true]} : memref<1x?x64xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<1x64x64xf16>
%10 = vector.transfer_read %subview_1[%c0, %8, %c0], %cst_0 {in_bounds = [true, true, true]} : memref<1x1280x128xf16, strided<[1638400, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<1x64x128xf16>
gpu.barrier
%alloc = memref.alloc() : memref<1x64x64xf16, #gpu.address_space<workgroup>>
vector.transfer_write %9, %alloc[%c0, %c0, %c0] {in_bounds = [true, true, true]} : vector<1x64x64xf16>, memref<1x64x64xf16, #gpu.address_space<workgroup>>
gpu.barrier
memref.copy %alloc, %alloc {__internal_linalg_transform__ = "copy_to_workgroup_memory"} : memref<1x64x64xf16, #gpu.address_space<workgroup>> to memref<1x64x64xf16, #gpu.address_space<workgroup>>
gpu.barrier
%alloc_5 = memref.alloc() : memref<1x64x128xf16, #gpu.address_space<workgroup>>
vector.transfer_write %10, %alloc_5[%c0, %c0, %c0] {in_bounds = [true, true, true]} : vector<1x64x128xf16>, memref<1x64x128xf16, #gpu.address_space<workgroup>>
gpu.barrier
memref.copy %alloc_5, %alloc_5 {__internal_linalg_transform__ = "copy_to_workgroup_memory"} : memref<1x64x128xf16, #gpu.address_space<workgroup>> to memref<1x64x128xf16, #gpu.address_space<workgroup>>
gpu.barrier
gpu.barrier
%11 = vector.transfer_read %alloc[%c0, %c0, %c0], %cst_0 {in_bounds = [true, true, true]} : memref<1x64x64xf16, #gpu.address_space<workgroup>>, vector<1x64x64xf16>
%12 = vector.transfer_read %alloc_5[%c0, %c0, %c0], %cst_0 {in_bounds = [true, true, true]} : memref<1x64x128xf16, #gpu.address_space<workgroup>>, vector<1x64x128xf16>
%13 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %11, %12, %arg1 : vector<1x64x64xf16>, vector<1x64x128xf16> into vector<1x64x128xf16>
scf.yield %13 : vector<1x64x128xf16>
}
%subview_2 = memref.subview %2[%workgroup_id_z, %3, %4] [1, %5, 128] [1, 1, 1] : memref<64x968x1280xf16, #hal.descriptor_type<storage_buffer>> to memref<1x?x128xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
vector.transfer_write %6, %subview_2[%c0, %c0, %c0] {in_bounds = [true, false, true]} : vector<1x64x128xf16>, memref<1x?x128xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_3 = memref.subview %2[%workgroup_id_z, %3, %4] [1, %5, 128] [1, 1, 1] : memref<64x968x1280xf16, #hal.descriptor_type<storage_buffer>> to memref<1x?x128xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
memref.copy %subview_2, %subview_3 : memref<1x?x128xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x?x128xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
return
}
}
// -----// IR Dump After ResolveShapedTypeResultDims (resolve-shaped-type-result-dims) //----- //
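// NOTE: this pass resolves tensor.dim/memref.dim ops over op results; none
// appear in this function, so the dump below matches the IR above.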
module {
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16() {
%cst = arith.constant dense<0.000000e+00> : vector<1x64x128xf16>
%c20 = arith.constant 20 : index
%c1 = arith.constant 1 : index
%cst_0 = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<64x968x1280xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 64 : memref<64x968x1280xf16, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<64x1280x1280xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 64 : memref<64x1280x1280xf16, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<64x968x1280xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 64 : memref<64x968x1280xf16, #hal.descriptor_type<storage_buffer>>
%workgroup_id_z = hal.interface.workgroup.id[2] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y]
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%4 = affine.apply affine_map<()[s0] -> (s0 * 128)>()[%workgroup_id_x]
%5 = affine.min affine_map<()[s0] -> (s0 * -64 + 968, 64)>()[%workgroup_id_y]
%subview = memref.subview %0[%workgroup_id_z, %3, 0] [1, %5, 1280] [1, 1, 1] : memref<64x968x1280xf16, #hal.descriptor_type<storage_buffer>> to memref<1x?x1280xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_1 = memref.subview %1[%workgroup_id_z, 0, %4] [1, 1280, 128] [1, 1, 1] : memref<64x1280x1280xf16, #hal.descriptor_type<storage_buffer>> to memref<1x1280x128xf16, strided<[1638400, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%6 = scf.for %arg0 = %c0 to %c20 step %c1 iter_args(%arg1 = %cst) -> (vector<1x64x128xf16>) {
%7 = affine.delinearize_index %arg0 into (%c20) : index
%8 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%7]
%subview_4 = memref.subview %subview[0, 0, %8] [1, %5, 64] [1, 1, 1] : memref<1x?x1280xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x?x64xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%9 = vector.transfer_read %subview_4[%c0, %c0, %c0], %cst_0 {in_bounds = [true, false, true]} : memref<1x?x64xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<1x64x64xf16>
%10 = vector.transfer_read %subview_1[%c0, %8, %c0], %cst_0 {in_bounds = [true, true, true]} : memref<1x1280x128xf16, strided<[1638400, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<1x64x128xf16>
gpu.barrier
%alloc = memref.alloc() : memref<1x64x64xf16, #gpu.address_space<workgroup>>
vector.transfer_write %9, %alloc[%c0, %c0, %c0] {in_bounds = [true, true, true]} : vector<1x64x64xf16>, memref<1x64x64xf16, #gpu.address_space<workgroup>>
gpu.barrier
memref.copy %alloc, %alloc {__internal_linalg_transform__ = "copy_to_workgroup_memory"} : memref<1x64x64xf16, #gpu.address_space<workgroup>> to memref<1x64x64xf16, #gpu.address_space<workgroup>>
gpu.barrier
%alloc_5 = memref.alloc() : memref<1x64x128xf16, #gpu.address_space<workgroup>>
vector.transfer_write %10, %alloc_5[%c0, %c0, %c0] {in_bounds = [true, true, true]} : vector<1x64x128xf16>, memref<1x64x128xf16, #gpu.address_space<workgroup>>
gpu.barrier
memref.copy %alloc_5, %alloc_5 {__internal_linalg_transform__ = "copy_to_workgroup_memory"} : memref<1x64x128xf16, #gpu.address_space<workgroup>> to memref<1x64x128xf16, #gpu.address_space<workgroup>>
gpu.barrier
gpu.barrier
%11 = vector.transfer_read %alloc[%c0, %c0, %c0], %cst_0 {in_bounds = [true, true, true]} : memref<1x64x64xf16, #gpu.address_space<workgroup>>, vector<1x64x64xf16>
%12 = vector.transfer_read %alloc_5[%c0, %c0, %c0], %cst_0 {in_bounds = [true, true, true]} : memref<1x64x128xf16, #gpu.address_space<workgroup>>, vector<1x64x128xf16>
%13 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %11, %12, %arg1 : vector<1x64x64xf16>, vector<1x64x128xf16> into vector<1x64x128xf16>
scf.yield %13 : vector<1x64x128xf16>
}
%subview_2 = memref.subview %2[%workgroup_id_z, %3, %4] [1, %5, 128] [1, 1, 1] : memref<64x968x1280xf16, #hal.descriptor_type<storage_buffer>> to memref<1x?x128xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
vector.transfer_write %6, %subview_2[%c0, %c0, %c0] {in_bounds = [true, false, true]} : vector<1x64x128xf16>, memref<1x?x128xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_3 = memref.subview %2[%workgroup_id_z, %3, %4] [1, %5, 128] [1, 1, 1] : memref<64x968x1280xf16, #hal.descriptor_type<storage_buffer>> to memref<1x?x128xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
memref.copy %subview_2, %subview_3 : memref<1x?x128xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x?x128xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
return
}
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
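// NOTE: canonicalization folds the no-op self-copies on the workgroup
// allocations (memref.copy %alloc, %alloc tagged copy_to_workgroup_memory)
// and merges the back-to-back gpu.barrier ops from the previous dump.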
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16() {
%cst = arith.constant dense<0.000000e+00> : vector<1x64x128xf16>
%c20 = arith.constant 20 : index
%c1 = arith.constant 1 : index
%cst_0 = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<64x968x1280xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 64 : memref<64x968x1280xf16, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<64x1280x1280xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 64 : memref<64x1280x1280xf16, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<64x968x1280xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 64 : memref<64x968x1280xf16, #hal.descriptor_type<storage_buffer>>
%workgroup_id_z = hal.interface.workgroup.id[2] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y]
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%4 = affine.apply affine_map<()[s0] -> (s0 * 128)>()[%workgroup_id_x]
%5 = affine.min affine_map<()[s0] -> (s0 * -64 + 968, 64)>()[%workgroup_id_y]
%subview = memref.subview %0[%workgroup_id_z, %3, 0] [1, %5, 1280] [1, 1, 1] : memref<64x968x1280xf16, #hal.descriptor_type<storage_buffer>> to memref<1x?x1280xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_1 = memref.subview %1[%workgroup_id_z, 0, %4] [1, 1280, 128] [1, 1, 1] : memref<64x1280x1280xf16, #hal.descriptor_type<storage_buffer>> to memref<1x1280x128xf16, strided<[1638400, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%6 = scf.for %arg0 = %c0 to %c20 step %c1 iter_args(%arg1 = %cst) -> (vector<1x64x128xf16>) {
%7 = affine.delinearize_index %arg0 into (%c20) : index
%8 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%7]
%subview_4 = memref.subview %subview[0, 0, %8] [1, %5, 64] [1, 1, 1] : memref<1x?x1280xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x?x64xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%9 = vector.transfer_read %subview_4[%c0, %c0, %c0], %cst_0 {in_bounds = [true, false, true]} : memref<1x?x64xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<1x64x64xf16>
%10 = vector.transfer_read %subview_1[%c0, %8, %c0], %cst_0 {in_bounds = [true, true, true]} : memref<1x1280x128xf16, strided<[1638400, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<1x64x128xf16>
gpu.barrier
%alloc = memref.alloc() : memref<1x64x64xf16, #gpu.address_space<workgroup>>
vector.transfer_write %9, %alloc[%c0, %c0, %c0] {in_bounds = [true, true, true]} : vector<1x64x64xf16>, memref<1x64x64xf16, #gpu.address_space<workgroup>>
gpu.barrier
%alloc_5 = memref.alloc() : memref<1x64x128xf16, #gpu.address_space<workgroup>>
vector.transfer_write %10, %alloc_5[%c0, %c0, %c0] {in_bounds = [true, true, true]} : vector<1x64x128xf16>, memref<1x64x128xf16, #gpu.address_space<workgroup>>
gpu.barrier
%11 = vector.transfer_read %alloc[%c0, %c0, %c0], %cst_0 {in_bounds = [true, true, true]} : memref<1x64x64xf16, #gpu.address_space<workgroup>>, vector<1x64x64xf16>
%12 = vector.transfer_read %alloc_5[%c0, %c0, %c0], %cst_0 {in_bounds = [true, true, true]} : memref<1x64x128xf16, #gpu.address_space<workgroup>>, vector<1x64x128xf16>
%13 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %11, %12, %arg1 : vector<1x64x64xf16>, vector<1x64x128xf16> into vector<1x64x128xf16>
scf.yield %13 : vector<1x64x128xf16>
}
%subview_2 = memref.subview %2[%workgroup_id_z, %3, %4] [1, %5, 128] [1, 1, 1] : memref<64x968x1280xf16, #hal.descriptor_type<storage_buffer>> to memref<1x?x128xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
vector.transfer_write %6, %subview_2[%c0, %c0, %c0] {in_bounds = [true, false, true]} : vector<1x64x128xf16>, memref<1x?x128xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_3 = memref.subview %2[%workgroup_id_z, %3, %4] [1, %5, 128] [1, 1, 1] : memref<64x968x1280xf16, #hal.descriptor_type<storage_buffer>> to memref<1x?x128xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
memref.copy %subview_2, %subview_3 : memref<1x?x128xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x?x128xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
return
}
// -----// IR Dump After CSE (cse) //----- //
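// NOTE: CSE merges the two identical memref.subview ops on the output buffer
// %2, turning the trailing memref.copy into a self-copy that the next
// canonicalize run can fold away.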
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16() {
%cst = arith.constant dense<0.000000e+00> : vector<1x64x128xf16>
%c20 = arith.constant 20 : index
%c1 = arith.constant 1 : index
%cst_0 = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<64x968x1280xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 64 : memref<64x968x1280xf16, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<64x1280x1280xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 64 : memref<64x1280x1280xf16, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<64x968x1280xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 64 : memref<64x968x1280xf16, #hal.descriptor_type<storage_buffer>>
%workgroup_id_z = hal.interface.workgroup.id[2] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y]
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%4 = affine.apply affine_map<()[s0] -> (s0 * 128)>()[%workgroup_id_x]
%5 = affine.min affine_map<()[s0] -> (s0 * -64 + 968, 64)>()[%workgroup_id_y]
%subview = memref.subview %0[%workgroup_id_z, %3, 0] [1, %5, 1280] [1, 1, 1] : memref<64x968x1280xf16, #hal.descriptor_type<storage_buffer>> to memref<1x?x1280xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_1 = memref.subview %1[%workgroup_id_z, 0, %4] [1, 1280, 128] [1, 1, 1] : memref<64x1280x1280xf16, #hal.descriptor_type<storage_buffer>> to memref<1x1280x128xf16, strided<[1638400, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%6 = scf.for %arg0 = %c0 to %c20 step %c1 iter_args(%arg1 = %cst) -> (vector<1x64x128xf16>) {
%7 = affine.delinearize_index %arg0 into (%c20) : index
%8 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%7]
%subview_3 = memref.subview %subview[0, 0, %8] [1, %5, 64] [1, 1, 1] : memref<1x?x1280xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x?x64xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%9 = vector.transfer_read %subview_3[%c0, %c0, %c0], %cst_0 {in_bounds = [true, false, true]} : memref<1x?x64xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<1x64x64xf16>
%10 = vector.transfer_read %subview_1[%c0, %8, %c0], %cst_0 {in_bounds = [true, true, true]} : memref<1x1280x128xf16, strided<[1638400, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<1x64x128xf16>
gpu.barrier
%alloc = memref.alloc() : memref<1x64x64xf16, #gpu.address_space<workgroup>>
vector.transfer_write %9, %alloc[%c0, %c0, %c0] {in_bounds = [true, true, true]} : vector<1x64x64xf16>, memref<1x64x64xf16, #gpu.address_space<workgroup>>
gpu.barrier
%alloc_4 = memref.alloc() : memref<1x64x128xf16, #gpu.address_space<workgroup>>
vector.transfer_write %10, %alloc_4[%c0, %c0, %c0] {in_bounds = [true, true, true]} : vector<1x64x128xf16>, memref<1x64x128xf16, #gpu.address_space<workgroup>>
gpu.barrier
%11 = vector.transfer_read %alloc[%c0, %c0, %c0], %cst_0 {in_bounds = [true, true, true]} : memref<1x64x64xf16, #gpu.address_space<workgroup>>, vector<1x64x64xf16>
%12 = vector.transfer_read %alloc_4[%c0, %c0, %c0], %cst_0 {in_bounds = [true, true, true]} : memref<1x64x128xf16, #gpu.address_space<workgroup>>, vector<1x64x128xf16>
%13 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %11, %12, %arg1 : vector<1x64x64xf16>, vector<1x64x128xf16> into vector<1x64x128xf16>
scf.yield %13 : vector<1x64x128xf16>
}
%subview_2 = memref.subview %2[%workgroup_id_z, %3, %4] [1, %5, 128] [1, 1, 1] : memref<64x968x1280xf16, #hal.descriptor_type<storage_buffer>> to memref<1x?x128xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
vector.transfer_write %6, %subview_2[%c0, %c0, %c0] {in_bounds = [true, false, true]} : vector<1x64x128xf16>, memref<1x?x128xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
memref.copy %subview_2, %subview_2 : memref<1x?x128xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x?x128xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
return
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
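// NOTE: this canonicalize run drops the self-copy left behind by CSE; only
// the final vector.transfer_write to the output subview remains.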
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16() {
%cst = arith.constant dense<0.000000e+00> : vector<1x64x128xf16>
%c20 = arith.constant 20 : index
%c1 = arith.constant 1 : index
%cst_0 = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<64x968x1280xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 64 : memref<64x968x1280xf16, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<64x1280x1280xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 64 : memref<64x1280x1280xf16, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<64x968x1280xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 64 : memref<64x968x1280xf16, #hal.descriptor_type<storage_buffer>>
%workgroup_id_z = hal.interface.workgroup.id[2] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y]
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%4 = affine.apply affine_map<()[s0] -> (s0 * 128)>()[%workgroup_id_x]
%5 = affine.min affine_map<()[s0] -> (s0 * -64 + 968, 64)>()[%workgroup_id_y]
%subview = memref.subview %0[%workgroup_id_z, %3, 0] [1, %5, 1280] [1, 1, 1] : memref<64x968x1280xf16, #hal.descriptor_type<storage_buffer>> to memref<1x?x1280xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_1 = memref.subview %1[%workgroup_id_z, 0, %4] [1, 1280, 128] [1, 1, 1] : memref<64x1280x1280xf16, #hal.descriptor_type<storage_buffer>> to memref<1x1280x128xf16, strided<[1638400, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%6 = scf.for %arg0 = %c0 to %c20 step %c1 iter_args(%arg1 = %cst) -> (vector<1x64x128xf16>) {
%7 = affine.delinearize_index %arg0 into (%c20) : index
%8 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%7]
%subview_3 = memref.subview %subview[0, 0, %8] [1, %5, 64] [1, 1, 1] : memref<1x?x1280xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x?x64xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%9 = vector.transfer_read %subview_3[%c0, %c0, %c0], %cst_0 {in_bounds = [true, false, true]} : memref<1x?x64xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<1x64x64xf16>
%10 = vector.transfer_read %subview_1[%c0, %8, %c0], %cst_0 {in_bounds = [true, true, true]} : memref<1x1280x128xf16, strided<[1638400, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<1x64x128xf16>
gpu.barrier
%alloc = memref.alloc() : memref<1x64x64xf16, #gpu.address_space<workgroup>>
vector.transfer_write %9, %alloc[%c0, %c0, %c0] {in_bounds = [true, true, true]} : vector<1x64x64xf16>, memref<1x64x64xf16, #gpu.address_space<workgroup>>
gpu.barrier
%alloc_4 = memref.alloc() : memref<1x64x128xf16, #gpu.address_space<workgroup>>
vector.transfer_write %10, %alloc_4[%c0, %c0, %c0] {in_bounds = [true, true, true]} : vector<1x64x128xf16>, memref<1x64x128xf16, #gpu.address_space<workgroup>>
gpu.barrier
%11 = vector.transfer_read %alloc[%c0, %c0, %c0], %cst_0 {in_bounds = [true, true, true]} : memref<1x64x64xf16, #gpu.address_space<workgroup>>, vector<1x64x64xf16>
%12 = vector.transfer_read %alloc_4[%c0, %c0, %c0], %cst_0 {in_bounds = [true, true, true]} : memref<1x64x128xf16, #gpu.address_space<workgroup>>, vector<1x64x128xf16>
%13 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %11, %12, %arg1 : vector<1x64x64xf16>, vector<1x64x128xf16> into vector<1x64x128xf16>
scf.yield %13 : vector<1x64x128xf16>
}
%subview_2 = memref.subview %2[%workgroup_id_z, %3, %4] [1, %5, 128] [1, 1, 1] : memref<64x968x1280xf16, #hal.descriptor_type<storage_buffer>> to memref<1x?x128xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
vector.transfer_write %6, %subview_2[%c0, %c0, %c0] {in_bounds = [true, false, true]} : vector<1x64x128xf16>, memref<1x?x128xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
return
}
// -----// IR Dump After CleanupBufferAllocView (iree-codegen-cleanup-buffer-alloc-view) //----- //
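// NOTE: no redundant buffer allocation/view pairs are left to clean up, so
// the dump below is unchanged.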
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16() {
%cst = arith.constant dense<0.000000e+00> : vector<1x64x128xf16>
%c20 = arith.constant 20 : index
%c1 = arith.constant 1 : index
%cst_0 = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<64x968x1280xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 64 : memref<64x968x1280xf16, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<64x1280x1280xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 64 : memref<64x1280x1280xf16, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<64x968x1280xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 64 : memref<64x968x1280xf16, #hal.descriptor_type<storage_buffer>>
%workgroup_id_z = hal.interface.workgroup.id[2] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y]
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%4 = affine.apply affine_map<()[s0] -> (s0 * 128)>()[%workgroup_id_x]
%5 = affine.min affine_map<()[s0] -> (s0 * -64 + 968, 64)>()[%workgroup_id_y]
%subview = memref.subview %0[%workgroup_id_z, %3, 0] [1, %5, 1280] [1, 1, 1] : memref<64x968x1280xf16, #hal.descriptor_type<storage_buffer>> to memref<1x?x1280xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_1 = memref.subview %1[%workgroup_id_z, 0, %4] [1, 1280, 128] [1, 1, 1] : memref<64x1280x1280xf16, #hal.descriptor_type<storage_buffer>> to memref<1x1280x128xf16, strided<[1638400, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%6 = scf.for %arg0 = %c0 to %c20 step %c1 iter_args(%arg1 = %cst) -> (vector<1x64x128xf16>) {
%7 = affine.delinearize_index %arg0 into (%c20) : index
%8 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%7]
%subview_3 = memref.subview %subview[0, 0, %8] [1, %5, 64] [1, 1, 1] : memref<1x?x1280xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x?x64xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%9 = vector.transfer_read %subview_3[%c0, %c0, %c0], %cst_0 {in_bounds = [true, false, true]} : memref<1x?x64xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<1x64x64xf16>
%10 = vector.transfer_read %subview_1[%c0, %8, %c0], %cst_0 {in_bounds = [true, true, true]} : memref<1x1280x128xf16, strided<[1638400, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<1x64x128xf16>
gpu.barrier
%alloc = memref.alloc() : memref<1x64x64xf16, #gpu.address_space<workgroup>>
vector.transfer_write %9, %alloc[%c0, %c0, %c0] {in_bounds = [true, true, true]} : vector<1x64x64xf16>, memref<1x64x64xf16, #gpu.address_space<workgroup>>
gpu.barrier
%alloc_4 = memref.alloc() : memref<1x64x128xf16, #gpu.address_space<workgroup>>
vector.transfer_write %10, %alloc_4[%c0, %c0, %c0] {in_bounds = [true, true, true]} : vector<1x64x128xf16>, memref<1x64x128xf16, #gpu.address_space<workgroup>>
gpu.barrier
%11 = vector.transfer_read %alloc[%c0, %c0, %c0], %cst_0 {in_bounds = [true, true, true]} : memref<1x64x64xf16, #gpu.address_space<workgroup>>, vector<1x64x64xf16>
%12 = vector.transfer_read %alloc_4[%c0, %c0, %c0], %cst_0 {in_bounds = [true, true, true]} : memref<1x64x128xf16, #gpu.address_space<workgroup>>, vector<1x64x128xf16>
%13 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %11, %12, %arg1 : vector<1x64x64xf16>, vector<1x64x128xf16> into vector<1x64x128xf16>
scf.yield %13 : vector<1x64x128xf16>
}
%subview_2 = memref.subview %2[%workgroup_id_z, %3, %4] [1, %5, 128] [1, 1, 1] : memref<64x968x1280xf16, #hal.descriptor_type<storage_buffer>> to memref<1x?x128xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
vector.transfer_write %6, %subview_2[%c0, %c0, %c0] {in_bounds = [true, false, true]} : vector<1x64x128xf16>, memref<1x?x128xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
return
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
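// NOTE: the IR has reached a fixed point for these cleanup passes; this
// canonicalize run and the canonicalize/cse runs that follow make no further
// changes.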
module {
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16() {
%cst = arith.constant dense<0.000000e+00> : vector<1x64x128xf16>
%c20 = arith.constant 20 : index
%c1 = arith.constant 1 : index
%cst_0 = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<64x968x1280xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 64 : memref<64x968x1280xf16, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<64x1280x1280xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 64 : memref<64x1280x1280xf16, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<64x968x1280xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 64 : memref<64x968x1280xf16, #hal.descriptor_type<storage_buffer>>
%workgroup_id_z = hal.interface.workgroup.id[2] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y]
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%4 = affine.apply affine_map<()[s0] -> (s0 * 128)>()[%workgroup_id_x]
%5 = affine.min affine_map<()[s0] -> (s0 * -64 + 968, 64)>()[%workgroup_id_y]
%subview = memref.subview %0[%workgroup_id_z, %3, 0] [1, %5, 1280] [1, 1, 1] : memref<64x968x1280xf16, #hal.descriptor_type<storage_buffer>> to memref<1x?x1280xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_1 = memref.subview %1[%workgroup_id_z, 0, %4] [1, 1280, 128] [1, 1, 1] : memref<64x1280x1280xf16, #hal.descriptor_type<storage_buffer>> to memref<1x1280x128xf16, strided<[1638400, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%6 = scf.for %arg0 = %c0 to %c20 step %c1 iter_args(%arg1 = %cst) -> (vector<1x64x128xf16>) {
%7 = affine.delinearize_index %arg0 into (%c20) : index
%8 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%7]
%subview_3 = memref.subview %subview[0, 0, %8] [1, %5, 64] [1, 1, 1] : memref<1x?x1280xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x?x64xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%9 = vector.transfer_read %subview_3[%c0, %c0, %c0], %cst_0 {in_bounds = [true, false, true]} : memref<1x?x64xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<1x64x64xf16>
%10 = vector.transfer_read %subview_1[%c0, %8, %c0], %cst_0 {in_bounds = [true, true, true]} : memref<1x1280x128xf16, strided<[1638400, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<1x64x128xf16>
gpu.barrier
%alloc = memref.alloc() : memref<1x64x64xf16, #gpu.address_space<workgroup>>
vector.transfer_write %9, %alloc[%c0, %c0, %c0] {in_bounds = [true, true, true]} : vector<1x64x64xf16>, memref<1x64x64xf16, #gpu.address_space<workgroup>>
gpu.barrier
%alloc_4 = memref.alloc() : memref<1x64x128xf16, #gpu.address_space<workgroup>>
vector.transfer_write %10, %alloc_4[%c0, %c0, %c0] {in_bounds = [true, true, true]} : vector<1x64x128xf16>, memref<1x64x128xf16, #gpu.address_space<workgroup>>
gpu.barrier
%11 = vector.transfer_read %alloc[%c0, %c0, %c0], %cst_0 {in_bounds = [true, true, true]} : memref<1x64x64xf16, #gpu.address_space<workgroup>>, vector<1x64x64xf16>
%12 = vector.transfer_read %alloc_4[%c0, %c0, %c0], %cst_0 {in_bounds = [true, true, true]} : memref<1x64x128xf16, #gpu.address_space<workgroup>>, vector<1x64x128xf16>
%13 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %11, %12, %arg1 : vector<1x64x64xf16>, vector<1x64x128xf16> into vector<1x64x128xf16>
scf.yield %13 : vector<1x64x128xf16>
}
%subview_2 = memref.subview %2[%workgroup_id_z, %3, %4] [1, %5, 128] [1, 1, 1] : memref<64x968x1280xf16, #hal.descriptor_type<storage_buffer>> to memref<1x?x128xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
vector.transfer_write %6, %subview_2[%c0, %c0, %c0] {in_bounds = [true, false, true]} : vector<1x64x128xf16>, memref<1x?x128xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
return
}
}
// -----// IR Dump After CSE (cse) //----- //
module {
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16() {
%cst = arith.constant dense<0.000000e+00> : vector<1x64x128xf16>
%c20 = arith.constant 20 : index
%c1 = arith.constant 1 : index
%cst_0 = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<64x968x1280xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 64 : memref<64x968x1280xf16, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<64x1280x1280xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 64 : memref<64x1280x1280xf16, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<64x968x1280xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 64 : memref<64x968x1280xf16, #hal.descriptor_type<storage_buffer>>
%workgroup_id_z = hal.interface.workgroup.id[2] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y]
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%4 = affine.apply affine_map<()[s0] -> (s0 * 128)>()[%workgroup_id_x]
%5 = affine.min affine_map<()[s0] -> (s0 * -64 + 968, 64)>()[%workgroup_id_y]
%subview = memref.subview %0[%workgroup_id_z, %3, 0] [1, %5, 1280] [1, 1, 1] : memref<64x968x1280xf16, #hal.descriptor_type<storage_buffer>> to memref<1x?x1280xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_1 = memref.subview %1[%workgroup_id_z, 0, %4] [1, 1280, 128] [1, 1, 1] : memref<64x1280x1280xf16, #hal.descriptor_type<storage_buffer>> to memref<1x1280x128xf16, strided<[1638400, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%6 = scf.for %arg0 = %c0 to %c20 step %c1 iter_args(%arg1 = %cst) -> (vector<1x64x128xf16>) {
%7 = affine.delinearize_index %arg0 into (%c20) : index
%8 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%7]
%subview_3 = memref.subview %subview[0, 0, %8] [1, %5, 64] [1, 1, 1] : memref<1x?x1280xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x?x64xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%9 = vector.transfer_read %subview_3[%c0, %c0, %c0], %cst_0 {in_bounds = [true, false, true]} : memref<1x?x64xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<1x64x64xf16>
%10 = vector.transfer_read %subview_1[%c0, %8, %c0], %cst_0 {in_bounds = [true, true, true]} : memref<1x1280x128xf16, strided<[1638400, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<1x64x128xf16>
gpu.barrier
%alloc = memref.alloc() : memref<1x64x64xf16, #gpu.address_space<workgroup>>
vector.transfer_write %9, %alloc[%c0, %c0, %c0] {in_bounds = [true, true, true]} : vector<1x64x64xf16>, memref<1x64x64xf16, #gpu.address_space<workgroup>>
gpu.barrier
%alloc_4 = memref.alloc() : memref<1x64x128xf16, #gpu.address_space<workgroup>>
vector.transfer_write %10, %alloc_4[%c0, %c0, %c0] {in_bounds = [true, true, true]} : vector<1x64x128xf16>, memref<1x64x128xf16, #gpu.address_space<workgroup>>
gpu.barrier
%11 = vector.transfer_read %alloc[%c0, %c0, %c0], %cst_0 {in_bounds = [true, true, true]} : memref<1x64x64xf16, #gpu.address_space<workgroup>>, vector<1x64x64xf16>
%12 = vector.transfer_read %alloc_4[%c0, %c0, %c0], %cst_0 {in_bounds = [true, true, true]} : memref<1x64x128xf16, #gpu.address_space<workgroup>>, vector<1x64x128xf16>
%13 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %11, %12, %arg1 : vector<1x64x64xf16>, vector<1x64x128xf16> into vector<1x64x128xf16>
scf.yield %13 : vector<1x64x128xf16>
}
%subview_2 = memref.subview %2[%workgroup_id_z, %3, %4] [1, %5, 128] [1, 1, 1] : memref<64x968x1280xf16, #hal.descriptor_type<storage_buffer>> to memref<1x?x128xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
vector.transfer_write %6, %subview_2[%c0, %c0, %c0] {in_bounds = [true, false, true]} : vector<1x64x128xf16>, memref<1x?x128xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
return
}
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
// (IR unchanged; identical to the Canonicalizer dump above.)
// -----// IR Dump After CSE (cse) //----- //
// (IR unchanged; identical to the CSE dump above.)
// -----// IR Dump After HoistStaticallyBoundAllocations (iree-hoist-statically-bound-allocations) //----- //
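// NOTE: the two workgroup allocations have statically known sizes, so they
// are hoisted out of the scf.for loop to the function entry, with matching
// memref.dealloc ops inserted before the return.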
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16() {
%alloc = memref.alloc() : memref<1x64x128xf16, #gpu.address_space<workgroup>>
%alloc_0 = memref.alloc() : memref<1x64x64xf16, #gpu.address_space<workgroup>>
%cst = arith.constant dense<0.000000e+00> : vector<1x64x128xf16>
%c20 = arith.constant 20 : index
%c1 = arith.constant 1 : index
%cst_1 = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<64x968x1280xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 64 : memref<64x968x1280xf16, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<64x1280x1280xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 64 : memref<64x1280x1280xf16, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<64x968x1280xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 64 : memref<64x968x1280xf16, #hal.descriptor_type<storage_buffer>>
%workgroup_id_z = hal.interface.workgroup.id[2] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y]
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%4 = affine.apply affine_map<()[s0] -> (s0 * 128)>()[%workgroup_id_x]
%5 = affine.min affine_map<()[s0] -> (s0 * -64 + 968, 64)>()[%workgroup_id_y]
%subview = memref.subview %0[%workgroup_id_z, %3, 0] [1, %5, 1280] [1, 1, 1] : memref<64x968x1280xf16, #hal.descriptor_type<storage_buffer>> to memref<1x?x1280xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_2 = memref.subview %1[%workgroup_id_z, 0, %4] [1, 1280, 128] [1, 1, 1] : memref<64x1280x1280xf16, #hal.descriptor_type<storage_buffer>> to memref<1x1280x128xf16, strided<[1638400, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%6 = scf.for %arg0 = %c0 to %c20 step %c1 iter_args(%arg1 = %cst) -> (vector<1x64x128xf16>) {
%7 = affine.delinearize_index %arg0 into (%c20) : index
%8 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%7]
%subview_4 = memref.subview %subview[0, 0, %8] [1, %5, 64] [1, 1, 1] : memref<1x?x1280xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x?x64xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%9 = vector.transfer_read %subview_4[%c0, %c0, %c0], %cst_1 {in_bounds = [true, false, true]} : memref<1x?x64xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<1x64x64xf16>
%10 = vector.transfer_read %subview_2[%c0, %8, %c0], %cst_1 {in_bounds = [true, true, true]} : memref<1x1280x128xf16, strided<[1638400, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<1x64x128xf16>
gpu.barrier
vector.transfer_write %9, %alloc_0[%c0, %c0, %c0] {in_bounds = [true, true, true]} : vector<1x64x64xf16>, memref<1x64x64xf16, #gpu.address_space<workgroup>>
gpu.barrier
vector.transfer_write %10, %alloc[%c0, %c0, %c0] {in_bounds = [true, true, true]} : vector<1x64x128xf16>, memref<1x64x128xf16, #gpu.address_space<workgroup>>
gpu.barrier
%11 = vector.transfer_read %alloc_0[%c0, %c0, %c0], %cst_1 {in_bounds = [true, true, true]} : memref<1x64x64xf16, #gpu.address_space<workgroup>>, vector<1x64x64xf16>
%12 = vector.transfer_read %alloc[%c0, %c0, %c0], %cst_1 {in_bounds = [true, true, true]} : memref<1x64x128xf16, #gpu.address_space<workgroup>>, vector<1x64x128xf16>
%13 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %11, %12, %arg1 : vector<1x64x64xf16>, vector<1x64x128xf16> into vector<1x64x128xf16>
scf.yield %13 : vector<1x64x128xf16>
}
%subview_3 = memref.subview %2[%workgroup_id_z, %3, %4] [1, %5, 128] [1, 1, 1] : memref<64x968x1280xf16, #hal.descriptor_type<storage_buffer>> to memref<1x?x128xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
vector.transfer_write %6, %subview_3[%c0, %c0, %c0] {in_bounds = [true, false, true]} : vector<1x64x128xf16>, memref<1x?x128xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
memref.dealloc %alloc_0 : memref<1x64x64xf16, #gpu.address_space<workgroup>>
memref.dealloc %alloc : memref<1x64x128xf16, #gpu.address_space<workgroup>>
return
}
// -----// IR Dump After LLVMGPUNormalizeContractMaps (iree-llvmgpu-normalize-contract-maps) //----- //
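// NOTE: the vector.contract indexing maps already appear to be in the
// normalized batched-matmul form this pass targets, so the dump below is
// unchanged.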
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16() {
%alloc = memref.alloc() : memref<1x64x128xf16, #gpu.address_space<workgroup>>
%alloc_0 = memref.alloc() : memref<1x64x64xf16, #gpu.address_space<workgroup>>
%cst = arith.constant dense<0.000000e+00> : vector<1x64x128xf16>
%c20 = arith.constant 20 : index
%c1 = arith.constant 1 : index
%cst_1 = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<64x968x1280xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 64 : memref<64x968x1280xf16, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<64x1280x1280xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 64 : memref<64x1280x1280xf16, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<64x968x1280xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 64 : memref<64x968x1280xf16, #hal.descriptor_type<storage_buffer>>
%workgroup_id_z = hal.interface.workgroup.id[2] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y]
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%4 = affine.apply affine_map<()[s0] -> (s0 * 128)>()[%workgroup_id_x]
%5 = affine.min affine_map<()[s0] -> (s0 * -64 + 968, 64)>()[%workgroup_id_y]
%subview = memref.subview %0[%workgroup_id_z, %3, 0] [1, %5, 1280] [1, 1, 1] : memref<64x968x1280xf16, #hal.descriptor_type<storage_buffer>> to memref<1x?x1280xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_2 = memref.subview %1[%workgroup_id_z, 0, %4] [1, 1280, 128] [1, 1, 1] : memref<64x1280x1280xf16, #hal.descriptor_type<storage_buffer>> to memref<1x1280x128xf16, strided<[1638400, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%6 = scf.for %arg0 = %c0 to %c20 step %c1 iter_args(%arg1 = %cst) -> (vector<1x64x128xf16>) {
%7 = affine.delinearize_index %arg0 into (%c20) : index
%8 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%7]
%subview_4 = memref.subview %subview[0, 0, %8] [1, %5, 64] [1, 1, 1] : memref<1x?x1280xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x?x64xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%9 = vector.transfer_read %subview_4[%c0, %c0, %c0], %cst_1 {in_bounds = [true, false, true]} : memref<1x?x64xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<1x64x64xf16>
%10 = vector.transfer_read %subview_2[%c0, %8, %c0], %cst_1 {in_bounds = [true, true, true]} : memref<1x1280x128xf16, strided<[1638400, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<1x64x128xf16>
gpu.barrier
vector.transfer_write %9, %alloc_0[%c0, %c0, %c0] {in_bounds = [true, true, true]} : vector<1x64x64xf16>, memref<1x64x64xf16, #gpu.address_space<workgroup>>
gpu.barrier
vector.transfer_write %10, %alloc[%c0, %c0, %c0] {in_bounds = [true, true, true]} : vector<1x64x128xf16>, memref<1x64x128xf16, #gpu.address_space<workgroup>>
gpu.barrier
%11 = vector.transfer_read %alloc_0[%c0, %c0, %c0], %cst_1 {in_bounds = [true, true, true]} : memref<1x64x64xf16, #gpu.address_space<workgroup>>, vector<1x64x64xf16>
%12 = vector.transfer_read %alloc[%c0, %c0, %c0], %cst_1 {in_bounds = [true, true, true]} : memref<1x64x128xf16, #gpu.address_space<workgroup>>, vector<1x64x128xf16>
%13 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %11, %12, %arg1 : vector<1x64x64xf16>, vector<1x64x128xf16> into vector<1x64x128xf16>
scf.yield %13 : vector<1x64x128xf16>
}
%subview_3 = memref.subview %2[%workgroup_id_z, %3, %4] [1, %5, 128] [1, 1, 1] : memref<64x968x1280xf16, #hal.descriptor_type<storage_buffer>> to memref<1x?x128xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
vector.transfer_write %6, %subview_3[%c0, %c0, %c0] {in_bounds = [true, false, true]} : vector<1x64x128xf16>, memref<1x?x128xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
memref.dealloc %alloc_0 : memref<1x64x64xf16, #gpu.address_space<workgroup>>
memref.dealloc %alloc : memref<1x64x128xf16, #gpu.address_space<workgroup>>
return
}
// -----// IR Dump After LLVMGPUCastTypeToFitMMA (iree-llvmgpu-cast-type-to-fit-mma) //----- //
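// NOTE: to fit the MFMA intrinsics, which accumulate in f32, the f16 loop
// accumulator is widened with arith.extf before the vector.contract and
// narrowed back with arith.truncf after it; the contraction itself now yields
// vector<1x64x128xf32>.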
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16() {
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f16
%c1 = arith.constant 1 : index
%c20 = arith.constant 20 : index
%cst_0 = arith.constant dense<0.000000e+00> : vector<1x64x128xf16>
%alloc = memref.alloc() : memref<1x64x128xf16, #gpu.address_space<workgroup>>
%alloc_1 = memref.alloc() : memref<1x64x64xf16, #gpu.address_space<workgroup>>
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<64x968x1280xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 64 : memref<64x968x1280xf16, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<64x1280x1280xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 64 : memref<64x1280x1280xf16, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<64x968x1280xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 64 : memref<64x968x1280xf16, #hal.descriptor_type<storage_buffer>>
%workgroup_id_z = hal.interface.workgroup.id[2] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y]
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%4 = affine.apply affine_map<()[s0] -> (s0 * 128)>()[%workgroup_id_x]
%5 = affine.min affine_map<()[s0] -> (s0 * -64 + 968, 64)>()[%workgroup_id_y]
%subview = memref.subview %0[%workgroup_id_z, %3, 0] [1, %5, 1280] [1, 1, 1] : memref<64x968x1280xf16, #hal.descriptor_type<storage_buffer>> to memref<1x?x1280xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_2 = memref.subview %1[%workgroup_id_z, 0, %4] [1, 1280, 128] [1, 1, 1] : memref<64x1280x1280xf16, #hal.descriptor_type<storage_buffer>> to memref<1x1280x128xf16, strided<[1638400, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%6 = scf.for %arg0 = %c0 to %c20 step %c1 iter_args(%arg1 = %cst_0) -> (vector<1x64x128xf16>) {
%7 = affine.delinearize_index %arg0 into (%c20) : index
%8 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%7]
%subview_4 = memref.subview %subview[0, 0, %8] [1, %5, 64] [1, 1, 1] : memref<1x?x1280xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x?x64xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%9 = vector.transfer_read %subview_4[%c0, %c0, %c0], %cst {in_bounds = [true, false, true]} : memref<1x?x64xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<1x64x64xf16>
%10 = vector.transfer_read %subview_2[%c0, %8, %c0], %cst {in_bounds = [true, true, true]} : memref<1x1280x128xf16, strided<[1638400, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<1x64x128xf16>
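// annotation: the two tiles just loaded from global memory are staged through
// workgroup memory (%alloc_1 for the 1x64x64 LHS K-slice, %alloc for the
// 1x64x128 RHS slice); the barriers below separate the LDS writes from the
// rereads that feed the contraction.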
gpu.barrier
vector.transfer_write %9, %alloc_1[%c0, %c0, %c0] {in_bounds = [true, true, true]} : vector<1x64x64xf16>, memref<1x64x64xf16, #gpu.address_space<workgroup>>
gpu.barrier
vector.transfer_write %10, %alloc[%c0, %c0, %c0] {in_bounds = [true, true, true]} : vector<1x64x128xf16>, memref<1x64x128xf16, #gpu.address_space<workgroup>>
gpu.barrier
%11 = vector.transfer_read %alloc_1[%c0, %c0, %c0], %cst {in_bounds = [true, true, true]} : memref<1x64x64xf16, #gpu.address_space<workgroup>>, vector<1x64x64xf16>
%12 = vector.transfer_read %alloc[%c0, %c0, %c0], %cst {in_bounds = [true, true, true]} : memref<1x64x128xf16, #gpu.address_space<workgroup>>, vector<1x64x128xf16>
%13 = arith.extf %arg1 : vector<1x64x128xf16> to vector<1x64x128xf32>
%14 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %11, %12, %13 : vector<1x64x64xf16>, vector<1x64x128xf16> into vector<1x64x128xf32>
%15 = arith.truncf %14 : vector<1x64x128xf32> to vector<1x64x128xf16>
scf.yield %15 : vector<1x64x128xf16>
}
%subview_3 = memref.subview %2[%workgroup_id_z, %3, %4] [1, %5, 128] [1, 1, 1] : memref<64x968x1280xf16, #hal.descriptor_type<storage_buffer>> to memref<1x?x128xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
vector.transfer_write %6, %subview_3[%c0, %c0, %c0] {in_bounds = [true, false, true]} : vector<1x64x128xf16>, memref<1x?x128xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
memref.dealloc %alloc_1 : memref<1x64x64xf16, #gpu.address_space<workgroup>>
memref.dealloc %alloc : memref<1x64x128xf16, #gpu.address_space<workgroup>>
return
}
// -----// IR Dump After LLVMGPUVectorDistribute (iree-llvmgpu-vector-distribute) //----- //
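// annotation: this pass distributes the workgroup-level vectors across threads
// according to the nested_layout recorded on the scf.for below; the single
// vector.contract is expanded into per-thread amdgpu.mfma chains operating on
// vector<4xf16> operands with vector<4xf32> accumulators.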
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16() {
%cst = arith.constant dense<0.000000e+00> : vector<1x2x4x1x1x1x1x1x4xf32>
%cst_0 = arith.constant dense<0.000000e+00> : vector<1x4x4x1x1x1x1x1x4xf16>
%c4 = arith.constant 4 : index
%c2 = arith.constant 2 : index
%c16 = arith.constant 16 : index
%c8 = arith.constant 8 : index
%c32 = arith.constant 32 : index
%cst_1 = arith.constant dense<0.000000e+00> : vector<1x2x4x1x1x1x1x1x4xf16>
%c20 = arith.constant 20 : index
%c1 = arith.constant 1 : index
%cst_2 = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%thread_id_x = gpu.thread_id x
%thread_id_y = gpu.thread_id y
%thread_id_z = gpu.thread_id z
%0 = affine.apply affine_map<()[s0, s1, s2] -> (s0 + s1 * 128 + s2 * 256)>()[%thread_id_x, %thread_id_y, %thread_id_z]
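// annotation: %0 is the flattened thread id; the 128/256 multipliers are
// consistent with a 128x2x1 workgroup (256 threads), which the layouts below
// treat as four wave64 subgroups (subgroup_basis = [1, 2, 2, 1], thread_basis
// = [1, 4, 16], and 4 * 64 = 256).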
%alloc = memref.alloc() : memref<1x64x128xf16, #gpu.address_space<workgroup>>
%alloc_3 = memref.alloc() : memref<1x64x64xf16, #gpu.address_space<workgroup>>
%1 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<64x968x1280xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 64 : memref<64x968x1280xf16, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<64x1280x1280xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 64 : memref<64x1280x1280xf16, #hal.descriptor_type<storage_buffer>>
%3 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<64x968x1280xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %3, 64 : memref<64x968x1280xf16, #hal.descriptor_type<storage_buffer>>
%workgroup_id_z = hal.interface.workgroup.id[2] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y]
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%5 = affine.apply affine_map<()[s0] -> (s0 * 128)>()[%workgroup_id_x]
%6 = affine.min affine_map<()[s0] -> (s0 * -64 + 968, 64)>()[%workgroup_id_y]
%subview = memref.subview %1[%workgroup_id_z, %4, 0] [1, %6, 1280] [1, 1, 1] : memref<64x968x1280xf16, #hal.descriptor_type<storage_buffer>> to memref<1x?x1280xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_4 = memref.subview %2[%workgroup_id_z, 0, %5] [1, 1280, 128] [1, 1, 1] : memref<64x1280x1280xf16, #hal.descriptor_type<storage_buffer>> to memref<1x1280x128xf16, strided<[1638400, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%7 = scf.for %arg0 = %c0 to %c20 step %c1 iter_args(%arg1 = %cst_1) -> (vector<1x2x4x1x1x1x1x1x4xf16>) {
%41 = affine.delinearize_index %arg0 into (%c20) : index
%42 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%41]
%subview_6 = memref.subview %subview[0, 0, %42] [1, %6, 64] [1, 1, 1] : memref<1x?x1280xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x?x64xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%43:6 = affine.delinearize_index %0 into (%c1, %c1, %c1, %c1, %c32, %c8) : index, index, index, index, index, index
%44 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%43#5]
%45 = vector.transfer_read %subview_6[%c0, %43#4, %44], %cst_2 {in_bounds = [true, false, true]} : memref<1x?x64xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<1x1x8xf16>
%46 = affine.apply affine_map<()[s0] -> (s0 + 32)>()[%43#4]
%47 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%43#5]
%48 = vector.transfer_read %subview_6[%c0, %46, %47], %cst_2 {in_bounds = [true, false, true]} : memref<1x?x64xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<1x1x8xf16>
%49:6 = affine.delinearize_index %0 into (%c1, %c1, %c1, %c1, %c16, %c16) : index, index, index, index, index, index
%50 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 64)>()[%49#4, %41]
%51 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%49#5]
%52 = vector.transfer_read %subview_4[%c0, %50, %51], %cst_2 {in_bounds = [true, true, true]} : memref<1x1280x128xf16, strided<[1638400, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<1x1x8xf16>
%53 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 64 + 16)>()[%49#4, %41]
%54 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%49#5]
%55 = vector.transfer_read %subview_4[%c0, %53, %54], %cst_2 {in_bounds = [true, true, true]} : memref<1x1280x128xf16, strided<[1638400, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<1x1x8xf16>
%56 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 64 + 32)>()[%49#4, %41]
%57 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%49#5]
%58 = vector.transfer_read %subview_4[%c0, %56, %57], %cst_2 {in_bounds = [true, true, true]} : memref<1x1280x128xf16, strided<[1638400, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<1x1x8xf16>
%59 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 64 + 48)>()[%49#4, %41]
%60 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%49#5]
%61 = vector.transfer_read %subview_4[%c0, %59, %60], %cst_2 {in_bounds = [true, true, true]} : memref<1x1280x128xf16, strided<[1638400, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<1x1x8xf16>
gpu.barrier
%62:6 = affine.delinearize_index %0 into (%c1, %c1, %c1, %c1, %c32, %c8) : index, index, index, index, index, index
%63 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%62#5]
vector.transfer_write %45, %alloc_3[%c0, %62#4, %63] {in_bounds = [true, true, true]} : vector<1x1x8xf16>, memref<1x64x64xf16, #gpu.address_space<workgroup>>
%64 = affine.apply affine_map<()[s0] -> (s0 + 32)>()[%62#4]
%65 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%62#5]
vector.transfer_write %48, %alloc_3[%c0, %64, %65] {in_bounds = [true, true, true]} : vector<1x1x8xf16>, memref<1x64x64xf16, #gpu.address_space<workgroup>>
gpu.barrier
%66:6 = affine.delinearize_index %0 into (%c1, %c1, %c1, %c1, %c16, %c16) : index, index, index, index, index, index
%67 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%66#5]
vector.transfer_write %52, %alloc[%c0, %66#4, %67] {in_bounds = [true, true, true]} : vector<1x1x8xf16>, memref<1x64x128xf16, #gpu.address_space<workgroup>>
%68 = affine.apply affine_map<()[s0] -> (s0 + 16)>()[%66#4]
%69 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%66#5]
vector.transfer_write %55, %alloc[%c0, %68, %69] {in_bounds = [true, true, true]} : vector<1x1x8xf16>, memref<1x64x128xf16, #gpu.address_space<workgroup>>
%70 = affine.apply affine_map<()[s0] -> (s0 + 32)>()[%66#4]
%71 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%66#5]
vector.transfer_write %58, %alloc[%c0, %70, %71] {in_bounds = [true, true, true]} : vector<1x1x8xf16>, memref<1x64x128xf16, #gpu.address_space<workgroup>>
%72 = affine.apply affine_map<()[s0] -> (s0 + 48)>()[%66#4]
%73 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%66#5]
vector.transfer_write %61, %alloc[%c0, %72, %73] {in_bounds = [true, true, true]} : vector<1x1x8xf16>, memref<1x64x128xf16, #gpu.address_space<workgroup>>
gpu.barrier
%74:7 = affine.delinearize_index %0 into (%c1, %c2, %c2, %c1, %c1, %c4, %c16) : index, index, index, index, index, index, index
%75 = affine.apply affine_map<()[s0, s1] -> (s0 * 32 + s1)>()[%74#1, %74#6]
%76 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%74#5]
%77 = vector.transfer_read %alloc_3[%c0, %75, %76], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x64xf16, #gpu.address_space<workgroup>>, vector<1x1x4xf16>
%78 = vector.insert_strided_slice %77, %cst_1 {offsets = [0, 0, 0, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x2x4x1x1x1x1x1x4xf16>
%79 = affine.apply affine_map<()[s0, s1] -> (s0 * 32 + s1)>()[%74#1, %74#6]
%80 = affine.apply affine_map<()[s0] -> (s0 * 4 + 16)>()[%74#5]
%81 = vector.transfer_read %alloc_3[%c0, %79, %80], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x64xf16, #gpu.address_space<workgroup>>, vector<1x1x4xf16>
%82 = vector.insert_strided_slice %81, %78 {offsets = [0, 0, 1, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x2x4x1x1x1x1x1x4xf16>
%83 = affine.apply affine_map<()[s0, s1] -> (s0 * 32 + s1)>()[%74#1, %74#6]
%84 = affine.apply affine_map<()[s0] -> (s0 * 4 + 32)>()[%74#5]
%85 = vector.transfer_read %alloc_3[%c0, %83, %84], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x64xf16, #gpu.address_space<workgroup>>, vector<1x1x4xf16>
%86 = vector.insert_strided_slice %85, %82 {offsets = [0, 0, 2, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x2x4x1x1x1x1x1x4xf16>
%87 = affine.apply affine_map<()[s0, s1] -> (s0 * 32 + s1)>()[%74#1, %74#6]
%88 = affine.apply affine_map<()[s0] -> (s0 * 4 + 48)>()[%74#5]
%89 = vector.transfer_read %alloc_3[%c0, %87, %88], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x64xf16, #gpu.address_space<workgroup>>, vector<1x1x4xf16>
%90 = vector.insert_strided_slice %89, %86 {offsets = [0, 0, 3, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x2x4x1x1x1x1x1x4xf16>
%91 = affine.apply affine_map<()[s0, s1] -> (s0 * 32 + s1 + 16)>()[%74#1, %74#6]
%92 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%74#5]
%93 = vector.transfer_read %alloc_3[%c0, %91, %92], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x64xf16, #gpu.address_space<workgroup>>, vector<1x1x4xf16>
%94 = vector.insert_strided_slice %93, %90 {offsets = [0, 1, 0, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x2x4x1x1x1x1x1x4xf16>
%95 = affine.apply affine_map<()[s0, s1] -> (s0 * 32 + s1 + 16)>()[%74#1, %74#6]
%96 = affine.apply affine_map<()[s0] -> (s0 * 4 + 16)>()[%74#5]
%97 = vector.transfer_read %alloc_3[%c0, %95, %96], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x64xf16, #gpu.address_space<workgroup>>, vector<1x1x4xf16>
%98 = vector.insert_strided_slice %97, %94 {offsets = [0, 1, 1, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x2x4x1x1x1x1x1x4xf16>
%99 = affine.apply affine_map<()[s0, s1] -> (s0 * 32 + s1 + 16)>()[%74#1, %74#6]
%100 = affine.apply affine_map<()[s0] -> (s0 * 4 + 32)>()[%74#5]
%101 = vector.transfer_read %alloc_3[%c0, %99, %100], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x64xf16, #gpu.address_space<workgroup>>, vector<1x1x4xf16>
%102 = vector.insert_strided_slice %101, %98 {offsets = [0, 1, 2, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x2x4x1x1x1x1x1x4xf16>
%103 = affine.apply affine_map<()[s0, s1] -> (s0 * 32 + s1 + 16)>()[%74#1, %74#6]
%104 = affine.apply affine_map<()[s0] -> (s0 * 4 + 48)>()[%74#5]
%105 = vector.transfer_read %alloc_3[%c0, %103, %104], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x64xf16, #gpu.address_space<workgroup>>, vector<1x1x4xf16>
%106 = vector.insert_strided_slice %105, %102 {offsets = [0, 1, 3, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x2x4x1x1x1x1x1x4xf16>
%107:7 = affine.delinearize_index %0 into (%c1, %c2, %c2, %c1, %c1, %c4, %c16) : index, index, index, index, index, index, index
%108 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%107#5]
%109 = affine.apply affine_map<()[s0, s1] -> (s0 * 64 + s1)>()[%107#2, %107#6]
%110 = vector.transfer_read %alloc[%c0, %108, %109], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x128xf16, #gpu.address_space<workgroup>>, vector<1x4x1xf16>
%111 = vector.transpose %110, [0, 2, 1] : vector<1x4x1xf16> to vector<1x1x4xf16>
%112 = vector.insert_strided_slice %111, %cst_0 {offsets = [0, 0, 0, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x4x4x1x1x1x1x1x4xf16>
%113 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%107#5]
%114 = affine.apply affine_map<()[s0, s1] -> (s0 * 64 + s1 + 16)>()[%107#2, %107#6]
%115 = vector.transfer_read %alloc[%c0, %113, %114], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x128xf16, #gpu.address_space<workgroup>>, vector<1x4x1xf16>
%116 = vector.transpose %115, [0, 2, 1] : vector<1x4x1xf16> to vector<1x1x4xf16>
%117 = vector.insert_strided_slice %116, %112 {offsets = [0, 0, 1, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x4x4x1x1x1x1x1x4xf16>
%118 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%107#5]
%119 = affine.apply affine_map<()[s0, s1] -> (s0 * 64 + s1 + 32)>()[%107#2, %107#6]
%120 = vector.transfer_read %alloc[%c0, %118, %119], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x128xf16, #gpu.address_space<workgroup>>, vector<1x4x1xf16>
%121 = vector.transpose %120, [0, 2, 1] : vector<1x4x1xf16> to vector<1x1x4xf16>
%122 = vector.insert_strided_slice %121, %117 {offsets = [0, 0, 2, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x4x4x1x1x1x1x1x4xf16>
%123 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%107#5]
%124 = affine.apply affine_map<()[s0, s1] -> (s0 * 64 + s1 + 48)>()[%107#2, %107#6]
%125 = vector.transfer_read %alloc[%c0, %123, %124], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x128xf16, #gpu.address_space<workgroup>>, vector<1x4x1xf16>
%126 = vector.transpose %125, [0, 2, 1] : vector<1x4x1xf16> to vector<1x1x4xf16>
%127 = vector.insert_strided_slice %126, %122 {offsets = [0, 0, 3, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x4x4x1x1x1x1x1x4xf16>
%128 = affine.apply affine_map<()[s0] -> (s0 * 4 + 16)>()[%107#5]
%129 = affine.apply affine_map<()[s0, s1] -> (s0 * 64 + s1)>()[%107#2, %107#6]
%130 = vector.transfer_read %alloc[%c0, %128, %129], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x128xf16, #gpu.address_space<workgroup>>, vector<1x4x1xf16>
%131 = vector.transpose %130, [0, 2, 1] : vector<1x4x1xf16> to vector<1x1x4xf16>
%132 = vector.insert_strided_slice %131, %127 {offsets = [0, 1, 0, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x4x4x1x1x1x1x1x4xf16>
%133 = affine.apply affine_map<()[s0] -> (s0 * 4 + 16)>()[%107#5]
%134 = affine.apply affine_map<()[s0, s1] -> (s0 * 64 + s1 + 16)>()[%107#2, %107#6]
%135 = vector.transfer_read %alloc[%c0, %133, %134], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x128xf16, #gpu.address_space<workgroup>>, vector<1x4x1xf16>
%136 = vector.transpose %135, [0, 2, 1] : vector<1x4x1xf16> to vector<1x1x4xf16>
%137 = vector.insert_strided_slice %136, %132 {offsets = [0, 1, 1, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x4x4x1x1x1x1x1x4xf16>
%138 = affine.apply affine_map<()[s0] -> (s0 * 4 + 16)>()[%107#5]
%139 = affine.apply affine_map<()[s0, s1] -> (s0 * 64 + s1 + 32)>()[%107#2, %107#6]
%140 = vector.transfer_read %alloc[%c0, %138, %139], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x128xf16, #gpu.address_space<workgroup>>, vector<1x4x1xf16>
%141 = vector.transpose %140, [0, 2, 1] : vector<1x4x1xf16> to vector<1x1x4xf16>
%142 = vector.insert_strided_slice %141, %137 {offsets = [0, 1, 2, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x4x4x1x1x1x1x1x4xf16>
%143 = affine.apply affine_map<()[s0] -> (s0 * 4 + 16)>()[%107#5]
%144 = affine.apply affine_map<()[s0, s1] -> (s0 * 64 + s1 + 48)>()[%107#2, %107#6]
%145 = vector.transfer_read %alloc[%c0, %143, %144], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x128xf16, #gpu.address_space<workgroup>>, vector<1x4x1xf16>
%146 = vector.transpose %145, [0, 2, 1] : vector<1x4x1xf16> to vector<1x1x4xf16>
%147 = vector.insert_strided_slice %146, %142 {offsets = [0, 1, 3, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x4x4x1x1x1x1x1x4xf16>
%148 = affine.apply affine_map<()[s0] -> (s0 * 4 + 32)>()[%107#5]
%149 = affine.apply affine_map<()[s0, s1] -> (s0 * 64 + s1)>()[%107#2, %107#6]
%150 = vector.transfer_read %alloc[%c0, %148, %149], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x128xf16, #gpu.address_space<workgroup>>, vector<1x4x1xf16>
%151 = vector.transpose %150, [0, 2, 1] : vector<1x4x1xf16> to vector<1x1x4xf16>
%152 = vector.insert_strided_slice %151, %147 {offsets = [0, 2, 0, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x4x4x1x1x1x1x1x4xf16>
%153 = affine.apply affine_map<()[s0] -> (s0 * 4 + 32)>()[%107#5]
%154 = affine.apply affine_map<()[s0, s1] -> (s0 * 64 + s1 + 16)>()[%107#2, %107#6]
%155 = vector.transfer_read %alloc[%c0, %153, %154], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x128xf16, #gpu.address_space<workgroup>>, vector<1x4x1xf16>
%156 = vector.transpose %155, [0, 2, 1] : vector<1x4x1xf16> to vector<1x1x4xf16>
%157 = vector.insert_strided_slice %156, %152 {offsets = [0, 2, 1, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x4x4x1x1x1x1x1x4xf16>
%158 = affine.apply affine_map<()[s0] -> (s0 * 4 + 32)>()[%107#5]
%159 = affine.apply affine_map<()[s0, s1] -> (s0 * 64 + s1 + 32)>()[%107#2, %107#6]
%160 = vector.transfer_read %alloc[%c0, %158, %159], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x128xf16, #gpu.address_space<workgroup>>, vector<1x4x1xf16>
%161 = vector.transpose %160, [0, 2, 1] : vector<1x4x1xf16> to vector<1x1x4xf16>
%162 = vector.insert_strided_slice %161, %157 {offsets = [0, 2, 2, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x4x4x1x1x1x1x1x4xf16>
%163 = affine.apply affine_map<()[s0] -> (s0 * 4 + 32)>()[%107#5]
%164 = affine.apply affine_map<()[s0, s1] -> (s0 * 64 + s1 + 48)>()[%107#2, %107#6]
%165 = vector.transfer_read %alloc[%c0, %163, %164], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x128xf16, #gpu.address_space<workgroup>>, vector<1x4x1xf16>
%166 = vector.transpose %165, [0, 2, 1] : vector<1x4x1xf16> to vector<1x1x4xf16>
%167 = vector.insert_strided_slice %166, %162 {offsets = [0, 2, 3, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x4x4x1x1x1x1x1x4xf16>
%168 = affine.apply affine_map<()[s0] -> (s0 * 4 + 48)>()[%107#5]
%169 = affine.apply affine_map<()[s0, s1] -> (s0 * 64 + s1)>()[%107#2, %107#6]
%170 = vector.transfer_read %alloc[%c0, %168, %169], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x128xf16, #gpu.address_space<workgroup>>, vector<1x4x1xf16>
%171 = vector.transpose %170, [0, 2, 1] : vector<1x4x1xf16> to vector<1x1x4xf16>
%172 = vector.insert_strided_slice %171, %167 {offsets = [0, 3, 0, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x4x4x1x1x1x1x1x4xf16>
%173 = affine.apply affine_map<()[s0] -> (s0 * 4 + 48)>()[%107#5]
%174 = affine.apply affine_map<()[s0, s1] -> (s0 * 64 + s1 + 16)>()[%107#2, %107#6]
%175 = vector.transfer_read %alloc[%c0, %173, %174], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x128xf16, #gpu.address_space<workgroup>>, vector<1x4x1xf16>
%176 = vector.transpose %175, [0, 2, 1] : vector<1x4x1xf16> to vector<1x1x4xf16>
%177 = vector.insert_strided_slice %176, %172 {offsets = [0, 3, 1, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x4x4x1x1x1x1x1x4xf16>
%178 = affine.apply affine_map<()[s0] -> (s0 * 4 + 48)>()[%107#5]
%179 = affine.apply affine_map<()[s0, s1] -> (s0 * 64 + s1 + 32)>()[%107#2, %107#6]
%180 = vector.transfer_read %alloc[%c0, %178, %179], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x128xf16, #gpu.address_space<workgroup>>, vector<1x4x1xf16>
%181 = vector.transpose %180, [0, 2, 1] : vector<1x4x1xf16> to vector<1x1x4xf16>
%182 = vector.insert_strided_slice %181, %177 {offsets = [0, 3, 2, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x4x4x1x1x1x1x1x4xf16>
%183 = affine.apply affine_map<()[s0] -> (s0 * 4 + 48)>()[%107#5]
%184 = affine.apply affine_map<()[s0, s1] -> (s0 * 64 + s1 + 48)>()[%107#2, %107#6]
%185 = vector.transfer_read %alloc[%c0, %183, %184], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x128xf16, #gpu.address_space<workgroup>>, vector<1x4x1xf16>
%186 = vector.transpose %185, [0, 2, 1] : vector<1x4x1xf16> to vector<1x1x4xf16>
%187 = vector.insert_strided_slice %186, %182 {offsets = [0, 3, 3, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x4x4x1x1x1x1x1x4xf16>
%188 = arith.extf %arg1 : vector<1x2x4x1x1x1x1x1x4xf16> to vector<1x2x4x1x1x1x1x1x4xf32>
%189 = vector.extract %188[0, 0, 0] : vector<1x1x1x1x1x4xf32> from vector<1x2x4x1x1x1x1x1x4xf32>
%190 = vector.extract %106[0, 0, 0] : vector<1x1x1x1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%191 = vector.extract %187[0, 0, 0] : vector<1x1x1x1x1x4xf16> from vector<1x4x4x1x1x1x1x1x4xf16>
%192 = vector.shape_cast %190 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%193 = vector.shape_cast %191 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%194 = vector.shape_cast %189 : vector<1x1x1x1x1x4xf32> to vector<4xf32>
%195 = amdgpu.mfma %192 * %193 + %194 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%196 = vector.extract %106[0, 0, 1] : vector<1x1x1x1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%197 = vector.extract %187[0, 1, 0] : vector<1x1x1x1x1x4xf16> from vector<1x4x4x1x1x1x1x1x4xf16>
%198 = vector.shape_cast %196 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%199 = vector.shape_cast %197 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%200 = amdgpu.mfma %198 * %199 + %195 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%201 = vector.extract %106[0, 0, 2] : vector<1x1x1x1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%202 = vector.extract %187[0, 2, 0] : vector<1x1x1x1x1x4xf16> from vector<1x4x4x1x1x1x1x1x4xf16>
%203 = vector.shape_cast %201 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%204 = vector.shape_cast %202 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%205 = amdgpu.mfma %203 * %204 + %200 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%206 = vector.extract %106[0, 0, 3] : vector<1x1x1x1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%207 = vector.extract %187[0, 3, 0] : vector<1x1x1x1x1x4xf16> from vector<1x4x4x1x1x1x1x1x4xf16>
%208 = vector.shape_cast %206 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%209 = vector.shape_cast %207 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%210 = amdgpu.mfma %208 * %209 + %205 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%211 = vector.shape_cast %210 : vector<4xf32> to vector<1x1x1x1x1x4xf32>
%212 = vector.insert %211, %cst [0, 0, 0] : vector<1x1x1x1x1x4xf32> into vector<1x2x4x1x1x1x1x1x4xf32>
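// annotation: each amdgpu.mfma above is a single 16x16x16 f16 MFMA with an f32
// accumulator; the chain of four reduces this iteration's K = 64 slice (four
// K-batches of 16). Eight such chains, this one included, cover the thread's
// 2 (M) x 4 (N) accumulator batches per loop iteration.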
%213 = vector.extract %188[0, 0, 1] : vector<1x1x1x1x1x4xf32> from vector<1x2x4x1x1x1x1x1x4xf32>
%214 = vector.extract %106[0, 0, 0] : vector<1x1x1x1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%215 = vector.extract %187[0, 0, 1] : vector<1x1x1x1x1x4xf16> from vector<1x4x4x1x1x1x1x1x4xf16>
%216 = vector.shape_cast %214 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%217 = vector.shape_cast %215 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%218 = vector.shape_cast %213 : vector<1x1x1x1x1x4xf32> to vector<4xf32>
%219 = amdgpu.mfma %216 * %217 + %218 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%220 = vector.extract %106[0, 0, 1] : vector<1x1x1x1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%221 = vector.extract %187[0, 1, 1] : vector<1x1x1x1x1x4xf16> from vector<1x4x4x1x1x1x1x1x4xf16>
%222 = vector.shape_cast %220 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%223 = vector.shape_cast %221 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%224 = amdgpu.mfma %222 * %223 + %219 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%225 = vector.extract %106[0, 0, 2] : vector<1x1x1x1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%226 = vector.extract %187[0, 2, 1] : vector<1x1x1x1x1x4xf16> from vector<1x4x4x1x1x1x1x1x4xf16>
%227 = vector.shape_cast %225 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%228 = vector.shape_cast %226 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%229 = amdgpu.mfma %227 * %228 + %224 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%230 = vector.extract %106[0, 0, 3] : vector<1x1x1x1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%231 = vector.extract %187[0, 3, 1] : vector<1x1x1x1x1x4xf16> from vector<1x4x4x1x1x1x1x1x4xf16>
%232 = vector.shape_cast %230 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%233 = vector.shape_cast %231 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%234 = amdgpu.mfma %232 * %233 + %229 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%235 = vector.shape_cast %234 : vector<4xf32> to vector<1x1x1x1x1x4xf32>
%236 = vector.insert %235, %212 [0, 0, 1] : vector<1x1x1x1x1x4xf32> into vector<1x2x4x1x1x1x1x1x4xf32>
%237 = vector.extract %188[0, 0, 2] : vector<1x1x1x1x1x4xf32> from vector<1x2x4x1x1x1x1x1x4xf32>
%238 = vector.extract %106[0, 0, 0] : vector<1x1x1x1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%239 = vector.extract %187[0, 0, 2] : vector<1x1x1x1x1x4xf16> from vector<1x4x4x1x1x1x1x1x4xf16>
%240 = vector.shape_cast %238 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%241 = vector.shape_cast %239 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%242 = vector.shape_cast %237 : vector<1x1x1x1x1x4xf32> to vector<4xf32>
%243 = amdgpu.mfma %240 * %241 + %242 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%244 = vector.extract %106[0, 0, 1] : vector<1x1x1x1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%245 = vector.extract %187[0, 1, 2] : vector<1x1x1x1x1x4xf16> from vector<1x4x4x1x1x1x1x1x4xf16>
%246 = vector.shape_cast %244 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%247 = vector.shape_cast %245 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%248 = amdgpu.mfma %246 * %247 + %243 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%249 = vector.extract %106[0, 0, 2] : vector<1x1x1x1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%250 = vector.extract %187[0, 2, 2] : vector<1x1x1x1x1x4xf16> from vector<1x4x4x1x1x1x1x1x4xf16>
%251 = vector.shape_cast %249 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%252 = vector.shape_cast %250 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%253 = amdgpu.mfma %251 * %252 + %248 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%254 = vector.extract %106[0, 0, 3] : vector<1x1x1x1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%255 = vector.extract %187[0, 3, 2] : vector<1x1x1x1x1x4xf16> from vector<1x4x4x1x1x1x1x1x4xf16>
%256 = vector.shape_cast %254 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%257 = vector.shape_cast %255 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%258 = amdgpu.mfma %256 * %257 + %253 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%259 = vector.shape_cast %258 : vector<4xf32> to vector<1x1x1x1x1x4xf32>
%260 = vector.insert %259, %236 [0, 0, 2] : vector<1x1x1x1x1x4xf32> into vector<1x2x4x1x1x1x1x1x4xf32>
%261 = vector.extract %188[0, 0, 3] : vector<1x1x1x1x1x4xf32> from vector<1x2x4x1x1x1x1x1x4xf32>
%262 = vector.extract %106[0, 0, 0] : vector<1x1x1x1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%263 = vector.extract %187[0, 0, 3] : vector<1x1x1x1x1x4xf16> from vector<1x4x4x1x1x1x1x1x4xf16>
%264 = vector.shape_cast %262 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%265 = vector.shape_cast %263 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%266 = vector.shape_cast %261 : vector<1x1x1x1x1x4xf32> to vector<4xf32>
%267 = amdgpu.mfma %264 * %265 + %266 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%268 = vector.extract %106[0, 0, 1] : vector<1x1x1x1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%269 = vector.extract %187[0, 1, 3] : vector<1x1x1x1x1x4xf16> from vector<1x4x4x1x1x1x1x1x4xf16>
%270 = vector.shape_cast %268 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%271 = vector.shape_cast %269 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%272 = amdgpu.mfma %270 * %271 + %267 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%273 = vector.extract %106[0, 0, 2] : vector<1x1x1x1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%274 = vector.extract %187[0, 2, 3] : vector<1x1x1x1x1x4xf16> from vector<1x4x4x1x1x1x1x1x4xf16>
%275 = vector.shape_cast %273 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%276 = vector.shape_cast %274 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%277 = amdgpu.mfma %275 * %276 + %272 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%278 = vector.extract %106[0, 0, 3] : vector<1x1x1x1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%279 = vector.extract %187[0, 3, 3] : vector<1x1x1x1x1x4xf16> from vector<1x4x4x1x1x1x1x1x4xf16>
%280 = vector.shape_cast %278 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%281 = vector.shape_cast %279 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%282 = amdgpu.mfma %280 * %281 + %277 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%283 = vector.shape_cast %282 : vector<4xf32> to vector<1x1x1x1x1x4xf32>
%284 = vector.insert %283, %260 [0, 0, 3] : vector<1x1x1x1x1x4xf32> into vector<1x2x4x1x1x1x1x1x4xf32>
%285 = vector.extract %188[0, 1, 0] : vector<1x1x1x1x1x4xf32> from vector<1x2x4x1x1x1x1x1x4xf32>
%286 = vector.extract %106[0, 1, 0] : vector<1x1x1x1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%287 = vector.extract %187[0, 0, 0] : vector<1x1x1x1x1x4xf16> from vector<1x4x4x1x1x1x1x1x4xf16>
%288 = vector.shape_cast %286 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%289 = vector.shape_cast %287 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%290 = vector.shape_cast %285 : vector<1x1x1x1x1x4xf32> to vector<4xf32>
%291 = amdgpu.mfma %288 * %289 + %290 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%292 = vector.extract %106[0, 1, 1] : vector<1x1x1x1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%293 = vector.extract %187[0, 1, 0] : vector<1x1x1x1x1x4xf16> from vector<1x4x4x1x1x1x1x1x4xf16>
%294 = vector.shape_cast %292 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%295 = vector.shape_cast %293 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%296 = amdgpu.mfma %294 * %295 + %291 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%297 = vector.extract %106[0, 1, 2] : vector<1x1x1x1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%298 = vector.extract %187[0, 2, 0] : vector<1x1x1x1x1x4xf16> from vector<1x4x4x1x1x1x1x1x4xf16>
%299 = vector.shape_cast %297 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%300 = vector.shape_cast %298 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%301 = amdgpu.mfma %299 * %300 + %296 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%302 = vector.extract %106[0, 1, 3] : vector<1x1x1x1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%303 = vector.extract %187[0, 3, 0] : vector<1x1x1x1x1x4xf16> from vector<1x4x4x1x1x1x1x1x4xf16>
%304 = vector.shape_cast %302 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%305 = vector.shape_cast %303 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%306 = amdgpu.mfma %304 * %305 + %301 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%307 = vector.shape_cast %306 : vector<4xf32> to vector<1x1x1x1x1x4xf32>
%308 = vector.insert %307, %284 [0, 1, 0] : vector<1x1x1x1x1x4xf32> into vector<1x2x4x1x1x1x1x1x4xf32>
%309 = vector.extract %188[0, 1, 1] : vector<1x1x1x1x1x4xf32> from vector<1x2x4x1x1x1x1x1x4xf32>
%310 = vector.extract %106[0, 1, 0] : vector<1x1x1x1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%311 = vector.extract %187[0, 0, 1] : vector<1x1x1x1x1x4xf16> from vector<1x4x4x1x1x1x1x1x4xf16>
%312 = vector.shape_cast %310 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%313 = vector.shape_cast %311 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%314 = vector.shape_cast %309 : vector<1x1x1x1x1x4xf32> to vector<4xf32>
%315 = amdgpu.mfma %312 * %313 + %314 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%316 = vector.extract %106[0, 1, 1] : vector<1x1x1x1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%317 = vector.extract %187[0, 1, 1] : vector<1x1x1x1x1x4xf16> from vector<1x4x4x1x1x1x1x1x4xf16>
%318 = vector.shape_cast %316 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%319 = vector.shape_cast %317 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%320 = amdgpu.mfma %318 * %319 + %315 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%321 = vector.extract %106[0, 1, 2] : vector<1x1x1x1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%322 = vector.extract %187[0, 2, 1] : vector<1x1x1x1x1x4xf16> from vector<1x4x4x1x1x1x1x1x4xf16>
%323 = vector.shape_cast %321 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%324 = vector.shape_cast %322 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%325 = amdgpu.mfma %323 * %324 + %320 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%326 = vector.extract %106[0, 1, 3] : vector<1x1x1x1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%327 = vector.extract %187[0, 3, 1] : vector<1x1x1x1x1x4xf16> from vector<1x4x4x1x1x1x1x1x4xf16>
%328 = vector.shape_cast %326 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%329 = vector.shape_cast %327 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%330 = amdgpu.mfma %328 * %329 + %325 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%331 = vector.shape_cast %330 : vector<4xf32> to vector<1x1x1x1x1x4xf32>
%332 = vector.insert %331, %308 [0, 1, 1] : vector<1x1x1x1x1x4xf32> into vector<1x2x4x1x1x1x1x1x4xf32>
%333 = vector.extract %188[0, 1, 2] : vector<1x1x1x1x1x4xf32> from vector<1x2x4x1x1x1x1x1x4xf32>
%334 = vector.extract %106[0, 1, 0] : vector<1x1x1x1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%335 = vector.extract %187[0, 0, 2] : vector<1x1x1x1x1x4xf16> from vector<1x4x4x1x1x1x1x1x4xf16>
%336 = vector.shape_cast %334 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%337 = vector.shape_cast %335 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%338 = vector.shape_cast %333 : vector<1x1x1x1x1x4xf32> to vector<4xf32>
%339 = amdgpu.mfma %336 * %337 + %338 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%340 = vector.extract %106[0, 1, 1] : vector<1x1x1x1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%341 = vector.extract %187[0, 1, 2] : vector<1x1x1x1x1x4xf16> from vector<1x4x4x1x1x1x1x1x4xf16>
%342 = vector.shape_cast %340 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%343 = vector.shape_cast %341 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%344 = amdgpu.mfma %342 * %343 + %339 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%345 = vector.extract %106[0, 1, 2] : vector<1x1x1x1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%346 = vector.extract %187[0, 2, 2] : vector<1x1x1x1x1x4xf16> from vector<1x4x4x1x1x1x1x1x4xf16>
%347 = vector.shape_cast %345 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%348 = vector.shape_cast %346 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%349 = amdgpu.mfma %347 * %348 + %344 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%350 = vector.extract %106[0, 1, 3] : vector<1x1x1x1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%351 = vector.extract %187[0, 3, 2] : vector<1x1x1x1x1x4xf16> from vector<1x4x4x1x1x1x1x1x4xf16>
%352 = vector.shape_cast %350 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%353 = vector.shape_cast %351 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%354 = amdgpu.mfma %352 * %353 + %349 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%355 = vector.shape_cast %354 : vector<4xf32> to vector<1x1x1x1x1x4xf32>
%356 = vector.insert %355, %332 [0, 1, 2] : vector<1x1x1x1x1x4xf32> into vector<1x2x4x1x1x1x1x1x4xf32>
%357 = vector.extract %188[0, 1, 3] : vector<1x1x1x1x1x4xf32> from vector<1x2x4x1x1x1x1x1x4xf32>
%358 = vector.extract %106[0, 1, 0] : vector<1x1x1x1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%359 = vector.extract %187[0, 0, 3] : vector<1x1x1x1x1x4xf16> from vector<1x4x4x1x1x1x1x1x4xf16>
%360 = vector.shape_cast %358 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%361 = vector.shape_cast %359 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%362 = vector.shape_cast %357 : vector<1x1x1x1x1x4xf32> to vector<4xf32>
%363 = amdgpu.mfma %360 * %361 + %362 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%364 = vector.extract %106[0, 1, 1] : vector<1x1x1x1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%365 = vector.extract %187[0, 1, 3] : vector<1x1x1x1x1x4xf16> from vector<1x4x4x1x1x1x1x1x4xf16>
%366 = vector.shape_cast %364 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%367 = vector.shape_cast %365 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%368 = amdgpu.mfma %366 * %367 + %363 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%369 = vector.extract %106[0, 1, 2] : vector<1x1x1x1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%370 = vector.extract %187[0, 2, 3] : vector<1x1x1x1x1x4xf16> from vector<1x4x4x1x1x1x1x1x4xf16>
%371 = vector.shape_cast %369 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%372 = vector.shape_cast %370 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%373 = amdgpu.mfma %371 * %372 + %368 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%374 = vector.extract %106[0, 1, 3] : vector<1x1x1x1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%375 = vector.extract %187[0, 3, 3] : vector<1x1x1x1x1x4xf16> from vector<1x4x4x1x1x1x1x1x4xf16>
%376 = vector.shape_cast %374 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%377 = vector.shape_cast %375 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%378 = amdgpu.mfma %376 * %377 + %373 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%379 = vector.shape_cast %378 : vector<4xf32> to vector<1x1x1x1x1x4xf32>
%380 = vector.insert %379, %356 [0, 1, 3] : vector<1x1x1x1x1x4xf32> into vector<1x2x4x1x1x1x1x1x4xf32>
%381 = arith.truncf %380 : vector<1x2x4x1x1x1x1x1x4xf32> to vector<1x2x4x1x1x1x1x1x4xf16>
scf.yield %381 : vector<1x2x4x1x1x1x1x1x4xf16>
} {__vector_layout_fetcher_storage = [[unit, unit, unit, #iree_vector_ext.nested_layout<subgroups_per_workgroup = [1, 2, 2], batches_per_subgroup = [1, 2, 4], outers_per_batch = [1, 1, 1], threads_per_outer = [1, 4, 16], elements_per_thread = [1, 4, 1], element_order = [0, 2, 1], subgroup_basis = [1, 2, 2, 1], subgroup_active_ids = [true, true, true, false], thread_basis = [1, 4, 16]>], [#iree_vector_ext.nested_layout<subgroups_per_workgroup = [1, 2, 2], batches_per_subgroup = [1, 2, 4], outers_per_batch = [1, 1, 1], threads_per_outer = [1, 4, 16], elements_per_thread = [1, 4, 1], element_order = [0, 2, 1], subgroup_basis = [1, 2, 2, 1], subgroup_active_ids = [true, true, true, false], thread_basis = [1, 4, 16]>]]}
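// annotation: decoding the accumulator nested_layout for the 64x128 workgroup
// tile: M = 2 subgroups * 2 batches * 4 threads * 4 elements = 64 and
// N = 2 subgroups * 4 batches * 16 threads * 1 element = 128; element_order =
// [0, 2, 1] matches the vector<1x4x1> <-> vector<1x1x4> transposes around the
// shared-memory accesses.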
%subview_5 = memref.subview %3[%workgroup_id_z, %4, %5] [1, %6, 128] [1, 1, 1] : memref<64x968x1280xf16, #hal.descriptor_type<storage_buffer>> to memref<1x?x128xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%8:7 = affine.delinearize_index %0 into (%c1, %c2, %c2, %c1, %c1, %c4, %c16) : index, index, index, index, index, index, index
%9 = affine.apply affine_map<()[s0, s1] -> (s0 * 32 + s1 * 4)>()[%8#1, %8#5]
%10 = affine.apply affine_map<()[s0, s1] -> (s0 * 64 + s1)>()[%8#2, %8#6]
%11 = vector.extract %7[0, 0, 0, 0, 0, 0] : vector<1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%12 = vector.transpose %11, [0, 2, 1] : vector<1x1x4xf16> to vector<1x4x1xf16>
vector.transfer_write %12, %subview_5[%c0, %9, %10] {in_bounds = [true, false, true]} : vector<1x4x1xf16>, memref<1x?x128xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%13 = affine.apply affine_map<()[s0, s1] -> (s0 * 32 + s1 * 4)>()[%8#1, %8#5]
%14 = affine.apply affine_map<()[s0, s1] -> (s0 * 64 + s1 + 16)>()[%8#2, %8#6]
%15 = vector.extract %7[0, 0, 1, 0, 0, 0] : vector<1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%16 = vector.transpose %15, [0, 2, 1] : vector<1x1x4xf16> to vector<1x4x1xf16>
vector.transfer_write %16, %subview_5[%c0, %13, %14] {in_bounds = [true, false, true]} : vector<1x4x1xf16>, memref<1x?x128xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%17 = affine.apply affine_map<()[s0, s1] -> (s0 * 32 + s1 * 4)>()[%8#1, %8#5]
%18 = affine.apply affine_map<()[s0, s1] -> (s0 * 64 + s1 + 32)>()[%8#2, %8#6]
%19 = vector.extract %7[0, 0, 2, 0, 0, 0] : vector<1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%20 = vector.transpose %19, [0, 2, 1] : vector<1x1x4xf16> to vector<1x4x1xf16>
vector.transfer_write %20, %subview_5[%c0, %17, %18] {in_bounds = [true, false, true]} : vector<1x4x1xf16>, memref<1x?x128xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%21 = affine.apply affine_map<()[s0, s1] -> (s0 * 32 + s1 * 4)>()[%8#1, %8#5]
%22 = affine.apply affine_map<()[s0, s1] -> (s0 * 64 + s1 + 48)>()[%8#2, %8#6]
%23 = vector.extract %7[0, 0, 3, 0, 0, 0] : vector<1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%24 = vector.transpose %23, [0, 2, 1] : vector<1x1x4xf16> to vector<1x4x1xf16>
vector.transfer_write %24, %subview_5[%c0, %21, %22] {in_bounds = [true, false, true]} : vector<1x4x1xf16>, memref<1x?x128xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%25 = affine.apply affine_map<()[s0, s1] -> (s0 * 32 + s1 * 4 + 16)>()[%8#1, %8#5]
%26 = affine.apply affine_map<()[s0, s1] -> (s0 * 64 + s1)>()[%8#2, %8#6]
%27 = vector.extract %7[0, 1, 0, 0, 0, 0] : vector<1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%28 = vector.transpose %27, [0, 2, 1] : vector<1x1x4xf16> to vector<1x4x1xf16>
vector.transfer_write %28, %subview_5[%c0, %25, %26] {in_bounds = [true, false, true]} : vector<1x4x1xf16>, memref<1x?x128xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%29 = affine.apply affine_map<()[s0, s1] -> (s0 * 32 + s1 * 4 + 16)>()[%8#1, %8#5]
%30 = affine.apply affine_map<()[s0, s1] -> (s0 * 64 + s1 + 16)>()[%8#2, %8#6]
%31 = vector.extract %7[0, 1, 1, 0, 0, 0] : vector<1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%32 = vector.transpose %31, [0, 2, 1] : vector<1x1x4xf16> to vector<1x4x1xf16>
vector.transfer_write %32, %subview_5[%c0, %29, %30] {in_bounds = [true, false, true]} : vector<1x4x1xf16>, memref<1x?x128xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%33 = affine.apply affine_map<()[s0, s1] -> (s0 * 32 + s1 * 4 + 16)>()[%8#1, %8#5]
%34 = affine.apply affine_map<()[s0, s1] -> (s0 * 64 + s1 + 32)>()[%8#2, %8#6]
%35 = vector.extract %7[0, 1, 2, 0, 0, 0] : vector<1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%36 = vector.transpose %35, [0, 2, 1] : vector<1x1x4xf16> to vector<1x4x1xf16>
vector.transfer_write %36, %subview_5[%c0, %33, %34] {in_bounds = [true, false, true]} : vector<1x4x1xf16>, memref<1x?x128xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%37 = affine.apply affine_map<()[s0, s1] -> (s0 * 32 + s1 * 4 + 16)>()[%8#1, %8#5]
%38 = affine.apply affine_map<()[s0, s1] -> (s0 * 64 + s1 + 48)>()[%8#2, %8#6]
%39 = vector.extract %7[0, 1, 3, 0, 0, 0] : vector<1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%40 = vector.transpose %39, [0, 2, 1] : vector<1x1x4xf16> to vector<1x4x1xf16>
vector.transfer_write %40, %subview_5[%c0, %37, %38] {in_bounds = [true, false, true]} : vector<1x4x1xf16>, memref<1x?x128xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
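// annotation: distributed epilogue: each thread transposes and writes its eight
// vector<1x4x1> accumulator slices (2 M-batches x 4 N-batches) to the output
// subview; in_bounds = [true, false, true] guards the possibly partial M tile
// (968 = 15 * 64 + 8, so the last row of workgroups covers only 8 rows).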
memref.dealloc %alloc_3 : memref<1x64x64xf16, #gpu.address_space<workgroup>>
memref.dealloc %alloc : memref<1x64x128xf16, #gpu.address_space<workgroup>>
return
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
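// annotation: canonicalization here mostly folds the unit dimensions out of the
// affine.delinearize_index bases (e.g. (1, 1, 1, 1, 32, 8) becomes (32, 8) and
// the 7-way split becomes a 6-way split); the mfma structure of the loop body
// is unchanged.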
module {
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16() {
%cst = arith.constant dense<0.000000e+00> : vector<1x2x4x1x1x1x1x1x4xf32>
%cst_0 = arith.constant dense<0.000000e+00> : vector<1x4x4x1x1x1x1x1x4xf16>
%c4 = arith.constant 4 : index
%c2 = arith.constant 2 : index
%c16 = arith.constant 16 : index
%c8 = arith.constant 8 : index
%c32 = arith.constant 32 : index
%cst_1 = arith.constant dense<0.000000e+00> : vector<1x2x4x1x1x1x1x1x4xf16>
%c20 = arith.constant 20 : index
%c1 = arith.constant 1 : index
%cst_2 = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%thread_id_x = gpu.thread_id x
%thread_id_y = gpu.thread_id y
%thread_id_z = gpu.thread_id z
%0 = affine.apply affine_map<()[s0, s1, s2] -> (s0 + s1 * 128 + s2 * 256)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%alloc = memref.alloc() : memref<1x64x128xf16, #gpu.address_space<workgroup>>
%alloc_3 = memref.alloc() : memref<1x64x64xf16, #gpu.address_space<workgroup>>
%1 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<64x968x1280xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 64 : memref<64x968x1280xf16, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<64x1280x1280xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 64 : memref<64x1280x1280xf16, #hal.descriptor_type<storage_buffer>>
%3 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<64x968x1280xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %3, 64 : memref<64x968x1280xf16, #hal.descriptor_type<storage_buffer>>
%workgroup_id_z = hal.interface.workgroup.id[2] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y]
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%5 = affine.apply affine_map<()[s0] -> (s0 * 128)>()[%workgroup_id_x]
%6 = affine.min affine_map<()[s0] -> (s0 * -64 + 968, 64)>()[%workgroup_id_y]
%subview = memref.subview %1[%workgroup_id_z, %4, 0] [1, %6, 1280] [1, 1, 1] : memref<64x968x1280xf16, #hal.descriptor_type<storage_buffer>> to memref<1x?x1280xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_4 = memref.subview %2[%workgroup_id_z, 0, %5] [1, 1280, 128] [1, 1, 1] : memref<64x1280x1280xf16, #hal.descriptor_type<storage_buffer>> to memref<1x1280x128xf16, strided<[1638400, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%7 = scf.for %arg0 = %c0 to %c20 step %c1 iter_args(%arg1 = %cst_1) -> (vector<1x2x4x1x1x1x1x1x4xf16>) {
%41 = affine.delinearize_index %arg0 into (%c20) : index
%42 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%41]
%subview_6 = memref.subview %subview[0, 0, %42] [1, %6, 64] [1, 1, 1] : memref<1x?x1280xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x?x64xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%43:2 = affine.delinearize_index %0 into (%c32, %c8) : index, index
%44 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%43#1]
%45 = vector.transfer_read %subview_6[%c0, %43#0, %44], %cst_2 {in_bounds = [true, false, true]} : memref<1x?x64xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<1x1x8xf16>
%46 = affine.apply affine_map<()[s0] -> (s0 + 32)>()[%43#0]
%47 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%43#1]
%48 = vector.transfer_read %subview_6[%c0, %46, %47], %cst_2 {in_bounds = [true, false, true]} : memref<1x?x64xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<1x1x8xf16>
%49:2 = affine.delinearize_index %0 into (%c16, %c16) : index, index
%50 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 64)>()[%49#0, %41]
%51 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%49#1]
%52 = vector.transfer_read %subview_4[%c0, %50, %51], %cst_2 {in_bounds = [true, true, true]} : memref<1x1280x128xf16, strided<[1638400, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<1x1x8xf16>
%53 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 64 + 16)>()[%49#0, %41]
%54 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%49#1]
%55 = vector.transfer_read %subview_4[%c0, %53, %54], %cst_2 {in_bounds = [true, true, true]} : memref<1x1280x128xf16, strided<[1638400, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<1x1x8xf16>
%56 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 64 + 32)>()[%49#0, %41]
%57 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%49#1]
%58 = vector.transfer_read %subview_4[%c0, %56, %57], %cst_2 {in_bounds = [true, true, true]} : memref<1x1280x128xf16, strided<[1638400, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<1x1x8xf16>
%59 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 64 + 48)>()[%49#0, %41]
%60 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%49#1]
%61 = vector.transfer_read %subview_4[%c0, %59, %60], %cst_2 {in_bounds = [true, true, true]} : memref<1x1280x128xf16, strided<[1638400, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<1x1x8xf16>
gpu.barrier
%62:2 = affine.delinearize_index %0 into (%c32, %c8) : index, index
%63 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%62#1]
vector.transfer_write %45, %alloc_3[%c0, %62#0, %63] {in_bounds = [true, true, true]} : vector<1x1x8xf16>, memref<1x64x64xf16, #gpu.address_space<workgroup>>
%64 = affine.apply affine_map<()[s0] -> (s0 + 32)>()[%62#0]
%65 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%62#1]
vector.transfer_write %48, %alloc_3[%c0, %64, %65] {in_bounds = [true, true, true]} : vector<1x1x8xf16>, memref<1x64x64xf16, #gpu.address_space<workgroup>>
gpu.barrier
%66:2 = affine.delinearize_index %0 into (%c16, %c16) : index, index
%67 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%66#1]
vector.transfer_write %52, %alloc[%c0, %66#0, %67] {in_bounds = [true, true, true]} : vector<1x1x8xf16>, memref<1x64x128xf16, #gpu.address_space<workgroup>>
%68 = affine.apply affine_map<()[s0] -> (s0 + 16)>()[%66#0]
%69 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%66#1]
vector.transfer_write %55, %alloc[%c0, %68, %69] {in_bounds = [true, true, true]} : vector<1x1x8xf16>, memref<1x64x128xf16, #gpu.address_space<workgroup>>
%70 = affine.apply affine_map<()[s0] -> (s0 + 32)>()[%66#0]
%71 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%66#1]
vector.transfer_write %58, %alloc[%c0, %70, %71] {in_bounds = [true, true, true]} : vector<1x1x8xf16>, memref<1x64x128xf16, #gpu.address_space<workgroup>>
%72 = affine.apply affine_map<()[s0] -> (s0 + 48)>()[%66#0]
%73 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%66#1]
vector.transfer_write %61, %alloc[%c0, %72, %73] {in_bounds = [true, true, true]} : vector<1x1x8xf16>, memref<1x64x128xf16, #gpu.address_space<workgroup>>
gpu.barrier
%74:6 = affine.delinearize_index %0 into (%c2, %c2, %c1, %c1, %c4, %c16) : index, index, index, index, index, index
%75 = affine.apply affine_map<()[s0, s1] -> (s0 * 32 + s1)>()[%74#0, %74#5]
%76 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%74#4]
%77 = vector.transfer_read %alloc_3[%c0, %75, %76], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x64xf16, #gpu.address_space<workgroup>>, vector<1x1x4xf16>
%78 = vector.insert_strided_slice %77, %cst_1 {offsets = [0, 0, 0, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x2x4x1x1x1x1x1x4xf16>
%79 = affine.apply affine_map<()[s0, s1] -> (s0 * 32 + s1)>()[%74#0, %74#5]
%80 = affine.apply affine_map<()[s0] -> (s0 * 4 + 16)>()[%74#4]
%81 = vector.transfer_read %alloc_3[%c0, %79, %80], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x64xf16, #gpu.address_space<workgroup>>, vector<1x1x4xf16>
%82 = vector.insert_strided_slice %81, %78 {offsets = [0, 0, 1, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x2x4x1x1x1x1x1x4xf16>
%83 = affine.apply affine_map<()[s0, s1] -> (s0 * 32 + s1)>()[%74#0, %74#5]
%84 = affine.apply affine_map<()[s0] -> (s0 * 4 + 32)>()[%74#4]
%85 = vector.transfer_read %alloc_3[%c0, %83, %84], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x64xf16, #gpu.address_space<workgroup>>, vector<1x1x4xf16>
%86 = vector.insert_strided_slice %85, %82 {offsets = [0, 0, 2, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x2x4x1x1x1x1x1x4xf16>
%87 = affine.apply affine_map<()[s0, s1] -> (s0 * 32 + s1)>()[%74#0, %74#5]
%88 = affine.apply affine_map<()[s0] -> (s0 * 4 + 48)>()[%74#4]
%89 = vector.transfer_read %alloc_3[%c0, %87, %88], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x64xf16, #gpu.address_space<workgroup>>, vector<1x1x4xf16>
%90 = vector.insert_strided_slice %89, %86 {offsets = [0, 0, 3, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x2x4x1x1x1x1x1x4xf16>
%91 = affine.apply affine_map<()[s0, s1] -> (s0 * 32 + s1 + 16)>()[%74#0, %74#5]
%92 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%74#4]
%93 = vector.transfer_read %alloc_3[%c0, %91, %92], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x64xf16, #gpu.address_space<workgroup>>, vector<1x1x4xf16>
%94 = vector.insert_strided_slice %93, %90 {offsets = [0, 1, 0, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x2x4x1x1x1x1x1x4xf16>
%95 = affine.apply affine_map<()[s0, s1] -> (s0 * 32 + s1 + 16)>()[%74#0, %74#5]
%96 = affine.apply affine_map<()[s0] -> (s0 * 4 + 16)>()[%74#4]
%97 = vector.transfer_read %alloc_3[%c0, %95, %96], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x64xf16, #gpu.address_space<workgroup>>, vector<1x1x4xf16>
%98 = vector.insert_strided_slice %97, %94 {offsets = [0, 1, 1, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x2x4x1x1x1x1x1x4xf16>
%99 = affine.apply affine_map<()[s0, s1] -> (s0 * 32 + s1 + 16)>()[%74#0, %74#5]
%100 = affine.apply affine_map<()[s0] -> (s0 * 4 + 32)>()[%74#4]
%101 = vector.transfer_read %alloc_3[%c0, %99, %100], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x64xf16, #gpu.address_space<workgroup>>, vector<1x1x4xf16>
%102 = vector.insert_strided_slice %101, %98 {offsets = [0, 1, 2, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x2x4x1x1x1x1x1x4xf16>
%103 = affine.apply affine_map<()[s0, s1] -> (s0 * 32 + s1 + 16)>()[%74#0, %74#5]
%104 = affine.apply affine_map<()[s0] -> (s0 * 4 + 48)>()[%74#4]
%105 = vector.transfer_read %alloc_3[%c0, %103, %104], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x64xf16, #gpu.address_space<workgroup>>, vector<1x1x4xf16>
%106 = vector.insert_strided_slice %105, %102 {offsets = [0, 1, 3, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x2x4x1x1x1x1x1x4xf16>
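// Load the RHS (B) tile from workgroup memory %alloc into registers. Each
// read is a vector<1x4x1xf16> column (4 consecutive K rows, one N column) at
// row %107#4 * 4 + {0, 16, 32, 48} and column %107#1 * 64 + %107#5 +
// {0, 16, 32, 48}; the vector.transpose to 1x1x4 realizes the layout's
// element_order = [0, 2, 1], giving a 4 K-batch x 4 N-batch
// vector<1x4x4x1x1x1x1x1x4xf16> tile.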
%107:6 = affine.delinearize_index %0 into (%c2, %c2, %c1, %c1, %c4, %c16) : index, index, index, index, index, index
%108 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%107#4]
%109 = affine.apply affine_map<()[s0, s1] -> (s0 * 64 + s1)>()[%107#1, %107#5]
%110 = vector.transfer_read %alloc[%c0, %108, %109], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x128xf16, #gpu.address_space<workgroup>>, vector<1x4x1xf16>
%111 = vector.transpose %110, [0, 2, 1] : vector<1x4x1xf16> to vector<1x1x4xf16>
%112 = vector.insert_strided_slice %111, %cst_0 {offsets = [0, 0, 0, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x4x4x1x1x1x1x1x4xf16>
%113 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%107#4]
%114 = affine.apply affine_map<()[s0, s1] -> (s0 * 64 + s1 + 16)>()[%107#1, %107#5]
%115 = vector.transfer_read %alloc[%c0, %113, %114], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x128xf16, #gpu.address_space<workgroup>>, vector<1x4x1xf16>
%116 = vector.transpose %115, [0, 2, 1] : vector<1x4x1xf16> to vector<1x1x4xf16>
%117 = vector.insert_strided_slice %116, %112 {offsets = [0, 0, 1, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x4x4x1x1x1x1x1x4xf16>
%118 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%107#4]
%119 = affine.apply affine_map<()[s0, s1] -> (s0 * 64 + s1 + 32)>()[%107#1, %107#5]
%120 = vector.transfer_read %alloc[%c0, %118, %119], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x128xf16, #gpu.address_space<workgroup>>, vector<1x4x1xf16>
%121 = vector.transpose %120, [0, 2, 1] : vector<1x4x1xf16> to vector<1x1x4xf16>
%122 = vector.insert_strided_slice %121, %117 {offsets = [0, 0, 2, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x4x4x1x1x1x1x1x4xf16>
%123 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%107#4]
%124 = affine.apply affine_map<()[s0, s1] -> (s0 * 64 + s1 + 48)>()[%107#1, %107#5]
%125 = vector.transfer_read %alloc[%c0, %123, %124], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x128xf16, #gpu.address_space<workgroup>>, vector<1x4x1xf16>
%126 = vector.transpose %125, [0, 2, 1] : vector<1x4x1xf16> to vector<1x1x4xf16>
%127 = vector.insert_strided_slice %126, %122 {offsets = [0, 0, 3, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x4x4x1x1x1x1x1x4xf16>
%128 = affine.apply affine_map<()[s0] -> (s0 * 4 + 16)>()[%107#4]
%129 = affine.apply affine_map<()[s0, s1] -> (s0 * 64 + s1)>()[%107#1, %107#5]
%130 = vector.transfer_read %alloc[%c0, %128, %129], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x128xf16, #gpu.address_space<workgroup>>, vector<1x4x1xf16>
%131 = vector.transpose %130, [0, 2, 1] : vector<1x4x1xf16> to vector<1x1x4xf16>
%132 = vector.insert_strided_slice %131, %127 {offsets = [0, 1, 0, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x4x4x1x1x1x1x1x4xf16>
%133 = affine.apply affine_map<()[s0] -> (s0 * 4 + 16)>()[%107#4]
%134 = affine.apply affine_map<()[s0, s1] -> (s0 * 64 + s1 + 16)>()[%107#1, %107#5]
%135 = vector.transfer_read %alloc[%c0, %133, %134], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x128xf16, #gpu.address_space<workgroup>>, vector<1x4x1xf16>
%136 = vector.transpose %135, [0, 2, 1] : vector<1x4x1xf16> to vector<1x1x4xf16>
%137 = vector.insert_strided_slice %136, %132 {offsets = [0, 1, 1, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x4x4x1x1x1x1x1x4xf16>
%138 = affine.apply affine_map<()[s0] -> (s0 * 4 + 16)>()[%107#4]
%139 = affine.apply affine_map<()[s0, s1] -> (s0 * 64 + s1 + 32)>()[%107#1, %107#5]
%140 = vector.transfer_read %alloc[%c0, %138, %139], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x128xf16, #gpu.address_space<workgroup>>, vector<1x4x1xf16>
%141 = vector.transpose %140, [0, 2, 1] : vector<1x4x1xf16> to vector<1x1x4xf16>
%142 = vector.insert_strided_slice %141, %137 {offsets = [0, 1, 2, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x4x4x1x1x1x1x1x4xf16>
%143 = affine.apply affine_map<()[s0] -> (s0 * 4 + 16)>()[%107#4]
%144 = affine.apply affine_map<()[s0, s1] -> (s0 * 64 + s1 + 48)>()[%107#1, %107#5]
%145 = vector.transfer_read %alloc[%c0, %143, %144], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x128xf16, #gpu.address_space<workgroup>>, vector<1x4x1xf16>
%146 = vector.transpose %145, [0, 2, 1] : vector<1x4x1xf16> to vector<1x1x4xf16>
%147 = vector.insert_strided_slice %146, %142 {offsets = [0, 1, 3, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x4x4x1x1x1x1x1x4xf16>
%148 = affine.apply affine_map<()[s0] -> (s0 * 4 + 32)>()[%107#4]
%149 = affine.apply affine_map<()[s0, s1] -> (s0 * 64 + s1)>()[%107#1, %107#5]
%150 = vector.transfer_read %alloc[%c0, %148, %149], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x128xf16, #gpu.address_space<workgroup>>, vector<1x4x1xf16>
%151 = vector.transpose %150, [0, 2, 1] : vector<1x4x1xf16> to vector<1x1x4xf16>
%152 = vector.insert_strided_slice %151, %147 {offsets = [0, 2, 0, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x4x4x1x1x1x1x1x4xf16>
%153 = affine.apply affine_map<()[s0] -> (s0 * 4 + 32)>()[%107#4]
%154 = affine.apply affine_map<()[s0, s1] -> (s0 * 64 + s1 + 16)>()[%107#1, %107#5]
%155 = vector.transfer_read %alloc[%c0, %153, %154], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x128xf16, #gpu.address_space<workgroup>>, vector<1x4x1xf16>
%156 = vector.transpose %155, [0, 2, 1] : vector<1x4x1xf16> to vector<1x1x4xf16>
%157 = vector.insert_strided_slice %156, %152 {offsets = [0, 2, 1, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x4x4x1x1x1x1x1x4xf16>
%158 = affine.apply affine_map<()[s0] -> (s0 * 4 + 32)>()[%107#4]
%159 = affine.apply affine_map<()[s0, s1] -> (s0 * 64 + s1 + 32)>()[%107#1, %107#5]
%160 = vector.transfer_read %alloc[%c0, %158, %159], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x128xf16, #gpu.address_space<workgroup>>, vector<1x4x1xf16>
%161 = vector.transpose %160, [0, 2, 1] : vector<1x4x1xf16> to vector<1x1x4xf16>
%162 = vector.insert_strided_slice %161, %157 {offsets = [0, 2, 2, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x4x4x1x1x1x1x1x4xf16>
%163 = affine.apply affine_map<()[s0] -> (s0 * 4 + 32)>()[%107#4]
%164 = affine.apply affine_map<()[s0, s1] -> (s0 * 64 + s1 + 48)>()[%107#1, %107#5]
%165 = vector.transfer_read %alloc[%c0, %163, %164], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x128xf16, #gpu.address_space<workgroup>>, vector<1x4x1xf16>
%166 = vector.transpose %165, [0, 2, 1] : vector<1x4x1xf16> to vector<1x1x4xf16>
%167 = vector.insert_strided_slice %166, %162 {offsets = [0, 2, 3, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x4x4x1x1x1x1x1x4xf16>
%168 = affine.apply affine_map<()[s0] -> (s0 * 4 + 48)>()[%107#4]
%169 = affine.apply affine_map<()[s0, s1] -> (s0 * 64 + s1)>()[%107#1, %107#5]
%170 = vector.transfer_read %alloc[%c0, %168, %169], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x128xf16, #gpu.address_space<workgroup>>, vector<1x4x1xf16>
%171 = vector.transpose %170, [0, 2, 1] : vector<1x4x1xf16> to vector<1x1x4xf16>
%172 = vector.insert_strided_slice %171, %167 {offsets = [0, 3, 0, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x4x4x1x1x1x1x1x4xf16>
%173 = affine.apply affine_map<()[s0] -> (s0 * 4 + 48)>()[%107#4]
%174 = affine.apply affine_map<()[s0, s1] -> (s0 * 64 + s1 + 16)>()[%107#1, %107#5]
%175 = vector.transfer_read %alloc[%c0, %173, %174], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x128xf16, #gpu.address_space<workgroup>>, vector<1x4x1xf16>
%176 = vector.transpose %175, [0, 2, 1] : vector<1x4x1xf16> to vector<1x1x4xf16>
%177 = vector.insert_strided_slice %176, %172 {offsets = [0, 3, 1, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x4x4x1x1x1x1x1x4xf16>
%178 = affine.apply affine_map<()[s0] -> (s0 * 4 + 48)>()[%107#4]
%179 = affine.apply affine_map<()[s0, s1] -> (s0 * 64 + s1 + 32)>()[%107#1, %107#5]
%180 = vector.transfer_read %alloc[%c0, %178, %179], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x128xf16, #gpu.address_space<workgroup>>, vector<1x4x1xf16>
%181 = vector.transpose %180, [0, 2, 1] : vector<1x4x1xf16> to vector<1x1x4xf16>
%182 = vector.insert_strided_slice %181, %177 {offsets = [0, 3, 2, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x4x4x1x1x1x1x1x4xf16>
%183 = affine.apply affine_map<()[s0] -> (s0 * 4 + 48)>()[%107#4]
%184 = affine.apply affine_map<()[s0, s1] -> (s0 * 64 + s1 + 48)>()[%107#1, %107#5]
%185 = vector.transfer_read %alloc[%c0, %183, %184], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x128xf16, #gpu.address_space<workgroup>>, vector<1x4x1xf16>
%186 = vector.transpose %185, [0, 2, 1] : vector<1x4x1xf16> to vector<1x1x4xf16>
%187 = vector.insert_strided_slice %186, %182 {offsets = [0, 3, 3, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x4x4x1x1x1x1x1x4xf16>
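// Multiply-accumulate: the carried f16 accumulator %arg1 is extended to f32
// (%188), and for each of the 2 x 4 (M x N) accumulator fragments a chain of
// four amdgpu.mfma 16x16x16 f16->f32 ops reduces over the 4 K-batches,
// seeded with the corresponding f32 fragment.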
%188 = arith.extf %arg1 : vector<1x2x4x1x1x1x1x1x4xf16> to vector<1x2x4x1x1x1x1x1x4xf32>
%189 = vector.extract %188[0, 0, 0] : vector<1x1x1x1x1x4xf32> from vector<1x2x4x1x1x1x1x1x4xf32>
%190 = vector.extract %106[0, 0, 0] : vector<1x1x1x1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%191 = vector.extract %187[0, 0, 0] : vector<1x1x1x1x1x4xf16> from vector<1x4x4x1x1x1x1x1x4xf16>
%192 = vector.shape_cast %190 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%193 = vector.shape_cast %191 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%194 = vector.shape_cast %189 : vector<1x1x1x1x1x4xf32> to vector<4xf32>
%195 = amdgpu.mfma %192 * %193 + %194 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%196 = vector.extract %106[0, 0, 1] : vector<1x1x1x1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%197 = vector.extract %187[0, 1, 0] : vector<1x1x1x1x1x4xf16> from vector<1x4x4x1x1x1x1x1x4xf16>
%198 = vector.shape_cast %196 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%199 = vector.shape_cast %197 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%200 = amdgpu.mfma %198 * %199 + %195 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%201 = vector.extract %106[0, 0, 2] : vector<1x1x1x1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%202 = vector.extract %187[0, 2, 0] : vector<1x1x1x1x1x4xf16> from vector<1x4x4x1x1x1x1x1x4xf16>
%203 = vector.shape_cast %201 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%204 = vector.shape_cast %202 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%205 = amdgpu.mfma %203 * %204 + %200 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%206 = vector.extract %106[0, 0, 3] : vector<1x1x1x1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%207 = vector.extract %187[0, 3, 0] : vector<1x1x1x1x1x4xf16> from vector<1x4x4x1x1x1x1x1x4xf16>
%208 = vector.shape_cast %206 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%209 = vector.shape_cast %207 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%210 = amdgpu.mfma %208 * %209 + %205 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%211 = vector.shape_cast %210 : vector<4xf32> to vector<1x1x1x1x1x4xf32>
%212 = vector.insert %211, %cst [0, 0, 0] : vector<1x1x1x1x1x4xf32> into vector<1x2x4x1x1x1x1x1x4xf32>
%213 = vector.extract %188[0, 0, 1] : vector<1x1x1x1x1x4xf32> from vector<1x2x4x1x1x1x1x1x4xf32>
%214 = vector.extract %106[0, 0, 0] : vector<1x1x1x1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%215 = vector.extract %187[0, 0, 1] : vector<1x1x1x1x1x4xf16> from vector<1x4x4x1x1x1x1x1x4xf16>
%216 = vector.shape_cast %214 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%217 = vector.shape_cast %215 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%218 = vector.shape_cast %213 : vector<1x1x1x1x1x4xf32> to vector<4xf32>
%219 = amdgpu.mfma %216 * %217 + %218 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%220 = vector.extract %106[0, 0, 1] : vector<1x1x1x1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%221 = vector.extract %187[0, 1, 1] : vector<1x1x1x1x1x4xf16> from vector<1x4x4x1x1x1x1x1x4xf16>
%222 = vector.shape_cast %220 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%223 = vector.shape_cast %221 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%224 = amdgpu.mfma %222 * %223 + %219 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%225 = vector.extract %106[0, 0, 2] : vector<1x1x1x1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%226 = vector.extract %187[0, 2, 1] : vector<1x1x1x1x1x4xf16> from vector<1x4x4x1x1x1x1x1x4xf16>
%227 = vector.shape_cast %225 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%228 = vector.shape_cast %226 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%229 = amdgpu.mfma %227 * %228 + %224 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%230 = vector.extract %106[0, 0, 3] : vector<1x1x1x1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%231 = vector.extract %187[0, 3, 1] : vector<1x1x1x1x1x4xf16> from vector<1x4x4x1x1x1x1x1x4xf16>
%232 = vector.shape_cast %230 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%233 = vector.shape_cast %231 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%234 = amdgpu.mfma %232 * %233 + %229 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%235 = vector.shape_cast %234 : vector<4xf32> to vector<1x1x1x1x1x4xf32>
%236 = vector.insert %235, %212 [0, 0, 1] : vector<1x1x1x1x1x4xf32> into vector<1x2x4x1x1x1x1x1x4xf32>
%237 = vector.extract %188[0, 0, 2] : vector<1x1x1x1x1x4xf32> from vector<1x2x4x1x1x1x1x1x4xf32>
%238 = vector.extract %106[0, 0, 0] : vector<1x1x1x1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%239 = vector.extract %187[0, 0, 2] : vector<1x1x1x1x1x4xf16> from vector<1x4x4x1x1x1x1x1x4xf16>
%240 = vector.shape_cast %238 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%241 = vector.shape_cast %239 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%242 = vector.shape_cast %237 : vector<1x1x1x1x1x4xf32> to vector<4xf32>
%243 = amdgpu.mfma %240 * %241 + %242 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%244 = vector.extract %106[0, 0, 1] : vector<1x1x1x1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%245 = vector.extract %187[0, 1, 2] : vector<1x1x1x1x1x4xf16> from vector<1x4x4x1x1x1x1x1x4xf16>
%246 = vector.shape_cast %244 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%247 = vector.shape_cast %245 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%248 = amdgpu.mfma %246 * %247 + %243 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%249 = vector.extract %106[0, 0, 2] : vector<1x1x1x1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%250 = vector.extract %187[0, 2, 2] : vector<1x1x1x1x1x4xf16> from vector<1x4x4x1x1x1x1x1x4xf16>
%251 = vector.shape_cast %249 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%252 = vector.shape_cast %250 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%253 = amdgpu.mfma %251 * %252 + %248 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%254 = vector.extract %106[0, 0, 3] : vector<1x1x1x1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%255 = vector.extract %187[0, 3, 2] : vector<1x1x1x1x1x4xf16> from vector<1x4x4x1x1x1x1x1x4xf16>
%256 = vector.shape_cast %254 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%257 = vector.shape_cast %255 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%258 = amdgpu.mfma %256 * %257 + %253 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%259 = vector.shape_cast %258 : vector<4xf32> to vector<1x1x1x1x1x4xf32>
%260 = vector.insert %259, %236 [0, 0, 2] : vector<1x1x1x1x1x4xf32> into vector<1x2x4x1x1x1x1x1x4xf32>
%261 = vector.extract %188[0, 0, 3] : vector<1x1x1x1x1x4xf32> from vector<1x2x4x1x1x1x1x1x4xf32>
%262 = vector.extract %106[0, 0, 0] : vector<1x1x1x1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%263 = vector.extract %187[0, 0, 3] : vector<1x1x1x1x1x4xf16> from vector<1x4x4x1x1x1x1x1x4xf16>
%264 = vector.shape_cast %262 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%265 = vector.shape_cast %263 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%266 = vector.shape_cast %261 : vector<1x1x1x1x1x4xf32> to vector<4xf32>
%267 = amdgpu.mfma %264 * %265 + %266 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%268 = vector.extract %106[0, 0, 1] : vector<1x1x1x1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%269 = vector.extract %187[0, 1, 3] : vector<1x1x1x1x1x4xf16> from vector<1x4x4x1x1x1x1x1x4xf16>
%270 = vector.shape_cast %268 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%271 = vector.shape_cast %269 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%272 = amdgpu.mfma %270 * %271 + %267 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%273 = vector.extract %106[0, 0, 2] : vector<1x1x1x1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%274 = vector.extract %187[0, 2, 3] : vector<1x1x1x1x1x4xf16> from vector<1x4x4x1x1x1x1x1x4xf16>
%275 = vector.shape_cast %273 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%276 = vector.shape_cast %274 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%277 = amdgpu.mfma %275 * %276 + %272 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%278 = vector.extract %106[0, 0, 3] : vector<1x1x1x1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%279 = vector.extract %187[0, 3, 3] : vector<1x1x1x1x1x4xf16> from vector<1x4x4x1x1x1x1x1x4xf16>
%280 = vector.shape_cast %278 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%281 = vector.shape_cast %279 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%282 = amdgpu.mfma %280 * %281 + %277 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%283 = vector.shape_cast %282 : vector<4xf32> to vector<1x1x1x1x1x4xf32>
%284 = vector.insert %283, %260 [0, 0, 3] : vector<1x1x1x1x1x4xf32> into vector<1x2x4x1x1x1x1x1x4xf32>
%285 = vector.extract %188[0, 1, 0] : vector<1x1x1x1x1x4xf32> from vector<1x2x4x1x1x1x1x1x4xf32>
%286 = vector.extract %106[0, 1, 0] : vector<1x1x1x1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%287 = vector.extract %187[0, 0, 0] : vector<1x1x1x1x1x4xf16> from vector<1x4x4x1x1x1x1x1x4xf16>
%288 = vector.shape_cast %286 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%289 = vector.shape_cast %287 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%290 = vector.shape_cast %285 : vector<1x1x1x1x1x4xf32> to vector<4xf32>
%291 = amdgpu.mfma %288 * %289 + %290 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%292 = vector.extract %106[0, 1, 1] : vector<1x1x1x1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%293 = vector.extract %187[0, 1, 0] : vector<1x1x1x1x1x4xf16> from vector<1x4x4x1x1x1x1x1x4xf16>
%294 = vector.shape_cast %292 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%295 = vector.shape_cast %293 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%296 = amdgpu.mfma %294 * %295 + %291 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%297 = vector.extract %106[0, 1, 2] : vector<1x1x1x1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%298 = vector.extract %187[0, 2, 0] : vector<1x1x1x1x1x4xf16> from vector<1x4x4x1x1x1x1x1x4xf16>
%299 = vector.shape_cast %297 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%300 = vector.shape_cast %298 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%301 = amdgpu.mfma %299 * %300 + %296 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%302 = vector.extract %106[0, 1, 3] : vector<1x1x1x1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%303 = vector.extract %187[0, 3, 0] : vector<1x1x1x1x1x4xf16> from vector<1x4x4x1x1x1x1x1x4xf16>
%304 = vector.shape_cast %302 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%305 = vector.shape_cast %303 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%306 = amdgpu.mfma %304 * %305 + %301 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%307 = vector.shape_cast %306 : vector<4xf32> to vector<1x1x1x1x1x4xf32>
%308 = vector.insert %307, %284 [0, 1, 0] : vector<1x1x1x1x1x4xf32> into vector<1x2x4x1x1x1x1x1x4xf32>
%309 = vector.extract %188[0, 1, 1] : vector<1x1x1x1x1x4xf32> from vector<1x2x4x1x1x1x1x1x4xf32>
%310 = vector.extract %106[0, 1, 0] : vector<1x1x1x1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%311 = vector.extract %187[0, 0, 1] : vector<1x1x1x1x1x4xf16> from vector<1x4x4x1x1x1x1x1x4xf16>
%312 = vector.shape_cast %310 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%313 = vector.shape_cast %311 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%314 = vector.shape_cast %309 : vector<1x1x1x1x1x4xf32> to vector<4xf32>
%315 = amdgpu.mfma %312 * %313 + %314 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%316 = vector.extract %106[0, 1, 1] : vector<1x1x1x1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%317 = vector.extract %187[0, 1, 1] : vector<1x1x1x1x1x4xf16> from vector<1x4x4x1x1x1x1x1x4xf16>
%318 = vector.shape_cast %316 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%319 = vector.shape_cast %317 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%320 = amdgpu.mfma %318 * %319 + %315 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%321 = vector.extract %106[0, 1, 2] : vector<1x1x1x1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%322 = vector.extract %187[0, 2, 1] : vector<1x1x1x1x1x4xf16> from vector<1x4x4x1x1x1x1x1x4xf16>
%323 = vector.shape_cast %321 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%324 = vector.shape_cast %322 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%325 = amdgpu.mfma %323 * %324 + %320 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%326 = vector.extract %106[0, 1, 3] : vector<1x1x1x1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%327 = vector.extract %187[0, 3, 1] : vector<1x1x1x1x1x4xf16> from vector<1x4x4x1x1x1x1x1x4xf16>
%328 = vector.shape_cast %326 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%329 = vector.shape_cast %327 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%330 = amdgpu.mfma %328 * %329 + %325 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%331 = vector.shape_cast %330 : vector<4xf32> to vector<1x1x1x1x1x4xf32>
%332 = vector.insert %331, %308 [0, 1, 1] : vector<1x1x1x1x1x4xf32> into vector<1x2x4x1x1x1x1x1x4xf32>
%333 = vector.extract %188[0, 1, 2] : vector<1x1x1x1x1x4xf32> from vector<1x2x4x1x1x1x1x1x4xf32>
%334 = vector.extract %106[0, 1, 0] : vector<1x1x1x1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%335 = vector.extract %187[0, 0, 2] : vector<1x1x1x1x1x4xf16> from vector<1x4x4x1x1x1x1x1x4xf16>
%336 = vector.shape_cast %334 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%337 = vector.shape_cast %335 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%338 = vector.shape_cast %333 : vector<1x1x1x1x1x4xf32> to vector<4xf32>
%339 = amdgpu.mfma %336 * %337 + %338 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%340 = vector.extract %106[0, 1, 1] : vector<1x1x1x1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%341 = vector.extract %187[0, 1, 2] : vector<1x1x1x1x1x4xf16> from vector<1x4x4x1x1x1x1x1x4xf16>
%342 = vector.shape_cast %340 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%343 = vector.shape_cast %341 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%344 = amdgpu.mfma %342 * %343 + %339 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%345 = vector.extract %106[0, 1, 2] : vector<1x1x1x1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%346 = vector.extract %187[0, 2, 2] : vector<1x1x1x1x1x4xf16> from vector<1x4x4x1x1x1x1x1x4xf16>
%347 = vector.shape_cast %345 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%348 = vector.shape_cast %346 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%349 = amdgpu.mfma %347 * %348 + %344 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%350 = vector.extract %106[0, 1, 3] : vector<1x1x1x1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%351 = vector.extract %187[0, 3, 2] : vector<1x1x1x1x1x4xf16> from vector<1x4x4x1x1x1x1x1x4xf16>
%352 = vector.shape_cast %350 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%353 = vector.shape_cast %351 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%354 = amdgpu.mfma %352 * %353 + %349 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%355 = vector.shape_cast %354 : vector<4xf32> to vector<1x1x1x1x1x4xf32>
%356 = vector.insert %355, %332 [0, 1, 2] : vector<1x1x1x1x1x4xf32> into vector<1x2x4x1x1x1x1x1x4xf32>
%357 = vector.extract %188[0, 1, 3] : vector<1x1x1x1x1x4xf32> from vector<1x2x4x1x1x1x1x1x4xf32>
%358 = vector.extract %106[0, 1, 0] : vector<1x1x1x1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%359 = vector.extract %187[0, 0, 3] : vector<1x1x1x1x1x4xf16> from vector<1x4x4x1x1x1x1x1x4xf16>
%360 = vector.shape_cast %358 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%361 = vector.shape_cast %359 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%362 = vector.shape_cast %357 : vector<1x1x1x1x1x4xf32> to vector<4xf32>
%363 = amdgpu.mfma %360 * %361 + %362 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%364 = vector.extract %106[0, 1, 1] : vector<1x1x1x1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%365 = vector.extract %187[0, 1, 3] : vector<1x1x1x1x1x4xf16> from vector<1x4x4x1x1x1x1x1x4xf16>
%366 = vector.shape_cast %364 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%367 = vector.shape_cast %365 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%368 = amdgpu.mfma %366 * %367 + %363 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%369 = vector.extract %106[0, 1, 2] : vector<1x1x1x1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%370 = vector.extract %187[0, 2, 3] : vector<1x1x1x1x1x4xf16> from vector<1x4x4x1x1x1x1x1x4xf16>
%371 = vector.shape_cast %369 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%372 = vector.shape_cast %370 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%373 = amdgpu.mfma %371 * %372 + %368 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%374 = vector.extract %106[0, 1, 3] : vector<1x1x1x1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%375 = vector.extract %187[0, 3, 3] : vector<1x1x1x1x1x4xf16> from vector<1x4x4x1x1x1x1x1x4xf16>
%376 = vector.shape_cast %374 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%377 = vector.shape_cast %375 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%378 = amdgpu.mfma %376 * %377 + %373 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%379 = vector.shape_cast %378 : vector<4xf32> to vector<1x1x1x1x1x4xf32>
%380 = vector.insert %379, %356 [0, 1, 3] : vector<1x1x1x1x1x4xf32> into vector<1x2x4x1x1x1x1x1x4xf32>
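// The f32 results are truncated back to f16 and yielded: the loop carries
// the accumulator in f16 and re-extends it to f32 at the top of each step.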
%381 = arith.truncf %380 : vector<1x2x4x1x1x1x1x1x4xf32> to vector<1x2x4x1x1x1x1x1x4xf16>
scf.yield %381 : vector<1x2x4x1x1x1x1x1x4xf16>
} {__vector_layout_fetcher_storage = [[unit, unit, unit, #iree_vector_ext.nested_layout<subgroups_per_workgroup = [1, 2, 2], batches_per_subgroup = [1, 2, 4], outers_per_batch = [1, 1, 1], threads_per_outer = [1, 4, 16], elements_per_thread = [1, 4, 1], element_order = [0, 2, 1], subgroup_basis = [1, 2, 2, 1], subgroup_active_ids = [true, true, true, false], thread_basis = [1, 4, 16]>], [#iree_vector_ext.nested_layout<subgroups_per_workgroup = [1, 2, 2], batches_per_subgroup = [1, 2, 4], outers_per_batch = [1, 1, 1], threads_per_outer = [1, 4, 16], elements_per_thread = [1, 4, 1], element_order = [0, 2, 1], subgroup_basis = [1, 2, 2, 1], subgroup_active_ids = [true, true, true, false], thread_basis = [1, 4, 16]>]]}
%subview_5 = memref.subview %3[%workgroup_id_z, %4, %5] [1, %6, 128] [1, 1, 1] : memref<64x968x1280xf16, #hal.descriptor_type<storage_buffer>> to memref<1x?x128xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
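// Write-back of the loop result %7: each of the 2 x 4 accumulator fragments
// (vector<1x1x4xf16>) is transposed to 1x4x1 and stored to the output
// subview at row %8#0 * 32 + %8#4 * 4 (+16) and column %8#1 * 64 + %8#5 +
// {0, 16, 32, 48}. The row dimension is masked (in_bounds = [true, false,
// true]) because the 64-row M tile may be clipped at the 968-row edge.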
%8:6 = affine.delinearize_index %0 into (%c2, %c2, %c1, %c1, %c4, %c16) : index, index, index, index, index, index
%9 = affine.apply affine_map<()[s0, s1] -> (s0 * 32 + s1 * 4)>()[%8#0, %8#4]
%10 = affine.apply affine_map<()[s0, s1] -> (s0 * 64 + s1)>()[%8#1, %8#5]
%11 = vector.extract %7[0, 0, 0, 0, 0, 0] : vector<1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%12 = vector.transpose %11, [0, 2, 1] : vector<1x1x4xf16> to vector<1x4x1xf16>
vector.transfer_write %12, %subview_5[%c0, %9, %10] {in_bounds = [true, false, true]} : vector<1x4x1xf16>, memref<1x?x128xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%13 = affine.apply affine_map<()[s0, s1] -> (s0 * 32 + s1 * 4)>()[%8#0, %8#4]
%14 = affine.apply affine_map<()[s0, s1] -> (s0 * 64 + s1 + 16)>()[%8#1, %8#5]
%15 = vector.extract %7[0, 0, 1, 0, 0, 0] : vector<1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%16 = vector.transpose %15, [0, 2, 1] : vector<1x1x4xf16> to vector<1x4x1xf16>
vector.transfer_write %16, %subview_5[%c0, %13, %14] {in_bounds = [true, false, true]} : vector<1x4x1xf16>, memref<1x?x128xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%17 = affine.apply affine_map<()[s0, s1] -> (s0 * 32 + s1 * 4)>()[%8#0, %8#4]
%18 = affine.apply affine_map<()[s0, s1] -> (s0 * 64 + s1 + 32)>()[%8#1, %8#5]
%19 = vector.extract %7[0, 0, 2, 0, 0, 0] : vector<1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%20 = vector.transpose %19, [0, 2, 1] : vector<1x1x4xf16> to vector<1x4x1xf16>
vector.transfer_write %20, %subview_5[%c0, %17, %18] {in_bounds = [true, false, true]} : vector<1x4x1xf16>, memref<1x?x128xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%21 = affine.apply affine_map<()[s0, s1] -> (s0 * 32 + s1 * 4)>()[%8#0, %8#4]
%22 = affine.apply affine_map<()[s0, s1] -> (s0 * 64 + s1 + 48)>()[%8#1, %8#5]
%23 = vector.extract %7[0, 0, 3, 0, 0, 0] : vector<1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%24 = vector.transpose %23, [0, 2, 1] : vector<1x1x4xf16> to vector<1x4x1xf16>
vector.transfer_write %24, %subview_5[%c0, %21, %22] {in_bounds = [true, false, true]} : vector<1x4x1xf16>, memref<1x?x128xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%25 = affine.apply affine_map<()[s0, s1] -> (s0 * 32 + s1 * 4 + 16)>()[%8#0, %8#4]
%26 = affine.apply affine_map<()[s0, s1] -> (s0 * 64 + s1)>()[%8#1, %8#5]
%27 = vector.extract %7[0, 1, 0, 0, 0, 0] : vector<1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%28 = vector.transpose %27, [0, 2, 1] : vector<1x1x4xf16> to vector<1x4x1xf16>
vector.transfer_write %28, %subview_5[%c0, %25, %26] {in_bounds = [true, false, true]} : vector<1x4x1xf16>, memref<1x?x128xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%29 = affine.apply affine_map<()[s0, s1] -> (s0 * 32 + s1 * 4 + 16)>()[%8#0, %8#4]
%30 = affine.apply affine_map<()[s0, s1] -> (s0 * 64 + s1 + 16)>()[%8#1, %8#5]
%31 = vector.extract %7[0, 1, 1, 0, 0, 0] : vector<1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%32 = vector.transpose %31, [0, 2, 1] : vector<1x1x4xf16> to vector<1x4x1xf16>
vector.transfer_write %32, %subview_5[%c0, %29, %30] {in_bounds = [true, false, true]} : vector<1x4x1xf16>, memref<1x?x128xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%33 = affine.apply affine_map<()[s0, s1] -> (s0 * 32 + s1 * 4 + 16)>()[%8#0, %8#4]
%34 = affine.apply affine_map<()[s0, s1] -> (s0 * 64 + s1 + 32)>()[%8#1, %8#5]
%35 = vector.extract %7[0, 1, 2, 0, 0, 0] : vector<1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%36 = vector.transpose %35, [0, 2, 1] : vector<1x1x4xf16> to vector<1x4x1xf16>
vector.transfer_write %36, %subview_5[%c0, %33, %34] {in_bounds = [true, false, true]} : vector<1x4x1xf16>, memref<1x?x128xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%37 = affine.apply affine_map<()[s0, s1] -> (s0 * 32 + s1 * 4 + 16)>()[%8#0, %8#4]
%38 = affine.apply affine_map<()[s0, s1] -> (s0 * 64 + s1 + 48)>()[%8#1, %8#5]
%39 = vector.extract %7[0, 1, 3, 0, 0, 0] : vector<1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%40 = vector.transpose %39, [0, 2, 1] : vector<1x1x4xf16> to vector<1x4x1xf16>
vector.transfer_write %40, %subview_5[%c0, %37, %38] {in_bounds = [true, false, true]} : vector<1x4x1xf16>, memref<1x?x128xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
memref.dealloc %alloc_3 : memref<1x64x64xf16, #gpu.address_space<workgroup>>
memref.dealloc %alloc : memref<1x64x128xf16, #gpu.address_space<workgroup>>
return
}
}
// -----// IR Dump After CSE (cse) //----- //
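// Same kernel after CSE: the duplicated affine.delinearize_index /
// affine.apply index chains and the repeated vector.extract /
// vector.shape_cast operands feeding the mfma chains have been folded into
// single SSA values.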
module {
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16() {
%cst = arith.constant dense<0.000000e+00> : vector<1x2x4x1x1x1x1x1x4xf32>
%cst_0 = arith.constant dense<0.000000e+00> : vector<1x4x4x1x1x1x1x1x4xf16>
%c4 = arith.constant 4 : index
%c2 = arith.constant 2 : index
%c16 = arith.constant 16 : index
%c8 = arith.constant 8 : index
%c32 = arith.constant 32 : index
%cst_1 = arith.constant dense<0.000000e+00> : vector<1x2x4x1x1x1x1x1x4xf16>
%c20 = arith.constant 20 : index
%c1 = arith.constant 1 : index
%cst_2 = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%thread_id_x = gpu.thread_id x
%thread_id_y = gpu.thread_id y
%thread_id_z = gpu.thread_id z
%0 = affine.apply affine_map<()[s0, s1, s2] -> (s0 + s1 * 128 + s2 * 256)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%alloc = memref.alloc() : memref<1x64x128xf16, #gpu.address_space<workgroup>>
%alloc_3 = memref.alloc() : memref<1x64x64xf16, #gpu.address_space<workgroup>>
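// %0 is the linearized thread id over the 256 workgroup threads (apparently
// 4 waves of 64 lanes). %alloc stages the 64x128 RHS (B) K-slice and
// %alloc_3 the 64x64 LHS (A) K-slice in workgroup memory.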
%1 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<64x968x1280xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 64 : memref<64x968x1280xf16, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<64x1280x1280xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 64 : memref<64x1280x1280xf16, #hal.descriptor_type<storage_buffer>>
%3 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<64x968x1280xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %3, 64 : memref<64x968x1280xf16, #hal.descriptor_type<storage_buffer>>
%workgroup_id_z = hal.interface.workgroup.id[2] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y]
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%5 = affine.apply affine_map<()[s0] -> (s0 * 128)>()[%workgroup_id_x]
%6 = affine.min affine_map<()[s0] -> (s0 * -64 + 968, 64)>()[%workgroup_id_y]
%subview = memref.subview %1[%workgroup_id_z, %4, 0] [1, %6, 1280] [1, 1, 1] : memref<64x968x1280xf16, #hal.descriptor_type<storage_buffer>> to memref<1x?x1280xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_4 = memref.subview %2[%workgroup_id_z, 0, %5] [1, 1280, 128] [1, 1, 1] : memref<64x1280x1280xf16, #hal.descriptor_type<storage_buffer>> to memref<1x1280x128xf16, strided<[1638400, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
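// Main reduction loop: the K dimension (1280) is tiled by 64 and walked in
// 20 iterations, with the f16 accumulator tile carried as %arg1.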
%7 = scf.for %arg0 = %c0 to %c20 step %c1 iter_args(%arg1 = %cst_1) -> (vector<1x2x4x1x1x1x1x1x4xf16>) {
%31 = affine.delinearize_index %arg0 into (%c20) : index
%32 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%31]
%subview_6 = memref.subview %subview[0, 0, %32] [1, %6, 64] [1, 1, 1] : memref<1x?x1280xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x?x64xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
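// Global -> LDS staging for this K step: each thread reads vector<1x1x8xf16>
// chunks of the 64x64 LHS slice (two rows per thread over a (32, 8) basis)
// and the 64x128 RHS slice (four rows per thread over a (16, 16) basis),
// then writes them to %alloc_3 and %alloc between gpu.barriers. LHS reads
// and nothing else are row-masked (in_bounds = [true, false, true]) for the
// partial tile at the 968-row edge.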
%33:2 = affine.delinearize_index %0 into (%c32, %c8) : index, index
%34 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%33#1]
%35 = vector.transfer_read %subview_6[%c0, %33#0, %34], %cst_2 {in_bounds = [true, false, true]} : memref<1x?x64xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<1x1x8xf16>
%36 = affine.apply affine_map<()[s0] -> (s0 + 32)>()[%33#0]
%37 = vector.transfer_read %subview_6[%c0, %36, %34], %cst_2 {in_bounds = [true, false, true]} : memref<1x?x64xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<1x1x8xf16>
%38:2 = affine.delinearize_index %0 into (%c16, %c16) : index, index
%39 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 64)>()[%38#0, %31]
%40 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%38#1]
%41 = vector.transfer_read %subview_4[%c0, %39, %40], %cst_2 {in_bounds = [true, true, true]} : memref<1x1280x128xf16, strided<[1638400, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<1x1x8xf16>
%42 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 64 + 16)>()[%38#0, %31]
%43 = vector.transfer_read %subview_4[%c0, %42, %40], %cst_2 {in_bounds = [true, true, true]} : memref<1x1280x128xf16, strided<[1638400, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<1x1x8xf16>
%44 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 64 + 32)>()[%38#0, %31]
%45 = vector.transfer_read %subview_4[%c0, %44, %40], %cst_2 {in_bounds = [true, true, true]} : memref<1x1280x128xf16, strided<[1638400, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<1x1x8xf16>
%46 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 64 + 48)>()[%38#0, %31]
%47 = vector.transfer_read %subview_4[%c0, %46, %40], %cst_2 {in_bounds = [true, true, true]} : memref<1x1280x128xf16, strided<[1638400, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<1x1x8xf16>
gpu.barrier
vector.transfer_write %35, %alloc_3[%c0, %33#0, %34] {in_bounds = [true, true, true]} : vector<1x1x8xf16>, memref<1x64x64xf16, #gpu.address_space<workgroup>>
vector.transfer_write %37, %alloc_3[%c0, %36, %34] {in_bounds = [true, true, true]} : vector<1x1x8xf16>, memref<1x64x64xf16, #gpu.address_space<workgroup>>
gpu.barrier
vector.transfer_write %41, %alloc[%c0, %38#0, %40] {in_bounds = [true, true, true]} : vector<1x1x8xf16>, memref<1x64x128xf16, #gpu.address_space<workgroup>>
%48 = affine.apply affine_map<()[s0] -> (s0 + 16)>()[%38#0]
vector.transfer_write %43, %alloc[%c0, %48, %40] {in_bounds = [true, true, true]} : vector<1x1x8xf16>, memref<1x64x128xf16, #gpu.address_space<workgroup>>
%49 = affine.apply affine_map<()[s0] -> (s0 + 32)>()[%38#0]
vector.transfer_write %45, %alloc[%c0, %49, %40] {in_bounds = [true, true, true]} : vector<1x1x8xf16>, memref<1x64x128xf16, #gpu.address_space<workgroup>>
%50 = affine.apply affine_map<()[s0] -> (s0 + 48)>()[%38#0]
vector.transfer_write %47, %alloc[%c0, %50, %40] {in_bounds = [true, true, true]} : vector<1x1x8xf16>, memref<1x64x128xf16, #gpu.address_space<workgroup>>
gpu.barrier
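// Register loads as in the previous dump; after CSE the row/column offsets
// (%52-%62, %65, %74-%86) are computed once and shared between the LHS and
// RHS fragment reads.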
%51:6 = affine.delinearize_index %0 into (%c2, %c2, %c1, %c1, %c4, %c16) : index, index, index, index, index, index
%52 = affine.apply affine_map<()[s0, s1] -> (s0 * 32 + s1)>()[%51#0, %51#5]
%53 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%51#4]
%54 = vector.transfer_read %alloc_3[%c0, %52, %53], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x64xf16, #gpu.address_space<workgroup>>, vector<1x1x4xf16>
%55 = vector.insert_strided_slice %54, %cst_1 {offsets = [0, 0, 0, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x2x4x1x1x1x1x1x4xf16>
%56 = affine.apply affine_map<()[s0] -> (s0 * 4 + 16)>()[%51#4]
%57 = vector.transfer_read %alloc_3[%c0, %52, %56], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x64xf16, #gpu.address_space<workgroup>>, vector<1x1x4xf16>
%58 = vector.insert_strided_slice %57, %55 {offsets = [0, 0, 1, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x2x4x1x1x1x1x1x4xf16>
%59 = affine.apply affine_map<()[s0] -> (s0 * 4 + 32)>()[%51#4]
%60 = vector.transfer_read %alloc_3[%c0, %52, %59], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x64xf16, #gpu.address_space<workgroup>>, vector<1x1x4xf16>
%61 = vector.insert_strided_slice %60, %58 {offsets = [0, 0, 2, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x2x4x1x1x1x1x1x4xf16>
%62 = affine.apply affine_map<()[s0] -> (s0 * 4 + 48)>()[%51#4]
%63 = vector.transfer_read %alloc_3[%c0, %52, %62], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x64xf16, #gpu.address_space<workgroup>>, vector<1x1x4xf16>
%64 = vector.insert_strided_slice %63, %61 {offsets = [0, 0, 3, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x2x4x1x1x1x1x1x4xf16>
%65 = affine.apply affine_map<()[s0, s1] -> (s0 * 32 + s1 + 16)>()[%51#0, %51#5]
%66 = vector.transfer_read %alloc_3[%c0, %65, %53], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x64xf16, #gpu.address_space<workgroup>>, vector<1x1x4xf16>
%67 = vector.insert_strided_slice %66, %64 {offsets = [0, 1, 0, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x2x4x1x1x1x1x1x4xf16>
%68 = vector.transfer_read %alloc_3[%c0, %65, %56], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x64xf16, #gpu.address_space<workgroup>>, vector<1x1x4xf16>
%69 = vector.insert_strided_slice %68, %67 {offsets = [0, 1, 1, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x2x4x1x1x1x1x1x4xf16>
%70 = vector.transfer_read %alloc_3[%c0, %65, %59], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x64xf16, #gpu.address_space<workgroup>>, vector<1x1x4xf16>
%71 = vector.insert_strided_slice %70, %69 {offsets = [0, 1, 2, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x2x4x1x1x1x1x1x4xf16>
%72 = vector.transfer_read %alloc_3[%c0, %65, %62], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x64xf16, #gpu.address_space<workgroup>>, vector<1x1x4xf16>
%73 = vector.insert_strided_slice %72, %71 {offsets = [0, 1, 3, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x2x4x1x1x1x1x1x4xf16>
%74 = affine.apply affine_map<()[s0, s1] -> (s0 * 64 + s1)>()[%51#1, %51#5]
%75 = vector.transfer_read %alloc[%c0, %53, %74], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x128xf16, #gpu.address_space<workgroup>>, vector<1x4x1xf16>
%76 = vector.transpose %75, [0, 2, 1] : vector<1x4x1xf16> to vector<1x1x4xf16>
%77 = vector.insert_strided_slice %76, %cst_0 {offsets = [0, 0, 0, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x4x4x1x1x1x1x1x4xf16>
%78 = affine.apply affine_map<()[s0, s1] -> (s0 * 64 + s1 + 16)>()[%51#1, %51#5]
%79 = vector.transfer_read %alloc[%c0, %53, %78], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x128xf16, #gpu.address_space<workgroup>>, vector<1x4x1xf16>
%80 = vector.transpose %79, [0, 2, 1] : vector<1x4x1xf16> to vector<1x1x4xf16>
%81 = vector.insert_strided_slice %80, %77 {offsets = [0, 0, 1, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x4x4x1x1x1x1x1x4xf16>
%82 = affine.apply affine_map<()[s0, s1] -> (s0 * 64 + s1 + 32)>()[%51#1, %51#5]
%83 = vector.transfer_read %alloc[%c0, %53, %82], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x128xf16, #gpu.address_space<workgroup>>, vector<1x4x1xf16>
%84 = vector.transpose %83, [0, 2, 1] : vector<1x4x1xf16> to vector<1x1x4xf16>
%85 = vector.insert_strided_slice %84, %81 {offsets = [0, 0, 2, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x4x4x1x1x1x1x1x4xf16>
%86 = affine.apply affine_map<()[s0, s1] -> (s0 * 64 + s1 + 48)>()[%51#1, %51#5]
%87 = vector.transfer_read %alloc[%c0, %53, %86], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x128xf16, #gpu.address_space<workgroup>>, vector<1x4x1xf16>
%88 = vector.transpose %87, [0, 2, 1] : vector<1x4x1xf16> to vector<1x1x4xf16>
%89 = vector.insert_strided_slice %88, %85 {offsets = [0, 0, 3, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x4x4x1x1x1x1x1x4xf16>
%90 = vector.transfer_read %alloc[%c0, %56, %74], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x128xf16, #gpu.address_space<workgroup>>, vector<1x4x1xf16>
%91 = vector.transpose %90, [0, 2, 1] : vector<1x4x1xf16> to vector<1x1x4xf16>
%92 = vector.insert_strided_slice %91, %89 {offsets = [0, 1, 0, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x4x4x1x1x1x1x1x4xf16>
%93 = vector.transfer_read %alloc[%c0, %56, %78], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x128xf16, #gpu.address_space<workgroup>>, vector<1x4x1xf16>
%94 = vector.transpose %93, [0, 2, 1] : vector<1x4x1xf16> to vector<1x1x4xf16>
%95 = vector.insert_strided_slice %94, %92 {offsets = [0, 1, 1, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x4x4x1x1x1x1x1x4xf16>
%96 = vector.transfer_read %alloc[%c0, %56, %82], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x128xf16, #gpu.address_space<workgroup>>, vector<1x4x1xf16>
%97 = vector.transpose %96, [0, 2, 1] : vector<1x4x1xf16> to vector<1x1x4xf16>
%98 = vector.insert_strided_slice %97, %95 {offsets = [0, 1, 2, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x4x4x1x1x1x1x1x4xf16>
%99 = vector.transfer_read %alloc[%c0, %56, %86], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x128xf16, #gpu.address_space<workgroup>>, vector<1x4x1xf16>
%100 = vector.transpose %99, [0, 2, 1] : vector<1x4x1xf16> to vector<1x1x4xf16>
%101 = vector.insert_strided_slice %100, %98 {offsets = [0, 1, 3, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x4x4x1x1x1x1x1x4xf16>
%102 = vector.transfer_read %alloc[%c0, %59, %74], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x128xf16, #gpu.address_space<workgroup>>, vector<1x4x1xf16>
%103 = vector.transpose %102, [0, 2, 1] : vector<1x4x1xf16> to vector<1x1x4xf16>
%104 = vector.insert_strided_slice %103, %101 {offsets = [0, 2, 0, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x4x4x1x1x1x1x1x4xf16>
%105 = vector.transfer_read %alloc[%c0, %59, %78], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x128xf16, #gpu.address_space<workgroup>>, vector<1x4x1xf16>
%106 = vector.transpose %105, [0, 2, 1] : vector<1x4x1xf16> to vector<1x1x4xf16>
%107 = vector.insert_strided_slice %106, %104 {offsets = [0, 2, 1, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x4x4x1x1x1x1x1x4xf16>
%108 = vector.transfer_read %alloc[%c0, %59, %82], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x128xf16, #gpu.address_space<workgroup>>, vector<1x4x1xf16>
%109 = vector.transpose %108, [0, 2, 1] : vector<1x4x1xf16> to vector<1x1x4xf16>
%110 = vector.insert_strided_slice %109, %107 {offsets = [0, 2, 2, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x4x4x1x1x1x1x1x4xf16>
%111 = vector.transfer_read %alloc[%c0, %59, %86], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x128xf16, #gpu.address_space<workgroup>>, vector<1x4x1xf16>
%112 = vector.transpose %111, [0, 2, 1] : vector<1x4x1xf16> to vector<1x1x4xf16>
%113 = vector.insert_strided_slice %112, %110 {offsets = [0, 2, 3, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x4x4x1x1x1x1x1x4xf16>
%114 = vector.transfer_read %alloc[%c0, %62, %74], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x128xf16, #gpu.address_space<workgroup>>, vector<1x4x1xf16>
%115 = vector.transpose %114, [0, 2, 1] : vector<1x4x1xf16> to vector<1x1x4xf16>
%116 = vector.insert_strided_slice %115, %113 {offsets = [0, 3, 0, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x4x4x1x1x1x1x1x4xf16>
%117 = vector.transfer_read %alloc[%c0, %62, %78], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x128xf16, #gpu.address_space<workgroup>>, vector<1x4x1xf16>
%118 = vector.transpose %117, [0, 2, 1] : vector<1x4x1xf16> to vector<1x1x4xf16>
%119 = vector.insert_strided_slice %118, %116 {offsets = [0, 3, 1, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x4x4x1x1x1x1x1x4xf16>
%120 = vector.transfer_read %alloc[%c0, %62, %82], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x128xf16, #gpu.address_space<workgroup>>, vector<1x4x1xf16>
%121 = vector.transpose %120, [0, 2, 1] : vector<1x4x1xf16> to vector<1x1x4xf16>
%122 = vector.insert_strided_slice %121, %119 {offsets = [0, 3, 2, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x4x4x1x1x1x1x1x4xf16>
%123 = vector.transfer_read %alloc[%c0, %62, %86], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x128xf16, #gpu.address_space<workgroup>>, vector<1x4x1xf16>
%124 = vector.transpose %123, [0, 2, 1] : vector<1x4x1xf16> to vector<1x1x4xf16>
%125 = vector.insert_strided_slice %124, %122 {offsets = [0, 3, 3, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x4x4x1x1x1x1x1x4xf16>
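// After CSE the A-fragment shape_casts (%130, %136, %141, %146 for M-batch 0
// and %201, %205, %208, %211 for M-batch 1) and the B-fragment shape_casts
// are materialized once and reused across all eight mfma accumulation
// chains below.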
%126 = arith.extf %arg1 : vector<1x2x4x1x1x1x1x1x4xf16> to vector<1x2x4x1x1x1x1x1x4xf32>
%127 = vector.extract %126[0, 0, 0] : vector<1x1x1x1x1x4xf32> from vector<1x2x4x1x1x1x1x1x4xf32>
%128 = vector.extract %73[0, 0, 0] : vector<1x1x1x1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%129 = vector.extract %125[0, 0, 0] : vector<1x1x1x1x1x4xf16> from vector<1x4x4x1x1x1x1x1x4xf16>
%130 = vector.shape_cast %128 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%131 = vector.shape_cast %129 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%132 = vector.shape_cast %127 : vector<1x1x1x1x1x4xf32> to vector<4xf32>
%133 = amdgpu.mfma %130 * %131 + %132 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%134 = vector.extract %73[0, 0, 1] : vector<1x1x1x1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%135 = vector.extract %125[0, 1, 0] : vector<1x1x1x1x1x4xf16> from vector<1x4x4x1x1x1x1x1x4xf16>
%136 = vector.shape_cast %134 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%137 = vector.shape_cast %135 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%138 = amdgpu.mfma %136 * %137 + %133 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%139 = vector.extract %73[0, 0, 2] : vector<1x1x1x1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%140 = vector.extract %125[0, 2, 0] : vector<1x1x1x1x1x4xf16> from vector<1x4x4x1x1x1x1x1x4xf16>
%141 = vector.shape_cast %139 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%142 = vector.shape_cast %140 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%143 = amdgpu.mfma %141 * %142 + %138 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%144 = vector.extract %73[0, 0, 3] : vector<1x1x1x1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%145 = vector.extract %125[0, 3, 0] : vector<1x1x1x1x1x4xf16> from vector<1x4x4x1x1x1x1x1x4xf16>
%146 = vector.shape_cast %144 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%147 = vector.shape_cast %145 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%148 = amdgpu.mfma %146 * %147 + %143 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%149 = vector.shape_cast %148 : vector<4xf32> to vector<1x1x1x1x1x4xf32>
%150 = vector.insert %149, %cst [0, 0, 0] : vector<1x1x1x1x1x4xf32> into vector<1x2x4x1x1x1x1x1x4xf32>
%151 = vector.extract %126[0, 0, 1] : vector<1x1x1x1x1x4xf32> from vector<1x2x4x1x1x1x1x1x4xf32>
%152 = vector.extract %125[0, 0, 1] : vector<1x1x1x1x1x4xf16> from vector<1x4x4x1x1x1x1x1x4xf16>
%153 = vector.shape_cast %152 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%154 = vector.shape_cast %151 : vector<1x1x1x1x1x4xf32> to vector<4xf32>
%155 = amdgpu.mfma %130 * %153 + %154 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%156 = vector.extract %125[0, 1, 1] : vector<1x1x1x1x1x4xf16> from vector<1x4x4x1x1x1x1x1x4xf16>
%157 = vector.shape_cast %156 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%158 = amdgpu.mfma %136 * %157 + %155 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%159 = vector.extract %125[0, 2, 1] : vector<1x1x1x1x1x4xf16> from vector<1x4x4x1x1x1x1x1x4xf16>
%160 = vector.shape_cast %159 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%161 = amdgpu.mfma %141 * %160 + %158 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%162 = vector.extract %125[0, 3, 1] : vector<1x1x1x1x1x4xf16> from vector<1x4x4x1x1x1x1x1x4xf16>
%163 = vector.shape_cast %162 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%164 = amdgpu.mfma %146 * %163 + %161 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%165 = vector.shape_cast %164 : vector<4xf32> to vector<1x1x1x1x1x4xf32>
%166 = vector.insert %165, %150 [0, 0, 1] : vector<1x1x1x1x1x4xf32> into vector<1x2x4x1x1x1x1x1x4xf32>
%167 = vector.extract %126[0, 0, 2] : vector<1x1x1x1x1x4xf32> from vector<1x2x4x1x1x1x1x1x4xf32>
%168 = vector.extract %125[0, 0, 2] : vector<1x1x1x1x1x4xf16> from vector<1x4x4x1x1x1x1x1x4xf16>
%169 = vector.shape_cast %168 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%170 = vector.shape_cast %167 : vector<1x1x1x1x1x4xf32> to vector<4xf32>
%171 = amdgpu.mfma %130 * %169 + %170 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%172 = vector.extract %125[0, 1, 2] : vector<1x1x1x1x1x4xf16> from vector<1x4x4x1x1x1x1x1x4xf16>
%173 = vector.shape_cast %172 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%174 = amdgpu.mfma %136 * %173 + %171 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%175 = vector.extract %125[0, 2, 2] : vector<1x1x1x1x1x4xf16> from vector<1x4x4x1x1x1x1x1x4xf16>
%176 = vector.shape_cast %175 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%177 = amdgpu.mfma %141 * %176 + %174 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%178 = vector.extract %125[0, 3, 2] : vector<1x1x1x1x1x4xf16> from vector<1x4x4x1x1x1x1x1x4xf16>
%179 = vector.shape_cast %178 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%180 = amdgpu.mfma %146 * %179 + %177 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%181 = vector.shape_cast %180 : vector<4xf32> to vector<1x1x1x1x1x4xf32>
%182 = vector.insert %181, %166 [0, 0, 2] : vector<1x1x1x1x1x4xf32> into vector<1x2x4x1x1x1x1x1x4xf32>
%183 = vector.extract %126[0, 0, 3] : vector<1x1x1x1x1x4xf32> from vector<1x2x4x1x1x1x1x1x4xf32>
%184 = vector.extract %125[0, 0, 3] : vector<1x1x1x1x1x4xf16> from vector<1x4x4x1x1x1x1x1x4xf16>
%185 = vector.shape_cast %184 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%186 = vector.shape_cast %183 : vector<1x1x1x1x1x4xf32> to vector<4xf32>
%187 = amdgpu.mfma %130 * %185 + %186 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%188 = vector.extract %125[0, 1, 3] : vector<1x1x1x1x1x4xf16> from vector<1x4x4x1x1x1x1x1x4xf16>
%189 = vector.shape_cast %188 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%190 = amdgpu.mfma %136 * %189 + %187 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%191 = vector.extract %125[0, 2, 3] : vector<1x1x1x1x1x4xf16> from vector<1x4x4x1x1x1x1x1x4xf16>
%192 = vector.shape_cast %191 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%193 = amdgpu.mfma %141 * %192 + %190 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%194 = vector.extract %125[0, 3, 3] : vector<1x1x1x1x1x4xf16> from vector<1x4x4x1x1x1x1x1x4xf16>
%195 = vector.shape_cast %194 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%196 = amdgpu.mfma %146 * %195 + %193 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%197 = vector.shape_cast %196 : vector<4xf32> to vector<1x1x1x1x1x4xf32>
%198 = vector.insert %197, %182 [0, 0, 3] : vector<1x1x1x1x1x4xf32> into vector<1x2x4x1x1x1x1x1x4xf32>
%199 = vector.extract %126[0, 1, 0] : vector<1x1x1x1x1x4xf32> from vector<1x2x4x1x1x1x1x1x4xf32>
%200 = vector.extract %73[0, 1, 0] : vector<1x1x1x1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%201 = vector.shape_cast %200 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%202 = vector.shape_cast %199 : vector<1x1x1x1x1x4xf32> to vector<4xf32>
%203 = amdgpu.mfma %201 * %131 + %202 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%204 = vector.extract %73[0, 1, 1] : vector<1x1x1x1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%205 = vector.shape_cast %204 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%206 = amdgpu.mfma %205 * %137 + %203 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%207 = vector.extract %73[0, 1, 2] : vector<1x1x1x1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%208 = vector.shape_cast %207 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%209 = amdgpu.mfma %208 * %142 + %206 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%210 = vector.extract %73[0, 1, 3] : vector<1x1x1x1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%211 = vector.shape_cast %210 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%212 = amdgpu.mfma %211 * %147 + %209 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%213 = vector.shape_cast %212 : vector<4xf32> to vector<1x1x1x1x1x4xf32>
%214 = vector.insert %213, %198 [0, 1, 0] : vector<1x1x1x1x1x4xf32> into vector<1x2x4x1x1x1x1x1x4xf32>
%215 = vector.extract %126[0, 1, 1] : vector<1x1x1x1x1x4xf32> from vector<1x2x4x1x1x1x1x1x4xf32>
%216 = vector.shape_cast %215 : vector<1x1x1x1x1x4xf32> to vector<4xf32>
%217 = amdgpu.mfma %201 * %153 + %216 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%218 = amdgpu.mfma %205 * %157 + %217 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%219 = amdgpu.mfma %208 * %160 + %218 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%220 = amdgpu.mfma %211 * %163 + %219 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%221 = vector.shape_cast %220 : vector<4xf32> to vector<1x1x1x1x1x4xf32>
%222 = vector.insert %221, %214 [0, 1, 1] : vector<1x1x1x1x1x4xf32> into vector<1x2x4x1x1x1x1x1x4xf32>
%223 = vector.extract %126[0, 1, 2] : vector<1x1x1x1x1x4xf32> from vector<1x2x4x1x1x1x1x1x4xf32>
%224 = vector.shape_cast %223 : vector<1x1x1x1x1x4xf32> to vector<4xf32>
%225 = amdgpu.mfma %201 * %169 + %224 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%226 = amdgpu.mfma %205 * %173 + %225 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%227 = amdgpu.mfma %208 * %176 + %226 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%228 = amdgpu.mfma %211 * %179 + %227 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%229 = vector.shape_cast %228 : vector<4xf32> to vector<1x1x1x1x1x4xf32>
%230 = vector.insert %229, %222 [0, 1, 2] : vector<1x1x1x1x1x4xf32> into vector<1x2x4x1x1x1x1x1x4xf32>
%231 = vector.extract %126[0, 1, 3] : vector<1x1x1x1x1x4xf32> from vector<1x2x4x1x1x1x1x1x4xf32>
%232 = vector.shape_cast %231 : vector<1x1x1x1x1x4xf32> to vector<4xf32>
%233 = amdgpu.mfma %201 * %185 + %232 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%234 = amdgpu.mfma %205 * %189 + %233 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%235 = amdgpu.mfma %208 * %192 + %234 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%236 = amdgpu.mfma %211 * %195 + %235 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%237 = vector.shape_cast %236 : vector<4xf32> to vector<1x1x1x1x1x4xf32>
%238 = vector.insert %237, %230 [0, 1, 3] : vector<1x1x1x1x1x4xf32> into vector<1x2x4x1x1x1x1x1x4xf32>
%239 = arith.truncf %238 : vector<1x2x4x1x1x1x1x1x4xf32> to vector<1x2x4x1x1x1x1x1x4xf16>
scf.yield %239 : vector<1x2x4x1x1x1x1x1x4xf16>
} {__vector_layout_fetcher_storage = [[unit, unit, unit, #iree_vector_ext.nested_layout<subgroups_per_workgroup = [1, 2, 2], batches_per_subgroup = [1, 2, 4], outers_per_batch = [1, 1, 1], threads_per_outer = [1, 4, 16], elements_per_thread = [1, 4, 1], element_order = [0, 2, 1], subgroup_basis = [1, 2, 2, 1], subgroup_active_ids = [true, true, true, false], thread_basis = [1, 4, 16]>], [#iree_vector_ext.nested_layout<subgroups_per_workgroup = [1, 2, 2], batches_per_subgroup = [1, 2, 4], outers_per_batch = [1, 1, 1], threads_per_outer = [1, 4, 16], elements_per_thread = [1, 4, 1], element_order = [0, 2, 1], subgroup_basis = [1, 2, 2, 1], subgroup_active_ids = [true, true, true, false], thread_basis = [1, 4, 16]>]]}
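// Editor's annotation: the loop yields the accumulator as f16, and the
// write-back epilogue follows. Each thread recovers its tile coordinates
// from the flat id %0 via affine.delinearize_index, then stores its eight
// 1x4x1 result fragments (2 row offsets x 4 column offsets of the 32x64
// per-subgroup tile) into the output subview with vector.transfer_write.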
%subview_5 = memref.subview %3[%workgroup_id_z, %4, %5] [1, %6, 128] [1, 1, 1] : memref<64x968x1280xf16, #hal.descriptor_type<storage_buffer>> to memref<1x?x128xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%8:6 = affine.delinearize_index %0 into (%c2, %c2, %c1, %c1, %c4, %c16) : index, index, index, index, index, index
%9 = affine.apply affine_map<()[s0, s1] -> (s0 * 32 + s1 * 4)>()[%8#0, %8#4]
%10 = affine.apply affine_map<()[s0, s1] -> (s0 * 64 + s1)>()[%8#1, %8#5]
%11 = vector.extract %7[0, 0, 0, 0, 0, 0] : vector<1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%12 = vector.transpose %11, [0, 2, 1] : vector<1x1x4xf16> to vector<1x4x1xf16>
vector.transfer_write %12, %subview_5[%c0, %9, %10] {in_bounds = [true, false, true]} : vector<1x4x1xf16>, memref<1x?x128xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%13 = affine.apply affine_map<()[s0, s1] -> (s0 * 64 + s1 + 16)>()[%8#1, %8#5]
%14 = vector.extract %7[0, 0, 1, 0, 0, 0] : vector<1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%15 = vector.transpose %14, [0, 2, 1] : vector<1x1x4xf16> to vector<1x4x1xf16>
vector.transfer_write %15, %subview_5[%c0, %9, %13] {in_bounds = [true, false, true]} : vector<1x4x1xf16>, memref<1x?x128xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%16 = affine.apply affine_map<()[s0, s1] -> (s0 * 64 + s1 + 32)>()[%8#1, %8#5]
%17 = vector.extract %7[0, 0, 2, 0, 0, 0] : vector<1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%18 = vector.transpose %17, [0, 2, 1] : vector<1x1x4xf16> to vector<1x4x1xf16>
vector.transfer_write %18, %subview_5[%c0, %9, %16] {in_bounds = [true, false, true]} : vector<1x4x1xf16>, memref<1x?x128xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%19 = affine.apply affine_map<()[s0, s1] -> (s0 * 64 + s1 + 48)>()[%8#1, %8#5]
%20 = vector.extract %7[0, 0, 3, 0, 0, 0] : vector<1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%21 = vector.transpose %20, [0, 2, 1] : vector<1x1x4xf16> to vector<1x4x1xf16>
vector.transfer_write %21, %subview_5[%c0, %9, %19] {in_bounds = [true, false, true]} : vector<1x4x1xf16>, memref<1x?x128xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%22 = affine.apply affine_map<()[s0, s1] -> (s0 * 32 + s1 * 4 + 16)>()[%8#0, %8#4]
%23 = vector.extract %7[0, 1, 0, 0, 0, 0] : vector<1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%24 = vector.transpose %23, [0, 2, 1] : vector<1x1x4xf16> to vector<1x4x1xf16>
vector.transfer_write %24, %subview_5[%c0, %22, %10] {in_bounds = [true, false, true]} : vector<1x4x1xf16>, memref<1x?x128xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%25 = vector.extract %7[0, 1, 1, 0, 0, 0] : vector<1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%26 = vector.transpose %25, [0, 2, 1] : vector<1x1x4xf16> to vector<1x4x1xf16>
vector.transfer_write %26, %subview_5[%c0, %22, %13] {in_bounds = [true, false, true]} : vector<1x4x1xf16>, memref<1x?x128xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%27 = vector.extract %7[0, 1, 2, 0, 0, 0] : vector<1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%28 = vector.transpose %27, [0, 2, 1] : vector<1x1x4xf16> to vector<1x4x1xf16>
vector.transfer_write %28, %subview_5[%c0, %22, %16] {in_bounds = [true, false, true]} : vector<1x4x1xf16>, memref<1x?x128xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%29 = vector.extract %7[0, 1, 3, 0, 0, 0] : vector<1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%30 = vector.transpose %29, [0, 2, 1] : vector<1x1x4xf16> to vector<1x4x1xf16>
vector.transfer_write %30, %subview_5[%c0, %22, %19] {in_bounds = [true, false, true]} : vector<1x4x1xf16>, memref<1x?x128xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
memref.dealloc %alloc_3 : memref<1x64x64xf16, #gpu.address_space<workgroup>>
memref.dealloc %alloc : memref<1x64x128xf16, #gpu.address_space<workgroup>>
return
}
}
// -----// IR Dump After FoldMemRefAliasOps (fold-memref-alias-ops) //----- //
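// Editor's annotation: fold-memref-alias-ops is the upstream MLIR pass that
// folds memref view ops (subview, expand_shape, collapse_shape) into the
// load/store-like ops that use them by rewriting indices. The dynamically
// sized source and destination subviews survive here, so the function below
// appears essentially unchanged from the previous dump.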
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16() {
%cst = arith.constant dense<0.000000e+00> : vector<1x2x4x1x1x1x1x1x4xf32>
%cst_0 = arith.constant dense<0.000000e+00> : vector<1x4x4x1x1x1x1x1x4xf16>
%c4 = arith.constant 4 : index
%c2 = arith.constant 2 : index
%c16 = arith.constant 16 : index
%c8 = arith.constant 8 : index
%c32 = arith.constant 32 : index
%cst_1 = arith.constant dense<0.000000e+00> : vector<1x2x4x1x1x1x1x1x4xf16>
%c20 = arith.constant 20 : index
%c1 = arith.constant 1 : index
%cst_2 = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%thread_id_x = gpu.thread_id x
%thread_id_y = gpu.thread_id y
%thread_id_z = gpu.thread_id z
%0 = affine.apply affine_map<()[s0, s1, s2] -> (s0 + s1 * 128 + s2 * 256)>()[%thread_id_x, %thread_id_y, %thread_id_z]
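// Editor's annotation: %0 is the linearized thread id; the map
// s0 + s1 * 128 + s2 * 256 is consistent with a 128x2x1 workgroup, i.e.
// 256 threads or four 64-lane waves on gfx942 (an assumption from the
// layout, not stated in the dump).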
%alloc = memref.alloc() : memref<1x64x128xf16, #gpu.address_space<workgroup>>
%alloc_3 = memref.alloc() : memref<1x64x64xf16, #gpu.address_space<workgroup>>
%1 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<64x968x1280xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 64 : memref<64x968x1280xf16, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<64x1280x1280xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 64 : memref<64x1280x1280xf16, #hal.descriptor_type<storage_buffer>>
%3 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<64x968x1280xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %3, 64 : memref<64x968x1280xf16, #hal.descriptor_type<storage_buffer>>
%workgroup_id_z = hal.interface.workgroup.id[2] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y]
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%5 = affine.apply affine_map<()[s0] -> (s0 * 128)>()[%workgroup_id_x]
%6 = affine.min affine_map<()[s0] -> (s0 * -64 + 968, 64)>()[%workgroup_id_y]
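// Editor's annotation: main reduction loop. 20 steps cover K = 1280 in
// 64-element tiles (1280 / 64 = 20). %6 clamps the M tile at the 968 edge
// (968 = 15 * 64 + 8), so the last row of workgroups handles a partial
// 8-row tile. Each iteration stages a 64x64 LHS tile and a 64x128 RHS tile
// through workgroup memory before the MFMA chain.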
%7 = scf.for %arg0 = %c0 to %c20 step %c1 iter_args(%arg1 = %cst_1) -> (vector<1x2x4x1x1x1x1x1x4xf16>) {
%31 = affine.delinearize_index %arg0 into (%c20) : index
%32 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y]
%33 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%31]
%subview_4 = memref.subview %1[%workgroup_id_z, %32, %33] [1, %6, 64] [1, 1, 1] : memref<64x968x1280xf16, #hal.descriptor_type<storage_buffer>> to memref<1x?x64xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%34:2 = affine.delinearize_index %0 into (%c32, %c8) : index, index
%35 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%34#1]
%36 = vector.transfer_read %subview_4[%c0, %34#0, %35], %cst_2 {in_bounds = [true, false, true]} : memref<1x?x64xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<1x1x8xf16>
%37 = affine.apply affine_map<()[s0] -> (s0 + 32)>()[%34#0]
%38 = vector.transfer_read %subview_4[%c0, %37, %35], %cst_2 {in_bounds = [true, false, true]} : memref<1x?x64xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<1x1x8xf16>
%39:2 = affine.delinearize_index %0 into (%c16, %c16) : index, index
%40 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%39#1]
%41 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 64)>()[%39#0, %31]
%42 = affine.apply affine_map<()[s0, s1] -> (s0 * 128 + s1 * 8)>()[%workgroup_id_x, %39#1]
%43 = vector.transfer_read %2[%workgroup_id_z, %41, %42], %cst_2 {in_bounds = [true, true, true]} : memref<64x1280x1280xf16, #hal.descriptor_type<storage_buffer>>, vector<1x1x8xf16>
%44 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 64 + 16)>()[%39#0, %31]
%45 = affine.apply affine_map<()[s0, s1] -> (s0 * 128 + s1 * 8)>()[%workgroup_id_x, %39#1]
%46 = vector.transfer_read %2[%workgroup_id_z, %44, %45], %cst_2 {in_bounds = [true, true, true]} : memref<64x1280x1280xf16, #hal.descriptor_type<storage_buffer>>, vector<1x1x8xf16>
%47 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 64 + 32)>()[%39#0, %31]
%48 = affine.apply affine_map<()[s0, s1] -> (s0 * 128 + s1 * 8)>()[%workgroup_id_x, %39#1]
%49 = vector.transfer_read %2[%workgroup_id_z, %47, %48], %cst_2 {in_bounds = [true, true, true]} : memref<64x1280x1280xf16, #hal.descriptor_type<storage_buffer>>, vector<1x1x8xf16>
%50 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 64 + 48)>()[%39#0, %31]
%51 = affine.apply affine_map<()[s0, s1] -> (s0 * 128 + s1 * 8)>()[%workgroup_id_x, %39#1]
%52 = vector.transfer_read %2[%workgroup_id_z, %50, %51], %cst_2 {in_bounds = [true, true, true]} : memref<64x1280x1280xf16, #hal.descriptor_type<storage_buffer>>, vector<1x1x8xf16>
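// Editor's annotation: stage both tiles into LDS. The gpu.barrier ops order
// this iteration's writes to %alloc_3 (64x64 LHS tile) and %alloc (64x128
// RHS tile) against the previous iteration's reads and against the fragment
// reads below.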
gpu.barrier
vector.transfer_write %36, %alloc_3[%c0, %34#0, %35] {in_bounds = [true, true, true]} : vector<1x1x8xf16>, memref<1x64x64xf16, #gpu.address_space<workgroup>>
vector.transfer_write %38, %alloc_3[%c0, %37, %35] {in_bounds = [true, true, true]} : vector<1x1x8xf16>, memref<1x64x64xf16, #gpu.address_space<workgroup>>
gpu.barrier
vector.transfer_write %43, %alloc[%c0, %39#0, %40] {in_bounds = [true, true, true]} : vector<1x1x8xf16>, memref<1x64x128xf16, #gpu.address_space<workgroup>>
%53 = affine.apply affine_map<()[s0] -> (s0 + 16)>()[%39#0]
vector.transfer_write %46, %alloc[%c0, %53, %40] {in_bounds = [true, true, true]} : vector<1x1x8xf16>, memref<1x64x128xf16, #gpu.address_space<workgroup>>
%54 = affine.apply affine_map<()[s0] -> (s0 + 32)>()[%39#0]
vector.transfer_write %49, %alloc[%c0, %54, %40] {in_bounds = [true, true, true]} : vector<1x1x8xf16>, memref<1x64x128xf16, #gpu.address_space<workgroup>>
%55 = affine.apply affine_map<()[s0] -> (s0 + 48)>()[%39#0]
vector.transfer_write %52, %alloc[%c0, %55, %40] {in_bounds = [true, true, true]} : vector<1x1x8xf16>, memref<1x64x128xf16, #gpu.address_space<workgroup>>
gpu.barrier
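// Editor's annotation: gather per-lane MFMA operands from LDS. The 6-way
// delinearization of %0 mirrors the nested layout: %56#0 and %56#1 pick the
// subgroup's M/N tile, %56#5 the lane's position within a 16-wide MFMA
// fragment (row of A, column of B), and %56#4 the 4-element k-slice each
// lane loads as a vector<1x1x4xf16>.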
%56:6 = affine.delinearize_index %0 into (%c2, %c2, %c1, %c1, %c4, %c16) : index, index, index, index, index, index
%57 = affine.apply affine_map<()[s0, s1] -> (s0 * 32 + s1)>()[%56#0, %56#5]
%58 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%56#4]
%59 = vector.transfer_read %alloc_3[%c0, %57, %58], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x64xf16, #gpu.address_space<workgroup>>, vector<1x1x4xf16>
%60 = vector.insert_strided_slice %59, %cst_1 {offsets = [0, 0, 0, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x2x4x1x1x1x1x1x4xf16>
%61 = affine.apply affine_map<()[s0] -> (s0 * 4 + 16)>()[%56#4]
%62 = vector.transfer_read %alloc_3[%c0, %57, %61], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x64xf16, #gpu.address_space<workgroup>>, vector<1x1x4xf16>
%63 = vector.insert_strided_slice %62, %60 {offsets = [0, 0, 1, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x2x4x1x1x1x1x1x4xf16>
%64 = affine.apply affine_map<()[s0] -> (s0 * 4 + 32)>()[%56#4]
%65 = vector.transfer_read %alloc_3[%c0, %57, %64], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x64xf16, #gpu.address_space<workgroup>>, vector<1x1x4xf16>
%66 = vector.insert_strided_slice %65, %63 {offsets = [0, 0, 2, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x2x4x1x1x1x1x1x4xf16>
%67 = affine.apply affine_map<()[s0] -> (s0 * 4 + 48)>()[%56#4]
%68 = vector.transfer_read %alloc_3[%c0, %57, %67], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x64xf16, #gpu.address_space<workgroup>>, vector<1x1x4xf16>
%69 = vector.insert_strided_slice %68, %66 {offsets = [0, 0, 3, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x2x4x1x1x1x1x1x4xf16>
%70 = affine.apply affine_map<()[s0, s1] -> (s0 * 32 + s1 + 16)>()[%56#0, %56#5]
%71 = vector.transfer_read %alloc_3[%c0, %70, %58], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x64xf16, #gpu.address_space<workgroup>>, vector<1x1x4xf16>
%72 = vector.insert_strided_slice %71, %69 {offsets = [0, 1, 0, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x2x4x1x1x1x1x1x4xf16>
%73 = vector.transfer_read %alloc_3[%c0, %70, %61], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x64xf16, #gpu.address_space<workgroup>>, vector<1x1x4xf16>
%74 = vector.insert_strided_slice %73, %72 {offsets = [0, 1, 1, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x2x4x1x1x1x1x1x4xf16>
%75 = vector.transfer_read %alloc_3[%c0, %70, %64], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x64xf16, #gpu.address_space<workgroup>>, vector<1x1x4xf16>
%76 = vector.insert_strided_slice %75, %74 {offsets = [0, 1, 2, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x2x4x1x1x1x1x1x4xf16>
%77 = vector.transfer_read %alloc_3[%c0, %70, %67], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x64xf16, #gpu.address_space<workgroup>>, vector<1x1x4xf16>
%78 = vector.insert_strided_slice %77, %76 {offsets = [0, 1, 3, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x2x4x1x1x1x1x1x4xf16>
%79 = affine.apply affine_map<()[s0, s1] -> (s0 * 64 + s1)>()[%56#1, %56#5]
%80 = vector.transfer_read %alloc[%c0, %58, %79], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x128xf16, #gpu.address_space<workgroup>>, vector<1x4x1xf16>
%81 = vector.transpose %80, [0, 2, 1] : vector<1x4x1xf16> to vector<1x1x4xf16>
%82 = vector.insert_strided_slice %81, %cst_0 {offsets = [0, 0, 0, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x4x4x1x1x1x1x1x4xf16>
%83 = affine.apply affine_map<()[s0, s1] -> (s0 * 64 + s1 + 16)>()[%56#1, %56#5]
%84 = vector.transfer_read %alloc[%c0, %58, %83], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x128xf16, #gpu.address_space<workgroup>>, vector<1x4x1xf16>
%85 = vector.transpose %84, [0, 2, 1] : vector<1x4x1xf16> to vector<1x1x4xf16>
%86 = vector.insert_strided_slice %85, %82 {offsets = [0, 0, 1, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x4x4x1x1x1x1x1x4xf16>
%87 = affine.apply affine_map<()[s0, s1] -> (s0 * 64 + s1 + 32)>()[%56#1, %56#5]
%88 = vector.transfer_read %alloc[%c0, %58, %87], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x128xf16, #gpu.address_space<workgroup>>, vector<1x4x1xf16>
%89 = vector.transpose %88, [0, 2, 1] : vector<1x4x1xf16> to vector<1x1x4xf16>
%90 = vector.insert_strided_slice %89, %86 {offsets = [0, 0, 2, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x4x4x1x1x1x1x1x4xf16>
%91 = affine.apply affine_map<()[s0, s1] -> (s0 * 64 + s1 + 48)>()[%56#1, %56#5]
%92 = vector.transfer_read %alloc[%c0, %58, %91], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x128xf16, #gpu.address_space<workgroup>>, vector<1x4x1xf16>
%93 = vector.transpose %92, [0, 2, 1] : vector<1x4x1xf16> to vector<1x1x4xf16>
%94 = vector.insert_strided_slice %93, %90 {offsets = [0, 0, 3, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x4x4x1x1x1x1x1x4xf16>
%95 = vector.transfer_read %alloc[%c0, %61, %79], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x128xf16, #gpu.address_space<workgroup>>, vector<1x4x1xf16>
%96 = vector.transpose %95, [0, 2, 1] : vector<1x4x1xf16> to vector<1x1x4xf16>
%97 = vector.insert_strided_slice %96, %94 {offsets = [0, 1, 0, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x4x4x1x1x1x1x1x4xf16>
%98 = vector.transfer_read %alloc[%c0, %61, %83], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x128xf16, #gpu.address_space<workgroup>>, vector<1x4x1xf16>
%99 = vector.transpose %98, [0, 2, 1] : vector<1x4x1xf16> to vector<1x1x4xf16>
%100 = vector.insert_strided_slice %99, %97 {offsets = [0, 1, 1, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x4x4x1x1x1x1x1x4xf16>
%101 = vector.transfer_read %alloc[%c0, %61, %87], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x128xf16, #gpu.address_space<workgroup>>, vector<1x4x1xf16>
%102 = vector.transpose %101, [0, 2, 1] : vector<1x4x1xf16> to vector<1x1x4xf16>
%103 = vector.insert_strided_slice %102, %100 {offsets = [0, 1, 2, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x4x4x1x1x1x1x1x4xf16>
%104 = vector.transfer_read %alloc[%c0, %61, %91], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x128xf16, #gpu.address_space<workgroup>>, vector<1x4x1xf16>
%105 = vector.transpose %104, [0, 2, 1] : vector<1x4x1xf16> to vector<1x1x4xf16>
%106 = vector.insert_strided_slice %105, %103 {offsets = [0, 1, 3, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x4x4x1x1x1x1x1x4xf16>
%107 = vector.transfer_read %alloc[%c0, %64, %79], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x128xf16, #gpu.address_space<workgroup>>, vector<1x4x1xf16>
%108 = vector.transpose %107, [0, 2, 1] : vector<1x4x1xf16> to vector<1x1x4xf16>
%109 = vector.insert_strided_slice %108, %106 {offsets = [0, 2, 0, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x4x4x1x1x1x1x1x4xf16>
%110 = vector.transfer_read %alloc[%c0, %64, %83], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x128xf16, #gpu.address_space<workgroup>>, vector<1x4x1xf16>
%111 = vector.transpose %110, [0, 2, 1] : vector<1x4x1xf16> to vector<1x1x4xf16>
%112 = vector.insert_strided_slice %111, %109 {offsets = [0, 2, 1, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x4x4x1x1x1x1x1x4xf16>
%113 = vector.transfer_read %alloc[%c0, %64, %87], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x128xf16, #gpu.address_space<workgroup>>, vector<1x4x1xf16>
%114 = vector.transpose %113, [0, 2, 1] : vector<1x4x1xf16> to vector<1x1x4xf16>
%115 = vector.insert_strided_slice %114, %112 {offsets = [0, 2, 2, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x4x4x1x1x1x1x1x4xf16>
%116 = vector.transfer_read %alloc[%c0, %64, %91], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x128xf16, #gpu.address_space<workgroup>>, vector<1x4x1xf16>
%117 = vector.transpose %116, [0, 2, 1] : vector<1x4x1xf16> to vector<1x1x4xf16>
%118 = vector.insert_strided_slice %117, %115 {offsets = [0, 2, 3, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x4x4x1x1x1x1x1x4xf16>
%119 = vector.transfer_read %alloc[%c0, %67, %79], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x128xf16, #gpu.address_space<workgroup>>, vector<1x4x1xf16>
%120 = vector.transpose %119, [0, 2, 1] : vector<1x4x1xf16> to vector<1x1x4xf16>
%121 = vector.insert_strided_slice %120, %118 {offsets = [0, 3, 0, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x4x4x1x1x1x1x1x4xf16>
%122 = vector.transfer_read %alloc[%c0, %67, %83], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x128xf16, #gpu.address_space<workgroup>>, vector<1x4x1xf16>
%123 = vector.transpose %122, [0, 2, 1] : vector<1x4x1xf16> to vector<1x1x4xf16>
%124 = vector.insert_strided_slice %123, %121 {offsets = [0, 3, 1, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x4x4x1x1x1x1x1x4xf16>
%125 = vector.transfer_read %alloc[%c0, %67, %87], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x128xf16, #gpu.address_space<workgroup>>, vector<1x4x1xf16>
%126 = vector.transpose %125, [0, 2, 1] : vector<1x4x1xf16> to vector<1x1x4xf16>
%127 = vector.insert_strided_slice %126, %124 {offsets = [0, 3, 2, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x4x4x1x1x1x1x1x4xf16>
%128 = vector.transfer_read %alloc[%c0, %67, %91], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x128xf16, #gpu.address_space<workgroup>>, vector<1x4x1xf16>
%129 = vector.transpose %128, [0, 2, 1] : vector<1x4x1xf16> to vector<1x1x4xf16>
%130 = vector.insert_strided_slice %129, %127 {offsets = [0, 3, 3, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x4x4x1x1x1x1x1x4xf16>
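// Editor's annotation: the per-iteration contraction follows. The f16
// accumulator is extended to f32, and for each of the 2x4 accumulator
// fragments four chained amdgpu.mfma ops (one per 16-deep k-slice of the
// 64-element k-tile) accumulate the A and B fragments. With m = n = k = 16,
// f16 operands, and an f32 accumulator, each op should lower to gfx942's
// v_mfma_f32_16x16x16_f16, consuming vector<4xf16> per lane and producing
// vector<4xf32>.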
%131 = arith.extf %arg1 : vector<1x2x4x1x1x1x1x1x4xf16> to vector<1x2x4x1x1x1x1x1x4xf32>
%132 = vector.extract %131[0, 0, 0] : vector<1x1x1x1x1x4xf32> from vector<1x2x4x1x1x1x1x1x4xf32>
%133 = vector.extract %78[0, 0, 0] : vector<1x1x1x1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%134 = vector.extract %130[0, 0, 0] : vector<1x1x1x1x1x4xf16> from vector<1x4x4x1x1x1x1x1x4xf16>
%135 = vector.shape_cast %133 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%136 = vector.shape_cast %134 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%137 = vector.shape_cast %132 : vector<1x1x1x1x1x4xf32> to vector<4xf32>
%138 = amdgpu.mfma %135 * %136 + %137 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%139 = vector.extract %78[0, 0, 1] : vector<1x1x1x1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%140 = vector.extract %130[0, 1, 0] : vector<1x1x1x1x1x4xf16> from vector<1x4x4x1x1x1x1x1x4xf16>
%141 = vector.shape_cast %139 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%142 = vector.shape_cast %140 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%143 = amdgpu.mfma %141 * %142 + %138 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%144 = vector.extract %78[0, 0, 2] : vector<1x1x1x1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%145 = vector.extract %130[0, 2, 0] : vector<1x1x1x1x1x4xf16> from vector<1x4x4x1x1x1x1x1x4xf16>
%146 = vector.shape_cast %144 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%147 = vector.shape_cast %145 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%148 = amdgpu.mfma %146 * %147 + %143 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%149 = vector.extract %78[0, 0, 3] : vector<1x1x1x1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%150 = vector.extract %130[0, 3, 0] : vector<1x1x1x1x1x4xf16> from vector<1x4x4x1x1x1x1x1x4xf16>
%151 = vector.shape_cast %149 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%152 = vector.shape_cast %150 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%153 = amdgpu.mfma %151 * %152 + %148 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%154 = vector.shape_cast %153 : vector<4xf32> to vector<1x1x1x1x1x4xf32>
%155 = vector.insert %154, %cst [0, 0, 0] : vector<1x1x1x1x1x4xf32> into vector<1x2x4x1x1x1x1x1x4xf32>
%156 = vector.extract %131[0, 0, 1] : vector<1x1x1x1x1x4xf32> from vector<1x2x4x1x1x1x1x1x4xf32>
%157 = vector.extract %130[0, 0, 1] : vector<1x1x1x1x1x4xf16> from vector<1x4x4x1x1x1x1x1x4xf16>
%158 = vector.shape_cast %157 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%159 = vector.shape_cast %156 : vector<1x1x1x1x1x4xf32> to vector<4xf32>
%160 = amdgpu.mfma %135 * %158 + %159 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%161 = vector.extract %130[0, 1, 1] : vector<1x1x1x1x1x4xf16> from vector<1x4x4x1x1x1x1x1x4xf16>
%162 = vector.shape_cast %161 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%163 = amdgpu.mfma %141 * %162 + %160 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%164 = vector.extract %130[0, 2, 1] : vector<1x1x1x1x1x4xf16> from vector<1x4x4x1x1x1x1x1x4xf16>
%165 = vector.shape_cast %164 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%166 = amdgpu.mfma %146 * %165 + %163 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%167 = vector.extract %130[0, 3, 1] : vector<1x1x1x1x1x4xf16> from vector<1x4x4x1x1x1x1x1x4xf16>
%168 = vector.shape_cast %167 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%169 = amdgpu.mfma %151 * %168 + %166 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%170 = vector.shape_cast %169 : vector<4xf32> to vector<1x1x1x1x1x4xf32>
%171 = vector.insert %170, %155 [0, 0, 1] : vector<1x1x1x1x1x4xf32> into vector<1x2x4x1x1x1x1x1x4xf32>
%172 = vector.extract %131[0, 0, 2] : vector<1x1x1x1x1x4xf32> from vector<1x2x4x1x1x1x1x1x4xf32>
%173 = vector.extract %130[0, 0, 2] : vector<1x1x1x1x1x4xf16> from vector<1x4x4x1x1x1x1x1x4xf16>
%174 = vector.shape_cast %173 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%175 = vector.shape_cast %172 : vector<1x1x1x1x1x4xf32> to vector<4xf32>
%176 = amdgpu.mfma %135 * %174 + %175 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%177 = vector.extract %130[0, 1, 2] : vector<1x1x1x1x1x4xf16> from vector<1x4x4x1x1x1x1x1x4xf16>
%178 = vector.shape_cast %177 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%179 = amdgpu.mfma %141 * %178 + %176 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%180 = vector.extract %130[0, 2, 2] : vector<1x1x1x1x1x4xf16> from vector<1x4x4x1x1x1x1x1x4xf16>
%181 = vector.shape_cast %180 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%182 = amdgpu.mfma %146 * %181 + %179 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%183 = vector.extract %130[0, 3, 2] : vector<1x1x1x1x1x4xf16> from vector<1x4x4x1x1x1x1x1x4xf16>
%184 = vector.shape_cast %183 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%185 = amdgpu.mfma %151 * %184 + %182 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%186 = vector.shape_cast %185 : vector<4xf32> to vector<1x1x1x1x1x4xf32>
%187 = vector.insert %186, %171 [0, 0, 2] : vector<1x1x1x1x1x4xf32> into vector<1x2x4x1x1x1x1x1x4xf32>
%188 = vector.extract %131[0, 0, 3] : vector<1x1x1x1x1x4xf32> from vector<1x2x4x1x1x1x1x1x4xf32>
%189 = vector.extract %130[0, 0, 3] : vector<1x1x1x1x1x4xf16> from vector<1x4x4x1x1x1x1x1x4xf16>
%190 = vector.shape_cast %189 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%191 = vector.shape_cast %188 : vector<1x1x1x1x1x4xf32> to vector<4xf32>
%192 = amdgpu.mfma %135 * %190 + %191 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%193 = vector.extract %130[0, 1, 3] : vector<1x1x1x1x1x4xf16> from vector<1x4x4x1x1x1x1x1x4xf16>
%194 = vector.shape_cast %193 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%195 = amdgpu.mfma %141 * %194 + %192 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%196 = vector.extract %130[0, 2, 3] : vector<1x1x1x1x1x4xf16> from vector<1x4x4x1x1x1x1x1x4xf16>
%197 = vector.shape_cast %196 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%198 = amdgpu.mfma %146 * %197 + %195 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%199 = vector.extract %130[0, 3, 3] : vector<1x1x1x1x1x4xf16> from vector<1x4x4x1x1x1x1x1x4xf16>
%200 = vector.shape_cast %199 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%201 = amdgpu.mfma %151 * %200 + %198 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%202 = vector.shape_cast %201 : vector<4xf32> to vector<1x1x1x1x1x4xf32>
%203 = vector.insert %202, %187 [0, 0, 3] : vector<1x1x1x1x1x4xf32> into vector<1x2x4x1x1x1x1x1x4xf32>
%204 = vector.extract %131[0, 1, 0] : vector<1x1x1x1x1x4xf32> from vector<1x2x4x1x1x1x1x1x4xf32>
%205 = vector.extract %78[0, 1, 0] : vector<1x1x1x1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%206 = vector.shape_cast %205 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%207 = vector.shape_cast %204 : vector<1x1x1x1x1x4xf32> to vector<4xf32>
%208 = amdgpu.mfma %206 * %136 + %207 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%209 = vector.extract %78[0, 1, 1] : vector<1x1x1x1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%210 = vector.shape_cast %209 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%211 = amdgpu.mfma %210 * %142 + %208 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%212 = vector.extract %78[0, 1, 2] : vector<1x1x1x1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%213 = vector.shape_cast %212 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%214 = amdgpu.mfma %213 * %147 + %211 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%215 = vector.extract %78[0, 1, 3] : vector<1x1x1x1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%216 = vector.shape_cast %215 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%217 = amdgpu.mfma %216 * %152 + %214 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%218 = vector.shape_cast %217 : vector<4xf32> to vector<1x1x1x1x1x4xf32>
%219 = vector.insert %218, %203 [0, 1, 0] : vector<1x1x1x1x1x4xf32> into vector<1x2x4x1x1x1x1x1x4xf32>
%220 = vector.extract %131[0, 1, 1] : vector<1x1x1x1x1x4xf32> from vector<1x2x4x1x1x1x1x1x4xf32>
%221 = vector.shape_cast %220 : vector<1x1x1x1x1x4xf32> to vector<4xf32>
%222 = amdgpu.mfma %206 * %158 + %221 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%223 = amdgpu.mfma %210 * %162 + %222 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%224 = amdgpu.mfma %213 * %165 + %223 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%225 = amdgpu.mfma %216 * %168 + %224 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%226 = vector.shape_cast %225 : vector<4xf32> to vector<1x1x1x1x1x4xf32>
%227 = vector.insert %226, %219 [0, 1, 1] : vector<1x1x1x1x1x4xf32> into vector<1x2x4x1x1x1x1x1x4xf32>
%228 = vector.extract %131[0, 1, 2] : vector<1x1x1x1x1x4xf32> from vector<1x2x4x1x1x1x1x1x4xf32>
%229 = vector.shape_cast %228 : vector<1x1x1x1x1x4xf32> to vector<4xf32>
%230 = amdgpu.mfma %206 * %174 + %229 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%231 = amdgpu.mfma %210 * %178 + %230 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%232 = amdgpu.mfma %213 * %181 + %231 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%233 = amdgpu.mfma %216 * %184 + %232 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%234 = vector.shape_cast %233 : vector<4xf32> to vector<1x1x1x1x1x4xf32>
%235 = vector.insert %234, %227 [0, 1, 2] : vector<1x1x1x1x1x4xf32> into vector<1x2x4x1x1x1x1x1x4xf32>
%236 = vector.extract %131[0, 1, 3] : vector<1x1x1x1x1x4xf32> from vector<1x2x4x1x1x1x1x1x4xf32>
%237 = vector.shape_cast %236 : vector<1x1x1x1x1x4xf32> to vector<4xf32>
%238 = amdgpu.mfma %206 * %190 + %237 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%239 = amdgpu.mfma %210 * %194 + %238 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%240 = amdgpu.mfma %213 * %197 + %239 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%241 = amdgpu.mfma %216 * %200 + %240 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%242 = vector.shape_cast %241 : vector<4xf32> to vector<1x1x1x1x1x4xf32>
%243 = vector.insert %242, %235 [0, 1, 3] : vector<1x1x1x1x1x4xf32> into vector<1x2x4x1x1x1x1x1x4xf32>
%244 = arith.truncf %243 : vector<1x2x4x1x1x1x1x1x4xf32> to vector<1x2x4x1x1x1x1x1x4xf16>
scf.yield %244 : vector<1x2x4x1x1x1x1x1x4xf16>
} {__vector_layout_fetcher_storage = [[unit, unit, unit, #iree_vector_ext.nested_layout<subgroups_per_workgroup = [1, 2, 2], batches_per_subgroup = [1, 2, 4], outers_per_batch = [1, 1, 1], threads_per_outer = [1, 4, 16], elements_per_thread = [1, 4, 1], element_order = [0, 2, 1], subgroup_basis = [1, 2, 2, 1], subgroup_active_ids = [true, true, true, false], thread_basis = [1, 4, 16]>], [#iree_vector_ext.nested_layout<subgroups_per_workgroup = [1, 2, 2], batches_per_subgroup = [1, 2, 4], outers_per_batch = [1, 1, 1], threads_per_outer = [1, 4, 16], elements_per_thread = [1, 4, 1], element_order = [0, 2, 1], subgroup_basis = [1, 2, 2, 1], subgroup_active_ids = [true, true, true, false], thread_basis = [1, 4, 16]>]]}
%subview = memref.subview %3[%workgroup_id_z, %4, %5] [1, %6, 128] [1, 1, 1] : memref<64x968x1280xf16, #hal.descriptor_type<storage_buffer>> to memref<1x?x128xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%8:6 = affine.delinearize_index %0 into (%c2, %c2, %c1, %c1, %c4, %c16) : index, index, index, index, index, index
%9 = affine.apply affine_map<()[s0, s1] -> (s0 * 32 + s1 * 4)>()[%8#0, %8#4]
%10 = affine.apply affine_map<()[s0, s1] -> (s0 * 64 + s1)>()[%8#1, %8#5]
%11 = vector.extract %7[0, 0, 0, 0, 0, 0] : vector<1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%12 = vector.transpose %11, [0, 2, 1] : vector<1x1x4xf16> to vector<1x4x1xf16>
vector.transfer_write %12, %subview[%c0, %9, %10] {in_bounds = [true, false, true]} : vector<1x4x1xf16>, memref<1x?x128xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%13 = affine.apply affine_map<()[s0, s1] -> (s0 * 64 + s1 + 16)>()[%8#1, %8#5]
%14 = vector.extract %7[0, 0, 1, 0, 0, 0] : vector<1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%15 = vector.transpose %14, [0, 2, 1] : vector<1x1x4xf16> to vector<1x4x1xf16>
vector.transfer_write %15, %subview[%c0, %9, %13] {in_bounds = [true, false, true]} : vector<1x4x1xf16>, memref<1x?x128xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%16 = affine.apply affine_map<()[s0, s1] -> (s0 * 64 + s1 + 32)>()[%8#1, %8#5]
%17 = vector.extract %7[0, 0, 2, 0, 0, 0] : vector<1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%18 = vector.transpose %17, [0, 2, 1] : vector<1x1x4xf16> to vector<1x4x1xf16>
vector.transfer_write %18, %subview[%c0, %9, %16] {in_bounds = [true, false, true]} : vector<1x4x1xf16>, memref<1x?x128xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%19 = affine.apply affine_map<()[s0, s1] -> (s0 * 64 + s1 + 48)>()[%8#1, %8#5]
%20 = vector.extract %7[0, 0, 3, 0, 0, 0] : vector<1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%21 = vector.transpose %20, [0, 2, 1] : vector<1x1x4xf16> to vector<1x4x1xf16>
vector.transfer_write %21, %subview[%c0, %9, %19] {in_bounds = [true, false, true]} : vector<1x4x1xf16>, memref<1x?x128xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%22 = affine.apply affine_map<()[s0, s1] -> (s0 * 32 + s1 * 4 + 16)>()[%8#0, %8#4]
%23 = vector.extract %7[0, 1, 0, 0, 0, 0] : vector<1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%24 = vector.transpose %23, [0, 2, 1] : vector<1x1x4xf16> to vector<1x4x1xf16>
vector.transfer_write %24, %subview[%c0, %22, %10] {in_bounds = [true, false, true]} : vector<1x4x1xf16>, memref<1x?x128xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%25 = vector.extract %7[0, 1, 1, 0, 0, 0] : vector<1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%26 = vector.transpose %25, [0, 2, 1] : vector<1x1x4xf16> to vector<1x4x1xf16>
vector.transfer_write %26, %subview[%c0, %22, %13] {in_bounds = [true, false, true]} : vector<1x4x1xf16>, memref<1x?x128xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%27 = vector.extract %7[0, 1, 2, 0, 0, 0] : vector<1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%28 = vector.transpose %27, [0, 2, 1] : vector<1x1x4xf16> to vector<1x4x1xf16>
vector.transfer_write %28, %subview[%c0, %22, %16] {in_bounds = [true, false, true]} : vector<1x4x1xf16>, memref<1x?x128xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%29 = vector.extract %7[0, 1, 3, 0, 0, 0] : vector<1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%30 = vector.transpose %29, [0, 2, 1] : vector<1x1x4xf16> to vector<1x4x1xf16>
vector.transfer_write %30, %subview[%c0, %22, %19] {in_bounds = [true, false, true]} : vector<1x4x1xf16>, memref<1x?x128xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
memref.dealloc %alloc_3 : memref<1x64x64xf16, #gpu.address_space<workgroup>>
memref.dealloc %alloc : memref<1x64x128xf16, #gpu.address_space<workgroup>>
return
}
// -----// IR Dump After MemrefCopyToLinalgPass (iree-codegen-memrefcopy-to-linalg) //----- //
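// Editor's annotation: iree-codegen-memrefcopy-to-linalg rewrites
// memref.copy ops into linalg equivalents for uniform later lowering. This
// function contains no memref.copy, so the dump below appears to repeat the
// previous IR verbatim.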
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16() {
%cst = arith.constant dense<0.000000e+00> : vector<1x2x4x1x1x1x1x1x4xf32>
%cst_0 = arith.constant dense<0.000000e+00> : vector<1x4x4x1x1x1x1x1x4xf16>
%c4 = arith.constant 4 : index
%c2 = arith.constant 2 : index
%c16 = arith.constant 16 : index
%c8 = arith.constant 8 : index
%c32 = arith.constant 32 : index
%cst_1 = arith.constant dense<0.000000e+00> : vector<1x2x4x1x1x1x1x1x4xf16>
%c20 = arith.constant 20 : index
%c1 = arith.constant 1 : index
%cst_2 = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%thread_id_x = gpu.thread_id x
%thread_id_y = gpu.thread_id y
%thread_id_z = gpu.thread_id z
%0 = affine.apply affine_map<()[s0, s1, s2] -> (s0 + s1 * 128 + s2 * 256)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%alloc = memref.alloc() : memref<1x64x128xf16, #gpu.address_space<workgroup>>
%alloc_3 = memref.alloc() : memref<1x64x64xf16, #gpu.address_space<workgroup>>
%1 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<64x968x1280xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 64 : memref<64x968x1280xf16, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<64x1280x1280xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 64 : memref<64x1280x1280xf16, #hal.descriptor_type<storage_buffer>>
%3 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<64x968x1280xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %3, 64 : memref<64x968x1280xf16, #hal.descriptor_type<storage_buffer>>
%workgroup_id_z = hal.interface.workgroup.id[2] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y]
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%5 = affine.apply affine_map<()[s0] -> (s0 * 128)>()[%workgroup_id_x]
%6 = affine.min affine_map<()[s0] -> (s0 * -64 + 968, 64)>()[%workgroup_id_y]
%7 = scf.for %arg0 = %c0 to %c20 step %c1 iter_args(%arg1 = %cst_1) -> (vector<1x2x4x1x1x1x1x1x4xf16>) {
%31 = affine.delinearize_index %arg0 into (%c20) : index
%32 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y]
%33 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%31]
%subview_4 = memref.subview %1[%workgroup_id_z, %32, %33] [1, %6, 64] [1, 1, 1] : memref<64x968x1280xf16, #hal.descriptor_type<storage_buffer>> to memref<1x?x64xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%34:2 = affine.delinearize_index %0 into (%c32, %c8) : index, index
%35 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%34#1]
%36 = vector.transfer_read %subview_4[%c0, %34#0, %35], %cst_2 {in_bounds = [true, false, true]} : memref<1x?x64xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<1x1x8xf16>
%37 = affine.apply affine_map<()[s0] -> (s0 + 32)>()[%34#0]
%38 = vector.transfer_read %subview_4[%c0, %37, %35], %cst_2 {in_bounds = [true, false, true]} : memref<1x?x64xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<1x1x8xf16>
%39:2 = affine.delinearize_index %0 into (%c16, %c16) : index, index
%40 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%39#1]
%41 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 64)>()[%39#0, %31]
%42 = affine.apply affine_map<()[s0, s1] -> (s0 * 128 + s1 * 8)>()[%workgroup_id_x, %39#1]
%43 = vector.transfer_read %2[%workgroup_id_z, %41, %42], %cst_2 {in_bounds = [true, true, true]} : memref<64x1280x1280xf16, #hal.descriptor_type<storage_buffer>>, vector<1x1x8xf16>
%44 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 64 + 16)>()[%39#0, %31]
%45 = affine.apply affine_map<()[s0, s1] -> (s0 * 128 + s1 * 8)>()[%workgroup_id_x, %39#1]
%46 = vector.transfer_read %2[%workgroup_id_z, %44, %45], %cst_2 {in_bounds = [true, true, true]} : memref<64x1280x1280xf16, #hal.descriptor_type<storage_buffer>>, vector<1x1x8xf16>
%47 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 64 + 32)>()[%39#0, %31]
%48 = affine.apply affine_map<()[s0, s1] -> (s0 * 128 + s1 * 8)>()[%workgroup_id_x, %39#1]
%49 = vector.transfer_read %2[%workgroup_id_z, %47, %48], %cst_2 {in_bounds = [true, true, true]} : memref<64x1280x1280xf16, #hal.descriptor_type<storage_buffer>>, vector<1x1x8xf16>
%50 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 64 + 48)>()[%39#0, %31]
%51 = affine.apply affine_map<()[s0, s1] -> (s0 * 128 + s1 * 8)>()[%workgroup_id_x, %39#1]
%52 = vector.transfer_read %2[%workgroup_id_z, %50, %51], %cst_2 {in_bounds = [true, true, true]} : memref<64x1280x1280xf16, #hal.descriptor_type<storage_buffer>>, vector<1x1x8xf16>
gpu.barrier
vector.transfer_write %36, %alloc_3[%c0, %34#0, %35] {in_bounds = [true, true, true]} : vector<1x1x8xf16>, memref<1x64x64xf16, #gpu.address_space<workgroup>>
vector.transfer_write %38, %alloc_3[%c0, %37, %35] {in_bounds = [true, true, true]} : vector<1x1x8xf16>, memref<1x64x64xf16, #gpu.address_space<workgroup>>
gpu.barrier
vector.transfer_write %43, %alloc[%c0, %39#0, %40] {in_bounds = [true, true, true]} : vector<1x1x8xf16>, memref<1x64x128xf16, #gpu.address_space<workgroup>>
%53 = affine.apply affine_map<()[s0] -> (s0 + 16)>()[%39#0]
vector.transfer_write %46, %alloc[%c0, %53, %40] {in_bounds = [true, true, true]} : vector<1x1x8xf16>, memref<1x64x128xf16, #gpu.address_space<workgroup>>
%54 = affine.apply affine_map<()[s0] -> (s0 + 32)>()[%39#0]
vector.transfer_write %49, %alloc[%c0, %54, %40] {in_bounds = [true, true, true]} : vector<1x1x8xf16>, memref<1x64x128xf16, #gpu.address_space<workgroup>>
%55 = affine.apply affine_map<()[s0] -> (s0 + 48)>()[%39#0]
vector.transfer_write %52, %alloc[%c0, %55, %40] {in_bounds = [true, true, true]} : vector<1x1x8xf16>, memref<1x64x128xf16, #gpu.address_space<workgroup>>
gpu.barrier
%56:6 = affine.delinearize_index %0 into (%c2, %c2, %c1, %c1, %c4, %c16) : index, index, index, index, index, index
%57 = affine.apply affine_map<()[s0, s1] -> (s0 * 32 + s1)>()[%56#0, %56#5]
%58 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%56#4]
%59 = vector.transfer_read %alloc_3[%c0, %57, %58], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x64xf16, #gpu.address_space<workgroup>>, vector<1x1x4xf16>
%60 = vector.insert_strided_slice %59, %cst_1 {offsets = [0, 0, 0, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x2x4x1x1x1x1x1x4xf16>
%61 = affine.apply affine_map<()[s0] -> (s0 * 4 + 16)>()[%56#4]
%62 = vector.transfer_read %alloc_3[%c0, %57, %61], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x64xf16, #gpu.address_space<workgroup>>, vector<1x1x4xf16>
%63 = vector.insert_strided_slice %62, %60 {offsets = [0, 0, 1, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x2x4x1x1x1x1x1x4xf16>
%64 = affine.apply affine_map<()[s0] -> (s0 * 4 + 32)>()[%56#4]
%65 = vector.transfer_read %alloc_3[%c0, %57, %64], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x64xf16, #gpu.address_space<workgroup>>, vector<1x1x4xf16>
%66 = vector.insert_strided_slice %65, %63 {offsets = [0, 0, 2, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x2x4x1x1x1x1x1x4xf16>
%67 = affine.apply affine_map<()[s0] -> (s0 * 4 + 48)>()[%56#4]
%68 = vector.transfer_read %alloc_3[%c0, %57, %67], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x64xf16, #gpu.address_space<workgroup>>, vector<1x1x4xf16>
%69 = vector.insert_strided_slice %68, %66 {offsets = [0, 0, 3, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x2x4x1x1x1x1x1x4xf16>
%70 = affine.apply affine_map<()[s0, s1] -> (s0 * 32 + s1 + 16)>()[%56#0, %56#5]
%71 = vector.transfer_read %alloc_3[%c0, %70, %58], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x64xf16, #gpu.address_space<workgroup>>, vector<1x1x4xf16>
%72 = vector.insert_strided_slice %71, %69 {offsets = [0, 1, 0, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x2x4x1x1x1x1x1x4xf16>
%73 = vector.transfer_read %alloc_3[%c0, %70, %61], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x64xf16, #gpu.address_space<workgroup>>, vector<1x1x4xf16>
%74 = vector.insert_strided_slice %73, %72 {offsets = [0, 1, 1, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x2x4x1x1x1x1x1x4xf16>
%75 = vector.transfer_read %alloc_3[%c0, %70, %64], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x64xf16, #gpu.address_space<workgroup>>, vector<1x1x4xf16>
%76 = vector.insert_strided_slice %75, %74 {offsets = [0, 1, 2, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x2x4x1x1x1x1x1x4xf16>
%77 = vector.transfer_read %alloc_3[%c0, %70, %67], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x64xf16, #gpu.address_space<workgroup>>, vector<1x1x4xf16>
%78 = vector.insert_strided_slice %77, %76 {offsets = [0, 1, 3, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x2x4x1x1x1x1x1x4xf16>
%79 = affine.apply affine_map<()[s0, s1] -> (s0 * 64 + s1)>()[%56#1, %56#5]
%80 = vector.transfer_read %alloc[%c0, %58, %79], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x128xf16, #gpu.address_space<workgroup>>, vector<1x4x1xf16>
%81 = vector.transpose %80, [0, 2, 1] : vector<1x4x1xf16> to vector<1x1x4xf16>
%82 = vector.insert_strided_slice %81, %cst_0 {offsets = [0, 0, 0, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x4x4x1x1x1x1x1x4xf16>
%83 = affine.apply affine_map<()[s0, s1] -> (s0 * 64 + s1 + 16)>()[%56#1, %56#5]
%84 = vector.transfer_read %alloc[%c0, %58, %83], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x128xf16, #gpu.address_space<workgroup>>, vector<1x4x1xf16>
%85 = vector.transpose %84, [0, 2, 1] : vector<1x4x1xf16> to vector<1x1x4xf16>
%86 = vector.insert_strided_slice %85, %82 {offsets = [0, 0, 1, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x4x4x1x1x1x1x1x4xf16>
%87 = affine.apply affine_map<()[s0, s1] -> (s0 * 64 + s1 + 32)>()[%56#1, %56#5]
%88 = vector.transfer_read %alloc[%c0, %58, %87], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x128xf16, #gpu.address_space<workgroup>>, vector<1x4x1xf16>
%89 = vector.transpose %88, [0, 2, 1] : vector<1x4x1xf16> to vector<1x1x4xf16>
%90 = vector.insert_strided_slice %89, %86 {offsets = [0, 0, 2, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x4x4x1x1x1x1x1x4xf16>
%91 = affine.apply affine_map<()[s0, s1] -> (s0 * 64 + s1 + 48)>()[%56#1, %56#5]
%92 = vector.transfer_read %alloc[%c0, %58, %91], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x128xf16, #gpu.address_space<workgroup>>, vector<1x4x1xf16>
%93 = vector.transpose %92, [0, 2, 1] : vector<1x4x1xf16> to vector<1x1x4xf16>
%94 = vector.insert_strided_slice %93, %90 {offsets = [0, 0, 3, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x4x4x1x1x1x1x1x4xf16>
%95 = vector.transfer_read %alloc[%c0, %61, %79], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x128xf16, #gpu.address_space<workgroup>>, vector<1x4x1xf16>
%96 = vector.transpose %95, [0, 2, 1] : vector<1x4x1xf16> to vector<1x1x4xf16>
%97 = vector.insert_strided_slice %96, %94 {offsets = [0, 1, 0, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x4x4x1x1x1x1x1x4xf16>
%98 = vector.transfer_read %alloc[%c0, %61, %83], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x128xf16, #gpu.address_space<workgroup>>, vector<1x4x1xf16>
%99 = vector.transpose %98, [0, 2, 1] : vector<1x4x1xf16> to vector<1x1x4xf16>
%100 = vector.insert_strided_slice %99, %97 {offsets = [0, 1, 1, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x4x4x1x1x1x1x1x4xf16>
%101 = vector.transfer_read %alloc[%c0, %61, %87], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x128xf16, #gpu.address_space<workgroup>>, vector<1x4x1xf16>
%102 = vector.transpose %101, [0, 2, 1] : vector<1x4x1xf16> to vector<1x1x4xf16>
%103 = vector.insert_strided_slice %102, %100 {offsets = [0, 1, 2, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x4x4x1x1x1x1x1x4xf16>
%104 = vector.transfer_read %alloc[%c0, %61, %91], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x128xf16, #gpu.address_space<workgroup>>, vector<1x4x1xf16>
%105 = vector.transpose %104, [0, 2, 1] : vector<1x4x1xf16> to vector<1x1x4xf16>
%106 = vector.insert_strided_slice %105, %103 {offsets = [0, 1, 3, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x4x4x1x1x1x1x1x4xf16>
%107 = vector.transfer_read %alloc[%c0, %64, %79], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x128xf16, #gpu.address_space<workgroup>>, vector<1x4x1xf16>
%108 = vector.transpose %107, [0, 2, 1] : vector<1x4x1xf16> to vector<1x1x4xf16>
%109 = vector.insert_strided_slice %108, %106 {offsets = [0, 2, 0, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x4x4x1x1x1x1x1x4xf16>
%110 = vector.transfer_read %alloc[%c0, %64, %83], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x128xf16, #gpu.address_space<workgroup>>, vector<1x4x1xf16>
%111 = vector.transpose %110, [0, 2, 1] : vector<1x4x1xf16> to vector<1x1x4xf16>
%112 = vector.insert_strided_slice %111, %109 {offsets = [0, 2, 1, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x4x4x1x1x1x1x1x4xf16>
%113 = vector.transfer_read %alloc[%c0, %64, %87], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x128xf16, #gpu.address_space<workgroup>>, vector<1x4x1xf16>
%114 = vector.transpose %113, [0, 2, 1] : vector<1x4x1xf16> to vector<1x1x4xf16>
%115 = vector.insert_strided_slice %114, %112 {offsets = [0, 2, 2, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x4x4x1x1x1x1x1x4xf16>
%116 = vector.transfer_read %alloc[%c0, %64, %91], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x128xf16, #gpu.address_space<workgroup>>, vector<1x4x1xf16>
%117 = vector.transpose %116, [0, 2, 1] : vector<1x4x1xf16> to vector<1x1x4xf16>
%118 = vector.insert_strided_slice %117, %115 {offsets = [0, 2, 3, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x4x4x1x1x1x1x1x4xf16>
%119 = vector.transfer_read %alloc[%c0, %67, %79], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x128xf16, #gpu.address_space<workgroup>>, vector<1x4x1xf16>
%120 = vector.transpose %119, [0, 2, 1] : vector<1x4x1xf16> to vector<1x1x4xf16>
%121 = vector.insert_strided_slice %120, %118 {offsets = [0, 3, 0, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x4x4x1x1x1x1x1x4xf16>
%122 = vector.transfer_read %alloc[%c0, %67, %83], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x128xf16, #gpu.address_space<workgroup>>, vector<1x4x1xf16>
%123 = vector.transpose %122, [0, 2, 1] : vector<1x4x1xf16> to vector<1x1x4xf16>
%124 = vector.insert_strided_slice %123, %121 {offsets = [0, 3, 1, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x4x4x1x1x1x1x1x4xf16>
%125 = vector.transfer_read %alloc[%c0, %67, %87], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x128xf16, #gpu.address_space<workgroup>>, vector<1x4x1xf16>
%126 = vector.transpose %125, [0, 2, 1] : vector<1x4x1xf16> to vector<1x1x4xf16>
%127 = vector.insert_strided_slice %126, %124 {offsets = [0, 3, 2, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x4x4x1x1x1x1x1x4xf16>
%128 = vector.transfer_read %alloc[%c0, %67, %91], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x128xf16, #gpu.address_space<workgroup>>, vector<1x4x1xf16>
%129 = vector.transpose %128, [0, 2, 1] : vector<1x4x1xf16> to vector<1x1x4xf16>
%130 = vector.insert_strided_slice %129, %127 {offsets = [0, 3, 3, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x4x4x1x1x1x1x1x4xf16>
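    // Note (annotation, not part of the original dump): the sixteen
    // transfer_read / transpose / insert_strided_slice triples above gather the
    // RHS tile for this k-step from workgroup memory. Each lane reads a 1x4x1
    // column of %alloc (stored batch x K x N) and transposes it into a 1x1x4
    // k-contiguous fragment, packing a 4x4 grid of fragments (4 k-batches x
    // 4 n-batches) into vector<1x4x4x1x1x1x1x1x4xf16>.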
%131 = arith.extf %arg1 : vector<1x2x4x1x1x1x1x1x4xf16> to vector<1x2x4x1x1x1x1x1x4xf32>
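    // Note: the loop-carried accumulator is kept in f16 between k-steps; it is
    // widened to f32 here so the MFMAs below accumulate in f32, then narrowed
    // back with arith.truncf just before scf.yield. Partial sums are therefore
    // rounded to f16 once per iteration of the k-loop.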
%132 = vector.extract %131[0, 0, 0] : vector<1x1x1x1x1x4xf32> from vector<1x2x4x1x1x1x1x1x4xf32>
%133 = vector.extract %78[0, 0, 0] : vector<1x1x1x1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%134 = vector.extract %130[0, 0, 0] : vector<1x1x1x1x1x4xf16> from vector<1x4x4x1x1x1x1x1x4xf16>
%135 = vector.shape_cast %133 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%136 = vector.shape_cast %134 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%137 = vector.shape_cast %132 : vector<1x1x1x1x1x4xf32> to vector<4xf32>
%138 = amdgpu.mfma %135 * %136 + %137 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%139 = vector.extract %78[0, 0, 1] : vector<1x1x1x1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%140 = vector.extract %130[0, 1, 0] : vector<1x1x1x1x1x4xf16> from vector<1x4x4x1x1x1x1x1x4xf16>
%141 = vector.shape_cast %139 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%142 = vector.shape_cast %140 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%143 = amdgpu.mfma %141 * %142 + %138 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%144 = vector.extract %78[0, 0, 2] : vector<1x1x1x1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%145 = vector.extract %130[0, 2, 0] : vector<1x1x1x1x1x4xf16> from vector<1x4x4x1x1x1x1x1x4xf16>
%146 = vector.shape_cast %144 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%147 = vector.shape_cast %145 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%148 = amdgpu.mfma %146 * %147 + %143 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%149 = vector.extract %78[0, 0, 3] : vector<1x1x1x1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%150 = vector.extract %130[0, 3, 0] : vector<1x1x1x1x1x4xf16> from vector<1x4x4x1x1x1x1x1x4xf16>
%151 = vector.shape_cast %149 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%152 = vector.shape_cast %150 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%153 = amdgpu.mfma %151 * %152 + %148 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
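    // Note: each amdgpu.mfma with m = n = k = 16 corresponds to the CDNA
    // v_mfma_f32_16x16x16_f16 intrinsic: per lane of a 64-lane wavefront it
    // takes a vector<4xf16> A fragment, a vector<4xf16> B fragment, and a
    // vector<4xf32> accumulator. The chain of four mfmas above reduces this
    // iteration's full 64-wide k-slice (4 batches of k = 16) into a single
    // 16x16 accumulator tile.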
%154 = vector.shape_cast %153 : vector<4xf32> to vector<1x1x1x1x1x4xf32>
%155 = vector.insert %154, %cst [0, 0, 0] : vector<1x1x1x1x1x4xf32> into vector<1x2x4x1x1x1x1x1x4xf32>
%156 = vector.extract %131[0, 0, 1] : vector<1x1x1x1x1x4xf32> from vector<1x2x4x1x1x1x1x1x4xf32>
%157 = vector.extract %130[0, 0, 1] : vector<1x1x1x1x1x4xf16> from vector<1x4x4x1x1x1x1x1x4xf16>
%158 = vector.shape_cast %157 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%159 = vector.shape_cast %156 : vector<1x1x1x1x1x4xf32> to vector<4xf32>
%160 = amdgpu.mfma %135 * %158 + %159 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%161 = vector.extract %130[0, 1, 1] : vector<1x1x1x1x1x4xf16> from vector<1x4x4x1x1x1x1x1x4xf16>
%162 = vector.shape_cast %161 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%163 = amdgpu.mfma %141 * %162 + %160 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%164 = vector.extract %130[0, 2, 1] : vector<1x1x1x1x1x4xf16> from vector<1x4x4x1x1x1x1x1x4xf16>
%165 = vector.shape_cast %164 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%166 = amdgpu.mfma %146 * %165 + %163 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%167 = vector.extract %130[0, 3, 1] : vector<1x1x1x1x1x4xf16> from vector<1x4x4x1x1x1x1x1x4xf16>
%168 = vector.shape_cast %167 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%169 = amdgpu.mfma %151 * %168 + %166 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%170 = vector.shape_cast %169 : vector<4xf32> to vector<1x1x1x1x1x4xf32>
%171 = vector.insert %170, %155 [0, 0, 1] : vector<1x1x1x1x1x4xf32> into vector<1x2x4x1x1x1x1x1x4xf32>
%172 = vector.extract %131[0, 0, 2] : vector<1x1x1x1x1x4xf32> from vector<1x2x4x1x1x1x1x1x4xf32>
%173 = vector.extract %130[0, 0, 2] : vector<1x1x1x1x1x4xf16> from vector<1x4x4x1x1x1x1x1x4xf16>
%174 = vector.shape_cast %173 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%175 = vector.shape_cast %172 : vector<1x1x1x1x1x4xf32> to vector<4xf32>
%176 = amdgpu.mfma %135 * %174 + %175 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%177 = vector.extract %130[0, 1, 2] : vector<1x1x1x1x1x4xf16> from vector<1x4x4x1x1x1x1x1x4xf16>
%178 = vector.shape_cast %177 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%179 = amdgpu.mfma %141 * %178 + %176 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%180 = vector.extract %130[0, 2, 2] : vector<1x1x1x1x1x4xf16> from vector<1x4x4x1x1x1x1x1x4xf16>
%181 = vector.shape_cast %180 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%182 = amdgpu.mfma %146 * %181 + %179 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%183 = vector.extract %130[0, 3, 2] : vector<1x1x1x1x1x4xf16> from vector<1x4x4x1x1x1x1x1x4xf16>
%184 = vector.shape_cast %183 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%185 = amdgpu.mfma %151 * %184 + %182 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%186 = vector.shape_cast %185 : vector<4xf32> to vector<1x1x1x1x1x4xf32>
%187 = vector.insert %186, %171 [0, 0, 2] : vector<1x1x1x1x1x4xf32> into vector<1x2x4x1x1x1x1x1x4xf32>
%188 = vector.extract %131[0, 0, 3] : vector<1x1x1x1x1x4xf32> from vector<1x2x4x1x1x1x1x1x4xf32>
%189 = vector.extract %130[0, 0, 3] : vector<1x1x1x1x1x4xf16> from vector<1x4x4x1x1x1x1x1x4xf16>
%190 = vector.shape_cast %189 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%191 = vector.shape_cast %188 : vector<1x1x1x1x1x4xf32> to vector<4xf32>
%192 = amdgpu.mfma %135 * %190 + %191 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%193 = vector.extract %130[0, 1, 3] : vector<1x1x1x1x1x4xf16> from vector<1x4x4x1x1x1x1x1x4xf16>
%194 = vector.shape_cast %193 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%195 = amdgpu.mfma %141 * %194 + %192 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%196 = vector.extract %130[0, 2, 3] : vector<1x1x1x1x1x4xf16> from vector<1x4x4x1x1x1x1x1x4xf16>
%197 = vector.shape_cast %196 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%198 = amdgpu.mfma %146 * %197 + %195 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%199 = vector.extract %130[0, 3, 3] : vector<1x1x1x1x1x4xf16> from vector<1x4x4x1x1x1x1x1x4xf16>
%200 = vector.shape_cast %199 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%201 = amdgpu.mfma %151 * %200 + %198 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%202 = vector.shape_cast %201 : vector<4xf32> to vector<1x1x1x1x1x4xf32>
%203 = vector.insert %202, %187 [0, 0, 3] : vector<1x1x1x1x1x4xf32> into vector<1x2x4x1x1x1x1x1x4xf32>
%204 = vector.extract %131[0, 1, 0] : vector<1x1x1x1x1x4xf32> from vector<1x2x4x1x1x1x1x1x4xf32>
%205 = vector.extract %78[0, 1, 0] : vector<1x1x1x1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%206 = vector.shape_cast %205 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%207 = vector.shape_cast %204 : vector<1x1x1x1x1x4xf32> to vector<4xf32>
%208 = amdgpu.mfma %206 * %136 + %207 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%209 = vector.extract %78[0, 1, 1] : vector<1x1x1x1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%210 = vector.shape_cast %209 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%211 = amdgpu.mfma %210 * %142 + %208 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%212 = vector.extract %78[0, 1, 2] : vector<1x1x1x1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%213 = vector.shape_cast %212 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%214 = amdgpu.mfma %213 * %147 + %211 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%215 = vector.extract %78[0, 1, 3] : vector<1x1x1x1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%216 = vector.shape_cast %215 : vector<1x1x1x1x1x4xf16> to vector<4xf16>
%217 = amdgpu.mfma %216 * %152 + %214 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%218 = vector.shape_cast %217 : vector<4xf32> to vector<1x1x1x1x1x4xf32>
%219 = vector.insert %218, %203 [0, 1, 0] : vector<1x1x1x1x1x4xf32> into vector<1x2x4x1x1x1x1x1x4xf32>
%220 = vector.extract %131[0, 1, 1] : vector<1x1x1x1x1x4xf32> from vector<1x2x4x1x1x1x1x1x4xf32>
%221 = vector.shape_cast %220 : vector<1x1x1x1x1x4xf32> to vector<4xf32>
%222 = amdgpu.mfma %206 * %158 + %221 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%223 = amdgpu.mfma %210 * %162 + %222 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%224 = amdgpu.mfma %213 * %165 + %223 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%225 = amdgpu.mfma %216 * %168 + %224 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%226 = vector.shape_cast %225 : vector<4xf32> to vector<1x1x1x1x1x4xf32>
%227 = vector.insert %226, %219 [0, 1, 1] : vector<1x1x1x1x1x4xf32> into vector<1x2x4x1x1x1x1x1x4xf32>
%228 = vector.extract %131[0, 1, 2] : vector<1x1x1x1x1x4xf32> from vector<1x2x4x1x1x1x1x1x4xf32>
%229 = vector.shape_cast %228 : vector<1x1x1x1x1x4xf32> to vector<4xf32>
%230 = amdgpu.mfma %206 * %174 + %229 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%231 = amdgpu.mfma %210 * %178 + %230 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%232 = amdgpu.mfma %213 * %181 + %231 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%233 = amdgpu.mfma %216 * %184 + %232 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%234 = vector.shape_cast %233 : vector<4xf32> to vector<1x1x1x1x1x4xf32>
%235 = vector.insert %234, %227 [0, 1, 2] : vector<1x1x1x1x1x4xf32> into vector<1x2x4x1x1x1x1x1x4xf32>
%236 = vector.extract %131[0, 1, 3] : vector<1x1x1x1x1x4xf32> from vector<1x2x4x1x1x1x1x1x4xf32>
%237 = vector.shape_cast %236 : vector<1x1x1x1x1x4xf32> to vector<4xf32>
%238 = amdgpu.mfma %206 * %190 + %237 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%239 = amdgpu.mfma %210 * %194 + %238 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%240 = amdgpu.mfma %213 * %197 + %239 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%241 = amdgpu.mfma %216 * %200 + %240 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%242 = vector.shape_cast %241 : vector<4xf32> to vector<1x1x1x1x1x4xf32>
%243 = vector.insert %242, %235 [0, 1, 3] : vector<1x1x1x1x1x4xf32> into vector<1x2x4x1x1x1x1x1x4xf32>
%244 = arith.truncf %243 : vector<1x2x4x1x1x1x1x1x4xf32> to vector<1x2x4x1x1x1x1x1x4xf16>
scf.yield %244 : vector<1x2x4x1x1x1x1x1x4xf16>
} {__vector_layout_fetcher_storage = [[unit, unit, unit, #iree_vector_ext.nested_layout<subgroups_per_workgroup = [1, 2, 2], batches_per_subgroup = [1, 2, 4], outers_per_batch = [1, 1, 1], threads_per_outer = [1, 4, 16], elements_per_thread = [1, 4, 1], element_order = [0, 2, 1], subgroup_basis = [1, 2, 2, 1], subgroup_active_ids= [true, true, true, false], thread_basis = [1, 4, 16]>], [#iree_vector_ext.nested_layout<subgroups_per_workgroup = [1, 2, 2], batches_per_subgroup = [1, 2, 4], outers_per_batch = [1, 1, 1], threads_per_outer = [1, 4, 16], elements_per_thread = [1, 4, 1], element_order = [0, 2, 1], subgroup_basis = [1, 2, 2, 1], subgroup_active_ids= [true, true, true, false], thread_basis = [1, 4, 16]>]]}
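  // Note: the __vector_layout_fetcher_storage attribute above appears to record
  // how the loop-carried vectors are distributed: per workgroup, 1x2x2
  // subgroups, each holding 1x2x4 batches of 1x1x1 outers, with 1x4x16 threads
  // per outer and 1x4x1 elements per thread; element_order = [0, 2, 1] keeps
  // the k-contiguous elements innermost in registers.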
%subview = memref.subview %3[%workgroup_id_z, %4, %5] [1, %6, 128] [1, 1, 1] : memref<64x968x1280xf16, #hal.descriptor_type<storage_buffer>> to memref<1x?x128xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%8:6 = affine.delinearize_index %0 into (%c2, %c2, %c1, %c1, %c4, %c16) : index, index, index, index, index, index
%9 = affine.apply affine_map<()[s0, s1] -> (s0 * 32 + s1 * 4)>()[%8#0, %8#4]
%10 = affine.apply affine_map<()[s0, s1] -> (s0 * 64 + s1)>()[%8#1, %8#5]
%11 = vector.extract %7[0, 0, 0, 0, 0, 0] : vector<1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%12 = vector.transpose %11, [0, 2, 1] : vector<1x1x4xf16> to vector<1x4x1xf16>
vector.transfer_write %12, %subview[%c0, %9, %10] {in_bounds = [true, false, true]} : vector<1x4x1xf16>, memref<1x?x128xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%13 = affine.apply affine_map<()[s0, s1] -> (s0 * 64 + s1 + 16)>()[%8#1, %8#5]
%14 = vector.extract %7[0, 0, 1, 0, 0, 0] : vector<1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%15 = vector.transpose %14, [0, 2, 1] : vector<1x1x4xf16> to vector<1x4x1xf16>
vector.transfer_write %15, %subview[%c0, %9, %13] {in_bounds = [true, false, true]} : vector<1x4x1xf16>, memref<1x?x128xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%16 = affine.apply affine_map<()[s0, s1] -> (s0 * 64 + s1 + 32)>()[%8#1, %8#5]
%17 = vector.extract %7[0, 0, 2, 0, 0, 0] : vector<1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%18 = vector.transpose %17, [0, 2, 1] : vector<1x1x4xf16> to vector<1x4x1xf16>
vector.transfer_write %18, %subview[%c0, %9, %16] {in_bounds = [true, false, true]} : vector<1x4x1xf16>, memref<1x?x128xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%19 = affine.apply affine_map<()[s0, s1] -> (s0 * 64 + s1 + 48)>()[%8#1, %8#5]
%20 = vector.extract %7[0, 0, 3, 0, 0, 0] : vector<1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%21 = vector.transpose %20, [0, 2, 1] : vector<1x1x4xf16> to vector<1x4x1xf16>
vector.transfer_write %21, %subview[%c0, %9, %19] {in_bounds = [true, false, true]} : vector<1x4x1xf16>, memref<1x?x128xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%22 = affine.apply affine_map<()[s0, s1] -> (s0 * 32 + s1 * 4 + 16)>()[%8#0, %8#4]
%23 = vector.extract %7[0, 1, 0, 0, 0, 0] : vector<1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%24 = vector.transpose %23, [0, 2, 1] : vector<1x1x4xf16> to vector<1x4x1xf16>
vector.transfer_write %24, %subview[%c0, %22, %10] {in_bounds = [true, false, true]} : vector<1x4x1xf16>, memref<1x?x128xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%25 = vector.extract %7[0, 1, 1, 0, 0, 0] : vector<1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%26 = vector.transpose %25, [0, 2, 1] : vector<1x1x4xf16> to vector<1x4x1xf16>
vector.transfer_write %26, %subview[%c0, %22, %13] {in_bounds = [true, false, true]} : vector<1x4x1xf16>, memref<1x?x128xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%27 = vector.extract %7[0, 1, 2, 0, 0, 0] : vector<1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%28 = vector.transpose %27, [0, 2, 1] : vector<1x1x4xf16> to vector<1x4x1xf16>
vector.transfer_write %28, %subview[%c0, %22, %16] {in_bounds = [true, false, true]} : vector<1x4x1xf16>, memref<1x?x128xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%29 = vector.extract %7[0, 1, 3, 0, 0, 0] : vector<1x1x4xf16> from vector<1x2x4x1x1x1x1x1x4xf16>
%30 = vector.transpose %29, [0, 2, 1] : vector<1x1x4xf16> to vector<1x4x1xf16>
vector.transfer_write %30, %subview[%c0, %22, %19] {in_bounds = [true, false, true]} : vector<1x4x1xf16>, memref<1x?x128xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
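  // Note: writeback transposes each of the thread's eight 1x1x4 accumulator
  // fragments back to 1x4x1 and stores them to the output subview. The M
  // dimension is marked potentially out of bounds (in_bounds = [true, false,
  // true]) because 968 = 15 * 64 + 8: the last workgroup row covers only a
  // partial 64-row tile, clamped by %6.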
memref.dealloc %alloc_3 : memref<1x64x64xf16, #gpu.address_space<workgroup>>
memref.dealloc %alloc : memref<1x64x128xf16, #gpu.address_space<workgroup>>
return
}
// -----// IR Dump After GPUDistributeSharedMemoryCopy (iree-codegen-gpu-distribute-shared-memory-copy) //----- //
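// Note: this pass distributes copies to and from workgroup memory across the
// threads of the workgroup; in the portion of the dump visible below, the
// global-to-LDS staging already appears as per-thread vector transfers.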
func.func @batch_matmul_dispatch_0_batch_matmul_64x968x1280x1280_f16() {
%cst = arith.constant dense<0.000000e+00> : vector<1x2x4x1x1x1x1x1x4xf32>
%cst_0 = arith.constant dense<0.000000e+00> : vector<1x4x4x1x1x1x1x1x4xf16>
%c4 = arith.constant 4 : index
%c2 = arith.constant 2 : index
%c16 = arith.constant 16 : index
%c8 = arith.constant 8 : index
%c32 = arith.constant 32 : index
%cst_1 = arith.constant dense<0.000000e+00> : vector<1x2x4x1x1x1x1x1x4xf16>
%c20 = arith.constant 20 : index
%c1 = arith.constant 1 : index
%cst_2 = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%thread_id_x = gpu.thread_id x
%thread_id_y = gpu.thread_id y
%thread_id_z = gpu.thread_id z
%0 = affine.apply affine_map<()[s0, s1, s2] -> (s0 + s1 * 128 + s2 * 256)>()[%thread_id_x, %thread_id_y, %thread_id_z]
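  // Note: %0 linearizes the 3-D thread id into a flat id; the map implies a
  // workgroup of 128 threads in x and 2 in y, i.e. 256 threads total, which is
  // four 64-lane waves on gfx942.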
%alloc = memref.alloc() : memref<1x64x128xf16, #gpu.address_space<workgroup>>
%alloc_3 = memref.alloc() : memref<1x64x64xf16, #gpu.address_space<workgroup>>
%1 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<64x968x1280xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 64 : memref<64x968x1280xf16, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<64x1280x1280xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 64 : memref<64x1280x1280xf16, #hal.descriptor_type<storage_buffer>>
%3 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<64x968x1280xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %3, 64 : memref<64x968x1280xf16, #hal.descriptor_type<storage_buffer>>
%workgroup_id_z = hal.interface.workgroup.id[2] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y]
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%5 = affine.apply affine_map<()[s0] -> (s0 * 128)>()[%workgroup_id_x]
%6 = affine.min affine_map<()[s0] -> (s0 * -64 + 968, 64)>()[%workgroup_id_y]
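  // Note: each workgroup owns a 64x128 (M x N) output tile of one batch, and
  // the loop below consumes K = 1280 in 20 steps of 64. %6 clamps the M extent
  // for the ragged final tile, since 968 is not a multiple of 64.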
%7 = scf.for %arg0 = %c0 to %c20 step %c1 iter_args(%arg1 = %cst_1) -> (vector<1x2x4x1x1x1x1x1x4xf16>) {
%31 = affine.delinearize_index %arg0 into (%c20) : index
%32 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y]
%33 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%31]
%subview_4 = memref.subview %1[%workgroup_id_z, %32, %33] [1, %6, 64] [1, 1, 1] : memref<64x968x1280xf16, #hal.descriptor_type<storage_buffer>> to memref<1x?x64xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%34:2 = affine.delinearize_index %0 into (%c32, %c8) : index, index
%35 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%34#1]
%36 = vector.transfer_read %subview_4[%c0, %34#0, %35], %cst_2 {in_bounds = [true, false, true]} : memref<1x?x64xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<1x1x8xf16>
%37 = affine.apply affine_map<()[s0] -> (s0 + 32)>()[%34#0]
%38 = vector.transfer_read %subview_4[%c0, %37, %35], %cst_2 {in_bounds = [true, false, true]} : memref<1x?x64xf16, strided<[1239040, 1280, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<1x1x8xf16>
%39:2 = affine.delinearize_index %0 into (%c16, %c16) : index, index
%40 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%39#1]
%41 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 64)>()[%39#0, %31]
%42 = affine.apply affine_map<()[s0, s1] -> (s0 * 128 + s1 * 8)>()[%workgroup_id_x, %39#1]
%43 = vector.transfer_read %2[%workgroup_id_z, %41, %42], %cst_2 {in_bounds = [true, true, true]} : memref<64x1280x1280xf16, #hal.descriptor_type<storage_buffer>>, vector<1x1x8xf16>
%44 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 64 + 16)>()[%39#0, %31]
%45 = affine.apply affine_map<()[s0, s1] -> (s0 * 128 + s1 * 8)>()[%workgroup_id_x, %39#1]
%46 = vector.transfer_read %2[%workgroup_id_z, %44, %45], %cst_2 {in_bounds = [true, true, true]} : memref<64x1280x1280xf16, #hal.descriptor_type<storage_buffer>>, vector<1x1x8xf16>
%47 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 64 + 32)>()[%39#0, %31]
%48 = affine.apply affine_map<()[s0, s1] -> (s0 * 128 + s1 * 8)>()[%workgroup_id_x, %39#1]
%49 = vector.transfer_read %2[%workgroup_id_z, %47, %48], %cst_2 {in_bounds = [true, true, true]} : memref<64x1280x1280xf16, #hal.descriptor_type<storage_buffer>>, vector<1x1x8xf16>
%50 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 64 + 48)>()[%39#0, %31]
%51 = affine.apply affine_map<()[s0, s1] -> (s0 * 128 + s1 * 8)>()[%workgroup_id_x, %39#1]
%52 = vector.transfer_read %2[%workgroup_id_z, %50, %51], %cst_2 {in_bounds = [true, true, true]} : memref<64x1280x1280xf16, #hal.descriptor_type<storage_buffer>>, vector<1x1x8xf16>
gpu.barrier
vector.transfer_write %36, %alloc_3[%c0, %34#0, %35] {in_bounds = [true, true, true]} : vector<1x1x8xf16>, memref<1x64x64xf16, #gpu.address_space<workgroup>>
vector.transfer_write %38, %alloc_3[%c0, %37, %35] {in_bounds = [true, true, true]} : vector<1x1x8xf16>, memref<1x64x64xf16, #gpu.address_space<workgroup>>
gpu.barrier
vector.transfer_write %43, %alloc[%c0, %39#0, %40] {in_bounds = [true, true, true]} : vector<1x1x8xf16>, memref<1x64x128xf16, #gpu.address_space<workgroup>>
%53 = affine.apply affine_map<()[s0] -> (s0 + 16)>()[%39#0]
vector.transfer_write %46, %alloc[%c0, %53, %40] {in_bounds = [true, true, true]} : vector<1x1x8xf16>, memref<1x64x128xf16, #gpu.address_space<workgroup>>
%54 = affine.apply affine_map<()[s0] -> (s0 + 32)>()[%39#0]
vector.transfer_write %49, %alloc[%c0, %54, %40] {in_bounds = [true, true, true]} : vector<1x1x8xf16>, memref<1x64x128xf16, #gpu.address_space<workgroup>>
%55 = affine.apply affine_map<()[s0] -> (s0 + 48)>()[%39#0]
vector.transfer_write %52, %alloc[%c0, %55, %40] {in_bounds = [true, true, true]} : vector<1x1x8xf16>, memref<1x64x128xf16, #gpu.address_space<workgroup>>
gpu.barrier
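    // Note: staging for this k-step copies the 64x64 LHS tile into %alloc_3
    // and the 64x128 RHS tile into %alloc, with each thread moving
    // vector<1x1x8xf16> rows; the gpu.barriers fence the previous step's LDS
    // reads from the writes above and the writes from the reads below.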
%56:6 = affine.delinearize_index %0 into (%c2, %c2, %c1, %c1, %c4, %c16) : index, index, index, index, index, index
%57 = affine.apply affine_map<()[s0, s1] -> (s0 * 32 + s1)>()[%56#0, %56#5]
%58 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%56#4]
%59 = vector.transfer_read %alloc_3[%c0, %57, %58], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x64xf16, #gpu.address_space<workgroup>>, vector<1x1x4xf16>
%60 = vector.insert_strided_slice %59, %cst_1 {offsets = [0, 0, 0, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x2x4x1x1x1x1x1x4xf16>
%61 = affine.apply affine_map<()[s0] -> (s0 * 4 + 16)>()[%56#4]
%62 = vector.transfer_read %alloc_3[%c0, %57, %61], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x64xf16, #gpu.address_space<workgroup>>, vector<1x1x4xf16>
%63 = vector.insert_strided_slice %62, %60 {offsets = [0, 0, 1, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x2x4x1x1x1x1x1x4xf16>
%64 = affine.apply affine_map<()[s0] -> (s0 * 4 + 32)>()[%56#4]
%65 = vector.transfer_read %alloc_3[%c0, %57, %64], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x64xf16, #gpu.address_space<workgroup>>, vector<1x1x4xf16>
%66 = vector.insert_strided_slice %65, %63 {offsets = [0, 0, 2, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x2x4x1x1x1x1x1x4xf16>
%67 = affine.apply affine_map<()[s0] -> (s0 * 4 + 48)>()[%56#4]
%68 = vector.transfer_read %alloc_3[%c0, %57, %67], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x64xf16, #gpu.address_space<workgroup>>, vector<1x1x4xf16>
%69 = vector.insert_strided_slice %68, %66 {offsets = [0, 0, 3, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x2x4x1x1x1x1x1x4xf16>
%70 = affine.apply affine_map<()[s0, s1] -> (s0 * 32 + s1 + 16)>()[%56#0, %56#5]
%71 = vector.transfer_read %alloc_3[%c0, %70, %58], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x64xf16, #gpu.address_space<workgroup>>, vector<1x1x4xf16>
%72 = vector.insert_strided_slice %71, %69 {offsets = [0, 1, 0, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x2x4x1x1x1x1x1x4xf16>
%73 = vector.transfer_read %alloc_3[%c0, %70, %61], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x64xf16, #gpu.address_space<workgroup>>, vector<1x1x4xf16>
%74 = vector.insert_strided_slice %73, %72 {offsets = [0, 1, 1, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x2x4x1x1x1x1x1x4xf16>
%75 = vector.transfer_read %alloc_3[%c0, %70, %64], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x64xf16, #gpu.address_space<workgroup>>, vector<1x1x4xf16>
%76 = vector.insert_strided_slice %75, %74 {offsets = [0, 1, 2, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x2x4x1x1x1x1x1x4xf16>
%77 = vector.transfer_read %alloc_3[%c0, %70, %67], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x64xf16, #gpu.address_space<workgroup>>, vector<1x1x4xf16>
%78 = vector.insert_strided_slice %77, %76 {offsets = [0, 1, 3, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x2x4x1x1x1x1x1x4xf16>
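    // Note: the LHS fragments are eight 1x1x4 k-contiguous reads from %alloc_3
    // (2 m-batches x 4 k-batches) packed into vector<1x2x4x1x1x1x1x1x4xf16>;
    // unlike the RHS reads that follow, no transpose is needed because the A
    // tile is stored batch x M x K with K innermost.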
%79 = affine.apply affine_map<()[s0, s1] -> (s0 * 64 + s1)>()[%56#1, %56#5]
%80 = vector.transfer_read %alloc[%c0, %58, %79], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x128xf16, #gpu.address_space<workgroup>>, vector<1x4x1xf16>
%81 = vector.transpose %80, [0, 2, 1] : vector<1x4x1xf16> to vector<1x1x4xf16>
%82 = vector.insert_strided_slice %81, %cst_0 {offsets = [0, 0, 0, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x4x4x1x1x1x1x1x4xf16>
%83 = affine.apply affine_map<()[s0, s1] -> (s0 * 64 + s1 + 16)>()[%56#1, %56#5]
%84 = vector.transfer_read %alloc[%c0, %58, %83], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x128xf16, #gpu.address_space<workgroup>>, vector<1x4x1xf16>
%85 = vector.transpose %84, [0, 2, 1] : vector<1x4x1xf16> to vector<1x1x4xf16>
%86 = vector.insert_strided_slice %85, %82 {offsets = [0, 0, 1, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x4x4x1x1x1x1x1x4xf16>
%87 = affine.apply affine_map<()[s0, s1] -> (s0 * 64 + s1 + 32)>()[%56#1, %56#5]
%88 = vector.transfer_read %alloc[%c0, %58, %87], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x128xf16, #gpu.address_space<workgroup>>, vector<1x4x1xf16>
%89 = vector.transpose %88, [0, 2, 1] : vector<1x4x1xf16> to vector<1x1x4xf16>
%90 = vector.insert_strided_slice %89, %86 {offsets = [0, 0, 2, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x4x4x1x1x1x1x1x4xf16>
%91 = affine.apply affine_map<()[s0, s1] -> (s0 * 64 + s1 + 48)>()[%56#1, %56#5]
%92 = vector.transfer_read %alloc[%c0, %58, %91], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x128xf16, #gpu.address_space<workgroup>>, vector<1x4x1xf16>
%93 = vector.transpose %92, [0, 2, 1] : vector<1x4x1xf16> to vector<1x1x4xf16>
%94 = vector.insert_strided_slice %93, %90 {offsets = [0, 0, 3, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x4x4x1x1x1x1x1x4xf16>
%95 = vector.transfer_read %alloc[%c0, %61, %79], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x128xf16, #gpu.address_space<workgroup>>, vector<1x4x1xf16>
%96 = vector.transpose %95, [0, 2, 1] : vector<1x4x1xf16> to vector<1x1x4xf16>
%97 = vector.insert_strided_slice %96, %94 {offsets = [0, 1, 0, 0, 0, 0, 0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf16> into vector<1x4x4x1x1x1x1x1x4xf16>
    %98 = vector.transfer_read %alloc[%c0, %61, %83], %cst_2 {in_bounds = [true, true, true]} : memref<1x64x128xf16, #gpu.address_space<workgroup>>, vector<1x4x1xf16>
// ... (remainder of the dump truncated in this gist; the line above is completed from the identical read sequence in the preceding dump)