// -----// IR Dump Before SetEncodingHintOnDispatches (iree-flow-set-encoding-hint-on-dispatches) //----- //
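// Note (annotation): the function below is the IR for an accumulating matmul (C += A * B) on fully
// dynamic f32 tensors. Three flow.dispatch.workgroups regions (%12, %16, %20) pad each operand up to
// the upper-bound tile size and apply iree_linalg_ext.set_encoding with the LHS / RHS / RESULT roles,
// a fourth dispatch (%21) runs the encoded linalg.matmul in place on the RESULT operand, and a final
// dispatch (%22) unsets the encoding and extracts the original %6 x %7 slice for export.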
util.func public @matmul_accumulate_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_accumulate_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} {
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xf32>{%0, %1}
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xf32>{%3, %4}
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xf32>{%6, %7}
%9:2 = iree_linalg_ext.upper_bound_tile_size tensor<?x?xf32, #iree_linalg_ext.encoding<role = LHS, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>> -> index, index
%10 = affine.apply affine_map<()[s0, s1] -> ((s1 ceildiv s0) * s0)>()[%9#1, %1]
%11 = affine.apply affine_map<()[s0, s1] -> ((s1 ceildiv s0) * s0)>()[%9#0, %0]
%12 = flow.dispatch.workgroups[%9#1, %9#0, %0, %1, %11, %10](%9#1, %9#0, %2, %0, %1, %11, %10) : (index, index, tensor<?x?xf32>{%0, %1}, index, index, index, index) -> tensor<?x?xf32, #iree_linalg_ext.encoding<role = LHS, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>{%11, %10} =
(%arg3: index, %arg4: index, %arg5: !flow.dispatch.tensor<readonly:tensor<?x?xf32>>, %arg6: index, %arg7: index, %arg8: index, %arg9: index, %arg10: !flow.dispatch.tensor<writeonly:tensor<?x?xf32, #iree_linalg_ext.encoding<role = LHS, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>>) {
%cst = arith.constant 0.000000e+00 : f32
%24 = flow.dispatch.workload.ordinal %arg6, 2 : index
%25 = flow.dispatch.workload.ordinal %arg7, 3 : index
%26 = flow.dispatch.workload.ordinal %arg8, 4 : index
%27 = flow.dispatch.workload.ordinal %arg9, 5 : index
%28 = flow.dispatch.tie_shape %arg5 : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%24, %25}
%29 = flow.dispatch.tie_shape %arg10 : !flow.dispatch.tensor<writeonly:tensor<?x?xf32, #iree_linalg_ext.encoding<role = LHS, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>>{%26, %27}
%30 = flow.dispatch.workload.ordinal %arg3, 0 : index
%31 = flow.dispatch.workload.ordinal %arg4, 1 : index
%32 = flow.dispatch.tensor.load %28, offsets = [0, 0], sizes = [%24, %25], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%24, %25} -> tensor<?x?xf32>
%33 = affine.apply affine_map<()[s0, s1] -> (-s1 + (s1 ceildiv s0) * s0)>()[%30, %25]
%34 = affine.apply affine_map<()[s0, s1] -> (-s1 + (s1 ceildiv s0) * s0)>()[%31, %24]
%padded = tensor.pad %32 low[0, 0] high[%34, %33] {
^bb0(%arg11: index, %arg12: index):
tensor.yield %cst : f32
} : tensor<?x?xf32> to tensor<?x?xf32>
%35 = iree_linalg_ext.set_encoding %padded : tensor<?x?xf32> -> tensor<?x?xf32, #iree_linalg_ext.encoding<role = LHS, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>
flow.dispatch.tensor.store %35, %29, offsets = [0, 0], sizes = [%26, %27], strides = [1, 1] : tensor<?x?xf32, #iree_linalg_ext.encoding<role = LHS, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>> -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32, #iree_linalg_ext.encoding<role = LHS, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>>{%26, %27}
flow.return
} count(%arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg3, %arg4, %arg5, %arg6, %arg7, %arg8
flow.return %x, %y, %z : index, index, index
}
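  // The same pad + set_encoding pattern is repeated below for the RHS (%16) and RESULT (%20) operands.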
%13:2 = iree_linalg_ext.upper_bound_tile_size tensor<?x?xf32, #iree_linalg_ext.encoding<role = RHS, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>> -> index, index
%14 = affine.apply affine_map<()[s0, s1] -> ((s1 ceildiv s0) * s0)>()[%13#1, %4]
%15 = affine.apply affine_map<()[s0, s1] -> ((s1 ceildiv s0) * s0)>()[%13#0, %3]
%16 = flow.dispatch.workgroups[%13#1, %13#0, %3, %4, %15, %14](%13#1, %13#0, %5, %3, %4, %15, %14) : (index, index, tensor<?x?xf32>{%3, %4}, index, index, index, index) -> tensor<?x?xf32, #iree_linalg_ext.encoding<role = RHS, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>{%15, %14} =
(%arg3: index, %arg4: index, %arg5: !flow.dispatch.tensor<readonly:tensor<?x?xf32>>, %arg6: index, %arg7: index, %arg8: index, %arg9: index, %arg10: !flow.dispatch.tensor<writeonly:tensor<?x?xf32, #iree_linalg_ext.encoding<role = RHS, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>>) {
%cst = arith.constant 0.000000e+00 : f32
%24 = flow.dispatch.workload.ordinal %arg6, 2 : index
%25 = flow.dispatch.workload.ordinal %arg7, 3 : index
%26 = flow.dispatch.workload.ordinal %arg8, 4 : index
%27 = flow.dispatch.workload.ordinal %arg9, 5 : index
%28 = flow.dispatch.tie_shape %arg5 : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%24, %25}
%29 = flow.dispatch.tie_shape %arg10 : !flow.dispatch.tensor<writeonly:tensor<?x?xf32, #iree_linalg_ext.encoding<role = RHS, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>>{%26, %27}
%30 = flow.dispatch.workload.ordinal %arg3, 0 : index
%31 = flow.dispatch.workload.ordinal %arg4, 1 : index
%32 = flow.dispatch.tensor.load %28, offsets = [0, 0], sizes = [%24, %25], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%24, %25} -> tensor<?x?xf32>
%33 = affine.apply affine_map<()[s0, s1] -> (-s1 + (s1 ceildiv s0) * s0)>()[%30, %25]
%34 = affine.apply affine_map<()[s0, s1] -> (-s1 + (s1 ceildiv s0) * s0)>()[%31, %24]
%padded = tensor.pad %32 low[0, 0] high[%34, %33] {
^bb0(%arg11: index, %arg12: index):
tensor.yield %cst : f32
} : tensor<?x?xf32> to tensor<?x?xf32>
%35 = iree_linalg_ext.set_encoding %padded : tensor<?x?xf32> -> tensor<?x?xf32, #iree_linalg_ext.encoding<role = RHS, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>
flow.dispatch.tensor.store %35, %29, offsets = [0, 0], sizes = [%26, %27], strides = [1, 1] : tensor<?x?xf32, #iree_linalg_ext.encoding<role = RHS, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>> -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32, #iree_linalg_ext.encoding<role = RHS, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>>{%26, %27}
flow.return
} count(%arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg3, %arg4, %arg5, %arg6, %arg7, %arg8
flow.return %x, %y, %z : index, index, index
}
%17:2 = iree_linalg_ext.upper_bound_tile_size tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>> -> index, index
%18 = affine.apply affine_map<()[s0, s1] -> ((s1 ceildiv s0) * s0)>()[%17#0, %6]
%19 = affine.apply affine_map<()[s0, s1] -> ((s1 ceildiv s0) * s0)>()[%17#1, %7]
%20 = flow.dispatch.workgroups[%17#1, %17#0, %6, %7, %18, %19](%17#1, %17#0, %8, %6, %7, %18, %19) : (index, index, tensor<?x?xf32>{%6, %7}, index, index, index, index) -> tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>{%18, %19} =
(%arg3: index, %arg4: index, %arg5: !flow.dispatch.tensor<readonly:tensor<?x?xf32>>, %arg6: index, %arg7: index, %arg8: index, %arg9: index, %arg10: !flow.dispatch.tensor<writeonly:tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>>) {
%cst = arith.constant 0.000000e+00 : f32
%24 = flow.dispatch.workload.ordinal %arg6, 2 : index
%25 = flow.dispatch.workload.ordinal %arg7, 3 : index
%26 = flow.dispatch.workload.ordinal %arg8, 4 : index
%27 = flow.dispatch.workload.ordinal %arg9, 5 : index
%28 = flow.dispatch.tie_shape %arg5 : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%24, %25}
%29 = flow.dispatch.tie_shape %arg10 : !flow.dispatch.tensor<writeonly:tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>>{%26, %27}
%30 = flow.dispatch.workload.ordinal %arg3, 0 : index
%31 = flow.dispatch.workload.ordinal %arg4, 1 : index
%32 = flow.dispatch.tensor.load %28, offsets = [0, 0], sizes = [%24, %25], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%24, %25} -> tensor<?x?xf32>
%33 = affine.apply affine_map<()[s0, s1] -> (-s1 + (s1 ceildiv s0) * s0)>()[%30, %25]
%34 = affine.apply affine_map<()[s0, s1] -> (-s1 + (s1 ceildiv s0) * s0)>()[%31, %24]
%padded = tensor.pad %32 low[0, 0] high[%34, %33] {
^bb0(%arg11: index, %arg12: index):
tensor.yield %cst : f32
} : tensor<?x?xf32> to tensor<?x?xf32>
%35 = iree_linalg_ext.set_encoding %padded : tensor<?x?xf32> -> tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>
flow.dispatch.tensor.store %35, %29, offsets = [0, 0], sizes = [%26, %27], strides = [1, 1] : tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>> -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>>{%26, %27}
flow.return
} count(%arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg3, %arg4, %arg5, %arg6, %arg7, %arg8
flow.return %x, %y, %z : index, index, index
}
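  // Dispatch %21 below consumes the three encoded tensors and performs the linalg.matmul; the RESULT
  // operand is bound readwrite and the dispatch writes back into it (-> %20), which is what makes this
  // an accumulating matmul.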
%21 = flow.dispatch.workgroups[%11, %10, %15, %14, %18, %19](%12, %16, %20, %11, %10, %15, %14, %18, %19) : (tensor<?x?xf32, #iree_linalg_ext.encoding<role = LHS, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>{%11, %10}, tensor<?x?xf32, #iree_linalg_ext.encoding<role = RHS, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>{%15, %14}, tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>{%18, %19}, index, index, index, index, index, index) -> %20{%18, %19} =
(%arg3: !flow.dispatch.tensor<readonly:tensor<?x?xf32, #iree_linalg_ext.encoding<role = LHS, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>>, %arg4: !flow.dispatch.tensor<readonly:tensor<?x?xf32, #iree_linalg_ext.encoding<role = RHS, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>>, %arg5: !flow.dispatch.tensor<readwrite:tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>>, %arg6: index, %arg7: index, %arg8: index, %arg9: index, %arg10: index, %arg11: index) {
%24 = flow.dispatch.workload.ordinal %arg6, 0 : index
%25 = flow.dispatch.workload.ordinal %arg7, 1 : index
%26 = flow.dispatch.workload.ordinal %arg8, 2 : index
%27 = flow.dispatch.workload.ordinal %arg9, 3 : index
%28 = flow.dispatch.workload.ordinal %arg10, 4 : index
%29 = flow.dispatch.workload.ordinal %arg11, 5 : index
%30 = flow.dispatch.tie_shape %arg3 : !flow.dispatch.tensor<readonly:tensor<?x?xf32, #iree_linalg_ext.encoding<role = LHS, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>>{%24, %25}
%31 = flow.dispatch.tie_shape %arg4 : !flow.dispatch.tensor<readonly:tensor<?x?xf32, #iree_linalg_ext.encoding<role = RHS, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>>{%26, %27}
%32 = flow.dispatch.tie_shape %arg5 : !flow.dispatch.tensor<readwrite:tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>>{%28, %29}
%33 = flow.dispatch.tensor.load %30, offsets = [0, 0], sizes = [%24, %25], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32, #iree_linalg_ext.encoding<role = LHS, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>>{%24, %25} -> tensor<?x?xf32, #iree_linalg_ext.encoding<role = LHS, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>
%34 = flow.dispatch.tensor.load %31, offsets = [0, 0], sizes = [%26, %27], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32, #iree_linalg_ext.encoding<role = RHS, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>>{%26, %27} -> tensor<?x?xf32, #iree_linalg_ext.encoding<role = RHS, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>
%35 = flow.dispatch.tensor.load %32, offsets = [0, 0], sizes = [%28, %29], strides = [1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>>{%28, %29} -> tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>
%36 = linalg.matmul ins(%33, %34 : tensor<?x?xf32, #iree_linalg_ext.encoding<role = LHS, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>, tensor<?x?xf32, #iree_linalg_ext.encoding<role = RHS, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>) outs(%35 : tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>) -> tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>
flow.dispatch.tensor.store %36, %32, offsets = [0, 0], sizes = [%28, %29], strides = [1, 1] : tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>> -> !flow.dispatch.tensor<readwrite:tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>>{%28, %29}
flow.return
} count(%arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg3, %arg4, %arg5, %arg6, %arg7, %arg8
flow.return %x, %y, %z : index, index, index
}
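  // Dispatch %22 below drops the encoding again (iree_linalg_ext.unset_encoding) and extracts the
  // unpadded %6 x %7 slice before the result is exported.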
%22 = flow.dispatch.workgroups[%18, %19, %6, %7](%21, %18, %19, %6, %7) : (tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>{%18, %19}, index, index, index, index) -> tensor<?x?xf32>{%6, %7} =
(%arg3: !flow.dispatch.tensor<readonly:tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>) {
%24 = flow.dispatch.workload.ordinal %arg4, 0 : index
%25 = flow.dispatch.workload.ordinal %arg5, 1 : index
%26 = flow.dispatch.workload.ordinal %arg6, 2 : index
%27 = flow.dispatch.workload.ordinal %arg7, 3 : index
%28 = flow.dispatch.tie_shape %arg3 : !flow.dispatch.tensor<readonly:tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>>{%24, %25}
%29 = flow.dispatch.tie_shape %arg8 : !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%26, %27}
%30 = flow.dispatch.tensor.load %28, offsets = [0, 0], sizes = [%24, %25], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>>{%24, %25} -> tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>
%31 = iree_linalg_ext.unset_encoding %30 : tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>> -> tensor<?x?xf32>
%extracted_slice = tensor.extract_slice %31[0, 0] [%26, %27] [1, 1] : tensor<?x?xf32> to tensor<?x?xf32>
flow.dispatch.tensor.store %extracted_slice, %29, offsets = [0, 0], sizes = [%26, %27], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%26, %27}
flow.return
} count(%arg3: index, %arg4: index, %arg5: index, %arg6: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg3, %arg4, %arg5, %arg6
flow.return %x, %y, %z : index, index, index
}
%23 = hal.tensor.export %22 "output0" : tensor<?x?xf32>{%6, %7} -> !hal.buffer_view
util.return %23 : !hal.buffer_view
}
// -----// IR Dump Before Canonicalizer (canonicalize) //----- //
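// Note (annotation): compared with the dump above, this is the IR after
// iree-flow-set-encoding-hint-on-dispatches. The upper-bound tile size has been materialized as
// arith.constant 16, each encoding-related dispatch now carries
// {encoding.round_dims_to = #iree_codegen.encoding.round_dims_to<16>}, and the
// iree_linalg_ext.upper_bound_tile_size ops (and the padded-size affine.apply results) are left
// unused, to be cleaned up by the canonicalizer.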
util.func public @matmul_accumulate_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_accumulate_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} {
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xf32>{%0, %1}
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xf32>{%3, %4}
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xf32>{%6, %7}
%9:2 = iree_linalg_ext.upper_bound_tile_size tensor<?x?xf32, #iree_linalg_ext.encoding<role = LHS, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>> -> index, index
%c16 = arith.constant 16 : index
%10 = affine.apply affine_map<()[s0, s1] -> ((s1 ceildiv s0) * s0)>()[%c16, %1]
%11 = affine.apply affine_map<()[s0, s1] -> ((s1 ceildiv s0) * s0)>()[%c16, %0]
%12 = flow.dispatch.workgroups[%c16, %c16, %0, %1, %0, %1](%c16, %c16, %2, %0, %1, %0, %1) : (index, index, tensor<?x?xf32>{%0, %1}, index, index, index, index) -> tensor<?x?xf32, #iree_linalg_ext.encoding<role = LHS, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>{%0, %1} attributes {encoding.round_dims_to = #iree_codegen.encoding.round_dims_to<16>} =
(%arg3: index, %arg4: index, %arg5: !flow.dispatch.tensor<readonly:tensor<?x?xf32>>, %arg6: index, %arg7: index, %arg8: index, %arg9: index, %arg10: !flow.dispatch.tensor<writeonly:tensor<?x?xf32, #iree_linalg_ext.encoding<role = LHS, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>>) {
%cst = arith.constant 0.000000e+00 : f32
%24 = flow.dispatch.workload.ordinal %arg6, 2 : index
%25 = flow.dispatch.workload.ordinal %arg7, 3 : index
%26 = flow.dispatch.workload.ordinal %arg8, 4 : index
%27 = flow.dispatch.workload.ordinal %arg9, 5 : index
%28 = flow.dispatch.tie_shape %arg5 : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%24, %25}
%29 = flow.dispatch.tie_shape %arg10 : !flow.dispatch.tensor<writeonly:tensor<?x?xf32, #iree_linalg_ext.encoding<role = LHS, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>>{%26, %27}
%30 = flow.dispatch.workload.ordinal %arg3, 0 : index
%31 = flow.dispatch.workload.ordinal %arg4, 1 : index
%32 = flow.dispatch.tensor.load %28, offsets = [0, 0], sizes = [%24, %25], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%24, %25} -> tensor<?x?xf32>
%33 = affine.apply affine_map<()[s0, s1] -> (-s1 + (s1 ceildiv s0) * s0)>()[%30, %25]
%34 = affine.apply affine_map<()[s0, s1] -> (-s1 + (s1 ceildiv s0) * s0)>()[%31, %24]
%padded = tensor.pad %32 low[0, 0] high[%34, %33] {
^bb0(%arg11: index, %arg12: index):
tensor.yield %cst : f32
} : tensor<?x?xf32> to tensor<?x?xf32>
%35 = iree_linalg_ext.set_encoding %padded : tensor<?x?xf32> -> tensor<?x?xf32, #iree_linalg_ext.encoding<role = LHS, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>
flow.dispatch.tensor.store %35, %29, offsets = [0, 0], sizes = [%26, %27], strides = [1, 1] : tensor<?x?xf32, #iree_linalg_ext.encoding<role = LHS, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>> -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32, #iree_linalg_ext.encoding<role = LHS, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>>{%26, %27}
flow.return
} count(%arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg3, %arg4, %arg5, %arg6, %arg7, %arg8
flow.return %x, %y, %z : index, index, index
}
%13:2 = iree_linalg_ext.upper_bound_tile_size tensor<?x?xf32, #iree_linalg_ext.encoding<role = RHS, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>> -> index, index
%c16_0 = arith.constant 16 : index
%14 = affine.apply affine_map<()[s0, s1] -> ((s1 ceildiv s0) * s0)>()[%c16_0, %4]
%15 = affine.apply affine_map<()[s0, s1] -> ((s1 ceildiv s0) * s0)>()[%c16_0, %3]
%16 = flow.dispatch.workgroups[%c16_0, %c16_0, %3, %4, %3, %4](%c16_0, %c16_0, %5, %3, %4, %3, %4) : (index, index, tensor<?x?xf32>{%3, %4}, index, index, index, index) -> tensor<?x?xf32, #iree_linalg_ext.encoding<role = RHS, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>{%3, %4} attributes {encoding.round_dims_to = #iree_codegen.encoding.round_dims_to<16>} =
(%arg3: index, %arg4: index, %arg5: !flow.dispatch.tensor<readonly:tensor<?x?xf32>>, %arg6: index, %arg7: index, %arg8: index, %arg9: index, %arg10: !flow.dispatch.tensor<writeonly:tensor<?x?xf32, #iree_linalg_ext.encoding<role = RHS, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>>) {
%cst = arith.constant 0.000000e+00 : f32
%24 = flow.dispatch.workload.ordinal %arg6, 2 : index
%25 = flow.dispatch.workload.ordinal %arg7, 3 : index
%26 = flow.dispatch.workload.ordinal %arg8, 4 : index
%27 = flow.dispatch.workload.ordinal %arg9, 5 : index
%28 = flow.dispatch.tie_shape %arg5 : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%24, %25}
%29 = flow.dispatch.tie_shape %arg10 : !flow.dispatch.tensor<writeonly:tensor<?x?xf32, #iree_linalg_ext.encoding<role = RHS, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>>{%26, %27}
%30 = flow.dispatch.workload.ordinal %arg3, 0 : index
%31 = flow.dispatch.workload.ordinal %arg4, 1 : index
%32 = flow.dispatch.tensor.load %28, offsets = [0, 0], sizes = [%24, %25], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%24, %25} -> tensor<?x?xf32>
%33 = affine.apply affine_map<()[s0, s1] -> (-s1 + (s1 ceildiv s0) * s0)>()[%30, %25]
%34 = affine.apply affine_map<()[s0, s1] -> (-s1 + (s1 ceildiv s0) * s0)>()[%31, %24]
%padded = tensor.pad %32 low[0, 0] high[%34, %33] {
^bb0(%arg11: index, %arg12: index):
tensor.yield %cst : f32
} : tensor<?x?xf32> to tensor<?x?xf32>
%35 = iree_linalg_ext.set_encoding %padded : tensor<?x?xf32> -> tensor<?x?xf32, #iree_linalg_ext.encoding<role = RHS, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>
flow.dispatch.tensor.store %35, %29, offsets = [0, 0], sizes = [%26, %27], strides = [1, 1] : tensor<?x?xf32, #iree_linalg_ext.encoding<role = RHS, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>> -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32, #iree_linalg_ext.encoding<role = RHS, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>>{%26, %27}
flow.return
} count(%arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg3, %arg4, %arg5, %arg6, %arg7, %arg8
flow.return %x, %y, %z : index, index, index
}
%17:2 = iree_linalg_ext.upper_bound_tile_size tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>> -> index, index
%c16_1 = arith.constant 16 : index
%18 = affine.apply affine_map<()[s0, s1] -> ((s1 ceildiv s0) * s0)>()[%c16_1, %6]
%19 = affine.apply affine_map<()[s0, s1] -> ((s1 ceildiv s0) * s0)>()[%c16_1, %7]
%20 = flow.dispatch.workgroups[%c16_1, %c16_1, %6, %7, %6, %7](%c16_1, %c16_1, %8, %6, %7, %6, %7) : (index, index, tensor<?x?xf32>{%6, %7}, index, index, index, index) -> tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>{%6, %7} attributes {encoding.round_dims_to = #iree_codegen.encoding.round_dims_to<16>} =
(%arg3: index, %arg4: index, %arg5: !flow.dispatch.tensor<readonly:tensor<?x?xf32>>, %arg6: index, %arg7: index, %arg8: index, %arg9: index, %arg10: !flow.dispatch.tensor<writeonly:tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>>) {
%cst = arith.constant 0.000000e+00 : f32
%24 = flow.dispatch.workload.ordinal %arg6, 2 : index
%25 = flow.dispatch.workload.ordinal %arg7, 3 : index
%26 = flow.dispatch.workload.ordinal %arg8, 4 : index
%27 = flow.dispatch.workload.ordinal %arg9, 5 : index
%28 = flow.dispatch.tie_shape %arg5 : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%24, %25}
%29 = flow.dispatch.tie_shape %arg10 : !flow.dispatch.tensor<writeonly:tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>>{%26, %27}
%30 = flow.dispatch.workload.ordinal %arg3, 0 : index
%31 = flow.dispatch.workload.ordinal %arg4, 1 : index
%32 = flow.dispatch.tensor.load %28, offsets = [0, 0], sizes = [%24, %25], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%24, %25} -> tensor<?x?xf32>
%33 = affine.apply affine_map<()[s0, s1] -> (-s1 + (s1 ceildiv s0) * s0)>()[%30, %25]
%34 = affine.apply affine_map<()[s0, s1] -> (-s1 + (s1 ceildiv s0) * s0)>()[%31, %24]
%padded = tensor.pad %32 low[0, 0] high[%34, %33] {
^bb0(%arg11: index, %arg12: index):
tensor.yield %cst : f32
} : tensor<?x?xf32> to tensor<?x?xf32>
%35 = iree_linalg_ext.set_encoding %padded : tensor<?x?xf32> -> tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>
flow.dispatch.tensor.store %35, %29, offsets = [0, 0], sizes = [%26, %27], strides = [1, 1] : tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>> -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>>{%26, %27}
flow.return
} count(%arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg3, %arg4, %arg5, %arg6, %arg7, %arg8
flow.return %x, %y, %z : index, index, index
}
%21 = flow.dispatch.workgroups[%0, %1, %3, %4, %6, %7](%12, %16, %20, %0, %1, %3, %4, %6, %7) : (tensor<?x?xf32, #iree_linalg_ext.encoding<role = LHS, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>{%0, %1}, tensor<?x?xf32, #iree_linalg_ext.encoding<role = RHS, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>{%3, %4}, tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>{%6, %7}, index, index, index, index, index, index) -> %20{%6, %7} attributes {encoding.round_dims_to = #iree_codegen.encoding.round_dims_to<16>} =
(%arg3: !flow.dispatch.tensor<readonly:tensor<?x?xf32, #iree_linalg_ext.encoding<role = LHS, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>>, %arg4: !flow.dispatch.tensor<readonly:tensor<?x?xf32, #iree_linalg_ext.encoding<role = RHS, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>>, %arg5: !flow.dispatch.tensor<readwrite:tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>>, %arg6: index, %arg7: index, %arg8: index, %arg9: index, %arg10: index, %arg11: index) {
%24 = flow.dispatch.workload.ordinal %arg6, 0 : index
%25 = flow.dispatch.workload.ordinal %arg7, 1 : index
%26 = flow.dispatch.workload.ordinal %arg8, 2 : index
%27 = flow.dispatch.workload.ordinal %arg9, 3 : index
%28 = flow.dispatch.workload.ordinal %arg10, 4 : index
%29 = flow.dispatch.workload.ordinal %arg11, 5 : index
%30 = flow.dispatch.tie_shape %arg3 : !flow.dispatch.tensor<readonly:tensor<?x?xf32, #iree_linalg_ext.encoding<role = LHS, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>>{%24, %25}
%31 = flow.dispatch.tie_shape %arg4 : !flow.dispatch.tensor<readonly:tensor<?x?xf32, #iree_linalg_ext.encoding<role = RHS, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>>{%26, %27}
%32 = flow.dispatch.tie_shape %arg5 : !flow.dispatch.tensor<readwrite:tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>>{%28, %29}
%33 = flow.dispatch.tensor.load %30, offsets = [0, 0], sizes = [%24, %25], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32, #iree_linalg_ext.encoding<role = LHS, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>>{%24, %25} -> tensor<?x?xf32, #iree_linalg_ext.encoding<role = LHS, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>
%34 = flow.dispatch.tensor.load %31, offsets = [0, 0], sizes = [%26, %27], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32, #iree_linalg_ext.encoding<role = RHS, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>>{%26, %27} -> tensor<?x?xf32, #iree_linalg_ext.encoding<role = RHS, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>
%35 = flow.dispatch.tensor.load %32, offsets = [0, 0], sizes = [%28, %29], strides = [1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>>{%28, %29} -> tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>
%36 = linalg.matmul ins(%33, %34 : tensor<?x?xf32, #iree_linalg_ext.encoding<role = LHS, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>, tensor<?x?xf32, #iree_linalg_ext.encoding<role = RHS, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>) outs(%35 : tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>) -> tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>
flow.dispatch.tensor.store %36, %32, offsets = [0, 0], sizes = [%28, %29], strides = [1, 1] : tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>> -> !flow.dispatch.tensor<readwrite:tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>>{%28, %29}
flow.return
} count(%arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg3, %arg4, %arg5, %arg6, %arg7, %arg8
flow.return %x, %y, %z : index, index, index
}
%22 = flow.dispatch.workgroups[%6, %7, %6, %7](%21, %6, %7, %6, %7) : (tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>{%6, %7}, index, index, index, index) -> tensor<?x?xf32>{%6, %7} attributes {encoding.round_dims_to = #iree_codegen.encoding.round_dims_to<16>} =
(%arg3: !flow.dispatch.tensor<readonly:tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>) {
%24 = flow.dispatch.workload.ordinal %arg4, 0 : index
%25 = flow.dispatch.workload.ordinal %arg5, 1 : index
%26 = flow.dispatch.workload.ordinal %arg6, 2 : index
%27 = flow.dispatch.workload.ordinal %arg7, 3 : index
%28 = flow.dispatch.tie_shape %arg3 : !flow.dispatch.tensor<readonly:tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>>{%24, %25}
%29 = flow.dispatch.tie_shape %arg8 : !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%26, %27}
%30 = flow.dispatch.tensor.load %28, offsets = [0, 0], sizes = [%24, %25], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>>{%24, %25} -> tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>
%31 = iree_linalg_ext.unset_encoding %30 : tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>> -> tensor<?x?xf32>
%extracted_slice = tensor.extract_slice %31[0, 0] [%26, %27] [1, 1] : tensor<?x?xf32> to tensor<?x?xf32>
flow.dispatch.tensor.store %extracted_slice, %29, offsets = [0, 0], sizes = [%26, %27], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%26, %27}
flow.return
} count(%arg3: index, %arg4: index, %arg5: index, %arg6: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg3, %arg4, %arg5, %arg6
flow.return %x, %y, %z : index, index, index
}
%23 = hal.tensor.export %22 "output0" : tensor<?x?xf32>{%6, %7} -> !hal.buffer_view
util.return %23 : !hal.buffer_view
}
// -----// IR Dump Before CSE (cse) //----- //
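// Note (annotation): after canonicalization the dead iree_linalg_ext.upper_bound_tile_size and
// affine.apply ops are gone, the three `16` constants are folded into a single hoisted %c16, and the
// duplicated dispatch operands collapse, so each set_encoding dispatch now takes only
// (index, tensor, index, index).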
util.func public @matmul_accumulate_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_accumulate_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} {
%c16 = arith.constant 16 : index
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xf32>{%0, %1}
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xf32>{%3, %4}
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xf32>{%6, %7}
%9 = flow.dispatch.workgroups[%c16, %0, %1](%c16, %2, %0, %1) : (index, tensor<?x?xf32>{%0, %1}, index, index) -> tensor<?x?xf32, #iree_linalg_ext.encoding<role = LHS, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>{%0, %1} attributes {encoding.round_dims_to = #iree_codegen.encoding.round_dims_to<16>} =
(%arg3: index, %arg4: !flow.dispatch.tensor<readonly:tensor<?x?xf32>>, %arg5: index, %arg6: index, %arg7: !flow.dispatch.tensor<writeonly:tensor<?x?xf32, #iree_linalg_ext.encoding<role = LHS, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>>) {
%cst = arith.constant 0.000000e+00 : f32
%15 = flow.dispatch.workload.ordinal %arg5, 1 : index
%16 = flow.dispatch.workload.ordinal %arg6, 2 : index
%17 = flow.dispatch.workload.ordinal %arg3, 0 : index
%18 = flow.dispatch.tie_shape %arg4 : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%15, %16}
%19 = flow.dispatch.tie_shape %arg7 : !flow.dispatch.tensor<writeonly:tensor<?x?xf32, #iree_linalg_ext.encoding<role = LHS, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>>{%15, %16}
%20 = flow.dispatch.tensor.load %18, offsets = [0, 0], sizes = [%15, %16], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%15, %16} -> tensor<?x?xf32>
%21 = affine.apply affine_map<()[s0, s1] -> (-s1 + (s1 ceildiv s0) * s0)>()[%17, %16]
%22 = affine.apply affine_map<()[s0, s1] -> (-s1 + (s1 ceildiv s0) * s0)>()[%17, %15]
%padded = tensor.pad %20 low[0, 0] high[%22, %21] {
^bb0(%arg8: index, %arg9: index):
tensor.yield %cst : f32
} : tensor<?x?xf32> to tensor<?x?xf32>
%23 = iree_linalg_ext.set_encoding %padded : tensor<?x?xf32> -> tensor<?x?xf32, #iree_linalg_ext.encoding<role = LHS, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>
flow.dispatch.tensor.store %23, %19, offsets = [0, 0], sizes = [%15, %16], strides = [1, 1] : tensor<?x?xf32, #iree_linalg_ext.encoding<role = LHS, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>> -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32, #iree_linalg_ext.encoding<role = LHS, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>>{%15, %16}
flow.return
} count(%arg3: index, %arg4: index, %arg5: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg3, %arg4, %arg5
flow.return %x, %y, %z : index, index, index
}
%10 = flow.dispatch.workgroups[%c16, %3, %4](%c16, %5, %3, %4) : (index, tensor<?x?xf32>{%3, %4}, index, index) -> tensor<?x?xf32, #iree_linalg_ext.encoding<role = RHS, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>{%3, %4} attributes {encoding.round_dims_to = #iree_codegen.encoding.round_dims_to<16>} =
(%arg3: index, %arg4: !flow.dispatch.tensor<readonly:tensor<?x?xf32>>, %arg5: index, %arg6: index, %arg7: !flow.dispatch.tensor<writeonly:tensor<?x?xf32, #iree_linalg_ext.encoding<role = RHS, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>>) {
%cst = arith.constant 0.000000e+00 : f32
%15 = flow.dispatch.workload.ordinal %arg5, 1 : index
%16 = flow.dispatch.workload.ordinal %arg6, 2 : index
%17 = flow.dispatch.workload.ordinal %arg3, 0 : index
%18 = flow.dispatch.tie_shape %arg4 : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%15, %16}
%19 = flow.dispatch.tie_shape %arg7 : !flow.dispatch.tensor<writeonly:tensor<?x?xf32, #iree_linalg_ext.encoding<role = RHS, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>>{%15, %16}
%20 = flow.dispatch.tensor.load %18, offsets = [0, 0], sizes = [%15, %16], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%15, %16} -> tensor<?x?xf32>
%21 = affine.apply affine_map<()[s0, s1] -> (-s1 + (s1 ceildiv s0) * s0)>()[%17, %16]
%22 = affine.apply affine_map<()[s0, s1] -> (-s1 + (s1 ceildiv s0) * s0)>()[%17, %15]
%padded = tensor.pad %20 low[0, 0] high[%22, %21] {
^bb0(%arg8: index, %arg9: index):
tensor.yield %cst : f32
} : tensor<?x?xf32> to tensor<?x?xf32>
%23 = iree_linalg_ext.set_encoding %padded : tensor<?x?xf32> -> tensor<?x?xf32, #iree_linalg_ext.encoding<role = RHS, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>
flow.dispatch.tensor.store %23, %19, offsets = [0, 0], sizes = [%15, %16], strides = [1, 1] : tensor<?x?xf32, #iree_linalg_ext.encoding<role = RHS, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>> -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32, #iree_linalg_ext.encoding<role = RHS, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>>{%15, %16}
flow.return
} count(%arg3: index, %arg4: index, %arg5: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg3, %arg4, %arg5
flow.return %x, %y, %z : index, index, index
}
%11 = flow.dispatch.workgroups[%c16, %6, %7](%c16, %8, %6, %7) : (index, tensor<?x?xf32>{%6, %7}, index, index) -> tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>{%6, %7} attributes {encoding.round_dims_to = #iree_codegen.encoding.round_dims_to<16>} =
(%arg3: index, %arg4: !flow.dispatch.tensor<readonly:tensor<?x?xf32>>, %arg5: index, %arg6: index, %arg7: !flow.dispatch.tensor<writeonly:tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>>) {
%cst = arith.constant 0.000000e+00 : f32
%15 = flow.dispatch.workload.ordinal %arg5, 1 : index
%16 = flow.dispatch.workload.ordinal %arg6, 2 : index
%17 = flow.dispatch.workload.ordinal %arg3, 0 : index
%18 = flow.dispatch.tie_shape %arg4 : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%15, %16}
%19 = flow.dispatch.tie_shape %arg7 : !flow.dispatch.tensor<writeonly:tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>>{%15, %16}
%20 = flow.dispatch.tensor.load %18, offsets = [0, 0], sizes = [%15, %16], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%15, %16} -> tensor<?x?xf32>
%21 = affine.apply affine_map<()[s0, s1] -> (-s1 + (s1 ceildiv s0) * s0)>()[%17, %16]
%22 = affine.apply affine_map<()[s0, s1] -> (-s1 + (s1 ceildiv s0) * s0)>()[%17, %15]
%padded = tensor.pad %20 low[0, 0] high[%22, %21] {
^bb0(%arg8: index, %arg9: index):
tensor.yield %cst : f32
} : tensor<?x?xf32> to tensor<?x?xf32>
%23 = iree_linalg_ext.set_encoding %padded : tensor<?x?xf32> -> tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>
flow.dispatch.tensor.store %23, %19, offsets = [0, 0], sizes = [%15, %16], strides = [1, 1] : tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>> -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>>{%15, %16}
flow.return
} count(%arg3: index, %arg4: index, %arg5: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg3, %arg4, %arg5
flow.return %x, %y, %z : index, index, index
}
%12 = flow.dispatch.workgroups[%0, %1, %3, %4, %6, %7](%9, %10, %11, %0, %1, %3, %4, %6, %7) : (tensor<?x?xf32, #iree_linalg_ext.encoding<role = LHS, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>{%0, %1}, tensor<?x?xf32, #iree_linalg_ext.encoding<role = RHS, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>{%3, %4}, tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>{%6, %7}, index, index, index, index, index, index) -> %11{%6, %7} attributes {encoding.round_dims_to = #iree_codegen.encoding.round_dims_to<16>} =
(%arg3: !flow.dispatch.tensor<readonly:tensor<?x?xf32, #iree_linalg_ext.encoding<role = LHS, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>>, %arg4: !flow.dispatch.tensor<readonly:tensor<?x?xf32, #iree_linalg_ext.encoding<role = RHS, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>>, %arg5: !flow.dispatch.tensor<readwrite:tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>>, %arg6: index, %arg7: index, %arg8: index, %arg9: index, %arg10: index, %arg11: index) {
%15 = flow.dispatch.workload.ordinal %arg6, 0 : index
%16 = flow.dispatch.workload.ordinal %arg7, 1 : index
%17 = flow.dispatch.workload.ordinal %arg8, 2 : index
%18 = flow.dispatch.workload.ordinal %arg9, 3 : index
%19 = flow.dispatch.workload.ordinal %arg10, 4 : index
%20 = flow.dispatch.workload.ordinal %arg11, 5 : index
%21 = flow.dispatch.tie_shape %arg3 : !flow.dispatch.tensor<readonly:tensor<?x?xf32, #iree_linalg_ext.encoding<role = LHS, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>>{%15, %16}
%22 = flow.dispatch.tie_shape %arg4 : !flow.dispatch.tensor<readonly:tensor<?x?xf32, #iree_linalg_ext.encoding<role = RHS, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>>{%17, %18}
%23 = flow.dispatch.tie_shape %arg5 : !flow.dispatch.tensor<readwrite:tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>>{%19, %20}
%24 = flow.dispatch.tensor.load %21, offsets = [0, 0], sizes = [%15, %16], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32, #iree_linalg_ext.encoding<role = LHS, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>>{%15, %16} -> tensor<?x?xf32, #iree_linalg_ext.encoding<role = LHS, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>
%25 = flow.dispatch.tensor.load %22, offsets = [0, 0], sizes = [%17, %18], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32, #iree_linalg_ext.encoding<role = RHS, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>>{%17, %18} -> tensor<?x?xf32, #iree_linalg_ext.encoding<role = RHS, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>
%26 = flow.dispatch.tensor.load %23, offsets = [0, 0], sizes = [%19, %20], strides = [1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>>{%19, %20} -> tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>
%27 = linalg.matmul ins(%24, %25 : tensor<?x?xf32, #iree_linalg_ext.encoding<role = LHS, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>, tensor<?x?xf32, #iree_linalg_ext.encoding<role = RHS, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>) outs(%26 : tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>) -> tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>
flow.dispatch.tensor.store %27, %23, offsets = [0, 0], sizes = [%19, %20], strides = [1, 1] : tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>> -> !flow.dispatch.tensor<readwrite:tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>>{%19, %20}
flow.return
} count(%arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg3, %arg4, %arg5, %arg6, %arg7, %arg8
flow.return %x, %y, %z : index, index, index
}
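// The final dispatch drops the encoding again: it loads the encoded RESULT
// tensor, applies iree_linalg_ext.unset_encoding, extracts the original
// %6 x %7 slice (undoing the padding), and stores it to the plain ?x?xf32 output.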
%13 = flow.dispatch.workgroups[%6, %7](%12, %6, %7) : (tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>{%6, %7}, index, index) -> tensor<?x?xf32>{%6, %7} attributes {encoding.round_dims_to = #iree_codegen.encoding.round_dims_to<16>} =
(%arg3: !flow.dispatch.tensor<readonly:tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>>, %arg4: index, %arg5: index, %arg6: !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>) {
%15 = flow.dispatch.workload.ordinal %arg4, 0 : index
%16 = flow.dispatch.workload.ordinal %arg5, 1 : index
%17 = flow.dispatch.tie_shape %arg3 : !flow.dispatch.tensor<readonly:tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>>{%15, %16}
%18 = flow.dispatch.tie_shape %arg6 : !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%15, %16}
%19 = flow.dispatch.tensor.load %17, offsets = [0, 0], sizes = [%15, %16], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>>{%15, %16} -> tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>
%20 = iree_linalg_ext.unset_encoding %19 : tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>> -> tensor<?x?xf32>
%extracted_slice = tensor.extract_slice %20[0, 0] [%15, %16] [1, 1] : tensor<?x?xf32> to tensor<?x?xf32>
flow.dispatch.tensor.store %extracted_slice, %18, offsets = [0, 0], sizes = [%15, %16], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%15, %16}
flow.return
} count(%arg3: index, %arg4: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg3, %arg4
flow.return %x, %y, %z : index, index, index
}
%14 = hal.tensor.export %13 "output0" : tensor<?x?xf32>{%6, %7} -> !hal.buffer_view
util.return %14 : !hal.buffer_view
}
// -----// IR Dump Before ConvertToStreamPass (iree-stream-conversion) //----- //
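// At this point the program has been outlined into five flow.executables:
// dispatch_0/1/2 pad and set the LHS/RHS/RESULT encodings, dispatch_3 runs the
// encoded matmul with a readwrite result, and dispatch_4 unsets the encoding
// and slices the result back to its logical shape. The public util.func at the
// end of this module imports the buffer views and chains the flow.dispatch ops.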
#executable_target_system_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "system-elf-x86_64", {cpu = "cascadelake", cpu_features = "+cmov,+mmx,+popcnt,+sse,+sse2,+sse3,+ssse3,+sse4.1,+sse4.2,+avx,+avx2,+fma,+avx512f,+bmi,+bmi2,+aes,+pclmul,+avx512vl,+avx512bw,+avx512dq,+avx512cd,+avx512vnni,+adx,+clflushopt,+clwb,+cx16,+cx8,+f16c,+fsgsbase,+crc32,+invpcid,+sahf,+lzcnt,+movbe,+x87,+pku,+prfchw,+rdrnd,+rdseed,+xsave,+xsavec,+xsaveopt,+xsaves,+fxsr,+evex512", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", link_embedded = false, native_vector_size = 64 : i64, target_triple = "x86_64-unknown-linux-gnu"}>
#map = affine_map<(d0, d1, d2) -> (d0, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d2, d1)>
#map2 = affine_map<(d0, d1, d2) -> (d0, d1)>
#map3 = affine_map<()[s0, s1] -> (-s1 + (s1 ceildiv s0) * s0)>
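// #map3 yields (s1 ceildiv s0) * s0 - s1, i.e. the amount of high padding
// needed to round a dynamic dim s1 up to the next multiple of the tile size s0;
// it feeds the high[] operands of the tensor.pad ops in the set_encoding dispatches.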
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", [#executable_target_system_elf_x86_64_]>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
flow.executable private @matmul_accumulate_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32_dispatch_0 {
flow.executable.export public @matmul_accumulate_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32_dispatch_0_set_encoding_LHS_DxD workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2
flow.return %x, %y, %z : index, index, index
} attributes {encoding.round_dims_to = #iree_codegen.encoding.round_dims_to<16>}
builtin.module {
func.func @matmul_accumulate_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32_dispatch_0_set_encoding_LHS_DxD(%arg0: index, %arg1: !flow.dispatch.tensor<readonly:tensor<?x?xf32>>, %arg2: index, %arg3: index, %arg4: !flow.dispatch.tensor<writeonly:tensor<?x?xf32, #iree_linalg_ext.encoding<role = LHS, element_types = [f32, f32, f32], user_indexing_maps = [#map, #map1, #map2]>>>) {
%cst = arith.constant 0.000000e+00 : f32
%0 = flow.dispatch.workload.ordinal %arg2, 1 : index
%1 = flow.dispatch.workload.ordinal %arg3, 2 : index
%2 = flow.dispatch.workload.ordinal %arg0, 0 : index
%3 = flow.dispatch.tie_shape %arg1 : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1}
%4 = flow.dispatch.tie_shape %arg4 : !flow.dispatch.tensor<writeonly:tensor<?x?xf32, #iree_linalg_ext.encoding<role = LHS, element_types = [f32, f32, f32], user_indexing_maps = [#map, #map1, #map2]>>>{%0, %1}
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1} -> tensor<?x?xf32>
%6 = affine.apply #map3()[%2, %1]
%7 = affine.apply #map3()[%2, %0]
%padded = tensor.pad %5 low[0, 0] high[%7, %6] {
^bb0(%arg5: index, %arg6: index):
tensor.yield %cst : f32
} : tensor<?x?xf32> to tensor<?x?xf32>
%8 = iree_linalg_ext.set_encoding %padded : tensor<?x?xf32> -> tensor<?x?xf32, #iree_linalg_ext.encoding<role = LHS, element_types = [f32, f32, f32], user_indexing_maps = [#map, #map1, #map2]>>
flow.dispatch.tensor.store %8, %4, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : tensor<?x?xf32, #iree_linalg_ext.encoding<role = LHS, element_types = [f32, f32, f32], user_indexing_maps = [#map, #map1, #map2]>> -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32, #iree_linalg_ext.encoding<role = LHS, element_types = [f32, f32, f32], user_indexing_maps = [#map, #map1, #map2]>>>{%0, %1}
return
}
}
}
flow.executable private @matmul_accumulate_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32_dispatch_1 {
flow.executable.export public @matmul_accumulate_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32_dispatch_1_set_encoding_RHS_DxD workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2
flow.return %x, %y, %z : index, index, index
} attributes {encoding.round_dims_to = #iree_codegen.encoding.round_dims_to<16>}
builtin.module {
func.func @matmul_accumulate_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32_dispatch_1_set_encoding_RHS_DxD(%arg0: index, %arg1: !flow.dispatch.tensor<readonly:tensor<?x?xf32>>, %arg2: index, %arg3: index, %arg4: !flow.dispatch.tensor<writeonly:tensor<?x?xf32, #iree_linalg_ext.encoding<role = RHS, element_types = [f32, f32, f32], user_indexing_maps = [#map, #map1, #map2]>>>) {
%cst = arith.constant 0.000000e+00 : f32
%0 = flow.dispatch.workload.ordinal %arg2, 1 : index
%1 = flow.dispatch.workload.ordinal %arg3, 2 : index
%2 = flow.dispatch.workload.ordinal %arg0, 0 : index
%3 = flow.dispatch.tie_shape %arg1 : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1}
%4 = flow.dispatch.tie_shape %arg4 : !flow.dispatch.tensor<writeonly:tensor<?x?xf32, #iree_linalg_ext.encoding<role = RHS, element_types = [f32, f32, f32], user_indexing_maps = [#map, #map1, #map2]>>>{%0, %1}
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1} -> tensor<?x?xf32>
%6 = affine.apply #map3()[%2, %1]
%7 = affine.apply #map3()[%2, %0]
%padded = tensor.pad %5 low[0, 0] high[%7, %6] {
^bb0(%arg5: index, %arg6: index):
tensor.yield %cst : f32
} : tensor<?x?xf32> to tensor<?x?xf32>
%8 = iree_linalg_ext.set_encoding %padded : tensor<?x?xf32> -> tensor<?x?xf32, #iree_linalg_ext.encoding<role = RHS, element_types = [f32, f32, f32], user_indexing_maps = [#map, #map1, #map2]>>
flow.dispatch.tensor.store %8, %4, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : tensor<?x?xf32, #iree_linalg_ext.encoding<role = RHS, element_types = [f32, f32, f32], user_indexing_maps = [#map, #map1, #map2]>> -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32, #iree_linalg_ext.encoding<role = RHS, element_types = [f32, f32, f32], user_indexing_maps = [#map, #map1, #map2]>>>{%0, %1}
return
}
}
}
flow.executable private @matmul_accumulate_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32_dispatch_2 {
flow.executable.export public @matmul_accumulate_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32_dispatch_2_set_encoding_RESULT_DxD workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2
flow.return %x, %y, %z : index, index, index
} attributes {encoding.round_dims_to = #iree_codegen.encoding.round_dims_to<16>}
builtin.module {
func.func @matmul_accumulate_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32_dispatch_2_set_encoding_RESULT_DxD(%arg0: index, %arg1: !flow.dispatch.tensor<readonly:tensor<?x?xf32>>, %arg2: index, %arg3: index, %arg4: !flow.dispatch.tensor<writeonly:tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32], user_indexing_maps = [#map, #map1, #map2]>>>) {
%cst = arith.constant 0.000000e+00 : f32
%0 = flow.dispatch.workload.ordinal %arg2, 1 : index
%1 = flow.dispatch.workload.ordinal %arg3, 2 : index
%2 = flow.dispatch.workload.ordinal %arg0, 0 : index
%3 = flow.dispatch.tie_shape %arg1 : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1}
%4 = flow.dispatch.tie_shape %arg4 : !flow.dispatch.tensor<writeonly:tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32], user_indexing_maps = [#map, #map1, #map2]>>>{%0, %1}
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1} -> tensor<?x?xf32>
%6 = affine.apply #map3()[%2, %1]
%7 = affine.apply #map3()[%2, %0]
%padded = tensor.pad %5 low[0, 0] high[%7, %6] {
^bb0(%arg5: index, %arg6: index):
tensor.yield %cst : f32
} : tensor<?x?xf32> to tensor<?x?xf32>
%8 = iree_linalg_ext.set_encoding %padded : tensor<?x?xf32> -> tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32], user_indexing_maps = [#map, #map1, #map2]>>
flow.dispatch.tensor.store %8, %4, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32], user_indexing_maps = [#map, #map1, #map2]>> -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32], user_indexing_maps = [#map, #map1, #map2]>>>{%0, %1}
return
}
}
}
flow.executable private @matmul_accumulate_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32_dispatch_3 {
flow.executable.export public @matmul_accumulate_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32_dispatch_3_matmul_DxDxD_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3, %arg4, %arg5
flow.return %x, %y, %z : index, index, index
} attributes {encoding.round_dims_to = #iree_codegen.encoding.round_dims_to<16>}
builtin.module {
func.func @matmul_accumulate_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32_dispatch_3_matmul_DxDxD_f32(%arg0: !flow.dispatch.tensor<readonly:tensor<?x?xf32, #iree_linalg_ext.encoding<role = LHS, element_types = [f32, f32, f32], user_indexing_maps = [#map, #map1, #map2]>>>, %arg1: !flow.dispatch.tensor<readonly:tensor<?x?xf32, #iree_linalg_ext.encoding<role = RHS, element_types = [f32, f32, f32], user_indexing_maps = [#map, #map1, #map2]>>>, %arg2: !flow.dispatch.tensor<readwrite:tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32], user_indexing_maps = [#map, #map1, #map2]>>>, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index) {
%0 = flow.dispatch.workload.ordinal %arg3, 0 : index
%1 = flow.dispatch.workload.ordinal %arg4, 1 : index
%2 = flow.dispatch.workload.ordinal %arg5, 2 : index
%3 = flow.dispatch.workload.ordinal %arg6, 3 : index
%4 = flow.dispatch.workload.ordinal %arg7, 4 : index
%5 = flow.dispatch.workload.ordinal %arg8, 5 : index
%6 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x?xf32, #iree_linalg_ext.encoding<role = LHS, element_types = [f32, f32, f32], user_indexing_maps = [#map, #map1, #map2]>>>{%0, %1}
%7 = flow.dispatch.tie_shape %arg1 : !flow.dispatch.tensor<readonly:tensor<?x?xf32, #iree_linalg_ext.encoding<role = RHS, element_types = [f32, f32, f32], user_indexing_maps = [#map, #map1, #map2]>>>{%2, %3}
%8 = flow.dispatch.tie_shape %arg2 : !flow.dispatch.tensor<readwrite:tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32], user_indexing_maps = [#map, #map1, #map2]>>>{%4, %5}
%9 = flow.dispatch.tensor.load %6, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32, #iree_linalg_ext.encoding<role = LHS, element_types = [f32, f32, f32], user_indexing_maps = [#map, #map1, #map2]>>>{%0, %1} -> tensor<?x?xf32, #iree_linalg_ext.encoding<role = LHS, element_types = [f32, f32, f32], user_indexing_maps = [#map, #map1, #map2]>>
%10 = flow.dispatch.tensor.load %7, offsets = [0, 0], sizes = [%2, %3], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32, #iree_linalg_ext.encoding<role = RHS, element_types = [f32, f32, f32], user_indexing_maps = [#map, #map1, #map2]>>>{%2, %3} -> tensor<?x?xf32, #iree_linalg_ext.encoding<role = RHS, element_types = [f32, f32, f32], user_indexing_maps = [#map, #map1, #map2]>>
%11 = flow.dispatch.tensor.load %8, offsets = [0, 0], sizes = [%4, %5], strides = [1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32], user_indexing_maps = [#map, #map1, #map2]>>>{%4, %5} -> tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32], user_indexing_maps = [#map, #map1, #map2]>>
%12 = linalg.matmul ins(%9, %10 : tensor<?x?xf32, #iree_linalg_ext.encoding<role = LHS, element_types = [f32, f32, f32], user_indexing_maps = [#map, #map1, #map2]>>, tensor<?x?xf32, #iree_linalg_ext.encoding<role = RHS, element_types = [f32, f32, f32], user_indexing_maps = [#map, #map1, #map2]>>) outs(%11 : tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32], user_indexing_maps = [#map, #map1, #map2]>>) -> tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32], user_indexing_maps = [#map, #map1, #map2]>>
flow.dispatch.tensor.store %12, %8, offsets = [0, 0], sizes = [%4, %5], strides = [1, 1] : tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32], user_indexing_maps = [#map, #map1, #map2]>> -> !flow.dispatch.tensor<readwrite:tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32], user_indexing_maps = [#map, #map1, #map2]>>>{%4, %5}
return
}
}
}
flow.executable private @matmul_accumulate_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32_dispatch_4 {
flow.executable.export public @matmul_accumulate_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32_dispatch_4_unset_encoding_RESULT_DxD workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1
flow.return %x, %y, %z : index, index, index
} attributes {encoding.round_dims_to = #iree_codegen.encoding.round_dims_to<16>}
builtin.module {
func.func @matmul_accumulate_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32_dispatch_4_unset_encoding_RESULT_DxD(%arg0: !flow.dispatch.tensor<readonly:tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32], user_indexing_maps = [#map, #map1, #map2]>>>, %arg1: index, %arg2: index, %arg3: !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>) {
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index
%2 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32], user_indexing_maps = [#map, #map1, #map2]>>>{%0, %1}
%3 = flow.dispatch.tie_shape %arg3 : !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%0, %1}
%4 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32], user_indexing_maps = [#map, #map1, #map2]>>>{%0, %1} -> tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32], user_indexing_maps = [#map, #map1, #map2]>>
%5 = iree_linalg_ext.unset_encoding %4 : tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32], user_indexing_maps = [#map, #map1, #map2]>> -> tensor<?x?xf32>
%extracted_slice = tensor.extract_slice %5[0, 0] [%0, %1] [1, 1] : tensor<?x?xf32> to tensor<?x?xf32>
flow.dispatch.tensor.store %extracted_slice, %3, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%0, %1}
return
}
}
}
util.func public @matmul_accumulate_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_accumulate_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} {
%c16 = arith.constant 16 : index
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xf32>{%0, %1}
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xf32>{%3, %4}
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xf32>{%6, %7}
%9 = flow.dispatch @matmul_accumulate_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32_dispatch_0::@matmul_accumulate_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32_dispatch_0_set_encoding_LHS_DxD[%c16, %0, %1](%c16, %2, %0, %1) {encoding.round_dims_to = #iree_codegen.encoding.round_dims_to<16>} : (index, tensor<?x?xf32>{%0, %1}, index, index) -> tensor<?x?xf32, #iree_linalg_ext.encoding<role = LHS, element_types = [f32, f32, f32], user_indexing_maps = [#map, #map1, #map2]>>{%0, %1}
%10 = flow.dispatch @matmul_accumulate_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32_dispatch_1::@matmul_accumulate_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32_dispatch_1_set_encoding_RHS_DxD[%c16, %3, %4](%c16, %5, %3, %4) {encoding.round_dims_to = #iree_codegen.encoding.round_dims_to<16>} : (index, tensor<?x?xf32>{%3, %4}, index, index) -> tensor<?x?xf32, #iree_linalg_ext.encoding<role = RHS, element_types = [f32, f32, f32], user_indexing_maps = [#map, #map1, #map2]>>{%3, %4}
%11 = flow.dispatch @matmul_accumulate_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32_dispatch_2::@matmul_accumulate_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32_dispatch_2_set_encoding_RESULT_DxD[%c16, %6, %7](%c16, %8, %6, %7) {encoding.round_dims_to = #iree_codegen.encoding.round_dims_to<16>} : (index, tensor<?x?xf32>{%6, %7}, index, index) -> tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32], user_indexing_maps = [#map, #map1, #map2]>>{%6, %7}
%12 = flow.dispatch @matmul_accumulate_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32_dispatch_3::@matmul_accumulate_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32_dispatch_3_matmul_DxDxD_f32[%0, %1, %3, %4, %6, %7](%9, %10, %11, %0, %1, %3, %4, %6, %7) {encoding.round_dims_to = #iree_codegen.encoding.round_dims_to<16>} : (tensor<?x?xf32, #iree_linalg_ext.encoding<role = LHS, element_types = [f32, f32, f32], user_indexing_maps = [#map, #map1, #map2]>>{%0, %1}, tensor<?x?xf32, #iree_linalg_ext.encoding<role = RHS, element_types = [f32, f32, f32], user_indexing_maps = [#map, #map1, #map2]>>{%3, %4}, tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32], user_indexing_maps = [#map, #map1, #map2]>>{%6, %7}, index, index, index, index, index, index) -> %11{%6, %7}
%13 = flow.dispatch @matmul_accumulate_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32_dispatch_4::@matmul_accumulate_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32_dispatch_4_unset_encoding_RESULT_DxD[%6, %7](%12, %6, %7) {encoding.round_dims_to = #iree_codegen.encoding.round_dims_to<16>} : (tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32], user_indexing_maps = [#map, #map1, #map2]>>{%6, %7}, index, index) -> tensor<?x?xf32>{%6, %7}
%14 = hal.tensor.export %13 "output0" : tensor<?x?xf32>{%6, %7} -> !hal.buffer_view
util.return %14 : !hal.buffer_view
}
}
// -----// IR Dump Before VerifyLoweringToTensorsPass (iree-stream-verify-lowering-to-tensors) //----- //
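// Same module after conversion to the stream dialect: flow.executable becomes
// stream.executable, dispatch operands are bound through stream.binding.subspan,
// the host code imports/exports via stream.tensor.import/export with explicit
// hal.buffer_view.assert checks, and each flow.dispatch becomes a
// stream.async.dispatch sized by stream.tensor.sizeof.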
#executable_target_system_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "system-elf-x86_64", {cpu = "cascadelake", cpu_features = "+cmov,+mmx,+popcnt,+sse,+sse2,+sse3,+ssse3,+sse4.1,+sse4.2,+avx,+avx2,+fma,+avx512f,+bmi,+bmi2,+aes,+pclmul,+avx512vl,+avx512bw,+avx512dq,+avx512cd,+avx512vnni,+adx,+clflushopt,+clwb,+cx16,+cx8,+f16c,+fsgsbase,+crc32,+invpcid,+sahf,+lzcnt,+movbe,+x87,+pku,+prfchw,+rdrnd,+rdseed,+xsave,+xsavec,+xsaveopt,+xsaves,+fxsr,+evex512", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", link_embedded = false, native_vector_size = 64 : i64, target_triple = "x86_64-unknown-linux-gnu"}>
#map = affine_map<(d0, d1, d2) -> (d0, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d2, d1)>
#map2 = affine_map<(d0, d1, d2) -> (d0, d1)>
#map3 = affine_map<()[s0, s1] -> (-s1 + (s1 ceildiv s0) * s0)>
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", [#executable_target_system_elf_x86_64_]>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
stream.executable private @matmul_accumulate_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32_dispatch_0 {
stream.executable.export public @matmul_accumulate_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32_dispatch_0_set_encoding_LHS_DxD workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2
stream.return %x, %y, %z : index, index, index
} attributes {encoding.round_dims_to = #iree_codegen.encoding.round_dims_to<16>}
builtin.module {
func.func @matmul_accumulate_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32_dispatch_0_set_encoding_LHS_DxD(%arg0: index, %arg1: !stream.binding, %arg2: index, %arg3: index, %arg4: !stream.binding) {
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%0 = flow.dispatch.workload.ordinal %arg2, 1 : index
%1 = flow.dispatch.workload.ordinal %arg3, 2 : index
%2 = flow.dispatch.workload.ordinal %arg0, 0 : index
%3 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1}
%4 = stream.binding.subspan %arg4[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32, #iree_linalg_ext.encoding<role = LHS, element_types = [f32, f32, f32], user_indexing_maps = [#map, #map1, #map2]>>>{%0, %1}
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1} -> tensor<?x?xf32>
%6 = affine.apply #map3()[%2, %1]
%7 = affine.apply #map3()[%2, %0]
%padded = tensor.pad %5 low[0, 0] high[%7, %6] {
^bb0(%arg5: index, %arg6: index):
tensor.yield %cst : f32
} : tensor<?x?xf32> to tensor<?x?xf32>
%8 = iree_linalg_ext.set_encoding %padded : tensor<?x?xf32> -> tensor<?x?xf32, #iree_linalg_ext.encoding<role = LHS, element_types = [f32, f32, f32], user_indexing_maps = [#map, #map1, #map2]>>
flow.dispatch.tensor.store %8, %4, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : tensor<?x?xf32, #iree_linalg_ext.encoding<role = LHS, element_types = [f32, f32, f32], user_indexing_maps = [#map, #map1, #map2]>> -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32, #iree_linalg_ext.encoding<role = LHS, element_types = [f32, f32, f32], user_indexing_maps = [#map, #map1, #map2]>>>{%0, %1}
return
}
}
}
stream.executable private @matmul_accumulate_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32_dispatch_1 {
stream.executable.export public @matmul_accumulate_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32_dispatch_1_set_encoding_RHS_DxD workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2
stream.return %x, %y, %z : index, index, index
} attributes {encoding.round_dims_to = #iree_codegen.encoding.round_dims_to<16>}
builtin.module {
func.func @matmul_accumulate_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32_dispatch_1_set_encoding_RHS_DxD(%arg0: index, %arg1: !stream.binding, %arg2: index, %arg3: index, %arg4: !stream.binding) {
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%0 = flow.dispatch.workload.ordinal %arg2, 1 : index
%1 = flow.dispatch.workload.ordinal %arg3, 2 : index
%2 = flow.dispatch.workload.ordinal %arg0, 0 : index
%3 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1}
%4 = stream.binding.subspan %arg4[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32, #iree_linalg_ext.encoding<role = RHS, element_types = [f32, f32, f32], user_indexing_maps = [#map, #map1, #map2]>>>{%0, %1}
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1} -> tensor<?x?xf32>
%6 = affine.apply #map3()[%2, %1]
%7 = affine.apply #map3()[%2, %0]
%padded = tensor.pad %5 low[0, 0] high[%7, %6] {
^bb0(%arg5: index, %arg6: index):
tensor.yield %cst : f32
} : tensor<?x?xf32> to tensor<?x?xf32>
%8 = iree_linalg_ext.set_encoding %padded : tensor<?x?xf32> -> tensor<?x?xf32, #iree_linalg_ext.encoding<role = RHS, element_types = [f32, f32, f32], user_indexing_maps = [#map, #map1, #map2]>>
flow.dispatch.tensor.store %8, %4, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : tensor<?x?xf32, #iree_linalg_ext.encoding<role = RHS, element_types = [f32, f32, f32], user_indexing_maps = [#map, #map1, #map2]>> -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32, #iree_linalg_ext.encoding<role = RHS, element_types = [f32, f32, f32], user_indexing_maps = [#map, #map1, #map2]>>>{%0, %1}
return
}
}
}
stream.executable private @matmul_accumulate_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32_dispatch_2 {
stream.executable.export public @matmul_accumulate_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32_dispatch_2_set_encoding_RESULT_DxD workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2
stream.return %x, %y, %z : index, index, index
} attributes {encoding.round_dims_to = #iree_codegen.encoding.round_dims_to<16>}
builtin.module {
func.func @matmul_accumulate_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32_dispatch_2_set_encoding_RESULT_DxD(%arg0: index, %arg1: !stream.binding, %arg2: index, %arg3: index, %arg4: !stream.binding) {
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%0 = flow.dispatch.workload.ordinal %arg2, 1 : index
%1 = flow.dispatch.workload.ordinal %arg3, 2 : index
%2 = flow.dispatch.workload.ordinal %arg0, 0 : index
%3 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1}
%4 = stream.binding.subspan %arg4[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32], user_indexing_maps = [#map, #map1, #map2]>>>{%0, %1}
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1} -> tensor<?x?xf32>
%6 = affine.apply #map3()[%2, %1]
%7 = affine.apply #map3()[%2, %0]
%padded = tensor.pad %5 low[0, 0] high[%7, %6] {
^bb0(%arg5: index, %arg6: index):
tensor.yield %cst : f32
} : tensor<?x?xf32> to tensor<?x?xf32>
%8 = iree_linalg_ext.set_encoding %padded : tensor<?x?xf32> -> tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32], user_indexing_maps = [#map, #map1, #map2]>>
flow.dispatch.tensor.store %8, %4, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32], user_indexing_maps = [#map, #map1, #map2]>> -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32], user_indexing_maps = [#map, #map1, #map2]>>>{%0, %1}
return
}
}
}
stream.executable private @matmul_accumulate_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32_dispatch_3 {
stream.executable.export public @matmul_accumulate_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32_dispatch_3_matmul_DxDxD_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3, %arg4, %arg5
stream.return %x, %y, %z : index, index, index
} attributes {encoding.round_dims_to = #iree_codegen.encoding.round_dims_to<16>}
builtin.module {
func.func @matmul_accumulate_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32_dispatch_3_matmul_DxDxD_f32(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index) {
%c0 = arith.constant 0 : index
%0 = flow.dispatch.workload.ordinal %arg3, 0 : index
%1 = flow.dispatch.workload.ordinal %arg4, 1 : index
%2 = flow.dispatch.workload.ordinal %arg5, 2 : index
%3 = flow.dispatch.workload.ordinal %arg6, 3 : index
%4 = flow.dispatch.workload.ordinal %arg7, 4 : index
%5 = flow.dispatch.workload.ordinal %arg8, 5 : index
%6 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xf32, #iree_linalg_ext.encoding<role = LHS, element_types = [f32, f32, f32], user_indexing_maps = [#map, #map1, #map2]>>>{%0, %1}
%7 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xf32, #iree_linalg_ext.encoding<role = RHS, element_types = [f32, f32, f32], user_indexing_maps = [#map, #map1, #map2]>>>{%2, %3}
%8 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readwrite:tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32], user_indexing_maps = [#map, #map1, #map2]>>>{%4, %5}
%9 = flow.dispatch.tensor.load %6, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32, #iree_linalg_ext.encoding<role = LHS, element_types = [f32, f32, f32], user_indexing_maps = [#map, #map1, #map2]>>>{%0, %1} -> tensor<?x?xf32, #iree_linalg_ext.encoding<role = LHS, element_types = [f32, f32, f32], user_indexing_maps = [#map, #map1, #map2]>>
%10 = flow.dispatch.tensor.load %7, offsets = [0, 0], sizes = [%2, %3], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32, #iree_linalg_ext.encoding<role = RHS, element_types = [f32, f32, f32], user_indexing_maps = [#map, #map1, #map2]>>>{%2, %3} -> tensor<?x?xf32, #iree_linalg_ext.encoding<role = RHS, element_types = [f32, f32, f32], user_indexing_maps = [#map, #map1, #map2]>>
%11 = flow.dispatch.tensor.load %8, offsets = [0, 0], sizes = [%4, %5], strides = [1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32], user_indexing_maps = [#map, #map1, #map2]>>>{%4, %5} -> tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32], user_indexing_maps = [#map, #map1, #map2]>>
%12 = linalg.matmul ins(%9, %10 : tensor<?x?xf32, #iree_linalg_ext.encoding<role = LHS, element_types = [f32, f32, f32], user_indexing_maps = [#map, #map1, #map2]>>, tensor<?x?xf32, #iree_linalg_ext.encoding<role = RHS, element_types = [f32, f32, f32], user_indexing_maps = [#map, #map1, #map2]>>) outs(%11 : tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32], user_indexing_maps = [#map, #map1, #map2]>>) -> tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32], user_indexing_maps = [#map, #map1, #map2]>>
flow.dispatch.tensor.store %12, %8, offsets = [0, 0], sizes = [%4, %5], strides = [1, 1] : tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32], user_indexing_maps = [#map, #map1, #map2]>> -> !flow.dispatch.tensor<readwrite:tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32], user_indexing_maps = [#map, #map1, #map2]>>>{%4, %5}
return
}
}
}
stream.executable private @matmul_accumulate_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32_dispatch_4 {
stream.executable.export public @matmul_accumulate_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32_dispatch_4_unset_encoding_RESULT_DxD workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1
stream.return %x, %y, %z : index, index, index
} attributes {encoding.round_dims_to = #iree_codegen.encoding.round_dims_to<16>}
builtin.module {
func.func @matmul_accumulate_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32_dispatch_4_unset_encoding_RESULT_DxD(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: !stream.binding) {
%c0 = arith.constant 0 : index
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index
%2 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32], user_indexing_maps = [#map, #map1, #map2]>>>{%0, %1}
%3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%0, %1}
%4 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32], user_indexing_maps = [#map, #map1, #map2]>>>{%0, %1} -> tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32], user_indexing_maps = [#map, #map1, #map2]>>
%5 = iree_linalg_ext.unset_encoding %4 : tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32], user_indexing_maps = [#map, #map1, #map2]>> -> tensor<?x?xf32>
%extracted_slice = tensor.extract_slice %5[0, 0] [%0, %1] [1, 1] : tensor<?x?xf32> to tensor<?x?xf32>
flow.dispatch.tensor.store %extracted_slice, %3, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%0, %1}
return
}
}
}
util.func public @matmul_accumulate_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_accumulate_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} {
%c16 = arith.constant 16 : index
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
%element_type_f32 = hal.element_type<f32> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1]) type(%element_type_f32) encoding(%dense_row_major)
%2 = stream.tensor.sizeof tensor<?x?xf32>{%0, %1} : index
%3 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<?x?xf32>{%0, %1} in !stream.resource<external>{%2}
%4 = stream.async.transfer %3 : !stream.resource<external>{%2} -> !stream.resource<*>{%2}
%5 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index
%6 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index
%element_type_f32_0 = hal.element_type<f32> : i32
%dense_row_major_1 = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%5, %6]) type(%element_type_f32_0) encoding(%dense_row_major_1)
%7 = stream.tensor.sizeof tensor<?x?xf32>{%5, %6} : index
%8 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<?x?xf32>{%5, %6} in !stream.resource<external>{%7}
%9 = stream.async.transfer %8 : !stream.resource<external>{%7} -> !stream.resource<*>{%7}
%10 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index
%11 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index
%element_type_f32_2 = hal.element_type<f32> : i32
%dense_row_major_3 = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("input2") shape([%10, %11]) type(%element_type_f32_2) encoding(%dense_row_major_3)
%12 = stream.tensor.sizeof tensor<?x?xf32>{%10, %11} : index
%13 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<?x?xf32>{%10, %11} in !stream.resource<external>{%12}
%14 = stream.async.transfer %13 : !stream.resource<external>{%12} -> !stream.resource<*>{%12}
%c0 = arith.constant 0 : index
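// stream.tensor.sizeof below computes the byte size of each encoded resource;
// the encoding.round_dims_to = 16 hint presumably lets this size be computed
// conservatively by rounding the dynamic dims up to multiples of 16.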
%15 = stream.tensor.sizeof tensor<?x?xf32, #iree_linalg_ext.encoding<role = LHS, element_types = [f32, f32, f32], user_indexing_maps = [#map, #map1, #map2]>>{%0, %1} {encoding.round_dims_to = #iree_codegen.encoding.round_dims_to<16>} : index
%16 = stream.async.dispatch @matmul_accumulate_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32_dispatch_0::@matmul_accumulate_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32_dispatch_0_set_encoding_LHS_DxD[%c16, %0, %1](%c16, %4[%c0 to %2 for %2], %0, %1) {encoding.round_dims_to = #iree_codegen.encoding.round_dims_to<16>} : (index, !stream.resource<*>{%2}, index, index) -> !stream.resource<*>{%15}
%c0_4 = arith.constant 0 : index
%17 = stream.tensor.sizeof tensor<?x?xf32, #iree_linalg_ext.encoding<role = RHS, element_types = [f32, f32, f32], user_indexing_maps = [#map, #map1, #map2]>>{%5, %6} {encoding.round_dims_to = #iree_codegen.encoding.round_dims_to<16>} : index
%18 = stream.async.dispatch @matmul_accumulate_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32_dispatch_1::@matmul_accumulate_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32_dispatch_1_set_encoding_RHS_DxD[%c16, %5, %6](%c16, %9[%c0_4 to %7 for %7], %5, %6) {encoding.round_dims_to = #iree_codegen.encoding.round_dims_to<16>} : (index, !stream.resource<*>{%7}, index, index) -> !stream.resource<*>{%17}
%c0_5 = arith.constant 0 : index
%19 = stream.tensor.sizeof tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32], user_indexing_maps = [#map, #map1, #map2]>>{%10, %11} {encoding.round_dims_to = #iree_codegen.encoding.round_dims_to<16>} : index
%20 = stream.async.dispatch @matmul_accumulate_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32_dispatch_2::@matmul_accumulate_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32_dispatch_2_set_encoding_RESULT_DxD[%c16, %10, %11](%c16, %14[%c0_5 to %12 for %12], %10, %11) {encoding.round_dims_to = #iree_codegen.encoding.round_dims_to<16>} : (index, !stream.resource<*>{%12}, index, index) -> !stream.resource<*>{%19}
%c0_6 = arith.constant 0 : index
%21 = stream.async.dispatch @matmul_accumulate_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32_dispatch_3::@matmul_accumulate_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32_dispatch_3_matmul_DxDxD_f32[%0, %1, %5, %6, %10, %11](%16[%c0_6 to %15 for %15], %18[%c0_6 to %17 for %17], %20[%c0_6 to %19 for %19], %0, %1, %5, %6, %10, %11) {encoding.round_dims_to = #iree_codegen.encoding.round_dims_to<16>} : (!stream.resource<*>{%15}, !stream.resource<*>{%17}, !stream.resource<*>{%19}, index, index, index, index, index, index) -> %20{%19}
%c0_7 = arith.constant 0 : index
%22 = stream.tensor.sizeof tensor<?x?xf32>{%10, %11} {encoding.round_dims_to = #iree_codegen.encoding.round_dims_to<16>} : index
%23 = stream.async.dispatch @matmul_accumulate_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32_dispatch_4::@matmul_accumulate_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32_dispatch_4_unset_encoding_RESULT_DxD[%10, %11](%21[%c0_7 to %19 for %19], %10, %11) {encoding.round_dims_to = #iree_codegen.encoding.round_dims_to<16>} : (!stream.resource<*>{%19}, index, index) -> !stream.resource<*>{%22}
%24 = stream.async.transfer %23 : !stream.resource<*>{%22} -> !stream.resource<external>{%22}
%25 = stream.tensor.export %24 : tensor<?x?xf32>{%10, %11} in !stream.resource<external>{%22} -> !hal.buffer_view
util.return %25 : !hal.buffer_view
}
}