-
-
Save hanhanW/8d4f77c6903ca773c6f60098b5e541b1 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// -----// IR Dump Before SetEncodingHintOnDispatches (iree-flow-set-encoding-hint-on-dispatches) //----- // | |
util.func public @matmul_accumulate_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_accumulate_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} { | |
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index | |
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index | |
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xf32>{%0, %1} | |
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index | |
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index | |
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xf32>{%3, %4} | |
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index | |
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index | |
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xf32>{%6, %7} | |
%9:2 = iree_linalg_ext.upper_bound_tile_size tensor<?x?xf32, #iree_linalg_ext.encoding<role = LHS, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>> -> index, index | |
%10 = affine.apply affine_map<()[s0, s1] -> ((s1 ceildiv s0) * s0)>()[%9#1, %1] | |
%11 = affine.apply affine_map<()[s0, s1] -> ((s1 ceildiv s0) * s0)>()[%9#0, %0] | |
%12 = flow.dispatch.workgroups[%9#1, %9#0, %0, %1, %11, %10](%9#1, %9#0, %2, %0, %1, %11, %10) : (index, index, tensor<?x?xf32>{%0, %1}, index, index, index, index) -> tensor<?x?xf32, #iree_linalg_ext.encoding<role = LHS, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>{%11, %10} = | |
(%arg3: index, %arg4: index, %arg5: !flow.dispatch.tensor<readonly:tensor<?x?xf32>>, %arg6: index, %arg7: index, %arg8: index, %arg9: index, %arg10: !flow.dispatch.tensor<writeonly:tensor<?x?xf32, #iree_linalg_ext.encoding<role = LHS, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>>) { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%24 = flow.dispatch.workload.ordinal %arg6, 2 : index | |
%25 = flow.dispatch.workload.ordinal %arg7, 3 : index | |
%26 = flow.dispatch.workload.ordinal %arg8, 4 : index | |
%27 = flow.dispatch.workload.ordinal %arg9, 5 : index | |
%28 = flow.dispatch.tie_shape %arg5 : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%24, %25} | |
%29 = flow.dispatch.tie_shape %arg10 : !flow.dispatch.tensor<writeonly:tensor<?x?xf32, #iree_linalg_ext.encoding<role = LHS, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>>{%26, %27} | |
%30 = flow.dispatch.workload.ordinal %arg3, 0 : index | |
%31 = flow.dispatch.workload.ordinal %arg4, 1 : index | |
%32 = flow.dispatch.tensor.load %28, offsets = [0, 0], sizes = [%24, %25], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%24, %25} -> tensor<?x?xf32> | |
%33 = affine.apply affine_map<()[s0, s1] -> (-s1 + (s1 ceildiv s0) * s0)>()[%30, %25] | |
%34 = affine.apply affine_map<()[s0, s1] -> (-s1 + (s1 ceildiv s0) * s0)>()[%31, %24] | |
%padded = tensor.pad %32 low[0, 0] high[%34, %33] { | |
^bb0(%arg11: index, %arg12: index): | |
tensor.yield %cst : f32 | |
} : tensor<?x?xf32> to tensor<?x?xf32> | |
%35 = iree_linalg_ext.set_encoding %padded : tensor<?x?xf32> -> tensor<?x?xf32, #iree_linalg_ext.encoding<role = LHS, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>> | |
flow.dispatch.tensor.store %35, %29, offsets = [0, 0], sizes = [%26, %27], strides = [1, 1] : tensor<?x?xf32, #iree_linalg_ext.encoding<role = LHS, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>> -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32, #iree_linalg_ext.encoding<role = LHS, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>>{%26, %27} | |
flow.return | |
} count(%arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg3, %arg4, %arg5, %arg6, %arg7, %arg8 | |
flow.return %x, %y, %z : index, index, index | |
} | |
%13:2 = iree_linalg_ext.upper_bound_tile_size tensor<?x?xf32, #iree_linalg_ext.encoding<role = RHS, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>> -> index, index | |
%14 = affine.apply affine_map<()[s0, s1] -> ((s1 ceildiv s0) * s0)>()[%13#1, %4] | |
%15 = affine.apply affine_map<()[s0, s1] -> ((s1 ceildiv s0) * s0)>()[%13#0, %3] | |
%16 = flow.dispatch.workgroups[%13#1, %13#0, %3, %4, %15, %14](%13#1, %13#0, %5, %3, %4, %15, %14) : (index, index, tensor<?x?xf32>{%3, %4}, index, index, index, index) -> tensor<?x?xf32, #iree_linalg_ext.encoding<role = RHS, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>{%15, %14} = | |
(%arg3: index, %arg4: index, %arg5: !flow.dispatch.tensor<readonly:tensor<?x?xf32>>, %arg6: index, %arg7: index, %arg8: index, %arg9: index, %arg10: !flow.dispatch.tensor<writeonly:tensor<?x?xf32, #iree_linalg_ext.encoding<role = RHS, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>>) { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%24 = flow.dispatch.workload.ordinal %arg6, 2 : index | |
%25 = flow.dispatch.workload.ordinal %arg7, 3 : index | |
%26 = flow.dispatch.workload.ordinal %arg8, 4 : index | |
%27 = flow.dispatch.workload.ordinal %arg9, 5 : index | |
%28 = flow.dispatch.tie_shape %arg5 : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%24, %25} | |
%29 = flow.dispatch.tie_shape %arg10 : !flow.dispatch.tensor<writeonly:tensor<?x?xf32, #iree_linalg_ext.encoding<role = RHS, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>>{%26, %27} | |
%30 = flow.dispatch.workload.ordinal %arg3, 0 : index | |
%31 = flow.dispatch.workload.ordinal %arg4, 1 : index | |
%32 = flow.dispatch.tensor.load %28, offsets = [0, 0], sizes = [%24, %25], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%24, %25} -> tensor<?x?xf32> | |
%33 = affine.apply affine_map<()[s0, s1] -> (-s1 + (s1 ceildiv s0) * s0)>()[%30, %25] | |
%34 = affine.apply affine_map<()[s0, s1] -> (-s1 + (s1 ceildiv s0) * s0)>()[%31, %24] | |
%padded = tensor.pad %32 low[0, 0] high[%34, %33] { | |
^bb0(%arg11: index, %arg12: index): | |
tensor.yield %cst : f32 | |
} : tensor<?x?xf32> to tensor<?x?xf32> | |
%35 = iree_linalg_ext.set_encoding %padded : tensor<?x?xf32> -> tensor<?x?xf32, #iree_linalg_ext.encoding<role = RHS, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>> | |
flow.dispatch.tensor.store %35, %29, offsets = [0, 0], sizes = [%26, %27], strides = [1, 1] : tensor<?x?xf32, #iree_linalg_ext.encoding<role = RHS, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>> -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32, #iree_linalg_ext.encoding<role = RHS, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>>{%26, %27} | |
flow.return | |
} count(%arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg3, %arg4, %arg5, %arg6, %arg7, %arg8 | |
flow.return %x, %y, %z : index, index, index | |
} | |
%17:2 = iree_linalg_ext.upper_bound_tile_size tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>> -> index, index | |
%18 = affine.apply affine_map<()[s0, s1] -> ((s1 ceildiv s0) * s0)>()[%17#0, %6] | |
%19 = affine.apply affine_map<()[s0, s1] -> ((s1 ceildiv s0) * s0)>()[%17#1, %7] | |
%20 = flow.dispatch.workgroups[%17#1, %17#0, %6, %7, %18, %19](%17#1, %17#0, %8, %6, %7, %18, %19) : (index, index, tensor<?x?xf32>{%6, %7}, index, index, index, index) -> tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>{%18, %19} = | |
(%arg3: index, %arg4: index, %arg5: !flow.dispatch.tensor<readonly:tensor<?x?xf32>>, %arg6: index, %arg7: index, %arg8: index, %arg9: index, %arg10: !flow.dispatch.tensor<writeonly:tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>>) { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%24 = flow.dispatch.workload.ordinal %arg6, 2 : index | |
%25 = flow.dispatch.workload.ordinal %arg7, 3 : index | |
%26 = flow.dispatch.workload.ordinal %arg8, 4 : index | |
%27 = flow.dispatch.workload.ordinal %arg9, 5 : index | |
%28 = flow.dispatch.tie_shape %arg5 : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%24, %25} | |
%29 = flow.dispatch.tie_shape %arg10 : !flow.dispatch.tensor<writeonly:tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>>{%26, %27} | |
%30 = flow.dispatch.workload.ordinal %arg3, 0 : index | |
%31 = flow.dispatch.workload.ordinal %arg4, 1 : index | |
%32 = flow.dispatch.tensor.load %28, offsets = [0, 0], sizes = [%24, %25], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%24, %25} -> tensor<?x?xf32> | |
%33 = affine.apply affine_map<()[s0, s1] -> (-s1 + (s1 ceildiv s0) * s0)>()[%30, %25] | |
%34 = affine.apply affine_map<()[s0, s1] -> (-s1 + (s1 ceildiv s0) * s0)>()[%31, %24] | |
%padded = tensor.pad %32 low[0, 0] high[%34, %33] { | |
^bb0(%arg11: index, %arg12: index): | |
tensor.yield %cst : f32 | |
} : tensor<?x?xf32> to tensor<?x?xf32> | |
%35 = iree_linalg_ext.set_encoding %padded : tensor<?x?xf32> -> tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>> | |
flow.dispatch.tensor.store %35, %29, offsets = [0, 0], sizes = [%26, %27], strides = [1, 1] : tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>> -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>>{%26, %27} | |
flow.return | |
} count(%arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg3, %arg4, %arg5, %arg6, %arg7, %arg8 | |
flow.return %x, %y, %z : index, index, index | |
} | |
%21 = flow.dispatch.workgroups[%11, %10, %15, %14, %18, %19](%12, %16, %20, %11, %10, %15, %14, %18, %19) : (tensor<?x?xf32, #iree_linalg_ext.encoding<role = LHS, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>{%11, %10}, tensor<?x?xf32, #iree_linalg_ext.encoding<role = RHS, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>{%15, %14}, tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>{%18, %19}, index, index, index, index, index, index) -> %20{%18, %19} = | |
(%arg3: !flow.dispatch.tensor<readonly:tensor<?x?xf32, #iree_linalg_ext.encoding<role = LHS, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>>, %arg4: !flow.dispatch.tensor<readonly:tensor<?x?xf32, #iree_linalg_ext.encoding<role = RHS, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>>, %arg5: !flow.dispatch.tensor<readwrite:tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>>, %arg6: index, %arg7: index, %arg8: index, %arg9: index, %arg10: index, %arg11: index) { | |
%24 = flow.dispatch.workload.ordinal %arg6, 0 : index | |
%25 = flow.dispatch.workload.ordinal %arg7, 1 : index | |
%26 = flow.dispatch.workload.ordinal %arg8, 2 : index | |
%27 = flow.dispatch.workload.ordinal %arg9, 3 : index | |
%28 = flow.dispatch.workload.ordinal %arg10, 4 : index | |
%29 = flow.dispatch.workload.ordinal %arg11, 5 : index | |
%30 = flow.dispatch.tie_shape %arg3 : !flow.dispatch.tensor<readonly:tensor<?x?xf32, #iree_linalg_ext.encoding<role = LHS, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>>{%24, %25} | |
%31 = flow.dispatch.tie_shape %arg4 : !flow.dispatch.tensor<readonly:tensor<?x?xf32, #iree_linalg_ext.encoding<role = RHS, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>>{%26, %27} | |
%32 = flow.dispatch.tie_shape %arg5 : !flow.dispatch.tensor<readwrite:tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>>{%28, %29} | |
%33 = flow.dispatch.tensor.load %30, offsets = [0, 0], sizes = [%24, %25], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32, #iree_linalg_ext.encoding<role = LHS, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>>{%24, %25} -> tensor<?x?xf32, #iree_linalg_ext.encoding<role = LHS, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>> | |
%34 = flow.dispatch.tensor.load %31, offsets = [0, 0], sizes = [%26, %27], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32, #iree_linalg_ext.encoding<role = RHS, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>>{%26, %27} -> tensor<?x?xf32, #iree_linalg_ext.encoding<role = RHS, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>> | |
%35 = flow.dispatch.tensor.load %32, offsets = [0, 0], sizes = [%28, %29], strides = [1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>>{%28, %29} -> tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>> | |
%36 = linalg.matmul ins(%33, %34 : tensor<?x?xf32, #iree_linalg_ext.encoding<role = LHS, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>, tensor<?x?xf32, #iree_linalg_ext.encoding<role = RHS, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>) outs(%35 : tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>) -> tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>> | |
flow.dispatch.tensor.store %36, %32, offsets = [0, 0], sizes = [%28, %29], strides = [1, 1] : tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>> -> !flow.dispatch.tensor<readwrite:tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>>{%28, %29} | |
flow.return | |
} count(%arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg3, %arg4, %arg5, %arg6, %arg7, %arg8 | |
flow.return %x, %y, %z : index, index, index | |
} | |
%22 = flow.dispatch.workgroups[%18, %19, %6, %7](%21, %18, %19, %6, %7) : (tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>{%18, %19}, index, index, index, index) -> tensor<?x?xf32>{%6, %7} = | |
(%arg3: !flow.dispatch.tensor<readonly:tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>) { | |
%24 = flow.dispatch.workload.ordinal %arg4, 0 : index | |
%25 = flow.dispatch.workload.ordinal %arg5, 1 : index | |
%26 = flow.dispatch.workload.ordinal %arg6, 2 : index | |
%27 = flow.dispatch.workload.ordinal %arg7, 3 : index | |
%28 = flow.dispatch.tie_shape %arg3 : !flow.dispatch.tensor<readonly:tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>>{%24, %25} | |
%29 = flow.dispatch.tie_shape %arg8 : !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%26, %27} | |
%30 = flow.dispatch.tensor.load %28, offsets = [0, 0], sizes = [%24, %25], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>>{%24, %25} -> tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>> | |
%31 = iree_linalg_ext.unset_encoding %30 : tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>> -> tensor<?x?xf32> | |
%extracted_slice = tensor.extract_slice %31[0, 0] [%26, %27] [1, 1] : tensor<?x?xf32> to tensor<?x?xf32> | |
flow.dispatch.tensor.store %extracted_slice, %29, offsets = [0, 0], sizes = [%26, %27], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%26, %27} | |
flow.return | |
} count(%arg3: index, %arg4: index, %arg5: index, %arg6: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg3, %arg4, %arg5, %arg6 | |
flow.return %x, %y, %z : index, index, index | |
} | |
%23 = hal.tensor.export %22 "output0" : tensor<?x?xf32>{%6, %7} -> !hal.buffer_view | |
util.return %23 : !hal.buffer_view | |
} | |
// -----// IR Dump Before Canonicalizer (canonicalize) //----- // | |
util.func public @matmul_accumulate_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_accumulate_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} { | |
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index | |
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index | |
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xf32>{%0, %1} | |
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index | |
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index | |
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xf32>{%3, %4} | |
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index | |
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index | |
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xf32>{%6, %7} | |
%9:2 = iree_linalg_ext.upper_bound_tile_size tensor<?x?xf32, #iree_linalg_ext.encoding<role = LHS, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>> -> index, index | |
%c16 = arith.constant 16 : index | |
%10 = affine.apply affine_map<()[s0, s1] -> ((s1 ceildiv s0) * s0)>()[%c16, %1] | |
%11 = affine.apply affine_map<()[s0, s1] -> ((s1 ceildiv s0) * s0)>()[%c16, %0] | |
%12 = flow.dispatch.workgroups[%c16, %c16, %0, %1, %0, %1](%c16, %c16, %2, %0, %1, %0, %1) : (index, index, tensor<?x?xf32>{%0, %1}, index, index, index, index) -> tensor<?x?xf32, #iree_linalg_ext.encoding<role = LHS, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>{%0, %1} attributes {encoding.round_dims_to = #iree_codegen.encoding.round_dims_to<16>} = | |
(%arg3: index, %arg4: index, %arg5: !flow.dispatch.tensor<readonly:tensor<?x?xf32>>, %arg6: index, %arg7: index, %arg8: index, %arg9: index, %arg10: !flow.dispatch.tensor<writeonly:tensor<?x?xf32, #iree_linalg_ext.encoding<role = LHS, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>>) { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%24 = flow.dispatch.workload.ordinal %arg6, 2 : index | |
%25 = flow.dispatch.workload.ordinal %arg7, 3 : index | |
%26 = flow.dispatch.workload.ordinal %arg8, 4 : index | |
%27 = flow.dispatch.workload.ordinal %arg9, 5 : index | |
%28 = flow.dispatch.tie_shape %arg5 : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%24, %25} | |
%29 = flow.dispatch.tie_shape %arg10 : !flow.dispatch.tensor<writeonly:tensor<?x?xf32, #iree_linalg_ext.encoding<role = LHS, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>>{%26, %27} | |
%30 = flow.dispatch.workload.ordinal %arg3, 0 : index | |
%31 = flow.dispatch.workload.ordinal %arg4, 1 : index | |
%32 = flow.dispatch.tensor.load %28, offsets = [0, 0], sizes = [%24, %25], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%24, %25} -> tensor<?x?xf32> | |
%33 = affine.apply affine_map<()[s0, s1] -> (-s1 + (s1 ceildiv s0) * s0)>()[%30, %25] | |
%34 = affine.apply affine_map<()[s0, s1] -> (-s1 + (s1 ceildiv s0) * s0)>()[%31, %24] | |
%padded = tensor.pad %32 low[0, 0] high[%34, %33] { | |
^bb0(%arg11: index, %arg12: index): | |
tensor.yield %cst : f32 | |
} : tensor<?x?xf32> to tensor<?x?xf32> | |
%35 = iree_linalg_ext.set_encoding %padded : tensor<?x?xf32> -> tensor<?x?xf32, #iree_linalg_ext.encoding<role = LHS, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>> | |
flow.dispatch.tensor.store %35, %29, offsets = [0, 0], sizes = [%26, %27], strides = [1, 1] : tensor<?x?xf32, #iree_linalg_ext.encoding<role = LHS, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>> -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32, #iree_linalg_ext.encoding<role = LHS, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>>{%26, %27} | |
flow.return | |
} count(%arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg3, %arg4, %arg5, %arg6, %arg7, %arg8 | |
flow.return %x, %y, %z : index, index, index | |
} | |
%13:2 = iree_linalg_ext.upper_bound_tile_size tensor<?x?xf32, #iree_linalg_ext.encoding<role = RHS, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>> -> index, index | |
%c16_0 = arith.constant 16 : index | |
%14 = affine.apply affine_map<()[s0, s1] -> ((s1 ceildiv s0) * s0)>()[%c16_0, %4] | |
%15 = affine.apply affine_map<()[s0, s1] -> ((s1 ceildiv s0) * s0)>()[%c16_0, %3] | |
%16 = flow.dispatch.workgroups[%c16_0, %c16_0, %3, %4, %3, %4](%c16_0, %c16_0, %5, %3, %4, %3, %4) : (index, index, tensor<?x?xf32>{%3, %4}, index, index, index, index) -> tensor<?x?xf32, #iree_linalg_ext.encoding<role = RHS, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>{%3, %4} attributes {encoding.round_dims_to = #iree_codegen.encoding.round_dims_to<16>} = | |
(%arg3: index, %arg4: index, %arg5: !flow.dispatch.tensor<readonly:tensor<?x?xf32>>, %arg6: index, %arg7: index, %arg8: index, %arg9: index, %arg10: !flow.dispatch.tensor<writeonly:tensor<?x?xf32, #iree_linalg_ext.encoding<role = RHS, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>>) { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%24 = flow.dispatch.workload.ordinal %arg6, 2 : index | |
%25 = flow.dispatch.workload.ordinal %arg7, 3 : index | |
%26 = flow.dispatch.workload.ordinal %arg8, 4 : index | |
%27 = flow.dispatch.workload.ordinal %arg9, 5 : index | |
%28 = flow.dispatch.tie_shape %arg5 : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%24, %25} | |
%29 = flow.dispatch.tie_shape %arg10 : !flow.dispatch.tensor<writeonly:tensor<?x?xf32, #iree_linalg_ext.encoding<role = RHS, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>>{%26, %27} | |
%30 = flow.dispatch.workload.ordinal %arg3, 0 : index | |
%31 = flow.dispatch.workload.ordinal %arg4, 1 : index | |
%32 = flow.dispatch.tensor.load %28, offsets = [0, 0], sizes = [%24, %25], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%24, %25} -> tensor<?x?xf32> | |
%33 = affine.apply affine_map<()[s0, s1] -> (-s1 + (s1 ceildiv s0) * s0)>()[%30, %25] | |
%34 = affine.apply affine_map<()[s0, s1] -> (-s1 + (s1 ceildiv s0) * s0)>()[%31, %24] | |
%padded = tensor.pad %32 low[0, 0] high[%34, %33] { | |
^bb0(%arg11: index, %arg12: index): | |
tensor.yield %cst : f32 | |
} : tensor<?x?xf32> to tensor<?x?xf32> | |
%35 = iree_linalg_ext.set_encoding %padded : tensor<?x?xf32> -> tensor<?x?xf32, #iree_linalg_ext.encoding<role = RHS, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>> | |
flow.dispatch.tensor.store %35, %29, offsets = [0, 0], sizes = [%26, %27], strides = [1, 1] : tensor<?x?xf32, #iree_linalg_ext.encoding<role = RHS, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>> -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32, #iree_linalg_ext.encoding<role = RHS, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>>{%26, %27} | |
flow.return | |
} count(%arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg3, %arg4, %arg5, %arg6, %arg7, %arg8 | |
flow.return %x, %y, %z : index, index, index | |
} | |
%17:2 = iree_linalg_ext.upper_bound_tile_size tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>> -> index, index | |
%c16_1 = arith.constant 16 : index | |
%18 = affine.apply affine_map<()[s0, s1] -> ((s1 ceildiv s0) * s0)>()[%c16_1, %6] | |
%19 = affine.apply affine_map<()[s0, s1] -> ((s1 ceildiv s0) * s0)>()[%c16_1, %7] | |
%20 = flow.dispatch.workgroups[%c16_1, %c16_1, %6, %7, %6, %7](%c16_1, %c16_1, %8, %6, %7, %6, %7) : (index, index, tensor<?x?xf32>{%6, %7}, index, index, index, index) -> tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>{%6, %7} attributes {encoding.round_dims_to = #iree_codegen.encoding.round_dims_to<16>} = | |
(%arg3: index, %arg4: index, %arg5: !flow.dispatch.tensor<readonly:tensor<?x?xf32>>, %arg6: index, %arg7: index, %arg8: index, %arg9: index, %arg10: !flow.dispatch.tensor<writeonly:tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>>) { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%24 = flow.dispatch.workload.ordinal %arg6, 2 : index | |
%25 = flow.dispatch.workload.ordinal %arg7, 3 : index | |
%26 = flow.dispatch.workload.ordinal %arg8, 4 : index | |
%27 = flow.dispatch.workload.ordinal %arg9, 5 : index | |
%28 = flow.dispatch.tie_shape %arg5 : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%24, %25} | |
%29 = flow.dispatch.tie_shape %arg10 : !flow.dispatch.tensor<writeonly:tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>>{%26, %27} | |
%30 = flow.dispatch.workload.ordinal %arg3, 0 : index | |
%31 = flow.dispatch.workload.ordinal %arg4, 1 : index | |
%32 = flow.dispatch.tensor.load %28, offsets = [0, 0], sizes = [%24, %25], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%24, %25} -> tensor<?x?xf32> | |
%33 = affine.apply affine_map<()[s0, s1] -> (-s1 + (s1 ceildiv s0) * s0)>()[%30, %25] | |
%34 = affine.apply affine_map<()[s0, s1] -> (-s1 + (s1 ceildiv s0) * s0)>()[%31, %24] | |
%padded = tensor.pad %32 low[0, 0] high[%34, %33] { | |
^bb0(%arg11: index, %arg12: index): | |
tensor.yield %cst : f32 | |
} : tensor<?x?xf32> to tensor<?x?xf32> | |
%35 = iree_linalg_ext.set_encoding %padded : tensor<?x?xf32> -> tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>> | |
flow.dispatch.tensor.store %35, %29, offsets = [0, 0], sizes = [%26, %27], strides = [1, 1] : tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>> -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>>{%26, %27} | |
flow.return | |
} count(%arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg3, %arg4, %arg5, %arg6, %arg7, %arg8 | |
flow.return %x, %y, %z : index, index, index | |
} | |
%21 = flow.dispatch.workgroups[%0, %1, %3, %4, %6, %7](%12, %16, %20, %0, %1, %3, %4, %6, %7) : (tensor<?x?xf32, #iree_linalg_ext.encoding<role = LHS, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>{%0, %1}, tensor<?x?xf32, #iree_linalg_ext.encoding<role = RHS, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>{%3, %4}, tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>{%6, %7}, index, index, index, index, index, index) -> %20{%6, %7} attributes {encoding.round_dims_to = #iree_codegen.encoding.round_dims_to<16>} = | |
(%arg3: !flow.dispatch.tensor<readonly:tensor<?x?xf32, #iree_linalg_ext.encoding<role = LHS, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>>, %arg4: !flow.dispatch.tensor<readonly:tensor<?x?xf32, #iree_linalg_ext.encoding<role = RHS, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>>, %arg5: !flow.dispatch.tensor<readwrite:tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>>, %arg6: index, %arg7: index, %arg8: index, %arg9: index, %arg10: index, %arg11: index) { | |
%24 = flow.dispatch.workload.ordinal %arg6, 0 : index | |
%25 = flow.dispatch.workload.ordinal %arg7, 1 : index | |
%26 = flow.dispatch.workload.ordinal %arg8, 2 : index | |
%27 = flow.dispatch.workload.ordinal %arg9, 3 : index | |
%28 = flow.dispatch.workload.ordinal %arg10, 4 : index | |
%29 = flow.dispatch.workload.ordinal %arg11, 5 : index | |
%30 = flow.dispatch.tie_shape %arg3 : !flow.dispatch.tensor<readonly:tensor<?x?xf32, #iree_linalg_ext.encoding<role = LHS, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>>{%24, %25} | |
%31 = flow.dispatch.tie_shape %arg4 : !flow.dispatch.tensor<readonly:tensor<?x?xf32, #iree_linalg_ext.encoding<role = RHS, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>>{%26, %27} | |
%32 = flow.dispatch.tie_shape %arg5 : !flow.dispatch.tensor<readwrite:tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>>{%28, %29} | |
%33 = flow.dispatch.tensor.load %30, offsets = [0, 0], sizes = [%24, %25], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32, #iree_linalg_ext.encoding<role = LHS, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>>{%24, %25} -> tensor<?x?xf32, #iree_linalg_ext.encoding<role = LHS, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>> | |
%34 = flow.dispatch.tensor.load %31, offsets = [0, 0], sizes = [%26, %27], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32, #iree_linalg_ext.encoding<role = RHS, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>>{%26, %27} -> tensor<?x?xf32, #iree_linalg_ext.encoding<role = RHS, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>> | |
%35 = flow.dispatch.tensor.load %32, offsets = [0, 0], sizes = [%28, %29], strides = [1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>>{%28, %29} -> tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>> | |
%36 = linalg.matmul ins(%33, %34 : tensor<?x?xf32, #iree_linalg_ext.encoding<role = LHS, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>, tensor<?x?xf32, #iree_linalg_ext.encoding<role = RHS, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>) outs(%35 : tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>) -> tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>> | |
flow.dispatch.tensor.store %36, %32, offsets = [0, 0], sizes = [%28, %29], strides = [1, 1] : tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>> -> !flow.dispatch.tensor<readwrite:tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>>{%28, %29} | |
flow.return | |
} count(%arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg3, %arg4, %arg5, %arg6, %arg7, %arg8 | |
flow.return %x, %y, %z : index, index, index | |
} | |
%22 = flow.dispatch.workgroups[%6, %7, %6, %7](%21, %6, %7, %6, %7) : (tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>{%6, %7}, index, index, index, index) -> tensor<?x?xf32>{%6, %7} attributes {encoding.round_dims_to = #iree_codegen.encoding.round_dims_to<16>} = | |
(%arg3: !flow.dispatch.tensor<readonly:tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>) { | |
%24 = flow.dispatch.workload.ordinal %arg4, 0 : index | |
%25 = flow.dispatch.workload.ordinal %arg5, 1 : index | |
%26 = flow.dispatch.workload.ordinal %arg6, 2 : index | |
%27 = flow.dispatch.workload.ordinal %arg7, 3 : index | |
%28 = flow.dispatch.tie_shape %arg3 : !flow.dispatch.tensor<readonly:tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>>{%24, %25} | |
%29 = flow.dispatch.tie_shape %arg8 : !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%26, %27} | |
%30 = flow.dispatch.tensor.load %28, offsets = [0, 0], sizes = [%24, %25], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>>{%24, %25} -> tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>> | |
%31 = iree_linalg_ext.unset_encoding %30 : tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>> -> tensor<?x?xf32> | |
%extracted_slice = tensor.extract_slice %31[0, 0] [%26, %27] [1, 1] : tensor<?x?xf32> to tensor<?x?xf32> | |
flow.dispatch.tensor.store %extracted_slice, %29, offsets = [0, 0], sizes = [%26, %27], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%26, %27} | |
flow.return | |
} count(%arg3: index, %arg4: index, %arg5: index, %arg6: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg3, %arg4, %arg5, %arg6 | |
flow.return %x, %y, %z : index, index, index | |
} | |
%23 = hal.tensor.export %22 "output0" : tensor<?x?xf32>{%6, %7} -> !hal.buffer_view | |
util.return %23 : !hal.buffer_view | |
} | |
// -----// IR Dump Before CSE (cse) //----- // | |
util.func public @matmul_accumulate_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_accumulate_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} { | |
%c16 = arith.constant 16 : index | |
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index | |
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index | |
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xf32>{%0, %1} | |
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index | |
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index | |
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xf32>{%3, %4} | |
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index | |
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index | |
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xf32>{%6, %7} | |
%9 = flow.dispatch.workgroups[%c16, %0, %1](%c16, %2, %0, %1) : (index, tensor<?x?xf32>{%0, %1}, index, index) -> tensor<?x?xf32, #iree_linalg_ext.encoding<role = LHS, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>{%0, %1} attributes {encoding.round_dims_to = #iree_codegen.encoding.round_dims_to<16>} = | |
(%arg3: index, %arg4: !flow.dispatch.tensor<readonly:tensor<?x?xf32>>, %arg5: index, %arg6: index, %arg7: !flow.dispatch.tensor<writeonly:tensor<?x?xf32, #iree_linalg_ext.encoding<role = LHS, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>>) { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%15 = flow.dispatch.workload.ordinal %arg5, 1 : index | |
%16 = flow.dispatch.workload.ordinal %arg6, 2 : index | |
%17 = flow.dispatch.workload.ordinal %arg3, 0 : index | |
%18 = flow.dispatch.tie_shape %arg4 : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%15, %16} | |
%19 = flow.dispatch.tie_shape %arg7 : !flow.dispatch.tensor<writeonly:tensor<?x?xf32, #iree_linalg_ext.encoding<role = LHS, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>>{%15, %16} | |
%20 = flow.dispatch.tensor.load %18, offsets = [0, 0], sizes = [%15, %16], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%15, %16} -> tensor<?x?xf32> | |
%21 = affine.apply affine_map<()[s0, s1] -> (-s1 + (s1 ceildiv s0) * s0)>()[%17, %16] | |
%22 = affine.apply affine_map<()[s0, s1] -> (-s1 + (s1 ceildiv s0) * s0)>()[%17, %15] | |
%padded = tensor.pad %20 low[0, 0] high[%22, %21] { | |
^bb0(%arg8: index, %arg9: index): | |
tensor.yield %cst : f32 | |
} : tensor<?x?xf32> to tensor<?x?xf32> | |
%23 = iree_linalg_ext.set_encoding %padded : tensor<?x?xf32> -> tensor<?x?xf32, #iree_linalg_ext.encoding<role = LHS, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>> | |
flow.dispatch.tensor.store %23, %19, offsets = [0, 0], sizes = [%15, %16], strides = [1, 1] : tensor<?x?xf32, #iree_linalg_ext.encoding<role = LHS, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>> -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32, #iree_linalg_ext.encoding<role = LHS, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>>{%15, %16} | |
flow.return | |
} count(%arg3: index, %arg4: index, %arg5: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg3, %arg4, %arg5 | |
flow.return %x, %y, %z : index, index, index | |
} | |
%10 = flow.dispatch.workgroups[%c16, %3, %4](%c16, %5, %3, %4) : (index, tensor<?x?xf32>{%3, %4}, index, index) -> tensor<?x?xf32, #iree_linalg_ext.encoding<role = RHS, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>{%3, %4} attributes {encoding.round_dims_to = #iree_codegen.encoding.round_dims_to<16>} = | |
(%arg3: index, %arg4: !flow.dispatch.tensor<readonly:tensor<?x?xf32>>, %arg5: index, %arg6: index, %arg7: !flow.dispatch.tensor<writeonly:tensor<?x?xf32, #iree_linalg_ext.encoding<role = RHS, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>>) { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%15 = flow.dispatch.workload.ordinal %arg5, 1 : index | |
%16 = flow.dispatch.workload.ordinal %arg6, 2 : index | |
%17 = flow.dispatch.workload.ordinal %arg3, 0 : index | |
%18 = flow.dispatch.tie_shape %arg4 : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%15, %16} | |
%19 = flow.dispatch.tie_shape %arg7 : !flow.dispatch.tensor<writeonly:tensor<?x?xf32, #iree_linalg_ext.encoding<role = RHS, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>>{%15, %16} | |
%20 = flow.dispatch.tensor.load %18, offsets = [0, 0], sizes = [%15, %16], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%15, %16} -> tensor<?x?xf32> | |
%21 = affine.apply affine_map<()[s0, s1] -> (-s1 + (s1 ceildiv s0) * s0)>()[%17, %16] | |
%22 = affine.apply affine_map<()[s0, s1] -> (-s1 + (s1 ceildiv s0) * s0)>()[%17, %15] | |
%padded = tensor.pad %20 low[0, 0] high[%22, %21] { | |
^bb0(%arg8: index, %arg9: index): | |
tensor.yield %cst : f32 | |
} : tensor<?x?xf32> to tensor<?x?xf32> | |
%23 = iree_linalg_ext.set_encoding %padded : tensor<?x?xf32> -> tensor<?x?xf32, #iree_linalg_ext.encoding<role = RHS, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>> | |
flow.dispatch.tensor.store %23, %19, offsets = [0, 0], sizes = [%15, %16], strides = [1, 1] : tensor<?x?xf32, #iree_linalg_ext.encoding<role = RHS, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>> -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32, #iree_linalg_ext.encoding<role = RHS, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>>{%15, %16} | |
flow.return | |
} count(%arg3: index, %arg4: index, %arg5: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg3, %arg4, %arg5 | |
flow.return %x, %y, %z : index, index, index | |
} | |
%11 = flow.dispatch.workgroups[%c16, %6, %7](%c16, %8, %6, %7) : (index, tensor<?x?xf32>{%6, %7}, index, index) -> tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>{%6, %7} attributes {encoding.round_dims_to = #iree_codegen.encoding.round_dims_to<16>} = | |
(%arg3: index, %arg4: !flow.dispatch.tensor<readonly:tensor<?x?xf32>>, %arg5: index, %arg6: index, %arg7: !flow.dispatch.tensor<writeonly:tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>>) { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%15 = flow.dispatch.workload.ordinal %arg5, 1 : index | |
%16 = flow.dispatch.workload.ordinal %arg6, 2 : index | |
%17 = flow.dispatch.workload.ordinal %arg3, 0 : index | |
%18 = flow.dispatch.tie_shape %arg4 : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%15, %16} | |
%19 = flow.dispatch.tie_shape %arg7 : !flow.dispatch.tensor<writeonly:tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>>{%15, %16} | |
%20 = flow.dispatch.tensor.load %18, offsets = [0, 0], sizes = [%15, %16], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%15, %16} -> tensor<?x?xf32> | |
%21 = affine.apply affine_map<()[s0, s1] -> (-s1 + (s1 ceildiv s0) * s0)>()[%17, %16] | |
%22 = affine.apply affine_map<()[s0, s1] -> (-s1 + (s1 ceildiv s0) * s0)>()[%17, %15] | |
%padded = tensor.pad %20 low[0, 0] high[%22, %21] { | |
^bb0(%arg8: index, %arg9: index): | |
tensor.yield %cst : f32 | |
} : tensor<?x?xf32> to tensor<?x?xf32> | |
%23 = iree_linalg_ext.set_encoding %padded : tensor<?x?xf32> -> tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>> | |
flow.dispatch.tensor.store %23, %19, offsets = [0, 0], sizes = [%15, %16], strides = [1, 1] : tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>> -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>>{%15, %16} | |
flow.return | |
} count(%arg3: index, %arg4: index, %arg5: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg3, %arg4, %arg5 | |
flow.return %x, %y, %z : index, index, index | |
} | |
%12 = flow.dispatch.workgroups[%0, %1, %3, %4, %6, %7](%9, %10, %11, %0, %1, %3, %4, %6, %7) : (tensor<?x?xf32, #iree_linalg_ext.encoding<role = LHS, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>{%0, %1}, tensor<?x?xf32, #iree_linalg_ext.encoding<role = RHS, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>{%3, %4}, tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>{%6, %7}, index, index, index, index, index, index) -> %11{%6, %7} attributes {encoding.round_dims_to = #iree_codegen.encoding.round_dims_to<16>} = | |
(%arg3: !flow.dispatch.tensor<readonly:tensor<?x?xf32, #iree_linalg_ext.encoding<role = LHS, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>>, %arg4: !flow.dispatch.tensor<readonly:tensor<?x?xf32, #iree_linalg_ext.encoding<role = RHS, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>>, %arg5: !flow.dispatch.tensor<readwrite:tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>>, %arg6: index, %arg7: index, %arg8: index, %arg9: index, %arg10: index, %arg11: index) { | |
%15 = flow.dispatch.workload.ordinal %arg6, 0 : index | |
%16 = flow.dispatch.workload.ordinal %arg7, 1 : index | |
%17 = flow.dispatch.workload.ordinal %arg8, 2 : index | |
%18 = flow.dispatch.workload.ordinal %arg9, 3 : index | |
%19 = flow.dispatch.workload.ordinal %arg10, 4 : index | |
%20 = flow.dispatch.workload.ordinal %arg11, 5 : index | |
%21 = flow.dispatch.tie_shape %arg3 : !flow.dispatch.tensor<readonly:tensor<?x?xf32, #iree_linalg_ext.encoding<role = LHS, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>>{%15, %16} | |
%22 = flow.dispatch.tie_shape %arg4 : !flow.dispatch.tensor<readonly:tensor<?x?xf32, #iree_linalg_ext.encoding<role = RHS, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>>{%17, %18} | |
%23 = flow.dispatch.tie_shape %arg5 : !flow.dispatch.tensor<readwrite:tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>>{%19, %20} | |
%24 = flow.dispatch.tensor.load %21, offsets = [0, 0], sizes = [%15, %16], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32, #iree_linalg_ext.encoding<role = LHS, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>>{%15, %16} -> tensor<?x?xf32, #iree_linalg_ext.encoding<role = LHS, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>> | |
%25 = flow.dispatch.tensor.load %22, offsets = [0, 0], sizes = [%17, %18], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32, #iree_linalg_ext.encoding<role = RHS, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>>{%17, %18} -> tensor<?x?xf32, #iree_linalg_ext.encoding<role = RHS, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>> | |
%26 = flow.dispatch.tensor.load %23, offsets = [0, 0], sizes = [%19, %20], strides = [1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>>{%19, %20} -> tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>> | |
%27 = linalg.matmul ins(%24, %25 : tensor<?x?xf32, #iree_linalg_ext.encoding<role = LHS, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>, tensor<?x?xf32, #iree_linalg_ext.encoding<role = RHS, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>) outs(%26 : tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>) -> tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>> | |
flow.dispatch.tensor.store %27, %23, offsets = [0, 0], sizes = [%19, %20], strides = [1, 1] : tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>> -> !flow.dispatch.tensor<readwrite:tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>>{%19, %20} | |
flow.return | |
} count(%arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg3, %arg4, %arg5, %arg6, %arg7, %arg8 | |
flow.return %x, %y, %z : index, index, index | |
} | |
%13 = flow.dispatch.workgroups[%6, %7](%12, %6, %7) : (tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>{%6, %7}, index, index) -> tensor<?x?xf32>{%6, %7} attributes {encoding.round_dims_to = #iree_codegen.encoding.round_dims_to<16>} = | |
(%arg3: !flow.dispatch.tensor<readonly:tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>>, %arg4: index, %arg5: index, %arg6: !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>) { | |
%15 = flow.dispatch.workload.ordinal %arg4, 0 : index | |
%16 = flow.dispatch.workload.ordinal %arg5, 1 : index | |
%17 = flow.dispatch.tie_shape %arg3 : !flow.dispatch.tensor<readonly:tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>>{%15, %16} | |
%18 = flow.dispatch.tie_shape %arg6 : !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%15, %16} | |
%19 = flow.dispatch.tensor.load %17, offsets = [0, 0], sizes = [%15, %16], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>>{%15, %16} -> tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>> | |
%20 = iree_linalg_ext.unset_encoding %19 : tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>> -> tensor<?x?xf32> | |
%extracted_slice = tensor.extract_slice %20[0, 0] [%15, %16] [1, 1] : tensor<?x?xf32> to tensor<?x?xf32> | |
flow.dispatch.tensor.store %extracted_slice, %18, offsets = [0, 0], sizes = [%15, %16], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%15, %16} | |
flow.return | |
} count(%arg3: index, %arg4: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg3, %arg4 | |
flow.return %x, %y, %z : index, index, index | |
} | |
%14 = hal.tensor.export %13 "output0" : tensor<?x?xf32>{%6, %7} -> !hal.buffer_view | |
util.return %14 : !hal.buffer_view | |
} | |
// -----// IR Dump Before ConvertToStreamPass (iree-stream-conversion) //----- // | |
#executable_target_system_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "system-elf-x86_64", {cpu = "cascadelake", cpu_features = "+cmov,+mmx,+popcnt,+sse,+sse2,+sse3,+ssse3,+sse4.1,+sse4.2,+avx,+avx2,+fma,+avx512f,+bmi,+bmi2,+aes,+pclmul,+avx512vl,+avx512bw,+avx512dq,+avx512cd,+avx512vnni,+adx,+clflushopt,+clwb,+cx16,+cx8,+f16c,+fsgsbase,+crc32,+invpcid,+sahf,+lzcnt,+movbe,+x87,+pku,+prfchw,+rdrnd,+rdseed,+xsave,+xsavec,+xsaveopt,+xsaves,+fxsr,+evex512", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", link_embedded = false, native_vector_size = 64 : i64, target_triple = "x86_64-unknown-linux-gnu"}> | |
#map = affine_map<(d0, d1, d2) -> (d0, d2)> | |
#map1 = affine_map<(d0, d1, d2) -> (d2, d1)> | |
#map2 = affine_map<(d0, d1, d2) -> (d0, d1)> | |
#map3 = affine_map<()[s0, s1] -> (-s1 + (s1 ceildiv s0) * s0)> | |
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", [#executable_target_system_elf_x86_64_]> | |
module attributes {hal.device.targets = [#device_target_llvm_cpu]} { | |
flow.executable private @matmul_accumulate_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32_dispatch_0 { | |
flow.executable.export public @matmul_accumulate_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32_dispatch_0_set_encoding_LHS_DxD workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2 | |
flow.return %x, %y, %z : index, index, index | |
} attributes {encoding.round_dims_to = #iree_codegen.encoding.round_dims_to<16>} | |
builtin.module { | |
func.func @matmul_accumulate_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32_dispatch_0_set_encoding_LHS_DxD(%arg0: index, %arg1: !flow.dispatch.tensor<readonly:tensor<?x?xf32>>, %arg2: index, %arg3: index, %arg4: !flow.dispatch.tensor<writeonly:tensor<?x?xf32, #iree_linalg_ext.encoding<role = LHS, element_types = [f32, f32, f32], user_indexing_maps = [#map, #map1, #map2]>>>) { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%0 = flow.dispatch.workload.ordinal %arg2, 1 : index | |
%1 = flow.dispatch.workload.ordinal %arg3, 2 : index | |
%2 = flow.dispatch.workload.ordinal %arg0, 0 : index | |
%3 = flow.dispatch.tie_shape %arg1 : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1} | |
%4 = flow.dispatch.tie_shape %arg4 : !flow.dispatch.tensor<writeonly:tensor<?x?xf32, #iree_linalg_ext.encoding<role = LHS, element_types = [f32, f32, f32], user_indexing_maps = [#map, #map1, #map2]>>>{%0, %1} | |
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1} -> tensor<?x?xf32> | |
%6 = affine.apply #map3()[%2, %1] | |
%7 = affine.apply #map3()[%2, %0] | |
%padded = tensor.pad %5 low[0, 0] high[%7, %6] { | |
^bb0(%arg5: index, %arg6: index): | |
tensor.yield %cst : f32 | |
} : tensor<?x?xf32> to tensor<?x?xf32> | |
%8 = iree_linalg_ext.set_encoding %padded : tensor<?x?xf32> -> tensor<?x?xf32, #iree_linalg_ext.encoding<role = LHS, element_types = [f32, f32, f32], user_indexing_maps = [#map, #map1, #map2]>> | |
flow.dispatch.tensor.store %8, %4, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : tensor<?x?xf32, #iree_linalg_ext.encoding<role = LHS, element_types = [f32, f32, f32], user_indexing_maps = [#map, #map1, #map2]>> -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32, #iree_linalg_ext.encoding<role = LHS, element_types = [f32, f32, f32], user_indexing_maps = [#map, #map1, #map2]>>>{%0, %1} | |
return | |
} | |
} | |
} | |
flow.executable private @matmul_accumulate_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32_dispatch_1 { | |
flow.executable.export public @matmul_accumulate_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32_dispatch_1_set_encoding_RHS_DxD workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2 | |
flow.return %x, %y, %z : index, index, index | |
} attributes {encoding.round_dims_to = #iree_codegen.encoding.round_dims_to<16>} | |
builtin.module { | |
func.func @matmul_accumulate_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32_dispatch_1_set_encoding_RHS_DxD(%arg0: index, %arg1: !flow.dispatch.tensor<readonly:tensor<?x?xf32>>, %arg2: index, %arg3: index, %arg4: !flow.dispatch.tensor<writeonly:tensor<?x?xf32, #iree_linalg_ext.encoding<role = RHS, element_types = [f32, f32, f32], user_indexing_maps = [#map, #map1, #map2]>>>) { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%0 = flow.dispatch.workload.ordinal %arg2, 1 : index | |
%1 = flow.dispatch.workload.ordinal %arg3, 2 : index | |
%2 = flow.dispatch.workload.ordinal %arg0, 0 : index | |
%3 = flow.dispatch.tie_shape %arg1 : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1} | |
%4 = flow.dispatch.tie_shape %arg4 : !flow.dispatch.tensor<writeonly:tensor<?x?xf32, #iree_linalg_ext.encoding<role = RHS, element_types = [f32, f32, f32], user_indexing_maps = [#map, #map1, #map2]>>>{%0, %1} | |
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1} -> tensor<?x?xf32> | |
%6 = affine.apply #map3()[%2, %1] | |
%7 = affine.apply #map3()[%2, %0] | |
%padded = tensor.pad %5 low[0, 0] high[%7, %6] { | |
^bb0(%arg5: index, %arg6: index): | |
tensor.yield %cst : f32 | |
} : tensor<?x?xf32> to tensor<?x?xf32> | |
%8 = iree_linalg_ext.set_encoding %padded : tensor<?x?xf32> -> tensor<?x?xf32, #iree_linalg_ext.encoding<role = RHS, element_types = [f32, f32, f32], user_indexing_maps = [#map, #map1, #map2]>> | |
flow.dispatch.tensor.store %8, %4, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : tensor<?x?xf32, #iree_linalg_ext.encoding<role = RHS, element_types = [f32, f32, f32], user_indexing_maps = [#map, #map1, #map2]>> -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32, #iree_linalg_ext.encoding<role = RHS, element_types = [f32, f32, f32], user_indexing_maps = [#map, #map1, #map2]>>>{%0, %1} | |
return | |
} | |
} | |
} | |
flow.executable private @matmul_accumulate_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32_dispatch_2 { | |
flow.executable.export public @matmul_accumulate_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32_dispatch_2_set_encoding_RESULT_DxD workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2 | |
flow.return %x, %y, %z : index, index, index | |
} attributes {encoding.round_dims_to = #iree_codegen.encoding.round_dims_to<16>} | |
builtin.module { | |
func.func @matmul_accumulate_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32_dispatch_2_set_encoding_RESULT_DxD(%arg0: index, %arg1: !flow.dispatch.tensor<readonly:tensor<?x?xf32>>, %arg2: index, %arg3: index, %arg4: !flow.dispatch.tensor<writeonly:tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32], user_indexing_maps = [#map, #map1, #map2]>>>) { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%0 = flow.dispatch.workload.ordinal %arg2, 1 : index | |
%1 = flow.dispatch.workload.ordinal %arg3, 2 : index | |
%2 = flow.dispatch.workload.ordinal %arg0, 0 : index | |
%3 = flow.dispatch.tie_shape %arg1 : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1} | |
%4 = flow.dispatch.tie_shape %arg4 : !flow.dispatch.tensor<writeonly:tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32], user_indexing_maps = [#map, #map1, #map2]>>>{%0, %1} | |
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1} -> tensor<?x?xf32> | |
%6 = affine.apply #map3()[%2, %1] | |
%7 = affine.apply #map3()[%2, %0] | |
%padded = tensor.pad %5 low[0, 0] high[%7, %6] { | |
^bb0(%arg5: index, %arg6: index): | |
tensor.yield %cst : f32 | |
} : tensor<?x?xf32> to tensor<?x?xf32> | |
%8 = iree_linalg_ext.set_encoding %padded : tensor<?x?xf32> -> tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32], user_indexing_maps = [#map, #map1, #map2]>> | |
flow.dispatch.tensor.store %8, %4, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32], user_indexing_maps = [#map, #map1, #map2]>> -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32], user_indexing_maps = [#map, #map1, #map2]>>>{%0, %1} | |
return | |
} | |
} | |
} | |
flow.executable private @matmul_accumulate_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32_dispatch_3 { | |
flow.executable.export public @matmul_accumulate_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32_dispatch_3_matmul_DxDxD_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3, %arg4, %arg5 | |
flow.return %x, %y, %z : index, index, index | |
} attributes {encoding.round_dims_to = #iree_codegen.encoding.round_dims_to<16>} | |
builtin.module { | |
func.func @matmul_accumulate_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32_dispatch_3_matmul_DxDxD_f32(%arg0: !flow.dispatch.tensor<readonly:tensor<?x?xf32, #iree_linalg_ext.encoding<role = LHS, element_types = [f32, f32, f32], user_indexing_maps = [#map, #map1, #map2]>>>, %arg1: !flow.dispatch.tensor<readonly:tensor<?x?xf32, #iree_linalg_ext.encoding<role = RHS, element_types = [f32, f32, f32], user_indexing_maps = [#map, #map1, #map2]>>>, %arg2: !flow.dispatch.tensor<readwrite:tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32], user_indexing_maps = [#map, #map1, #map2]>>>, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index) { | |
%0 = flow.dispatch.workload.ordinal %arg3, 0 : index | |
%1 = flow.dispatch.workload.ordinal %arg4, 1 : index | |
%2 = flow.dispatch.workload.ordinal %arg5, 2 : index | |
%3 = flow.dispatch.workload.ordinal %arg6, 3 : index | |
%4 = flow.dispatch.workload.ordinal %arg7, 4 : index | |
%5 = flow.dispatch.workload.ordinal %arg8, 5 : index | |
%6 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x?xf32, #iree_linalg_ext.encoding<role = LHS, element_types = [f32, f32, f32], user_indexing_maps = [#map, #map1, #map2]>>>{%0, %1} | |
%7 = flow.dispatch.tie_shape %arg1 : !flow.dispatch.tensor<readonly:tensor<?x?xf32, #iree_linalg_ext.encoding<role = RHS, element_types = [f32, f32, f32], user_indexing_maps = [#map, #map1, #map2]>>>{%2, %3} | |
%8 = flow.dispatch.tie_shape %arg2 : !flow.dispatch.tensor<readwrite:tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32], user_indexing_maps = [#map, #map1, #map2]>>>{%4, %5} | |
%9 = flow.dispatch.tensor.load %6, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32, #iree_linalg_ext.encoding<role = LHS, element_types = [f32, f32, f32], user_indexing_maps = [#map, #map1, #map2]>>>{%0, %1} -> tensor<?x?xf32, #iree_linalg_ext.encoding<role = LHS, element_types = [f32, f32, f32], user_indexing_maps = [#map, #map1, #map2]>> | |
%10 = flow.dispatch.tensor.load %7, offsets = [0, 0], sizes = [%2, %3], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32, #iree_linalg_ext.encoding<role = RHS, element_types = [f32, f32, f32], user_indexing_maps = [#map, #map1, #map2]>>>{%2, %3} -> tensor<?x?xf32, #iree_linalg_ext.encoding<role = RHS, element_types = [f32, f32, f32], user_indexing_maps = [#map, #map1, #map2]>> | |
%11 = flow.dispatch.tensor.load %8, offsets = [0, 0], sizes = [%4, %5], strides = [1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32], user_indexing_maps = [#map, #map1, #map2]>>>{%4, %5} -> tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32], user_indexing_maps = [#map, #map1, #map2]>> | |
%12 = linalg.matmul ins(%9, %10 : tensor<?x?xf32, #iree_linalg_ext.encoding<role = LHS, element_types = [f32, f32, f32], user_indexing_maps = [#map, #map1, #map2]>>, tensor<?x?xf32, #iree_linalg_ext.encoding<role = RHS, element_types = [f32, f32, f32], user_indexing_maps = [#map, #map1, #map2]>>) outs(%11 : tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32], user_indexing_maps = [#map, #map1, #map2]>>) -> tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32], user_indexing_maps = [#map, #map1, #map2]>> | |
flow.dispatch.tensor.store %12, %8, offsets = [0, 0], sizes = [%4, %5], strides = [1, 1] : tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32], user_indexing_maps = [#map, #map1, #map2]>> -> !flow.dispatch.tensor<readwrite:tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32], user_indexing_maps = [#map, #map1, #map2]>>>{%4, %5} | |
return | |
} | |
} | |
} | |
flow.executable private @matmul_accumulate_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32_dispatch_4 { | |
flow.executable.export public @matmul_accumulate_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32_dispatch_4_unset_encoding_RESULT_DxD workgroups(%arg0: index, %arg1: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1 | |
flow.return %x, %y, %z : index, index, index | |
} attributes {encoding.round_dims_to = #iree_codegen.encoding.round_dims_to<16>} | |
builtin.module { | |
func.func @matmul_accumulate_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32_dispatch_4_unset_encoding_RESULT_DxD(%arg0: !flow.dispatch.tensor<readonly:tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32], user_indexing_maps = [#map, #map1, #map2]>>>, %arg1: index, %arg2: index, %arg3: !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>) { | |
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index | |
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index | |
%2 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32], user_indexing_maps = [#map, #map1, #map2]>>>{%0, %1} | |
%3 = flow.dispatch.tie_shape %arg3 : !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%0, %1} | |
%4 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32], user_indexing_maps = [#map, #map1, #map2]>>>{%0, %1} -> tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32], user_indexing_maps = [#map, #map1, #map2]>> | |
%5 = iree_linalg_ext.unset_encoding %4 : tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32], user_indexing_maps = [#map, #map1, #map2]>> -> tensor<?x?xf32> | |
%extracted_slice = tensor.extract_slice %5[0, 0] [%0, %1] [1, 1] : tensor<?x?xf32> to tensor<?x?xf32> | |
flow.dispatch.tensor.store %extracted_slice, %3, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%0, %1} | |
return | |
} | |
} | |
} | |
util.func public @matmul_accumulate_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_accumulate_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} { | |
%c16 = arith.constant 16 : index | |
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index | |
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index | |
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xf32>{%0, %1} | |
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index | |
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index | |
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xf32>{%3, %4} | |
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index | |
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index | |
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xf32>{%6, %7} | |
%9 = flow.dispatch @matmul_accumulate_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32_dispatch_0::@matmul_accumulate_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32_dispatch_0_set_encoding_LHS_DxD[%c16, %0, %1](%c16, %2, %0, %1) {encoding.round_dims_to = #iree_codegen.encoding.round_dims_to<16>} : (index, tensor<?x?xf32>{%0, %1}, index, index) -> tensor<?x?xf32, #iree_linalg_ext.encoding<role = LHS, element_types = [f32, f32, f32], user_indexing_maps = [#map, #map1, #map2]>>{%0, %1} | |
%10 = flow.dispatch @matmul_accumulate_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32_dispatch_1::@matmul_accumulate_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32_dispatch_1_set_encoding_RHS_DxD[%c16, %3, %4](%c16, %5, %3, %4) {encoding.round_dims_to = #iree_codegen.encoding.round_dims_to<16>} : (index, tensor<?x?xf32>{%3, %4}, index, index) -> tensor<?x?xf32, #iree_linalg_ext.encoding<role = RHS, element_types = [f32, f32, f32], user_indexing_maps = [#map, #map1, #map2]>>{%3, %4} | |
%11 = flow.dispatch @matmul_accumulate_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32_dispatch_2::@matmul_accumulate_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32_dispatch_2_set_encoding_RESULT_DxD[%c16, %6, %7](%c16, %8, %6, %7) {encoding.round_dims_to = #iree_codegen.encoding.round_dims_to<16>} : (index, tensor<?x?xf32>{%6, %7}, index, index) -> tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32], user_indexing_maps = [#map, #map1, #map2]>>{%6, %7} | |
%12 = flow.dispatch @matmul_accumulate_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32_dispatch_3::@matmul_accumulate_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32_dispatch_3_matmul_DxDxD_f32[%0, %1, %3, %4, %6, %7](%9, %10, %11, %0, %1, %3, %4, %6, %7) {encoding.round_dims_to = #iree_codegen.encoding.round_dims_to<16>} : (tensor<?x?xf32, #iree_linalg_ext.encoding<role = LHS, element_types = [f32, f32, f32], user_indexing_maps = [#map, #map1, #map2]>>{%0, %1}, tensor<?x?xf32, #iree_linalg_ext.encoding<role = RHS, element_types = [f32, f32, f32], user_indexing_maps = [#map, #map1, #map2]>>{%3, %4}, tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32], user_indexing_maps = [#map, #map1, #map2]>>{%6, %7}, index, index, index, index, index, index) -> %11{%6, %7} | |
%13 = flow.dispatch @matmul_accumulate_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32_dispatch_4::@matmul_accumulate_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32_dispatch_4_unset_encoding_RESULT_DxD[%6, %7](%12, %6, %7) {encoding.round_dims_to = #iree_codegen.encoding.round_dims_to<16>} : (tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32], user_indexing_maps = [#map, #map1, #map2]>>{%6, %7}, index, index) -> tensor<?x?xf32>{%6, %7} | |
%14 = hal.tensor.export %13 "output0" : tensor<?x?xf32>{%6, %7} -> !hal.buffer_view | |
util.return %14 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump Before VerifyLoweringToTensorsPass (iree-stream-verify-lowering-to-tensors) //----- // | |
#executable_target_system_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "system-elf-x86_64", {cpu = "cascadelake", cpu_features = "+cmov,+mmx,+popcnt,+sse,+sse2,+sse3,+ssse3,+sse4.1,+sse4.2,+avx,+avx2,+fma,+avx512f,+bmi,+bmi2,+aes,+pclmul,+avx512vl,+avx512bw,+avx512dq,+avx512cd,+avx512vnni,+adx,+clflushopt,+clwb,+cx16,+cx8,+f16c,+fsgsbase,+crc32,+invpcid,+sahf,+lzcnt,+movbe,+x87,+pku,+prfchw,+rdrnd,+rdseed,+xsave,+xsavec,+xsaveopt,+xsaves,+fxsr,+evex512", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", link_embedded = false, native_vector_size = 64 : i64, target_triple = "x86_64-unknown-linux-gnu"}> | |
#map = affine_map<(d0, d1, d2) -> (d0, d2)> | |
#map1 = affine_map<(d0, d1, d2) -> (d2, d1)> | |
#map2 = affine_map<(d0, d1, d2) -> (d0, d1)> | |
#map3 = affine_map<()[s0, s1] -> (-s1 + (s1 ceildiv s0) * s0)> | |
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", [#executable_target_system_elf_x86_64_]> | |
module attributes {hal.device.targets = [#device_target_llvm_cpu]} { | |
stream.executable private @matmul_accumulate_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32_dispatch_0 { | |
stream.executable.export public @matmul_accumulate_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32_dispatch_0_set_encoding_LHS_DxD workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2 | |
stream.return %x, %y, %z : index, index, index | |
} attributes {encoding.round_dims_to = #iree_codegen.encoding.round_dims_to<16>} | |
builtin.module { | |
func.func @matmul_accumulate_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32_dispatch_0_set_encoding_LHS_DxD(%arg0: index, %arg1: !stream.binding, %arg2: index, %arg3: index, %arg4: !stream.binding) { | |
%c0 = arith.constant 0 : index | |
%cst = arith.constant 0.000000e+00 : f32 | |
%0 = flow.dispatch.workload.ordinal %arg2, 1 : index | |
%1 = flow.dispatch.workload.ordinal %arg3, 2 : index | |
%2 = flow.dispatch.workload.ordinal %arg0, 0 : index | |
%3 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1} | |
%4 = stream.binding.subspan %arg4[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32, #iree_linalg_ext.encoding<role = LHS, element_types = [f32, f32, f32], user_indexing_maps = [#map, #map1, #map2]>>>{%0, %1} | |
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1} -> tensor<?x?xf32> | |
%6 = affine.apply #map3()[%2, %1] | |
%7 = affine.apply #map3()[%2, %0] | |
%padded = tensor.pad %5 low[0, 0] high[%7, %6] { | |
^bb0(%arg5: index, %arg6: index): | |
tensor.yield %cst : f32 | |
} : tensor<?x?xf32> to tensor<?x?xf32> | |
%8 = iree_linalg_ext.set_encoding %padded : tensor<?x?xf32> -> tensor<?x?xf32, #iree_linalg_ext.encoding<role = LHS, element_types = [f32, f32, f32], user_indexing_maps = [#map, #map1, #map2]>> | |
flow.dispatch.tensor.store %8, %4, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : tensor<?x?xf32, #iree_linalg_ext.encoding<role = LHS, element_types = [f32, f32, f32], user_indexing_maps = [#map, #map1, #map2]>> -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32, #iree_linalg_ext.encoding<role = LHS, element_types = [f32, f32, f32], user_indexing_maps = [#map, #map1, #map2]>>>{%0, %1} | |
return | |
} | |
} | |
} | |
stream.executable private @matmul_accumulate_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32_dispatch_1 { | |
stream.executable.export public @matmul_accumulate_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32_dispatch_1_set_encoding_RHS_DxD workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2 | |
stream.return %x, %y, %z : index, index, index | |
} attributes {encoding.round_dims_to = #iree_codegen.encoding.round_dims_to<16>} | |
builtin.module { | |
func.func @matmul_accumulate_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32_dispatch_1_set_encoding_RHS_DxD(%arg0: index, %arg1: !stream.binding, %arg2: index, %arg3: index, %arg4: !stream.binding) { | |
%c0 = arith.constant 0 : index | |
%cst = arith.constant 0.000000e+00 : f32 | |
%0 = flow.dispatch.workload.ordinal %arg2, 1 : index | |
%1 = flow.dispatch.workload.ordinal %arg3, 2 : index | |
%2 = flow.dispatch.workload.ordinal %arg0, 0 : index | |
%3 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1} | |
%4 = stream.binding.subspan %arg4[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32, #iree_linalg_ext.encoding<role = RHS, element_types = [f32, f32, f32], user_indexing_maps = [#map, #map1, #map2]>>>{%0, %1} | |
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1} -> tensor<?x?xf32> | |
%6 = affine.apply #map3()[%2, %1] | |
%7 = affine.apply #map3()[%2, %0] | |
%padded = tensor.pad %5 low[0, 0] high[%7, %6] { | |
^bb0(%arg5: index, %arg6: index): | |
tensor.yield %cst : f32 | |
} : tensor<?x?xf32> to tensor<?x?xf32> | |
%8 = iree_linalg_ext.set_encoding %padded : tensor<?x?xf32> -> tensor<?x?xf32, #iree_linalg_ext.encoding<role = RHS, element_types = [f32, f32, f32], user_indexing_maps = [#map, #map1, #map2]>> | |
flow.dispatch.tensor.store %8, %4, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : tensor<?x?xf32, #iree_linalg_ext.encoding<role = RHS, element_types = [f32, f32, f32], user_indexing_maps = [#map, #map1, #map2]>> -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32, #iree_linalg_ext.encoding<role = RHS, element_types = [f32, f32, f32], user_indexing_maps = [#map, #map1, #map2]>>>{%0, %1} | |
return | |
} | |
} | |
} | |
stream.executable private @matmul_accumulate_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32_dispatch_2 { | |
stream.executable.export public @matmul_accumulate_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32_dispatch_2_set_encoding_RESULT_DxD workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2 | |
stream.return %x, %y, %z : index, index, index | |
} attributes {encoding.round_dims_to = #iree_codegen.encoding.round_dims_to<16>} | |
builtin.module { | |
func.func @matmul_accumulate_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32_dispatch_2_set_encoding_RESULT_DxD(%arg0: index, %arg1: !stream.binding, %arg2: index, %arg3: index, %arg4: !stream.binding) { | |
%c0 = arith.constant 0 : index | |
%cst = arith.constant 0.000000e+00 : f32 | |
%0 = flow.dispatch.workload.ordinal %arg2, 1 : index | |
%1 = flow.dispatch.workload.ordinal %arg3, 2 : index | |
%2 = flow.dispatch.workload.ordinal %arg0, 0 : index | |
%3 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1} | |
%4 = stream.binding.subspan %arg4[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32], user_indexing_maps = [#map, #map1, #map2]>>>{%0, %1} | |
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1} -> tensor<?x?xf32> | |
%6 = affine.apply #map3()[%2, %1] | |
%7 = affine.apply #map3()[%2, %0] | |
%padded = tensor.pad %5 low[0, 0] high[%7, %6] { | |
^bb0(%arg5: index, %arg6: index): | |
tensor.yield %cst : f32 | |
} : tensor<?x?xf32> to tensor<?x?xf32> | |
%8 = iree_linalg_ext.set_encoding %padded : tensor<?x?xf32> -> tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32], user_indexing_maps = [#map, #map1, #map2]>> | |
flow.dispatch.tensor.store %8, %4, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32], user_indexing_maps = [#map, #map1, #map2]>> -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32], user_indexing_maps = [#map, #map1, #map2]>>>{%0, %1} | |
return | |
} | |
} | |
} | |
stream.executable private @matmul_accumulate_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32_dispatch_3 { | |
stream.executable.export public @matmul_accumulate_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32_dispatch_3_matmul_DxDxD_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3, %arg4, %arg5 | |
stream.return %x, %y, %z : index, index, index | |
} attributes {encoding.round_dims_to = #iree_codegen.encoding.round_dims_to<16>} | |
builtin.module { | |
func.func @matmul_accumulate_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32_dispatch_3_matmul_DxDxD_f32(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index) { | |
%c0 = arith.constant 0 : index | |
%0 = flow.dispatch.workload.ordinal %arg3, 0 : index | |
%1 = flow.dispatch.workload.ordinal %arg4, 1 : index | |
%2 = flow.dispatch.workload.ordinal %arg5, 2 : index | |
%3 = flow.dispatch.workload.ordinal %arg6, 3 : index | |
%4 = flow.dispatch.workload.ordinal %arg7, 4 : index | |
%5 = flow.dispatch.workload.ordinal %arg8, 5 : index | |
%6 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xf32, #iree_linalg_ext.encoding<role = LHS, element_types = [f32, f32, f32], user_indexing_maps = [#map, #map1, #map2]>>>{%0, %1} | |
%7 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xf32, #iree_linalg_ext.encoding<role = RHS, element_types = [f32, f32, f32], user_indexing_maps = [#map, #map1, #map2]>>>{%2, %3} | |
%8 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readwrite:tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32], user_indexing_maps = [#map, #map1, #map2]>>>{%4, %5} | |
%9 = flow.dispatch.tensor.load %6, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32, #iree_linalg_ext.encoding<role = LHS, element_types = [f32, f32, f32], user_indexing_maps = [#map, #map1, #map2]>>>{%0, %1} -> tensor<?x?xf32, #iree_linalg_ext.encoding<role = LHS, element_types = [f32, f32, f32], user_indexing_maps = [#map, #map1, #map2]>> | |
%10 = flow.dispatch.tensor.load %7, offsets = [0, 0], sizes = [%2, %3], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32, #iree_linalg_ext.encoding<role = RHS, element_types = [f32, f32, f32], user_indexing_maps = [#map, #map1, #map2]>>>{%2, %3} -> tensor<?x?xf32, #iree_linalg_ext.encoding<role = RHS, element_types = [f32, f32, f32], user_indexing_maps = [#map, #map1, #map2]>> | |
%11 = flow.dispatch.tensor.load %8, offsets = [0, 0], sizes = [%4, %5], strides = [1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32], user_indexing_maps = [#map, #map1, #map2]>>>{%4, %5} -> tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32], user_indexing_maps = [#map, #map1, #map2]>> | |
%12 = linalg.matmul ins(%9, %10 : tensor<?x?xf32, #iree_linalg_ext.encoding<role = LHS, element_types = [f32, f32, f32], user_indexing_maps = [#map, #map1, #map2]>>, tensor<?x?xf32, #iree_linalg_ext.encoding<role = RHS, element_types = [f32, f32, f32], user_indexing_maps = [#map, #map1, #map2]>>) outs(%11 : tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32], user_indexing_maps = [#map, #map1, #map2]>>) -> tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32], user_indexing_maps = [#map, #map1, #map2]>> | |
flow.dispatch.tensor.store %12, %8, offsets = [0, 0], sizes = [%4, %5], strides = [1, 1] : tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32], user_indexing_maps = [#map, #map1, #map2]>> -> !flow.dispatch.tensor<readwrite:tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32], user_indexing_maps = [#map, #map1, #map2]>>>{%4, %5} | |
return | |
} | |
} | |
} | |
// Dispatch executable that strips the data-tiling encoding from the matmul
// RESULT tensor: it loads the RESULT-encoded ?x?xf32 value, drops the
// encoding, slices back to the logical %0 x %1 extent, and stores a plain
// tensor<?x?xf32>. (The extract_slice trims the padding implied by the
// round_dims_to<16> hint below — presumably; confirm against the encoding
// materialization pass.)
stream.executable private @matmul_accumulate_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32_dispatch_4 { | |
// Export entry point: the 3-D workgroup count is derived from the two
// dynamic workload dims (%arg0, %arg1) via workgroup_count_from_slice.
stream.executable.export public @matmul_accumulate_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32_dispatch_4_unset_encoding_RESULT_DxD workgroups(%arg0: index, %arg1: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1 | |
stream.return %x, %y, %z : index, index, index | |
// Hint attached by iree-flow-set-encoding-hint-on-dispatches (see pass name
// in the dump header): encoded dims are rounded up to multiples of 16.
} attributes {encoding.round_dims_to = #iree_codegen.encoding.round_dims_to<16>} | |
builtin.module { | |
// %arg0: binding holding the RESULT-encoded input; %arg1/%arg2: the two
// dynamic dims (workload ordinals 0 and 1); %arg3: plain output binding.
func.func @matmul_accumulate_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32_dispatch_4_unset_encoding_RESULT_DxD(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: !stream.binding) { | |
%c0 = arith.constant 0 : index | |
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index | |
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index | |
// Input view: RESULT-encoded dispatch tensor of logical size {%0, %1}.
%2 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32], user_indexing_maps = [#map, #map1, #map2]>>>{%0, %1} | |
// Output view: un-encoded row-major tensor of the same logical size.
%3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%0, %1} | |
%4 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32], user_indexing_maps = [#map, #map1, #map2]>>>{%0, %1} -> tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32], user_indexing_maps = [#map, #map1, #map2]>> | |
// Drop the encoding attribute, then slice out the logical %0 x %1 region.
%5 = iree_linalg_ext.unset_encoding %4 : tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32], user_indexing_maps = [#map, #map1, #map2]>> -> tensor<?x?xf32> | |
%extracted_slice = tensor.extract_slice %5[0, 0] [%0, %1] [1, 1] : tensor<?x?xf32> to tensor<?x?xf32> | |
flow.dispatch.tensor.store %extracted_slice, %3, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%0, %1} | |
return | |
} | |
} | |
} | |
// Host-side entry point for the accumulating dynamic matmul
// (output0 = input0 * input1 accumulated into input2, per the reflection
// declaration). Pipeline: import the three ?x?xf32 buffer views, dispatch
// set_encoding for LHS/RHS/RESULT (dims rounded to multiples of 16 per the
// encoding.round_dims_to hints), run the matmul dispatch in place on the
// encoded RESULT buffer, unset the encoding, and export the result.
util.func public @matmul_accumulate_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_accumulate_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} { | |
%c16 = arith.constant 16 : index | |
// --- input0 (LHS): validate f32/dense_row_major, import, move to device. ---
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index | |
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index | |
%element_type_f32 = hal.element_type<f32> : i32 | |
%dense_row_major = hal.encoding_type<dense_row_major> : i32 | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1]) type(%element_type_f32) encoding(%dense_row_major) | |
%2 = stream.tensor.sizeof tensor<?x?xf32>{%0, %1} : index | |
%3 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<?x?xf32>{%0, %1} in !stream.resource<external>{%2} | |
%4 = stream.async.transfer %3 : !stream.resource<external>{%2} -> !stream.resource<*>{%2} | |
// --- input1 (RHS): same validation/import sequence. ---
%5 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index | |
%6 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index | |
%element_type_f32_0 = hal.element_type<f32> : i32 | |
%dense_row_major_1 = hal.encoding_type<dense_row_major> : i32 | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%5, %6]) type(%element_type_f32_0) encoding(%dense_row_major_1) | |
%7 = stream.tensor.sizeof tensor<?x?xf32>{%5, %6} : index | |
%8 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<?x?xf32>{%5, %6} in !stream.resource<external>{%7} | |
%9 = stream.async.transfer %8 : !stream.resource<external>{%7} -> !stream.resource<*>{%7} | |
// --- input2 (accumulator / RESULT): same validation/import sequence. ---
%10 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index | |
%11 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index | |
%element_type_f32_2 = hal.element_type<f32> : i32 | |
%dense_row_major_3 = hal.encoding_type<dense_row_major> : i32 | |
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("input2") shape([%10, %11]) type(%element_type_f32_2) encoding(%dense_row_major_3) | |
%12 = stream.tensor.sizeof tensor<?x?xf32>{%10, %11} : index | |
%13 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<?x?xf32>{%10, %11} in !stream.resource<external>{%12} | |
%14 = stream.async.transfer %13 : !stream.resource<external>{%12} -> !stream.resource<*>{%12} | |
%c0 = arith.constant 0 : index | |
// dispatch_0: pack LHS into its encoded layout; %15 is the encoded byte
// size (computed with the round_dims_to<16> hint on the sizeof).
%15 = stream.tensor.sizeof tensor<?x?xf32, #iree_linalg_ext.encoding<role = LHS, element_types = [f32, f32, f32], user_indexing_maps = [#map, #map1, #map2]>>{%0, %1} {encoding.round_dims_to = #iree_codegen.encoding.round_dims_to<16>} : index | |
%16 = stream.async.dispatch @matmul_accumulate_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32_dispatch_0::@matmul_accumulate_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32_dispatch_0_set_encoding_LHS_DxD[%c16, %0, %1](%c16, %4[%c0 to %2 for %2], %0, %1) {encoding.round_dims_to = #iree_codegen.encoding.round_dims_to<16>} : (index, !stream.resource<*>{%2}, index, index) -> !stream.resource<*>{%15} | |
%c0_4 = arith.constant 0 : index | |
// dispatch_1: pack RHS into its encoded layout.
%17 = stream.tensor.sizeof tensor<?x?xf32, #iree_linalg_ext.encoding<role = RHS, element_types = [f32, f32, f32], user_indexing_maps = [#map, #map1, #map2]>>{%5, %6} {encoding.round_dims_to = #iree_codegen.encoding.round_dims_to<16>} : index | |
%18 = stream.async.dispatch @matmul_accumulate_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32_dispatch_1::@matmul_accumulate_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32_dispatch_1_set_encoding_RHS_DxD[%c16, %5, %6](%c16, %9[%c0_4 to %7 for %7], %5, %6) {encoding.round_dims_to = #iree_codegen.encoding.round_dims_to<16>} : (index, !stream.resource<*>{%7}, index, index) -> !stream.resource<*>{%17} | |
%c0_5 = arith.constant 0 : index | |
// dispatch_2: pack the accumulator (input2) into the RESULT layout.
%19 = stream.tensor.sizeof tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32], user_indexing_maps = [#map, #map1, #map2]>>{%10, %11} {encoding.round_dims_to = #iree_codegen.encoding.round_dims_to<16>} : index | |
%20 = stream.async.dispatch @matmul_accumulate_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32_dispatch_2::@matmul_accumulate_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32_dispatch_2_set_encoding_RESULT_DxD[%c16, %10, %11](%c16, %14[%c0_5 to %12 for %12], %10, %11) {encoding.round_dims_to = #iree_codegen.encoding.round_dims_to<16>} : (index, !stream.resource<*>{%12}, index, index) -> !stream.resource<*>{%19} | |
%c0_6 = arith.constant 0 : index | |
// dispatch_3: the matmul itself. Note the tied result "-> %20{%19}":
// it accumulates in place into the encoded RESULT buffer from dispatch_2.
%21 = stream.async.dispatch @matmul_accumulate_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32_dispatch_3::@matmul_accumulate_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32_dispatch_3_matmul_DxDxD_f32[%0, %1, %5, %6, %10, %11](%16[%c0_6 to %15 for %15], %18[%c0_6 to %17 for %17], %20[%c0_6 to %19 for %19], %0, %1, %5, %6, %10, %11) {encoding.round_dims_to = #iree_codegen.encoding.round_dims_to<16>} : (!stream.resource<*>{%15}, !stream.resource<*>{%17}, !stream.resource<*>{%19}, index, index, index, index, index, index) -> %20{%19} | |
%c0_7 = arith.constant 0 : index | |
// dispatch_4: unpack the encoded result back to a plain ?x?xf32 of size
// {%10, %11}, then export it to the caller as a buffer view.
%22 = stream.tensor.sizeof tensor<?x?xf32>{%10, %11} {encoding.round_dims_to = #iree_codegen.encoding.round_dims_to<16>} : index | |
%23 = stream.async.dispatch @matmul_accumulate_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32_dispatch_4::@matmul_accumulate_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32_dispatch_4_unset_encoding_RESULT_DxD[%10, %11](%21[%c0_7 to %19 for %19], %10, %11) {encoding.round_dims_to = #iree_codegen.encoding.round_dims_to<16>} : (!stream.resource<*>{%19}, index, index) -> !stream.resource<*>{%22} | |
%24 = stream.async.transfer %23 : !stream.resource<*>{%22} -> !stream.resource<external>{%22} | |
%25 = stream.tensor.export %24 : tensor<?x?xf32>{%10, %11} in !stream.resource<external>{%22} -> !hal.buffer_view | |
util.return %25 : !hal.buffer_view | |
} | |
} | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment