// Source: GitHub gist vmurali/bed162b754c0399b3b25e0db59227a62.
// NOTE: The gist viewer truncated this file; the full file is not shown here.
// NOTE: The gist viewer flagged this file as containing bidirectional Unicode text that may be
// interpreted or compiled differently than it appears; review it in an editor that reveals hidden
// Unicode characters.
// -----// IR Dump After mlir::iree_compiler::IREE::ABI::WrapEntryPointsPass (iree-abi-wrap-entry-points) ('builtin.module' operation) //----- //
// Web-scrape residue (a trailing " | |" on every line) removed and indentation restored so this
// dump is valid MLIR text again; no operation, operand, or type was changed.
#map0 = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1)>
module {
  // Exported stub (iree.abi.stub) that only forwards to the private implementation.
  func.func @split_reduction_pass2() attributes {iree.abi.stub} {
    call @_split_reduction_pass2() : () -> ()
    return
  }
  // Sums a 512x256x128 tensor of ones along d2 and compares the result with dense<128>.
  func.func private @_split_reduction_pass2() {
    %0 = util.unfoldable_constant dense<1> : tensor<512x256x128xi32>
    %c0_i32 = arith.constant 0 : i32
    %1 = linalg.init_tensor [512, 256] : tensor<512x256xi32>
    %2 = linalg.fill ins(%c0_i32 : i32) outs(%1 : tensor<512x256xi32>) -> tensor<512x256xi32>
    // NOTE(review): the zero-filled tensor %2 has no uses; the reduction below takes the
    // unfilled %1 as its accumulator. The canonicalize dump later in this file no longer
    // contains the linalg.fill. Presumably outs(%2) was intended -- confirm against the test
    // source. Left as-is here because this is a record of what the compiler printed.
    %3 = linalg.generic {indexing_maps = [#map0, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%0 : tensor<512x256x128xi32>) outs(%1 : tensor<512x256xi32>) {
    ^bb0(%arg0: i32, %arg1: i32):
      %4 = arith.addi %arg0, %arg1 : i32
      linalg.yield %4 : i32
    } -> tensor<512x256xi32>
    check.expect_eq_const(%3, dense<128> : tensor<512x256xi32>) : tensor<512x256xi32>
    return
  }
}
// -----// IR Dump After Canonicalizer (canonicalize) ('func.func' operation: @_split_reduction_pass2) //----- //
// Web-scrape residue (a trailing " | |" on every line) removed and indentation restored so this
// dump is valid MLIR text again; no operation, operand, or type was changed.
#map0 = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1)>
module {
  // Exported stub (iree.abi.stub) that only forwards to the private implementation.
  func.func @split_reduction_pass2() attributes {iree.abi.stub} {
    call @_split_reduction_pass2() : () -> ()
    return
  }
  func.func private @_split_reduction_pass2() {
    // Expected value, now a plain constant compared with check.expect_eq at the end.
    %cst = arith.constant dense<128> : tensor<512x256xi32>
    %cst_0 = arith.constant dense<1> : tensor<512x256x128xi32>
    %0 = util.do_not_optimize(%cst_0) : tensor<512x256x128xi32>
    // NOTE(review): there is no linalg.fill in this dump; the reduction accumulates directly
    // into the linalg.init_tensor result %1.
    %1 = linalg.init_tensor [512, 256] : tensor<512x256xi32>
    %2 = linalg.generic {indexing_maps = [#map0, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%0 : tensor<512x256x128xi32>) outs(%1 : tensor<512x256xi32>) {
    ^bb0(%arg0: i32, %arg1: i32):
      %3 = arith.addi %arg0, %arg1 : i32
      linalg.yield %3 : i32
    } -> tensor<512x256xi32>
    check.expect_eq(%2, %cst) : tensor<512x256xi32>
    return
  }
}
// -----// IR Dump After Inliner (inline) ('builtin.module' operation) //----- //
// Web-scrape residue (a trailing " | |" on every line) removed and indentation restored so this
// dump is valid MLIR text again; no operation, operand, or type was changed.
#map0 = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1)>
module {
  // The stub still calls the private function in this dump.
  func.func @split_reduction_pass2() attributes {iree.abi.stub} {
    call @_split_reduction_pass2() : () -> ()
    return
  }
  func.func private @_split_reduction_pass2() {
    %cst = arith.constant dense<128> : tensor<512x256xi32>
    %cst_0 = arith.constant dense<1> : tensor<512x256x128xi32>
    %0 = util.do_not_optimize(%cst_0) : tensor<512x256x128xi32>
    // Accumulator for the reduction; it is passed to outs without any fill in this dump.
    %1 = linalg.init_tensor [512, 256] : tensor<512x256xi32>
    %2 = linalg.generic {indexing_maps = [#map0, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%0 : tensor<512x256x128xi32>) outs(%1 : tensor<512x256xi32>) {
    ^bb0(%arg0: i32, %arg1: i32):
      %3 = arith.addi %arg0, %arg1 : i32
      linalg.yield %3 : i32
    } -> tensor<512x256xi32>
    check.expect_eq(%2, %cst) : tensor<512x256xi32>
    return
  }
}
// -----// IR Dump After DispatchLinalgOnTensors (iree-flow-dispatch-linalg-on-tensors-pass) ('func.func' operation: @_split_reduction_pass2) //----- //
// Web-scrape residue (a trailing " | |" on every line) removed and indentation restored so this
// dump is valid MLIR text again; no operation, operand, or type was changed.
#map0 = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1)>
module {
  func.func @split_reduction_pass2() attributes {iree.abi.stub} {
    call @_split_reduction_pass2() : () -> ()
    return
  }
  func.func private @_split_reduction_pass2() {
    %c512 = arith.constant 512 : index
    %c256 = arith.constant 256 : index
    %c1 = arith.constant 1 : index
    %cst = arith.constant dense<128> : tensor<512x256xi32>
    %cst_0 = arith.constant dense<1> : tensor<512x256x128xi32>
    %0 = util.do_not_optimize(%cst_0) : tensor<512x256x128xi32>
    // The reduction now lives inside a flow.dispatch.workgroups region with workload
    // [%c512, %c256, %c1]; the input is loaded from %arg0 and the result stored to %arg1.
    %1 = flow.dispatch.workgroups[%c512, %c256, %c1](%0) : (tensor<512x256x128xi32>) -> tensor<512x256xi32> =
        (%arg0: !flow.dispatch.tensor<readonly:512x256x128xi32>, %arg1: !flow.dispatch.tensor<writeonly:512x256xi32>) {
      %2 = flow.dispatch.tensor.load %arg0, offsets = [0, 0, 0], sizes = [512, 256, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:512x256x128xi32> -> tensor<512x256x128xi32>
      %3 = linalg.init_tensor [512, 256] : tensor<512x256xi32>
      %4 = linalg.generic {indexing_maps = [#map0, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%2 : tensor<512x256x128xi32>) outs(%3 : tensor<512x256xi32>) {
      ^bb0(%arg2: i32, %arg3: i32):
        %5 = arith.addi %arg2, %arg3 : i32
        linalg.yield %5 : i32
      } -> tensor<512x256xi32>
      flow.dispatch.tensor.store %4, %arg1, offsets = [0, 0], sizes = [512, 256], strides = [1, 1] : tensor<512x256xi32> -> !flow.dispatch.tensor<writeonly:512x256xi32>
      flow.return
    } count(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) {
      // Workgroup-count region: returns the three values produced from the workload operands.
      %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2
      flow.return %x, %y, %z : index, index, index
    }
    check.expect_eq(%1, %cst) : tensor<512x256xi32>
    return
  }
}
// -----// IR Dump After OutlineDispatchRegions (iree-flow-outline-dispatch-regions) ('builtin.module' operation) //----- //
// Web-scrape residue (a trailing " | |" on every line) removed and indentation restored so this
// dump is valid MLIR text again; no operation, operand, or type was changed.
#map0 = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1)>
module {
  func.func @split_reduction_pass2() attributes {iree.abi.stub} {
    call @_split_reduction_pass2() : () -> ()
    return
  }
  // The dispatch body now lives in its own flow.executable, referenced by symbol from the
  // flow.dispatch op in @_split_reduction_pass2 below.
  flow.executable private @_split_reduction_pass2_dispatch_0 {
    flow.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) {
      %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2
      flow.return %x, %y, %z : index, index, index
    }
    builtin.module {
      func.func @_split_reduction_pass2_dispatch_0_generic_512x256x128(%arg0: !flow.dispatch.tensor<readonly:512x256x128xi32>, %arg1: !flow.dispatch.tensor<writeonly:512x256xi32>) {
        %0 = flow.dispatch.tensor.load %arg0, offsets = [0, 0, 0], sizes = [512, 256, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:512x256x128xi32> -> tensor<512x256x128xi32>
        %1 = linalg.init_tensor [512, 256] : tensor<512x256xi32>
        %2 = linalg.generic {indexing_maps = [#map0, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%0 : tensor<512x256x128xi32>) outs(%1 : tensor<512x256xi32>) {
        ^bb0(%arg2: i32, %arg3: i32):
          %3 = arith.addi %arg2, %arg3 : i32
          linalg.yield %3 : i32
        } -> tensor<512x256xi32>
        flow.dispatch.tensor.store %2, %arg1, offsets = [0, 0], sizes = [512, 256], strides = [1, 1] : tensor<512x256xi32> -> !flow.dispatch.tensor<writeonly:512x256xi32>
        return
      }
    }
  }
  func.func private @_split_reduction_pass2() {
    %c512 = arith.constant 512 : index
    %c256 = arith.constant 256 : index
    %c1 = arith.constant 1 : index
    %cst = arith.constant dense<128> : tensor<512x256xi32>
    %cst_0 = arith.constant dense<1> : tensor<512x256x128xi32>
    %0 = util.do_not_optimize(%cst_0) : tensor<512x256x128xi32>
    // Call site: dispatches the outlined entry point with workload [%c512, %c256, %c1].
    %1 = flow.dispatch @_split_reduction_pass2_dispatch_0::@_split_reduction_pass2_dispatch_0_generic_512x256x128[%c512, %c256, %c1](%0) : (tensor<512x256x128xi32>) -> tensor<512x256xi32>
    check.expect_eq(%1, %cst) : tensor<512x256xi32>
    return
  }
}
// -----// IR Dump After ConvertToStream (iree-stream-conversion) ('builtin.module' operation) //----- //
// Web-scrape residue (a trailing " | |" on every line) removed and indentation restored so this
// dump is valid MLIR text again; no operation, operand, or type was changed.
#map0 = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1)>
module {
  func.func @split_reduction_pass2() attributes {iree.abi.stub} {
    call @_split_reduction_pass2() : () -> ()
    return
  }
  stream.executable private @_split_reduction_pass2_dispatch_0 {
    stream.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) {
      %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2
      stream.return %x, %y, %z : index, index, index
    }
    builtin.module {
      // The entry point now takes opaque !stream.binding arguments; the typed dispatch tensors
      // are recovered with stream.binding.subspan at offset %c0.
      func.func @_split_reduction_pass2_dispatch_0_generic_512x256x128(%arg0: !stream.binding, %arg1: !stream.binding) {
        %c0 = arith.constant 0 : index
        %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:512x256x128xi32>
        %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:512x256xi32>
        %2 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [512, 256, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:512x256x128xi32> -> tensor<512x256x128xi32>
        %3 = linalg.init_tensor [512, 256] : tensor<512x256xi32>
        %4 = linalg.generic {indexing_maps = [#map0, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%2 : tensor<512x256x128xi32>) outs(%3 : tensor<512x256xi32>) {
        ^bb0(%arg2: i32, %arg3: i32):
          %5 = arith.addi %arg2, %arg3 : i32
          linalg.yield %5 : i32
        } -> tensor<512x256xi32>
        flow.dispatch.tensor.store %4, %1, offsets = [0, 0], sizes = [512, 256], strides = [1, 1] : tensor<512x256xi32> -> !flow.dispatch.tensor<writeonly:512x256xi32>
        return
      }
    }
  }
  func.func private @_split_reduction_pass2() {
    %c512 = arith.constant 512 : index
    %c256 = arith.constant 256 : index
    %c1 = arith.constant 1 : index
    // Expected-value constant (dense<128>) as a stream resource, transferred to resource<*>.
    %cst = stream.tensor.constant : tensor<512x256xi32> in !stream.resource<constant> = dense<128> : tensor<512x256xi32>
    %0 = stream.resource.size %cst : !stream.resource<constant>
    %1 = stream.async.transfer %cst : !stream.resource<constant>{%0} -> !stream.resource<*>{%0}
    // Input constant (dense<1>) as a stream resource, transferred to resource<*>.
    %cst_0 = stream.tensor.constant : tensor<512x256x128xi32> in !stream.resource<constant> = dense<1> : tensor<512x256x128xi32>
    %2 = stream.resource.size %cst_0 : !stream.resource<constant>
    %3 = stream.async.transfer %cst_0 : !stream.resource<constant>{%2} -> !stream.resource<*>{%2}
    %4 = util.do_not_optimize(%3) : !stream.resource<*>
    %5 = stream.resource.size %4 : !stream.resource<*>
    %6 = stream.tensor.sizeof tensor<512x256xi32> : index
    %7 = stream.async.dispatch @_split_reduction_pass2_dispatch_0::@_split_reduction_pass2_dispatch_0_generic_512x256x128[%c512, %c256, %c1](%4) : (!stream.resource<*>{%5}) -> !stream.resource<*>{%6}
    // Both the dispatch result and the expected value are transferred to external resources and
    // exported back to tensors for the comparison.
    %8 = stream.async.transfer %7 : !stream.resource<*>{%6} -> !stream.resource<external>{%6}
    %9 = stream.tensor.export %8 : tensor<512x256xi32> in !stream.resource<external>{%6} -> tensor<512x256xi32>
    %10 = stream.async.transfer %1 : !stream.resource<*>{%0} -> !stream.resource<external>{%0}
    %11 = stream.tensor.export %10 : tensor<512x256xi32> in !stream.resource<external>{%0} -> tensor<512x256xi32>
    check.expect_eq(%9, %11) : tensor<512x256xi32>
    return
  }
}
// -----// IR Dump After Canonicalizer (canonicalize) ('func.func' operation: @_split_reduction_pass2) //----- //
// Web-scrape residue (a trailing " | |" on every line) removed and indentation restored so this
// dump is valid MLIR text again; no operation, operand, or type was changed.
#map0 = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1)>
module {
  func.func @split_reduction_pass2() attributes {iree.abi.stub} {
    call @_split_reduction_pass2() : () -> ()
    return
  }
  stream.executable private @_split_reduction_pass2_dispatch_0 {
    stream.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) {
      %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2
      stream.return %x, %y, %z : index, index, index
    }
    builtin.module {
      func.func @_split_reduction_pass2_dispatch_0_generic_512x256x128(%arg0: !stream.binding, %arg1: !stream.binding) {
        %c0 = arith.constant 0 : index
        %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:512x256x128xi32>
        %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:512x256xi32>
        %2 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [512, 256, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:512x256x128xi32> -> tensor<512x256x128xi32>
        %3 = linalg.init_tensor [512, 256] : tensor<512x256xi32>
        %4 = linalg.generic {indexing_maps = [#map0, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%2 : tensor<512x256x128xi32>) outs(%3 : tensor<512x256xi32>) {
        ^bb0(%arg2: i32, %arg3: i32):
          %5 = arith.addi %arg2, %arg3 : i32
          linalg.yield %5 : i32
        } -> tensor<512x256xi32>
        flow.dispatch.tensor.store %4, %1, offsets = [0, 0], sizes = [512, 256], strides = [1, 1] : tensor<512x256xi32> -> !flow.dispatch.tensor<writeonly:512x256xi32>
        return
      }
    }
  }
  func.func private @_split_reduction_pass2() {
    %c1_i32 = arith.constant 1 : i32
    %c128_i32 = arith.constant 128 : i32
    %c512 = arith.constant 512 : index
    %c256 = arith.constant 256 : index
    %c1 = arith.constant 1 : index
    // Both constants are now stream.tensor.splat ops of the scalars 128 and 1, sized by
    // stream.tensor.sizeof; no stream.tensor.constant remains in this dump.
    %0 = stream.tensor.sizeof tensor<512x256xi32> : index
    %1 = stream.tensor.splat %c128_i32 : i32 -> tensor<512x256xi32> in !stream.resource<*>{%0}
    %2 = stream.tensor.sizeof tensor<512x256x128xi32> : index
    %3 = stream.tensor.splat %c1_i32 : i32 -> tensor<512x256x128xi32> in !stream.resource<*>{%2}
    %4 = util.do_not_optimize(%3) : !stream.resource<*>
    %5 = stream.resource.size %4 : !stream.resource<*>
    // %6 repeats the sizeof already computed as %0.
    %6 = stream.tensor.sizeof tensor<512x256xi32> : index
    %7 = stream.async.dispatch @_split_reduction_pass2_dispatch_0::@_split_reduction_pass2_dispatch_0_generic_512x256x128[%c512, %c256, %c1](%4) : (!stream.resource<*>{%5}) -> !stream.resource<*>{%6}
    %8 = stream.async.transfer %7 : !stream.resource<*>{%6} -> !stream.resource<external>{%6}
    %9 = stream.tensor.export %8 : tensor<512x256xi32> in !stream.resource<external>{%6} -> tensor<512x256xi32>
    %10 = stream.async.transfer %1 : !stream.resource<*>{%0} -> !stream.resource<external>{%0}
    %11 = stream.tensor.export %10 : tensor<512x256xi32> in !stream.resource<external>{%0} -> tensor<512x256xi32>
    check.expect_eq(%9, %11) : tensor<512x256xi32>
    return
  }
}
// -----// IR Dump After CSE (cse) ('func.func' operation: @_split_reduction_pass2) //----- //
// Web-scrape residue (a trailing " | |" on every line) removed and indentation restored so this
// dump is valid MLIR text again; no operation, operand, or type was changed.
#map0 = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1)>
module {
  func.func @split_reduction_pass2() attributes {iree.abi.stub} {
    call @_split_reduction_pass2() : () -> ()
    return
  }
  stream.executable private @_split_reduction_pass2_dispatch_0 {
    stream.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) {
      %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2
      stream.return %x, %y, %z : index, index, index
    }
    builtin.module {
      func.func @_split_reduction_pass2_dispatch_0_generic_512x256x128(%arg0: !stream.binding, %arg1: !stream.binding) {
        %c0 = arith.constant 0 : index
        %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:512x256x128xi32>
        %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:512x256xi32>
        %2 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [512, 256, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:512x256x128xi32> -> tensor<512x256x128xi32>
        %3 = linalg.init_tensor [512, 256] : tensor<512x256xi32>
        %4 = linalg.generic {indexing_maps = [#map0, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%2 : tensor<512x256x128xi32>) outs(%3 : tensor<512x256xi32>) {
        ^bb0(%arg2: i32, %arg3: i32):
          %5 = arith.addi %arg2, %arg3 : i32
          linalg.yield %5 : i32
        } -> tensor<512x256xi32>
        flow.dispatch.tensor.store %4, %1, offsets = [0, 0], sizes = [512, 256], strides = [1, 1] : tensor<512x256xi32> -> !flow.dispatch.tensor<writeonly:512x256xi32>
        return
      }
    }
  }
  func.func private @_split_reduction_pass2() {
    %c1_i32 = arith.constant 1 : i32
    %c128_i32 = arith.constant 128 : i32
    %c512 = arith.constant 512 : index
    %c256 = arith.constant 256 : index
    %c1 = arith.constant 1 : index
    // A single sizeof for tensor<512x256xi32> (%0) now sizes both the expected-value splat and
    // the dispatch result.
    %0 = stream.tensor.sizeof tensor<512x256xi32> : index
    %1 = stream.tensor.splat %c128_i32 : i32 -> tensor<512x256xi32> in !stream.resource<*>{%0}
    %2 = stream.tensor.sizeof tensor<512x256x128xi32> : index
    %3 = stream.tensor.splat %c1_i32 : i32 -> tensor<512x256x128xi32> in !stream.resource<*>{%2}
    %4 = util.do_not_optimize(%3) : !stream.resource<*>
    %5 = stream.resource.size %4 : !stream.resource<*>
    %6 = stream.async.dispatch @_split_reduction_pass2_dispatch_0::@_split_reduction_pass2_dispatch_0_generic_512x256x128[%c512, %c256, %c1](%4) : (!stream.resource<*>{%5}) -> !stream.resource<*>{%0}
    %7 = stream.async.transfer %6 : !stream.resource<*>{%0} -> !stream.resource<external>{%0}
    %8 = stream.tensor.export %7 : tensor<512x256xi32> in !stream.resource<external>{%0} -> tensor<512x256xi32>
    %9 = stream.async.transfer %1 : !stream.resource<*>{%0} -> !stream.resource<external>{%0}
    %10 = stream.tensor.export %9 : tensor<512x256xi32> in !stream.resource<external>{%0} -> tensor<512x256xi32>
    check.expect_eq(%8, %10) : tensor<512x256xi32>
    return
  }
}
// -----// IR Dump After EncodeHostTensors (iree-stream-encode-host-tensors) ('func.func' operation: @_split_reduction_pass2) //----- //
// Web-scrape residue (a trailing " | |" on every line) removed and indentation restored so this
// dump is valid MLIR text again; no operation, operand, or type was changed.
#map0 = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1)>
module {
  func.func @split_reduction_pass2() attributes {iree.abi.stub} {
    call @_split_reduction_pass2() : () -> ()
    return
  }
  stream.executable private @_split_reduction_pass2_dispatch_0 {
    stream.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) {
      %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2
      stream.return %x, %y, %z : index, index, index
    }
    builtin.module {
      func.func @_split_reduction_pass2_dispatch_0_generic_512x256x128(%arg0: !stream.binding, %arg1: !stream.binding) {
        %c0 = arith.constant 0 : index
        %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:512x256x128xi32>
        %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:512x256xi32>
        %2 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [512, 256, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:512x256x128xi32> -> tensor<512x256x128xi32>
        %3 = linalg.init_tensor [512, 256] : tensor<512x256xi32>
        %4 = linalg.generic {indexing_maps = [#map0, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%2 : tensor<512x256x128xi32>) outs(%3 : tensor<512x256xi32>) {
        ^bb0(%arg2: i32, %arg3: i32):
          %5 = arith.addi %arg2, %arg3 : i32
          linalg.yield %5 : i32
        } -> tensor<512x256xi32>
        flow.dispatch.tensor.store %4, %1, offsets = [0, 0], sizes = [512, 256], strides = [1, 1] : tensor<512x256xi32> -> !flow.dispatch.tensor<writeonly:512x256xi32>
        return
      }
    }
  }
  func.func private @_split_reduction_pass2() {
    // Resource sizes are now index constants in place of stream.tensor.sizeof:
    // 524288 = 512*256*4 and 67108864 = 512*256*128*4 (presumably bytes for i32 elements).
    %c524288 = arith.constant 524288 : index
    %c67108864 = arith.constant 67108864 : index
    %c1_i32 = arith.constant 1 : i32
    %c128_i32 = arith.constant 128 : i32
    %c512 = arith.constant 512 : index
    %c256 = arith.constant 256 : index
    %c1 = arith.constant 1 : index
    // The tensor-level splats are now stream.async.splat ops that carry no tensor type.
    %0 = stream.async.splat %c128_i32 : i32 -> !stream.resource<*>{%c524288}
    %1 = stream.async.splat %c1_i32 : i32 -> !stream.resource<*>{%c67108864}
    %2 = util.do_not_optimize(%1) : !stream.resource<*>
    %3 = stream.resource.size %2 : !stream.resource<*>
    %4 = stream.async.dispatch @_split_reduction_pass2_dispatch_0::@_split_reduction_pass2_dispatch_0_generic_512x256x128[%c512, %c256, %c1](%2) : (!stream.resource<*>{%3}) -> !stream.resource<*>{%c524288}
    %5 = stream.async.transfer %4 : !stream.resource<*>{%c524288} -> !stream.resource<external>{%c524288}
    %6 = stream.tensor.export %5 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
    %7 = stream.async.transfer %0 : !stream.resource<*>{%c524288} -> !stream.resource<external>{%c524288}
    %8 = stream.tensor.export %7 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
    check.expect_eq(%6, %8) : tensor<512x256xi32>
    return
  }
}
// -----// IR Dump After Canonicalizer (canonicalize) ('func.func' operation: @_split_reduction_pass2) //----- //
// Web-scrape residue (a trailing " | |" on every line) removed and indentation restored so this
// dump is valid MLIR text again; no operation, operand, or type was changed.
#map0 = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1)>
module {
  func.func @split_reduction_pass2() attributes {iree.abi.stub} {
    call @_split_reduction_pass2() : () -> ()
    return
  }
  stream.executable private @_split_reduction_pass2_dispatch_0 {
    stream.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) {
      %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2
      stream.return %x, %y, %z : index, index, index
    }
    builtin.module {
      func.func @_split_reduction_pass2_dispatch_0_generic_512x256x128(%arg0: !stream.binding, %arg1: !stream.binding) {
        %c0 = arith.constant 0 : index
        %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:512x256x128xi32>
        %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:512x256xi32>
        %2 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [512, 256, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:512x256x128xi32> -> tensor<512x256x128xi32>
        %3 = linalg.init_tensor [512, 256] : tensor<512x256xi32>
        %4 = linalg.generic {indexing_maps = [#map0, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%2 : tensor<512x256x128xi32>) outs(%3 : tensor<512x256xi32>) {
        ^bb0(%arg2: i32, %arg3: i32):
          %5 = arith.addi %arg2, %arg3 : i32
          linalg.yield %5 : i32
        } -> tensor<512x256xi32>
        flow.dispatch.tensor.store %4, %1, offsets = [0, 0], sizes = [512, 256], strides = [1, 1] : tensor<512x256xi32> -> !flow.dispatch.tensor<writeonly:512x256xi32>
        return
      }
    }
  }
  func.func private @_split_reduction_pass2() {
    %c524288 = arith.constant 524288 : index
    %c67108864 = arith.constant 67108864 : index
    %c1_i32 = arith.constant 1 : i32
    %c128_i32 = arith.constant 128 : i32
    %c512 = arith.constant 512 : index
    %c256 = arith.constant 256 : index
    %c1 = arith.constant 1 : index
    %0 = stream.async.splat %c1_i32 : i32 -> !stream.resource<*>{%c67108864}
    %1 = util.do_not_optimize(%0) : !stream.resource<*>
    %2 = stream.resource.size %1 : !stream.resource<*>
    %3 = stream.async.dispatch @_split_reduction_pass2_dispatch_0::@_split_reduction_pass2_dispatch_0_generic_512x256x128[%c512, %c256, %c1](%1) : (!stream.resource<*>{%2}) -> !stream.resource<*>{%c524288}
    %4 = stream.async.transfer %3 : !stream.resource<*>{%c524288} -> !stream.resource<external>{%c524288}
    %5 = stream.tensor.export %4 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
    // The expected-value splat (128) now appears after the dispatch, directly before the
    // transfer that consumes it.
    %6 = stream.async.splat %c128_i32 : i32 -> !stream.resource<*>{%c524288}
    %7 = stream.async.transfer %6 : !stream.resource<*>{%c524288} -> !stream.resource<external>{%c524288}
    %8 = stream.tensor.export %7 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
    check.expect_eq(%5, %8) : tensor<512x256xi32>
    return
  }
}
// -----// IR Dump After RefineUsage (iree-stream-refine-usage) ('builtin.module' operation) //----- //
// Web-scrape residue (a trailing " | |" on every line) removed and indentation restored so this
// dump is valid MLIR text again; no operation, operand, or type was changed.
#map0 = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1)>
module {
  func.func @split_reduction_pass2() attributes {iree.abi.stub} {
    call @_split_reduction_pass2() : () -> ()
    return
  }
  stream.executable private @_split_reduction_pass2_dispatch_0 {
    stream.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) {
      %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2
      stream.return %x, %y, %z : index, index, index
    }
    builtin.module {
      func.func @_split_reduction_pass2_dispatch_0_generic_512x256x128(%arg0: !stream.binding, %arg1: !stream.binding) {
        %c0 = arith.constant 0 : index
        %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:512x256x128xi32>
        %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:512x256xi32>
        %2 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [512, 256, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:512x256x128xi32> -> tensor<512x256x128xi32>
        %3 = linalg.init_tensor [512, 256] : tensor<512x256xi32>
        %4 = linalg.generic {indexing_maps = [#map0, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%2 : tensor<512x256x128xi32>) outs(%3 : tensor<512x256xi32>) {
        ^bb0(%arg2: i32, %arg3: i32):
          %5 = arith.addi %arg2, %arg3 : i32
          linalg.yield %5 : i32
        } -> tensor<512x256xi32>
        flow.dispatch.tensor.store %4, %1, offsets = [0, 0], sizes = [512, 256], strides = [1, 1] : tensor<512x256xi32> -> !flow.dispatch.tensor<writeonly:512x256xi32>
        return
      }
    }
  }
  func.func private @_split_reduction_pass2() {
    %c524288 = arith.constant 524288 : index
    %c67108864 = arith.constant 67108864 : index
    %c1_i32 = arith.constant 1 : i32
    %c128_i32 = arith.constant 128 : i32
    %c512 = arith.constant 512 : index
    %c256 = arith.constant 256 : index
    %c1 = arith.constant 1 : index
    // No !stream.resource<*> remains: the input is a transient resource, and the dispatch
    // result and expected value are external resources. This dump contains no
    // stream.async.transfer ops.
    %0 = stream.async.splat %c1_i32 : i32 -> !stream.resource<transient>{%c67108864}
    %1 = util.do_not_optimize(%0) : !stream.resource<transient>
    %2 = stream.resource.size %1 : !stream.resource<transient>
    %3 = stream.async.dispatch @_split_reduction_pass2_dispatch_0::@_split_reduction_pass2_dispatch_0_generic_512x256x128[%c512, %c256, %c1](%1) : (!stream.resource<transient>{%2}) -> !stream.resource<external>{%c524288}
    %4 = stream.tensor.export %3 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
    %5 = stream.async.splat %c128_i32 : i32 -> !stream.resource<external>{%c524288}
    %6 = stream.tensor.export %5 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
    check.expect_eq(%4, %6) : tensor<512x256xi32>
    return
  }
}
// -----// IR Dump After ScheduleExecution (iree-stream-schedule-execution) ('func.func' operation: @_split_reduction_pass2) //----- //
// Web-scrape residue (a trailing " | |" on every line) removed and indentation restored so this
// dump is valid MLIR text again; no operation, operand, or type was changed.
#map0 = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1)>
module {
  func.func @split_reduction_pass2() attributes {iree.abi.stub} {
    call @_split_reduction_pass2() : () -> ()
    return
  }
  stream.executable private @_split_reduction_pass2_dispatch_0 {
    stream.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) {
      %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2
      stream.return %x, %y, %z : index, index, index
    }
    builtin.module {
      func.func @_split_reduction_pass2_dispatch_0_generic_512x256x128(%arg0: !stream.binding, %arg1: !stream.binding) {
        %c0 = arith.constant 0 : index
        %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:512x256x128xi32>
        %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:512x256xi32>
        %2 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [512, 256, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:512x256x128xi32> -> tensor<512x256x128xi32>
        %3 = linalg.init_tensor [512, 256] : tensor<512x256xi32>
        %4 = linalg.generic {indexing_maps = [#map0, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%2 : tensor<512x256x128xi32>) outs(%3 : tensor<512x256xi32>) {
        ^bb0(%arg2: i32, %arg3: i32):
          %5 = arith.addi %arg2, %arg3 : i32
          linalg.yield %5 : i32
        } -> tensor<512x256xi32>
        flow.dispatch.tensor.store %4, %1, offsets = [0, 0], sizes = [512, 256], strides = [1, 1] : tensor<512x256xi32> -> !flow.dispatch.tensor<writeonly:512x256xi32>
        return
      }
    }
  }
  func.func private @_split_reduction_pass2() {
    %c524288 = arith.constant 524288 : index
    %c67108864 = arith.constant 67108864 : index
    %c1_i32 = arith.constant 1 : i32
    %c128_i32 = arith.constant 128 : i32
    %c512 = arith.constant 512 : index
    %c256 = arith.constant 256 : index
    %c1 = arith.constant 1 : index
    // First execute region: produces the input splat (1); its result is awaited before
    // util.do_not_optimize consumes it.
    %results, %result_timepoint = stream.async.execute with() -> !stream.resource<transient>{%c67108864} {
      %6 = stream.async.splat %c1_i32 : i32 -> !stream.resource<transient>{%c67108864}
      stream.yield %6 : !stream.resource<transient>{%c67108864}
    } => !stream.timepoint
    %0 = stream.timepoint.await %result_timepoint => %results : !stream.resource<transient>{%c67108864}
    %1 = util.do_not_optimize(%0) : !stream.resource<transient>
    %2 = stream.resource.size %1 : !stream.resource<transient>
    // Second execute region: runs the dispatch and the expected-value splat (128) and yields
    // both as external resources.
    %results_0:2, %result_timepoint_1 = stream.async.execute with(%1 as %arg0: !stream.resource<transient>{%2}) -> (!stream.resource<external>{%c524288}, !stream.resource<external>{%c524288}) {
      %6 = stream.async.dispatch @_split_reduction_pass2_dispatch_0::@_split_reduction_pass2_dispatch_0_generic_512x256x128[%c512, %c256, %c1](%arg0) : (!stream.resource<transient>{%2}) -> !stream.resource<external>{%c524288}
      %7 = stream.async.splat %c128_i32 : i32 -> !stream.resource<external>{%c524288}
      stream.yield %6, %7 : !stream.resource<external>{%c524288}, !stream.resource<external>{%c524288}
    } => !stream.timepoint
    %3:2 = stream.timepoint.await %result_timepoint_1 => %results_0#0, %results_0#1 : !stream.resource<external>{%c524288}, !stream.resource<external>{%c524288}
    %4 = stream.tensor.export %3#0 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
    %5 = stream.tensor.export %3#1 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
    check.expect_eq(%4, %5) : tensor<512x256xi32>
    return
  }
}
// -----// IR Dump After ScheduleConcurrency (iree-stream-schedule-concurrency) ('func.func' operation: @_split_reduction_pass2) //----- // | |
#map0 = affine_map<(d0, d1, d2) -> (d0, d1, d2)> | |
#map1 = affine_map<(d0, d1, d2) -> (d0, d1)> | |
module { | |
func.func @split_reduction_pass2() attributes {iree.abi.stub} { | |
call @_split_reduction_pass2() : () -> () | |
return | |
} | |
stream.executable private @_split_reduction_pass2_dispatch_0 { | |
stream.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2 | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @_split_reduction_pass2_dispatch_0_generic_512x256x128(%arg0: !stream.binding, %arg1: !stream.binding) { | |
%c0 = arith.constant 0 : index | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:512x256x128xi32> | |
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:512x256xi32> | |
%2 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [512, 256, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:512x256x128xi32> -> tensor<512x256x128xi32> | |
%3 = linalg.init_tensor [512, 256] : tensor<512x256xi32> | |
%4 = linalg.generic {indexing_maps = [#map0, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%2 : tensor<512x256x128xi32>) outs(%3 : tensor<512x256xi32>) { | |
^bb0(%arg2: i32, %arg3: i32): | |
%5 = arith.addi %arg2, %arg3 : i32 | |
linalg.yield %5 : i32 | |
} -> tensor<512x256xi32> | |
flow.dispatch.tensor.store %4, %1, offsets = [0, 0], sizes = [512, 256], strides = [1, 1] : tensor<512x256xi32> -> !flow.dispatch.tensor<writeonly:512x256xi32> | |
return | |
} | |
} | |
} | |
func.func private @_split_reduction_pass2() { | |
%c524288 = arith.constant 524288 : index | |
%c67108864 = arith.constant 67108864 : index | |
%c1_i32 = arith.constant 1 : i32 | |
%c128_i32 = arith.constant 128 : i32 | |
%c512 = arith.constant 512 : index | |
%c256 = arith.constant 256 : index | |
%c1 = arith.constant 1 : index | |
%results, %result_timepoint = stream.async.execute with() -> !stream.resource<transient>{%c67108864} { | |
%6 = stream.async.splat %c1_i32 : i32 -> !stream.resource<transient>{%c67108864} | |
stream.yield %6 : !stream.resource<transient>{%c67108864} | |
} => !stream.timepoint | |
%0 = stream.timepoint.await %result_timepoint => %results : !stream.resource<transient>{%c67108864} | |
%1 = util.do_not_optimize(%0) : !stream.resource<transient> | |
%2 = stream.resource.size %1 : !stream.resource<transient> | |
%results_0:2, %result_timepoint_1 = stream.async.execute with(%1 as %arg0: !stream.resource<transient>{%2}) -> (!stream.resource<external>{%c524288}, !stream.resource<external>{%c524288}) { | |
%6:2 = stream.async.concurrent with(%arg0 as %arg1: !stream.resource<transient>{%2}) -> (!stream.resource<external>{%c524288}, !stream.resource<external>{%c524288}) { | |
%7 = stream.async.dispatch @_split_reduction_pass2_dispatch_0::@_split_reduction_pass2_dispatch_0_generic_512x256x128[%c512, %c256, %c1](%arg1) : (!stream.resource<transient>{%2}) -> !stream.resource<external>{%c524288} | |
%8 = stream.async.splat %c128_i32 : i32 -> !stream.resource<external>{%c524288} | |
stream.yield %7, %8 : !stream.resource<external>{%c524288}, !stream.resource<external>{%c524288} | |
} | |
stream.yield %6#0, %6#1 : !stream.resource<external>{%c524288}, !stream.resource<external>{%c524288} | |
} => !stream.timepoint | |
%3:2 = stream.timepoint.await %result_timepoint_1 => %results_0#0, %results_0#1 : !stream.resource<external>{%c524288}, !stream.resource<external>{%c524288} | |
%4 = stream.tensor.export %3#0 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32> | |
%5 = stream.tensor.export %3#1 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32> | |
check.expect_eq(%4, %5) : tensor<512x256xi32> | |
return | |
} | |
} | |
// -----// IR Dump After PropagateTimepoints (iree-stream-propagate-timepoints) ('builtin.module' operation) //----- // | |
#map0 = affine_map<(d0, d1, d2) -> (d0, d1, d2)> | |
#map1 = affine_map<(d0, d1, d2) -> (d0, d1)> | |
module { | |
func.func @split_reduction_pass2() attributes {iree.abi.stub} { | |
call @_split_reduction_pass2() : () -> () | |
return | |
} | |
stream.executable private @_split_reduction_pass2_dispatch_0 { | |
stream.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2 | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @_split_reduction_pass2_dispatch_0_generic_512x256x128(%arg0: !stream.binding, %arg1: !stream.binding) { | |
%c0 = arith.constant 0 : index | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:512x256x128xi32> | |
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:512x256xi32> | |
%2 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [512, 256, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:512x256x128xi32> -> tensor<512x256x128xi32> | |
%3 = linalg.init_tensor [512, 256] : tensor<512x256xi32> | |
%4 = linalg.generic {indexing_maps = [#map0, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%2 : tensor<512x256x128xi32>) outs(%3 : tensor<512x256xi32>) { | |
^bb0(%arg2: i32, %arg3: i32): | |
%5 = arith.addi %arg2, %arg3 : i32 | |
linalg.yield %5 : i32 | |
} -> tensor<512x256xi32> | |
flow.dispatch.tensor.store %4, %1, offsets = [0, 0], sizes = [512, 256], strides = [1, 1] : tensor<512x256xi32> -> !flow.dispatch.tensor<writeonly:512x256xi32> | |
return | |
} | |
} | |
} | |
func.func private @_split_reduction_pass2() { | |
%c524288 = arith.constant 524288 : index | |
%c67108864 = arith.constant 67108864 : index | |
%c1_i32 = arith.constant 1 : i32 | |
%c128_i32 = arith.constant 128 : i32 | |
%c512 = arith.constant 512 : index | |
%c256 = arith.constant 256 : index | |
%c1 = arith.constant 1 : index | |
%results, %result_timepoint = stream.async.execute with() -> !stream.resource<transient>{%c67108864} { | |
%8 = stream.async.splat %c1_i32 : i32 -> !stream.resource<transient>{%c67108864} | |
stream.yield %8 : !stream.resource<transient>{%c67108864} | |
} => !stream.timepoint | |
%0 = stream.timepoint.await %result_timepoint => %results : !stream.resource<transient>{%c67108864} | |
%1 = util.do_not_optimize(%0) : !stream.resource<transient> | |
%2 = stream.resource.size %1 : !stream.resource<transient> | |
%3 = stream.timepoint.immediate => !stream.timepoint | |
%4 = stream.timepoint.immediate => !stream.timepoint | |
%results_0:2, %result_timepoint_1 = stream.async.execute await(%4) => with(%1 as %arg0: !stream.resource<transient>{%2}) -> (!stream.resource<external>{%c524288}, !stream.resource<external>{%c524288}) { | |
%8:2 = stream.async.concurrent with(%arg0 as %arg1: !stream.resource<transient>{%2}) -> (!stream.resource<external>{%c524288}, !stream.resource<external>{%c524288}) { | |
%9 = stream.async.dispatch @_split_reduction_pass2_dispatch_0::@_split_reduction_pass2_dispatch_0_generic_512x256x128[%c512, %c256, %c1](%arg1) : (!stream.resource<transient>{%2}) -> !stream.resource<external>{%c524288} | |
%10 = stream.async.splat %c128_i32 : i32 -> !stream.resource<external>{%c524288} | |
stream.yield %9, %10 : !stream.resource<external>{%c524288}, !stream.resource<external>{%c524288} | |
} | |
stream.yield %8#0, %8#1 : !stream.resource<external>{%c524288}, !stream.resource<external>{%c524288} | |
} => !stream.timepoint | |
%5:2 = stream.timepoint.await %result_timepoint_1 => %results_0#0, %results_0#1 : !stream.resource<external>{%c524288}, !stream.resource<external>{%c524288} | |
%6 = stream.tensor.export %5#0 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32> | |
%7 = stream.tensor.export %5#1 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32> | |
check.expect_eq(%6, %7) : tensor<512x256xi32> | |
return | |
} | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) ('func.func' operation: @_split_reduction_pass2) //----- // | |
#map0 = affine_map<(d0, d1, d2) -> (d0, d1, d2)> | |
#map1 = affine_map<(d0, d1, d2) -> (d0, d1)> | |
module { | |
func.func @split_reduction_pass2() attributes {iree.abi.stub} { | |
call @_split_reduction_pass2() : () -> () | |
return | |
} | |
stream.executable private @_split_reduction_pass2_dispatch_0 { | |
stream.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2 | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @_split_reduction_pass2_dispatch_0_generic_512x256x128(%arg0: !stream.binding, %arg1: !stream.binding) { | |
%c0 = arith.constant 0 : index | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:512x256x128xi32> | |
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:512x256xi32> | |
%2 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [512, 256, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:512x256x128xi32> -> tensor<512x256x128xi32> | |
%3 = linalg.init_tensor [512, 256] : tensor<512x256xi32> | |
%4 = linalg.generic {indexing_maps = [#map0, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%2 : tensor<512x256x128xi32>) outs(%3 : tensor<512x256xi32>) { | |
^bb0(%arg2: i32, %arg3: i32): | |
%5 = arith.addi %arg2, %arg3 : i32 | |
linalg.yield %5 : i32 | |
} -> tensor<512x256xi32> | |
flow.dispatch.tensor.store %4, %1, offsets = [0, 0], sizes = [512, 256], strides = [1, 1] : tensor<512x256xi32> -> !flow.dispatch.tensor<writeonly:512x256xi32> | |
return | |
} | |
} | |
} | |
func.func private @_split_reduction_pass2() { | |
%c524288 = arith.constant 524288 : index | |
%c67108864 = arith.constant 67108864 : index | |
%c1_i32 = arith.constant 1 : i32 | |
%c128_i32 = arith.constant 128 : i32 | |
%c512 = arith.constant 512 : index | |
%c256 = arith.constant 256 : index | |
%c1 = arith.constant 1 : index | |
%results, %result_timepoint = stream.async.execute with() -> !stream.resource<transient>{%c67108864} { | |
%6 = stream.async.splat %c1_i32 : i32 -> !stream.resource<transient>{%c67108864} | |
stream.yield %6 : !stream.resource<transient>{%c67108864} | |
} => !stream.timepoint | |
%0 = stream.timepoint.await %result_timepoint => %results : !stream.resource<transient>{%c67108864} | |
%1 = util.do_not_optimize(%0) : !stream.resource<transient> | |
%2 = stream.resource.size %1 : !stream.resource<transient> | |
%results_0:2, %result_timepoint_1 = stream.async.execute with(%1 as %arg0: !stream.resource<transient>{%2}) -> (!stream.resource<external>{%c524288}, !stream.resource<external>{%c524288}) { | |
%6:2 = stream.async.concurrent with(%arg0 as %arg1: !stream.resource<transient>{%2}) -> (!stream.resource<external>{%c524288}, !stream.resource<external>{%c524288}) { | |
%7 = stream.async.dispatch @_split_reduction_pass2_dispatch_0::@_split_reduction_pass2_dispatch_0_generic_512x256x128[%c512, %c256, %c1](%arg1) : (!stream.resource<transient>{%2}) -> !stream.resource<external>{%c524288} | |
%8 = stream.async.splat %c128_i32 : i32 -> !stream.resource<external>{%c524288} | |
stream.yield %7, %8 : !stream.resource<external>{%c524288}, !stream.resource<external>{%c524288} | |
} | |
stream.yield %6#0, %6#1 : !stream.resource<external>{%c524288}, !stream.resource<external>{%c524288} | |
} => !stream.timepoint | |
%3:2 = stream.timepoint.await %result_timepoint_1 => %results_0#0, %results_0#1 : !stream.resource<external>{%c524288}, !stream.resource<external>{%c524288} | |
%4 = stream.tensor.export %3#0 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32> | |
%5 = stream.tensor.export %3#1 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32> | |
check.expect_eq(%4, %5) : tensor<512x256xi32> | |
return | |
} | |
} | |
// -----// IR Dump After ScheduleAllocation (iree-stream-schedule-allocation) ('func.func' operation: @_split_reduction_pass2) //----- // | |
#map0 = affine_map<(d0, d1, d2) -> (d0, d1, d2)> | |
#map1 = affine_map<(d0, d1, d2) -> (d0, d1)> | |
module { | |
func.func @split_reduction_pass2() attributes {iree.abi.stub} { | |
call @_split_reduction_pass2() : () -> () | |
return | |
} | |
stream.executable private @_split_reduction_pass2_dispatch_0 { | |
stream.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2 | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @_split_reduction_pass2_dispatch_0_generic_512x256x128(%arg0: !stream.binding, %arg1: !stream.binding) { | |
%c0 = arith.constant 0 : index | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:512x256x128xi32> | |
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:512x256xi32> | |
%2 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [512, 256, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:512x256x128xi32> -> tensor<512x256x128xi32> | |
%3 = linalg.init_tensor [512, 256] : tensor<512x256xi32> | |
%4 = linalg.generic {indexing_maps = [#map0, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%2 : tensor<512x256x128xi32>) outs(%3 : tensor<512x256xi32>) { | |
^bb0(%arg2: i32, %arg3: i32): | |
%5 = arith.addi %arg2, %arg3 : i32 | |
linalg.yield %5 : i32 | |
} -> tensor<512x256xi32> | |
flow.dispatch.tensor.store %4, %1, offsets = [0, 0], sizes = [512, 256], strides = [1, 1] : tensor<512x256xi32> -> !flow.dispatch.tensor<writeonly:512x256xi32> | |
return | |
} | |
} | |
} | |
func.func private @_split_reduction_pass2() { | |
%c524288 = arith.constant 524288 : index | |
%c67108864 = arith.constant 67108864 : index | |
%c1_i32 = arith.constant 1 : i32 | |
%c128_i32 = arith.constant 128 : i32 | |
%c512 = arith.constant 512 : index | |
%c256 = arith.constant 256 : index | |
%c1 = arith.constant 1 : index | |
%0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c67108864} | |
%c0 = arith.constant 0 : index | |
%1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c67108864}) { | |
stream.cmd.fill %c1_i32, %arg0[%c0 for %c67108864] : i32 -> !stream.resource<transient>{%c67108864} | |
} => !stream.timepoint | |
%2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c67108864} | |
%3 = util.do_not_optimize(%2) : !stream.resource<transient> | |
%4 = stream.resource.size %3 : !stream.resource<transient> | |
%c0_0 = arith.constant 0 : index | |
%5:2 = stream.resource.alloc uninitialized : !stream.resource<external>{%c524288}, !stream.resource<external>{%c524288} | |
%6 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%4}, %5#0 as %arg1: !stream.resource<external>{%c524288}, %5#1 as %arg2: !stream.resource<external>{%c524288}) { | |
stream.cmd.concurrent { | |
stream.cmd.dispatch @_split_reduction_pass2_dispatch_0::@_split_reduction_pass2_dispatch_0_generic_512x256x128[%c512, %c256, %c1] { | |
ro %arg0[%c0_0 for %4] : !stream.resource<transient>{%4}, | |
wo %arg1[%c0_0 for %c524288] : !stream.resource<external>{%c524288} | |
} | |
stream.cmd.fill %c128_i32, %arg2[%c0_0 for %c524288] : i32 -> !stream.resource<external>{%c524288} | |
} | |
} => !stream.timepoint | |
%7:2 = stream.timepoint.await %6 => %5#0, %5#1 : !stream.resource<external>{%c524288}, !stream.resource<external>{%c524288} | |
%8 = stream.tensor.export %7#0 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32> | |
%9 = stream.tensor.export %7#1 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32> | |
check.expect_eq(%8, %9) : tensor<512x256xi32> | |
return | |
} | |
} | |
// -----// IR Dump After PackAllocations (iree-stream-pack-allocations) ('func.func' operation: @_split_reduction_pass2) //----- // | |
#map0 = affine_map<(d0, d1, d2) -> (d0, d1, d2)> | |
#map1 = affine_map<(d0, d1, d2) -> (d0, d1)> | |
module { | |
func.func @split_reduction_pass2() attributes {iree.abi.stub} { | |
call @_split_reduction_pass2() : () -> () | |
return | |
} | |
stream.executable private @_split_reduction_pass2_dispatch_0 { | |
stream.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2 | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @_split_reduction_pass2_dispatch_0_generic_512x256x128(%arg0: !stream.binding, %arg1: !stream.binding) { | |
%c0 = arith.constant 0 : index | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:512x256x128xi32> | |
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:512x256xi32> | |
%2 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [512, 256, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:512x256x128xi32> -> tensor<512x256x128xi32> | |
%3 = linalg.init_tensor [512, 256] : tensor<512x256xi32> | |
%4 = linalg.generic {indexing_maps = [#map0, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%2 : tensor<512x256x128xi32>) outs(%3 : tensor<512x256xi32>) { | |
^bb0(%arg2: i32, %arg3: i32): | |
%5 = arith.addi %arg2, %arg3 : i32 | |
linalg.yield %5 : i32 | |
} -> tensor<512x256xi32> | |
flow.dispatch.tensor.store %4, %1, offsets = [0, 0], sizes = [512, 256], strides = [1, 1] : tensor<512x256xi32> -> !flow.dispatch.tensor<writeonly:512x256xi32> | |
return | |
} | |
} | |
} | |
func.func private @_split_reduction_pass2() { | |
%c524288 = arith.constant 524288 : index | |
%c67108864 = arith.constant 67108864 : index | |
%c1_i32 = arith.constant 1 : i32 | |
%c128_i32 = arith.constant 128 : i32 | |
%c512 = arith.constant 512 : index | |
%c256 = arith.constant 256 : index | |
%c1 = arith.constant 1 : index | |
%0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c67108864} | |
%c0 = arith.constant 0 : index | |
%1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c67108864}) { | |
stream.cmd.fill %c1_i32, %arg0[%c0 for %c67108864] : i32 -> !stream.resource<transient>{%c67108864} | |
} => !stream.timepoint | |
%2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c67108864} | |
%3 = util.do_not_optimize(%2) : !stream.resource<transient> | |
%4 = stream.resource.size %3 : !stream.resource<transient> | |
%c0_0 = arith.constant 0 : index | |
%5:3 = stream.resource.pack slices({ | |
[0, 0] = %c524288, | |
[0, 0] = %c524288 | |
}) : index | |
%6 = stream.resource.alloc uninitialized : !stream.resource<external>{%5#0} | |
%7 = stream.resource.subview %6[%5#1] : !stream.resource<external>{%5#0} -> !stream.resource<external>{%c524288} | |
%8 = stream.resource.subview %6[%5#2] : !stream.resource<external>{%5#0} -> !stream.resource<external>{%c524288} | |
%9 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%4}, %7 as %arg1: !stream.resource<external>{%c524288}, %8 as %arg2: !stream.resource<external>{%c524288}) { | |
stream.cmd.concurrent { | |
stream.cmd.dispatch @_split_reduction_pass2_dispatch_0::@_split_reduction_pass2_dispatch_0_generic_512x256x128[%c512, %c256, %c1] { | |
ro %arg0[%c0_0 for %4] : !stream.resource<transient>{%4}, | |
wo %arg1[%c0_0 for %c524288] : !stream.resource<external>{%c524288} | |
} | |
stream.cmd.fill %c128_i32, %arg2[%c0_0 for %c524288] : i32 -> !stream.resource<external>{%c524288} | |
} | |
} => !stream.timepoint | |
%10:2 = stream.timepoint.await %9 => %7, %8 : !stream.resource<external>{%c524288}, !stream.resource<external>{%c524288} | |
%11 = stream.tensor.export %10#0 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32> | |
%12 = stream.tensor.export %10#1 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32> | |
check.expect_eq(%11, %12) : tensor<512x256xi32> | |
return | |
} | |
} | |
// -----// IR Dump After LayoutSlices (iree-stream-layout-slices) ('func.func' operation: @_split_reduction_pass2) //----- // | |
#map0 = affine_map<(d0, d1, d2) -> (d0, d1, d2)> | |
#map1 = affine_map<(d0, d1, d2) -> (d0, d1)> | |
module { | |
func.func @split_reduction_pass2() attributes {iree.abi.stub} { | |
call @_split_reduction_pass2() : () -> () | |
return | |
} | |
stream.executable private @_split_reduction_pass2_dispatch_0 { | |
stream.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2 | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @_split_reduction_pass2_dispatch_0_generic_512x256x128(%arg0: !stream.binding, %arg1: !stream.binding) { | |
%c0 = arith.constant 0 : index | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:512x256x128xi32> | |
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:512x256xi32> | |
%2 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [512, 256, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:512x256x128xi32> -> tensor<512x256x128xi32> | |
%3 = linalg.init_tensor [512, 256] : tensor<512x256xi32> | |
%4 = linalg.generic {indexing_maps = [#map0, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%2 : tensor<512x256x128xi32>) outs(%3 : tensor<512x256xi32>) { | |
^bb0(%arg2: i32, %arg3: i32): | |
%5 = arith.addi %arg2, %arg3 : i32 | |
linalg.yield %5 : i32 | |
} -> tensor<512x256xi32> | |
flow.dispatch.tensor.store %4, %1, offsets = [0, 0], sizes = [512, 256], strides = [1, 1] : tensor<512x256xi32> -> !flow.dispatch.tensor<writeonly:512x256xi32> | |
return | |
} | |
} | |
} | |
func.func private @_split_reduction_pass2() { | |
%c524288 = arith.constant 524288 : index | |
%c67108864 = arith.constant 67108864 : index | |
%c1_i32 = arith.constant 1 : i32 | |
%c128_i32 = arith.constant 128 : i32 | |
%c512 = arith.constant 512 : index | |
%c256 = arith.constant 256 : index | |
%c1 = arith.constant 1 : index | |
%0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c67108864} | |
%c0 = arith.constant 0 : index | |
%1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c67108864}) { | |
stream.cmd.fill %c1_i32, %arg0[%c0 for %c67108864] : i32 -> !stream.resource<transient>{%c67108864} | |
} => !stream.timepoint | |
%2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c67108864} | |
%3 = util.do_not_optimize(%2) : !stream.resource<transient> | |
%4 = stream.resource.size %3 : !stream.resource<transient> | |
%c0_0 = arith.constant 0 : index | |
%c0_1 = arith.constant 0 : index | |
%c524288_2 = arith.constant 524288 : index | |
%c524288_3 = arith.constant 524288 : index | |
%c1048576 = arith.constant 1048576 : index | |
%c1048576_4 = arith.constant 1048576 : index | |
%5 = stream.resource.alloc uninitialized : !stream.resource<external>{%c1048576_4} | |
%6 = stream.resource.subview %5[%c0_1] : !stream.resource<external>{%c1048576_4} -> !stream.resource<external>{%c524288} | |
%7 = stream.resource.subview %5[%c524288_3] : !stream.resource<external>{%c1048576_4} -> !stream.resource<external>{%c524288} | |
%8 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%4}, %6 as %arg1: !stream.resource<external>{%c524288}, %7 as %arg2: !stream.resource<external>{%c524288}) { | |
stream.cmd.concurrent { | |
stream.cmd.dispatch @_split_reduction_pass2_dispatch_0::@_split_reduction_pass2_dispatch_0_generic_512x256x128[%c512, %c256, %c1] { | |
ro %arg0[%c0_0 for %4] : !stream.resource<transient>{%4}, | |
wo %arg1[%c0_0 for %c524288] : !stream.resource<external>{%c524288} | |
} | |
stream.cmd.fill %c128_i32, %arg2[%c0_0 for %c524288] : i32 -> !stream.resource<external>{%c524288} | |
} | |
} => !stream.timepoint | |
%9:2 = stream.timepoint.await %8 => %6, %7 : !stream.resource<external>{%c524288}, !stream.resource<external>{%c524288} | |
%10 = stream.tensor.export %9#0 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32> | |
%11 = stream.tensor.export %9#1 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32> | |
check.expect_eq(%10, %11) : tensor<512x256xi32> | |
return | |
} | |
} | |
// -----// IR Dump After PropagateSubranges (iree-util-propagate-subranges) ('builtin.module' operation) //----- // | |
#map0 = affine_map<(d0, d1, d2) -> (d0, d1, d2)> | |
#map1 = affine_map<(d0, d1, d2) -> (d0, d1)> | |
module { | |
func.func @split_reduction_pass2() attributes {iree.abi.stub} { | |
call @_split_reduction_pass2() : () -> () | |
return | |
} | |
stream.executable private @_split_reduction_pass2_dispatch_0 { | |
stream.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2 | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @_split_reduction_pass2_dispatch_0_generic_512x256x128(%arg0: !stream.binding, %arg1: !stream.binding) { | |
%c0 = arith.constant 0 : index | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:512x256x128xi32> | |
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:512x256xi32> | |
%2 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [512, 256, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:512x256x128xi32> -> tensor<512x256x128xi32> | |
%3 = linalg.init_tensor [512, 256] : tensor<512x256xi32> | |
%4 = linalg.generic {indexing_maps = [#map0, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%2 : tensor<512x256x128xi32>) outs(%3 : tensor<512x256xi32>) { | |
^bb0(%arg2: i32, %arg3: i32): | |
%5 = arith.addi %arg2, %arg3 : i32 | |
linalg.yield %5 : i32 | |
} -> tensor<512x256xi32> | |
flow.dispatch.tensor.store %4, %1, offsets = [0, 0], sizes = [512, 256], strides = [1, 1] : tensor<512x256xi32> -> !flow.dispatch.tensor<writeonly:512x256xi32> | |
return | |
} | |
} | |
} | |
func.func private @_split_reduction_pass2() { | |
%c0 = arith.constant 0 : index | |
%c524288 = arith.constant 524288 : index | |
%c67108864 = arith.constant 67108864 : index | |
%c1_i32 = arith.constant 1 : i32 | |
%c128_i32 = arith.constant 128 : i32 | |
%c512 = arith.constant 512 : index | |
%c256 = arith.constant 256 : index | |
%c1 = arith.constant 1 : index | |
%0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c67108864} | |
%c0_0 = arith.constant 0 : index | |
%1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c67108864}) { | |
stream.cmd.fill %c1_i32, %arg0[%c0_0 for %c67108864] : i32 -> !stream.resource<transient>{%c67108864} | |
} => !stream.timepoint | |
%2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c67108864} | |
%3 = util.do_not_optimize(%2) : !stream.resource<transient> | |
%4 = stream.resource.size %3 : !stream.resource<transient> | |
%c0_1 = arith.constant 0 : index | |
%c0_2 = arith.constant 0 : index | |
%c524288_3 = arith.constant 524288 : index | |
%c524288_4 = arith.constant 524288 : index | |
%c1048576 = arith.constant 1048576 : index | |
%c1048576_5 = arith.constant 1048576 : index | |
%5 = stream.resource.alloc uninitialized : !stream.resource<external>{%c1048576_5} | |
%6 = stream.resource.subview %5[%c0_2] : !stream.resource<external>{%c1048576_5} -> !stream.resource<external>{%c524288} | |
%7 = stream.resource.subview %5[%c524288_4] : !stream.resource<external>{%c1048576_5} -> !stream.resource<external>{%c524288} | |
%8 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%4}, %6 as %arg1: !stream.resource<external>{%c524288}, %7 as %arg2: !stream.resource<external>{%c524288}) { | |
stream.cmd.concurrent { | |
stream.cmd.dispatch @_split_reduction_pass2_dispatch_0::@_split_reduction_pass2_dispatch_0_generic_512x256x128[%c512, %c256, %c1] { | |
ro %arg0[%c0_1 for %4] : !stream.resource<transient>{%4}, | |
wo %arg1[%c0_1 for %c524288] : !stream.resource<external>{%c524288} | |
} | |
stream.cmd.fill %c128_i32, %arg2[%c0_1 for %c524288] : i32 -> !stream.resource<external>{%c524288} | |
} | |
} => !stream.timepoint | |
%9:2 = stream.timepoint.await %8 => %6, %7 : !stream.resource<external>{%c524288}, !stream.resource<external>{%c524288} | |
%10 = stream.tensor.export %9#0 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32> | |
%11 = stream.tensor.export %9#1 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32> | |
check.expect_eq(%10, %11) : tensor<512x256xi32> | |
return | |
} | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) ('func.func' operation: @_split_reduction_pass2) //----- // | |
#map0 = affine_map<(d0, d1, d2) -> (d0, d1, d2)> | |
#map1 = affine_map<(d0, d1, d2) -> (d0, d1)> | |
module { | |
func.func @split_reduction_pass2() attributes {iree.abi.stub} { | |
call @_split_reduction_pass2() : () -> () | |
return | |
} | |
stream.executable private @_split_reduction_pass2_dispatch_0 { | |
stream.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2 | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @_split_reduction_pass2_dispatch_0_generic_512x256x128(%arg0: !stream.binding, %arg1: !stream.binding) { | |
%c0 = arith.constant 0 : index | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:512x256x128xi32> | |
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:512x256xi32> | |
%2 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [512, 256, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:512x256x128xi32> -> tensor<512x256x128xi32> | |
%3 = linalg.init_tensor [512, 256] : tensor<512x256xi32> | |
%4 = linalg.generic {indexing_maps = [#map0, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%2 : tensor<512x256x128xi32>) outs(%3 : tensor<512x256xi32>) { | |
^bb0(%arg2: i32, %arg3: i32): | |
%5 = arith.addi %arg2, %arg3 : i32 | |
linalg.yield %5 : i32 | |
} -> tensor<512x256xi32> | |
flow.dispatch.tensor.store %4, %1, offsets = [0, 0], sizes = [512, 256], strides = [1, 1] : tensor<512x256xi32> -> !flow.dispatch.tensor<writeonly:512x256xi32> | |
return | |
} | |
} | |
} | |
func.func private @_split_reduction_pass2() { | |
%c1048576 = arith.constant 1048576 : index | |
%c0 = arith.constant 0 : index | |
%c524288 = arith.constant 524288 : index | |
%c67108864 = arith.constant 67108864 : index | |
%c1_i32 = arith.constant 1 : i32 | |
%c128_i32 = arith.constant 128 : i32 | |
%c512 = arith.constant 512 : index | |
%c256 = arith.constant 256 : index | |
%c1 = arith.constant 1 : index | |
%0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c67108864} | |
%1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c67108864}) { | |
stream.cmd.fill %c1_i32, %arg0[%c0 for %c67108864] : i32 -> !stream.resource<transient>{%c67108864} | |
} => !stream.timepoint | |
%2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c67108864} | |
%3 = util.do_not_optimize(%2) : !stream.resource<transient> | |
%4 = stream.resource.size %3 : !stream.resource<transient> | |
%5 = stream.resource.alloc uninitialized : !stream.resource<external>{%c1048576} | |
%6 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%4}, %5 as %arg1: !stream.resource<external>{%c1048576}) { | |
stream.cmd.concurrent { | |
stream.cmd.dispatch @_split_reduction_pass2_dispatch_0::@_split_reduction_pass2_dispatch_0_generic_512x256x128[%c512, %c256, %c1] { | |
ro %arg0[%c0 for %4] : !stream.resource<transient>{%4}, | |
wo %arg1[%c0 for %c524288] : !stream.resource<external>{%c1048576} | |
} | |
stream.cmd.fill %c128_i32, %arg1[%c524288 for %c524288] : i32 -> !stream.resource<external>{%c1048576} | |
} | |
} => !stream.timepoint | |
%7 = stream.timepoint.await %6 => %5 : !stream.resource<external>{%c1048576} | |
%8 = stream.resource.subview %7[%c0] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288} | |
%9 = stream.resource.subview %7[%c524288] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288} | |
%10 = stream.tensor.export %8 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32> | |
%11 = stream.tensor.export %9 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32> | |
check.expect_eq(%10, %11) : tensor<512x256xi32> | |
return | |
} | |
} | |
// -----// IR Dump After FuseDispatchBindings (iree-stream-fuse-dispatch-bindings) ('builtin.module' operation) //----- // | |
#map0 = affine_map<(d0, d1, d2) -> (d0, d1, d2)> | |
#map1 = affine_map<(d0, d1, d2) -> (d0, d1)> | |
// Snapshot taken after iree-stream-fuse-dispatch-bindings.
// In this dump the dispatch entry point takes two extra `index` operands
// (%arg2, %arg3) that are added to the binding subspan offsets, and the host
// dispatch site passes %c0 for both of them.
module { | |
// Public entry point carrying the iree.abi.stub attribute; it only forwards to the private function.
func.func @split_reduction_pass2() attributes {iree.abi.stub} { | |
call @_split_reduction_pass2() : () -> () | |
return | |
} | |
stream.executable private @_split_reduction_pass2_dispatch_0 { | |
// The exported workgroup count is computed from the three export arguments.
stream.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2 | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @_split_reduction_pass2_dispatch_0_generic_512x256x128(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: index, %arg3: index) { | |
%c0 = arith.constant 0 : index | |
// Binding offsets are formed as 0 + the operand supplied by the dispatch site.
%0 = arith.addi %c0, %arg2 : index | |
%1 = stream.binding.subspan %arg0[%0] : !stream.binding -> !flow.dispatch.tensor<readonly:512x256x128xi32> | |
%2 = arith.addi %c0, %arg3 : index | |
%3 = stream.binding.subspan %arg1[%2] : !stream.binding -> !flow.dispatch.tensor<writeonly:512x256xi32> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [512, 256, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:512x256x128xi32> -> tensor<512x256x128xi32> | |
%5 = linalg.init_tensor [512, 256] : tensor<512x256xi32> | |
// Sum over the last (reduction) dimension d2: each input element is added to the output element.
// NOTE(review): outs(%5) is the raw linalg.init_tensor result and no fill of %5 is visible in
// this function — confirm the accumulator (%arg5) is really meant to start uninitialized.
%6 = linalg.generic {indexing_maps = [#map0, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%4 : tensor<512x256x128xi32>) outs(%5 : tensor<512x256xi32>) { | |
^bb0(%arg4: i32, %arg5: i32): | |
%7 = arith.addi %arg4, %arg5 : i32 | |
linalg.yield %7 : i32 | |
} -> tensor<512x256xi32> | |
flow.dispatch.tensor.store %6, %3, offsets = [0, 0], sizes = [512, 256], strides = [1, 1] : tensor<512x256xi32> -> !flow.dispatch.tensor<writeonly:512x256xi32> | |
return | |
} | |
} | |
} | |
func.func private @_split_reduction_pass2() { | |
%c1048576 = arith.constant 1048576 : index | |
%c0 = arith.constant 0 : index | |
%c524288 = arith.constant 524288 : index | |
%c67108864 = arith.constant 67108864 : index | |
%c1_i32 = arith.constant 1 : i32 | |
%c128_i32 = arith.constant 128 : i32 | |
%c512 = arith.constant 512 : index | |
%c256 = arith.constant 256 : index | |
%c1 = arith.constant 1 : index | |
// Input: a transient resource of size 67108864 (= 512*256*128*4) filled with the i32 value 1.
%0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c67108864} | |
%1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c67108864}) { | |
stream.cmd.fill %c1_i32, %arg0[%c0 for %c67108864] : i32 -> !stream.resource<transient>{%c67108864} | |
} => !stream.timepoint | |
%2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c67108864} | |
%3 = util.do_not_optimize(%2) : !stream.resource<transient> | |
%4 = stream.resource.size %3 : !stream.resource<transient> | |
// One external resource of size 1048576 holds both compared values: the two subviews below
// take [0, 524288) and [524288, 1048576).
%5 = stream.resource.alloc uninitialized : !stream.resource<external>{%c1048576} | |
%c0_0 = arith.constant 0 : index | |
%6 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%4}, %5 as %arg1: !stream.resource<external>{%c1048576}) { | |
stream.cmd.concurrent { | |
stream.cmd.dispatch @_split_reduction_pass2_dispatch_0::@_split_reduction_pass2_dispatch_0_generic_512x256x128[%c512, %c256, %c1](%c0, %c0 : index, index) { | |
ro %arg0[%c0_0 for %4] : !stream.resource<transient>{%4}, | |
// NOTE(review): the write-only range here spans the whole 1048576 resource, which includes the
// half written by the concurrent fill below; the dispatch function itself stores only a
// 512x256xi32 tensor at the subspan offset — confirm the overlap in declared ranges is benign.
wo %arg1[%c0_0 for %c1048576] : !stream.resource<external>{%c1048576} | |
} | |
// Second half of the resource is filled with the i32 value 128.
stream.cmd.fill %c128_i32, %arg1[%c524288 for %c524288] : i32 -> !stream.resource<external>{%c1048576} | |
} | |
} => !stream.timepoint | |
%7 = stream.timepoint.await %6 => %5 : !stream.resource<external>{%c1048576} | |
%8 = stream.resource.subview %7[%c0] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288} | |
%9 = stream.resource.subview %7[%c524288] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288} | |
%10 = stream.tensor.export %8 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32> | |
%11 = stream.tensor.export %9 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32> | |
// Compares the dispatch output (%10) against the region filled with 128 (%11).
check.expect_eq(%10, %11) : tensor<512x256xi32> | |
return | |
} | |
} | |
// -----// IR Dump After PackDispatchOperands (iree-stream-pack-dispatch-operands) ('builtin.module' operation) //----- // | |
#map0 = affine_map<(d0, d1, d2) -> (d0, d1, d2)> | |
#map1 = affine_map<(d0, d1, d2) -> (d0, d1)> | |
// Snapshot taken after iree-stream-pack-dispatch-operands.
// In this dump the two scalar dispatch operands are typed i32: the dispatch
// function casts them back to index with arith.index_cast, and the host passes
// i32 zero constants (%c0_i32, %c0_i32_1) at the dispatch site.
module { | |
// Public entry point carrying the iree.abi.stub attribute; it only forwards to the private function.
func.func @split_reduction_pass2() attributes {iree.abi.stub} { | |
call @_split_reduction_pass2() : () -> () | |
return | |
} | |
stream.executable private @_split_reduction_pass2_dispatch_0 { | |
// The exported workgroup count is computed from the three export arguments.
stream.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2 | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @_split_reduction_pass2_dispatch_0_generic_512x256x128(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: i32, %arg3: i32) { | |
// The i32 operands are converted to index before being used as binding offsets.
%0 = arith.index_cast %arg2 : i32 to index | |
%1 = arith.index_cast %arg3 : i32 to index | |
%c0 = arith.constant 0 : index | |
%2 = arith.addi %c0, %0 : index | |
%3 = stream.binding.subspan %arg0[%2] : !stream.binding -> !flow.dispatch.tensor<readonly:512x256x128xi32> | |
%4 = arith.addi %c0, %1 : index | |
%5 = stream.binding.subspan %arg1[%4] : !stream.binding -> !flow.dispatch.tensor<writeonly:512x256xi32> | |
%6 = flow.dispatch.tensor.load %3, offsets = [0, 0, 0], sizes = [512, 256, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:512x256x128xi32> -> tensor<512x256x128xi32> | |
%7 = linalg.init_tensor [512, 256] : tensor<512x256xi32> | |
// Sum over the last (reduction) dimension d2: each input element is added to the output element.
// NOTE(review): outs(%7) is the raw linalg.init_tensor result and no fill of %7 is visible in
// this function — confirm the accumulator (%arg5) is really meant to start uninitialized.
%8 = linalg.generic {indexing_maps = [#map0, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%6 : tensor<512x256x128xi32>) outs(%7 : tensor<512x256xi32>) { | |
^bb0(%arg4: i32, %arg5: i32): | |
%9 = arith.addi %arg4, %arg5 : i32 | |
linalg.yield %9 : i32 | |
} -> tensor<512x256xi32> | |
flow.dispatch.tensor.store %8, %5, offsets = [0, 0], sizes = [512, 256], strides = [1, 1] : tensor<512x256xi32> -> !flow.dispatch.tensor<writeonly:512x256xi32> | |
return | |
} | |
} | |
} | |
func.func private @_split_reduction_pass2() { | |
%c1048576 = arith.constant 1048576 : index | |
%c0 = arith.constant 0 : index | |
%c524288 = arith.constant 524288 : index | |
%c67108864 = arith.constant 67108864 : index | |
%c1_i32 = arith.constant 1 : i32 | |
%c128_i32 = arith.constant 128 : i32 | |
%c512 = arith.constant 512 : index | |
%c256 = arith.constant 256 : index | |
%c1 = arith.constant 1 : index | |
// Input: a transient resource of size 67108864 (= 512*256*128*4) filled with the i32 value 1.
%0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c67108864} | |
%1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c67108864}) { | |
stream.cmd.fill %c1_i32, %arg0[%c0 for %c67108864] : i32 -> !stream.resource<transient>{%c67108864} | |
} => !stream.timepoint | |
%2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c67108864} | |
%3 = util.do_not_optimize(%2) : !stream.resource<transient> | |
%4 = stream.resource.size %3 : !stream.resource<transient> | |
// One external resource of size 1048576 holds both compared values (see the subviews below).
%5 = stream.resource.alloc uninitialized : !stream.resource<external>{%c1048576} | |
// Zero constants appear in duplicate at this point: %c0_0 repeats %c0, and %c0_i32_1 repeats %c0_i32.
%c0_0 = arith.constant 0 : index | |
%c0_i32 = arith.constant 0 : i32 | |
%c0_i32_1 = arith.constant 0 : i32 | |
%6 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%4}, %5 as %arg1: !stream.resource<external>{%c1048576}) { | |
stream.cmd.concurrent { | |
stream.cmd.dispatch @_split_reduction_pass2_dispatch_0::@_split_reduction_pass2_dispatch_0_generic_512x256x128[%c512, %c256, %c1](%c0_i32, %c0_i32_1 : i32, i32) { | |
ro %arg0[%c0_0 for %4] : !stream.resource<transient>{%4}, | |
// NOTE(review): the write-only range spans the whole 1048576 resource, including the half
// written by the concurrent fill below — confirm the overlap in declared ranges is benign.
wo %arg1[%c0_0 for %c1048576] : !stream.resource<external>{%c1048576} | |
} | |
// Second half of the resource is filled with the i32 value 128.
stream.cmd.fill %c128_i32, %arg1[%c524288 for %c524288] : i32 -> !stream.resource<external>{%c1048576} | |
} | |
} => !stream.timepoint | |
%7 = stream.timepoint.await %6 => %5 : !stream.resource<external>{%c1048576} | |
%8 = stream.resource.subview %7[%c0] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288} | |
%9 = stream.resource.subview %7[%c524288] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288} | |
%10 = stream.tensor.export %8 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32> | |
%11 = stream.tensor.export %9 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32> | |
// Compares the dispatch output (%10) against the region filled with 128 (%11).
check.expect_eq(%10, %11) : tensor<512x256xi32> | |
return | |
} | |
} | |
// -----// IR Dump After CSE (cse) ('builtin.module' operation) //----- // | |
#map0 = affine_map<(d0, d1, d2) -> (d0, d1, d2)> | |
#map1 = affine_map<(d0, d1, d2) -> (d0, d1)> | |
// Snapshot taken after cse.
// In this dump the host function holds a single i32 zero constant (%c0_i32)
// that is passed for both scalar dispatch operands, and the binding ranges use
// %c0 directly as their offset.
module { | |
// Public entry point carrying the iree.abi.stub attribute; it only forwards to the private function.
func.func @split_reduction_pass2() attributes {iree.abi.stub} { | |
call @_split_reduction_pass2() : () -> () | |
return | |
} | |
stream.executable private @_split_reduction_pass2_dispatch_0 { | |
// The exported workgroup count is computed from the three export arguments.
stream.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2 | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @_split_reduction_pass2_dispatch_0_generic_512x256x128(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: i32, %arg3: i32) { | |
// The i32 operands are converted to index before being used as binding offsets.
%0 = arith.index_cast %arg2 : i32 to index | |
%1 = arith.index_cast %arg3 : i32 to index | |
%c0 = arith.constant 0 : index | |
%2 = arith.addi %c0, %0 : index | |
%3 = stream.binding.subspan %arg0[%2] : !stream.binding -> !flow.dispatch.tensor<readonly:512x256x128xi32> | |
%4 = arith.addi %c0, %1 : index | |
%5 = stream.binding.subspan %arg1[%4] : !stream.binding -> !flow.dispatch.tensor<writeonly:512x256xi32> | |
%6 = flow.dispatch.tensor.load %3, offsets = [0, 0, 0], sizes = [512, 256, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:512x256x128xi32> -> tensor<512x256x128xi32> | |
%7 = linalg.init_tensor [512, 256] : tensor<512x256xi32> | |
// Sum over the last (reduction) dimension d2: each input element is added to the output element.
// NOTE(review): outs(%7) is the raw linalg.init_tensor result and no fill of %7 is visible in
// this function — confirm the accumulator (%arg5) is really meant to start uninitialized.
%8 = linalg.generic {indexing_maps = [#map0, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%6 : tensor<512x256x128xi32>) outs(%7 : tensor<512x256xi32>) { | |
^bb0(%arg4: i32, %arg5: i32): | |
%9 = arith.addi %arg4, %arg5 : i32 | |
linalg.yield %9 : i32 | |
} -> tensor<512x256xi32> | |
flow.dispatch.tensor.store %8, %5, offsets = [0, 0], sizes = [512, 256], strides = [1, 1] : tensor<512x256xi32> -> !flow.dispatch.tensor<writeonly:512x256xi32> | |
return | |
} | |
} | |
} | |
func.func private @_split_reduction_pass2() { | |
%c1048576 = arith.constant 1048576 : index | |
%c0 = arith.constant 0 : index | |
%c524288 = arith.constant 524288 : index | |
%c67108864 = arith.constant 67108864 : index | |
%c1_i32 = arith.constant 1 : i32 | |
%c128_i32 = arith.constant 128 : i32 | |
%c512 = arith.constant 512 : index | |
%c256 = arith.constant 256 : index | |
%c1 = arith.constant 1 : index | |
// Input: a transient resource of size 67108864 (= 512*256*128*4) filled with the i32 value 1.
%0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c67108864} | |
%1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c67108864}) { | |
stream.cmd.fill %c1_i32, %arg0[%c0 for %c67108864] : i32 -> !stream.resource<transient>{%c67108864} | |
} => !stream.timepoint | |
%2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c67108864} | |
%3 = util.do_not_optimize(%2) : !stream.resource<transient> | |
%4 = stream.resource.size %3 : !stream.resource<transient> | |
// One external resource of size 1048576 holds both compared values (see the subviews below).
%5 = stream.resource.alloc uninitialized : !stream.resource<external>{%c1048576} | |
%c0_i32 = arith.constant 0 : i32 | |
%6 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%4}, %5 as %arg1: !stream.resource<external>{%c1048576}) { | |
stream.cmd.concurrent { | |
// Both scalar operands are the same value, %c0_i32.
stream.cmd.dispatch @_split_reduction_pass2_dispatch_0::@_split_reduction_pass2_dispatch_0_generic_512x256x128[%c512, %c256, %c1](%c0_i32, %c0_i32 : i32, i32) { | |
ro %arg0[%c0 for %4] : !stream.resource<transient>{%4}, | |
// NOTE(review): the write-only range spans the whole 1048576 resource, including the half
// written by the concurrent fill below — confirm the overlap in declared ranges is benign.
wo %arg1[%c0 for %c1048576] : !stream.resource<external>{%c1048576} | |
} | |
// Second half of the resource is filled with the i32 value 128.
stream.cmd.fill %c128_i32, %arg1[%c524288 for %c524288] : i32 -> !stream.resource<external>{%c1048576} | |
} | |
} => !stream.timepoint | |
%7 = stream.timepoint.await %6 => %5 : !stream.resource<external>{%c1048576} | |
%8 = stream.resource.subview %7[%c0] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288} | |
%9 = stream.resource.subview %7[%c524288] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288} | |
%10 = stream.tensor.export %8 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32> | |
%11 = stream.tensor.export %9 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32> | |
// Compares the dispatch output (%10) against the region filled with 128 (%11).
check.expect_eq(%10, %11) : tensor<512x256xi32> | |
return | |
} | |
} | |
// -----// IR Dump After FoldUniformOperands (iree-stream-fold-uniform-operands) ('builtin.module' operation) //----- // | |
#map0 = affine_map<(d0, d1, d2) -> (d0, d1, d2)> | |
#map1 = affine_map<(d0, d1, d2) -> (d0, d1)> | |
// Snapshot taken after iree-stream-fold-uniform-operands.
// In this dump the dispatch function no longer takes scalar operands: it only
// has the two bindings and defines the i32 zero constant locally. The host
// dispatch site has no operand list, and the host-side %c0_i32 has no
// remaining use in this function.
module { | |
// Public entry point carrying the iree.abi.stub attribute; it only forwards to the private function.
func.func @split_reduction_pass2() attributes {iree.abi.stub} { | |
call @_split_reduction_pass2() : () -> () | |
return | |
} | |
stream.executable private @_split_reduction_pass2_dispatch_0 { | |
// The exported workgroup count is computed from the three export arguments.
stream.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2 | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @_split_reduction_pass2_dispatch_0_generic_512x256x128(%arg0: !stream.binding, %arg1: !stream.binding) { | |
// The binding offsets are now built from a local constant 0 (cast to index, then added to %c0).
%c0_i32 = arith.constant 0 : i32 | |
%0 = arith.index_cast %c0_i32 : i32 to index | |
%1 = arith.index_cast %c0_i32 : i32 to index | |
%c0 = arith.constant 0 : index | |
%2 = arith.addi %c0, %0 : index | |
%3 = stream.binding.subspan %arg0[%2] : !stream.binding -> !flow.dispatch.tensor<readonly:512x256x128xi32> | |
%4 = arith.addi %c0, %1 : index | |
%5 = stream.binding.subspan %arg1[%4] : !stream.binding -> !flow.dispatch.tensor<writeonly:512x256xi32> | |
%6 = flow.dispatch.tensor.load %3, offsets = [0, 0, 0], sizes = [512, 256, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:512x256x128xi32> -> tensor<512x256x128xi32> | |
%7 = linalg.init_tensor [512, 256] : tensor<512x256xi32> | |
// Sum over the last (reduction) dimension d2: each input element is added to the output element.
// NOTE(review): outs(%7) is the raw linalg.init_tensor result and no fill of %7 is visible in
// this function — confirm the accumulator (%arg3) is really meant to start uninitialized.
%8 = linalg.generic {indexing_maps = [#map0, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%6 : tensor<512x256x128xi32>) outs(%7 : tensor<512x256xi32>) { | |
^bb0(%arg2: i32, %arg3: i32): | |
%9 = arith.addi %arg2, %arg3 : i32 | |
linalg.yield %9 : i32 | |
} -> tensor<512x256xi32> | |
flow.dispatch.tensor.store %8, %5, offsets = [0, 0], sizes = [512, 256], strides = [1, 1] : tensor<512x256xi32> -> !flow.dispatch.tensor<writeonly:512x256xi32> | |
return | |
} | |
} | |
} | |
func.func private @_split_reduction_pass2() { | |
%c1048576 = arith.constant 1048576 : index | |
%c0 = arith.constant 0 : index | |
%c524288 = arith.constant 524288 : index | |
%c67108864 = arith.constant 67108864 : index | |
%c1_i32 = arith.constant 1 : i32 | |
%c128_i32 = arith.constant 128 : i32 | |
%c512 = arith.constant 512 : index | |
%c256 = arith.constant 256 : index | |
%c1 = arith.constant 1 : index | |
// Input: a transient resource of size 67108864 (= 512*256*128*4) filled with the i32 value 1.
%0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c67108864} | |
%1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c67108864}) { | |
stream.cmd.fill %c1_i32, %arg0[%c0 for %c67108864] : i32 -> !stream.resource<transient>{%c67108864} | |
} => !stream.timepoint | |
%2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c67108864} | |
%3 = util.do_not_optimize(%2) : !stream.resource<transient> | |
%4 = stream.resource.size %3 : !stream.resource<transient> | |
// One external resource of size 1048576 holds both compared values (see the subviews below).
%5 = stream.resource.alloc uninitialized : !stream.resource<external>{%c1048576} | |
// This constant is not referenced by any later op in this function.
%c0_i32 = arith.constant 0 : i32 | |
%6 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%4}, %5 as %arg1: !stream.resource<external>{%c1048576}) { | |
stream.cmd.concurrent { | |
stream.cmd.dispatch @_split_reduction_pass2_dispatch_0::@_split_reduction_pass2_dispatch_0_generic_512x256x128[%c512, %c256, %c1] { | |
ro %arg0[%c0 for %4] : !stream.resource<transient>{%4}, | |
// NOTE(review): the write-only range spans the whole 1048576 resource, including the half
// written by the concurrent fill below — confirm the overlap in declared ranges is benign.
wo %arg1[%c0 for %c1048576] : !stream.resource<external>{%c1048576} | |
} | |
// Second half of the resource is filled with the i32 value 128.
stream.cmd.fill %c128_i32, %arg1[%c524288 for %c524288] : i32 -> !stream.resource<external>{%c1048576} | |
} | |
} => !stream.timepoint | |
%7 = stream.timepoint.await %6 => %5 : !stream.resource<external>{%c1048576} | |
%8 = stream.resource.subview %7[%c0] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288} | |
%9 = stream.resource.subview %7[%c524288] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288} | |
%10 = stream.tensor.export %8 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32> | |
%11 = stream.tensor.export %9 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32> | |
// Compares the dispatch output (%10) against the region filled with 128 (%11).
check.expect_eq(%10, %11) : tensor<512x256xi32> | |
return | |
} | |
} | |
// -----// IR Dump After AnnotateDispatchArguments (iree-stream-annotate-dispatch-arguments) ('builtin.module' operation) //----- // | |
#map0 = affine_map<(d0, d1, d2) -> (d0, d1, d2)> | |
#map1 = affine_map<(d0, d1, d2) -> (d0, d1)> | |
// Snapshot taken after iree-stream-annotate-dispatch-arguments.
// In this dump both binding arguments of the dispatch function carry the
// attribute {stream.alignment = 64 : index}.
module { | |
// Public entry point carrying the iree.abi.stub attribute; it only forwards to the private function.
func.func @split_reduction_pass2() attributes {iree.abi.stub} { | |
call @_split_reduction_pass2() : () -> () | |
return | |
} | |
stream.executable private @_split_reduction_pass2_dispatch_0 { | |
// The exported workgroup count is computed from the three export arguments.
stream.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2 | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @_split_reduction_pass2_dispatch_0_generic_512x256x128(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}) { | |
// The binding offsets are built from a local constant 0 (cast to index, then added to %c0).
%c0_i32 = arith.constant 0 : i32 | |
%0 = arith.index_cast %c0_i32 : i32 to index | |
%1 = arith.index_cast %c0_i32 : i32 to index | |
%c0 = arith.constant 0 : index | |
%2 = arith.addi %c0, %0 : index | |
%3 = stream.binding.subspan %arg0[%2] : !stream.binding -> !flow.dispatch.tensor<readonly:512x256x128xi32> | |
%4 = arith.addi %c0, %1 : index | |
%5 = stream.binding.subspan %arg1[%4] : !stream.binding -> !flow.dispatch.tensor<writeonly:512x256xi32> | |
%6 = flow.dispatch.tensor.load %3, offsets = [0, 0, 0], sizes = [512, 256, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:512x256x128xi32> -> tensor<512x256x128xi32> | |
%7 = linalg.init_tensor [512, 256] : tensor<512x256xi32> | |
// Sum over the last (reduction) dimension d2: each input element is added to the output element.
// NOTE(review): outs(%7) is the raw linalg.init_tensor result and no fill of %7 is visible in
// this function — confirm the accumulator (%arg3) is really meant to start uninitialized.
%8 = linalg.generic {indexing_maps = [#map0, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%6 : tensor<512x256x128xi32>) outs(%7 : tensor<512x256xi32>) { | |
^bb0(%arg2: i32, %arg3: i32): | |
%9 = arith.addi %arg2, %arg3 : i32 | |
linalg.yield %9 : i32 | |
} -> tensor<512x256xi32> | |
flow.dispatch.tensor.store %8, %5, offsets = [0, 0], sizes = [512, 256], strides = [1, 1] : tensor<512x256xi32> -> !flow.dispatch.tensor<writeonly:512x256xi32> | |
return | |
} | |
} | |
} | |
func.func private @_split_reduction_pass2() { | |
%c1048576 = arith.constant 1048576 : index | |
%c0 = arith.constant 0 : index | |
%c524288 = arith.constant 524288 : index | |
%c67108864 = arith.constant 67108864 : index | |
%c1_i32 = arith.constant 1 : i32 | |
%c128_i32 = arith.constant 128 : i32 | |
%c512 = arith.constant 512 : index | |
%c256 = arith.constant 256 : index | |
%c1 = arith.constant 1 : index | |
// Input: a transient resource of size 67108864 (= 512*256*128*4) filled with the i32 value 1.
%0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c67108864} | |
%1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c67108864}) { | |
stream.cmd.fill %c1_i32, %arg0[%c0 for %c67108864] : i32 -> !stream.resource<transient>{%c67108864} | |
} => !stream.timepoint | |
%2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c67108864} | |
%3 = util.do_not_optimize(%2) : !stream.resource<transient> | |
%4 = stream.resource.size %3 : !stream.resource<transient> | |
// One external resource of size 1048576 holds both compared values (see the subviews below).
%5 = stream.resource.alloc uninitialized : !stream.resource<external>{%c1048576} | |
// This constant is not referenced by any later op in this function.
%c0_i32 = arith.constant 0 : i32 | |
%6 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%4}, %5 as %arg1: !stream.resource<external>{%c1048576}) { | |
stream.cmd.concurrent { | |
stream.cmd.dispatch @_split_reduction_pass2_dispatch_0::@_split_reduction_pass2_dispatch_0_generic_512x256x128[%c512, %c256, %c1] { | |
ro %arg0[%c0 for %4] : !stream.resource<transient>{%4}, | |
// NOTE(review): the write-only range spans the whole 1048576 resource, including the half
// written by the concurrent fill below — confirm the overlap in declared ranges is benign.
wo %arg1[%c0 for %c1048576] : !stream.resource<external>{%c1048576} | |
} | |
// Second half of the resource is filled with the i32 value 128.
stream.cmd.fill %c128_i32, %arg1[%c524288 for %c524288] : i32 -> !stream.resource<external>{%c1048576} | |
} | |
} => !stream.timepoint | |
%7 = stream.timepoint.await %6 => %5 : !stream.resource<external>{%c1048576} | |
%8 = stream.resource.subview %7[%c0] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288} | |
%9 = stream.resource.subview %7[%c524288] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288} | |
%10 = stream.tensor.export %8 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32> | |
%11 = stream.tensor.export %9 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32> | |
// Compares the dispatch output (%10) against the region filled with 128 (%11).
check.expect_eq(%10, %11) : tensor<512x256xi32> | |
return | |
} | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) ('func.func' operation: @_split_reduction_pass2) //----- // | |
#map0 = affine_map<(d0, d1, d2) -> (d0, d1, d2)> | |
#map1 = affine_map<(d0, d1, d2) -> (d0, d1)> | |
// Snapshot taken after canonicalize, which the dump header reports as run on
// func.func @_split_reduction_pass2 (the host function).
// In this dump the host function contains no i32 zero constant, while the
// dispatch function still builds its binding offsets through the
// index_cast/addi chain.
module { | |
// Public entry point carrying the iree.abi.stub attribute; it only forwards to the private function.
func.func @split_reduction_pass2() attributes {iree.abi.stub} { | |
call @_split_reduction_pass2() : () -> () | |
return | |
} | |
stream.executable private @_split_reduction_pass2_dispatch_0 { | |
// The exported workgroup count is computed from the three export arguments.
stream.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2 | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @_split_reduction_pass2_dispatch_0_generic_512x256x128(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}) { | |
// The binding offsets are built from a local constant 0 (cast to index, then added to %c0).
%c0_i32 = arith.constant 0 : i32 | |
%0 = arith.index_cast %c0_i32 : i32 to index | |
%1 = arith.index_cast %c0_i32 : i32 to index | |
%c0 = arith.constant 0 : index | |
%2 = arith.addi %c0, %0 : index | |
%3 = stream.binding.subspan %arg0[%2] : !stream.binding -> !flow.dispatch.tensor<readonly:512x256x128xi32> | |
%4 = arith.addi %c0, %1 : index | |
%5 = stream.binding.subspan %arg1[%4] : !stream.binding -> !flow.dispatch.tensor<writeonly:512x256xi32> | |
%6 = flow.dispatch.tensor.load %3, offsets = [0, 0, 0], sizes = [512, 256, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:512x256x128xi32> -> tensor<512x256x128xi32> | |
%7 = linalg.init_tensor [512, 256] : tensor<512x256xi32> | |
// Sum over the last (reduction) dimension d2: each input element is added to the output element.
// NOTE(review): outs(%7) is the raw linalg.init_tensor result and no fill of %7 is visible in
// this function — confirm the accumulator (%arg3) is really meant to start uninitialized.
%8 = linalg.generic {indexing_maps = [#map0, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%6 : tensor<512x256x128xi32>) outs(%7 : tensor<512x256xi32>) { | |
^bb0(%arg2: i32, %arg3: i32): | |
%9 = arith.addi %arg2, %arg3 : i32 | |
linalg.yield %9 : i32 | |
} -> tensor<512x256xi32> | |
flow.dispatch.tensor.store %8, %5, offsets = [0, 0], sizes = [512, 256], strides = [1, 1] : tensor<512x256xi32> -> !flow.dispatch.tensor<writeonly:512x256xi32> | |
return | |
} | |
} | |
} | |
func.func private @_split_reduction_pass2() { | |
%c1048576 = arith.constant 1048576 : index | |
%c0 = arith.constant 0 : index | |
%c524288 = arith.constant 524288 : index | |
%c67108864 = arith.constant 67108864 : index | |
%c1_i32 = arith.constant 1 : i32 | |
%c128_i32 = arith.constant 128 : i32 | |
%c512 = arith.constant 512 : index | |
%c256 = arith.constant 256 : index | |
%c1 = arith.constant 1 : index | |
// Input: a transient resource of size 67108864 (= 512*256*128*4) filled with the i32 value 1.
%0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c67108864} | |
%1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c67108864}) { | |
stream.cmd.fill %c1_i32, %arg0[%c0 for %c67108864] : i32 -> !stream.resource<transient>{%c67108864} | |
} => !stream.timepoint | |
%2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c67108864} | |
%3 = util.do_not_optimize(%2) : !stream.resource<transient> | |
%4 = stream.resource.size %3 : !stream.resource<transient> | |
// One external resource of size 1048576 holds both compared values (see the subviews below).
%5 = stream.resource.alloc uninitialized : !stream.resource<external>{%c1048576} | |
%6 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%4}, %5 as %arg1: !stream.resource<external>{%c1048576}) { | |
stream.cmd.concurrent { | |
stream.cmd.dispatch @_split_reduction_pass2_dispatch_0::@_split_reduction_pass2_dispatch_0_generic_512x256x128[%c512, %c256, %c1] { | |
ro %arg0[%c0 for %4] : !stream.resource<transient>{%4}, | |
// NOTE(review): the write-only range spans the whole 1048576 resource, including the half
// written by the concurrent fill below — confirm the overlap in declared ranges is benign.
wo %arg1[%c0 for %c1048576] : !stream.resource<external>{%c1048576} | |
} | |
// Second half of the resource is filled with the i32 value 128.
stream.cmd.fill %c128_i32, %arg1[%c524288 for %c524288] : i32 -> !stream.resource<external>{%c1048576} | |
} | |
} => !stream.timepoint | |
%7 = stream.timepoint.await %6 => %5 : !stream.resource<external>{%c1048576} | |
%8 = stream.resource.subview %7[%c0] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288} | |
%9 = stream.resource.subview %7[%c524288] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288} | |
%10 = stream.tensor.export %8 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32> | |
%11 = stream.tensor.export %9 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32> | |
// Compares the dispatch output (%10) against the region filled with 128 (%11).
check.expect_eq(%10, %11) : tensor<512x256xi32> | |
return | |
} | |
} | |
// -----// IR Dump After ApplyPatterns (iree-util-apply-patterns) ('builtin.module' operation) //----- // | |
#map0 = affine_map<(d0, d1, d2) -> (d0, d1, d2)> | |
#map1 = affine_map<(d0, d1, d2) -> (d0, d1)> | |
// Snapshot taken after iree-util-apply-patterns.
// In this dump the dispatch function uses the index constant %c0 directly as
// the offset of both binding subspans; no index_cast or addi ops remain.
module { | |
// Public entry point carrying the iree.abi.stub attribute; it only forwards to the private function.
func.func @split_reduction_pass2() attributes {iree.abi.stub} { | |
call @_split_reduction_pass2() : () -> () | |
return | |
} | |
stream.executable private @_split_reduction_pass2_dispatch_0 { | |
// The exported workgroup count is computed from the three export arguments.
stream.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2 | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @_split_reduction_pass2_dispatch_0_generic_512x256x128(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}) { | |
%c0 = arith.constant 0 : index | |
// Both bindings are viewed at offset 0: a read-only 512x256x128 input and a write-only 512x256 output.
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:512x256x128xi32> | |
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:512x256xi32> | |
%2 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [512, 256, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:512x256x128xi32> -> tensor<512x256x128xi32> | |
%3 = linalg.init_tensor [512, 256] : tensor<512x256xi32> | |
// Sum over the last (reduction) dimension d2: each input element is added to the output element.
// NOTE(review): outs(%3) is the raw linalg.init_tensor result and no fill of %3 is visible in
// this function — confirm the accumulator (%arg3) is really meant to start uninitialized.
%4 = linalg.generic {indexing_maps = [#map0, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%2 : tensor<512x256x128xi32>) outs(%3 : tensor<512x256xi32>) { | |
^bb0(%arg2: i32, %arg3: i32): | |
%5 = arith.addi %arg2, %arg3 : i32 | |
linalg.yield %5 : i32 | |
} -> tensor<512x256xi32> | |
flow.dispatch.tensor.store %4, %1, offsets = [0, 0], sizes = [512, 256], strides = [1, 1] : tensor<512x256xi32> -> !flow.dispatch.tensor<writeonly:512x256xi32> | |
return | |
} | |
} | |
} | |
func.func private @_split_reduction_pass2() { | |
%c1048576 = arith.constant 1048576 : index | |
%c0 = arith.constant 0 : index | |
%c524288 = arith.constant 524288 : index | |
%c67108864 = arith.constant 67108864 : index | |
%c1_i32 = arith.constant 1 : i32 | |
%c128_i32 = arith.constant 128 : i32 | |
%c512 = arith.constant 512 : index | |
%c256 = arith.constant 256 : index | |
%c1 = arith.constant 1 : index | |
// Input: a transient resource of size 67108864 (= 512*256*128*4) filled with the i32 value 1.
%0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c67108864} | |
%1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c67108864}) { | |
stream.cmd.fill %c1_i32, %arg0[%c0 for %c67108864] : i32 -> !stream.resource<transient>{%c67108864} | |
} => !stream.timepoint | |
%2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c67108864} | |
%3 = util.do_not_optimize(%2) : !stream.resource<transient> | |
%4 = stream.resource.size %3 : !stream.resource<transient> | |
// One external resource of size 1048576 holds both compared values (see the subviews below).
%5 = stream.resource.alloc uninitialized : !stream.resource<external>{%c1048576} | |
%6 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%4}, %5 as %arg1: !stream.resource<external>{%c1048576}) { | |
stream.cmd.concurrent { | |
stream.cmd.dispatch @_split_reduction_pass2_dispatch_0::@_split_reduction_pass2_dispatch_0_generic_512x256x128[%c512, %c256, %c1] { | |
ro %arg0[%c0 for %4] : !stream.resource<transient>{%4}, | |
// NOTE(review): the write-only range spans the whole 1048576 resource, including the half
// written by the concurrent fill below — confirm the overlap in declared ranges is benign.
wo %arg1[%c0 for %c1048576] : !stream.resource<external>{%c1048576} | |
} | |
// Second half of the resource is filled with the i32 value 128.
stream.cmd.fill %c128_i32, %arg1[%c524288 for %c524288] : i32 -> !stream.resource<external>{%c1048576} | |
} | |
} => !stream.timepoint | |
%7 = stream.timepoint.await %6 => %5 : !stream.resource<external>{%c1048576} | |
%8 = stream.resource.subview %7[%c0] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288} | |
%9 = stream.resource.subview %7[%c524288] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288} | |
%10 = stream.tensor.export %8 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32> | |
%11 = stream.tensor.export %9 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32> | |
// Compares the dispatch output (%10) against the region filled with 128 (%11).
check.expect_eq(%10, %11) : tensor<512x256xi32> | |
return | |
} | |
} | |
// -----// IR Dump After mlir::iree_compiler::IREE::HAL::AssignTargetDevicesPass (iree-hal-assign-target-devices) ('builtin.module' operation) //----- // | |
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}> | |
#map0 = affine_map<(d0, d1, d2) -> (d0, d1, d2)> | |
#map1 = affine_map<(d0, d1, d2) -> (d0, d1)> | |
module attributes {hal.device.targets = [#device_target_llvm_cpu]} { | |
func.func @split_reduction_pass2() attributes {iree.abi.stub} { | |
call @_split_reduction_pass2() : () -> () | |
return | |
} | |
stream.executable private @_split_reduction_pass2_dispatch_0 { | |
stream.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2 | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @_split_reduction_pass2_dispatch_0_generic_512x256x128(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}) { | |
%c0 = arith.constant 0 : index | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:512x256x128xi32> | |
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:512x256xi32> | |
%2 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [512, 256, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:512x256x128xi32> -> tensor<512x256x128xi32> | |
%3 = linalg.init_tensor [512, 256] : tensor<512x256xi32> | |
%4 = linalg.generic {indexing_maps = [#map0, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%2 : tensor<512x256x128xi32>) outs(%3 : tensor<512x256xi32>) { | |
^bb0(%arg2: i32, %arg3: i32): | |
%5 = arith.addi %arg2, %arg3 : i32 | |
linalg.yield %5 : i32 | |
} -> tensor<512x256xi32> | |
flow.dispatch.tensor.store %4, %1, offsets = [0, 0], sizes = [512, 256], strides = [1, 1] : tensor<512x256xi32> -> !flow.dispatch.tensor<writeonly:512x256xi32> | |
return | |
} | |
} | |
} | |
func.func private @_split_reduction_pass2() { | |
%c1048576 = arith.constant 1048576 : index | |
%c0 = arith.constant 0 : index | |
%c524288 = arith.constant 524288 : index | |
%c67108864 = arith.constant 67108864 : index | |
%c1_i32 = arith.constant 1 : i32 | |
%c128_i32 = arith.constant 128 : i32 | |
%c512 = arith.constant 512 : index | |
%c256 = arith.constant 256 : index | |
%c1 = arith.constant 1 : index | |
%0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c67108864} | |
%1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c67108864}) { | |
stream.cmd.fill %c1_i32, %arg0[%c0 for %c67108864] : i32 -> !stream.resource<transient>{%c67108864} | |
} => !stream.timepoint | |
%2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c67108864} | |
%3 = util.do_not_optimize(%2) : !stream.resource<transient> | |
%4 = stream.resource.size %3 : !stream.resource<transient> | |
%5 = stream.resource.alloc uninitialized : !stream.resource<external>{%c1048576} | |
%6 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%4}, %5 as %arg1: !stream.resource<external>{%c1048576}) { | |
stream.cmd.concurrent { | |
stream.cmd.dispatch @_split_reduction_pass2_dispatch_0::@_split_reduction_pass2_dispatch_0_generic_512x256x128[%c512, %c256, %c1] { | |
ro %arg0[%c0 for %4] : !stream.resource<transient>{%4}, | |
wo %arg1[%c0 for %c1048576] : !stream.resource<external>{%c1048576} | |
} | |
stream.cmd.fill %c128_i32, %arg1[%c524288 for %c524288] : i32 -> !stream.resource<external>{%c1048576} | |
} | |
} => !stream.timepoint | |
%7 = stream.timepoint.await %6 => %5 : !stream.resource<external>{%c1048576} | |
%8 = stream.resource.subview %7[%c0] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288} | |
%9 = stream.resource.subview %7[%c524288] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288} | |
%10 = stream.tensor.export %8 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32> | |
%11 = stream.tensor.export %9 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32> | |
check.expect_eq(%10, %11) : tensor<512x256xi32> | |
return | |
} | |
} | |
// -----// IR Dump After mlir::iree_compiler::IREE::HAL::(anonymous namespace)::MaterializeInterfacesPass (iree-hal-materialize-interfaces) ('builtin.module' operation) //----- // | |
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}> | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#map0 = affine_map<(d0, d1, d2) -> (d0, d1, d2)> | |
#map1 = affine_map<(d0, d1, d2) -> (d0, d1)> | |
#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]> | |
module attributes {hal.device.targets = [#device_target_llvm_cpu]} { | |
func.func @split_reduction_pass2() attributes {iree.abi.stub} { | |
call @_split_reduction_pass2() : () -> () | |
return | |
} | |
hal.executable private @_split_reduction_pass2_dispatch_0 { | |
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ { | |
hal.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 ordinal(0) layout(#pipeline_layout) { | |
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index): | |
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg2, %arg3 | |
hal.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @_split_reduction_pass2_dispatch_0_generic_512x256x128() { | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:512x256x128xi32> | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:512x256xi32> | |
%2 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [512, 256, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:512x256x128xi32> -> tensor<512x256x128xi32> | |
%3 = linalg.init_tensor [512, 256] : tensor<512x256xi32> | |
%4 = linalg.generic {indexing_maps = [#map0, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%2 : tensor<512x256x128xi32>) outs(%3 : tensor<512x256xi32>) { | |
^bb0(%arg0: i32, %arg1: i32): | |
%5 = arith.addi %arg0, %arg1 : i32 | |
linalg.yield %5 : i32 | |
} -> tensor<512x256xi32> | |
flow.dispatch.tensor.store %4, %1, offsets = [0, 0], sizes = [512, 256], strides = [1, 1] : tensor<512x256xi32> -> !flow.dispatch.tensor<writeonly:512x256xi32> | |
return | |
} | |
} | |
} | |
} | |
func.func private @_split_reduction_pass2() { | |
%c1048576 = arith.constant 1048576 : index | |
%c0 = arith.constant 0 : index | |
%c524288 = arith.constant 524288 : index | |
%c67108864 = arith.constant 67108864 : index | |
%c1_i32 = arith.constant 1 : i32 | |
%c128_i32 = arith.constant 128 : i32 | |
%c512 = arith.constant 512 : index | |
%c256 = arith.constant 256 : index | |
%c1 = arith.constant 1 : index | |
%0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c67108864} | |
%1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c67108864}) { | |
stream.cmd.fill %c1_i32, %arg0[%c0 for %c67108864] : i32 -> !stream.resource<transient>{%c67108864} | |
} => !stream.timepoint | |
%2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c67108864} | |
%3 = util.do_not_optimize(%2) : !stream.resource<transient> | |
%4 = stream.resource.size %3 : !stream.resource<transient> | |
%5 = stream.resource.alloc uninitialized : !stream.resource<external>{%c1048576} | |
%6 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%4}, %5 as %arg1: !stream.resource<external>{%c1048576}) { | |
stream.cmd.concurrent { | |
stream.cmd.dispatch @_split_reduction_pass2_dispatch_0::@_split_reduction_pass2_dispatch_0_generic_512x256x128[%c512, %c256, %c1] { | |
ro %arg0[%c0 for %4] : !stream.resource<transient>{%4}, | |
wo %arg1[%c0 for %c1048576] : !stream.resource<external>{%c1048576} | |
} attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>]} | |
stream.cmd.fill %c128_i32, %arg1[%c524288 for %c524288] : i32 -> !stream.resource<external>{%c1048576} | |
} | |
} => !stream.timepoint | |
%7 = stream.timepoint.await %6 => %5 : !stream.resource<external>{%c1048576} | |
%8 = stream.resource.subview %7[%c0] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288} | |
%9 = stream.resource.subview %7[%c524288] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288} | |
%10 = stream.tensor.export %8 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32> | |
%11 = stream.tensor.export %9 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32> | |
check.expect_eq(%10, %11) : tensor<512x256xi32> | |
return | |
} | |
} | |
// -----// IR Dump After TileAndDistributeToWorkgroups (iree-codegen-tile-and-distribute-to-workgroups) ('hal.executable.variant' operation: @embedded_elf_x86_64) //----- // | |
#config = #iree_codegen.lowering_config<tile_sizes = [[32, 32, 0], [1, 4, 0], [0, 0, 4]]> | |
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}> | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#map0 = affine_map<()[s0] -> (s0 * 32)> | |
#map1 = affine_map<(d0, d1, d2) -> (d0, d1, d2)> | |
#map2 = affine_map<(d0, d1, d2) -> (d0, d1)> | |
#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]> | |
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert> | |
module attributes {hal.device.targets = [#device_target_llvm_cpu]} { | |
func.func @split_reduction_pass2() attributes {iree.abi.stub} { | |
call @_split_reduction_pass2() : () -> () | |
return | |
} | |
hal.executable private @_split_reduction_pass2_dispatch_0 { | |
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ { | |
hal.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 ordinal(0) layout(#pipeline_layout) attributes {translation_info = #translation} { | |
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index): | |
%c1 = arith.constant 1 : index | |
%c16 = arith.constant 16 : index | |
%c8 = arith.constant 8 : index | |
hal.return %c8, %c16, %c1 : index, index, index | |
} | |
builtin.module { | |
func.func @_split_reduction_pass2_dispatch_0_generic_512x256x128() { | |
%c32 = arith.constant 32 : index | |
%c512 = arith.constant 512 : index | |
%c256 = arith.constant 256 : index | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:512x256x128xi32> | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:512x256xi32> | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_count_x = hal.interface.workgroup.count[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%workgroup_count_y = hal.interface.workgroup.count[1] : index | |
%2 = affine.apply #map0()[%workgroup_id_y] | |
%3 = affine.apply #map0()[%workgroup_count_y] | |
scf.for %arg0 = %2 to %c512 step %3 { | |
%4 = affine.apply #map0()[%workgroup_id_x] | |
%5 = affine.apply #map0()[%workgroup_count_x] | |
scf.for %arg1 = %4 to %c256 step %5 { | |
%6 = flow.dispatch.tensor.load %0, offsets = [%arg0, %arg1, 0], sizes = [%c32, %c32, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:512x256x128xi32> -> tensor<?x?x128xi32> | |
%7 = linalg.init_tensor [32, 32] : tensor<32x32xi32> | |
%8 = tensor.cast %6 : tensor<?x?x128xi32> to tensor<32x32x128xi32> | |
%9 = linalg.generic {indexing_maps = [#map1, #map2], iterator_types = ["parallel", "parallel", "reduction"]} ins(%8 : tensor<32x32x128xi32>) outs(%7 : tensor<32x32xi32>) attrs = {lowering_config = #config} { | |
^bb0(%arg2: i32, %arg3: i32): | |
%11 = arith.addi %arg2, %arg3 : i32 | |
linalg.yield %11 : i32 | |
} -> tensor<32x32xi32> | |
%10 = tensor.cast %9 : tensor<32x32xi32> to tensor<?x?xi32> | |
flow.dispatch.tensor.store %10, %1, offsets = [%arg0, %arg1], sizes = [%c32, %c32], strides = [1, 1] : tensor<?x?xi32> -> !flow.dispatch.tensor<writeonly:512x256xi32> | |
} | |
} | |
return | |
} | |
} | |
} | |
} | |
func.func private @_split_reduction_pass2() { | |
%c1048576 = arith.constant 1048576 : index | |
%c0 = arith.constant 0 : index | |
%c524288 = arith.constant 524288 : index | |
%c67108864 = arith.constant 67108864 : index | |
%c1_i32 = arith.constant 1 : i32 | |
%c128_i32 = arith.constant 128 : i32 | |
%c512 = arith.constant 512 : index | |
%c256 = arith.constant 256 : index | |
%c1 = arith.constant 1 : index | |
%0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c67108864} | |
%1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c67108864}) { | |
stream.cmd.fill %c1_i32, %arg0[%c0 for %c67108864] : i32 -> !stream.resource<transient>{%c67108864} | |
} => !stream.timepoint | |
%2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c67108864} | |
%3 = util.do_not_optimize(%2) : !stream.resource<transient> | |
%4 = stream.resource.size %3 : !stream.resource<transient> | |
%5 = stream.resource.alloc uninitialized : !stream.resource<external>{%c1048576} | |
%6 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%4}, %5 as %arg1: !stream.resource<external>{%c1048576}) { | |
stream.cmd.concurrent { | |
stream.cmd.dispatch @_split_reduction_pass2_dispatch_0::@_split_reduction_pass2_dispatch_0_generic_512x256x128[%c512, %c256, %c1] { | |
ro %arg0[%c0 for %4] : !stream.resource<transient>{%4}, | |
wo %arg1[%c0 for %c1048576] : !stream.resource<external>{%c1048576} | |
} attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>]} | |
stream.cmd.fill %c128_i32, %arg1[%c524288 for %c524288] : i32 -> !stream.resource<external>{%c1048576} | |
} | |
} => !stream.timepoint | |
%7 = stream.timepoint.await %6 => %5 : !stream.resource<external>{%c1048576} | |
%8 = stream.resource.subview %7[%c0] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288} | |
%9 = stream.resource.subview %7[%c524288] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288} | |
%10 = stream.tensor.export %8 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32> | |
%11 = stream.tensor.export %9 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32> | |
check.expect_eq(%10, %11) : tensor<512x256xi32> | |
return | |
} | |
} | |
// -----// IR Dump After ConvertToDestinationPassingStyle (iree-codegen-convert-to-destination-passing-style) ('func.func' operation: @_split_reduction_pass2_dispatch_0_generic_512x256x128) //----- // | |
#config = #iree_codegen.lowering_config<tile_sizes = [[32, 32, 0], [1, 4, 0], [0, 0, 4]]> | |
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}> | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#map0 = affine_map<()[s0] -> (s0 * 32)> | |
#map1 = affine_map<(d0, d1, d2) -> (d0, d1, d2)> | |
#map2 = affine_map<(d0, d1, d2) -> (d0, d1)> | |
#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]> | |
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert> | |
module attributes {hal.device.targets = [#device_target_llvm_cpu]} { | |
func.func @split_reduction_pass2() attributes {iree.abi.stub} { | |
call @_split_reduction_pass2() : () -> () | |
return | |
} | |
hal.executable private @_split_reduction_pass2_dispatch_0 { | |
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ { | |
hal.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 ordinal(0) layout(#pipeline_layout) attributes {translation_info = #translation} { | |
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index): | |
%c1 = arith.constant 1 : index | |
%c16 = arith.constant 16 : index | |
%c8 = arith.constant 8 : index | |
hal.return %c8, %c16, %c1 : index, index, index | |
} | |
builtin.module { | |
func.func @_split_reduction_pass2_dispatch_0_generic_512x256x128() { | |
%c32 = arith.constant 32 : index | |
%c512 = arith.constant 512 : index | |
%c256 = arith.constant 256 : index | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:512x256x128xi32> | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:512x256xi32> | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_count_x = hal.interface.workgroup.count[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%workgroup_count_y = hal.interface.workgroup.count[1] : index | |
%2 = affine.apply #map0()[%workgroup_id_y] | |
%3 = affine.apply #map0()[%workgroup_count_y] | |
scf.for %arg0 = %2 to %c512 step %3 { | |
%4 = affine.apply #map0()[%workgroup_id_x] | |
%5 = affine.apply #map0()[%workgroup_count_x] | |
scf.for %arg1 = %4 to %c256 step %5 { | |
%c32_0 = arith.constant 32 : index | |
%6 = flow.dispatch.tensor.load %1, offsets = [%arg0, %arg1], sizes = [%c32_0, %c32_0], strides = [1, 1] : !flow.dispatch.tensor<writeonly:512x256xi32> -> tensor<?x?xi32> | |
%7 = tensor.cast %6 : tensor<?x?xi32> to tensor<32x32xi32> | |
%8 = flow.dispatch.tensor.load %0, offsets = [%arg0, %arg1, 0], sizes = [%c32, %c32, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:512x256x128xi32> -> tensor<?x?x128xi32> | |
%9 = linalg.init_tensor [32, 32] : tensor<32x32xi32> | |
%10 = tensor.cast %8 : tensor<?x?x128xi32> to tensor<32x32x128xi32> | |
%11 = linalg.generic {indexing_maps = [#map1, #map2], iterator_types = ["parallel", "parallel", "reduction"]} ins(%10 : tensor<32x32x128xi32>) outs(%7 : tensor<32x32xi32>) attrs = {lowering_config = #config} { | |
^bb0(%arg2: i32, %arg3: i32): | |
%13 = arith.addi %arg2, %arg3 : i32 | |
linalg.yield %13 : i32 | |
} -> tensor<32x32xi32> | |
%12 = tensor.cast %11 : tensor<32x32xi32> to tensor<?x?xi32> | |
flow.dispatch.tensor.store %12, %1, offsets = [%arg0, %arg1], sizes = [%c32, %c32], strides = [1, 1] : tensor<?x?xi32> -> !flow.dispatch.tensor<writeonly:512x256xi32> | |
} | |
} | |
return | |
} | |
} | |
} | |
} | |
func.func private @_split_reduction_pass2() { | |
%c1048576 = arith.constant 1048576 : index | |
%c0 = arith.constant 0 : index | |
%c524288 = arith.constant 524288 : index | |
%c67108864 = arith.constant 67108864 : index | |
%c1_i32 = arith.constant 1 : i32 | |
%c128_i32 = arith.constant 128 : i32 | |
%c512 = arith.constant 512 : index | |
%c256 = arith.constant 256 : index | |
%c1 = arith.constant 1 : index | |
%0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c67108864} | |
%1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c67108864}) { | |
stream.cmd.fill %c1_i32, %arg0[%c0 for %c67108864] : i32 -> !stream.resource<transient>{%c67108864} | |
} => !stream.timepoint | |
%2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c67108864} | |
%3 = util.do_not_optimize(%2) : !stream.resource<transient> | |
%4 = stream.resource.size %3 : !stream.resource<transient> | |
%5 = stream.resource.alloc uninitialized : !stream.resource<external>{%c1048576} | |
%6 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%4}, %5 as %arg1: !stream.resource<external>{%c1048576}) { | |
stream.cmd.concurrent { | |
stream.cmd.dispatch @_split_reduction_pass2_dispatch_0::@_split_reduction_pass2_dispatch_0_generic_512x256x128[%c512, %c256, %c1] { | |
ro %arg0[%c0 for %4] : !stream.resource<transient>{%4}, | |
wo %arg1[%c0 for %c1048576] : !stream.resource<external>{%c1048576} | |
} attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>]} | |
stream.cmd.fill %c128_i32, %arg1[%c524288 for %c524288] : i32 -> !stream.resource<external>{%c1048576} | |
} | |
} => !stream.timepoint | |
%7 = stream.timepoint.await %6 => %5 : !stream.resource<external>{%c1048576} | |
%8 = stream.resource.subview %7[%c0] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288} | |
%9 = stream.resource.subview %7[%c524288] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288} | |
%10 = stream.tensor.export %8 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32> | |
%11 = stream.tensor.export %9 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32> | |
check.expect_eq(%10, %11) : tensor<512x256xi32> | |
return | |
} | |
} | |
// -----// IR Dump After FoldAffineMinInDistributedLoops (iree-codegen-fold-affinemin-in-distributed-loops) ('func.func' operation: @_split_reduction_pass2_dispatch_0_generic_512x256x128) //----- // | |
#config = #iree_codegen.lowering_config<tile_sizes = [[32, 32, 0], [1, 4, 0], [0, 0, 4]]> | |
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}> | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#map0 = affine_map<()[s0] -> (s0 * 32)> | |
#map1 = affine_map<(d0, d1, d2) -> (d0, d1, d2)> | |
#map2 = affine_map<(d0, d1, d2) -> (d0, d1)> | |
#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]> | |
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert> | |
module attributes {hal.device.targets = [#device_target_llvm_cpu]} { | |
func.func @split_reduction_pass2() attributes {iree.abi.stub} { | |
call @_split_reduction_pass2() : () -> () | |
return | |
} | |
hal.executable private @_split_reduction_pass2_dispatch_0 { | |
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ { | |
hal.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 ordinal(0) layout(#pipeline_layout) attributes {translation_info = #translation} { | |
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index): | |
%c1 = arith.constant 1 : index | |
%c16 = arith.constant 16 : index | |
%c8 = arith.constant 8 : index | |
hal.return %c8, %c16, %c1 : index, index, index | |
} | |
builtin.module { | |
func.func @_split_reduction_pass2_dispatch_0_generic_512x256x128() { | |
%c32 = arith.constant 32 : index | |
%c512 = arith.constant 512 : index | |
%c256 = arith.constant 256 : index | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:512x256x128xi32> | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:512x256xi32> | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_count_x = hal.interface.workgroup.count[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%workgroup_count_y = hal.interface.workgroup.count[1] : index | |
%2 = affine.apply #map0()[%workgroup_id_y] | |
%3 = affine.apply #map0()[%workgroup_count_y] | |
scf.for %arg0 = %2 to %c512 step %3 { | |
%4 = affine.apply #map0()[%workgroup_id_x] | |
%5 = affine.apply #map0()[%workgroup_count_x] | |
scf.for %arg1 = %4 to %c256 step %5 { | |
%6 = flow.dispatch.tensor.load %1, offsets = [%arg0, %arg1], sizes = [%c32, %c32], strides = [1, 1] : !flow.dispatch.tensor<writeonly:512x256xi32> -> tensor<?x?xi32> | |
%7 = tensor.cast %6 : tensor<?x?xi32> to tensor<32x32xi32> | |
%8 = flow.dispatch.tensor.load %0, offsets = [%arg0, %arg1, 0], sizes = [%c32, %c32, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:512x256x128xi32> -> tensor<?x?x128xi32> | |
%9 = tensor.cast %8 : tensor<?x?x128xi32> to tensor<32x32x128xi32> | |
%10 = linalg.generic {indexing_maps = [#map1, #map2], iterator_types = ["parallel", "parallel", "reduction"]} ins(%9 : tensor<32x32x128xi32>) outs(%7 : tensor<32x32xi32>) attrs = {lowering_config = #config} { | |
^bb0(%arg2: i32, %arg3: i32): | |
%12 = arith.addi %arg2, %arg3 : i32 | |
linalg.yield %12 : i32 | |
} -> tensor<32x32xi32> | |
%11 = tensor.cast %10 : tensor<32x32xi32> to tensor<?x?xi32> | |
flow.dispatch.tensor.store %11, %1, offsets = [%arg0, %arg1], sizes = [%c32, %c32], strides = [1, 1] : tensor<?x?xi32> -> !flow.dispatch.tensor<writeonly:512x256xi32> | |
} | |
} | |
return | |
} | |
} | |
} | |
} | |
func.func private @_split_reduction_pass2() { | |
%c1048576 = arith.constant 1048576 : index | |
%c0 = arith.constant 0 : index | |
%c524288 = arith.constant 524288 : index | |
%c67108864 = arith.constant 67108864 : index | |
%c1_i32 = arith.constant 1 : i32 | |
%c128_i32 = arith.constant 128 : i32 | |
%c512 = arith.constant 512 : index | |
%c256 = arith.constant 256 : index | |
%c1 = arith.constant 1 : index | |
%0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c67108864} | |
%1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c67108864}) { | |
stream.cmd.fill %c1_i32, %arg0[%c0 for %c67108864] : i32 -> !stream.resource<transient>{%c67108864} | |
} => !stream.timepoint | |
%2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c67108864} | |
%3 = util.do_not_optimize(%2) : !stream.resource<transient> | |
%4 = stream.resource.size %3 : !stream.resource<transient> | |
%5 = stream.resource.alloc uninitialized : !stream.resource<external>{%c1048576} | |
%6 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%4}, %5 as %arg1: !stream.resource<external>{%c1048576}) { | |
stream.cmd.concurrent { | |
stream.cmd.dispatch @_split_reduction_pass2_dispatch_0::@_split_reduction_pass2_dispatch_0_generic_512x256x128[%c512, %c256, %c1] { | |
ro %arg0[%c0 for %4] : !stream.resource<transient>{%4}, | |
wo %arg1[%c0 for %c1048576] : !stream.resource<external>{%c1048576} | |
} attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>]} | |
stream.cmd.fill %c128_i32, %arg1[%c524288 for %c524288] : i32 -> !stream.resource<external>{%c1048576} | |
} | |
} => !stream.timepoint | |
%7 = stream.timepoint.await %6 => %5 : !stream.resource<external>{%c1048576} | |
%8 = stream.resource.subview %7[%c0] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288} | |
%9 = stream.resource.subview %7[%c524288] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288} | |
%10 = stream.tensor.export %8 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32> | |
%11 = stream.tensor.export %9 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32> | |
check.expect_eq(%10, %11) : tensor<512x256xi32> | |
return | |
} | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) ('builtin.module' operation) //----- // | |
#config = #iree_codegen.lowering_config<tile_sizes = [[32, 32, 0], [1, 4, 0], [0, 0, 4]]> | |
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}> | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#map0 = affine_map<()[s0] -> (s0 * 32)> | |
#map1 = affine_map<(d0, d1, d2) -> (d0, d1, d2)> | |
#map2 = affine_map<(d0, d1, d2) -> (d0, d1)> | |
#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]> | |
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert> | |
module attributes {hal.device.targets = [#device_target_llvm_cpu]} { | |
func.func @split_reduction_pass2() attributes {iree.abi.stub} { | |
call @_split_reduction_pass2() : () -> () | |
return | |
} | |
hal.executable private @_split_reduction_pass2_dispatch_0 { | |
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ { | |
hal.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 ordinal(0) layout(#pipeline_layout) attributes {translation_info = #translation} { | |
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index): | |
%c1 = arith.constant 1 : index | |
%c16 = arith.constant 16 : index | |
%c8 = arith.constant 8 : index | |
hal.return %c8, %c16, %c1 : index, index, index | |
} | |
builtin.module { | |
func.func @_split_reduction_pass2_dispatch_0_generic_512x256x128() { | |
%c512 = arith.constant 512 : index | |
%c256 = arith.constant 256 : index | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:512x256x128xi32> | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:512x256xi32> | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_count_x = hal.interface.workgroup.count[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%workgroup_count_y = hal.interface.workgroup.count[1] : index | |
%2 = affine.apply #map0()[%workgroup_id_y] | |
%3 = affine.apply #map0()[%workgroup_count_y] | |
scf.for %arg0 = %2 to %c512 step %3 { | |
%4 = affine.apply #map0()[%workgroup_id_x] | |
%5 = affine.apply #map0()[%workgroup_count_x] | |
scf.for %arg1 = %4 to %c256 step %5 { | |
%6 = flow.dispatch.tensor.load %1, offsets = [%arg0, %arg1], sizes = [32, 32], strides = [1, 1] : !flow.dispatch.tensor<writeonly:512x256xi32> -> tensor<32x32xi32> | |
%7 = flow.dispatch.tensor.load %0, offsets = [%arg0, %arg1, 0], sizes = [32, 32, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:512x256x128xi32> -> tensor<32x32x128xi32> | |
%8 = linalg.generic {indexing_maps = [#map1, #map2], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7 : tensor<32x32x128xi32>) outs(%6 : tensor<32x32xi32>) attrs = {lowering_config = #config} { | |
^bb0(%arg2: i32, %arg3: i32): | |
%9 = arith.addi %arg2, %arg3 : i32 | |
linalg.yield %9 : i32 | |
} -> tensor<32x32xi32> | |
flow.dispatch.tensor.store %8, %1, offsets = [%arg0, %arg1], sizes = [32, 32], strides = [1, 1] : tensor<32x32xi32> -> !flow.dispatch.tensor<writeonly:512x256xi32> | |
} | |
} | |
return | |
} | |
} | |
} | |
} | |
func.func private @_split_reduction_pass2() { | |
%c1048576 = arith.constant 1048576 : index | |
%c0 = arith.constant 0 : index | |
%c524288 = arith.constant 524288 : index | |
%c67108864 = arith.constant 67108864 : index | |
%c1_i32 = arith.constant 1 : i32 | |
%c128_i32 = arith.constant 128 : i32 | |
%c512 = arith.constant 512 : index | |
%c256 = arith.constant 256 : index | |
%c1 = arith.constant 1 : index | |
%0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c67108864} | |
%1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c67108864}) { | |
stream.cmd.fill %c1_i32, %arg0[%c0 for %c67108864] : i32 -> !stream.resource<transient>{%c67108864} | |
} => !stream.timepoint | |
%2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c67108864} | |
%3 = util.do_not_optimize(%2) : !stream.resource<transient> | |
%4 = stream.resource.size %3 : !stream.resource<transient> | |
%5 = stream.resource.alloc uninitialized : !stream.resource<external>{%c1048576} | |
%6 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%4}, %5 as %arg1: !stream.resource<external>{%c1048576}) { | |
stream.cmd.concurrent { | |
stream.cmd.dispatch @_split_reduction_pass2_dispatch_0::@_split_reduction_pass2_dispatch_0_generic_512x256x128[%c512, %c256, %c1] { | |
ro %arg0[%c0 for %4] : !stream.resource<transient>{%4}, | |
wo %arg1[%c0 for %c1048576] : !stream.resource<external>{%c1048576} | |
} attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>]} | |
stream.cmd.fill %c128_i32, %arg1[%c524288 for %c524288] : i32 -> !stream.resource<external>{%c1048576} | |
} | |
} => !stream.timepoint | |
%7 = stream.timepoint.await %6 => %5 : !stream.resource<external>{%c1048576} | |
%8 = stream.resource.subview %7[%c0] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288} | |
%9 = stream.resource.subview %7[%c524288] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288} | |
%10 = stream.tensor.export %8 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32> | |
%11 = stream.tensor.export %9 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32> | |
check.expect_eq(%10, %11) : tensor<512x256xi32> | |
return | |
} | |
} | |
// -----// IR Dump After LinalgStrategyTileAndFusePass (iree-linalg-strategy-tile-and-fuse-pass) ('func.func' operation: @_split_reduction_pass2_dispatch_0_generic_512x256x128) //----- // | |
#config = #iree_codegen.lowering_config<tile_sizes = [[32, 32, 0], [1, 4, 0], [0, 0, 4]]> | |
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}> | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#map0 = affine_map<()[s0] -> (s0 * 32)> | |
#map1 = affine_map<(d0, d1, d2) -> (d0, d1, d2)> | |
#map2 = affine_map<(d0, d1, d2) -> (d0, d1)> | |
#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]> | |
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert> | |
module attributes {hal.device.targets = [#device_target_llvm_cpu]} { | |
func.func @split_reduction_pass2() attributes {iree.abi.stub} { | |
call @_split_reduction_pass2() : () -> () | |
return | |
} | |
hal.executable private @_split_reduction_pass2_dispatch_0 { | |
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ { | |
hal.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 ordinal(0) layout(#pipeline_layout) attributes {translation_info = #translation} { | |
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index): | |
%c1 = arith.constant 1 : index | |
%c16 = arith.constant 16 : index | |
%c8 = arith.constant 8 : index | |
hal.return %c8, %c16, %c1 : index, index, index | |
} | |
builtin.module { | |
func.func @_split_reduction_pass2_dispatch_0_generic_512x256x128() { | |
%c4 = arith.constant 4 : index | |
%c1 = arith.constant 1 : index | |
%c32 = arith.constant 32 : index | |
%c512 = arith.constant 512 : index | |
%c256 = arith.constant 256 : index | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:512x256x128xi32> | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:512x256xi32> | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_count_x = hal.interface.workgroup.count[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%workgroup_count_y = hal.interface.workgroup.count[1] : index | |
%2 = affine.apply #map0()[%workgroup_id_y] | |
%3 = affine.apply #map0()[%workgroup_count_y] | |
scf.for %arg0 = %2 to %c512 step %3 { | |
%4 = affine.apply #map0()[%workgroup_id_x] | |
%5 = affine.apply #map0()[%workgroup_count_x] | |
scf.for %arg1 = %4 to %c256 step %5 { | |
%6 = flow.dispatch.tensor.load %1, offsets = [%arg0, %arg1], sizes = [32, 32], strides = [1, 1] : !flow.dispatch.tensor<writeonly:512x256xi32> -> tensor<32x32xi32> | |
%7 = flow.dispatch.tensor.load %0, offsets = [%arg0, %arg1, 0], sizes = [32, 32, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:512x256x128xi32> -> tensor<32x32x128xi32> | |
%8 = scf.for %arg2 = %c0 to %c32 step %c1 iter_args(%arg3 = %6) -> (tensor<32x32xi32>) { | |
%9 = scf.for %arg4 = %c0 to %c32 step %c4 iter_args(%arg5 = %arg3) -> (tensor<32x32xi32>) { | |
%10 = tensor.extract_slice %7[%arg2, %arg4, 0] [1, 4, 128] [1, 1, 1] : tensor<32x32x128xi32> to tensor<1x4x128xi32> | |
%11 = tensor.extract_slice %arg5[%arg2, %arg4] [1, 4] [1, 1] : tensor<32x32xi32> to tensor<1x4xi32> | |
%12 = linalg.generic {indexing_maps = [#map1, #map2], iterator_types = ["parallel", "parallel", "reduction"]} ins(%10 : tensor<1x4x128xi32>) outs(%11 : tensor<1x4xi32>) attrs = {__internal_linalg_transform__ = "1", lowering_config = #config} { | |
^bb0(%arg6: i32, %arg7: i32): | |
%14 = arith.addi %arg6, %arg7 : i32 | |
linalg.yield %14 : i32 | |
} -> tensor<1x4xi32> | |
%13 = tensor.insert_slice %12 into %arg5[%arg2, %arg4] [1, 4] [1, 1] : tensor<1x4xi32> into tensor<32x32xi32> | |
scf.yield %13 : tensor<32x32xi32> | |
} | |
scf.yield %9 : tensor<32x32xi32> | |
} | |
flow.dispatch.tensor.store %8, %1, offsets = [%arg0, %arg1], sizes = [32, 32], strides = [1, 1] : tensor<32x32xi32> -> !flow.dispatch.tensor<writeonly:512x256xi32> | |
} | |
} | |
return | |
} | |
} | |
} | |
} | |
func.func private @_split_reduction_pass2() { | |
%c1048576 = arith.constant 1048576 : index | |
%c0 = arith.constant 0 : index | |
%c524288 = arith.constant 524288 : index | |
%c67108864 = arith.constant 67108864 : index | |
%c1_i32 = arith.constant 1 : i32 | |
%c128_i32 = arith.constant 128 : i32 | |
%c512 = arith.constant 512 : index | |
%c256 = arith.constant 256 : index | |
%c1 = arith.constant 1 : index | |
%0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c67108864} | |
%1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c67108864}) { | |
stream.cmd.fill %c1_i32, %arg0[%c0 for %c67108864] : i32 -> !stream.resource<transient>{%c67108864} | |
} => !stream.timepoint | |
%2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c67108864} | |
%3 = util.do_not_optimize(%2) : !stream.resource<transient> | |
%4 = stream.resource.size %3 : !stream.resource<transient> | |
%5 = stream.resource.alloc uninitialized : !stream.resource<external>{%c1048576} | |
%6 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%4}, %5 as %arg1: !stream.resource<external>{%c1048576}) { | |
stream.cmd.concurrent { | |
stream.cmd.dispatch @_split_reduction_pass2_dispatch_0::@_split_reduction_pass2_dispatch_0_generic_512x256x128[%c512, %c256, %c1] { | |
ro %arg0[%c0 for %4] : !stream.resource<transient>{%4}, | |
wo %arg1[%c0 for %c1048576] : !stream.resource<external>{%c1048576} | |
} attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>]} | |
stream.cmd.fill %c128_i32, %arg1[%c524288 for %c524288] : i32 -> !stream.resource<external>{%c1048576} | |
} | |
} => !stream.timepoint | |
%7 = stream.timepoint.await %6 => %5 : !stream.resource<external>{%c1048576} | |
%8 = stream.resource.subview %7[%c0] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288} | |
%9 = stream.resource.subview %7[%c524288] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288} | |
%10 = stream.tensor.export %8 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32> | |
%11 = stream.tensor.export %9 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32> | |
check.expect_eq(%10, %11) : tensor<512x256xi32> | |
return | |
} | |
} | |
// -----// IR Dump After LinalgStrategyRemoveMarkersPass (iree-linalg-strategy-remove-markers-pass) ('func.func' operation: @_split_reduction_pass2_dispatch_0_generic_512x256x128) //----- // | |
#config = #iree_codegen.lowering_config<tile_sizes = [[32, 32, 0], [1, 4, 0], [0, 0, 4]]> | |
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}> | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#map0 = affine_map<()[s0] -> (s0 * 32)> | |
#map1 = affine_map<(d0, d1, d2) -> (d0, d1, d2)> | |
#map2 = affine_map<(d0, d1, d2) -> (d0, d1)> | |
#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]> | |
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert> | |
module attributes {hal.device.targets = [#device_target_llvm_cpu]} { | |
func.func @split_reduction_pass2() attributes {iree.abi.stub} { | |
call @_split_reduction_pass2() : () -> () | |
return | |
} | |
hal.executable private @_split_reduction_pass2_dispatch_0 { | |
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ { | |
hal.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 ordinal(0) layout(#pipeline_layout) attributes {translation_info = #translation} { | |
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index): | |
%c1 = arith.constant 1 : index | |
%c16 = arith.constant 16 : index | |
%c8 = arith.constant 8 : index | |
hal.return %c8, %c16, %c1 : index, index, index | |
} | |
builtin.module { | |
func.func @_split_reduction_pass2_dispatch_0_generic_512x256x128() { | |
%c4 = arith.constant 4 : index | |
%c1 = arith.constant 1 : index | |
%c32 = arith.constant 32 : index | |
%c512 = arith.constant 512 : index | |
%c256 = arith.constant 256 : index | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:512x256x128xi32> | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:512x256xi32> | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_count_x = hal.interface.workgroup.count[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%workgroup_count_y = hal.interface.workgroup.count[1] : index | |
%2 = affine.apply #map0()[%workgroup_id_y] | |
%3 = affine.apply #map0()[%workgroup_count_y] | |
%4 = affine.apply #map0()[%workgroup_id_x] | |
%5 = affine.apply #map0()[%workgroup_count_x] | |
scf.for %arg0 = %2 to %c512 step %3 { | |
scf.for %arg1 = %4 to %c256 step %5 { | |
%6 = flow.dispatch.tensor.load %1, offsets = [%arg0, %arg1], sizes = [32, 32], strides = [1, 1] : !flow.dispatch.tensor<writeonly:512x256xi32> -> tensor<32x32xi32> | |
%7 = flow.dispatch.tensor.load %0, offsets = [%arg0, %arg1, 0], sizes = [32, 32, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:512x256x128xi32> -> tensor<32x32x128xi32> | |
%8 = scf.for %arg2 = %c0 to %c32 step %c1 iter_args(%arg3 = %6) -> (tensor<32x32xi32>) { | |
%9 = scf.for %arg4 = %c0 to %c32 step %c4 iter_args(%arg5 = %arg3) -> (tensor<32x32xi32>) { | |
%10 = tensor.extract_slice %7[%arg2, %arg4, 0] [1, 4, 128] [1, 1, 1] : tensor<32x32x128xi32> to tensor<1x4x128xi32> | |
%11 = tensor.extract_slice %arg5[%arg2, %arg4] [1, 4] [1, 1] : tensor<32x32xi32> to tensor<1x4xi32> | |
%12 = linalg.generic {indexing_maps = [#map1, #map2], iterator_types = ["parallel", "parallel", "reduction"]} ins(%10 : tensor<1x4x128xi32>) outs(%11 : tensor<1x4xi32>) attrs = {lowering_config = #config} { | |
^bb0(%arg6: i32, %arg7: i32): | |
%14 = arith.addi %arg6, %arg7 : i32 | |
linalg.yield %14 : i32 | |
} -> tensor<1x4xi32> | |
%13 = tensor.insert_slice %12 into %arg5[%arg2, %arg4] [1, 4] [1, 1] : tensor<1x4xi32> into tensor<32x32xi32> | |
scf.yield %13 : tensor<32x32xi32> | |
} | |
scf.yield %9 : tensor<32x32xi32> | |
} | |
flow.dispatch.tensor.store %8, %1, offsets = [%arg0, %arg1], sizes = [32, 32], strides = [1, 1] : tensor<32x32xi32> -> !flow.dispatch.tensor<writeonly:512x256xi32> | |
} | |
} | |
return | |
} | |
} | |
} | |
} | |
func.func private @_split_reduction_pass2() { | |
%c1048576 = arith.constant 1048576 : index | |
%c0 = arith.constant 0 : index | |
%c524288 = arith.constant 524288 : index | |
%c67108864 = arith.constant 67108864 : index | |
%c1_i32 = arith.constant 1 : i32 | |
%c128_i32 = arith.constant 128 : i32 | |
%c512 = arith.constant 512 : index | |
%c256 = arith.constant 256 : index | |
%c1 = arith.constant 1 : index | |
%0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c67108864} | |
%1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c67108864}) { | |
stream.cmd.fill %c1_i32, %arg0[%c0 for %c67108864] : i32 -> !stream.resource<transient>{%c67108864} | |
} => !stream.timepoint | |
%2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c67108864} | |
%3 = util.do_not_optimize(%2) : !stream.resource<transient> | |
%4 = stream.resource.size %3 : !stream.resource<transient> | |
%5 = stream.resource.alloc uninitialized : !stream.resource<external>{%c1048576} | |
%6 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%4}, %5 as %arg1: !stream.resource<external>{%c1048576}) { | |
stream.cmd.concurrent { | |
stream.cmd.dispatch @_split_reduction_pass2_dispatch_0::@_split_reduction_pass2_dispatch_0_generic_512x256x128[%c512, %c256, %c1] { | |
ro %arg0[%c0 for %4] : !stream.resource<transient>{%4}, | |
wo %arg1[%c0 for %c1048576] : !stream.resource<external>{%c1048576} | |
} attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>]} | |
stream.cmd.fill %c128_i32, %arg1[%c524288 for %c524288] : i32 -> !stream.resource<external>{%c1048576} | |
} | |
} => !stream.timepoint | |
%7 = stream.timepoint.await %6 => %5 : !stream.resource<external>{%c1048576} | |
%8 = stream.resource.subview %7[%c0] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288} | |
%9 = stream.resource.subview %7[%c524288] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288} | |
%10 = stream.tensor.export %8 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32> | |
%11 = stream.tensor.export %9 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32> | |
check.expect_eq(%10, %11) : tensor<512x256xi32> | |
return | |
} | |
} | |
// -----// IR Dump After LinalgFuse (linalg-fuse) ('func.func' operation: @_split_reduction_pass2_dispatch_0_generic_512x256x128) //----- // | |
#config = #iree_codegen.lowering_config<tile_sizes = [[32, 32, 0], [1, 4, 0], [0, 0, 4]]> | |
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}> | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#map0 = affine_map<()[s0] -> (s0 * 32)> | |
#map1 = affine_map<(d0, d1, d2) -> (d0, d1, d2)> | |
#map2 = affine_map<(d0, d1, d2) -> (d0, d1)> | |
#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]> | |
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert> | |
module attributes {hal.device.targets = [#device_target_llvm_cpu]} { | |
func.func @split_reduction_pass2() attributes {iree.abi.stub} { | |
call @_split_reduction_pass2() : () -> () | |
return | |
} | |
hal.executable private @_split_reduction_pass2_dispatch_0 { | |
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ { | |
hal.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 ordinal(0) layout(#pipeline_layout) attributes {translation_info = #translation} { | |
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index): | |
%c1 = arith.constant 1 : index | |
%c16 = arith.constant 16 : index | |
%c8 = arith.constant 8 : index | |
hal.return %c8, %c16, %c1 : index, index, index | |
} | |
builtin.module { | |
func.func @_split_reduction_pass2_dispatch_0_generic_512x256x128() { | |
%c4 = arith.constant 4 : index | |
%c1 = arith.constant 1 : index | |
%c32 = arith.constant 32 : index | |
%c512 = arith.constant 512 : index | |
%c256 = arith.constant 256 : index | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:512x256x128xi32> | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:512x256xi32> | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_count_x = hal.interface.workgroup.count[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%workgroup_count_y = hal.interface.workgroup.count[1] : index | |
%2 = affine.apply #map0()[%workgroup_id_y] | |
%3 = affine.apply #map0()[%workgroup_count_y] | |
%4 = affine.apply #map0()[%workgroup_id_x] | |
%5 = affine.apply #map0()[%workgroup_count_x] | |
scf.for %arg0 = %2 to %c512 step %3 { | |
scf.for %arg1 = %4 to %c256 step %5 { | |
%6 = flow.dispatch.tensor.load %1, offsets = [%arg0, %arg1], sizes = [32, 32], strides = [1, 1] : !flow.dispatch.tensor<writeonly:512x256xi32> -> tensor<32x32xi32> | |
%7 = flow.dispatch.tensor.load %0, offsets = [%arg0, %arg1, 0], sizes = [32, 32, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:512x256x128xi32> -> tensor<32x32x128xi32> | |
%8 = scf.for %arg2 = %c0 to %c32 step %c1 iter_args(%arg3 = %6) -> (tensor<32x32xi32>) { | |
%9 = scf.for %arg4 = %c0 to %c32 step %c4 iter_args(%arg5 = %arg3) -> (tensor<32x32xi32>) { | |
%10 = tensor.extract_slice %7[%arg2, %arg4, 0] [1, 4, 128] [1, 1, 1] : tensor<32x32x128xi32> to tensor<1x4x128xi32> | |
%11 = tensor.extract_slice %arg5[%arg2, %arg4] [1, 4] [1, 1] : tensor<32x32xi32> to tensor<1x4xi32> | |
%12 = linalg.generic {indexing_maps = [#map1, #map2], iterator_types = ["parallel", "parallel", "reduction"]} ins(%10 : tensor<1x4x128xi32>) outs(%11 : tensor<1x4xi32>) attrs = {lowering_config = #config} { | |
^bb0(%arg6: i32, %arg7: i32): | |
%14 = arith.addi %arg6, %arg7 : i32 | |
linalg.yield %14 : i32 | |
} -> tensor<1x4xi32> | |
%13 = tensor.insert_slice %12 into %arg5[%arg2, %arg4] [1, 4] [1, 1] : tensor<1x4xi32> into tensor<32x32xi32> | |
scf.yield %13 : tensor<32x32xi32> | |
} | |
scf.yield %9 : tensor<32x32xi32> | |
} | |
flow.dispatch.tensor.store %8, %1, offsets = [%arg0, %arg1], sizes = [32, 32], strides = [1, 1] : tensor<32x32xi32> -> !flow.dispatch.tensor<writeonly:512x256xi32> | |
} | |
} | |
return | |
} | |
} | |
} | |
} | |
func.func private @_split_reduction_pass2() { | |
%c1048576 = arith.constant 1048576 : index | |
%c0 = arith.constant 0 : index | |
%c524288 = arith.constant 524288 : index | |
%c67108864 = arith.constant 67108864 : index | |
%c1_i32 = arith.constant 1 : i32 | |
%c128_i32 = arith.constant 128 : i32 | |
%c512 = arith.constant 512 : index | |
%c256 = arith.constant 256 : index | |
%c1 = arith.constant 1 : index | |
%0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c67108864} | |
%1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c67108864}) { | |
stream.cmd.fill %c1_i32, %arg0[%c0 for %c67108864] : i32 -> !stream.resource<transient>{%c67108864} | |
} => !stream.timepoint | |
%2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c67108864} | |
%3 = util.do_not_optimize(%2) : !stream.resource<transient> | |
%4 = stream.resource.size %3 : !stream.resource<transient> | |
%5 = stream.resource.alloc uninitialized : !stream.resource<external>{%c1048576} | |
%6 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%4}, %5 as %arg1: !stream.resource<external>{%c1048576}) { | |
stream.cmd.concurrent { | |
stream.cmd.dispatch @_split_reduction_pass2_dispatch_0::@_split_reduction_pass2_dispatch_0_generic_512x256x128[%c512, %c256, %c1] { | |
ro %arg0[%c0 for %4] : !stream.resource<transient>{%4}, | |
wo %arg1[%c0 for %c1048576] : !stream.resource<external>{%c1048576} | |
} attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>]} | |
stream.cmd.fill %c128_i32, %arg1[%c524288 for %c524288] : i32 -> !stream.resource<external>{%c1048576} | |
} | |
} => !stream.timepoint | |
%7 = stream.timepoint.await %6 => %5 : !stream.resource<external>{%c1048576} | |
%8 = stream.resource.subview %7[%c0] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288} | |
%9 = stream.resource.subview %7[%c524288] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288} | |
%10 = stream.tensor.export %8 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32> | |
%11 = stream.tensor.export %9 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32> | |
check.expect_eq(%10, %11) : tensor<512x256xi32> | |
return | |
} | |
} | |
// -----// IR Dump After LinalgSplitReduction (linalg-split-reduction) ('func.func' operation: @_split_reduction_pass2_dispatch_0_generic_512x256x128) //----- // | |
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}> | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#map0 = affine_map<()[s0] -> (s0 * 32)> | |
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> | |
#map2 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)> | |
#map3 = affine_map<(d0, d1, d2) -> (d0, d1, d2)> | |
#map4 = affine_map<(d0, d1, d2) -> (d0, d1)> | |
#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]> | |
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert> | |
module attributes {hal.device.targets = [#device_target_llvm_cpu]} { | |
func.func @split_reduction_pass2() attributes {iree.abi.stub} { | |
call @_split_reduction_pass2() : () -> () | |
return | |
} | |
hal.executable private @_split_reduction_pass2_dispatch_0 { | |
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ { | |
hal.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 ordinal(0) layout(#pipeline_layout) attributes {translation_info = #translation} { | |
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index): | |
%c1 = arith.constant 1 : index | |
%c16 = arith.constant 16 : index | |
%c8 = arith.constant 8 : index | |
hal.return %c8, %c16, %c1 : index, index, index | |
} | |
builtin.module { | |
func.func @_split_reduction_pass2_dispatch_0_generic_512x256x128() { | |
%c0_i32 = arith.constant 0 : i32 | |
%c4 = arith.constant 4 : index | |
%c1 = arith.constant 1 : index | |
%c32 = arith.constant 32 : index | |
%c512 = arith.constant 512 : index | |
%c256 = arith.constant 256 : index | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:512x256x128xi32> | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:512x256xi32> | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_count_x = hal.interface.workgroup.count[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%workgroup_count_y = hal.interface.workgroup.count[1] : index | |
%2 = affine.apply #map0()[%workgroup_id_y] | |
%3 = affine.apply #map0()[%workgroup_count_y] | |
%4 = affine.apply #map0()[%workgroup_id_x] | |
%5 = affine.apply #map0()[%workgroup_count_x] | |
scf.for %arg0 = %2 to %c512 step %3 { | |
scf.for %arg1 = %4 to %c256 step %5 { | |
%6 = flow.dispatch.tensor.load %1, offsets = [%arg0, %arg1], sizes = [32, 32], strides = [1, 1] : !flow.dispatch.tensor<writeonly:512x256xi32> -> tensor<32x32xi32> | |
%7 = flow.dispatch.tensor.load %0, offsets = [%arg0, %arg1, 0], sizes = [32, 32, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:512x256x128xi32> -> tensor<32x32x128xi32> | |
%8 = scf.for %arg2 = %c0 to %c32 step %c1 iter_args(%arg3 = %6) -> (tensor<32x32xi32>) { | |
%9 = scf.for %arg4 = %c0 to %c32 step %c4 iter_args(%arg5 = %arg3) -> (tensor<32x32xi32>) { | |
%10 = tensor.extract_slice %7[%arg2, %arg4, 0] [1, 4, 128] [1, 1, 1] : tensor<32x32x128xi32> to tensor<1x4x128xi32> | |
%11 = tensor.extract_slice %arg5[%arg2, %arg4] [1, 4] [1, 1] : tensor<32x32xi32> to tensor<1x4xi32> | |
%12 = scf.for %arg6 = %c0 to %c1 step %c1 iter_args(%arg7 = %11) -> (tensor<1x4xi32>) { | |
%14 = scf.for %arg8 = %c0 to %c4 step %c1 iter_args(%arg9 = %arg7) -> (tensor<1x4xi32>) { | |
%15 = tensor.extract_slice %10[%arg6, %arg8, 0] [1, 1, 128] [1, 1, 1] : tensor<1x4x128xi32> to tensor<1x1x128xi32> | |
%16 = tensor.extract_slice %arg9[%arg6, %arg8] [1, 1] [1, 1] : tensor<1x4xi32> to tensor<1x1xi32> | |
%17 = tensor.expand_shape %15 [[0], [1], [2, 3]] : tensor<1x1x128xi32> into tensor<1x1x32x4xi32> | |
%18 = linalg.init_tensor [1, 1, 4] : tensor<1x1x4xi32> | |
%19 = linalg.fill ins(%c0_i32 : i32) outs(%18 : tensor<1x1x4xi32>) -> tensor<1x1x4xi32> | |
%20 = scf.for %arg10 = %c0 to %c32 step %c1 iter_args(%arg11 = %19) -> (tensor<1x1x4xi32>) { | |
%23 = tensor.extract_slice %17[0, 0, %arg10, 0] [1, 1, 1, 4] [1, 1, 1, 1] : tensor<1x1x32x4xi32> to tensor<1x1x1x4xi32> | |
%24 = linalg.generic {indexing_maps = [#map1, #map2], iterator_types = ["parallel", "parallel", "reduction", "parallel"]} ins(%23 : tensor<1x1x1x4xi32>) outs(%arg11 : tensor<1x1x4xi32>) { | |
^bb0(%arg12: i32, %arg13: i32): | |
%25 = arith.addi %arg12, %arg13 : i32 | |
linalg.yield %25 : i32 | |
} -> tensor<1x1x4xi32> | |
scf.yield %24 : tensor<1x1x4xi32> | |
} | |
%21 = linalg.generic {indexing_maps = [#map3, #map4], iterator_types = ["parallel", "parallel", "reduction"]} ins(%20 : tensor<1x1x4xi32>) outs(%16 : tensor<1x1xi32>) { | |
^bb0(%arg10: i32, %arg11: i32): | |
%23 = arith.addi %arg10, %arg11 : i32 | |
linalg.yield %23 : i32 | |
} -> tensor<1x1xi32> | |
%22 = tensor.insert_slice %21 into %arg9[%arg6, %arg8] [1, 1] [1, 1] : tensor<1x1xi32> into tensor<1x4xi32> | |
scf.yield %22 : tensor<1x4xi32> | |
} | |
scf.yield %14 : tensor<1x4xi32> | |
} | |
%13 = tensor.insert_slice %12 into %arg5[%arg2, %arg4] [1, 4] [1, 1] : tensor<1x4xi32> into tensor<32x32xi32> | |
scf.yield %13 : tensor<32x32xi32> | |
} | |
scf.yield %9 : tensor<32x32xi32> | |
} | |
flow.dispatch.tensor.store %8, %1, offsets = [%arg0, %arg1], sizes = [32, 32], strides = [1, 1] : tensor<32x32xi32> -> !flow.dispatch.tensor<writeonly:512x256xi32> | |
} | |
} | |
return | |
} | |
} | |
} | |
} | |
func.func private @_split_reduction_pass2() { | |
%c1048576 = arith.constant 1048576 : index | |
%c0 = arith.constant 0 : index | |
%c524288 = arith.constant 524288 : index | |
%c67108864 = arith.constant 67108864 : index | |
%c1_i32 = arith.constant 1 : i32 | |
%c128_i32 = arith.constant 128 : i32 | |
%c512 = arith.constant 512 : index | |
%c256 = arith.constant 256 : index | |
%c1 = arith.constant 1 : index | |
%0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c67108864} | |
%1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c67108864}) { | |
stream.cmd.fill %c1_i32, %arg0[%c0 for %c67108864] : i32 -> !stream.resource<transient>{%c67108864} | |
} => !stream.timepoint | |
%2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c67108864} | |
%3 = util.do_not_optimize(%2) : !stream.resource<transient> | |
%4 = stream.resource.size %3 : !stream.resource<transient> | |
%5 = stream.resource.alloc uninitialized : !stream.resource<external>{%c1048576} | |
%6 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%4}, %5 as %arg1: !stream.resource<external>{%c1048576}) { | |
stream.cmd.concurrent { | |
stream.cmd.dispatch @_split_reduction_pass2_dispatch_0::@_split_reduction_pass2_dispatch_0_generic_512x256x128[%c512, %c256, %c1] { | |
ro %arg0[%c0 for %4] : !stream.resource<transient>{%4}, | |
wo %arg1[%c0 for %c1048576] : !stream.resource<external>{%c1048576} | |
} attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>]} | |
stream.cmd.fill %c128_i32, %arg1[%c524288 for %c524288] : i32 -> !stream.resource<external>{%c1048576} | |
} | |
} => !stream.timepoint | |
%7 = stream.timepoint.await %6 => %5 : !stream.resource<external>{%c1048576} | |
%8 = stream.resource.subview %7[%c0] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288} | |
%9 = stream.resource.subview %7[%c524288] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288} | |
%10 = stream.tensor.export %8 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32> | |
%11 = stream.tensor.export %9 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32> | |
check.expect_eq(%10, %11) : tensor<512x256xi32> | |
return | |
} | |
} | |
// -----// IR Dump After LinalgFoldUnitExtentDims (linalg-fold-unit-extent-dims) ('func.func' operation: @_split_reduction_pass2_dispatch_0_generic_512x256x128) //----- // | |
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}> | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#map0 = affine_map<()[s0] -> (s0 * 32)> | |
#map1 = affine_map<(d0) -> (d0)> | |
#map2 = affine_map<(d0) -> ()> | |
#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]> | |
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert> | |
module attributes {hal.device.targets = [#device_target_llvm_cpu]} { | |
// Public entry point (tagged iree.abi.stub): takes no arguments, forwards to
// the private @_split_reduction_pass2 function, and returns nothing.
func.func @split_reduction_pass2() attributes {iree.abi.stub} { | |
call @_split_reduction_pass2() : () -> () | |
return | |
} | |
// Executable holding the single dispatch of this test, with one variant whose
// target is the #executable_target_embedded_elf_x86_64_ alias.
hal.executable private @_split_reduction_pass2_dispatch_0 { | |
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ { | |
// Export region: ignores its block arguments and always returns the constant
// triple (8, 16, 1) — presumably the x/y/z workgroup counts. 8*32 = 256 and
// 16*32 = 512 line up with the 32-element steps of the x/y loops below.
hal.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 ordinal(0) layout(#pipeline_layout) attributes {translation_info = #translation} { | |
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index): | |
%c1 = arith.constant 1 : index | |
%c16 = arith.constant 16 : index | |
%c8 = arith.constant 8 : index | |
hal.return %c8, %c16, %c1 : index, index, index | |
} | |
builtin.module { | |
// Dispatch body. For every output element it adds the 128 values along the
// last dimension of the 512x256x128 i32 input onto the value read from the
// corresponding 512x256 output tile.
func.func @_split_reduction_pass2_dispatch_0_generic_512x256x128() { | |
%c0_i32 = arith.constant 0 : i32 | |
%c4 = arith.constant 4 : index | |
%c1 = arith.constant 1 : index | |
%c32 = arith.constant 32 : index | |
%c512 = arith.constant 512 : index | |
%c256 = arith.constant 256 : index | |
%c0 = arith.constant 0 : index | |
// %0: binding 0, typed as the readonly 512x256x128 input.
// %1: binding 1, typed as the writeonly 512x256 output.
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:512x256x128xi32> | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:512x256xi32> | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_count_x = hal.interface.workgroup.count[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%workgroup_count_y = hal.interface.workgroup.count[1] : index | |
// Loop start = workgroup id * 32 and loop step = workgroup count * 32 (#map0),
// so the two outer loops walk 32x32 output tiles (y over 512, x over 256).
%2 = affine.apply #map0()[%workgroup_id_y] | |
%3 = affine.apply #map0()[%workgroup_count_y] | |
%4 = affine.apply #map0()[%workgroup_id_x] | |
%5 = affine.apply #map0()[%workgroup_count_x] | |
scf.for %arg0 = %2 to %c512 step %3 { | |
scf.for %arg1 = %4 to %c256 step %5 { | |
// NOTE(review): %6 is loaded from %1, whose type is writeonly, and it seeds
// the accumulation below (%6 -> %12 -> %13 -> %19 -> outs of the final
// reduction). Confirm that reading the output's prior contents is intended.
%6 = flow.dispatch.tensor.load %1, offsets = [%arg0, %arg1], sizes = [32, 32], strides = [1, 1] : !flow.dispatch.tensor<writeonly:512x256xi32> -> tensor<32x32xi32> | |
%7 = flow.dispatch.tensor.load %0, offsets = [%arg0, %arg1, 0], sizes = [32, 32, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:512x256x128xi32> -> tensor<32x32x128xi32> | |
// Walk the 32x32 tile one row (%arg2) and four columns (%arg4) at a time.
%8 = scf.for %arg2 = %c0 to %c32 step %c1 iter_args(%arg3 = %6) -> (tensor<32x32xi32>) { | |
%9 = scf.for %arg4 = %c0 to %c32 step %c4 iter_args(%arg5 = %arg3) -> (tensor<32x32xi32>) { | |
%10 = tensor.extract_slice %7[%arg2, %arg4, 0] [1, 4, 128] [1, 1, 1] : tensor<32x32x128xi32> to tensor<4x128xi32> | |
%11 = tensor.expand_shape %10 [[0, 1], [2]] : tensor<4x128xi32> into tensor<1x4x128xi32> | |
%12 = tensor.extract_slice %arg5[%arg2, %arg4] [1, 4] [1, 1] : tensor<32x32xi32> to tensor<4xi32> | |
%13 = tensor.expand_shape %12 [[0, 1]] : tensor<4xi32> into tensor<1x4xi32> | |
// %arg6 runs a single iteration (0 to 1); %arg8 visits each of the 4 columns.
%14 = scf.for %arg6 = %c0 to %c1 step %c1 iter_args(%arg7 = %13) -> (tensor<1x4xi32>) { | |
%17 = scf.for %arg8 = %c0 to %c4 step %c1 iter_args(%arg9 = %arg7) -> (tensor<1x4xi32>) { | |
%18 = tensor.extract_slice %11[%arg6, %arg8, 0] [1, 1, 128] [1, 1, 1] : tensor<1x4x128xi32> to tensor<128xi32> | |
%19 = tensor.extract_slice %arg9[%arg6, %arg8] [1, 1] [1, 1] : tensor<1x4xi32> to tensor<i32> | |
// View the 128 reduction elements as 32 groups of 4.
%20 = tensor.expand_shape %18 [[0, 1, 2, 3]] : tensor<128xi32> into tensor<1x1x32x4xi32> | |
// Zero-filled 1x1x4 partial-sum accumulator.
%21 = linalg.init_tensor [1, 1, 4] : tensor<1x1x4xi32> | |
%22 = linalg.fill ins(%c0_i32 : i32) outs(%21 : tensor<1x1x4xi32>) -> tensor<1x1x4xi32> | |
// Add each of the 32 groups elementwise into the 4 partial sums.
%23 = scf.for %arg10 = %c0 to %c32 step %c1 iter_args(%arg11 = %22) -> (tensor<1x1x4xi32>) { | |
%27 = tensor.extract_slice %20[0, 0, %arg10, 0] [1, 1, 1, 4] [1, 1, 1, 1] : tensor<1x1x32x4xi32> to tensor<4xi32> | |
%28 = tensor.collapse_shape %arg11 [[0, 1, 2]] : tensor<1x1x4xi32> into tensor<4xi32> | |
%29 = linalg.generic {indexing_maps = [#map1, #map1], iterator_types = ["parallel"]} ins(%27 : tensor<4xi32>) outs(%28 : tensor<4xi32>) { | |
^bb0(%arg12: i32, %arg13: i32): | |
%31 = arith.addi %arg12, %arg13 : i32 | |
linalg.yield %31 : i32 | |
} -> tensor<4xi32> | |
%30 = tensor.expand_shape %29 [[0, 1, 2]] : tensor<4xi32> into tensor<1x1x4xi32> | |
scf.yield %30 : tensor<1x1x4xi32> | |
} | |
// Reduce the 4 partial sums by addition into the scalar %19.
%24 = tensor.collapse_shape %23 [[0, 1, 2]] : tensor<1x1x4xi32> into tensor<4xi32> | |
%25 = linalg.generic {indexing_maps = [#map1, #map2], iterator_types = ["reduction"]} ins(%24 : tensor<4xi32>) outs(%19 : tensor<i32>) { | |
^bb0(%arg10: i32, %arg11: i32): | |
%27 = arith.addi %arg10, %arg11 : i32 | |
linalg.yield %27 : i32 | |
} -> tensor<i32> | |
// Put the scalar back at position [%arg6, %arg8] of the 1x4 row.
%26 = tensor.insert_slice %25 into %arg9[%arg6, %arg8] [1, 1] [1, 1] : tensor<i32> into tensor<1x4xi32> | |
scf.yield %26 : tensor<1x4xi32> | |
} | |
scf.yield %17 : tensor<1x4xi32> | |
} | |
%15 = tensor.collapse_shape %14 [[0, 1]] : tensor<1x4xi32> into tensor<4xi32> | |
%16 = tensor.insert_slice %15 into %arg5[%arg2, %arg4] [1, 4] [1, 1] : tensor<4xi32> into tensor<32x32xi32> | |
scf.yield %16 : tensor<32x32xi32> | |
} | |
scf.yield %9 : tensor<32x32xi32> | |
} | |
// Store the finished 32x32 tile to the output binding.
flow.dispatch.tensor.store %8, %1, offsets = [%arg0, %arg1], sizes = [32, 32], strides = [1, 1] : tensor<32x32xi32> -> !flow.dispatch.tensor<writeonly:512x256xi32> | |
} | |
} | |
return | |
} | |
} | |
} | |
} | |
// Host-side test driver: fills the input with ones, runs the reduction
// dispatch, and compares its output with a buffer filled with 128.
func.func private @_split_reduction_pass2() { | |
// Byte sizes used below: 67108864 = 512*256*128*4 and 524288 = 512*256*4
// (4 bytes per i32); 1048576 is two 524288-byte halves.
%c1048576 = arith.constant 1048576 : index | |
%c0 = arith.constant 0 : index | |
%c524288 = arith.constant 524288 : index | |
%c67108864 = arith.constant 67108864 : index | |
%c1_i32 = arith.constant 1 : i32 | |
%c128_i32 = arith.constant 128 : i32 | |
%c512 = arith.constant 512 : index | |
%c256 = arith.constant 256 : index | |
%c1 = arith.constant 1 : index | |
// Allocate the transient input and fill all 67108864 bytes with i32 1.
%0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c67108864} | |
%1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c67108864}) { | |
stream.cmd.fill %c1_i32, %arg0[%c0 for %c67108864] : i32 -> !stream.resource<transient>{%c67108864} | |
} => !stream.timepoint | |
%2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c67108864} | |
// The filled resource goes through util.do_not_optimize (presumably an
// optimization barrier — confirm), and its size is re-queried as %4 rather
// than reusing the constant.
%3 = util.do_not_optimize(%2) : !stream.resource<transient> | |
%4 = stream.resource.size %3 : !stream.resource<transient> | |
// One external allocation: bytes [0, 524288) are later exported as the
// dispatch result, bytes [524288, 1048576) as the expected values.
%5 = stream.resource.alloc uninitialized : !stream.resource<external>{%c1048576} | |
%6 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%4}, %5 as %arg1: !stream.resource<external>{%c1048576}) { | |
// The dispatch and the reference fill sit in one stream.cmd.concurrent region.
stream.cmd.concurrent { | |
// NOTE(review): the write-only binding covers the whole 1048576-byte
// resource, which includes the half written by the fill below — confirm the
// dispatch only writes the first 524288 bytes.
stream.cmd.dispatch @_split_reduction_pass2_dispatch_0::@_split_reduction_pass2_dispatch_0_generic_512x256x128[%c512, %c256, %c1] { | |
ro %arg0[%c0 for %4] : !stream.resource<transient>{%4}, | |
wo %arg1[%c0 for %c1048576] : !stream.resource<external>{%c1048576} | |
} attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>]} | |
// Expected values: i32 128 written over the second half.
stream.cmd.fill %c128_i32, %arg1[%c524288 for %c524288] : i32 -> !stream.resource<external>{%c1048576} | |
} | |
} => !stream.timepoint | |
%7 = stream.timepoint.await %6 => %5 : !stream.resource<external>{%c1048576} | |
// Split the resource into its two halves and export each as 512x256xi32.
%8 = stream.resource.subview %7[%c0] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288} | |
%9 = stream.resource.subview %7[%c524288] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288} | |
%10 = stream.tensor.export %8 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32> | |
%11 = stream.tensor.export %9 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32> | |
// Compare the dispatch result (%10) with the 128-filled reference (%11).
check.expect_eq(%10, %11) : tensor<512x256xi32> | |
return | |
} | |
} | |
// -----// IR Dump After LinalgStrategyVectorizePass (iree-linalg-strategy-vectorize-pass) ('func.func' operation: @_split_reduction_pass2_dispatch_0_generic_512x256x128) //----- // | |
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}> | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#map = affine_map<()[s0] -> (s0 * 32)> | |
#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]> | |
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert> | |
module attributes {hal.device.targets = [#device_target_llvm_cpu]} { | |
// Public entry point (tagged iree.abi.stub): takes no arguments, forwards to
// the private @_split_reduction_pass2 function, and returns nothing.
func.func @split_reduction_pass2() attributes {iree.abi.stub} { | |
call @_split_reduction_pass2() : () -> () | |
return | |
} | |
// Executable holding the single dispatch of this test, with one variant whose
// target is the #executable_target_embedded_elf_x86_64_ alias.
hal.executable private @_split_reduction_pass2_dispatch_0 { | |
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ { | |
// Export region: ignores its block arguments and always returns the constant
// triple (8, 16, 1) — presumably the x/y/z workgroup counts. 8*32 = 256 and
// 16*32 = 512 line up with the 32-element steps of the x/y loops below.
hal.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 ordinal(0) layout(#pipeline_layout) attributes {translation_info = #translation} { | |
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index): | |
%c1 = arith.constant 1 : index | |
%c16 = arith.constant 16 : index | |
%c8 = arith.constant 8 : index | |
hal.return %c8, %c16, %c1 : index, index, index | |
} | |
builtin.module { | |
// Dispatch body in vector form. For every output element it adds the 128
// values along the last dimension of the 512x256x128 i32 input onto the value
// read from the corresponding 512x256 output tile.
func.func @_split_reduction_pass2_dispatch_0_generic_512x256x128() { | |
// All-zero vector used to initialise the 1x1x4 partial-sum accumulator.
%cst = arith.constant dense<0> : vector<1x1x4xi32> | |
%c0_i32 = arith.constant 0 : i32 | |
%c4 = arith.constant 4 : index | |
%c1 = arith.constant 1 : index | |
%c32 = arith.constant 32 : index | |
%c512 = arith.constant 512 : index | |
%c256 = arith.constant 256 : index | |
%c0 = arith.constant 0 : index | |
// %0: binding 0, typed as the readonly 512x256x128 input.
// %1: binding 1, typed as the writeonly 512x256 output.
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:512x256x128xi32> | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:512x256xi32> | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_count_x = hal.interface.workgroup.count[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%workgroup_count_y = hal.interface.workgroup.count[1] : index | |
// Loop start = workgroup id * 32 and loop step = workgroup count * 32 (#map),
// so the two outer loops walk 32x32 output tiles (y over 512, x over 256).
%2 = affine.apply #map()[%workgroup_id_y] | |
%3 = affine.apply #map()[%workgroup_count_y] | |
%4 = affine.apply #map()[%workgroup_id_x] | |
%5 = affine.apply #map()[%workgroup_count_x] | |
scf.for %arg0 = %2 to %c512 step %3 { | |
scf.for %arg1 = %4 to %c256 step %5 { | |
// NOTE(review): %6 is loaded from %1, whose type is writeonly, and it seeds
// the accumulation below (%6 -> %12 -> %13 -> %19 -> %26/%27 -> the
// multi_reduction). Confirm that reading the output's prior contents is
// intended.
%6 = flow.dispatch.tensor.load %1, offsets = [%arg0, %arg1], sizes = [32, 32], strides = [1, 1] : !flow.dispatch.tensor<writeonly:512x256xi32> -> tensor<32x32xi32> | |
%7 = flow.dispatch.tensor.load %0, offsets = [%arg0, %arg1, 0], sizes = [32, 32, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:512x256x128xi32> -> tensor<32x32x128xi32> | |
// Walk the 32x32 tile one row (%arg2) and four columns (%arg4) at a time.
%8 = scf.for %arg2 = %c0 to %c32 step %c1 iter_args(%arg3 = %6) -> (tensor<32x32xi32>) { | |
%9 = scf.for %arg4 = %c0 to %c32 step %c4 iter_args(%arg5 = %arg3) -> (tensor<32x32xi32>) { | |
%10 = tensor.extract_slice %7[%arg2, %arg4, 0] [1, 4, 128] [1, 1, 1] : tensor<32x32x128xi32> to tensor<4x128xi32> | |
%11 = tensor.expand_shape %10 [[0, 1], [2]] : tensor<4x128xi32> into tensor<1x4x128xi32> | |
%12 = tensor.extract_slice %arg5[%arg2, %arg4] [1, 4] [1, 1] : tensor<32x32xi32> to tensor<4xi32> | |
%13 = tensor.expand_shape %12 [[0, 1]] : tensor<4xi32> into tensor<1x4xi32> | |
// %arg6 runs a single iteration (0 to 1); %arg8 visits each of the 4 columns.
%14 = scf.for %arg6 = %c0 to %c1 step %c1 iter_args(%arg7 = %13) -> (tensor<1x4xi32>) { | |
%17 = scf.for %arg8 = %c0 to %c4 step %c1 iter_args(%arg9 = %arg7) -> (tensor<1x4xi32>) { | |
%18 = tensor.extract_slice %11[%arg6, %arg8, 0] [1, 1, 128] [1, 1, 1] : tensor<1x4x128xi32> to tensor<128xi32> | |
%19 = tensor.extract_slice %arg9[%arg6, %arg8] [1, 1] [1, 1] : tensor<1x4xi32> to tensor<i32> | |
// View the 128 reduction elements as 32 groups of 4.
%20 = tensor.expand_shape %18 [[0, 1, 2, 3]] : tensor<128xi32> into tensor<1x1x32x4xi32> | |
// Fresh 1x1x4 accumulator, zeroed by writing the constant zero vector.
%21 = linalg.init_tensor [1, 1, 4] : tensor<1x1x4xi32> | |
%22 = vector.transfer_write %cst, %21[%c0, %c0, %c0] {in_bounds = [true, true, true]} : vector<1x1x4xi32>, tensor<1x1x4xi32> | |
// For each of the 32 groups: read the group and the current partial sums as
// vector<4xi32>, add them, and write the sums back.
%23 = scf.for %arg10 = %c0 to %c32 step %c1 iter_args(%arg11 = %22) -> (tensor<1x1x4xi32>) { | |
%32 = tensor.collapse_shape %arg11 [[0, 1, 2]] : tensor<1x1x4xi32> into tensor<4xi32> | |
%33 = vector.transfer_read %20[%c0, %c0, %arg10, %c0], %c0_i32 {in_bounds = [true]} : tensor<1x1x32x4xi32>, vector<4xi32> | |
%34 = vector.transfer_read %32[%c0], %c0_i32 {in_bounds = [true]} : tensor<4xi32>, vector<4xi32> | |
%35 = arith.addi %33, %34 : vector<4xi32> | |
%36 = vector.transfer_write %35, %32[%c0] {in_bounds = [true]} : vector<4xi32>, tensor<4xi32> | |
%37 = tensor.expand_shape %36 [[0, 1, 2]] : tensor<4xi32> into tensor<1x1x4xi32> | |
scf.yield %37 : tensor<1x1x4xi32> | |
} | |
// Add-reduce the 4 partial sums together with the scalar %27 read from %19,
// then write the result back into %19 and into the 1x4 row.
%24 = tensor.collapse_shape %23 [[0, 1, 2]] : tensor<1x1x4xi32> into tensor<4xi32> | |
%25 = vector.transfer_read %24[%c0], %c0_i32 {in_bounds = [true]} : tensor<4xi32>, vector<4xi32> | |
%26 = vector.transfer_read %19[], %c0_i32 : tensor<i32>, vector<i32> | |
%27 = vector.extractelement %26[] : vector<i32> | |
%28 = vector.multi_reduction <add>, %25, %27 [0] : vector<4xi32> to i32 | |
%29 = vector.broadcast %28 : i32 to vector<i32> | |
%30 = vector.transfer_write %29, %19[] : vector<i32>, tensor<i32> | |
%31 = tensor.insert_slice %30 into %arg9[%arg6, %arg8] [1, 1] [1, 1] : tensor<i32> into tensor<1x4xi32> | |
scf.yield %31 : tensor<1x4xi32> | |
} | |
scf.yield %17 : tensor<1x4xi32> | |
} | |
%15 = tensor.collapse_shape %14 [[0, 1]] : tensor<1x4xi32> into tensor<4xi32> | |
%16 = tensor.insert_slice %15 into %arg5[%arg2, %arg4] [1, 4] [1, 1] : tensor<4xi32> into tensor<32x32xi32> | |
scf.yield %16 : tensor<32x32xi32> | |
} | |
scf.yield %9 : tensor<32x32xi32> | |
} | |
// Store the finished 32x32 tile to the output binding.
flow.dispatch.tensor.store %8, %1, offsets = [%arg0, %arg1], sizes = [32, 32], strides = [1, 1] : tensor<32x32xi32> -> !flow.dispatch.tensor<writeonly:512x256xi32> | |
} | |
} | |
return | |
} | |
} | |
} | |
} | |
// Host-side test driver: fills the input with ones, runs the reduction
// dispatch, and compares its output with a buffer filled with 128.
func.func private @_split_reduction_pass2() { | |
// Byte sizes used below: 67108864 = 512*256*128*4 and 524288 = 512*256*4
// (4 bytes per i32); 1048576 is two 524288-byte halves.
%c1048576 = arith.constant 1048576 : index | |
%c0 = arith.constant 0 : index | |
%c524288 = arith.constant 524288 : index | |
%c67108864 = arith.constant 67108864 : index | |
%c1_i32 = arith.constant 1 : i32 | |
%c128_i32 = arith.constant 128 : i32 | |
%c512 = arith.constant 512 : index | |
%c256 = arith.constant 256 : index | |
%c1 = arith.constant 1 : index | |
// Allocate the transient input and fill all 67108864 bytes with i32 1.
%0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c67108864} | |
%1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c67108864}) { | |
stream.cmd.fill %c1_i32, %arg0[%c0 for %c67108864] : i32 -> !stream.resource<transient>{%c67108864} | |
} => !stream.timepoint | |
%2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c67108864} | |
// The filled resource goes through util.do_not_optimize (presumably an
// optimization barrier — confirm), and its size is re-queried as %4 rather
// than reusing the constant.
%3 = util.do_not_optimize(%2) : !stream.resource<transient> | |
%4 = stream.resource.size %3 : !stream.resource<transient> | |
// One external allocation: bytes [0, 524288) are later exported as the
// dispatch result, bytes [524288, 1048576) as the expected values.
%5 = stream.resource.alloc uninitialized : !stream.resource<external>{%c1048576} | |
%6 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%4}, %5 as %arg1: !stream.resource<external>{%c1048576}) { | |
// The dispatch and the reference fill sit in one stream.cmd.concurrent region.
stream.cmd.concurrent { | |
// NOTE(review): the write-only binding covers the whole 1048576-byte
// resource, which includes the half written by the fill below — confirm the
// dispatch only writes the first 524288 bytes.
stream.cmd.dispatch @_split_reduction_pass2_dispatch_0::@_split_reduction_pass2_dispatch_0_generic_512x256x128[%c512, %c256, %c1] { | |
ro %arg0[%c0 for %4] : !stream.resource<transient>{%4}, | |
wo %arg1[%c0 for %c1048576] : !stream.resource<external>{%c1048576} | |
} attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>]} | |
// Expected values: i32 128 written over the second half.
stream.cmd.fill %c128_i32, %arg1[%c524288 for %c524288] : i32 -> !stream.resource<external>{%c1048576} | |
} | |
} => !stream.timepoint | |
%7 = stream.timepoint.await %6 => %5 : !stream.resource<external>{%c1048576} | |
// Split the resource into its two halves and export each as 512x256xi32.
%8 = stream.resource.subview %7[%c0] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288} | |
%9 = stream.resource.subview %7[%c524288] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288} | |
%10 = stream.tensor.export %8 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32> | |
%11 = stream.tensor.export %9 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32> | |
// Compare the dispatch result (%10) with the 128-filled reference (%11).
check.expect_eq(%10, %11) : tensor<512x256xi32> | |
return | |
} | |
} | |
// -----// IR Dump After LinalgStrategyEnablePass (iree-linalg-strategy-enable-pass) ('func.func' operation: @_split_reduction_pass2_dispatch_0_generic_512x256x128) //----- // | |
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}> | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#map = affine_map<()[s0] -> (s0 * 32)> | |
#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]> | |
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert> | |
module attributes {hal.device.targets = [#device_target_llvm_cpu]} { | |
// Public entry point (tagged iree.abi.stub): takes no arguments, forwards to
// the private @_split_reduction_pass2 function, and returns nothing.
func.func @split_reduction_pass2() attributes {iree.abi.stub} { | |
call @_split_reduction_pass2() : () -> () | |
return | |
} | |
// Executable holding the single dispatch of this test, with one variant whose
// target is the #executable_target_embedded_elf_x86_64_ alias.
hal.executable private @_split_reduction_pass2_dispatch_0 { | |
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ { | |
// Export region: ignores its block arguments and always returns the constant
// triple (8, 16, 1) — presumably the x/y/z workgroup counts. 8*32 = 256 and
// 16*32 = 512 line up with the 32-element steps of the x/y loops below.
hal.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 ordinal(0) layout(#pipeline_layout) attributes {translation_info = #translation} { | |
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index): | |
%c1 = arith.constant 1 : index | |
%c16 = arith.constant 16 : index | |
%c8 = arith.constant 8 : index | |
hal.return %c8, %c16, %c1 : index, index, index | |
} | |
builtin.module { | |
// Dispatch body in vector form. For every output element it adds the 128
// values along the last dimension of the 512x256x128 i32 input onto the value
// read from the corresponding 512x256 output tile.
func.func @_split_reduction_pass2_dispatch_0_generic_512x256x128() { | |
// All-zero vector used to initialise the 1x1x4 partial-sum accumulator.
%cst = arith.constant dense<0> : vector<1x1x4xi32> | |
%c0_i32 = arith.constant 0 : i32 | |
%c4 = arith.constant 4 : index | |
%c1 = arith.constant 1 : index | |
%c32 = arith.constant 32 : index | |
%c512 = arith.constant 512 : index | |
%c256 = arith.constant 256 : index | |
%c0 = arith.constant 0 : index | |
// %0: binding 0, typed as the readonly 512x256x128 input.
// %1: binding 1, typed as the writeonly 512x256 output.
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:512x256x128xi32> | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:512x256xi32> | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_count_x = hal.interface.workgroup.count[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%workgroup_count_y = hal.interface.workgroup.count[1] : index | |
// Loop start = workgroup id * 32 and loop step = workgroup count * 32 (#map),
// so the two outer loops walk 32x32 output tiles (y over 512, x over 256).
%2 = affine.apply #map()[%workgroup_id_y] | |
%3 = affine.apply #map()[%workgroup_count_y] | |
%4 = affine.apply #map()[%workgroup_id_x] | |
%5 = affine.apply #map()[%workgroup_count_x] | |
// Zeroed 1x1x4 accumulator, built once before any loop and used as the
// initial iter_arg of every innermost reduction loop below.
%6 = linalg.init_tensor [1, 1, 4] : tensor<1x1x4xi32> | |
%7 = vector.transfer_write %cst, %6[%c0, %c0, %c0] {in_bounds = [true, true, true]} : vector<1x1x4xi32>, tensor<1x1x4xi32> | |
scf.for %arg0 = %2 to %c512 step %3 { | |
scf.for %arg1 = %4 to %c256 step %5 { | |
// NOTE(review): %8 is loaded from %1, whose type is writeonly, and it seeds
// the accumulation below (%8 -> %14 -> %15 -> %20 -> %25/%26 -> the
// multi_reduction). Confirm that reading the output's prior contents is
// intended.
%8 = flow.dispatch.tensor.load %1, offsets = [%arg0, %arg1], sizes = [32, 32], strides = [1, 1] : !flow.dispatch.tensor<writeonly:512x256xi32> -> tensor<32x32xi32> | |
%9 = flow.dispatch.tensor.load %0, offsets = [%arg0, %arg1, 0], sizes = [32, 32, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:512x256x128xi32> -> tensor<32x32x128xi32> | |
// Walk the 32x32 tile one row (%arg2) and four columns (%arg4) at a time.
%10 = scf.for %arg2 = %c0 to %c32 step %c1 iter_args(%arg3 = %8) -> (tensor<32x32xi32>) { | |
%11 = scf.for %arg4 = %c0 to %c32 step %c4 iter_args(%arg5 = %arg3) -> (tensor<32x32xi32>) { | |
%12 = tensor.extract_slice %9[%arg2, %arg4, 0] [1, 4, 128] [1, 1, 1] : tensor<32x32x128xi32> to tensor<4x128xi32> | |
%13 = tensor.expand_shape %12 [[0, 1], [2]] : tensor<4x128xi32> into tensor<1x4x128xi32> | |
%14 = tensor.extract_slice %arg5[%arg2, %arg4] [1, 4] [1, 1] : tensor<32x32xi32> to tensor<4xi32> | |
%15 = tensor.expand_shape %14 [[0, 1]] : tensor<4xi32> into tensor<1x4xi32> | |
// %arg6 visits each of the 4 columns; the unit leading dimension is indexed
// with the literal 0.
%16 = scf.for %arg6 = %c0 to %c4 step %c1 iter_args(%arg7 = %15) -> (tensor<1x4xi32>) { | |
%19 = tensor.extract_slice %13[0, %arg6, 0] [1, 1, 128] [1, 1, 1] : tensor<1x4x128xi32> to tensor<128xi32> | |
%20 = tensor.extract_slice %arg7[0, %arg6] [1, 1] [1, 1] : tensor<1x4xi32> to tensor<i32> | |
// View the 128 reduction elements as 32 groups of 4.
%21 = tensor.expand_shape %19 [[0, 1, 2, 3]] : tensor<128xi32> into tensor<1x1x32x4xi32> | |
// For each of the 32 groups: read the group and the current partial sums as
// vector<4xi32>, add them, and write the sums back. Starts from the zeroed %7.
%22 = scf.for %arg8 = %c0 to %c32 step %c1 iter_args(%arg9 = %7) -> (tensor<1x1x4xi32>) { | |
%31 = tensor.collapse_shape %arg9 [[0, 1, 2]] : tensor<1x1x4xi32> into tensor<4xi32> | |
%32 = vector.transfer_read %21[%c0, %c0, %arg8, %c0], %c0_i32 {in_bounds = [true]} : tensor<1x1x32x4xi32>, vector<4xi32> | |
%33 = vector.transfer_read %31[%c0], %c0_i32 {in_bounds = [true]} : tensor<4xi32>, vector<4xi32> | |
%34 = arith.addi %32, %33 : vector<4xi32> | |
%35 = vector.transfer_write %34, %31[%c0] {in_bounds = [true]} : vector<4xi32>, tensor<4xi32> | |
%36 = tensor.expand_shape %35 [[0, 1, 2]] : tensor<4xi32> into tensor<1x1x4xi32> | |
scf.yield %36 : tensor<1x1x4xi32> | |
} | |
// Add-reduce the 4 partial sums together with the scalar %26 read from %20,
// then write the result back into %20 and into the 1x4 row.
%23 = tensor.collapse_shape %22 [[0, 1, 2]] : tensor<1x1x4xi32> into tensor<4xi32> | |
%24 = vector.transfer_read %23[%c0], %c0_i32 {in_bounds = [true]} : tensor<4xi32>, vector<4xi32> | |
%25 = vector.transfer_read %20[], %c0_i32 : tensor<i32>, vector<i32> | |
%26 = vector.extractelement %25[] : vector<i32> | |
%27 = vector.multi_reduction <add>, %24, %26 [0] : vector<4xi32> to i32 | |
%28 = vector.broadcast %27 : i32 to vector<i32> | |
%29 = vector.transfer_write %28, %20[] : vector<i32>, tensor<i32> | |
%30 = tensor.insert_slice %29 into %arg7[0, %arg6] [1, 1] [1, 1] : tensor<i32> into tensor<1x4xi32> | |
scf.yield %30 : tensor<1x4xi32> | |
} | |
%17 = tensor.collapse_shape %16 [[0, 1]] : tensor<1x4xi32> into tensor<4xi32> | |
%18 = tensor.insert_slice %17 into %arg5[%arg2, %arg4] [1, 4] [1, 1] : tensor<4xi32> into tensor<32x32xi32> | |
scf.yield %18 : tensor<32x32xi32> | |
} | |
scf.yield %11 : tensor<32x32xi32> | |
} | |
// Store the finished 32x32 tile to the output binding.
flow.dispatch.tensor.store %10, %1, offsets = [%arg0, %arg1], sizes = [32, 32], strides = [1, 1] : tensor<32x32xi32> -> !flow.dispatch.tensor<writeonly:512x256xi32> | |
} | |
} | |
return | |
} | |
} | |
} | |
} | |
// Host-side test driver: fills the input with ones, runs the reduction
// dispatch, and compares its output with a buffer filled with 128.
func.func private @_split_reduction_pass2() { | |
// Byte sizes used below: 67108864 = 512*256*128*4 and 524288 = 512*256*4
// (4 bytes per i32); 1048576 is two 524288-byte halves.
%c1048576 = arith.constant 1048576 : index | |
%c0 = arith.constant 0 : index | |
%c524288 = arith.constant 524288 : index | |
%c67108864 = arith.constant 67108864 : index | |
%c1_i32 = arith.constant 1 : i32 | |
%c128_i32 = arith.constant 128 : i32 | |
%c512 = arith.constant 512 : index | |
%c256 = arith.constant 256 : index | |
%c1 = arith.constant 1 : index | |
// Allocate the transient input and fill all 67108864 bytes with i32 1.
%0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c67108864} | |
%1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c67108864}) { | |
stream.cmd.fill %c1_i32, %arg0[%c0 for %c67108864] : i32 -> !stream.resource<transient>{%c67108864} | |
} => !stream.timepoint | |
%2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c67108864} | |
// The filled resource goes through util.do_not_optimize (presumably an
// optimization barrier — confirm), and its size is re-queried as %4 rather
// than reusing the constant.
%3 = util.do_not_optimize(%2) : !stream.resource<transient> | |
%4 = stream.resource.size %3 : !stream.resource<transient> | |
// One external allocation: bytes [0, 524288) are later exported as the
// dispatch result, bytes [524288, 1048576) as the expected values.
%5 = stream.resource.alloc uninitialized : !stream.resource<external>{%c1048576} | |
%6 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%4}, %5 as %arg1: !stream.resource<external>{%c1048576}) { | |
// The dispatch and the reference fill sit in one stream.cmd.concurrent region.
stream.cmd.concurrent { | |
// NOTE(review): the write-only binding covers the whole 1048576-byte
// resource, which includes the half written by the fill below — confirm the
// dispatch only writes the first 524288 bytes.
stream.cmd.dispatch @_split_reduction_pass2_dispatch_0::@_split_reduction_pass2_dispatch_0_generic_512x256x128[%c512, %c256, %c1] { | |
ro %arg0[%c0 for %4] : !stream.resource<transient>{%4}, | |
wo %arg1[%c0 for %c1048576] : !stream.resource<external>{%c1048576} | |
} attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>]} | |
// Expected values: i32 128 written over the second half.
stream.cmd.fill %c128_i32, %arg1[%c524288 for %c524288] : i32 -> !stream.resource<external>{%c1048576} | |
} | |
} => !stream.timepoint | |
%7 = stream.timepoint.await %6 => %5 : !stream.resource<external>{%c1048576} | |
// Split the resource into its two halves and export each as 512x256xi32.
%8 = stream.resource.subview %7[%c0] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288} | |
%9 = stream.resource.subview %7[%c524288] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288} | |
%10 = stream.tensor.export %8 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32> | |
%11 = stream.tensor.export %9 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32> | |
// Compare the dispatch result (%10) with the 128-filled reference (%11).
check.expect_eq(%10, %11) : tensor<512x256xi32> | |
return | |
} | |
} | |
// -----// IR Dump After LinalgSingleTilingExpert (linalg-single-tiling-expert-driver) ('func.func' operation: @_split_reduction_pass2_dispatch_0_generic_512x256x128) //----- // | |
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}> | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#map = affine_map<()[s0] -> (s0 * 32)> | |
#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]> | |
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert> | |
module attributes {hal.device.targets = [#device_target_llvm_cpu]} { | |
// Public entry point (tagged iree.abi.stub): takes no arguments, forwards to
// the private @_split_reduction_pass2 function, and returns nothing.
func.func @split_reduction_pass2() attributes {iree.abi.stub} { | |
call @_split_reduction_pass2() : () -> () | |
return | |
} | |
// Executable holding the single dispatch of this test, with one variant whose
// target is the #executable_target_embedded_elf_x86_64_ alias.
hal.executable private @_split_reduction_pass2_dispatch_0 { | |
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ { | |
// Export region: ignores its block arguments and always returns the constant
// triple (8, 16, 1) — presumably the x/y/z workgroup counts. 8*32 = 256 and
// 16*32 = 512 line up with the 32-element steps of the x/y loops below.
hal.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 ordinal(0) layout(#pipeline_layout) attributes {translation_info = #translation} { | |
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index): | |
%c1 = arith.constant 1 : index | |
%c16 = arith.constant 16 : index | |
%c8 = arith.constant 8 : index | |
hal.return %c8, %c16, %c1 : index, index, index | |
} | |
builtin.module { | |
// Dispatch body in vector form. For every output element it adds the 128
// values along the last dimension of the 512x256x128 i32 input onto the value
// read from the corresponding 512x256 output tile.
func.func @_split_reduction_pass2_dispatch_0_generic_512x256x128() { | |
// All-zero vector used to initialise the 1x1x4 partial-sum accumulator.
%cst = arith.constant dense<0> : vector<1x1x4xi32> | |
%c0_i32 = arith.constant 0 : i32 | |
%c4 = arith.constant 4 : index | |
%c1 = arith.constant 1 : index | |
%c32 = arith.constant 32 : index | |
%c512 = arith.constant 512 : index | |
%c256 = arith.constant 256 : index | |
%c0 = arith.constant 0 : index | |
// %0: binding 0, typed as the readonly 512x256x128 input.
// %1: binding 1, typed as the writeonly 512x256 output.
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:512x256x128xi32> | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:512x256xi32> | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_count_x = hal.interface.workgroup.count[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%workgroup_count_y = hal.interface.workgroup.count[1] : index | |
// Loop start = workgroup id * 32 and loop step = workgroup count * 32 (#map),
// so the two outer loops walk 32x32 output tiles (y over 512, x over 256).
%2 = affine.apply #map()[%workgroup_id_y] | |
%3 = affine.apply #map()[%workgroup_count_y] | |
%4 = affine.apply #map()[%workgroup_id_x] | |
%5 = affine.apply #map()[%workgroup_count_x] | |
// Zeroed 1x1x4 accumulator, built once before any loop and used as the
// initial iter_arg of every innermost reduction loop below.
%6 = linalg.init_tensor [1, 1, 4] : tensor<1x1x4xi32> | |
%7 = vector.transfer_write %cst, %6[%c0, %c0, %c0] {in_bounds = [true, true, true]} : vector<1x1x4xi32>, tensor<1x1x4xi32> | |
scf.for %arg0 = %2 to %c512 step %3 { | |
scf.for %arg1 = %4 to %c256 step %5 { | |
// NOTE(review): %8 is loaded from %1, whose type is writeonly, and it seeds
// the accumulation below (%8 -> %14 -> %15 -> %20 -> %25/%26 -> the
// multi_reduction). Confirm that reading the output's prior contents is
// intended.
%8 = flow.dispatch.tensor.load %1, offsets = [%arg0, %arg1], sizes = [32, 32], strides = [1, 1] : !flow.dispatch.tensor<writeonly:512x256xi32> -> tensor<32x32xi32> | |
%9 = flow.dispatch.tensor.load %0, offsets = [%arg0, %arg1, 0], sizes = [32, 32, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:512x256x128xi32> -> tensor<32x32x128xi32> | |
// Walk the 32x32 tile one row (%arg2) and four columns (%arg4) at a time.
%10 = scf.for %arg2 = %c0 to %c32 step %c1 iter_args(%arg3 = %8) -> (tensor<32x32xi32>) { | |
%11 = scf.for %arg4 = %c0 to %c32 step %c4 iter_args(%arg5 = %arg3) -> (tensor<32x32xi32>) { | |
%12 = tensor.extract_slice %9[%arg2, %arg4, 0] [1, 4, 128] [1, 1, 1] : tensor<32x32x128xi32> to tensor<4x128xi32> | |
%13 = tensor.expand_shape %12 [[0, 1], [2]] : tensor<4x128xi32> into tensor<1x4x128xi32> | |
%14 = tensor.extract_slice %arg5[%arg2, %arg4] [1, 4] [1, 1] : tensor<32x32xi32> to tensor<4xi32> | |
%15 = tensor.expand_shape %14 [[0, 1]] : tensor<4xi32> into tensor<1x4xi32> | |
// %arg6 visits each of the 4 columns; the unit leading dimension is indexed
// with the literal 0.
%16 = scf.for %arg6 = %c0 to %c4 step %c1 iter_args(%arg7 = %15) -> (tensor<1x4xi32>) { | |
%19 = tensor.extract_slice %13[0, %arg6, 0] [1, 1, 128] [1, 1, 1] : tensor<1x4x128xi32> to tensor<128xi32> | |
%20 = tensor.extract_slice %arg7[0, %arg6] [1, 1] [1, 1] : tensor<1x4xi32> to tensor<i32> | |
// View the 128 reduction elements as 32 groups of 4.
%21 = tensor.expand_shape %19 [[0, 1, 2, 3]] : tensor<128xi32> into tensor<1x1x32x4xi32> | |
// For each of the 32 groups: read the group and the current partial sums as
// vector<4xi32>, add them, and write the sums back. Starts from the zeroed %7.
%22 = scf.for %arg8 = %c0 to %c32 step %c1 iter_args(%arg9 = %7) -> (tensor<1x1x4xi32>) { | |
%31 = tensor.collapse_shape %arg9 [[0, 1, 2]] : tensor<1x1x4xi32> into tensor<4xi32> | |
%32 = vector.transfer_read %21[%c0, %c0, %arg8, %c0], %c0_i32 {in_bounds = [true]} : tensor<1x1x32x4xi32>, vector<4xi32> | |
%33 = vector.transfer_read %31[%c0], %c0_i32 {in_bounds = [true]} : tensor<4xi32>, vector<4xi32> | |
%34 = arith.addi %32, %33 : vector<4xi32> | |
%35 = vector.transfer_write %34, %31[%c0] {in_bounds = [true]} : vector<4xi32>, tensor<4xi32> | |
%36 = tensor.expand_shape %35 [[0, 1, 2]] : tensor<4xi32> into tensor<1x1x4xi32> | |
scf.yield %36 : tensor<1x1x4xi32> | |
} | |
// Add-reduce the 4 partial sums together with the scalar %26 read from %20,
// then write the result back into %20 and into the 1x4 row.
%23 = tensor.collapse_shape %22 [[0, 1, 2]] : tensor<1x1x4xi32> into tensor<4xi32> | |
%24 = vector.transfer_read %23[%c0], %c0_i32 {in_bounds = [true]} : tensor<4xi32>, vector<4xi32> | |
%25 = vector.transfer_read %20[], %c0_i32 : tensor<i32>, vector<i32> | |
%26 = vector.extractelement %25[] : vector<i32> | |
%27 = vector.multi_reduction <add>, %24, %26 [0] : vector<4xi32> to i32 | |
%28 = vector.broadcast %27 : i32 to vector<i32> | |
%29 = vector.transfer_write %28, %20[] : vector<i32>, tensor<i32> | |
%30 = tensor.insert_slice %29 into %arg7[0, %arg6] [1, 1] [1, 1] : tensor<i32> into tensor<1x4xi32> | |
scf.yield %30 : tensor<1x4xi32> | |
} | |
%17 = tensor.collapse_shape %16 [[0, 1]] : tensor<1x4xi32> into tensor<4xi32> | |
%18 = tensor.insert_slice %17 into %arg5[%arg2, %arg4] [1, 4] [1, 1] : tensor<4xi32> into tensor<32x32xi32> | |
scf.yield %18 : tensor<32x32xi32> | |
} | |
scf.yield %11 : tensor<32x32xi32> | |
} | |
// Store the finished 32x32 tile to the output binding.
flow.dispatch.tensor.store %10, %1, offsets = [%arg0, %arg1], sizes = [32, 32], strides = [1, 1] : tensor<32x32xi32> -> !flow.dispatch.tensor<writeonly:512x256xi32> | |
} | |
} | |
return | |
} | |
} | |
} | |
} | |
// Host-side test driver: fills the input with ones, runs the reduction
// dispatch, and compares its output with a buffer filled with 128.
func.func private @_split_reduction_pass2() { | |
// Byte sizes used below: 67108864 = 512*256*128*4 and 524288 = 512*256*4
// (4 bytes per i32); 1048576 is two 524288-byte halves.
%c1048576 = arith.constant 1048576 : index | |
%c0 = arith.constant 0 : index | |
%c524288 = arith.constant 524288 : index | |
%c67108864 = arith.constant 67108864 : index | |
%c1_i32 = arith.constant 1 : i32 | |
%c128_i32 = arith.constant 128 : i32 | |
%c512 = arith.constant 512 : index | |
%c256 = arith.constant 256 : index | |
%c1 = arith.constant 1 : index | |
// Allocate the transient input and fill all 67108864 bytes with i32 1.
%0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c67108864} | |
%1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c67108864}) { | |
stream.cmd.fill %c1_i32, %arg0[%c0 for %c67108864] : i32 -> !stream.resource<transient>{%c67108864} | |
} => !stream.timepoint | |
%2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c67108864} | |
// The filled resource goes through util.do_not_optimize (presumably an
// optimization barrier — confirm), and its size is re-queried as %4 rather
// than reusing the constant.
%3 = util.do_not_optimize(%2) : !stream.resource<transient> | |
%4 = stream.resource.size %3 : !stream.resource<transient> | |
// One external allocation: bytes [0, 524288) are later exported as the
// dispatch result, bytes [524288, 1048576) as the expected values.
%5 = stream.resource.alloc uninitialized : !stream.resource<external>{%c1048576} | |
%6 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%4}, %5 as %arg1: !stream.resource<external>{%c1048576}) { | |
// The dispatch and the reference fill sit in one stream.cmd.concurrent region.
stream.cmd.concurrent { | |
// NOTE(review): the write-only binding covers the whole 1048576-byte
// resource, which includes the half written by the fill below — confirm the
// dispatch only writes the first 524288 bytes.
stream.cmd.dispatch @_split_reduction_pass2_dispatch_0::@_split_reduction_pass2_dispatch_0_generic_512x256x128[%c512, %c256, %c1] { | |
ro %arg0[%c0 for %4] : !stream.resource<transient>{%4}, | |
wo %arg1[%c0 for %c1048576] : !stream.resource<external>{%c1048576} | |
} attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>]} | |
// Expected values: i32 128 written over the second half.
stream.cmd.fill %c128_i32, %arg1[%c524288 for %c524288] : i32 -> !stream.resource<external>{%c1048576} | |
} | |
} => !stream.timepoint | |
%7 = stream.timepoint.await %6 => %5 : !stream.resource<external>{%c1048576} | |
// Split the resource into its two halves and export each as 512x256xi32.
%8 = stream.resource.subview %7[%c0] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288} | |
%9 = stream.resource.subview %7[%c524288] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288} | |
%10 = stream.tensor.export %8 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32> | |
%11 = stream.tensor.export %9 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32> | |
// Compare the dispatch result (%10) with the 128-filled reference (%11).
check.expect_eq(%10, %11) : tensor<512x256xi32> | |
return | |
} | |
} | |
// -----// IR Dump After LinalgInitTensorToAllocTensor (linalg-init-tensor-to-alloc-tensor) ('builtin.module' operation) //----- // | |
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}> | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#map = affine_map<()[s0] -> (s0 * 32)> | |
#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]> | |
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert> | |
module attributes {hal.device.targets = [#device_target_llvm_cpu]} { | |
// Entry-point wrapper (tagged iree.abi.stub): takes no arguments, calls the
// private @_split_reduction_pass2 and returns nothing.
func.func @split_reduction_pass2() attributes {iree.abi.stub} { | |
call @_split_reduction_pass2() : () -> () | |
return | |
} | |
// Executable with a single variant (@embedded_elf_x86_64). It exports one entry
// point and holds the tensor-level body that add-reduces the innermost
// dimension of a 512x256x128xi32 input into a 512x256xi32 output.
hal.executable private @_split_reduction_pass2_dispatch_0 { | |
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ { | |
// The export region ignores its block arguments and returns the constants (8, 16, 1).
// NOTE(review): presumably the (x, y, z) workgroup count; 8 * 32 = 256 and
// 16 * 32 = 512 match the bounds of the tile loops below - confirm.
hal.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 ordinal(0) layout(#pipeline_layout) attributes {translation_info = #translation} { | |
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index): | |
%c1 = arith.constant 1 : index | |
%c16 = arith.constant 16 : index | |
%c8 = arith.constant 8 : index | |
hal.return %c8, %c16, %c1 : index, index, index | |
} | |
builtin.module { | |
func.func @_split_reduction_pass2_dispatch_0_generic_512x256x128() { | |
%cst = arith.constant dense<0> : vector<1x1x4xi32> | |
%c0_i32 = arith.constant 0 : i32 | |
%c4 = arith.constant 4 : index | |
%c1 = arith.constant 1 : index | |
%c32 = arith.constant 32 : index | |
%c512 = arith.constant 512 : index | |
%c256 = arith.constant 256 : index | |
%c0 = arith.constant 0 : index | |
// Binding 0 is the read-only 512x256x128xi32 input; binding 1 is the
// write-only 512x256xi32 output.
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:512x256x128xi32> | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:512x256xi32> | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_count_x = hal.interface.workgroup.count[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%workgroup_count_y = hal.interface.workgroup.count[1] : index | |
// Tile loop start/step values: workgroup id and workgroup count, each scaled by 32 (#map).
%2 = affine.apply #map()[%workgroup_id_y] | |
%3 = affine.apply #map()[%workgroup_count_y] | |
%4 = affine.apply #map()[%workgroup_id_x] | |
%5 = affine.apply #map()[%workgroup_count_x] | |
// %7 is a 1x1x4 tensor of zeros; it seeds the innermost accumulation loop (%arg9).
%6 = bufferization.alloc_tensor() : tensor<1x1x4xi32> | |
%7 = vector.transfer_write %cst, %6[%c0, %c0, %c0] {in_bounds = [true, true, true]} : vector<1x1x4xi32>, tensor<1x1x4xi32> | |
// %arg0 walks output rows (up to 512) and %arg1 output columns (up to 256) in 32x32 tiles.
scf.for %arg0 = %2 to %c512 step %3 { | |
scf.for %arg1 = %4 to %c256 step %5 { | |
// NOTE(review): %8 is loaded from the writeonly output binding and flows through
// %arg3 -> %arg5 -> %14 -> %arg7 -> %20 -> %26 into the multi_reduction accumulator,
// so the stored result depends on what the output buffer held before the dispatch.
%8 = flow.dispatch.tensor.load %1, offsets = [%arg0, %arg1], sizes = [32, 32], strides = [1, 1] : !flow.dispatch.tensor<writeonly:512x256xi32> -> tensor<32x32xi32> | |
%9 = flow.dispatch.tensor.load %0, offsets = [%arg0, %arg1, 0], sizes = [32, 32, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:512x256x128xi32> -> tensor<32x32x128xi32> | |
// %arg2: row inside the tile, one at a time.
%10 = scf.for %arg2 = %c0 to %c32 step %c1 iter_args(%arg3 = %8) -> (tensor<32x32xi32>) { | |
// %arg4: column inside the tile, four at a time.
%11 = scf.for %arg4 = %c0 to %c32 step %c4 iter_args(%arg5 = %arg3) -> (tensor<32x32xi32>) { | |
%12 = tensor.extract_slice %9[%arg2, %arg4, 0] [1, 4, 128] [1, 1, 1] : tensor<32x32x128xi32> to tensor<4x128xi32> | |
%13 = tensor.expand_shape %12 [[0, 1], [2]] : tensor<4x128xi32> into tensor<1x4x128xi32> | |
%14 = tensor.extract_slice %arg5[%arg2, %arg4] [1, 4] [1, 1] : tensor<32x32xi32> to tensor<4xi32> | |
%15 = tensor.expand_shape %14 [[0, 1]] : tensor<4xi32> into tensor<1x4xi32> | |
// %arg6: one of the four output elements covered by the current slice.
%16 = scf.for %arg6 = %c0 to %c4 step %c1 iter_args(%arg7 = %15) -> (tensor<1x4xi32>) { | |
%19 = tensor.extract_slice %13[0, %arg6, 0] [1, 1, 128] [1, 1, 1] : tensor<1x4x128xi32> to tensor<128xi32> | |
%20 = tensor.extract_slice %arg7[0, %arg6] [1, 1] [1, 1] : tensor<1x4xi32> to tensor<i32> | |
// View the 128 reduction elements as 32 chunks of 4 lanes.
%21 = tensor.expand_shape %19 [[0, 1, 2, 3]] : tensor<128xi32> into tensor<1x1x32x4xi32> | |
// Add the 32 chunks lane-wise into a 4-lane partial sum, starting from the zeros in %7.
%22 = scf.for %arg8 = %c0 to %c32 step %c1 iter_args(%arg9 = %7) -> (tensor<1x1x4xi32>) { | |
%31 = tensor.collapse_shape %arg9 [[0, 1, 2]] : tensor<1x1x4xi32> into tensor<4xi32> | |
%32 = vector.transfer_read %21[%c0, %c0, %arg8, %c0], %c0_i32 {in_bounds = [true]} : tensor<1x1x32x4xi32>, vector<4xi32> | |
%33 = vector.transfer_read %31[%c0], %c0_i32 {in_bounds = [true]} : tensor<4xi32>, vector<4xi32> | |
%34 = arith.addi %32, %33 : vector<4xi32> | |
%35 = vector.transfer_write %34, %31[%c0] {in_bounds = [true]} : vector<4xi32>, tensor<4xi32> | |
%36 = tensor.expand_shape %35 [[0, 1, 2]] : tensor<4xi32> into tensor<1x1x4xi32> | |
scf.yield %36 : tensor<1x1x4xi32> | |
} | |
// Reduce the 4 lanes with <add>, passing the current output element (%26) as the
// accumulator operand, then write the scalar back into the 1x4 slice.
%23 = tensor.collapse_shape %22 [[0, 1, 2]] : tensor<1x1x4xi32> into tensor<4xi32> | |
%24 = vector.transfer_read %23[%c0], %c0_i32 {in_bounds = [true]} : tensor<4xi32>, vector<4xi32> | |
%25 = vector.transfer_read %20[], %c0_i32 : tensor<i32>, vector<i32> | |
%26 = vector.extractelement %25[] : vector<i32> | |
%27 = vector.multi_reduction <add>, %24, %26 [0] : vector<4xi32> to i32 | |
%28 = vector.broadcast %27 : i32 to vector<i32> | |
%29 = vector.transfer_write %28, %20[] : vector<i32>, tensor<i32> | |
%30 = tensor.insert_slice %29 into %arg7[0, %arg6] [1, 1] [1, 1] : tensor<i32> into tensor<1x4xi32> | |
scf.yield %30 : tensor<1x4xi32> | |
} | |
// Put the four updated elements back into the 32x32 tile.
%17 = tensor.collapse_shape %16 [[0, 1]] : tensor<1x4xi32> into tensor<4xi32> | |
%18 = tensor.insert_slice %17 into %arg5[%arg2, %arg4] [1, 4] [1, 1] : tensor<4xi32> into tensor<32x32xi32> | |
scf.yield %18 : tensor<32x32xi32> | |
} | |
scf.yield %11 : tensor<32x32xi32> | |
} | |
// Store the finished 32x32 tile to the output binding at the tile origin.
flow.dispatch.tensor.store %10, %1, offsets = [%arg0, %arg1], sizes = [32, 32], strides = [1, 1] : tensor<32x32xi32> -> !flow.dispatch.tensor<writeonly:512x256xi32> | |
} | |
} | |
return | |
} | |
} | |
} | |
} | |
// Host-side driver: fills an input resource with i32 1s, runs the reduction
// dispatch into the first half of an output resource, fills the second half
// with i32 128, and checks that the two halves are equal.
func.func private @_split_reduction_pass2() { | |
// Byte sizes: 67108864 = 512*256*128*4 (i32 input), 524288 = 512*256*4 (one
// i32 512x256 tensor), 1048576 = 2 * 524288 (both compared tensors).
%c1048576 = arith.constant 1048576 : index | |
%c0 = arith.constant 0 : index | |
%c524288 = arith.constant 524288 : index | |
%c67108864 = arith.constant 67108864 : index | |
%c1_i32 = arith.constant 1 : i32 | |
%c128_i32 = arith.constant 128 : i32 | |
%c512 = arith.constant 512 : index | |
%c256 = arith.constant 256 : index | |
%c1 = arith.constant 1 : index | |
// Input: transient resource whose whole range is filled with the i32 value 1.
%0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c67108864} | |
%1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c67108864}) { | |
stream.cmd.fill %c1_i32, %arg0[%c0 for %c67108864] : i32 -> !stream.resource<transient>{%c67108864} | |
} => !stream.timepoint | |
%2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c67108864} | |
// The util.do_not_optimize result carries no static size, so the size is re-queried.
%3 = util.do_not_optimize(%2) : !stream.resource<transient> | |
%4 = stream.resource.size %3 : !stream.resource<transient> | |
// One external resource backs both compared tensors: bytes [0, 524288) and [524288, 1048576).
// NOTE(review): it is allocated uninitialized and no command visible here writes the
// first half before the dispatch, while the dispatched function in
// @_split_reduction_pass2_dispatch_0 reads its output binding as the reduction seed - confirm intent.
%5 = stream.resource.alloc uninitialized : !stream.resource<external>{%c1048576} | |
%6 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%4}, %5 as %arg1: !stream.resource<external>{%c1048576}) { | |
// NOTE(review): the dispatch binds the full 1048576-byte range of %arg1 as `wo`, which
// overlaps the range of the concurrent fill; the dispatched function types that binding
// as 512x256xi32 at offset 0, so presumably only the first 524288 bytes are touched - confirm.
stream.cmd.concurrent { | |
stream.cmd.dispatch @_split_reduction_pass2_dispatch_0::@_split_reduction_pass2_dispatch_0_generic_512x256x128[%c512, %c256, %c1] { | |
ro %arg0[%c0 for %4] : !stream.resource<transient>{%4}, | |
wo %arg1[%c0 for %c1048576] : !stream.resource<external>{%c1048576} | |
} attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>]} | |
stream.cmd.fill %c128_i32, %arg1[%c524288 for %c524288] : i32 -> !stream.resource<external>{%c1048576} | |
} | |
} => !stream.timepoint | |
%7 = stream.timepoint.await %6 => %5 : !stream.resource<external>{%c1048576} | |
// %8 / %10: first half (dispatch output); %9 / %11: second half (filled with 128).
%8 = stream.resource.subview %7[%c0] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288} | |
%9 = stream.resource.subview %7[%c524288] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288} | |
%10 = stream.tensor.export %8 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32> | |
%11 = stream.tensor.export %9 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32> | |
check.expect_eq(%10, %11) : tensor<512x256xi32> | |
return | |
} | |
} | |
// -----// IR Dump After IREEComprehensiveBufferize (iree-codegen-iree-comprehensive-bufferize) ('builtin.module' operation) //----- // | |
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}> | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#map0 = affine_map<()[s0] -> (s0 * 32)> | |
#map1 = affine_map<(d0, d1, d2) -> (d0, d1, d2)> | |
#map2 = affine_map<() -> ()> | |
#map3 = affine_map<(d0) -> (d0)> | |
#map4 = affine_map<(d0, d1) -> (d0, d1)> | |
#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]> | |
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert> | |
module attributes {hal.device.targets = [#device_target_llvm_cpu]} { | |
// Entry-point wrapper (tagged iree.abi.stub): takes no arguments, calls the
// private @_split_reduction_pass2 and returns nothing.
func.func @split_reduction_pass2() attributes {iree.abi.stub} { | |
call @_split_reduction_pass2() : () -> () | |
return | |
} | |
// Executable with a single variant (@embedded_elf_x86_64). In this dump the
// reduction body operates on memrefs; tensor-typed subspans and
// bufferization.to_tensor ops are still present next to them.
hal.executable private @_split_reduction_pass2_dispatch_0 { | |
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ { | |
// The export region ignores its block arguments and returns the constants (8, 16, 1).
// NOTE(review): presumably the (x, y, z) workgroup count; 8 * 32 = 256 and
// 16 * 32 = 512 match the bounds of the tile loops below - confirm.
hal.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 ordinal(0) layout(#pipeline_layout) attributes {translation_info = #translation} { | |
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index): | |
%c1 = arith.constant 1 : index | |
%c16 = arith.constant 16 : index | |
%c8 = arith.constant 8 : index | |
hal.return %c8, %c16, %c1 : index, index, index | |
} | |
builtin.module { | |
func.func @_split_reduction_pass2_dispatch_0_generic_512x256x128() { | |
%cst = arith.constant dense<0> : vector<1x1x4xi32> | |
%c0_i32 = arith.constant 0 : i32 | |
%c4 = arith.constant 4 : index | |
%c1 = arith.constant 1 : index | |
%c32 = arith.constant 32 : index | |
%c512 = arith.constant 512 : index | |
%c256 = arith.constant 256 : index | |
%c0 = arith.constant 0 : index | |
// %0 / %2 are the memref views of binding 0 (input) and binding 1 (output).
// %1 / %3 are tensor-typed subspans of the same bindings; they have no uses in this function.
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : memref<512x256x128xi32> | |
memref.assume_alignment %0, 64 : memref<512x256x128xi32> | |
%1 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:512x256x128xi32> | |
%2 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : memref<512x256xi32> | |
memref.assume_alignment %2, 64 : memref<512x256xi32> | |
%3 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:512x256xi32> | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_count_x = hal.interface.workgroup.count[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%workgroup_count_y = hal.interface.workgroup.count[1] : index | |
// Tile loop start/step values: workgroup id and workgroup count, each scaled by 32 (#map0).
%4 = affine.apply #map0()[%workgroup_id_y] | |
%5 = affine.apply #map0()[%workgroup_count_y] | |
%6 = affine.apply #map0()[%workgroup_id_x] | |
%7 = affine.apply #map0()[%workgroup_count_x] | |
// %8: 1x1x4 buffer written with zeros once; copied into a fresh accumulator (%43)
// for every output element. The bufferization.to_tensor results in this function
// appear to have no uses in this dump.
%8 = memref.alloca() {alignment = 128 : i64} : memref<1x1x4xi32> | |
%9 = bufferization.to_tensor %8 : memref<1x1x4xi32> | |
vector.transfer_write %cst, %8[%c0, %c0, %c0] {in_bounds = [true, true, true]} : vector<1x1x4xi32>, memref<1x1x4xi32> | |
%10 = bufferization.to_tensor %8 : memref<1x1x4xi32> | |
// %arg0 walks output rows (up to 512) and %arg1 output columns (up to 256) in 32x32 tiles.
scf.for %arg0 = %4 to %c512 step %5 { | |
scf.for %arg1 = %6 to %c256 step %7 { | |
// %11: 32x32 output tile (view of %2); %13: matching 32x32x128 input tile (view of %0).
%11 = memref.subview %2[%arg0, %arg1] [32, 32] [1, 1] : memref<512x256xi32> to memref<32x32xi32, strided<[256, 1], offset: ?>> | |
%12 = bufferization.to_tensor %11 : memref<32x32xi32, strided<[256, 1], offset: ?>> | |
%13 = memref.subview %0[%arg0, %arg1, 0] [32, 32, 128] [1, 1, 1] : memref<512x256x128xi32> to memref<32x32x128xi32, strided<[32768, 128, 1], offset: ?>> | |
%14 = bufferization.to_tensor %13 : memref<32x32x128xi32, strided<[32768, 128, 1], offset: ?>> | |
// %arg2: row inside the tile. The output-tile memref is threaded through as an iter_arg.
%15 = scf.for %arg2 = %c0 to %c32 step %c1 iter_args(%arg3 = %11) -> (memref<32x32xi32, strided<[256, 1], offset: ?>>) { | |
%18 = bufferization.to_tensor %arg3 : memref<32x32xi32, strided<[256, 1], offset: ?>> | |
// %arg4: column inside the tile, four at a time.
%19 = scf.for %arg4 = %c0 to %c32 step %c4 iter_args(%arg5 = %arg3) -> (memref<32x32xi32, strided<[256, 1], offset: ?>>) { | |
%21 = bufferization.to_tensor %arg5 : memref<32x32xi32, strided<[256, 1], offset: ?>> | |
%22 = memref.subview %13[%arg2, %arg4, 0] [1, 4, 128] [1, 1, 1] : memref<32x32x128xi32, strided<[32768, 128, 1], offset: ?>> to memref<4x128xi32, strided<[128, 1], offset: ?>> | |
%23 = bufferization.to_tensor %22 : memref<4x128xi32, strided<[128, 1], offset: ?>> | |
%24 = memref.expand_shape %22 [[0, 1], [2]] : memref<4x128xi32, strided<[128, 1], offset: ?>> into memref<1x4x128xi32, strided<[512, 128, 1], offset: ?>> | |
%25 = bufferization.to_tensor %24 : memref<1x4x128xi32, strided<[512, 128, 1], offset: ?>> | |
%26 = memref.subview %arg5[%arg2, %arg4] [1, 4] [1, 1] : memref<32x32xi32, strided<[256, 1], offset: ?>> to memref<4xi32, strided<[1], offset: ?>> | |
%27 = bufferization.to_tensor %26 : memref<4xi32, strided<[1], offset: ?>> | |
%28 = memref.expand_shape %26 [[0, 1]] : memref<4xi32, strided<[1], offset: ?>> into memref<1x4xi32, strided<[4, 1], offset: ?>> | |
%29 = bufferization.to_tensor %28 : memref<1x4xi32, strided<[4, 1], offset: ?>> | |
// %arg6: one of the four output elements covered by the current slice.
%30 = scf.for %arg6 = %c0 to %c4 step %c1 iter_args(%arg7 = %28) -> (memref<1x4xi32, strided<[4, 1], offset: ?>>) { | |
%36 = bufferization.to_tensor %arg7 : memref<1x4xi32, strided<[4, 1], offset: ?>> | |
%37 = memref.subview %24[0, %arg6, 0] [1, 1, 128] [1, 1, 1] : memref<1x4x128xi32, strided<[512, 128, 1], offset: ?>> to memref<128xi32, strided<[1], offset: ?>> | |
%38 = bufferization.to_tensor %37 : memref<128xi32, strided<[1], offset: ?>> | |
// %39: the single output element being computed (a view into the output tile).
%39 = memref.subview %arg7[0, %arg6] [1, 1] [1, 1] : memref<1x4xi32, strided<[4, 1], offset: ?>> to memref<i32, strided<[], offset: ?>> | |
%40 = bufferization.to_tensor %39 : memref<i32, strided<[], offset: ?>> | |
// View the 128 reduction elements as 32 chunks of 4 lanes.
%41 = memref.expand_shape %37 [[0, 1, 2, 3]] : memref<128xi32, strided<[1], offset: ?>> into memref<1x1x32x4xi32, strided<[128, 128, 4, 1], offset: ?>> | |
%42 = bufferization.to_tensor %41 : memref<1x1x32x4xi32, strided<[128, 128, 4, 1], offset: ?>> | |
// Fresh 1x1x4 accumulator, initialized by an element-wise copy of the zeros in %8.
%43 = memref.alloca() {alignment = 128 : i64} : memref<1x1x4xi32> | |
linalg.generic {indexing_maps = [#map1, #map1], iterator_types = ["parallel", "parallel", "parallel"]} ins(%8 : memref<1x1x4xi32>) outs(%43 : memref<1x1x4xi32>) { | |
^bb0(%arg8: i32, %arg9: i32): | |
linalg.yield %arg8 : i32 | |
} | |
%44 = bufferization.to_tensor %43 : memref<1x1x4xi32> | |
// Add the 32 chunks lane-wise into the accumulator, updating it in place.
%45 = scf.for %arg8 = %c0 to %c32 step %c1 iter_args(%arg9 = %43) -> (memref<1x1x4xi32>) { | |
%57 = bufferization.to_tensor %arg9 : memref<1x1x4xi32> | |
%58 = memref.collapse_shape %arg9 [[0, 1, 2]] : memref<1x1x4xi32> into memref<4xi32> | |
%59 = bufferization.to_tensor %58 : memref<4xi32> | |
%60 = vector.transfer_read %41[%c0, %c0, %arg8, %c0], %c0_i32 {in_bounds = [true]} : memref<1x1x32x4xi32, strided<[128, 128, 4, 1], offset: ?>>, vector<4xi32> | |
%61 = vector.transfer_read %58[%c0], %c0_i32 {in_bounds = [true]} : memref<4xi32>, vector<4xi32> | |
%62 = arith.addi %60, %61 : vector<4xi32> | |
vector.transfer_write %62, %58[%c0] {in_bounds = [true]} : vector<4xi32>, memref<4xi32> | |
%63 = bufferization.to_tensor %58 : memref<4xi32> | |
%64 = memref.expand_shape %58 [[0, 1, 2]] : memref<4xi32> into memref<1x1x4xi32> | |
%65 = bufferization.to_tensor %64 : memref<1x1x4xi32> | |
scf.yield %64 : memref<1x1x4xi32> | |
} | |
%46 = bufferization.to_tensor %45 : memref<1x1x4xi32> | |
%47 = memref.collapse_shape %45 [[0, 1, 2]] : memref<1x1x4xi32> into memref<4xi32> | |
%48 = bufferization.to_tensor %47 : memref<4xi32> | |
// Reduce the 4 lanes with <add>, using the value currently stored at %39 as the
// accumulator operand, and write the scalar back to %39.
// NOTE(review): %39 is a view of the output buffer, so its prior content is added in.
%49 = vector.transfer_read %47[%c0], %c0_i32 {in_bounds = [true]} : memref<4xi32>, vector<4xi32> | |
%50 = vector.transfer_read %39[], %c0_i32 : memref<i32, strided<[], offset: ?>>, vector<i32> | |
%51 = vector.extractelement %50[] : vector<i32> | |
%52 = vector.multi_reduction <add>, %49, %51 [0] : vector<4xi32> to i32 | |
%53 = vector.broadcast %52 : i32 to vector<i32> | |
vector.transfer_write %53, %39[] : vector<i32>, memref<i32, strided<[], offset: ?>> | |
%54 = bufferization.to_tensor %39 : memref<i32, strided<[], offset: ?>> | |
// %55 is the same subview expression as %39, so this copy has identical source and destination.
%55 = memref.subview %arg7[0, %arg6] [1, 1] [1, 1] : memref<1x4xi32, strided<[4, 1], offset: ?>> to memref<i32, strided<[], offset: ?>> | |
linalg.generic {indexing_maps = [#map2, #map2], iterator_types = []} ins(%39 : memref<i32, strided<[], offset: ?>>) outs(%55 : memref<i32, strided<[], offset: ?>>) { | |
^bb0(%arg8: i32, %arg9: i32): | |
linalg.yield %arg8 : i32 | |
} | |
%56 = bufferization.to_tensor %arg7 : memref<1x4xi32, strided<[4, 1], offset: ?>> | |
scf.yield %arg7 : memref<1x4xi32, strided<[4, 1], offset: ?>> | |
} | |
%31 = bufferization.to_tensor %30 : memref<1x4xi32, strided<[4, 1], offset: ?>> | |
%32 = memref.collapse_shape %30 [[0, 1]] : memref<1x4xi32, strided<[4, 1], offset: ?>> into memref<4xi32, strided<[1], offset: ?>> | |
%33 = bufferization.to_tensor %32 : memref<4xi32, strided<[1], offset: ?>> | |
// %34 is the same subview expression as %26, and %32 derives from %28 (an expand of %26)
// through the loop's yielded %arg7, so this copy appears to be onto the same elements.
%34 = memref.subview %arg5[%arg2, %arg4] [1, 4] [1, 1] : memref<32x32xi32, strided<[256, 1], offset: ?>> to memref<4xi32, strided<[1], offset: ?>> | |
linalg.generic {indexing_maps = [#map3, #map3], iterator_types = ["parallel"]} ins(%32 : memref<4xi32, strided<[1], offset: ?>>) outs(%34 : memref<4xi32, strided<[1], offset: ?>>) { | |
^bb0(%arg6: i32, %arg7: i32): | |
linalg.yield %arg6 : i32 | |
} | |
%35 = bufferization.to_tensor %arg5 : memref<32x32xi32, strided<[256, 1], offset: ?>> | |
scf.yield %arg5 : memref<32x32xi32, strided<[256, 1], offset: ?>> | |
} | |
%20 = bufferization.to_tensor %19 : memref<32x32xi32, strided<[256, 1], offset: ?>> | |
scf.yield %19 : memref<32x32xi32, strided<[256, 1], offset: ?>> | |
} | |
%16 = bufferization.to_tensor %15 : memref<32x32xi32, strided<[256, 1], offset: ?>> | |
// %17 is the same subview expression as %11, and %15 is %11 threaded through the loops,
// so this final tile copy also appears to have identical source and destination.
%17 = memref.subview %2[%arg0, %arg1] [32, 32] [1, 1] : memref<512x256xi32> to memref<32x32xi32, strided<[256, 1], offset: ?>> | |
linalg.generic {indexing_maps = [#map4, #map4], iterator_types = ["parallel", "parallel"]} ins(%15 : memref<32x32xi32, strided<[256, 1], offset: ?>>) outs(%17 : memref<32x32xi32, strided<[256, 1], offset: ?>>) { | |
^bb0(%arg2: i32, %arg3: i32): | |
linalg.yield %arg2 : i32 | |
} | |
} | |
} | |
return | |
} | |
} | |
} | |
} | |
// Host-side driver: fills an input resource with i32 1s, runs the reduction
// dispatch into the first half of an output resource, fills the second half
// with i32 128, and checks that the two halves are equal.
func.func private @_split_reduction_pass2() { | |
// Byte sizes: 67108864 = 512*256*128*4 (i32 input), 524288 = 512*256*4 (one
// i32 512x256 tensor), 1048576 = 2 * 524288 (both compared tensors).
%c1048576 = arith.constant 1048576 : index | |
%c0 = arith.constant 0 : index | |
%c524288 = arith.constant 524288 : index | |
%c67108864 = arith.constant 67108864 : index | |
%c1_i32 = arith.constant 1 : i32 | |
%c128_i32 = arith.constant 128 : i32 | |
%c512 = arith.constant 512 : index | |
%c256 = arith.constant 256 : index | |
%c1 = arith.constant 1 : index | |
// Input: transient resource whose whole range is filled with the i32 value 1.
%0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c67108864} | |
%1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c67108864}) { | |
stream.cmd.fill %c1_i32, %arg0[%c0 for %c67108864] : i32 -> !stream.resource<transient>{%c67108864} | |
} => !stream.timepoint | |
%2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c67108864} | |
// The util.do_not_optimize result carries no static size, so the size is re-queried.
%3 = util.do_not_optimize(%2) : !stream.resource<transient> | |
%4 = stream.resource.size %3 : !stream.resource<transient> | |
// One external resource backs both compared tensors: bytes [0, 524288) and [524288, 1048576).
// NOTE(review): it is allocated uninitialized and no command visible here writes the
// first half before the dispatch, while the dispatched function in
// @_split_reduction_pass2_dispatch_0 reads the current output element as the
// multi_reduction accumulator - confirm intent.
%5 = stream.resource.alloc uninitialized : !stream.resource<external>{%c1048576} | |
%6 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%4}, %5 as %arg1: !stream.resource<external>{%c1048576}) { | |
// NOTE(review): the dispatch binds the full 1048576-byte range of %arg1 as `wo`, which
// overlaps the range of the concurrent fill; the dispatched function types that binding
// as memref<512x256xi32> at offset 0, so presumably only the first 524288 bytes are touched - confirm.
stream.cmd.concurrent { | |
stream.cmd.dispatch @_split_reduction_pass2_dispatch_0::@_split_reduction_pass2_dispatch_0_generic_512x256x128[%c512, %c256, %c1] { | |
ro %arg0[%c0 for %4] : !stream.resource<transient>{%4}, | |
wo %arg1[%c0 for %c1048576] : !stream.resource<external>{%c1048576} | |
} attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>]} | |
stream.cmd.fill %c128_i32, %arg1[%c524288 for %c524288] : i32 -> !stream.resource<external>{%c1048576} | |
} | |
} => !stream.timepoint | |
%7 = stream.timepoint.await %6 => %5 : !stream.resource<external>{%c1048576} | |
// %8 / %10: first half (dispatch output); %9 / %11: second half (filled with 128).
%8 = stream.resource.subview %7[%c0] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288} | |
%9 = stream.resource.subview %7[%c524288] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288} | |
%10 = stream.tensor.export %8 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32> | |
%11 = stream.tensor.export %9 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32> | |
check.expect_eq(%10, %11) : tensor<512x256xi32> | |
return | |
} | |
} | |
// -----// IR Dump After ResolveShapedTypeResultDims (resolve-shaped-type-result-dims) ('builtin.module' operation) //----- // | |
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}> | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#map0 = affine_map<()[s0] -> (s0 * 32)> | |
#map1 = affine_map<(d0, d1, d2) -> (d0, d1, d2)> | |
#map2 = affine_map<() -> ()> | |
#map3 = affine_map<(d0) -> (d0)> | |
#map4 = affine_map<(d0, d1) -> (d0, d1)> | |
#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]> | |
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert> | |
module attributes {hal.device.targets = [#device_target_llvm_cpu]} { | |
// Entry-point wrapper (tagged iree.abi.stub): takes no arguments, calls the
// private @_split_reduction_pass2 and returns nothing.
func.func @split_reduction_pass2() attributes {iree.abi.stub} { | |
call @_split_reduction_pass2() : () -> () | |
return | |
} | |
// Executable with a single variant (@embedded_elf_x86_64). The reduction body
// in this dump is purely memref-based: it add-reduces the innermost dimension
// of a 512x256x128xi32 buffer into a 512x256xi32 buffer.
hal.executable private @_split_reduction_pass2_dispatch_0 { | |
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ { | |
// The export region ignores its block arguments and returns the constants (8, 16, 1).
// NOTE(review): presumably the (x, y, z) workgroup count; 8 * 32 = 256 and
// 16 * 32 = 512 match the bounds of the tile loops below - confirm.
hal.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 ordinal(0) layout(#pipeline_layout) attributes {translation_info = #translation} { | |
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index): | |
%c1 = arith.constant 1 : index | |
%c16 = arith.constant 16 : index | |
%c8 = arith.constant 8 : index | |
hal.return %c8, %c16, %c1 : index, index, index | |
} | |
builtin.module { | |
func.func @_split_reduction_pass2_dispatch_0_generic_512x256x128() { | |
%cst = arith.constant dense<0> : vector<1x1x4xi32> | |
%c0_i32 = arith.constant 0 : i32 | |
%c4 = arith.constant 4 : index | |
%c1 = arith.constant 1 : index | |
%c32 = arith.constant 32 : index | |
%c512 = arith.constant 512 : index | |
%c256 = arith.constant 256 : index | |
%c0 = arith.constant 0 : index | |
// %0 / %2 are the memref views of binding 0 (input) and binding 1 (output).
// %1 / %3 are tensor-typed subspans of the same bindings; they have no uses in this function.
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : memref<512x256x128xi32> | |
memref.assume_alignment %0, 64 : memref<512x256x128xi32> | |
%1 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:512x256x128xi32> | |
%2 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : memref<512x256xi32> | |
memref.assume_alignment %2, 64 : memref<512x256xi32> | |
%3 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:512x256xi32> | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_count_x = hal.interface.workgroup.count[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%workgroup_count_y = hal.interface.workgroup.count[1] : index | |
// Tile loop start/step values: workgroup id and workgroup count, each scaled by 32 (#map0).
%4 = affine.apply #map0()[%workgroup_id_y] | |
%5 = affine.apply #map0()[%workgroup_count_y] | |
%6 = affine.apply #map0()[%workgroup_id_x] | |
%7 = affine.apply #map0()[%workgroup_count_x] | |
// %8: 1x1x4 buffer written with zeros once; copied into a fresh accumulator (%24)
// for every output element.
%8 = memref.alloca() {alignment = 128 : i64} : memref<1x1x4xi32> | |
vector.transfer_write %cst, %8[%c0, %c0, %c0] {in_bounds = [true, true, true]} : vector<1x1x4xi32>, memref<1x1x4xi32> | |
// %arg0 walks output rows (up to 512) and %arg1 output columns (up to 256) in 32x32 tiles.
scf.for %arg0 = %4 to %c512 step %5 { | |
scf.for %arg1 = %6 to %c256 step %7 { | |
// %9: 32x32 output tile (view of %2); %10: matching 32x32x128 input tile (view of %0).
%9 = memref.subview %2[%arg0, %arg1] [32, 32] [1, 1] : memref<512x256xi32> to memref<32x32xi32, strided<[256, 1], offset: ?>> | |
%10 = memref.subview %0[%arg0, %arg1, 0] [32, 32, 128] [1, 1, 1] : memref<512x256x128xi32> to memref<32x32x128xi32, strided<[32768, 128, 1], offset: ?>> | |
// %arg2: row inside the tile. The output-tile memref is threaded through as an iter_arg.
%11 = scf.for %arg2 = %c0 to %c32 step %c1 iter_args(%arg3 = %9) -> (memref<32x32xi32, strided<[256, 1], offset: ?>>) { | |
// %arg4: column inside the tile, four at a time.
%13 = scf.for %arg4 = %c0 to %c32 step %c4 iter_args(%arg5 = %arg3) -> (memref<32x32xi32, strided<[256, 1], offset: ?>>) { | |
%14 = memref.subview %10[%arg2, %arg4, 0] [1, 4, 128] [1, 1, 1] : memref<32x32x128xi32, strided<[32768, 128, 1], offset: ?>> to memref<4x128xi32, strided<[128, 1], offset: ?>> | |
%15 = memref.expand_shape %14 [[0, 1], [2]] : memref<4x128xi32, strided<[128, 1], offset: ?>> into memref<1x4x128xi32, strided<[512, 128, 1], offset: ?>> | |
%16 = memref.subview %arg5[%arg2, %arg4] [1, 4] [1, 1] : memref<32x32xi32, strided<[256, 1], offset: ?>> to memref<4xi32, strided<[1], offset: ?>> | |
%17 = memref.expand_shape %16 [[0, 1]] : memref<4xi32, strided<[1], offset: ?>> into memref<1x4xi32, strided<[4, 1], offset: ?>> | |
// %arg6: one of the four output elements covered by the current slice.
%18 = scf.for %arg6 = %c0 to %c4 step %c1 iter_args(%arg7 = %17) -> (memref<1x4xi32, strided<[4, 1], offset: ?>>) { | |
%21 = memref.subview %15[0, %arg6, 0] [1, 1, 128] [1, 1, 1] : memref<1x4x128xi32, strided<[512, 128, 1], offset: ?>> to memref<128xi32, strided<[1], offset: ?>> | |
// %22: the single output element being computed (a view into the output tile).
%22 = memref.subview %arg7[0, %arg6] [1, 1] [1, 1] : memref<1x4xi32, strided<[4, 1], offset: ?>> to memref<i32, strided<[], offset: ?>> | |
// View the 128 reduction elements as 32 chunks of 4 lanes.
%23 = memref.expand_shape %21 [[0, 1, 2, 3]] : memref<128xi32, strided<[1], offset: ?>> into memref<1x1x32x4xi32, strided<[128, 128, 4, 1], offset: ?>> | |
// Fresh 1x1x4 accumulator, initialized by an element-wise copy of the zeros in %8.
%24 = memref.alloca() {alignment = 128 : i64} : memref<1x1x4xi32> | |
linalg.generic {indexing_maps = [#map1, #map1], iterator_types = ["parallel", "parallel", "parallel"]} ins(%8 : memref<1x1x4xi32>) outs(%24 : memref<1x1x4xi32>) { | |
^bb0(%arg8: i32, %arg9: i32): | |
linalg.yield %arg8 : i32 | |
} | |
// Add the 32 chunks lane-wise into the accumulator in place; the loop yields the same memref.
%25 = scf.for %arg8 = %c0 to %c32 step %c1 iter_args(%arg9 = %24) -> (memref<1x1x4xi32>) { | |
%33 = memref.collapse_shape %arg9 [[0, 1, 2]] : memref<1x1x4xi32> into memref<4xi32> | |
%34 = vector.transfer_read %23[%c0, %c0, %arg8, %c0], %c0_i32 {in_bounds = [true]} : memref<1x1x32x4xi32, strided<[128, 128, 4, 1], offset: ?>>, vector<4xi32> | |
%35 = vector.transfer_read %33[%c0], %c0_i32 {in_bounds = [true]} : memref<4xi32>, vector<4xi32> | |
%36 = arith.addi %34, %35 : vector<4xi32> | |
vector.transfer_write %36, %33[%c0] {in_bounds = [true]} : vector<4xi32>, memref<4xi32> | |
scf.yield %arg9 : memref<1x1x4xi32> | |
} | |
// Reduce the 4 lanes with <add>, using the value currently stored at %22 as the
// accumulator operand, and write the scalar back to %22.
// NOTE(review): %22 is a view of the output buffer, so its prior content is added in.
%26 = memref.collapse_shape %25 [[0, 1, 2]] : memref<1x1x4xi32> into memref<4xi32> | |
%27 = vector.transfer_read %26[%c0], %c0_i32 {in_bounds = [true]} : memref<4xi32>, vector<4xi32> | |
%28 = vector.transfer_read %22[], %c0_i32 : memref<i32, strided<[], offset: ?>>, vector<i32> | |
%29 = vector.extractelement %28[] : vector<i32> | |
%30 = vector.multi_reduction <add>, %27, %29 [0] : vector<4xi32> to i32 | |
%31 = vector.broadcast %30 : i32 to vector<i32> | |
vector.transfer_write %31, %22[] : vector<i32>, memref<i32, strided<[], offset: ?>> | |
// %32 is the same subview expression as %22, so this copy has identical source and destination.
%32 = memref.subview %arg7[0, %arg6] [1, 1] [1, 1] : memref<1x4xi32, strided<[4, 1], offset: ?>> to memref<i32, strided<[], offset: ?>> | |
linalg.generic {indexing_maps = [#map2, #map2], iterator_types = []} ins(%22 : memref<i32, strided<[], offset: ?>>) outs(%32 : memref<i32, strided<[], offset: ?>>) { | |
^bb0(%arg8: i32, %arg9: i32): | |
linalg.yield %arg8 : i32 | |
} | |
scf.yield %arg7 : memref<1x4xi32, strided<[4, 1], offset: ?>> | |
} | |
// %20 is the same subview expression as %16, and %19 derives from %17 (an expand of %16)
// through the loop's yielded %arg7, so this copy appears to be onto the same elements.
%19 = memref.collapse_shape %18 [[0, 1]] : memref<1x4xi32, strided<[4, 1], offset: ?>> into memref<4xi32, strided<[1], offset: ?>> | |
%20 = memref.subview %arg5[%arg2, %arg4] [1, 4] [1, 1] : memref<32x32xi32, strided<[256, 1], offset: ?>> to memref<4xi32, strided<[1], offset: ?>> | |
linalg.generic {indexing_maps = [#map3, #map3], iterator_types = ["parallel"]} ins(%19 : memref<4xi32, strided<[1], offset: ?>>) outs(%20 : memref<4xi32, strided<[1], offset: ?>>) { | |
^bb0(%arg6: i32, %arg7: i32): | |
linalg.yield %arg6 : i32 | |
} | |
scf.yield %arg5 : memref<32x32xi32, strided<[256, 1], offset: ?>> | |
} | |
scf.yield %13 : memref<32x32xi32, strided<[256, 1], offset: ?>> | |
} | |
// %12 is the same subview expression as %9, and %11 is %9 threaded through the loops,
// so this final tile copy also appears to have identical source and destination.
%12 = memref.subview %2[%arg0, %arg1] [32, 32] [1, 1] : memref<512x256xi32> to memref<32x32xi32, strided<[256, 1], offset: ?>> | |
linalg.generic {indexing_maps = [#map4, #map4], iterator_types = ["parallel", "parallel"]} ins(%11 : memref<32x32xi32, strided<[256, 1], offset: ?>>) outs(%12 : memref<32x32xi32, strided<[256, 1], offset: ?>>) { | |
^bb0(%arg2: i32, %arg3: i32): | |
linalg.yield %arg2 : i32 | |
} | |
} | |
} | |
return | |
} | |
} | |
} | |
} | |
// Host-side driver: fills an input resource with i32 1s, runs the reduction
// dispatch into the first half of an output resource, fills the second half
// with i32 128, and checks that the two halves are equal.
func.func private @_split_reduction_pass2() { | |
// Byte sizes: 67108864 = 512*256*128*4 (i32 input), 524288 = 512*256*4 (one
// i32 512x256 tensor), 1048576 = 2 * 524288 (both compared tensors).
%c1048576 = arith.constant 1048576 : index | |
%c0 = arith.constant 0 : index | |
%c524288 = arith.constant 524288 : index | |
%c67108864 = arith.constant 67108864 : index | |
%c1_i32 = arith.constant 1 : i32 | |
%c128_i32 = arith.constant 128 : i32 | |
%c512 = arith.constant 512 : index | |
%c256 = arith.constant 256 : index | |
%c1 = arith.constant 1 : index | |
// Input: transient resource whose whole range is filled with the i32 value 1.
%0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c67108864} | |
%1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c67108864}) { | |
stream.cmd.fill %c1_i32, %arg0[%c0 for %c67108864] : i32 -> !stream.resource<transient>{%c67108864} | |
} => !stream.timepoint | |
%2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c67108864} | |
// The util.do_not_optimize result carries no static size, so the size is re-queried.
%3 = util.do_not_optimize(%2) : !stream.resource<transient> | |
%4 = stream.resource.size %3 : !stream.resource<transient> | |
// One external resource backs both compared tensors: bytes [0, 524288) and [524288, 1048576).
// NOTE(review): it is allocated uninitialized and no command visible here writes the
// first half before the dispatch, while the dispatched function in
// @_split_reduction_pass2_dispatch_0 reads the current output element as the
// multi_reduction accumulator - confirm intent.
%5 = stream.resource.alloc uninitialized : !stream.resource<external>{%c1048576} | |
%6 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%4}, %5 as %arg1: !stream.resource<external>{%c1048576}) { | |
// NOTE(review): the dispatch binds the full 1048576-byte range of %arg1 as `wo`, which
// overlaps the range of the concurrent fill; the dispatched function types that binding
// as memref<512x256xi32> at offset 0, so presumably only the first 524288 bytes are touched - confirm.
stream.cmd.concurrent { | |
stream.cmd.dispatch @_split_reduction_pass2_dispatch_0::@_split_reduction_pass2_dispatch_0_generic_512x256x128[%c512, %c256, %c1] { | |
ro %arg0[%c0 for %4] : !stream.resource<transient>{%4}, | |
wo %arg1[%c0 for %c1048576] : !stream.resource<external>{%c1048576} | |
} attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>]} | |
stream.cmd.fill %c128_i32, %arg1[%c524288 for %c524288] : i32 -> !stream.resource<external>{%c1048576} | |
} | |
} => !stream.timepoint | |
%7 = stream.timepoint.await %6 => %5 : !stream.resource<external>{%c1048576} | |
// %8 / %10: first half (dispatch output); %9 / %11: second half (filled with 128).
%8 = stream.resource.subview %7[%c0] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288} | |
%9 = stream.resource.subview %7[%c524288] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288} | |
%10 = stream.tensor.export %8 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32> | |
%11 = stream.tensor.export %9 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32> | |
check.expect_eq(%10, %11) : tensor<512x256xi32> | |
return | |
} | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) ('func.func' operation: @_split_reduction_pass2_dispatch_0_generic_512x256x128) //----- // | |
// Attribute aliases referenced by the module that follows.
// Device target "llvm-cpu" carrying a single embedded-elf-x86_64 executable target (legacy_sync).
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}> | |
// The same executable target on its own; it is the target of the hal.executable.variant below.
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
// Multiplies a symbol by 32; applied to the workgroup ids and counts in the dispatch function.
#map0 = affine_map<()[s0] -> (s0 * 32)> | |
// Identity indexing maps of rank 3, 0, 1 and 2, used by the copy-style linalg.generic ops below.
#map1 = affine_map<(d0, d1, d2) -> (d0, d1, d2)> | |
#map2 = affine_map<() -> ()> | |
#map3 = affine_map<(d0) -> (d0)> | |
#map4 = affine_map<(d0, d1) -> (d0, d1)> | |
// No push constants; one set (0) with two storage-buffer bindings, binding 0 flagged ReadOnly.
#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]> | |
// Translation info attached to the executable export.
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert> | |
module attributes {hal.device.targets = [#device_target_llvm_cpu]} { | |
// Public entry point marked iree.abi.stub: takes no arguments, forwards to the
// private @_split_reduction_pass2 and returns nothing.
func.func @split_reduction_pass2() attributes {iree.abi.stub} { | |
call @_split_reduction_pass2() : () -> () | |
return | |
} | |
// Executable holding the single dispatch of this test: for every (row, col) of a
// 512x256 output it sums the 128 i32 values of the matching input row.
hal.executable private @_split_reduction_pass2_dispatch_0 { | |
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ { | |
// Export region: returns the workgroup count (8, 16, 1). With the 32-wide tiles used
// below, 8 = 256/32 and 16 = 512/32 — presumably one workgroup per output tile.
hal.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 ordinal(0) layout(#pipeline_layout) attributes {translation_info = #translation} { | |
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index): | |
%c1 = arith.constant 1 : index | |
%c16 = arith.constant 16 : index | |
%c8 = arith.constant 8 : index | |
hal.return %c8, %c16, %c1 : index, index, index | |
} | |
builtin.module { | |
func.func @_split_reduction_pass2_dispatch_0_generic_512x256x128() { | |
%cst = arith.constant dense<0> : vector<1x1x4xi32> | |
%c0_i32 = arith.constant 0 : i32 | |
%c4 = arith.constant 4 : index | |
%c1 = arith.constant 1 : index | |
%c32 = arith.constant 32 : index | |
%c512 = arith.constant 512 : index | |
%c256 = arith.constant 256 : index | |
%c0 = arith.constant 0 : index | |
// Binding 0 (input) and binding 1 (output) are each materialised twice: once as a
// memref (%0, %2) and once as a !flow.dispatch.tensor (%1, %3). %1 and %3 have no
// uses in this function.
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : memref<512x256x128xi32> | |
memref.assume_alignment %0, 64 : memref<512x256x128xi32> | |
%1 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:512x256x128xi32> | |
%2 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : memref<512x256xi32> | |
memref.assume_alignment %2, 64 : memref<512x256xi32> | |
%3 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:512x256xi32> | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_count_x = hal.interface.workgroup.count[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%workgroup_count_y = hal.interface.workgroup.count[1] : index | |
// Loop start = workgroup id * 32, loop step = workgroup count * 32 (via #map0).
%4 = affine.apply #map0()[%workgroup_id_y] | |
%5 = affine.apply #map0()[%workgroup_count_y] | |
%6 = affine.apply #map0()[%workgroup_id_x] | |
%7 = affine.apply #map0()[%workgroup_count_x] | |
// %8: 1x1x4 stack buffer written once with zeros; it is the source from which each
// per-element accumulator is initialised below.
%8 = memref.alloca() {alignment = 128 : i64} : memref<1x1x4xi32> | |
vector.transfer_write %cst, %8[%c0, %c0, %c0] {in_bounds = [true, true, true]} : vector<1x1x4xi32>, memref<1x1x4xi32> | |
// Workgroup-distributed loops over 32x32 output tiles: y covers rows (bound 512),
// x covers columns (bound 256).
scf.for %arg0 = %4 to %c512 step %5 { | |
scf.for %arg1 = %6 to %c256 step %7 { | |
// %9: 32x32 output tile; %10: matching 32x32x128 input tile.
%9 = memref.subview %2[%arg0, %arg1] [32, 32] [1, 1] : memref<512x256xi32> to memref<32x32xi32, strided<[256, 1], offset: ?>> | |
%10 = memref.subview %0[%arg0, %arg1, 0] [32, 32, 128] [1, 1, 1] : memref<512x256x128xi32> to memref<32x32x128xi32, strided<[32768, 128, 1], offset: ?>> | |
// Within a tile: one row at a time (%arg2), four columns at a time (%arg3).
scf.for %arg2 = %c0 to %c32 step %c1 { | |
scf.for %arg3 = %c0 to %c32 step %c4 { | |
%12 = memref.subview %10[%arg2, %arg3, 0] [1, 4, 128] [1, 1, 1] : memref<32x32x128xi32, strided<[32768, 128, 1], offset: ?>> to memref<4x128xi32, strided<[128, 1], offset: ?>> | |
%13 = memref.expand_shape %12 [[0, 1], [2]] : memref<4x128xi32, strided<[128, 1], offset: ?>> into memref<1x4x128xi32, strided<[512, 128, 1], offset: ?>> | |
%14 = memref.subview %9[%arg2, %arg3] [1, 4] [1, 1] : memref<32x32xi32, strided<[256, 1], offset: ?>> to memref<4xi32, strided<[1], offset: ?>> | |
%15 = memref.expand_shape %14 [[0, 1]] : memref<4xi32, strided<[1], offset: ?>> into memref<1x4xi32, strided<[4, 1], offset: ?>> | |
// One output element per iteration of %arg4.
scf.for %arg4 = %c0 to %c4 step %c1 { | |
// %17: the 128 input values to reduce; %18: the rank-0 view of the output element;
// %19: the same 128 values viewed as 32 groups of 4.
%17 = memref.subview %13[0, %arg4, 0] [1, 1, 128] [1, 1, 1] : memref<1x4x128xi32, strided<[512, 128, 1], offset: ?>> to memref<128xi32, strided<[1], offset: ?>> | |
%18 = memref.subview %15[0, %arg4] [1, 1] [1, 1] : memref<1x4xi32, strided<[4, 1], offset: ?>> to memref<i32, strided<[], offset: ?>> | |
%19 = memref.expand_shape %17 [[0, 1, 2, 3]] : memref<128xi32, strided<[1], offset: ?>> into memref<1x1x32x4xi32, strided<[128, 128, 4, 1], offset: ?>> | |
// %20: fresh 4-lane accumulator, initialised by copying the zero buffer %8 into it.
%20 = memref.alloca() {alignment = 128 : i64} : memref<1x1x4xi32> | |
linalg.generic {indexing_maps = [#map1, #map1], iterator_types = ["parallel", "parallel", "parallel"]} ins(%8 : memref<1x1x4xi32>) outs(%20 : memref<1x1x4xi32>) { | |
^bb0(%arg5: i32, %arg6: i32): | |
linalg.yield %arg5 : i32 | |
} | |
// 32 steps: load 4 input values, add them lane-wise to the accumulator, store it back.
scf.for %arg5 = %c0 to %c32 step %c1 { | |
%28 = memref.collapse_shape %20 [[0, 1, 2]] : memref<1x1x4xi32> into memref<4xi32> | |
%29 = vector.transfer_read %19[%c0, %c0, %arg5, %c0], %c0_i32 {in_bounds = [true]} : memref<1x1x32x4xi32, strided<[128, 128, 4, 1], offset: ?>>, vector<4xi32> | |
%30 = vector.transfer_read %28[%c0], %c0_i32 {in_bounds = [true]} : memref<4xi32>, vector<4xi32> | |
%31 = arith.addi %29, %30 : vector<4xi32> | |
vector.transfer_write %31, %28[%c0] {in_bounds = [true]} : vector<4xi32>, memref<4xi32> | |
} | |
// Final step: sum the 4 lanes, using the current value of the output element (%24)
// as the reduction's accumulator operand, and write the result to that element.
// NOTE(review): the output element is read here before this function ever writes it.
// In @_split_reduction_pass2 the buffer bound to binding 1 is allocated
// `uninitialized` and bound `wo`, so the result appears to include whatever the
// buffer held. The input program at the top of the file passes the unfilled
// init_tensor (not the linalg.fill result) as `outs`, which looks like the origin —
// confirm.
%21 = memref.collapse_shape %20 [[0, 1, 2]] : memref<1x1x4xi32> into memref<4xi32> | |
%22 = vector.transfer_read %21[%c0], %c0_i32 {in_bounds = [true]} : memref<4xi32>, vector<4xi32> | |
%23 = vector.transfer_read %18[], %c0_i32 : memref<i32, strided<[], offset: ?>>, vector<i32> | |
%24 = vector.extractelement %23[] : vector<i32> | |
%25 = vector.multi_reduction <add>, %22, %24 [0] : vector<4xi32> to i32 | |
%26 = vector.broadcast %25 : i32 to vector<i32> | |
vector.transfer_write %26, %18[] : vector<i32>, memref<i32, strided<[], offset: ?>> | |
// %27 is built by the same subview op as %18 (same source, offsets, sizes), so this
// generic copies the element onto the same location.
%27 = memref.subview %15[0, %arg4] [1, 1] [1, 1] : memref<1x4xi32, strided<[4, 1], offset: ?>> to memref<i32, strided<[], offset: ?>> | |
linalg.generic {indexing_maps = [#map2, #map2], iterator_types = []} ins(%18 : memref<i32, strided<[], offset: ?>>) outs(%27 : memref<i32, strided<[], offset: ?>>) { | |
^bb0(%arg5: i32, %arg6: i32): | |
linalg.yield %arg5 : i32 | |
} | |
} | |
// Same pattern one level up: %16 repeats the subview that produced %14, and the
// generic copies the 4-element slice onto it.
%16 = memref.subview %9[%arg2, %arg3] [1, 4] [1, 1] : memref<32x32xi32, strided<[256, 1], offset: ?>> to memref<4xi32, strided<[1], offset: ?>> | |
linalg.generic {indexing_maps = [#map3, #map3], iterator_types = ["parallel"]} ins(%14 : memref<4xi32, strided<[1], offset: ?>>) outs(%16 : memref<4xi32, strided<[1], offset: ?>>) { | |
^bb0(%arg4: i32, %arg5: i32): | |
linalg.yield %arg4 : i32 | |
} | |
} | |
} | |
// And for the whole tile: %11 repeats the subview that produced %9.
%11 = memref.subview %2[%arg0, %arg1] [32, 32] [1, 1] : memref<512x256xi32> to memref<32x32xi32, strided<[256, 1], offset: ?>> | |
linalg.generic {indexing_maps = [#map4, #map4], iterator_types = ["parallel", "parallel"]} ins(%9 : memref<32x32xi32, strided<[256, 1], offset: ?>>) outs(%11 : memref<32x32xi32, strided<[256, 1], offset: ?>>) { | |
^bb0(%arg2: i32, %arg3: i32): | |
linalg.yield %arg2 : i32 | |
} | |
} | |
} | |
return | |
} | |
} | |
} | |
} | |
// Host-side driver of the test: builds the input, runs the dispatch, and compares the
// dispatch result against a buffer filled with the expected value 128.
func.func private @_split_reduction_pass2() { | |
// Byte sizes: 67108864 = 512*256*128 i32 elements * 4 bytes (input);
// 524288 = 512*256 * 4 bytes (one output tensor); 1048576 = two such tensors.
%c1048576 = arith.constant 1048576 : index | |
%c0 = arith.constant 0 : index | |
%c524288 = arith.constant 524288 : index | |
%c67108864 = arith.constant 67108864 : index | |
%c1_i32 = arith.constant 1 : i32 | |
%c128_i32 = arith.constant 128 : i32 | |
%c512 = arith.constant 512 : index | |
%c256 = arith.constant 256 : index | |
%c1 = arith.constant 1 : index | |
// Input: transient resource filled entirely with i32 1.
%0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c67108864} | |
%1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c67108864}) { | |
stream.cmd.fill %c1_i32, %arg0[%c0 for %c67108864] : i32 -> !stream.resource<transient>{%c67108864} | |
} => !stream.timepoint | |
%2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c67108864} | |
// The input passes through util.do_not_optimize, and its size is re-queried from the
// result rather than reusing the constant.
%3 = util.do_not_optimize(%2) : !stream.resource<transient> | |
%4 = stream.resource.size %3 : !stream.resource<transient> | |
// Output: one external resource, allocated uninitialized, holding both tensors that
// are compared at the end.
%5 = stream.resource.alloc uninitialized : !stream.resource<external>{%c1048576} | |
%6 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%4}, %5 as %arg1: !stream.resource<external>{%c1048576}) { | |
stream.cmd.concurrent { | |
// Dispatch with workload (512, 256, 1): input bound read-only, output bound
// write-only over the full 1048576-byte range.
// NOTE(review): that range overlaps the region written by the fill below, and both
// commands sit in the same stream.cmd.concurrent. The dispatch function's output
// memref is 512x256xi32 at offset 0, so it presumably only touches the lower half —
// confirm.
stream.cmd.dispatch @_split_reduction_pass2_dispatch_0::@_split_reduction_pass2_dispatch_0_generic_512x256x128[%c512, %c256, %c1] { | |
ro %arg0[%c0 for %4] : !stream.resource<transient>{%4}, | |
wo %arg1[%c0 for %c1048576] : !stream.resource<external>{%c1048576} | |
} attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>]} | |
// Upper half [524288, 1048576) is filled with the expected value 128.
stream.cmd.fill %c128_i32, %arg1[%c524288 for %c524288] : i32 -> !stream.resource<external>{%c1048576} | |
} | |
} => !stream.timepoint | |
%7 = stream.timepoint.await %6 => %5 : !stream.resource<external>{%c1048576} | |
// Lower half = dispatch result (%10); upper half = expected constant (%11).
%8 = stream.resource.subview %7[%c0] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288} | |
%9 = stream.resource.subview %7[%c524288] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288} | |
%10 = stream.tensor.export %8 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32> | |
%11 = stream.tensor.export %9 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32> | |
check.expect_eq(%10, %11) : tensor<512x256xi32> | |
return | |
} | |
} | |
// -----// IR Dump After CSE (cse) ('func.func' operation: @_split_reduction_pass2_dispatch_0_generic_512x256x128) //----- // | |
// Attribute aliases referenced by the module that follows.
// Device target "llvm-cpu" carrying a single embedded-elf-x86_64 executable target (legacy_sync).
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}> | |
// The same executable target on its own; it is the target of the hal.executable.variant below.
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
// Multiplies a symbol by 32; applied to the workgroup ids and counts in the dispatch function.
#map0 = affine_map<()[s0] -> (s0 * 32)> | |
// Identity indexing maps of rank 3, 0, 1 and 2, used by the copy-style linalg.generic ops below.
#map1 = affine_map<(d0, d1, d2) -> (d0, d1, d2)> | |
#map2 = affine_map<() -> ()> | |
#map3 = affine_map<(d0) -> (d0)> | |
#map4 = affine_map<(d0, d1) -> (d0, d1)> | |
// No push constants; one set (0) with two storage-buffer bindings, binding 0 flagged ReadOnly.
#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]> | |
// Translation info attached to the executable export.
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert> | |
module attributes {hal.device.targets = [#device_target_llvm_cpu]} { | |
// Public entry point marked iree.abi.stub: takes no arguments, forwards to the
// private @_split_reduction_pass2 and returns nothing.
func.func @split_reduction_pass2() attributes {iree.abi.stub} { | |
call @_split_reduction_pass2() : () -> () | |
return | |
} | |
// Executable holding the single dispatch of this test: for every (row, col) of a
// 512x256 output it sums the 128 i32 values of the matching input row.
hal.executable private @_split_reduction_pass2_dispatch_0 { | |
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ { | |
// Export region: returns the workgroup count (8, 16, 1). With the 32-wide tiles used
// below, 8 = 256/32 and 16 = 512/32 — presumably one workgroup per output tile.
hal.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 ordinal(0) layout(#pipeline_layout) attributes {translation_info = #translation} { | |
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index): | |
%c1 = arith.constant 1 : index | |
%c16 = arith.constant 16 : index | |
%c8 = arith.constant 8 : index | |
hal.return %c8, %c16, %c1 : index, index, index | |
} | |
builtin.module { | |
func.func @_split_reduction_pass2_dispatch_0_generic_512x256x128() { | |
%cst = arith.constant dense<0> : vector<1x1x4xi32> | |
%c0_i32 = arith.constant 0 : i32 | |
%c4 = arith.constant 4 : index | |
%c1 = arith.constant 1 : index | |
%c32 = arith.constant 32 : index | |
%c512 = arith.constant 512 : index | |
%c256 = arith.constant 256 : index | |
%c0 = arith.constant 0 : index | |
// Binding 0 (input) and binding 1 (output) are each materialised twice: once as a
// memref (%0, %2) and once as a !flow.dispatch.tensor (%1, %3). %1 and %3 have no
// uses in this function.
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : memref<512x256x128xi32> | |
memref.assume_alignment %0, 64 : memref<512x256x128xi32> | |
%1 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:512x256x128xi32> | |
%2 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : memref<512x256xi32> | |
memref.assume_alignment %2, 64 : memref<512x256xi32> | |
%3 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:512x256xi32> | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_count_x = hal.interface.workgroup.count[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%workgroup_count_y = hal.interface.workgroup.count[1] : index | |
// Loop start = workgroup id * 32, loop step = workgroup count * 32 (via #map0).
%4 = affine.apply #map0()[%workgroup_id_y] | |
%5 = affine.apply #map0()[%workgroup_count_y] | |
%6 = affine.apply #map0()[%workgroup_id_x] | |
%7 = affine.apply #map0()[%workgroup_count_x] | |
// %8: 1x1x4 stack buffer written once with zeros; it is the source from which each
// per-element accumulator is initialised below.
%8 = memref.alloca() {alignment = 128 : i64} : memref<1x1x4xi32> | |
vector.transfer_write %cst, %8[%c0, %c0, %c0] {in_bounds = [true, true, true]} : vector<1x1x4xi32>, memref<1x1x4xi32> | |
// Workgroup-distributed loops over 32x32 output tiles: y covers rows (bound 512),
// x covers columns (bound 256).
scf.for %arg0 = %4 to %c512 step %5 { | |
scf.for %arg1 = %6 to %c256 step %7 { | |
// %9: 32x32 output tile; %10: matching 32x32x128 input tile.
%9 = memref.subview %2[%arg0, %arg1] [32, 32] [1, 1] : memref<512x256xi32> to memref<32x32xi32, strided<[256, 1], offset: ?>> | |
%10 = memref.subview %0[%arg0, %arg1, 0] [32, 32, 128] [1, 1, 1] : memref<512x256x128xi32> to memref<32x32x128xi32, strided<[32768, 128, 1], offset: ?>> | |
// Within a tile: one row at a time (%arg2), four columns at a time (%arg3).
scf.for %arg2 = %c0 to %c32 step %c1 { | |
scf.for %arg3 = %c0 to %c32 step %c4 { | |
%11 = memref.subview %10[%arg2, %arg3, 0] [1, 4, 128] [1, 1, 1] : memref<32x32x128xi32, strided<[32768, 128, 1], offset: ?>> to memref<4x128xi32, strided<[128, 1], offset: ?>> | |
%12 = memref.expand_shape %11 [[0, 1], [2]] : memref<4x128xi32, strided<[128, 1], offset: ?>> into memref<1x4x128xi32, strided<[512, 128, 1], offset: ?>> | |
%13 = memref.subview %9[%arg2, %arg3] [1, 4] [1, 1] : memref<32x32xi32, strided<[256, 1], offset: ?>> to memref<4xi32, strided<[1], offset: ?>> | |
%14 = memref.expand_shape %13 [[0, 1]] : memref<4xi32, strided<[1], offset: ?>> into memref<1x4xi32, strided<[4, 1], offset: ?>> | |
// One output element per iteration of %arg4.
scf.for %arg4 = %c0 to %c4 step %c1 { | |
// %15: the 128 input values to reduce; %16: the rank-0 view of the output element;
// %17: the same 128 values viewed as 32 groups of 4.
%15 = memref.subview %12[0, %arg4, 0] [1, 1, 128] [1, 1, 1] : memref<1x4x128xi32, strided<[512, 128, 1], offset: ?>> to memref<128xi32, strided<[1], offset: ?>> | |
%16 = memref.subview %14[0, %arg4] [1, 1] [1, 1] : memref<1x4xi32, strided<[4, 1], offset: ?>> to memref<i32, strided<[], offset: ?>> | |
%17 = memref.expand_shape %15 [[0, 1, 2, 3]] : memref<128xi32, strided<[1], offset: ?>> into memref<1x1x32x4xi32, strided<[128, 128, 4, 1], offset: ?>> | |
// %18: fresh 4-lane accumulator, initialised by copying the zero buffer %8 into it.
%18 = memref.alloca() {alignment = 128 : i64} : memref<1x1x4xi32> | |
linalg.generic {indexing_maps = [#map1, #map1], iterator_types = ["parallel", "parallel", "parallel"]} ins(%8 : memref<1x1x4xi32>) outs(%18 : memref<1x1x4xi32>) { | |
^bb0(%arg5: i32, %arg6: i32): | |
linalg.yield %arg5 : i32 | |
} | |
// 32 steps: load 4 input values, add them lane-wise to the accumulator, store it back.
scf.for %arg5 = %c0 to %c32 step %c1 { | |
%25 = memref.collapse_shape %18 [[0, 1, 2]] : memref<1x1x4xi32> into memref<4xi32> | |
%26 = vector.transfer_read %17[%c0, %c0, %arg5, %c0], %c0_i32 {in_bounds = [true]} : memref<1x1x32x4xi32, strided<[128, 128, 4, 1], offset: ?>>, vector<4xi32> | |
%27 = vector.transfer_read %25[%c0], %c0_i32 {in_bounds = [true]} : memref<4xi32>, vector<4xi32> | |
%28 = arith.addi %26, %27 : vector<4xi32> | |
vector.transfer_write %28, %25[%c0] {in_bounds = [true]} : vector<4xi32>, memref<4xi32> | |
} | |
// Final step: sum the 4 lanes, using the current value of the output element (%22)
// as the reduction's accumulator operand, and write the result to that element.
// NOTE(review): the output element is read here before this function ever writes it.
// In @_split_reduction_pass2 the buffer bound to binding 1 is allocated
// `uninitialized` and bound `wo`, so the result appears to include whatever the
// buffer held — confirm.
%19 = memref.collapse_shape %18 [[0, 1, 2]] : memref<1x1x4xi32> into memref<4xi32> | |
%20 = vector.transfer_read %19[%c0], %c0_i32 {in_bounds = [true]} : memref<4xi32>, vector<4xi32> | |
%21 = vector.transfer_read %16[], %c0_i32 : memref<i32, strided<[], offset: ?>>, vector<i32> | |
%22 = vector.extractelement %21[] : vector<i32> | |
%23 = vector.multi_reduction <add>, %20, %22 [0] : vector<4xi32> to i32 | |
%24 = vector.broadcast %23 : i32 to vector<i32> | |
vector.transfer_write %24, %16[] : vector<i32>, memref<i32, strided<[], offset: ?>> | |
// ins and outs are the same SSA value (%16): this generic copies the element onto itself.
linalg.generic {indexing_maps = [#map2, #map2], iterator_types = []} ins(%16 : memref<i32, strided<[], offset: ?>>) outs(%16 : memref<i32, strided<[], offset: ?>>) { | |
^bb0(%arg5: i32, %arg6: i32): | |
linalg.yield %arg5 : i32 | |
} | |
} | |
// Same self-copy for the 4-element slice %13.
linalg.generic {indexing_maps = [#map3, #map3], iterator_types = ["parallel"]} ins(%13 : memref<4xi32, strided<[1], offset: ?>>) outs(%13 : memref<4xi32, strided<[1], offset: ?>>) { | |
^bb0(%arg4: i32, %arg5: i32): | |
linalg.yield %arg4 : i32 | |
} | |
} | |
} | |
// Same self-copy for the whole 32x32 tile %9.
linalg.generic {indexing_maps = [#map4, #map4], iterator_types = ["parallel", "parallel"]} ins(%9 : memref<32x32xi32, strided<[256, 1], offset: ?>>) outs(%9 : memref<32x32xi32, strided<[256, 1], offset: ?>>) { | |
^bb0(%arg2: i32, %arg3: i32): | |
linalg.yield %arg2 : i32 | |
} | |
} | |
} | |
return | |
} | |
} | |
} | |
} | |
// Host-side driver of the test: builds the input, runs the dispatch, and compares the
// dispatch result against a buffer filled with the expected value 128.
func.func private @_split_reduction_pass2() { | |
// Byte sizes: 67108864 = 512*256*128 i32 elements * 4 bytes (input);
// 524288 = 512*256 * 4 bytes (one output tensor); 1048576 = two such tensors.
%c1048576 = arith.constant 1048576 : index | |
%c0 = arith.constant 0 : index | |
%c524288 = arith.constant 524288 : index | |
%c67108864 = arith.constant 67108864 : index | |
%c1_i32 = arith.constant 1 : i32 | |
%c128_i32 = arith.constant 128 : i32 | |
%c512 = arith.constant 512 : index | |
%c256 = arith.constant 256 : index | |
%c1 = arith.constant 1 : index | |
// Input: transient resource filled entirely with i32 1.
%0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c67108864} | |
%1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c67108864}) { | |
stream.cmd.fill %c1_i32, %arg0[%c0 for %c67108864] : i32 -> !stream.resource<transient>{%c67108864} | |
} => !stream.timepoint | |
%2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c67108864} | |
// The input passes through util.do_not_optimize, and its size is re-queried from the
// result rather than reusing the constant.
%3 = util.do_not_optimize(%2) : !stream.resource<transient> | |
%4 = stream.resource.size %3 : !stream.resource<transient> | |
// Output: one external resource, allocated uninitialized, holding both tensors that
// are compared at the end.
%5 = stream.resource.alloc uninitialized : !stream.resource<external>{%c1048576} | |
%6 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%4}, %5 as %arg1: !stream.resource<external>{%c1048576}) { | |
stream.cmd.concurrent { | |
// Dispatch with workload (512, 256, 1): input bound read-only, output bound
// write-only over the full 1048576-byte range.
// NOTE(review): that range overlaps the region written by the fill below, and both
// commands sit in the same stream.cmd.concurrent. The dispatch function's output
// memref is 512x256xi32 at offset 0, so it presumably only touches the lower half —
// confirm.
stream.cmd.dispatch @_split_reduction_pass2_dispatch_0::@_split_reduction_pass2_dispatch_0_generic_512x256x128[%c512, %c256, %c1] { | |
ro %arg0[%c0 for %4] : !stream.resource<transient>{%4}, | |
wo %arg1[%c0 for %c1048576] : !stream.resource<external>{%c1048576} | |
} attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>]} | |
// Upper half [524288, 1048576) is filled with the expected value 128.
stream.cmd.fill %c128_i32, %arg1[%c524288 for %c524288] : i32 -> !stream.resource<external>{%c1048576} | |
} | |
} => !stream.timepoint | |
%7 = stream.timepoint.await %6 => %5 : !stream.resource<external>{%c1048576} | |
// Lower half = dispatch result (%10); upper half = expected constant (%11).
%8 = stream.resource.subview %7[%c0] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288} | |
%9 = stream.resource.subview %7[%c524288] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288} | |
%10 = stream.tensor.export %8 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32> | |
%11 = stream.tensor.export %9 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32> | |
check.expect_eq(%10, %11) : tensor<512x256xi32> | |
return | |
} | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) ('func.func' operation: @_split_reduction_pass2_dispatch_0_generic_512x256x128) //----- // | |
// Attribute aliases referenced by the module that follows.
// Device target "llvm-cpu" carrying a single embedded-elf-x86_64 executable target (legacy_sync).
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}> | |
// The same executable target on its own; it is the target of the hal.executable.variant below.
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
// Multiplies a symbol by 32; applied to the workgroup ids and counts in the dispatch function.
#map0 = affine_map<()[s0] -> (s0 * 32)> | |
// Rank-3 identity indexing map, used by the one linalg.generic left in the dispatch function.
#map1 = affine_map<(d0, d1, d2) -> (d0, d1, d2)> | |
// No push constants; one set (0) with two storage-buffer bindings, binding 0 flagged ReadOnly.
#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]> | |
// Translation info attached to the executable export.
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert> | |
module attributes {hal.device.targets = [#device_target_llvm_cpu]} { | |
// Public entry point marked iree.abi.stub: takes no arguments, forwards to the
// private @_split_reduction_pass2 and returns nothing.
func.func @split_reduction_pass2() attributes {iree.abi.stub} { | |
call @_split_reduction_pass2() : () -> () | |
return | |
} | |
// Executable holding the single dispatch of this test: for every (row, col) of a
// 512x256 output it sums the 128 i32 values of the matching input row.
hal.executable private @_split_reduction_pass2_dispatch_0 { | |
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ { | |
// Export region: returns the workgroup count (8, 16, 1). With the 32-wide tiles used
// below, 8 = 256/32 and 16 = 512/32 — presumably one workgroup per output tile.
hal.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 ordinal(0) layout(#pipeline_layout) attributes {translation_info = #translation} { | |
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index): | |
%c1 = arith.constant 1 : index | |
%c16 = arith.constant 16 : index | |
%c8 = arith.constant 8 : index | |
hal.return %c8, %c16, %c1 : index, index, index | |
} | |
builtin.module { | |
func.func @_split_reduction_pass2_dispatch_0_generic_512x256x128() { | |
%cst = arith.constant dense<0> : vector<1x1x4xi32> | |
%c0_i32 = arith.constant 0 : i32 | |
%c4 = arith.constant 4 : index | |
%c1 = arith.constant 1 : index | |
%c32 = arith.constant 32 : index | |
%c512 = arith.constant 512 : index | |
%c256 = arith.constant 256 : index | |
%c0 = arith.constant 0 : index | |
// Binding 0 (input) and binding 1 (output) are each materialised twice: once as a
// memref (%0, %2) and once as a !flow.dispatch.tensor (%1, %3). %1 and %3 have no
// uses in this function.
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : memref<512x256x128xi32> | |
memref.assume_alignment %0, 64 : memref<512x256x128xi32> | |
%1 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:512x256x128xi32> | |
%2 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : memref<512x256xi32> | |
memref.assume_alignment %2, 64 : memref<512x256xi32> | |
%3 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:512x256xi32> | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_count_x = hal.interface.workgroup.count[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%workgroup_count_y = hal.interface.workgroup.count[1] : index | |
// Loop start = workgroup id * 32, loop step = workgroup count * 32 (via #map0).
%4 = affine.apply #map0()[%workgroup_id_y] | |
%5 = affine.apply #map0()[%workgroup_count_y] | |
%6 = affine.apply #map0()[%workgroup_id_x] | |
%7 = affine.apply #map0()[%workgroup_count_x] | |
// %8: 1x1x4 stack buffer written once with zeros; it is the source from which each
// per-element accumulator is initialised below.
%8 = memref.alloca() {alignment = 128 : i64} : memref<1x1x4xi32> | |
vector.transfer_write %cst, %8[%c0, %c0, %c0] {in_bounds = [true, true, true]} : vector<1x1x4xi32>, memref<1x1x4xi32> | |
// Workgroup-distributed loops over 32x32 output tiles: y covers rows (bound 512),
// x covers columns (bound 256).
scf.for %arg0 = %4 to %c512 step %5 { | |
scf.for %arg1 = %6 to %c256 step %7 { | |
// %9: 32x32 output tile; %10: matching 32x32x128 input tile.
%9 = memref.subview %2[%arg0, %arg1] [32, 32] [1, 1] : memref<512x256xi32> to memref<32x32xi32, strided<[256, 1], offset: ?>> | |
%10 = memref.subview %0[%arg0, %arg1, 0] [32, 32, 128] [1, 1, 1] : memref<512x256x128xi32> to memref<32x32x128xi32, strided<[32768, 128, 1], offset: ?>> | |
// Within a tile: one row at a time (%arg2), four columns at a time (%arg3).
scf.for %arg2 = %c0 to %c32 step %c1 { | |
scf.for %arg3 = %c0 to %c32 step %c4 { | |
%11 = memref.subview %10[%arg2, %arg3, 0] [1, 4, 128] [1, 1, 1] : memref<32x32x128xi32, strided<[32768, 128, 1], offset: ?>> to memref<4x128xi32, strided<[128, 1], offset: ?>> | |
%12 = memref.expand_shape %11 [[0, 1], [2]] : memref<4x128xi32, strided<[128, 1], offset: ?>> into memref<1x4x128xi32, strided<[512, 128, 1], offset: ?>> | |
%13 = memref.subview %9[%arg2, %arg3] [1, 4] [1, 1] : memref<32x32xi32, strided<[256, 1], offset: ?>> to memref<4xi32, strided<[1], offset: ?>> | |
%14 = memref.expand_shape %13 [[0, 1]] : memref<4xi32, strided<[1], offset: ?>> into memref<1x4xi32, strided<[4, 1], offset: ?>> | |
// One output element per iteration of %arg4.
scf.for %arg4 = %c0 to %c4 step %c1 { | |
// %15: the 128 input values to reduce; %16: the rank-0 view of the output element;
// %17: the same 128 values viewed as 32 groups of 4.
%15 = memref.subview %12[0, %arg4, 0] [1, 1, 128] [1, 1, 1] : memref<1x4x128xi32, strided<[512, 128, 1], offset: ?>> to memref<128xi32, strided<[1], offset: ?>> | |
%16 = memref.subview %14[0, %arg4] [1, 1] [1, 1] : memref<1x4xi32, strided<[4, 1], offset: ?>> to memref<i32, strided<[], offset: ?>> | |
%17 = memref.expand_shape %15 [[0, 1, 2, 3]] : memref<128xi32, strided<[1], offset: ?>> into memref<1x1x32x4xi32, strided<[128, 128, 4, 1], offset: ?>> | |
// %18: fresh 4-lane accumulator, initialised by copying the zero buffer %8 into it.
%18 = memref.alloca() {alignment = 128 : i64} : memref<1x1x4xi32> | |
linalg.generic {indexing_maps = [#map1, #map1], iterator_types = ["parallel", "parallel", "parallel"]} ins(%8 : memref<1x1x4xi32>) outs(%18 : memref<1x1x4xi32>) { | |
^bb0(%arg5: i32, %arg6: i32): | |
linalg.yield %arg5 : i32 | |
} | |
// 32 steps: load 4 input values, add them lane-wise to the accumulator, store it back.
scf.for %arg5 = %c0 to %c32 step %c1 { | |
%25 = memref.collapse_shape %18 [[0, 1, 2]] : memref<1x1x4xi32> into memref<4xi32> | |
%26 = vector.transfer_read %17[%c0, %c0, %arg5, %c0], %c0_i32 {in_bounds = [true]} : memref<1x1x32x4xi32, strided<[128, 128, 4, 1], offset: ?>>, vector<4xi32> | |
%27 = vector.transfer_read %25[%c0], %c0_i32 {in_bounds = [true]} : memref<4xi32>, vector<4xi32> | |
%28 = arith.addi %26, %27 : vector<4xi32> | |
vector.transfer_write %28, %25[%c0] {in_bounds = [true]} : vector<4xi32>, memref<4xi32> | |
} | |
// Final step: sum the 4 lanes, using the current value of the output element (%22)
// as the reduction's accumulator operand, and write the result to that element.
// This write is the last op of the loop body; no copy ops follow it.
// NOTE(review): the output element is read here before this function ever writes it.
// In @_split_reduction_pass2 the buffer bound to binding 1 is allocated
// `uninitialized` and bound `wo`, so the result appears to include whatever the
// buffer held — confirm.
%19 = memref.collapse_shape %18 [[0, 1, 2]] : memref<1x1x4xi32> into memref<4xi32> | |
%20 = vector.transfer_read %19[%c0], %c0_i32 {in_bounds = [true]} : memref<4xi32>, vector<4xi32> | |
%21 = vector.transfer_read %16[], %c0_i32 : memref<i32, strided<[], offset: ?>>, vector<i32> | |
%22 = vector.extractelement %21[] : vector<i32> | |
%23 = vector.multi_reduction <add>, %20, %22 [0] : vector<4xi32> to i32 | |
%24 = vector.broadcast %23 : i32 to vector<i32> | |
vector.transfer_write %24, %16[] : vector<i32>, memref<i32, strided<[], offset: ?>> | |
} | |
} | |
} | |
} | |
} | |
return | |
} | |
} | |
} | |
} | |
// Host-side driver of the test: builds the input, runs the dispatch, and compares the
// dispatch result against a buffer filled with the expected value 128.
func.func private @_split_reduction_pass2() { | |
// Byte sizes: 67108864 = 512*256*128 i32 elements * 4 bytes (input);
// 524288 = 512*256 * 4 bytes (one output tensor); 1048576 = two such tensors.
%c1048576 = arith.constant 1048576 : index | |
%c0 = arith.constant 0 : index | |
%c524288 = arith.constant 524288 : index | |
%c67108864 = arith.constant 67108864 : index | |
%c1_i32 = arith.constant 1 : i32 | |
%c128_i32 = arith.constant 128 : i32 | |
%c512 = arith.constant 512 : index | |
%c256 = arith.constant 256 : index | |
%c1 = arith.constant 1 : index | |
// Input: transient resource filled entirely with i32 1.
%0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c67108864} | |
%1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c67108864}) { | |
stream.cmd.fill %c1_i32, %arg0[%c0 for %c67108864] : i32 -> !stream.resource<transient>{%c67108864} | |
} => !stream.timepoint | |
%2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c67108864} | |
// The input passes through util.do_not_optimize, and its size is re-queried from the
// result rather than reusing the constant.
%3 = util.do_not_optimize(%2) : !stream.resource<transient> | |
%4 = stream.resource.size %3 : !stream.resource<transient> | |
// Output: one external resource, allocated uninitialized, holding both tensors that
// are compared at the end.
%5 = stream.resource.alloc uninitialized : !stream.resource<external>{%c1048576} | |
%6 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%4}, %5 as %arg1: !stream.resource<external>{%c1048576}) { | |
stream.cmd.concurrent { | |
// Dispatch with workload (512, 256, 1): input bound read-only, output bound
// write-only over the full 1048576-byte range.
// NOTE(review): that range overlaps the region written by the fill below, and both
// commands sit in the same stream.cmd.concurrent. The dispatch function's output
// memref is 512x256xi32 at offset 0, so it presumably only touches the lower half —
// confirm.
stream.cmd.dispatch @_split_reduction_pass2_dispatch_0::@_split_reduction_pass2_dispatch_0_generic_512x256x128[%c512, %c256, %c1] { | |
ro %arg0[%c0 for %4] : !stream.resource<transient>{%4}, | |
wo %arg1[%c0 for %c1048576] : !stream.resource<external>{%c1048576} | |
} attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>]} | |
// Upper half [524288, 1048576) is filled with the expected value 128.
stream.cmd.fill %c128_i32, %arg1[%c524288 for %c524288] : i32 -> !stream.resource<external>{%c1048576} | |
} | |
} => !stream.timepoint | |
%7 = stream.timepoint.await %6 => %5 : !stream.resource<external>{%c1048576} | |
// Lower half = dispatch result (%10); upper half = expected constant (%11).
%8 = stream.resource.subview %7[%c0] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288} | |
%9 = stream.resource.subview %7[%c524288] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288} | |
%10 = stream.tensor.export %8 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32> | |
%11 = stream.tensor.export %9 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32> | |
check.expect_eq(%10, %11) : tensor<512x256xi32> | |
return | |
} | |
} | |
// -----// IR Dump After CleanupBufferAllocView (iree-codegen-cleanup-buffer-alloc-view) ('func.func' operation: @_split_reduction_pass2_dispatch_0_generic_512x256x128) //----- // | |
// Attribute aliases referenced by the module that follows.
// Device target "llvm-cpu" carrying a single embedded-elf-x86_64 executable target (legacy_sync).
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}> | |
// The same executable target on its own; it is the target of the hal.executable.variant below.
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
// Multiplies a symbol by 32; applied to the workgroup ids and counts in the dispatch function.
#map0 = affine_map<()[s0] -> (s0 * 32)> | |
// Rank-3 identity indexing map.
#map1 = affine_map<(d0, d1, d2) -> (d0, d1, d2)> | |
// No push constants; one set (0) with two storage-buffer bindings, binding 0 flagged ReadOnly.
#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]> | |
// Translation info attached to the executable export.
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert> | |
// Module snapshot printed after the pass named in the dump header above.
// It holds three things: the public ABI stub, one HAL executable wrapping the
// dispatch function, and the private host function that launches the dispatch
// and checks its result.
module attributes {hal.device.targets = [#device_target_llvm_cpu]} { | |
// Public entry point: only forwards to the private @_split_reduction_pass2.
func.func @split_reduction_pass2() attributes {iree.abi.stub} { | |
call @_split_reduction_pass2() : () -> () | |
return | |
} | |
hal.executable private @_split_reduction_pass2_dispatch_0 { | |
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ { | |
// Export region: returns the constants (8, 16, 1).
// Presumably the workgroup count along x/y/z (256/32 = 8, 512/32 = 16) — confirm against the HAL dialect docs.
hal.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 ordinal(0) layout(#pipeline_layout) attributes {translation_info = #translation} { | |
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index): | |
%c1 = arith.constant 1 : index | |
%c16 = arith.constant 16 : index | |
%c8 = arith.constant 8 : index | |
hal.return %c8, %c16, %c1 : index, index, index | |
} | |
builtin.module { | |
// Dispatch body: sums the innermost (size 128) dimension of binding 0
// (512x256x128xi32) into binding 1 (512x256xi32) using integer addition.
func.func @_split_reduction_pass2_dispatch_0_generic_512x256x128() { | |
%cst = arith.constant dense<0> : vector<1x1x4xi32> | |
%c0_i32 = arith.constant 0 : i32 | |
%c4 = arith.constant 4 : index | |
%c1 = arith.constant 1 : index | |
%c32 = arith.constant 32 : index | |
%c512 = arith.constant 512 : index | |
%c256 = arith.constant 256 : index | |
%c0 = arith.constant 0 : index | |
// %0 is the 3-D input buffer, %1 is the 2-D output buffer.
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : memref<512x256x128xi32> | |
memref.assume_alignment %0, 64 : memref<512x256x128xi32> | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : memref<512x256xi32> | |
memref.assume_alignment %1, 64 : memref<512x256xi32> | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_count_x = hal.interface.workgroup.count[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%workgroup_count_y = hal.interface.workgroup.count[1] : index | |
// #map0 multiplies its symbol by 32: %2/%4 are the starting offsets
// (id * 32) and %3/%5 the loop steps (count * 32) for the y/x dimensions.
%2 = affine.apply #map0()[%workgroup_id_y] | |
%3 = affine.apply #map0()[%workgroup_count_y] | |
%4 = affine.apply #map0()[%workgroup_id_x] | |
%5 = affine.apply #map0()[%workgroup_count_x] | |
// %6 is a 1x1x4 scratch buffer filled with zeros; it is later copied into
// the per-element accumulator buffer.
%6 = memref.alloca() {alignment = 128 : i64} : memref<1x1x4xi32> | |
vector.transfer_write %cst, %6[%c0, %c0, %c0] {in_bounds = [true, true, true]} : vector<1x1x4xi32>, memref<1x1x4xi32> | |
// Workgroup-distributed loops over 32x32 tiles of the output.
scf.for %arg0 = %2 to %c512 step %3 { | |
scf.for %arg1 = %4 to %c256 step %5 { | |
// %7 = 32x32 output tile, %8 = matching 32x32x128 input tile.
%7 = memref.subview %1[%arg0, %arg1] [32, 32] [1, 1] : memref<512x256xi32> to memref<32x32xi32, strided<[256, 1], offset: ?>> | |
%8 = memref.subview %0[%arg0, %arg1, 0] [32, 32, 128] [1, 1, 1] : memref<512x256x128xi32> to memref<32x32x128xi32, strided<[32768, 128, 1], offset: ?>> | |
// Within a tile: one row at a time (%arg2), four columns at a time (%arg3).
scf.for %arg2 = %c0 to %c32 step %c1 { | |
scf.for %arg3 = %c0 to %c32 step %c4 { | |
%9 = memref.subview %8[%arg2, %arg3, 0] [1, 4, 128] [1, 1, 1] : memref<32x32x128xi32, strided<[32768, 128, 1], offset: ?>> to memref<4x128xi32, strided<[128, 1], offset: ?>> | |
%10 = memref.expand_shape %9 [[0, 1], [2]] : memref<4x128xi32, strided<[128, 1], offset: ?>> into memref<1x4x128xi32, strided<[512, 128, 1], offset: ?>> | |
%11 = memref.subview %7[%arg2, %arg3] [1, 4] [1, 1] : memref<32x32xi32, strided<[256, 1], offset: ?>> to memref<4xi32, strided<[1], offset: ?>> | |
%12 = memref.expand_shape %11 [[0, 1]] : memref<4xi32, strided<[1], offset: ?>> into memref<1x4xi32, strided<[4, 1], offset: ?>> | |
// One output element per iteration of %arg4.
scf.for %arg4 = %c0 to %c4 step %c1 { | |
// %13 = the 128 input values for this output element, %14 = the output element itself.
%13 = memref.subview %10[0, %arg4, 0] [1, 1, 128] [1, 1, 1] : memref<1x4x128xi32, strided<[512, 128, 1], offset: ?>> to memref<128xi32, strided<[1], offset: ?>> | |
%14 = memref.subview %12[0, %arg4] [1, 1] [1, 1] : memref<1x4xi32, strided<[4, 1], offset: ?>> to memref<i32, strided<[], offset: ?>> | |
// View the 128 values as 32 rows of 4 so they can be summed 4 lanes at a time.
%15 = memref.expand_shape %13 [[0, 1, 2, 3]] : memref<128xi32, strided<[1], offset: ?>> into memref<1x1x32x4xi32, strided<[128, 128, 4, 1], offset: ?>> | |
// %16 is the 4-lane partial-sum buffer; the linalg.generic below copies
// the zero scratch buffer %6 into it.
%16 = memref.alloca() {alignment = 128 : i64} : memref<1x1x4xi32> | |
linalg.generic {indexing_maps = [#map1, #map1], iterator_types = ["parallel", "parallel", "parallel"]} ins(%6 : memref<1x1x4xi32>) outs(%16 : memref<1x1x4xi32>) { | |
^bb0(%arg5: i32, %arg6: i32): | |
linalg.yield %arg5 : i32 | |
} | |
// Accumulate the 32 rows of 4 values into %16 via a read-add-write on memory.
scf.for %arg5 = %c0 to %c32 step %c1 { | |
%23 = memref.collapse_shape %16 [[0, 1, 2]] : memref<1x1x4xi32> into memref<4xi32> | |
%24 = vector.transfer_read %15[%c0, %c0, %arg5, %c0], %c0_i32 {in_bounds = [true]} : memref<1x1x32x4xi32, strided<[128, 128, 4, 1], offset: ?>>, vector<4xi32> | |
%25 = vector.transfer_read %23[%c0], %c0_i32 {in_bounds = [true]} : memref<4xi32>, vector<4xi32> | |
%26 = arith.addi %24, %25 : vector<4xi32> | |
vector.transfer_write %26, %23[%c0] {in_bounds = [true]} : vector<4xi32>, memref<4xi32> | |
} | |
// Final step: add the 4 partial sums together with the value currently
// stored in the output element (%20) and write the scalar back.
// NOTE(review): the output element is read here as the accumulator, yet the
// host function below binds this resource as `wo`, allocates it
// `uninitialized`, and only fills its upper half — confirm this is intended.
%17 = memref.collapse_shape %16 [[0, 1, 2]] : memref<1x1x4xi32> into memref<4xi32> | |
%18 = vector.transfer_read %17[%c0], %c0_i32 {in_bounds = [true]} : memref<4xi32>, vector<4xi32> | |
%19 = vector.transfer_read %14[], %c0_i32 : memref<i32, strided<[], offset: ?>>, vector<i32> | |
%20 = vector.extractelement %19[] : vector<i32> | |
%21 = vector.multi_reduction <add>, %18, %20 [0] : vector<4xi32> to i32 | |
%22 = vector.broadcast %21 : i32 to vector<i32> | |
vector.transfer_write %22, %14[] : vector<i32>, memref<i32, strided<[], offset: ?>> | |
} | |
} | |
} | |
} | |
} | |
return | |
} | |
} | |
} | |
} | |
// Host-side function: fills the input with 1s, runs the dispatch, and compares
// the dispatch result with a reference region filled with 128.
func.func private @_split_reduction_pass2() { | |
%c1048576 = arith.constant 1048576 : index | |
%c0 = arith.constant 0 : index | |
%c524288 = arith.constant 524288 : index | |
%c67108864 = arith.constant 67108864 : index | |
%c1_i32 = arith.constant 1 : i32 | |
%c128_i32 = arith.constant 128 : i32 | |
%c512 = arith.constant 512 : index | |
%c256 = arith.constant 256 : index | |
%c1 = arith.constant 1 : index | |
// Input resource of size 67108864 (= 512*256*128 elements x 4; presumably a
// byte size — confirm), filled entirely with the i32 value 1.
%0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c67108864} | |
%1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c67108864}) { | |
stream.cmd.fill %c1_i32, %arg0[%c0 for %c67108864] : i32 -> !stream.resource<transient>{%c67108864} | |
} => !stream.timepoint | |
%2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c67108864} | |
%3 = util.do_not_optimize(%2) : !stream.resource<transient> | |
%4 = stream.resource.size %3 : !stream.resource<transient> | |
// Output resource of size 1048576 = 2 x 524288: the lower half is later
// exported as the dispatch result, the upper half as the expected values.
%5 = stream.resource.alloc uninitialized : !stream.resource<external>{%c1048576} | |
%6 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%4}, %5 as %arg1: !stream.resource<external>{%c1048576}) { | |
// The dispatch and the fill of the upper half with 128 are issued in the same concurrent region.
stream.cmd.concurrent { | |
stream.cmd.dispatch @_split_reduction_pass2_dispatch_0::@_split_reduction_pass2_dispatch_0_generic_512x256x128[%c512, %c256, %c1] { | |
ro %arg0[%c0 for %4] : !stream.resource<transient>{%4}, | |
wo %arg1[%c0 for %c1048576] : !stream.resource<external>{%c1048576} | |
} attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>]} | |
stream.cmd.fill %c128_i32, %arg1[%c524288 for %c524288] : i32 -> !stream.resource<external>{%c1048576} | |
} | |
} => !stream.timepoint | |
%7 = stream.timepoint.await %6 => %5 : !stream.resource<external>{%c1048576} | |
// Split the resource at offset 524288 and export both halves as 512x256xi32 tensors.
%8 = stream.resource.subview %7[%c0] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288} | |
%9 = stream.resource.subview %7[%c524288] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288} | |
%10 = stream.tensor.export %8 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32> | |
%11 = stream.tensor.export %9 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32> | |
// Test assertion: dispatch output (%10) must equal the 128-filled reference (%11).
check.expect_eq(%10, %11) : tensor<512x256xi32> | |
return | |
} | |
} | |
// -----// IR Dump After RemoveSingleIterationLoop (iree-codegen-remove-single-iteration-loop) ('func.func' operation: @_split_reduction_pass2_dispatch_0_generic_512x256x128) //----- // | |
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}> | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#map0 = affine_map<()[s0] -> (s0 * 32)> | |
#map1 = affine_map<(d0, d1, d2) -> (d0, d1, d2)> | |
#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]> | |
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert> | |
// Module snapshot printed after the pass named in the dump header above.
// In this snapshot the dispatch function has no loops over workgroup id/count:
// the 32x32 tile is addressed directly from the workgroup ids.
module attributes {hal.device.targets = [#device_target_llvm_cpu]} { | |
// Public entry point: only forwards to the private @_split_reduction_pass2.
func.func @split_reduction_pass2() attributes {iree.abi.stub} { | |
call @_split_reduction_pass2() : () -> () | |
return | |
} | |
hal.executable private @_split_reduction_pass2_dispatch_0 { | |
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ { | |
// Export region: returns the constants (8, 16, 1).
// Presumably the workgroup count along x/y/z (256/32 = 8, 512/32 = 16) — confirm against the HAL dialect docs.
hal.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 ordinal(0) layout(#pipeline_layout) attributes {translation_info = #translation} { | |
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index): | |
%c1 = arith.constant 1 : index | |
%c16 = arith.constant 16 : index | |
%c8 = arith.constant 8 : index | |
hal.return %c8, %c16, %c1 : index, index, index | |
} | |
builtin.module { | |
// Dispatch body: sums the innermost (size 128) dimension of binding 0
// (512x256x128xi32) into binding 1 (512x256xi32) using integer addition.
// Each invocation handles the single 32x32 output tile selected by its workgroup ids.
func.func @_split_reduction_pass2_dispatch_0_generic_512x256x128() { | |
%cst = arith.constant dense<0> : vector<1x1x4xi32> | |
%c0_i32 = arith.constant 0 : i32 | |
%c4 = arith.constant 4 : index | |
%c1 = arith.constant 1 : index | |
%c32 = arith.constant 32 : index | |
%c0 = arith.constant 0 : index | |
// %0 is the 3-D input buffer, %1 is the 2-D output buffer.
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : memref<512x256x128xi32> | |
memref.assume_alignment %0, 64 : memref<512x256x128xi32> | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : memref<512x256xi32> | |
memref.assume_alignment %1, 64 : memref<512x256xi32> | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
// #map0 multiplies its symbol by 32: %2/%3 are the tile offsets along y/x.
%2 = affine.apply #map0()[%workgroup_id_y] | |
%3 = affine.apply #map0()[%workgroup_id_x] | |
// %4 is a 1x1x4 scratch buffer filled with zeros; it is later copied into
// the per-element accumulator buffer.
%4 = memref.alloca() {alignment = 128 : i64} : memref<1x1x4xi32> | |
vector.transfer_write %cst, %4[%c0, %c0, %c0] {in_bounds = [true, true, true]} : vector<1x1x4xi32>, memref<1x1x4xi32> | |
// %5 = 32x32 output tile, %6 = matching 32x32x128 input tile.
%5 = memref.subview %1[%2, %3] [32, 32] [1, 1] : memref<512x256xi32> to memref<32x32xi32, strided<[256, 1], offset: ?>> | |
%6 = memref.subview %0[%2, %3, 0] [32, 32, 128] [1, 1, 1] : memref<512x256x128xi32> to memref<32x32x128xi32, strided<[32768, 128, 1], offset: ?>> | |
// Within the tile: one row at a time (%arg0), four columns at a time (%arg1).
scf.for %arg0 = %c0 to %c32 step %c1 { | |
scf.for %arg1 = %c0 to %c32 step %c4 { | |
%7 = memref.subview %6[%arg0, %arg1, 0] [1, 4, 128] [1, 1, 1] : memref<32x32x128xi32, strided<[32768, 128, 1], offset: ?>> to memref<4x128xi32, strided<[128, 1], offset: ?>> | |
%8 = memref.expand_shape %7 [[0, 1], [2]] : memref<4x128xi32, strided<[128, 1], offset: ?>> into memref<1x4x128xi32, strided<[512, 128, 1], offset: ?>> | |
%9 = memref.subview %5[%arg0, %arg1] [1, 4] [1, 1] : memref<32x32xi32, strided<[256, 1], offset: ?>> to memref<4xi32, strided<[1], offset: ?>> | |
%10 = memref.expand_shape %9 [[0, 1]] : memref<4xi32, strided<[1], offset: ?>> into memref<1x4xi32, strided<[4, 1], offset: ?>> | |
// One output element per iteration of %arg2.
scf.for %arg2 = %c0 to %c4 step %c1 { | |
// %11 = the 128 input values for this output element, %12 = the output element itself.
%11 = memref.subview %8[0, %arg2, 0] [1, 1, 128] [1, 1, 1] : memref<1x4x128xi32, strided<[512, 128, 1], offset: ?>> to memref<128xi32, strided<[1], offset: ?>> | |
%12 = memref.subview %10[0, %arg2] [1, 1] [1, 1] : memref<1x4xi32, strided<[4, 1], offset: ?>> to memref<i32, strided<[], offset: ?>> | |
// View the 128 values as 32 rows of 4 so they can be summed 4 lanes at a time.
%13 = memref.expand_shape %11 [[0, 1, 2, 3]] : memref<128xi32, strided<[1], offset: ?>> into memref<1x1x32x4xi32, strided<[128, 128, 4, 1], offset: ?>> | |
// %14 is the 4-lane partial-sum buffer; the linalg.generic below copies
// the zero scratch buffer %4 into it.
%14 = memref.alloca() {alignment = 128 : i64} : memref<1x1x4xi32> | |
linalg.generic {indexing_maps = [#map1, #map1], iterator_types = ["parallel", "parallel", "parallel"]} ins(%4 : memref<1x1x4xi32>) outs(%14 : memref<1x1x4xi32>) { | |
^bb0(%arg3: i32, %arg4: i32): | |
linalg.yield %arg3 : i32 | |
} | |
// Accumulate the 32 rows of 4 values into %14 via a read-add-write on memory.
scf.for %arg3 = %c0 to %c32 step %c1 { | |
%21 = memref.collapse_shape %14 [[0, 1, 2]] : memref<1x1x4xi32> into memref<4xi32> | |
%22 = vector.transfer_read %13[%c0, %c0, %arg3, %c0], %c0_i32 {in_bounds = [true]} : memref<1x1x32x4xi32, strided<[128, 128, 4, 1], offset: ?>>, vector<4xi32> | |
%23 = vector.transfer_read %21[%c0], %c0_i32 {in_bounds = [true]} : memref<4xi32>, vector<4xi32> | |
%24 = arith.addi %22, %23 : vector<4xi32> | |
vector.transfer_write %24, %21[%c0] {in_bounds = [true]} : vector<4xi32>, memref<4xi32> | |
} | |
// Final step: add the 4 partial sums together with the value currently
// stored in the output element (%18) and write the scalar back.
// NOTE(review): the output element is read here as the accumulator, yet the
// host function below binds this resource as `wo`, allocates it
// `uninitialized`, and only fills its upper half — confirm this is intended.
%15 = memref.collapse_shape %14 [[0, 1, 2]] : memref<1x1x4xi32> into memref<4xi32> | |
%16 = vector.transfer_read %15[%c0], %c0_i32 {in_bounds = [true]} : memref<4xi32>, vector<4xi32> | |
%17 = vector.transfer_read %12[], %c0_i32 : memref<i32, strided<[], offset: ?>>, vector<i32> | |
%18 = vector.extractelement %17[] : vector<i32> | |
%19 = vector.multi_reduction <add>, %16, %18 [0] : vector<4xi32> to i32 | |
%20 = vector.broadcast %19 : i32 to vector<i32> | |
vector.transfer_write %20, %12[] : vector<i32>, memref<i32, strided<[], offset: ?>> | |
} | |
} | |
} | |
return | |
} | |
} | |
} | |
} | |
// Host-side function: fills the input with 1s, runs the dispatch, and compares
// the dispatch result with a reference region filled with 128.
func.func private @_split_reduction_pass2() { | |
%c1048576 = arith.constant 1048576 : index | |
%c0 = arith.constant 0 : index | |
%c524288 = arith.constant 524288 : index | |
%c67108864 = arith.constant 67108864 : index | |
%c1_i32 = arith.constant 1 : i32 | |
%c128_i32 = arith.constant 128 : i32 | |
%c512 = arith.constant 512 : index | |
%c256 = arith.constant 256 : index | |
%c1 = arith.constant 1 : index | |
// Input resource of size 67108864 (= 512*256*128 elements x 4; presumably a
// byte size — confirm), filled entirely with the i32 value 1.
%0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c67108864} | |
%1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c67108864}) { | |
stream.cmd.fill %c1_i32, %arg0[%c0 for %c67108864] : i32 -> !stream.resource<transient>{%c67108864} | |
} => !stream.timepoint | |
%2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c67108864} | |
%3 = util.do_not_optimize(%2) : !stream.resource<transient> | |
%4 = stream.resource.size %3 : !stream.resource<transient> | |
// Output resource of size 1048576 = 2 x 524288: the lower half is later
// exported as the dispatch result, the upper half as the expected values.
%5 = stream.resource.alloc uninitialized : !stream.resource<external>{%c1048576} | |
%6 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%4}, %5 as %arg1: !stream.resource<external>{%c1048576}) { | |
// The dispatch and the fill of the upper half with 128 are issued in the same concurrent region.
stream.cmd.concurrent { | |
stream.cmd.dispatch @_split_reduction_pass2_dispatch_0::@_split_reduction_pass2_dispatch_0_generic_512x256x128[%c512, %c256, %c1] { | |
ro %arg0[%c0 for %4] : !stream.resource<transient>{%4}, | |
wo %arg1[%c0 for %c1048576] : !stream.resource<external>{%c1048576} | |
} attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>]} | |
stream.cmd.fill %c128_i32, %arg1[%c524288 for %c524288] : i32 -> !stream.resource<external>{%c1048576} | |
} | |
} => !stream.timepoint | |
%7 = stream.timepoint.await %6 => %5 : !stream.resource<external>{%c1048576} | |
// Split the resource at offset 524288 and export both halves as 512x256xi32 tensors.
%8 = stream.resource.subview %7[%c0] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288} | |
%9 = stream.resource.subview %7[%c524288] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288} | |
%10 = stream.tensor.export %8 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32> | |
%11 = stream.tensor.export %9 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32> | |
// Test assertion: dispatch output (%10) must equal the 128-filled reference (%11).
check.expect_eq(%10, %11) : tensor<512x256xi32> | |
return | |
} | |
} | |
// -----// IR Dump After CSE (cse) ('func.func' operation: @_split_reduction_pass2_dispatch_0_generic_512x256x128) //----- // | |
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}> | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#map0 = affine_map<()[s0] -> (s0 * 32)> | |
#map1 = affine_map<(d0, d1, d2) -> (d0, d1, d2)> | |
#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]> | |
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert> | |
// Module snapshot printed after the pass named in the dump header above.
// In this snapshot the innermost accumulation loop carries its running sum as
// a loop-carried vector value (iter_args) instead of reading and writing the
// accumulator buffer on every iteration.
module attributes {hal.device.targets = [#device_target_llvm_cpu]} { | |
// Public entry point: only forwards to the private @_split_reduction_pass2.
func.func @split_reduction_pass2() attributes {iree.abi.stub} { | |
call @_split_reduction_pass2() : () -> () | |
return | |
} | |
hal.executable private @_split_reduction_pass2_dispatch_0 { | |
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ { | |
// Export region: returns the constants (8, 16, 1).
// Presumably the workgroup count along x/y/z (256/32 = 8, 512/32 = 16) — confirm against the HAL dialect docs.
hal.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 ordinal(0) layout(#pipeline_layout) attributes {translation_info = #translation} { | |
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index): | |
%c1 = arith.constant 1 : index | |
%c16 = arith.constant 16 : index | |
%c8 = arith.constant 8 : index | |
hal.return %c8, %c16, %c1 : index, index, index | |
} | |
builtin.module { | |
// Dispatch body: sums the innermost (size 128) dimension of binding 0
// (512x256x128xi32) into binding 1 (512x256xi32) using integer addition.
// Each invocation handles the single 32x32 output tile selected by its workgroup ids.
func.func @_split_reduction_pass2_dispatch_0_generic_512x256x128() { | |
%cst = arith.constant dense<0> : vector<1x1x4xi32> | |
%c0_i32 = arith.constant 0 : i32 | |
%c4 = arith.constant 4 : index | |
%c1 = arith.constant 1 : index | |
%c32 = arith.constant 32 : index | |
%c0 = arith.constant 0 : index | |
// %0 is the 3-D input buffer, %1 is the 2-D output buffer.
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : memref<512x256x128xi32> | |
memref.assume_alignment %0, 64 : memref<512x256x128xi32> | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : memref<512x256xi32> | |
memref.assume_alignment %1, 64 : memref<512x256xi32> | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
// #map0 multiplies its symbol by 32: %2/%3 are the tile offsets along y/x.
%2 = affine.apply #map0()[%workgroup_id_y] | |
%3 = affine.apply #map0()[%workgroup_id_x] | |
// %4 is a 1x1x4 scratch buffer filled with zeros; it is later copied into
// the per-element accumulator buffer.
%4 = memref.alloca() {alignment = 128 : i64} : memref<1x1x4xi32> | |
vector.transfer_write %cst, %4[%c0, %c0, %c0] {in_bounds = [true, true, true]} : vector<1x1x4xi32>, memref<1x1x4xi32> | |
// %5 = 32x32 output tile, %6 = matching 32x32x128 input tile.
%5 = memref.subview %1[%2, %3] [32, 32] [1, 1] : memref<512x256xi32> to memref<32x32xi32, strided<[256, 1], offset: ?>> | |
%6 = memref.subview %0[%2, %3, 0] [32, 32, 128] [1, 1, 1] : memref<512x256x128xi32> to memref<32x32x128xi32, strided<[32768, 128, 1], offset: ?>> | |
// Within the tile: one row at a time (%arg0), four columns at a time (%arg1).
scf.for %arg0 = %c0 to %c32 step %c1 { | |
scf.for %arg1 = %c0 to %c32 step %c4 { | |
%7 = memref.subview %6[%arg0, %arg1, 0] [1, 4, 128] [1, 1, 1] : memref<32x32x128xi32, strided<[32768, 128, 1], offset: ?>> to memref<4x128xi32, strided<[128, 1], offset: ?>> | |
%8 = memref.expand_shape %7 [[0, 1], [2]] : memref<4x128xi32, strided<[128, 1], offset: ?>> into memref<1x4x128xi32, strided<[512, 128, 1], offset: ?>> | |
%9 = memref.subview %5[%arg0, %arg1] [1, 4] [1, 1] : memref<32x32xi32, strided<[256, 1], offset: ?>> to memref<4xi32, strided<[1], offset: ?>> | |
%10 = memref.expand_shape %9 [[0, 1]] : memref<4xi32, strided<[1], offset: ?>> into memref<1x4xi32, strided<[4, 1], offset: ?>> | |
// One output element per iteration of %arg2.
scf.for %arg2 = %c0 to %c4 step %c1 { | |
// %11 = the 128 input values for this output element, %12 = the output element itself.
%11 = memref.subview %8[0, %arg2, 0] [1, 1, 128] [1, 1, 1] : memref<1x4x128xi32, strided<[512, 128, 1], offset: ?>> to memref<128xi32, strided<[1], offset: ?>> | |
%12 = memref.subview %10[0, %arg2] [1, 1] [1, 1] : memref<1x4xi32, strided<[4, 1], offset: ?>> to memref<i32, strided<[], offset: ?>> | |
// View the 128 values as 32 rows of 4 so they can be summed 4 lanes at a time.
%13 = memref.expand_shape %11 [[0, 1, 2, 3]] : memref<128xi32, strided<[1], offset: ?>> into memref<1x1x32x4xi32, strided<[128, 128, 4, 1], offset: ?>> | |
// %14 is the 4-lane partial-sum buffer; the linalg.generic below copies
// the zero scratch buffer %4 into it.
%14 = memref.alloca() {alignment = 128 : i64} : memref<1x1x4xi32> | |
linalg.generic {indexing_maps = [#map1, #map1], iterator_types = ["parallel", "parallel", "parallel"]} ins(%4 : memref<1x1x4xi32>) outs(%14 : memref<1x1x4xi32>) { | |
^bb0(%arg3: i32, %arg4: i32): | |
linalg.yield %arg3 : i32 | |
} | |
// Load the initial partial sums once (%16), then add the 32 rows of 4
// values in a loop whose running sum is the loop-carried value %arg4.
%15 = memref.collapse_shape %14 [[0, 1, 2]] : memref<1x1x4xi32> into memref<4xi32> | |
%16 = vector.transfer_read %15[%c0], %c0_i32 {in_bounds = [true]} : memref<4xi32>, vector<4xi32> | |
%17 = scf.for %arg3 = %c0 to %c32 step %c1 iter_args(%arg4 = %16) -> (vector<4xi32>) { | |
%23 = vector.transfer_read %13[%c0, %c0, %arg3, %c0], %c0_i32 {in_bounds = [true]} : memref<1x1x32x4xi32, strided<[128, 128, 4, 1], offset: ?>>, vector<4xi32> | |
%24 = arith.addi %23, %arg4 : vector<4xi32> | |
scf.yield %24 : vector<4xi32> | |
} | |
// The loop result is stored to the accumulator buffer and read straight back as %18.
vector.transfer_write %17, %15[%c0] {in_bounds = [true]} : vector<4xi32>, memref<4xi32> | |
%18 = vector.transfer_read %15[%c0], %c0_i32 {in_bounds = [true]} : memref<4xi32>, vector<4xi32> | |
// Final step: add the 4 partial sums together with the value currently
// stored in the output element (%20) and write the scalar back.
// NOTE(review): the output element is read here as the accumulator, yet the
// host function below binds this resource as `wo`, allocates it
// `uninitialized`, and only fills its upper half — confirm this is intended.
%19 = vector.transfer_read %12[], %c0_i32 : memref<i32, strided<[], offset: ?>>, vector<i32> | |
%20 = vector.extractelement %19[] : vector<i32> | |
%21 = vector.multi_reduction <add>, %18, %20 [0] : vector<4xi32> to i32 | |
%22 = vector.broadcast %21 : i32 to vector<i32> | |
vector.transfer_write %22, %12[] : vector<i32>, memref<i32, strided<[], offset: ?>> | |
} | |
} | |
} | |
return | |
} | |
} | |
} | |
} | |
// Host-side function: fills the input with 1s, runs the dispatch, and compares
// the dispatch result with a reference region filled with 128.
func.func private @_split_reduction_pass2() { | |
%c1048576 = arith.constant 1048576 : index | |
%c0 = arith.constant 0 : index | |
%c524288 = arith.constant 524288 : index | |
%c67108864 = arith.constant 67108864 : index | |
%c1_i32 = arith.constant 1 : i32 | |
%c128_i32 = arith.constant 128 : i32 | |
%c512 = arith.constant 512 : index | |
%c256 = arith.constant 256 : index | |
%c1 = arith.constant 1 : index | |
// Input resource of size 67108864 (= 512*256*128 elements x 4; presumably a
// byte size — confirm), filled entirely with the i32 value 1.
%0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c67108864} | |
%1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c67108864}) { | |
stream.cmd.fill %c1_i32, %arg0[%c0 for %c67108864] : i32 -> !stream.resource<transient>{%c67108864} | |
} => !stream.timepoint | |
%2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c67108864} | |
%3 = util.do_not_optimize(%2) : !stream.resource<transient> | |
%4 = stream.resource.size %3 : !stream.resource<transient> | |
// Output resource of size 1048576 = 2 x 524288: the lower half is later
// exported as the dispatch result, the upper half as the expected values.
%5 = stream.resource.alloc uninitialized : !stream.resource<external>{%c1048576} | |
%6 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%4}, %5 as %arg1: !stream.resource<external>{%c1048576}) { | |
// The dispatch and the fill of the upper half with 128 are issued in the same concurrent region.
stream.cmd.concurrent { | |
stream.cmd.dispatch @_split_reduction_pass2_dispatch_0::@_split_reduction_pass2_dispatch_0_generic_512x256x128[%c512, %c256, %c1] { | |
ro %arg0[%c0 for %4] : !stream.resource<transient>{%4}, | |
wo %arg1[%c0 for %c1048576] : !stream.resource<external>{%c1048576} | |
} attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>]} | |
stream.cmd.fill %c128_i32, %arg1[%c524288 for %c524288] : i32 -> !stream.resource<external>{%c1048576} | |
} | |
} => !stream.timepoint | |
%7 = stream.timepoint.await %6 => %5 : !stream.resource<external>{%c1048576} | |
// Split the resource at offset 524288 and export both halves as 512x256xi32 tensors.
%8 = stream.resource.subview %7[%c0] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288} | |
%9 = stream.resource.subview %7[%c524288] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288} | |
%10 = stream.tensor.export %8 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32> | |
%11 = stream.tensor.export %9 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32> | |
// Test assertion: dispatch output (%10) must equal the 128-filled reference (%11).
check.expect_eq(%10, %11) : tensor<512x256xi32> | |
return | |
} | |
} | |
// -----// IR Dump After LinalgStrategyEnablePass (iree-linalg-strategy-enable-pass) ('func.func' operation: @_split_reduction_pass2_dispatch_0_generic_512x256x128) //----- // | |
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}> | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#map0 = affine_map<()[s0] -> (s0 * 32)> | |
#map1 = affine_map<(d0, d1, d2) -> (d0, d1, d2)> | |
#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]> | |
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert> | |
// Module snapshot printed after the pass named in the dump header above.
// It holds three things: the public ABI stub, one HAL executable wrapping the
// dispatch function, and the private host function that launches the dispatch
// and checks its result.
module attributes {hal.device.targets = [#device_target_llvm_cpu]} { | |
// Public entry point: only forwards to the private @_split_reduction_pass2.
func.func @split_reduction_pass2() attributes {iree.abi.stub} { | |
call @_split_reduction_pass2() : () -> () | |
return | |
} | |
hal.executable private @_split_reduction_pass2_dispatch_0 { | |
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ { | |
// Export region: returns the constants (8, 16, 1).
// Presumably the workgroup count along x/y/z (256/32 = 8, 512/32 = 16) — confirm against the HAL dialect docs.
hal.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 ordinal(0) layout(#pipeline_layout) attributes {translation_info = #translation} { | |
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index): | |
%c1 = arith.constant 1 : index | |
%c16 = arith.constant 16 : index | |
%c8 = arith.constant 8 : index | |
hal.return %c8, %c16, %c1 : index, index, index | |
} | |
builtin.module { | |
// Dispatch body: sums the innermost (size 128) dimension of binding 0
// (512x256x128xi32) into binding 1 (512x256xi32) using integer addition.
// Each invocation handles the single 32x32 output tile selected by its workgroup ids.
func.func @_split_reduction_pass2_dispatch_0_generic_512x256x128() { | |
%cst = arith.constant dense<0> : vector<1x1x4xi32> | |
%c0_i32 = arith.constant 0 : i32 | |
%c4 = arith.constant 4 : index | |
%c1 = arith.constant 1 : index | |
%c32 = arith.constant 32 : index | |
%c0 = arith.constant 0 : index | |
// %0 is the 3-D input buffer, %1 is the 2-D output buffer.
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : memref<512x256x128xi32> | |
memref.assume_alignment %0, 64 : memref<512x256x128xi32> | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : memref<512x256xi32> | |
memref.assume_alignment %1, 64 : memref<512x256xi32> | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
// #map0 multiplies its symbol by 32: %2/%3 are the tile offsets along y/x.
%2 = affine.apply #map0()[%workgroup_id_y] | |
%3 = affine.apply #map0()[%workgroup_id_x] | |
// %4 is a 1x1x4 scratch buffer filled with zeros; it is later copied into
// the per-element accumulator buffer.
%4 = memref.alloca() {alignment = 128 : i64} : memref<1x1x4xi32> | |
vector.transfer_write %cst, %4[%c0, %c0, %c0] {in_bounds = [true, true, true]} : vector<1x1x4xi32>, memref<1x1x4xi32> | |
// %5 = 32x32 output tile, %6 = matching 32x32x128 input tile.
%5 = memref.subview %1[%2, %3] [32, 32] [1, 1] : memref<512x256xi32> to memref<32x32xi32, strided<[256, 1], offset: ?>> | |
%6 = memref.subview %0[%2, %3, 0] [32, 32, 128] [1, 1, 1] : memref<512x256x128xi32> to memref<32x32x128xi32, strided<[32768, 128, 1], offset: ?>> | |
// Within the tile: one row at a time (%arg0), four columns at a time (%arg1).
scf.for %arg0 = %c0 to %c32 step %c1 { | |
scf.for %arg1 = %c0 to %c32 step %c4 { | |
%7 = memref.subview %6[%arg0, %arg1, 0] [1, 4, 128] [1, 1, 1] : memref<32x32x128xi32, strided<[32768, 128, 1], offset: ?>> to memref<4x128xi32, strided<[128, 1], offset: ?>> | |
%8 = memref.expand_shape %7 [[0, 1], [2]] : memref<4x128xi32, strided<[128, 1], offset: ?>> into memref<1x4x128xi32, strided<[512, 128, 1], offset: ?>> | |
%9 = memref.subview %5[%arg0, %arg1] [1, 4] [1, 1] : memref<32x32xi32, strided<[256, 1], offset: ?>> to memref<4xi32, strided<[1], offset: ?>> | |
%10 = memref.expand_shape %9 [[0, 1]] : memref<4xi32, strided<[1], offset: ?>> into memref<1x4xi32, strided<[4, 1], offset: ?>> | |
// One output element per iteration of %arg2.
scf.for %arg2 = %c0 to %c4 step %c1 { | |
// %11 = the 128 input values for this output element, %12 = the output element itself.
%11 = memref.subview %8[0, %arg2, 0] [1, 1, 128] [1, 1, 1] : memref<1x4x128xi32, strided<[512, 128, 1], offset: ?>> to memref<128xi32, strided<[1], offset: ?>> | |
%12 = memref.subview %10[0, %arg2] [1, 1] [1, 1] : memref<1x4xi32, strided<[4, 1], offset: ?>> to memref<i32, strided<[], offset: ?>> | |
// View the 128 values as 32 rows of 4 so they can be summed 4 lanes at a time.
%13 = memref.expand_shape %11 [[0, 1, 2, 3]] : memref<128xi32, strided<[1], offset: ?>> into memref<1x1x32x4xi32, strided<[128, 128, 4, 1], offset: ?>> | |
// %14 is the 4-lane partial-sum buffer; the linalg.generic below copies
// the zero scratch buffer %4 into it.
%14 = memref.alloca() {alignment = 128 : i64} : memref<1x1x4xi32> | |
linalg.generic {indexing_maps = [#map1, #map1], iterator_types = ["parallel", "parallel", "parallel"]} ins(%4 : memref<1x1x4xi32>) outs(%14 : memref<1x1x4xi32>) { | |
^bb0(%arg3: i32, %arg4: i32): | |
linalg.yield %arg3 : i32 | |
} | |
// Load the initial partial sums once (%16), then add the 32 rows of 4
// values in a loop whose running sum is the loop-carried value %arg4.
%15 = memref.collapse_shape %14 [[0, 1, 2]] : memref<1x1x4xi32> into memref<4xi32> | |
%16 = vector.transfer_read %15[%c0], %c0_i32 {in_bounds = [true]} : memref<4xi32>, vector<4xi32> | |
%17 = scf.for %arg3 = %c0 to %c32 step %c1 iter_args(%arg4 = %16) -> (vector<4xi32>) { | |
%23 = vector.transfer_read %13[%c0, %c0, %arg3, %c0], %c0_i32 {in_bounds = [true]} : memref<1x1x32x4xi32, strided<[128, 128, 4, 1], offset: ?>>, vector<4xi32> | |
%24 = arith.addi %23, %arg4 : vector<4xi32> | |
scf.yield %24 : vector<4xi32> | |
} | |
// The loop result is stored to the accumulator buffer and read straight back as %18.
vector.transfer_write %17, %15[%c0] {in_bounds = [true]} : vector<4xi32>, memref<4xi32> | |
%18 = vector.transfer_read %15[%c0], %c0_i32 {in_bounds = [true]} : memref<4xi32>, vector<4xi32> | |
// Final step: add the 4 partial sums together with the value currently
// stored in the output element (%20) and write the scalar back.
// NOTE(review): the output element is read here as the accumulator, yet the
// host function below binds this resource as `wo`, allocates it
// `uninitialized`, and only fills its upper half — confirm this is intended.
%19 = vector.transfer_read %12[], %c0_i32 : memref<i32, strided<[], offset: ?>>, vector<i32> | |
%20 = vector.extractelement %19[] : vector<i32> | |
%21 = vector.multi_reduction <add>, %18, %20 [0] : vector<4xi32> to i32 | |
%22 = vector.broadcast %21 : i32 to vector<i32> | |
vector.transfer_write %22, %12[] : vector<i32>, memref<i32, strided<[], offset: ?>> | |
} | |
} | |
} | |
return | |
} | |
} | |
} | |
} | |
// Host-side function: fills the input with 1s, runs the dispatch, and compares
// the dispatch result with a reference region filled with 128.
func.func private @_split_reduction_pass2() { | |
%c1048576 = arith.constant 1048576 : index | |
%c0 = arith.constant 0 : index | |
%c524288 = arith.constant 524288 : index | |
%c67108864 = arith.constant 67108864 : index | |
%c1_i32 = arith.constant 1 : i32 | |
%c128_i32 = arith.constant 128 : i32 | |
%c512 = arith.constant 512 : index | |
%c256 = arith.constant 256 : index | |
%c1 = arith.constant 1 : index | |
// Input resource of size 67108864 (= 512*256*128 elements x 4; presumably a
// byte size — confirm), filled entirely with the i32 value 1.
%0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c67108864} | |
%1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c67108864}) { | |
stream.cmd.fill %c1_i32, %arg0[%c0 for %c67108864] : i32 -> !stream.resource<transient>{%c67108864} | |
} => !stream.timepoint | |
%2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c67108864} | |
%3 = util.do_not_optimize(%2) : !stream.resource<transient> | |
%4 = stream.resource.size %3 : !stream.resource<transient> | |
// Output resource of size 1048576 = 2 x 524288: the lower half is later
// exported as the dispatch result, the upper half as the expected values.
%5 = stream.resource.alloc uninitialized : !stream.resource<external>{%c1048576} | |
%6 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%4}, %5 as %arg1: !stream.resource<external>{%c1048576}) { | |
// The dispatch and the fill of the upper half with 128 are issued in the same concurrent region.
stream.cmd.concurrent { | |
stream.cmd.dispatch @_split_reduction_pass2_dispatch_0::@_split_reduction_pass2_dispatch_0_generic_512x256x128[%c512, %c256, %c1] { | |
ro %arg0[%c0 for %4] : !stream.resource<transient>{%4}, | |
wo %arg1[%c0 for %c1048576] : !stream.resource<external>{%c1048576} | |
} attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>]} | |
stream.cmd.fill %c128_i32, %arg1[%c524288 for %c524288] : i32 -> !stream.resource<external>{%c1048576} | |
} | |
} => !stream.timepoint | |
%7 = stream.timepoint.await %6 => %5 : !stream.resource<external>{%c1048576} | |
// Split the resource at offset 524288 and export both halves as 512x256xi32 tensors.
%8 = stream.resource.subview %7[%c0] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288} | |
%9 = stream.resource.subview %7[%c524288] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288} | |
%10 = stream.tensor.export %8 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32> | |
%11 = stream.tensor.export %9 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32> | |
// Test assertion: dispatch output (%10) must equal the 128-filled reference (%11).
check.expect_eq(%10, %11) : tensor<512x256xi32> | |
return | |
} | |
} | |
// -----// IR Dump After LinalgVectorLowering (linalg-vector-lowering) ('func.func' operation: @_split_reduction_pass2_dispatch_0_generic_512x256x128) //----- // | |
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}> | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#map0 = affine_map<()[s0] -> (s0 * 32)> | |
#map1 = affine_map<(d0, d1, d2) -> (d0, d1, d2)> | |
#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]> | |
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert> | |
module attributes {hal.device.targets = [#device_target_llvm_cpu]} { | |
func.func @split_reduction_pass2() attributes {iree.abi.stub} { | |
call @_split_reduction_pass2() : () -> () | |
return | |
} | |
hal.executable private @_split_reduction_pass2_dispatch_0 { | |
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ { | |
hal.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 ordinal(0) layout(#pipeline_layout) attributes {translation_info = #translation} { | |
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index): | |
%c1 = arith.constant 1 : index | |
%c16 = arith.constant 16 : index | |
%c8 = arith.constant 8 : index | |
hal.return %c8, %c16, %c1 : index, index, index | |
} | |
builtin.module { | |
func.func @_split_reduction_pass2_dispatch_0_generic_512x256x128() { | |
%cst = arith.constant dense<0> : vector<1x1x4xi32> | |
%c0_i32 = arith.constant 0 : i32 | |
%c4 = arith.constant 4 : index | |
%c1 = arith.constant 1 : index | |
%c32 = arith.constant 32 : index | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : memref<512x256x128xi32> | |
memref.assume_alignment %0, 64 : memref<512x256x128xi32> | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : memref<512x256xi32> | |
memref.assume_alignment %1, 64 : memref<512x256xi32> | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%2 = affine.apply #map0()[%workgroup_id_y] | |
%3 = affine.apply #map0()[%workgroup_id_x] | |
%4 = memref.alloca() {alignment = 128 : i64} : memref<1x1x4xi32> | |
vector.transfer_write %cst, %4[%c0, %c0, %c0] {in_bounds = [true, true, true]} : vector<1x1x4xi32>, memref<1x1x4xi32> | |
%5 = memref.subview %1[%2, %3] [32, 32] [1, 1] : memref<512x256xi32> to memref<32x32xi32, strided<[256, 1], offset: ?>> | |
%6 = memref.subview %0[%2, %3, 0] [32, 32, 128] [1, 1, 1] : memref<512x256x128xi32> to memref<32x32x128xi32, strided<[32768, 128, 1], offset: ?>> | |
scf.for %arg0 = %c0 to %c32 step %c1 { | |
scf.for %arg1 = %c0 to %c32 step %c4 { | |
%7 = memref.subview %6[%arg0, %arg1, 0] [1, 4, 128] [1, 1, 1] : memref<32x32x128xi32, strided<[32768, 128, 1], offset: ?>> to memref<4x128xi32, strided<[128, 1], offset: ?>> | |
%8 = memref.expand_shape %7 [[0, 1], [2]] : memref<4x128xi32, strided<[128, 1], offset: ?>> into memref<1x4x128xi32, strided<[512, 128, 1], offset: ?>> | |
%9 = memref.subview %5[%arg0, %arg1] [1, 4] [1, 1] : memref<32x32xi32, strided<[256, 1], offset: ?>> to memref<4xi32, strided<[1], offset: ?>> | |
%10 = memref.expand_shape %9 [[0, 1]] : memref<4xi32, strided<[1], offset: ?>> into memref<1x4xi32, strided<[4, 1], offset: ?>> | |
scf.for %arg2 = %c0 to %c4 step %c1 { | |
%11 = memref.subview %8[0, %arg2, 0] [1, 1, 128] [1, 1, 1] : memref<1x4x128xi32, strided<[512, 128, 1], offset: ?>> to memref<128xi32, strided<[1], offset: ?>> | |
%12 = memref.subview %10[0, %arg2] [1, 1] [1, 1] : memref<1x4xi32, strided<[4, 1], offset: ?>> to memref<i32, strided<[], offset: ?>> | |
%13 = memref.expand_shape %11 [[0, 1, 2, 3]] : memref<128xi32, strided<[1], offset: ?>> into memref<1x1x32x4xi32, strided<[128, 128, 4, 1], offset: ?>> | |
%14 = memref.alloca() {alignment = 128 : i64} : memref<1x1x4xi32> | |
linalg.generic {indexing_maps = [#map1, #map1], iterator_types = ["parallel", "parallel", "parallel"]} ins(%4 : memref<1x1x4xi32>) outs(%14 : memref<1x1x4xi32>) { | |
^bb0(%arg3: i32, %arg4: i32): | |
linalg.yield %arg3 : i32 | |
} | |
%15 = memref.collapse_shape %14 [[0, 1, 2]] : memref<1x1x4xi32> into memref<4xi32> | |
%16 = vector.transfer_read %15[%c0], %c0_i32 {in_bounds = [true]} : memref<4xi32>, vector<4xi32> | |
%17 = scf.for %arg3 = %c0 to %c32 step %c1 iter_args(%arg4 = %16) -> (vector<4xi32>) { | |
%23 = vector.transfer_read %13[%c0, %c0, %arg3, %c0], %c0_i32 {in_bounds = [true]} : memref<1x1x32x4xi32, strided<[128, 128, 4, 1], offset: ?>>, vector<4xi32> | |
%24 = arith.addi %23, %arg4 : vector<4xi32> | |
scf.yield %24 : vector<4xi32> | |
} | |
vector.transfer_write %17, %15[%c0] {in_bounds = [true]} : vector<4xi32>, memref<4xi32> | |
%18 = vector.transfer_read %15[%c0], %c0_i32 {in_bounds = [true]} : memref<4xi32>, vector<4xi32> | |
%19 = vector.transfer_read %12[], %c0_i32 : memref<i32, strided<[], offset: ?>>, vector<i32> | |
%20 = vector.extractelement %19[] : vector<i32> | |
%21 = vector.multi_reduction <add>, %18, %20 [0] : vector<4xi32> to i32 | |
%22 = vector.broadcast %21 : i32 to vector<i32> | |
vector.transfer_write %22, %12[] : vector<i32>, memref<i32, strided<[], offset: ?>> | |
} | |
} | |
} | |
return | |
} | |
} | |
} | |
} | |
func.func private @_split_reduction_pass2() { | |
%c1048576 = arith.constant 1048576 : index | |
%c0 = arith.constant 0 : index | |
%c524288 = arith.constant 524288 : index | |
%c67108864 = arith.constant 67108864 : index | |
%c1_i32 = arith.constant 1 : i32 | |
%c128_i32 = arith.constant 128 : i32 | |
%c512 = arith.constant 512 : index | |
%c256 = arith.constant 256 : index | |
%c1 = arith.constant 1 : index | |
%0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c67108864} | |
%1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c67108864}) { | |
stream.cmd.fill %c1_i32, %arg0[%c0 for %c67108864] : i32 -> !stream.resource<transient>{%c67108864} | |
} => !stream.timepoint | |
%2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c67108864} | |
%3 = util.do_not_optimize(%2) : !stream.resource<transient> | |
%4 = stream.resource.size %3 : !stream.resource<transient> | |
%5 = stream.resource.alloc uninitialized : !stream.resource<external>{%c1048576} | |
%6 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%4}, %5 as %arg1: !stream.resource<external>{%c1048576}) { | |
stream.cmd.concurrent { | |
stream.cmd.dispatch @_split_reduction_pass2_dispatch_0::@_split_reduction_pass2_dispatch_0_generic_512x256x128[%c512, %c256, %c1] { | |
ro %arg0[%c0 for %4] : !stream.resource<transient>{%4}, | |
wo %arg1[%c0 for %c1048576] : !stream.resource<external>{%c1048576} | |
} attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>]} | |
stream.cmd.fill %c128_i32, %arg1[%c524288 for %c524288] : i32 -> !stream.resource<external>{%c1048576} | |
} | |
} => !stream.timepoint | |
%7 = stream.timepoint.await %6 => %5 : !stream.resource<external>{%c1048576} | |
%8 = stream.resource.subview %7[%c0] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288} | |
%9 = stream.resource.subview %7[%c524288] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288} | |
%10 = stream.tensor.export %8 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32> | |
%11 = stream.tensor.export %9 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32> | |
check.expect_eq(%10, %11) : tensor<512x256xi32> | |
return | |
} | |
} | |
// -----// IR Dump After LinalgStrategyLowerVectorsPass (iree-linalg-strategy-lower-vectors-pass) ('func.func' operation: @_split_reduction_pass2_dispatch_0_generic_512x256x128) //----- // | |
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}> | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#map0 = affine_map<()[s0] -> (s0 * 32)> | |
#map1 = affine_map<(d0, d1, d2) -> (d0, d1, d2)> | |
#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]> | |
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert> | |
module attributes {hal.device.targets = [#device_target_llvm_cpu]} { | |
func.func @split_reduction_pass2() attributes {iree.abi.stub} { | |
call @_split_reduction_pass2() : () -> () | |
return | |
} | |
hal.executable private @_split_reduction_pass2_dispatch_0 { | |
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ { | |
hal.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 ordinal(0) layout(#pipeline_layout) attributes {translation_info = #translation} { | |
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index): | |
%c1 = arith.constant 1 : index | |
%c16 = arith.constant 16 : index | |
%c8 = arith.constant 8 : index | |
hal.return %c8, %c16, %c1 : index, index, index | |
} | |
builtin.module { | |
func.func @_split_reduction_pass2_dispatch_0_generic_512x256x128() { | |
%cst = arith.constant dense<0> : vector<1xi32> | |
%cst_0 = arith.constant dense<0> : vector<1x1x4xi32> | |
%c0_i32 = arith.constant 0 : i32 | |
%c4 = arith.constant 4 : index | |
%c1 = arith.constant 1 : index | |
%c32 = arith.constant 32 : index | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : memref<512x256x128xi32> | |
memref.assume_alignment %0, 64 : memref<512x256x128xi32> | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : memref<512x256xi32> | |
memref.assume_alignment %1, 64 : memref<512x256xi32> | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%2 = affine.apply #map0()[%workgroup_id_y] | |
%3 = affine.apply #map0()[%workgroup_id_x] | |
%4 = memref.alloca() {alignment = 128 : i64} : memref<1x1x4xi32> | |
vector.transfer_write %cst_0, %4[%c0, %c0, %c0] {in_bounds = [true, true, true]} : vector<1x1x4xi32>, memref<1x1x4xi32> | |
%5 = memref.subview %1[%2, %3] [32, 32] [1, 1] : memref<512x256xi32> to memref<32x32xi32, strided<[256, 1], offset: ?>> | |
%6 = memref.subview %0[%2, %3, 0] [32, 32, 128] [1, 1, 1] : memref<512x256x128xi32> to memref<32x32x128xi32, strided<[32768, 128, 1], offset: ?>> | |
scf.for %arg0 = %c0 to %c32 step %c1 { | |
scf.for %arg1 = %c0 to %c32 step %c4 { | |
%7 = memref.subview %6[%arg0, %arg1, 0] [1, 4, 128] [1, 1, 1] : memref<32x32x128xi32, strided<[32768, 128, 1], offset: ?>> to memref<4x128xi32, strided<[128, 1], offset: ?>> | |
%8 = memref.expand_shape %7 [[0, 1], [2]] : memref<4x128xi32, strided<[128, 1], offset: ?>> into memref<1x4x128xi32, strided<[512, 128, 1], offset: ?>> | |
%9 = memref.subview %5[%arg0, %arg1] [1, 4] [1, 1] : memref<32x32xi32, strided<[256, 1], offset: ?>> to memref<4xi32, strided<[1], offset: ?>> | |
%10 = memref.expand_shape %9 [[0, 1]] : memref<4xi32, strided<[1], offset: ?>> into memref<1x4xi32, strided<[4, 1], offset: ?>> | |
scf.for %arg2 = %c0 to %c4 step %c1 { | |
%11 = memref.subview %8[0, %arg2, 0] [1, 1, 128] [1, 1, 1] : memref<1x4x128xi32, strided<[512, 128, 1], offset: ?>> to memref<128xi32, strided<[1], offset: ?>> | |
%12 = memref.subview %10[0, %arg2] [1, 1] [1, 1] : memref<1x4xi32, strided<[4, 1], offset: ?>> to memref<i32, strided<[], offset: ?>> | |
%13 = memref.expand_shape %11 [[0, 1, 2, 3]] : memref<128xi32, strided<[1], offset: ?>> into memref<1x1x32x4xi32, strided<[128, 128, 4, 1], offset: ?>> | |
%14 = memref.alloca() {alignment = 128 : i64} : memref<1x1x4xi32> | |
linalg.generic {indexing_maps = [#map1, #map1], iterator_types = ["parallel", "parallel", "parallel"]} ins(%4 : memref<1x1x4xi32>) outs(%14 : memref<1x1x4xi32>) { | |
^bb0(%arg3: i32, %arg4: i32): | |
linalg.yield %arg3 : i32 | |
} | |
%15 = memref.collapse_shape %14 [[0, 1, 2]] : memref<1x1x4xi32> into memref<4xi32> | |
%16 = vector.transfer_read %15[%c0], %c0_i32 {in_bounds = [true]} : memref<4xi32>, vector<4xi32> | |
%17 = scf.for %arg3 = %c0 to %c32 step %c1 iter_args(%arg4 = %16) -> (vector<4xi32>) { | |
%25 = vector.transfer_read %13[%c0, %c0, %arg3, %c0], %c0_i32 {in_bounds = [true]} : memref<1x1x32x4xi32, strided<[128, 128, 4, 1], offset: ?>>, vector<4xi32> | |
%26 = arith.addi %25, %arg4 : vector<4xi32> | |
scf.yield %26 : vector<4xi32> | |
} | |
vector.transfer_write %17, %15[%c0] {in_bounds = [true]} : vector<4xi32>, memref<4xi32> | |
%18 = vector.transfer_read %15[%c0], %c0_i32 {in_bounds = [true]} : memref<4xi32>, vector<4xi32> | |
%19 = vector.transfer_read %12[], %c0_i32 : memref<i32, strided<[], offset: ?>>, vector<i32> | |
%20 = vector.extractelement %19[] : vector<i32> | |
%21 = vector.reduction <add>, %18, %20 : vector<4xi32> into i32 | |
%22 = vector.insertelement %21, %cst[%c0 : index] : vector<1xi32> | |
%23 = vector.extract %22[0] : vector<1xi32> | |
%24 = vector.broadcast %23 : i32 to vector<i32> | |
vector.transfer_write %24, %12[] : vector<i32>, memref<i32, strided<[], offset: ?>> | |
} | |
} | |
} | |
return | |
} | |
} | |
} | |
} | |
func.func private @_split_reduction_pass2() { | |
%c1048576 = arith.constant 1048576 : index | |
%c0 = arith.constant 0 : index | |
%c524288 = arith.constant 524288 : index | |
%c67108864 = arith.constant 67108864 : index | |
%c1_i32 = arith.constant 1 : i32 | |
%c128_i32 = arith.constant 128 : i32 | |
%c512 = arith.constant 512 : index | |
%c256 = arith.constant 256 : index | |
%c1 = arith.constant 1 : index | |
%0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c67108864} | |
%1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c67108864}) { | |
stream.cmd.fill %c1_i32, %arg0[%c0 for %c67108864] : i32 -> !stream.resource<transient>{%c67108864} | |
} => !stream.timepoint | |
%2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c67108864} | |
%3 = util.do_not_optimize(%2) : !stream.resource<transient> | |
%4 = stream.resource.size %3 : !stream.resource<transient> | |
%5 = stream.resource.alloc uninitialized : !stream.resource<external>{%c1048576} | |
%6 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%4}, %5 as %arg1: !stream.resource<external>{%c1048576}) { | |
stream.cmd.concurrent { | |
stream.cmd.dispatch @_split_reduction_pass2_dispatch_0::@_split_reduction_pass2_dispatch_0_generic_512x256x128[%c512, %c256, %c1] { | |
ro %arg0[%c0 for %4] : !stream.resource<transient>{%4}, | |
wo %arg1[%c0 for %c1048576] : !stream.resource<external>{%c1048576} | |
} attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>]} | |
stream.cmd.fill %c128_i32, %arg1[%c524288 for %c524288] : i32 -> !stream.resource<external>{%c1048576} | |
} | |
} => !stream.timepoint | |
%7 = stream.timepoint.await %6 => %5 : !stream.resource<external>{%c1048576} | |
%8 = stream.resource.subview %7[%c0] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288} | |
%9 = stream.resource.subview %7[%c524288] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288} | |
%10 = stream.tensor.export %8 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32> | |
%11 = stream.tensor.export %9 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32> | |
check.expect_eq(%10, %11) : tensor<512x256xi32> | |
return | |
} | |
} | |
// -----// IR Dump After LinalgVectorLowering (linalg-vector-lowering) ('func.func' operation: @_split_reduction_pass2_dispatch_0_generic_512x256x128) //----- // | |
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}> | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#map0 = affine_map<()[s0] -> (s0 * 32)> | |
#map1 = affine_map<(d0, d1, d2) -> (d0, d1, d2)> | |
#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]> | |
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert> | |
module attributes {hal.device.targets = [#device_target_llvm_cpu]} { | |
func.func @split_reduction_pass2() attributes {iree.abi.stub} { | |
call @_split_reduction_pass2() : () -> () | |
return | |
} | |
hal.executable private @_split_reduction_pass2_dispatch_0 { | |
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ { | |
hal.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 ordinal(0) layout(#pipeline_layout) attributes {translation_info = #translation} { | |
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index): | |
%c1 = arith.constant 1 : index | |
%c16 = arith.constant 16 : index | |
%c8 = arith.constant 8 : index | |
hal.return %c8, %c16, %c1 : index, index, index | |
} | |
builtin.module { | |
func.func @_split_reduction_pass2_dispatch_0_generic_512x256x128() { | |
%cst = arith.constant dense<0> : vector<1xi32> | |
%cst_0 = arith.constant dense<0> : vector<1x1x4xi32> | |
%c0_i32 = arith.constant 0 : i32 | |
%c4 = arith.constant 4 : index | |
%c1 = arith.constant 1 : index | |
%c32 = arith.constant 32 : index | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : memref<512x256x128xi32> | |
memref.assume_alignment %0, 64 : memref<512x256x128xi32> | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : memref<512x256xi32> | |
memref.assume_alignment %1, 64 : memref<512x256xi32> | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%2 = affine.apply #map0()[%workgroup_id_y] | |
%3 = affine.apply #map0()[%workgroup_id_x] | |
%4 = memref.alloca() {alignment = 128 : i64} : memref<1x1x4xi32> | |
vector.transfer_write %cst_0, %4[%c0, %c0, %c0] {in_bounds = [true, true, true]} : vector<1x1x4xi32>, memref<1x1x4xi32> | |
%5 = memref.subview %1[%2, %3] [32, 32] [1, 1] : memref<512x256xi32> to memref<32x32xi32, strided<[256, 1], offset: ?>> | |
%6 = memref.subview %0[%2, %3, 0] [32, 32, 128] [1, 1, 1] : memref<512x256x128xi32> to memref<32x32x128xi32, strided<[32768, 128, 1], offset: ?>> | |
scf.for %arg0 = %c0 to %c32 step %c1 { | |
scf.for %arg1 = %c0 to %c32 step %c4 { | |
%7 = memref.subview %6[%arg0, %arg1, 0] [1, 4, 128] [1, 1, 1] : memref<32x32x128xi32, strided<[32768, 128, 1], offset: ?>> to memref<4x128xi32, strided<[128, 1], offset: ?>> | |
%8 = memref.expand_shape %7 [[0, 1], [2]] : memref<4x128xi32, strided<[128, 1], offset: ?>> into memref<1x4x128xi32, strided<[512, 128, 1], offset: ?>> | |
%9 = memref.subview %5[%arg0, %arg1] [1, 4] [1, 1] : memref<32x32xi32, strided<[256, 1], offset: ?>> to memref<4xi32, strided<[1], offset: ?>> | |
%10 = memref.expand_shape %9 [[0, 1]] : memref<4xi32, strided<[1], offset: ?>> into memref<1x4xi32, strided<[4, 1], offset: ?>> | |
scf.for %arg2 = %c0 to %c4 step %c1 { | |
%11 = memref.subview %8[0, %arg2, 0] [1, 1, 128] [1, 1, 1] : memref<1x4x128xi32, strided<[512, 128, 1], offset: ?>> to memref<128xi32, strided<[1], offset: ?>> | |
%12 = memref.subview %10[0, %arg2] [1, 1] [1, 1] : memref<1x4xi32, strided<[4, 1], offset: ?>> to memref<i32, strided<[], offset: ?>> | |
%13 = memref.expand_shape %11 [[0, 1, 2, 3]] : memref<128xi32, strided<[1], offset: ?>> into memref<1x1x32x4xi32, strided<[128, 128, 4, 1], offset: ?>> | |
%14 = memref.alloca() {alignment = 128 : i64} : memref<1x1x4xi32> | |
linalg.generic {indexing_maps = [#map1, #map1], iterator_types = ["parallel", "parallel", "parallel"]} ins(%4 : memref<1x1x4xi32>) outs(%14 : memref<1x1x4xi32>) { | |
^bb0(%arg3: i32, %arg4: i32): | |
linalg.yield %arg3 : i32 | |
} | |
%15 = memref.collapse_shape %14 [[0, 1, 2]] : memref<1x1x4xi32> into memref<4xi32> | |
%16 = vector.transfer_read %15[%c0], %c0_i32 {in_bounds = [true]} : memref<4xi32>, vector<4xi32> | |
%17 = scf.for %arg3 = %c0 to %c32 step %c1 iter_args(%arg4 = %16) -> (vector<4xi32>) { | |
%25 = vector.transfer_read %13[%c0, %c0, %arg3, %c0], %c0_i32 {in_bounds = [true]} : memref<1x1x32x4xi32, strided<[128, 128, 4, 1], offset: ?>>, vector<4xi32> | |
%26 = arith.addi %25, %arg4 : vector<4xi32> | |
scf.yield %26 : vector<4xi32> | |
} | |
vector.transfer_write %17, %15[%c0] {in_bounds = [true]} : vector<4xi32>, memref<4xi32> | |
%18 = vector.transfer_read %15[%c0], %c0_i32 {in_bounds = [true]} : memref<4xi32>, vector<4xi32> | |
%19 = vector.transfer_read %12[], %c0_i32 : memref<i32, strided<[], offset: ?>>, vector<i32> | |
%20 = vector.extractelement %19[] : vector<i32> | |
%21 = vector.reduction <add>, %18, %20 : vector<4xi32> into i32 | |
%22 = vector.insertelement %21, %cst[%c0 : index] : vector<1xi32> | |
%23 = vector.extract %22[0] : vector<1xi32> | |
%24 = vector.broadcast %23 : i32 to vector<i32> | |
vector.transfer_write %24, %12[] : vector<i32>, memref<i32, strided<[], offset: ?>> | |
} | |
} | |
} | |
return | |
} | |
} | |
} | |
} | |
func.func private @_split_reduction_pass2() { | |
%c1048576 = arith.constant 1048576 : index | |
%c0 = arith.constant 0 : index | |
%c524288 = arith.constant 524288 : index | |
%c67108864 = arith.constant 67108864 : index | |
%c1_i32 = arith.constant 1 : i32 | |
%c128_i32 = arith.constant 128 : i32 | |
%c512 = arith.constant 512 : index | |
%c256 = arith.constant 256 : index | |
%c1 = arith.constant 1 : index | |
%0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c67108864} | |
%1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c67108864}) { | |
stream.cmd.fill %c1_i32, %arg0[%c0 for %c67108864] : i32 -> !stream.resource<transient>{%c67108864} | |
} => !stream.timepoint | |
%2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c67108864} | |
%3 = util.do_not_optimize(%2) : !stream.resource<transient> | |
%4 = stream.resource.size %3 : !stream.resource<transient> | |
%5 = stream.resource.alloc uninitialized : !stream.resource<external>{%c1048576} | |
%6 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%4}, %5 as %arg1: !stream.resource<external>{%c1048576}) { | |
stream.cmd.concurrent { | |
stream.cmd.dispatch @_split_reduction_pass2_dispatch_0::@_split_reduction_pass2_dispatch_0_generic_512x256x128[%c512, %c256, %c1] { | |
ro %arg0[%c0 for %4] : !stream.resource<transient>{%4}, | |
wo %arg1[%c0 for %c1048576] : !stream.resource<external>{%c1048576} | |
} attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>]} | |
stream.cmd.fill %c128_i32, %arg1[%c524288 for %c524288] : i32 -> !stream.resource<external>{%c1048576} | |
} | |
} => !stream.timepoint | |
%7 = stream.timepoint.await %6 => %5 : !stream.resource<external>{%c1048576} | |
%8 = stream.resource.subview %7[%c0] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288} | |
%9 = stream.resource.subview %7[%c524288] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288} | |
%10 = stream.tensor.export %8 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32> | |
%11 = stream.tensor.export %9 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32> | |
check.expect_eq(%10, %11) : tensor<512x256xi32> | |
return | |
} | |
} | |
// -----// IR Dump After LinalgStrategyLowerVectorsPass (iree-linalg-strategy-lower-vectors-pass) ('func.func' operation: @_split_reduction_pass2_dispatch_0_generic_512x256x128) //----- // | |
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}> | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#map0 = affine_map<()[s0] -> (s0 * 32)> | |
#map1 = affine_map<(d0, d1, d2) -> (d0, d1, d2)> | |
#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]> | |
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert> | |
module attributes {hal.device.targets = [#device_target_llvm_cpu]} { | |
func.func @split_reduction_pass2() attributes {iree.abi.stub} { | |
call @_split_reduction_pass2() : () -> () | |
return | |
} | |
hal.executable private @_split_reduction_pass2_dispatch_0 { | |
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ { | |
hal.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 ordinal(0) layout(#pipeline_layout) attributes {translation_info = #translation} { | |
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index): | |
%c1 = arith.constant 1 : index | |
%c16 = arith.constant 16 : index | |
%c8 = arith.constant 8 : index | |
hal.return %c8, %c16, %c1 : index, index, index | |
} | |
builtin.module { | |
func.func @_split_reduction_pass2_dispatch_0_generic_512x256x128() { | |
%cst = arith.constant dense<0> : vector<1xi32> | |
%cst_0 = arith.constant dense<0> : vector<1x1x4xi32> | |
%c4 = arith.constant 4 : index | |
%c1 = arith.constant 1 : index | |
%c32 = arith.constant 32 : index | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : memref<512x256x128xi32> | |
memref.assume_alignment %0, 64 : memref<512x256x128xi32> | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : memref<512x256xi32> | |
memref.assume_alignment %1, 64 : memref<512x256xi32> | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%2 = affine.apply #map0()[%workgroup_id_y] | |
%3 = affine.apply #map0()[%workgroup_id_x] | |
%4 = memref.alloca() {alignment = 128 : i64} : memref<1x1x4xi32> | |
vector.transfer_write %cst_0, %4[%c0, %c0, %c0] {in_bounds = [true, true, true]} : vector<1x1x4xi32>, memref<1x1x4xi32> | |
%5 = memref.subview %1[%2, %3] [32, 32] [1, 1] : memref<512x256xi32> to memref<32x32xi32, strided<[256, 1], offset: ?>> | |
%6 = memref.subview %0[%2, %3, 0] [32, 32, 128] [1, 1, 1] : memref<512x256x128xi32> to memref<32x32x128xi32, strided<[32768, 128, 1], offset: ?>> | |
scf.for %arg0 = %c0 to %c32 step %c1 { | |
scf.for %arg1 = %c0 to %c32 step %c4 { | |
%7 = memref.subview %6[%arg0, %arg1, 0] [1, 4, 128] [1, 1, 1] : memref<32x32x128xi32, strided<[32768, 128, 1], offset: ?>> to memref<4x128xi32, strided<[128, 1], offset: ?>> | |
%8 = memref.expand_shape %7 [[0, 1], [2]] : memref<4x128xi32, strided<[128, 1], offset: ?>> into memref<1x4x128xi32, strided<[512, 128, 1], offset: ?>> | |
%9 = memref.subview %5[%arg0, %arg1] [1, 4] [1, 1] : memref<32x32xi32, strided<[256, 1], offset: ?>> to memref<4xi32, strided<[1], offset: ?>> | |
%10 = memref.expand_shape %9 [[0, 1]] : memref<4xi32, strided<[1], offset: ?>> into memref<1x4xi32, strided<[4, 1], offset: ?>> | |
scf.for %arg2 = %c0 to %c4 step %c1 { | |
%11 = memref.subview %8[0, %arg2, 0] [1, 1, 128] [1, 1, 1] : memref<1x4x128xi32, strided<[512, 128, 1], offset: ?>> to memref<128xi32, strided<[1], offset: ?>> | |
%12 = memref.subview %10[0, %arg2] [1, 1] [1, 1] : memref<1x4xi32, strided<[4, 1], offset: ?>> to memref<i32, strided<[], offset: ?>> | |
%13 = memref.expand_shape %11 [[0, 1, 2, 3]] : memref<128xi32, strided<[1], offset: ?>> into memref<1x1x32x4xi32, strided<[128, 128, 4, 1], offset: ?>> | |
%14 = memref.alloca() {alignment = 128 : i64} : memref<1x1x4xi32> | |
linalg.generic {indexing_maps = [#map1, #map1], iterator_types = ["parallel", "parallel", "parallel"]} ins(%4 : memref<1x1x4xi32>) outs(%14 : memref<1x1x4xi32>) { | |
^bb0(%arg3: i32, %arg4: i32): | |
linalg.yield %arg3 : i32 | |
} | |
%15 = memref.collapse_shape %14 [[0, 1, 2]] : memref<1x1x4xi32> into memref<4xi32> | |
%16 = vector.load %15[%c0] : memref<4xi32>, vector<4xi32> | |
%17 = scf.for %arg3 = %c0 to %c32 step %c1 iter_args(%arg4 = %16) -> (vector<4xi32>) { | |
%27 = vector.load %13[%c0, %c0, %arg3, %c0] : memref<1x1x32x4xi32, strided<[128, 128, 4, 1], offset: ?>>, vector<4xi32> | |
%28 = arith.addi %27, %arg4 : vector<4xi32> | |
scf.yield %28 : vector<4xi32> | |
} | |
vector.store %17, %15[%c0] : memref<4xi32>, vector<4xi32> | |
%18 = vector.load %15[%c0] : memref<4xi32>, vector<4xi32> | |
%19 = memref.load %12[] : memref<i32, strided<[], offset: ?>> | |
%20 = vector.broadcast %19 : i32 to vector<i32> | |
%21 = vector.extractelement %20[] : vector<i32> | |
%22 = vector.reduction <add>, %18, %21 : vector<4xi32> into i32 | |
%23 = vector.insertelement %22, %cst[%c0 : index] : vector<1xi32> | |
%24 = vector.extract %23[0] : vector<1xi32> | |
%25 = vector.broadcast %24 : i32 to vector<i32> | |
%26 = vector.extractelement %25[] : vector<i32> | |
memref.store %26, %12[] : memref<i32, strided<[], offset: ?>> | |
} | |
} | |
} | |
return | |
} | |
} | |
} | |
} | |
func.func private @_split_reduction_pass2() { | |
%c1048576 = arith.constant 1048576 : index | |
%c0 = arith.constant 0 : index | |
%c524288 = arith.constant 524288 : index | |
%c67108864 = arith.constant 67108864 : index | |
%c1_i32 = arith.constant 1 : i32 | |
%c128_i32 = arith.constant 128 : i32 | |
%c512 = arith.constant 512 : index | |
%c256 = arith.constant 256 : index | |
%c1 = arith.constant 1 : index | |
%0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c67108864} | |
%1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c67108864}) { | |
stream.cmd.fill %c1_i32, %arg0[%c0 for %c67108864] : i32 -> !stream.resource<transient>{%c67108864} | |
} => !stream.timepoint | |
%2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c67108864} | |
%3 = util.do_not_optimize(%2) : !stream.resource<transient> | |
%4 = stream.resource.size %3 : !stream.resource<transient> | |
%5 = stream.resource.alloc uninitialized : !stream.resource<external>{%c1048576} | |
%6 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%4}, %5 as %arg1: !stream.resource<external>{%c1048576}) { | |
stream.cmd.concurrent { | |
stream.cmd.dispatch @_split_reduction_pass2_dispatch_0::@_split_reduction_pass2_dispatch_0_generic_512x256x128[%c512, %c256, %c1] { | |
ro %arg0[%c0 for %4] : !stream.resource<transient>{%4}, | |
wo %arg1[%c0 for %c1048576] : !stream.resource<external>{%c1048576} | |
} attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>]} | |
stream.cmd.fill %c128_i32, %arg1[%c524288 for %c524288] : i32 -> !stream.resource<external>{%c1048576} | |
} | |
} => !stream.timepoint | |
%7 = stream.timepoint.await %6 => %5 : !stream.resource<external>{%c1048576} | |
%8 = stream.resource.subview %7[%c0] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288} | |
%9 = stream.resource.subview %7[%c524288] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288} | |
%10 = stream.tensor.export %8 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32> | |
%11 = stream.tensor.export %9 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32> | |
check.expect_eq(%10, %11) : tensor<512x256xi32> | |
return | |
} | |
} | |
// -----// IR Dump After LinalgVectorLowering (linalg-vector-lowering) ('func.func' operation: @_split_reduction_pass2_dispatch_0_generic_512x256x128) //----- // | |
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}> | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#map0 = affine_map<()[s0] -> (s0 * 32)> | |
#map1 = affine_map<(d0, d1, d2) -> (d0, d1, d2)> | |
#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]> | |
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert> | |
module attributes {hal.device.targets = [#device_target_llvm_cpu]} { | |
// Entry point carrying the `iree.abi.stub` attribute. It takes no arguments,
// returns nothing, and only forwards to @_split_reduction_pass2.
func.func @split_reduction_pass2() attributes {iree.abi.stub} { | |
call @_split_reduction_pass2() : () -> () | |
return | |
} | |
// Executable holding the single dispatch function of this test, with one
// variant for the #executable_target_embedded_elf_x86_64_ target.
hal.executable private @_split_reduction_pass2_dispatch_0 { | |
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ { | |
// Export entry (ordinal 0). The region does not use its block arguments and
// returns the constants (8, 16, 1) -- presumably the workgroup count in
// (x, y, z); 8 * 32 = 256 and 16 * 32 = 512 match the tile offsets computed
// in the function below. TODO confirm.
hal.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 ordinal(0) layout(#pipeline_layout) attributes {translation_info = #translation} { | |
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index): | |
%c1 = arith.constant 1 : index | |
%c16 = arith.constant 16 : index | |
%c8 = arith.constant 8 : index | |
hal.return %c8, %c16, %c1 : index, index, index | |
} | |
builtin.module { | |
// Dispatch body. Binding 0 is viewed as memref<512x256x128xi32> and binding 1
// as memref<512x256xi32>. For every element of one 32x32 tile of binding 1 it
// adds the 128 values of the matching innermost row of binding 0 onto the
// value already stored in that element.
// NOTE(review): the accumulation starts from the current contents of binding 1
// (see the memref.load before the vector.reduction), so the stored result
// depends on what that buffer held before the dispatch ran.
func.func @_split_reduction_pass2_dispatch_0_generic_512x256x128() { | |
%cst = arith.constant dense<0> : vector<1xi32> | |
%cst_0 = arith.constant dense<0> : vector<1x1x4xi32> | |
%c4 = arith.constant 4 : index | |
%c1 = arith.constant 1 : index | |
%c32 = arith.constant 32 : index | |
%c0 = arith.constant 0 : index | |
// %0 is the 3-D source buffer, %1 the 2-D buffer that is both read and written.
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : memref<512x256x128xi32> | |
memref.assume_alignment %0, 64 : memref<512x256x128xi32> | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : memref<512x256xi32> | |
memref.assume_alignment %1, 64 : memref<512x256xi32> | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
// Tile origin: #map0 multiplies the workgroup id by 32. The y id (%2) is used
// as the dim-0 offset and the x id (%3) as the dim-1 offset.
%2 = affine.apply #map0()[%workgroup_id_y] | |
%3 = affine.apply #map0()[%workgroup_id_x] | |
// 4-element scratch buffer filled with zeros (%cst_0 is dense<0>).
%4 = memref.alloca() {alignment = 128 : i64} : memref<1x1x4xi32> | |
vector.transfer_write %cst_0, %4[%c0, %c0, %c0] {in_bounds = [true, true, true]} : vector<1x1x4xi32>, memref<1x1x4xi32> | |
// %5: 32x32 tile of binding 1; %6: the matching 32x32x128 tile of binding 0.
%5 = memref.subview %1[%2, %3] [32, 32] [1, 1] : memref<512x256xi32> to memref<32x32xi32, strided<[256, 1], offset: ?>> | |
%6 = memref.subview %0[%2, %3, 0] [32, 32, 128] [1, 1, 1] : memref<512x256x128xi32> to memref<32x32x128xi32, strided<[32768, 128, 1], offset: ?>> | |
// %arg0 walks the 32 tile rows; %arg1 walks the 32 tile columns in steps of 4.
scf.for %arg0 = %c0 to %c32 step %c1 { | |
scf.for %arg1 = %c0 to %c32 step %c4 { | |
%7 = memref.subview %6[%arg0, %arg1, 0] [1, 4, 128] [1, 1, 1] : memref<32x32x128xi32, strided<[32768, 128, 1], offset: ?>> to memref<4x128xi32, strided<[128, 1], offset: ?>> | |
%8 = memref.expand_shape %7 [[0, 1], [2]] : memref<4x128xi32, strided<[128, 1], offset: ?>> into memref<1x4x128xi32, strided<[512, 128, 1], offset: ?>> | |
%9 = memref.subview %5[%arg0, %arg1] [1, 4] [1, 1] : memref<32x32xi32, strided<[256, 1], offset: ?>> to memref<4xi32, strided<[1], offset: ?>> | |
%10 = memref.expand_shape %9 [[0, 1]] : memref<4xi32, strided<[1], offset: ?>> into memref<1x4xi32, strided<[4, 1], offset: ?>> | |
// %arg2 selects one of the 4 columns of the current group.
scf.for %arg2 = %c0 to %c4 step %c1 { | |
// %11: the 128 source values for this (row, column); %12: rank-0 view of the
// destination element; %13: the same 128 values viewed as 32 groups of 4.
%11 = memref.subview %8[0, %arg2, 0] [1, 1, 128] [1, 1, 1] : memref<1x4x128xi32, strided<[512, 128, 1], offset: ?>> to memref<128xi32, strided<[1], offset: ?>> | |
%12 = memref.subview %10[0, %arg2] [1, 1] [1, 1] : memref<1x4xi32, strided<[4, 1], offset: ?>> to memref<i32, strided<[], offset: ?>> | |
%13 = memref.expand_shape %11 [[0, 1, 2, 3]] : memref<128xi32, strided<[1], offset: ?>> into memref<1x1x32x4xi32, strided<[128, 128, 4, 1], offset: ?>> | |
// Fresh scratch buffer, initialised by copying %4 into it (the linalg.generic
// yields its input element unchanged).
%14 = memref.alloca() {alignment = 128 : i64} : memref<1x1x4xi32> | |
linalg.generic {indexing_maps = [#map1, #map1], iterator_types = ["parallel", "parallel", "parallel"]} ins(%4 : memref<1x1x4xi32>) outs(%14 : memref<1x1x4xi32>) { | |
^bb0(%arg3: i32, %arg4: i32): | |
linalg.yield %arg3 : i32 | |
} | |
%15 = memref.collapse_shape %14 [[0, 1, 2]] : memref<1x1x4xi32> into memref<4xi32> | |
%16 = vector.load %15[%c0] : memref<4xi32>, vector<4xi32> | |
// Lane-wise sum of the 32 groups of 4, starting from %16: 4 partial sums.
%17 = scf.for %arg3 = %c0 to %c32 step %c1 iter_args(%arg4 = %16) -> (vector<4xi32>) { | |
%27 = vector.load %13[%c0, %c0, %arg3, %c0] : memref<1x1x32x4xi32, strided<[128, 128, 4, 1], offset: ?>>, vector<4xi32> | |
%28 = arith.addi %27, %arg4 : vector<4xi32> | |
scf.yield %28 : vector<4xi32> | |
} | |
// The partial sums are stored to the scratch buffer and read straight back.
vector.store %17, %15[%c0] : memref<4xi32>, vector<4xi32> | |
%18 = vector.load %15[%c0] : memref<4xi32>, vector<4xi32> | |
// Current value of the destination element; the broadcast/extractelement pair
// hands back the same scalar.
%19 = memref.load %12[] : memref<i32, strided<[], offset: ?>> | |
%20 = vector.broadcast %19 : i32 to vector<i32> | |
%21 = vector.extractelement %20[] : vector<i32> | |
// Add-reduce the 4 partial sums, with %21 supplied as the extra accumulator
// operand.
%22 = vector.reduction <add>, %18, %21 : vector<4xi32> into i32 | |
// Round-trip of the scalar through vector<1xi32> and vector<i32> before it is
// written back to the destination element.
%23 = vector.insertelement %22, %cst[%c0 : index] : vector<1xi32> | |
%24 = vector.extract %23[0] : vector<1xi32> | |
%25 = vector.broadcast %24 : i32 to vector<i32> | |
%26 = vector.extractelement %25[] : vector<i32> | |
memref.store %26, %12[] : memref<i32, strided<[], offset: ?>> | |
} | |
} | |
} | |
return | |
} | |
} | |
} | |
} | |
// Test body: fills a source resource with the i32 value 1, runs the
// @_split_reduction_pass2_dispatch_0 dispatch with a second resource bound from
// offset 0, fills the upper half of that second resource with the i32 value
// 128, and finally compares the two halves as tensor<512x256xi32>.
// NOTE(review): the second resource is created with `alloc uninitialized`, and
// the only fill on it targets [524288, 1048576). Nothing in this function
// writes [0, 524288) before the dispatch, which binds the resource as `wo`. If
// the dispatch reads that range (e.g. as an accumulator), the compared result
// depends on uninitialized contents -- confirm.
func.func private @_split_reduction_pass2() { | |
%c1048576 = arith.constant 1048576 : index | |
%c0 = arith.constant 0 : index | |
%c524288 = arith.constant 524288 : index | |
%c67108864 = arith.constant 67108864 : index | |
%c1_i32 = arith.constant 1 : i32 | |
%c128_i32 = arith.constant 128 : i32 | |
%c512 = arith.constant 512 : index | |
%c256 = arith.constant 256 : index | |
%c1 = arith.constant 1 : index | |
// Source resource of size 67108864 (= 512 * 256 * 128 * 4), filled with 1s.
%0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c67108864} | |
%1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c67108864}) { | |
stream.cmd.fill %c1_i32, %arg0[%c0 for %c67108864] : i32 -> !stream.resource<transient>{%c67108864} | |
} => !stream.timepoint | |
%2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c67108864} | |
// The filled resource is passed through util.do_not_optimize and its size is
// re-queried instead of reusing the constant.
%3 = util.do_not_optimize(%2) : !stream.resource<transient> | |
%4 = stream.resource.size %3 : !stream.resource<transient> | |
// Result resource of size 1048576 (= 2 * 524288; 524288 = 512 * 256 * 4).
%5 = stream.resource.alloc uninitialized : !stream.resource<external>{%c1048576} | |
%6 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%4}, %5 as %arg1: !stream.resource<external>{%c1048576}) { | |
// The dispatch and the fill are grouped in one stream.cmd.concurrent region.
// The dispatch binds the whole range [0, 1048576) of %arg1, which also covers
// the range written by the fill.
stream.cmd.concurrent { | |
stream.cmd.dispatch @_split_reduction_pass2_dispatch_0::@_split_reduction_pass2_dispatch_0_generic_512x256x128[%c512, %c256, %c1] { | |
ro %arg0[%c0 for %4] : !stream.resource<transient>{%4}, | |
wo %arg1[%c0 for %c1048576] : !stream.resource<external>{%c1048576} | |
} attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>]} | |
stream.cmd.fill %c128_i32, %arg1[%c524288 for %c524288] : i32 -> !stream.resource<external>{%c1048576} | |
} | |
} => !stream.timepoint | |
%7 = stream.timepoint.await %6 => %5 : !stream.resource<external>{%c1048576} | |
// %8 is the half starting at offset 0, %9 the half starting at offset 524288
// (the one filled with 128).
%8 = stream.resource.subview %7[%c0] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288} | |
%9 = stream.resource.subview %7[%c524288] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288} | |
%10 = stream.tensor.export %8 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32> | |
%11 = stream.tensor.export %9 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32> | |
check.expect_eq(%10, %11) : tensor<512x256xi32> | |
return | |
} | |
} | |
// -----// IR Dump After LinalgStrategyLowerVectorsPass (iree-linalg-strategy-lower-vectors-pass) ('func.func' operation: @_split_reduction_pass2_dispatch_0_generic_512x256x128) //----- // | |
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}> | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#map0 = affine_map<()[s0] -> (s0 * 32)> | |
#map1 = affine_map<(d0, d1, d2) -> (d0, d1, d2)> | |
#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]> | |
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert> | |
module attributes {hal.device.targets = [#device_target_llvm_cpu]} { | |
// Entry point carrying the `iree.abi.stub` attribute. It takes no arguments,
// returns nothing, and only forwards to @_split_reduction_pass2.
func.func @split_reduction_pass2() attributes {iree.abi.stub} { | |
call @_split_reduction_pass2() : () -> () | |
return | |
} | |
// Executable holding the single dispatch function of this test, with one
// variant for the #executable_target_embedded_elf_x86_64_ target.
hal.executable private @_split_reduction_pass2_dispatch_0 { | |
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ { | |
// Export entry (ordinal 0). The region does not use its block arguments and
// returns the constants (8, 16, 1) -- presumably the workgroup count in
// (x, y, z); 8 * 32 = 256 and 16 * 32 = 512 match the tile offsets computed
// in the function below. TODO confirm.
hal.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 ordinal(0) layout(#pipeline_layout) attributes {translation_info = #translation} { | |
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index): | |
%c1 = arith.constant 1 : index | |
%c16 = arith.constant 16 : index | |
%c8 = arith.constant 8 : index | |
hal.return %c8, %c16, %c1 : index, index, index | |
} | |
builtin.module { | |
// Dispatch body. Binding 0 is viewed as memref<512x256x128xi32> and binding 1
// as memref<512x256xi32>. For every element of one 32x32 tile of binding 1 it
// adds the 128 values of the matching innermost row of binding 0 onto the
// value already stored in that element.
// NOTE(review): the accumulation starts from the current contents of binding 1
// (see the memref.load before the vector.reduction), so the stored result
// depends on what that buffer held before the dispatch ran.
func.func @_split_reduction_pass2_dispatch_0_generic_512x256x128() { | |
%c0 = arith.constant 0 : index | |
%cst = arith.constant dense<0> : vector<1xi32> | |
%cst_0 = arith.constant dense<0> : vector<1x1x4xi32> | |
%c4 = arith.constant 4 : index | |
%c1 = arith.constant 1 : index | |
%c32 = arith.constant 32 : index | |
// %0 is the 3-D source buffer, %1 the 2-D buffer that is both read and written.
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : memref<512x256x128xi32> | |
memref.assume_alignment %0, 64 : memref<512x256x128xi32> | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : memref<512x256xi32> | |
memref.assume_alignment %1, 64 : memref<512x256xi32> | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
// Tile origin: #map0 multiplies the workgroup id by 32. The y id (%2) is used
// as the dim-0 offset and the x id (%3) as the dim-1 offset.
%2 = affine.apply #map0()[%workgroup_id_y] | |
%3 = affine.apply #map0()[%workgroup_id_x] | |
// 4-element scratch buffer filled with zeros: %5 is the vector<4xi32> taken
// out of the dense<0> constant %cst_0 and stored with a plain vector.store.
%4 = memref.alloca() {alignment = 128 : i64} : memref<1x1x4xi32> | |
%5 = vector.extract %cst_0[0, 0] : vector<1x1x4xi32> | |
vector.store %5, %4[%c0, %c0, %c0] : memref<1x1x4xi32>, vector<4xi32> | |
// %6: 32x32 tile of binding 1; %7: the matching 32x32x128 tile of binding 0.
%6 = memref.subview %1[%2, %3] [32, 32] [1, 1] : memref<512x256xi32> to memref<32x32xi32, strided<[256, 1], offset: ?>> | |
%7 = memref.subview %0[%2, %3, 0] [32, 32, 128] [1, 1, 1] : memref<512x256x128xi32> to memref<32x32x128xi32, strided<[32768, 128, 1], offset: ?>> | |
// %arg0 walks the 32 tile rows; %arg1 walks the 32 tile columns in steps of 4.
scf.for %arg0 = %c0 to %c32 step %c1 { | |
scf.for %arg1 = %c0 to %c32 step %c4 { | |
%8 = memref.subview %7[%arg0, %arg1, 0] [1, 4, 128] [1, 1, 1] : memref<32x32x128xi32, strided<[32768, 128, 1], offset: ?>> to memref<4x128xi32, strided<[128, 1], offset: ?>> | |
%9 = memref.expand_shape %8 [[0, 1], [2]] : memref<4x128xi32, strided<[128, 1], offset: ?>> into memref<1x4x128xi32, strided<[512, 128, 1], offset: ?>> | |
%10 = memref.subview %6[%arg0, %arg1] [1, 4] [1, 1] : memref<32x32xi32, strided<[256, 1], offset: ?>> to memref<4xi32, strided<[1], offset: ?>> | |
%11 = memref.expand_shape %10 [[0, 1]] : memref<4xi32, strided<[1], offset: ?>> into memref<1x4xi32, strided<[4, 1], offset: ?>> | |
// %arg2 selects one of the 4 columns of the current group.
scf.for %arg2 = %c0 to %c4 step %c1 { | |
// %12: the 128 source values for this (row, column); %13: rank-0 view of the
// destination element; %14: the same 128 values viewed as 32 groups of 4.
%12 = memref.subview %9[0, %arg2, 0] [1, 1, 128] [1, 1, 1] : memref<1x4x128xi32, strided<[512, 128, 1], offset: ?>> to memref<128xi32, strided<[1], offset: ?>> | |
%13 = memref.subview %11[0, %arg2] [1, 1] [1, 1] : memref<1x4xi32, strided<[4, 1], offset: ?>> to memref<i32, strided<[], offset: ?>> | |
%14 = memref.expand_shape %12 [[0, 1, 2, 3]] : memref<128xi32, strided<[1], offset: ?>> into memref<1x1x32x4xi32, strided<[128, 128, 4, 1], offset: ?>> | |
// Fresh scratch buffer, initialised by copying %4 into it (the linalg.generic
// yields its input element unchanged).
%15 = memref.alloca() {alignment = 128 : i64} : memref<1x1x4xi32> | |
linalg.generic {indexing_maps = [#map1, #map1], iterator_types = ["parallel", "parallel", "parallel"]} ins(%4 : memref<1x1x4xi32>) outs(%15 : memref<1x1x4xi32>) { | |
^bb0(%arg3: i32, %arg4: i32): | |
linalg.yield %arg3 : i32 | |
} | |
%16 = memref.collapse_shape %15 [[0, 1, 2]] : memref<1x1x4xi32> into memref<4xi32> | |
%17 = vector.load %16[%c0] : memref<4xi32>, vector<4xi32> | |
// Lane-wise sum of the 32 groups of 4, starting from %17: 4 partial sums.
%18 = scf.for %arg3 = %c0 to %c32 step %c1 iter_args(%arg4 = %17) -> (vector<4xi32>) { | |
%28 = vector.load %14[%c0, %c0, %arg3, %c0] : memref<1x1x32x4xi32, strided<[128, 128, 4, 1], offset: ?>>, vector<4xi32> | |
%29 = arith.addi %28, %arg4 : vector<4xi32> | |
scf.yield %29 : vector<4xi32> | |
} | |
// The partial sums are stored to the scratch buffer and read straight back.
vector.store %18, %16[%c0] : memref<4xi32>, vector<4xi32> | |
%19 = vector.load %16[%c0] : memref<4xi32>, vector<4xi32> | |
// Current value of the destination element; the broadcast/extractelement pair
// hands back the same scalar.
%20 = memref.load %13[] : memref<i32, strided<[], offset: ?>> | |
%21 = vector.broadcast %20 : i32 to vector<i32> | |
%22 = vector.extractelement %21[] : vector<i32> | |
// Add-reduce the 4 partial sums, with %22 supplied as the extra accumulator
// operand.
%23 = vector.reduction <add>, %19, %22 : vector<4xi32> into i32 | |
// Round-trip of the scalar through vector<1xi32> and vector<i32> before it is
// written back to the destination element.
%24 = vector.insertelement %23, %cst[%c0 : index] : vector<1xi32> | |
%25 = vector.extract %24[0] : vector<1xi32> | |
%26 = vector.broadcast %25 : i32 to vector<i32> | |
%27 = vector.extractelement %26[] : vector<i32> | |
memref.store %27, %13[] : memref<i32, strided<[], offset: ?>> | |
} | |
} | |
} | |
return | |
} | |
} | |
} | |
} | |
// Test body: fills a source resource with the i32 value 1, runs the
// @_split_reduction_pass2_dispatch_0 dispatch with a second resource bound from
// offset 0, fills the upper half of that second resource with the i32 value
// 128, and finally compares the two halves as tensor<512x256xi32>.
// NOTE(review): the second resource is created with `alloc uninitialized`, and
// the only fill on it targets [524288, 1048576). Nothing in this function
// writes [0, 524288) before the dispatch, which binds the resource as `wo`. If
// the dispatch reads that range (e.g. as an accumulator), the compared result
// depends on uninitialized contents -- confirm.
func.func private @_split_reduction_pass2() { | |
%c1048576 = arith.constant 1048576 : index | |
%c0 = arith.constant 0 : index | |
%c524288 = arith.constant 524288 : index | |
%c67108864 = arith.constant 67108864 : index | |
%c1_i32 = arith.constant 1 : i32 | |
%c128_i32 = arith.constant 128 : i32 | |
%c512 = arith.constant 512 : index | |
%c256 = arith.constant 256 : index | |
%c1 = arith.constant 1 : index | |
// Source resource of size 67108864 (= 512 * 256 * 128 * 4), filled with 1s.
%0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c67108864} | |
%1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c67108864}) { | |
stream.cmd.fill %c1_i32, %arg0[%c0 for %c67108864] : i32 -> !stream.resource<transient>{%c67108864} | |
} => !stream.timepoint | |
%2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c67108864} | |
// The filled resource is passed through util.do_not_optimize and its size is
// re-queried instead of reusing the constant.
%3 = util.do_not_optimize(%2) : !stream.resource<transient> | |
%4 = stream.resource.size %3 : !stream.resource<transient> | |
// Result resource of size 1048576 (= 2 * 524288; 524288 = 512 * 256 * 4).
%5 = stream.resource.alloc uninitialized : !stream.resource<external>{%c1048576} | |
%6 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%4}, %5 as %arg1: !stream.resource<external>{%c1048576}) { | |
// The dispatch and the fill are grouped in one stream.cmd.concurrent region.
// The dispatch binds the whole range [0, 1048576) of %arg1, which also covers
// the range written by the fill.
stream.cmd.concurrent { | |
stream.cmd.dispatch @_split_reduction_pass2_dispatch_0::@_split_reduction_pass2_dispatch_0_generic_512x256x128[%c512, %c256, %c1] { | |
ro %arg0[%c0 for %4] : !stream.resource<transient>{%4}, | |
wo %arg1[%c0 for %c1048576] : !stream.resource<external>{%c1048576} | |
} attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>]} | |
stream.cmd.fill %c128_i32, %arg1[%c524288 for %c524288] : i32 -> !stream.resource<external>{%c1048576} | |
} | |
} => !stream.timepoint | |
%7 = stream.timepoint.await %6 => %5 : !stream.resource<external>{%c1048576} | |
// %8 is the half starting at offset 0, %9 the half starting at offset 524288
// (the one filled with 128).
%8 = stream.resource.subview %7[%c0] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288} | |
%9 = stream.resource.subview %7[%c524288] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288} | |
%10 = stream.tensor.export %8 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32> | |
%11 = stream.tensor.export %9 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32> | |
check.expect_eq(%10, %11) : tensor<512x256xi32> | |
return | |
} | |
} | |
// -----// IR Dump After LinalgVectorLowering (linalg-vector-lowering) ('func.func' operation: @_split_reduction_pass2_dispatch_0_generic_512x256x128) //----- // | |
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}> | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#map0 = affine_map<()[s0] -> (s0 * 32)> | |
#map1 = affine_map<(d0, d1, d2) -> (d0, d1, d2)> | |
#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]> | |
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert> | |
module attributes {hal.device.targets = [#device_target_llvm_cpu]} { | |
// Entry point carrying the `iree.abi.stub` attribute. It takes no arguments,
// returns nothing, and only forwards to @_split_reduction_pass2.
func.func @split_reduction_pass2() attributes {iree.abi.stub} { | |
call @_split_reduction_pass2() : () -> () | |
return | |
} | |
// Executable holding the single dispatch function of this test, with one
// variant for the #executable_target_embedded_elf_x86_64_ target.
hal.executable private @_split_reduction_pass2_dispatch_0 { | |
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ { | |
// Export entry (ordinal 0). The region does not use its block arguments and
// returns the constants (8, 16, 1) -- presumably the workgroup count in
// (x, y, z); 8 * 32 = 256 and 16 * 32 = 512 match the tile offsets computed
// in the function below. TODO confirm.
hal.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 ordinal(0) layout(#pipeline_layout) attributes {translation_info = #translation} { | |
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index): | |
%c1 = arith.constant 1 : index | |
%c16 = arith.constant 16 : index | |
%c8 = arith.constant 8 : index | |
hal.return %c8, %c16, %c1 : index, index, index | |
} | |
builtin.module { | |
// Dispatch body. Binding 0 is viewed as memref<512x256x128xi32> and binding 1
// as memref<512x256xi32>. For every element of one 32x32 tile of binding 1 it
// adds the 128 values of the matching innermost row of binding 0 onto the
// value already stored in that element.
// NOTE(review): the accumulation starts from the current contents of binding 1
// (see the memref.load before the vector.reduction), so the stored result
// depends on what that buffer held before the dispatch ran.
func.func @_split_reduction_pass2_dispatch_0_generic_512x256x128() { | |
%c0 = arith.constant 0 : index | |
%cst = arith.constant dense<0> : vector<1xi32> | |
%cst_0 = arith.constant dense<0> : vector<1x1x4xi32> | |
%c4 = arith.constant 4 : index | |
%c1 = arith.constant 1 : index | |
%c32 = arith.constant 32 : index | |
// %0 is the 3-D source buffer, %1 the 2-D buffer that is both read and written.
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : memref<512x256x128xi32> | |
memref.assume_alignment %0, 64 : memref<512x256x128xi32> | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : memref<512x256xi32> | |
memref.assume_alignment %1, 64 : memref<512x256xi32> | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
// Tile origin: #map0 multiplies the workgroup id by 32. The y id (%2) is used
// as the dim-0 offset and the x id (%3) as the dim-1 offset.
%2 = affine.apply #map0()[%workgroup_id_y] | |
%3 = affine.apply #map0()[%workgroup_id_x] | |
// 4-element scratch buffer filled with zeros: %5 is the vector<4xi32> taken
// out of the dense<0> constant %cst_0 and stored with a plain vector.store.
%4 = memref.alloca() {alignment = 128 : i64} : memref<1x1x4xi32> | |
%5 = vector.extract %cst_0[0, 0] : vector<1x1x4xi32> | |
vector.store %5, %4[%c0, %c0, %c0] : memref<1x1x4xi32>, vector<4xi32> | |
// %6: 32x32 tile of binding 1; %7: the matching 32x32x128 tile of binding 0.
%6 = memref.subview %1[%2, %3] [32, 32] [1, 1] : memref<512x256xi32> to memref<32x32xi32, strided<[256, 1], offset: ?>> | |
%7 = memref.subview %0[%2, %3, 0] [32, 32, 128] [1, 1, 1] : memref<512x256x128xi32> to memref<32x32x128xi32, strided<[32768, 128, 1], offset: ?>> | |
// %arg0 walks the 32 tile rows; %arg1 walks the 32 tile columns in steps of 4.
scf.for %arg0 = %c0 to %c32 step %c1 { | |
scf.for %arg1 = %c0 to %c32 step %c4 { | |
%8 = memref.subview %7[%arg0, %arg1, 0] [1, 4, 128] [1, 1, 1] : memref<32x32x128xi32, strided<[32768, 128, 1], offset: ?>> to memref<4x128xi32, strided<[128, 1], offset: ?>> | |
%9 = memref.expand_shape %8 [[0, 1], [2]] : memref<4x128xi32, strided<[128, 1], offset: ?>> into memref<1x4x128xi32, strided<[512, 128, 1], offset: ?>> | |
%10 = memref.subview %6[%arg0, %arg1] [1, 4] [1, 1] : memref<32x32xi32, strided<[256, 1], offset: ?>> to memref<4xi32, strided<[1], offset: ?>> | |
%11 = memref.expand_shape %10 [[0, 1]] : memref<4xi32, strided<[1], offset: ?>> into memref<1x4xi32, strided<[4, 1], offset: ?>> | |
// %arg2 selects one of the 4 columns of the current group.
scf.for %arg2 = %c0 to %c4 step %c1 { | |
// %12: the 128 source values for this (row, column); %13: rank-0 view of the
// destination element; %14: the same 128 values viewed as 32 groups of 4.
%12 = memref.subview %9[0, %arg2, 0] [1, 1, 128] [1, 1, 1] : memref<1x4x128xi32, strided<[512, 128, 1], offset: ?>> to memref<128xi32, strided<[1], offset: ?>> | |
%13 = memref.subview %11[0, %arg2] [1, 1] [1, 1] : memref<1x4xi32, strided<[4, 1], offset: ?>> to memref<i32, strided<[], offset: ?>> | |
%14 = memref.expand_shape %12 [[0, 1, 2, 3]] : memref<128xi32, strided<[1], offset: ?>> into memref<1x1x32x4xi32, strided<[128, 128, 4, 1], offset: ?>> | |
// Fresh scratch buffer, initialised by copying %4 into it (the linalg.generic
// yields its input element unchanged).
%15 = memref.alloca() {alignment = 128 : i64} : memref<1x1x4xi32> | |
linalg.generic {indexing_maps = [#map1, #map1], iterator_types = ["parallel", "parallel", "parallel"]} ins(%4 : memref<1x1x4xi32>) outs(%15 : memref<1x1x4xi32>) { | |
^bb0(%arg3: i32, %arg4: i32): | |
linalg.yield %arg3 : i32 | |
} | |
%16 = memref.collapse_shape %15 [[0, 1, 2]] : memref<1x1x4xi32> into memref<4xi32> | |
%17 = vector.load %16[%c0] : memref<4xi32>, vector<4xi32> | |
// Lane-wise sum of the 32 groups of 4, starting from %17: 4 partial sums.
%18 = scf.for %arg3 = %c0 to %c32 step %c1 iter_args(%arg4 = %17) -> (vector<4xi32>) { | |
%28 = vector.load %14[%c0, %c0, %arg3, %c0] : memref<1x1x32x4xi32, strided<[128, 128, 4, 1], offset: ?>>, vector<4xi32> | |
%29 = arith.addi %28, %arg4 : vector<4xi32> | |
scf.yield %29 : vector<4xi32> | |
} | |
// The partial sums are stored to the scratch buffer and read straight back.
vector.store %18, %16[%c0] : memref<4xi32>, vector<4xi32> | |
%19 = vector.load %16[%c0] : memref<4xi32>, vector<4xi32> | |
// Current value of the destination element; the broadcast/extractelement pair
// hands back the same scalar.
%20 = memref.load %13[] : memref<i32, strided<[], offset: ?>> | |
%21 = vector.broadcast %20 : i32 to vector<i32> | |
%22 = vector.extractelement %21[] : vector<i32> | |
// Add-reduce the 4 partial sums, with %22 supplied as the extra accumulator
// operand.
%23 = vector.reduction <add>, %19, %22 : vector<4xi32> into i32 | |
// Round-trip of the scalar through vector<1xi32> and vector<i32> before it is
// written back to the destination element.
%24 = vector.insertelement %23, %cst[%c0 : index] : vector<1xi32> | |
%25 = vector.extract %24[0] : vector<1xi32> | |
%26 = vector.broadcast %25 : i32 to vector<i32> | |
%27 = vector.extractelement %26[] : vector<i32> | |
memref.store %27, %13[] : memref<i32, strided<[], offset: ?>> | |
} | |
} | |
} | |
return | |
} | |
} | |
} | |
} | |
// Test body: fills a source resource with the i32 value 1, runs the
// @_split_reduction_pass2_dispatch_0 dispatch with a second resource bound from
// offset 0, fills the upper half of that second resource with the i32 value
// 128, and finally compares the two halves as tensor<512x256xi32>.
// NOTE(review): the second resource is created with `alloc uninitialized`, and
// the only fill on it targets [524288, 1048576). Nothing in this function
// writes [0, 524288) before the dispatch, which binds the resource as `wo`. If
// the dispatch reads that range (e.g. as an accumulator), the compared result
// depends on uninitialized contents -- confirm.
func.func private @_split_reduction_pass2() { | |
%c1048576 = arith.constant 1048576 : index | |
%c0 = arith.constant 0 : index | |
%c524288 = arith.constant 524288 : index | |
%c67108864 = arith.constant 67108864 : index | |
%c1_i32 = arith.constant 1 : i32 | |
%c128_i32 = arith.constant 128 : i32 | |
%c512 = arith.constant 512 : index | |
%c256 = arith.constant 256 : index | |
%c1 = arith.constant 1 : index | |
// Source resource of size 67108864 (= 512 * 256 * 128 * 4), filled with 1s.
%0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c67108864} | |
%1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c67108864}) { | |
stream.cmd.fill %c1_i32, %arg0[%c0 for %c67108864] : i32 -> !stream.resource<transient>{%c67108864} | |
} => !stream.timepoint | |
%2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c67108864} | |
// The filled resource is passed through util.do_not_optimize and its size is
// re-queried instead of reusing the constant.
%3 = util.do_not_optimize(%2) : !stream.resource<transient> | |
%4 = stream.resource.size %3 : !stream.resource<transient> | |
// Result resource of size 1048576 (= 2 * 524288; 524288 = 512 * 256 * 4).
%5 = stream.resource.alloc uninitialized : !stream.resource<external>{%c1048576} | |
%6 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%4}, %5 as %arg1: !stream.resource<external>{%c1048576}) { | |
// The dispatch and the fill are grouped in one stream.cmd.concurrent region.
// The dispatch binds the whole range [0, 1048576) of %arg1, which also covers
// the range written by the fill.
stream.cmd.concurrent { | |
stream.cmd.dispatch @_split_reduction_pass2_dispatch_0::@_split_reduction_pass2_dispatch_0_generic_512x256x128[%c512, %c256, %c1] { | |
ro %arg0[%c0 for %4] : !stream.resource<transient>{%4}, | |
wo %arg1[%c0 for %c1048576] : !stream.resource<external>{%c1048576} | |
} attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>]} | |
stream.cmd.fill %c128_i32, %arg1[%c524288 for %c524288] : i32 -> !stream.resource<external>{%c1048576} | |
} | |
} => !stream.timepoint | |
%7 = stream.timepoint.await %6 => %5 : !stream.resource<external>{%c1048576} | |
// %8 is the half starting at offset 0, %9 the half starting at offset 524288
// (the one filled with 128).
%8 = stream.resource.subview %7[%c0] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288} | |
%9 = stream.resource.subview %7[%c524288] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288} | |
%10 = stream.tensor.export %8 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32> | |
%11 = stream.tensor.export %9 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32> | |
check.expect_eq(%10, %11) : tensor<512x256xi32> | |
return | |
} | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) ('func.func' operation: @_split_reduction_pass2_dispatch_0_generic_512x256x128) //----- // | |
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}> | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#map0 = affine_map<()[s0] -> (s0 * 32)> | |
#map1 = affine_map<(d0, d1, d2) -> (d0, d1, d2)> | |
#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]> | |
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert> | |
module attributes {hal.device.targets = [#device_target_llvm_cpu]} { | |
// Entry point carrying the `iree.abi.stub` attribute. It takes no arguments,
// returns nothing, and only forwards to @_split_reduction_pass2.
func.func @split_reduction_pass2() attributes {iree.abi.stub} { | |
call @_split_reduction_pass2() : () -> () | |
return | |
} | |
// Executable holding the single dispatch function of this test, with one
// variant for the #executable_target_embedded_elf_x86_64_ target.
hal.executable private @_split_reduction_pass2_dispatch_0 { | |
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ { | |
// Export entry (ordinal 0). The region does not use its block arguments and
// returns the constants (8, 16, 1) -- presumably the workgroup count in
// (x, y, z); 8 * 32 = 256 and 16 * 32 = 512 match the tile offsets computed
// in the function below. TODO confirm.
hal.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 ordinal(0) layout(#pipeline_layout) attributes {translation_info = #translation} { | |
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index): | |
%c1 = arith.constant 1 : index | |
%c16 = arith.constant 16 : index | |
%c8 = arith.constant 8 : index | |
hal.return %c8, %c16, %c1 : index, index, index | |
} | |
builtin.module { | |
// Dispatch body. Binding 0 is viewed as memref<512x256x128xi32> and binding 1
// as memref<512x256xi32>. For every element of one 32x32 tile of binding 1 it
// adds the 128 values of the matching innermost row of binding 0 onto the
// value already stored in that element.
// NOTE(review): the accumulation starts from the current contents of binding 1
// (see the memref.load before the vector.reduction), so the stored result
// depends on what that buffer held before the dispatch ran.
func.func @_split_reduction_pass2_dispatch_0_generic_512x256x128() { | |
%cst = arith.constant dense<0> : vector<4xi32> | |
%c0 = arith.constant 0 : index | |
%cst_0 = arith.constant dense<0> : vector<1xi32> | |
%c4 = arith.constant 4 : index | |
%c1 = arith.constant 1 : index | |
%c32 = arith.constant 32 : index | |
// %0 is the 3-D source buffer, %1 the 2-D buffer that is both read and written.
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : memref<512x256x128xi32> | |
memref.assume_alignment %0, 64 : memref<512x256x128xi32> | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : memref<512x256xi32> | |
memref.assume_alignment %1, 64 : memref<512x256xi32> | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
// Tile origin: #map0 multiplies the workgroup id by 32. The y id (%2) is used
// as the dim-0 offset and the x id (%3) as the dim-1 offset.
%2 = affine.apply #map0()[%workgroup_id_y] | |
%3 = affine.apply #map0()[%workgroup_id_x] | |
// 4-element scratch buffer filled with zeros (%cst is dense<0>).
%4 = memref.alloca() {alignment = 128 : i64} : memref<1x1x4xi32> | |
vector.store %cst, %4[%c0, %c0, %c0] : memref<1x1x4xi32>, vector<4xi32> | |
// %5: 32x32 tile of binding 1; %6: the matching 32x32x128 tile of binding 0.
%5 = memref.subview %1[%2, %3] [32, 32] [1, 1] : memref<512x256xi32> to memref<32x32xi32, strided<[256, 1], offset: ?>> | |
%6 = memref.subview %0[%2, %3, 0] [32, 32, 128] [1, 1, 1] : memref<512x256x128xi32> to memref<32x32x128xi32, strided<[32768, 128, 1], offset: ?>> | |
// %arg0 walks the 32 tile rows; %arg1 walks the 32 tile columns in steps of 4.
scf.for %arg0 = %c0 to %c32 step %c1 { | |
scf.for %arg1 = %c0 to %c32 step %c4 { | |
%7 = memref.subview %6[%arg0, %arg1, 0] [1, 4, 128] [1, 1, 1] : memref<32x32x128xi32, strided<[32768, 128, 1], offset: ?>> to memref<4x128xi32, strided<[128, 1], offset: ?>> | |
%8 = memref.expand_shape %7 [[0, 1], [2]] : memref<4x128xi32, strided<[128, 1], offset: ?>> into memref<1x4x128xi32, strided<[512, 128, 1], offset: ?>> | |
%9 = memref.subview %5[%arg0, %arg1] [1, 4] [1, 1] : memref<32x32xi32, strided<[256, 1], offset: ?>> to memref<4xi32, strided<[1], offset: ?>> | |
%10 = memref.expand_shape %9 [[0, 1]] : memref<4xi32, strided<[1], offset: ?>> into memref<1x4xi32, strided<[4, 1], offset: ?>> | |
// %arg2 selects one of the 4 columns of the current group.
scf.for %arg2 = %c0 to %c4 step %c1 { | |
// %11: the 128 source values for this (row, column); %12: rank-0 view of the
// destination element; %13: the same 128 values viewed as 32 groups of 4.
%11 = memref.subview %8[0, %arg2, 0] [1, 1, 128] [1, 1, 1] : memref<1x4x128xi32, strided<[512, 128, 1], offset: ?>> to memref<128xi32, strided<[1], offset: ?>> | |
%12 = memref.subview %10[0, %arg2] [1, 1] [1, 1] : memref<1x4xi32, strided<[4, 1], offset: ?>> to memref<i32, strided<[], offset: ?>> | |
%13 = memref.expand_shape %11 [[0, 1, 2, 3]] : memref<128xi32, strided<[1], offset: ?>> into memref<1x1x32x4xi32, strided<[128, 128, 4, 1], offset: ?>> | |
// Fresh scratch buffer, initialised by copying %4 into it (the linalg.generic
// yields its input element unchanged).
%14 = memref.alloca() {alignment = 128 : i64} : memref<1x1x4xi32> | |
linalg.generic {indexing_maps = [#map1, #map1], iterator_types = ["parallel", "parallel", "parallel"]} ins(%4 : memref<1x1x4xi32>) outs(%14 : memref<1x1x4xi32>) { | |
^bb0(%arg3: i32, %arg4: i32): | |
linalg.yield %arg3 : i32 | |
} | |
%15 = memref.collapse_shape %14 [[0, 1, 2]] : memref<1x1x4xi32> into memref<4xi32> | |
%16 = vector.load %15[%c0] : memref<4xi32>, vector<4xi32> | |
// Lane-wise sum of the 32 groups of 4, starting from %16: 4 partial sums.
%17 = scf.for %arg3 = %c0 to %c32 step %c1 iter_args(%arg4 = %16) -> (vector<4xi32>) { | |
%27 = vector.load %13[%c0, %c0, %arg3, %c0] : memref<1x1x32x4xi32, strided<[128, 128, 4, 1], offset: ?>>, vector<4xi32> | |
%28 = arith.addi %27, %arg4 : vector<4xi32> | |
scf.yield %28 : vector<4xi32> | |
} | |
// The partial sums are stored to the scratch buffer and read straight back.
vector.store %17, %15[%c0] : memref<4xi32>, vector<4xi32> | |
%18 = vector.load %15[%c0] : memref<4xi32>, vector<4xi32> | |
// Current value of the destination element; the broadcast/extractelement pair
// hands back the same scalar.
%19 = memref.load %12[] : memref<i32, strided<[], offset: ?>> | |
%20 = vector.broadcast %19 : i32 to vector<i32> | |
%21 = vector.extractelement %20[] : vector<i32> | |
// Add-reduce the 4 partial sums, with %21 supplied as the extra accumulator
// operand.
%22 = vector.reduction <add>, %18, %21 : vector<4xi32> into i32 | |
// Round-trip of the scalar through vector<1xi32> and vector<i32> before it is
// written back to the destination element.
%23 = vector.insertelement %22, %cst_0[%c0 : index] : vector<1xi32> | |
%24 = vector.extract %23[0] : vector<1xi32> | |
%25 = vector.broadcast %24 : i32 to vector<i32> | |
%26 = vector.extractelement %25[] : vector<i32> | |
memref.store %26, %12[] : memref<i32, strided<[], offset: ?>> | |
} | |
} | |
} | |
return | |
} | |
} | |
} | |
} | |
// Test body: fills a source resource with the i32 value 1, runs the
// @_split_reduction_pass2_dispatch_0 dispatch with a second resource bound from
// offset 0, fills the upper half of that second resource with the i32 value
// 128, and finally compares the two halves as tensor<512x256xi32>.
// NOTE(review): the second resource is created with `alloc uninitialized`, and
// the only fill on it targets [524288, 1048576). Nothing in this function
// writes [0, 524288) before the dispatch, which binds the resource as `wo`. If
// the dispatch reads that range (e.g. as an accumulator), the compared result
// depends on uninitialized contents -- confirm.
func.func private @_split_reduction_pass2() { | |
%c1048576 = arith.constant 1048576 : index | |
%c0 = arith.constant 0 : index | |
%c524288 = arith.constant 524288 : index | |
%c67108864 = arith.constant 67108864 : index | |
%c1_i32 = arith.constant 1 : i32 | |
%c128_i32 = arith.constant 128 : i32 | |
%c512 = arith.constant 512 : index | |
%c256 = arith.constant 256 : index | |
%c1 = arith.constant 1 : index | |
// Source resource of size 67108864 (= 512 * 256 * 128 * 4), filled with 1s.
%0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c67108864} | |
%1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c67108864}) { | |
stream.cmd.fill %c1_i32, %arg0[%c0 for %c67108864] : i32 -> !stream.resource<transient>{%c67108864} | |
} => !stream.timepoint | |
%2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c67108864} | |
// The filled resource is passed through util.do_not_optimize and its size is
// re-queried instead of reusing the constant.
%3 = util.do_not_optimize(%2) : !stream.resource<transient> | |
%4 = stream.resource.size %3 : !stream.resource<transient> | |
// Result resource of size 1048576 (= 2 * 524288; 524288 = 512 * 256 * 4).
%5 = stream.resource.alloc uninitialized : !stream.resource<external>{%c1048576} | |
%6 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%4}, %5 as %arg1: !stream.resource<external>{%c1048576}) { | |
// The dispatch and the fill are grouped in one stream.cmd.concurrent region.
// The dispatch binds the whole range [0, 1048576) of %arg1, which also covers
// the range written by the fill.
stream.cmd.concurrent { | |
stream.cmd.dispatch @_split_reduction_pass2_dispatch_0::@_split_reduction_pass2_dispatch_0_generic_512x256x128[%c512, %c256, %c1] { | |
ro %arg0[%c0 for %4] : !stream.resource<transient>{%4}, | |
wo %arg1[%c0 for %c1048576] : !stream.resource<external>{%c1048576} | |
} attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>]} | |
stream.cmd.fill %c128_i32, %arg1[%c524288 for %c524288] : i32 -> !stream.resource<external>{%c1048576} | |
} | |
} => !stream.timepoint | |
%7 = stream.timepoint.await %6 => %5 : !stream.resource<external>{%c1048576} | |
// %8 is the half starting at offset 0, %9 the half starting at offset 524288
// (the one filled with 128).
%8 = stream.resource.subview %7[%c0] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288} | |
%9 = stream.resource.subview %7[%c524288] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288} | |
%10 = stream.tensor.export %8 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32> | |
%11 = stream.tensor.export %9 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32> | |
check.expect_eq(%10, %11) : tensor<512x256xi32> | |
return | |
} | |
} | |
// -----// IR Dump After LLVMCPULowerExecutableTarget (iree-llvmcpu-lower-executable-target) ('hal.executable.variant' operation: @embedded_elf_x86_64) //----- // | |
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}> | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#map0 = affine_map<()[s0] -> (s0 * 32)> | |
#map1 = affine_map<(d0, d1, d2) -> (d0, d1, d2)> | |
#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]> | |
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert> | |
module attributes {hal.device.targets = [#device_target_llvm_cpu]} { | |
// Entry point tagged `iree.abi.stub`: takes no arguments, returns nothing,
// and only forwards to the private @_split_reduction_pass2.
func.func @split_reduction_pass2() attributes {iree.abi.stub} { | |
call @_split_reduction_pass2() : () -> () | |
return | |
} | |
// Executable for dispatch 0 with a single variant, @embedded_elf_x86_64.
// State after LLVMCPULowerExecutableTarget: the reduction is already tiled
// into scf.for loops over memref views; one linalg.generic (a copy) remains.
hal.executable private @_split_reduction_pass2_dispatch_0 { | |
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ { | |
// Export region: returns the constant triple (8, 16, 1) and ignores
// %arg1..%arg3. Presumably the workgroup count (x=8, y=16, z=1): with the
// 32-wide tiles used in the body, 8*32 = 256 and 16*32 = 512 cover the
// 512x256 output.
hal.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 ordinal(0) layout(#pipeline_layout) attributes {translation_info = #translation} { | |
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index): | |
%c1 = arith.constant 1 : index | |
%c16 = arith.constant 16 : index | |
%c8 = arith.constant 8 : index | |
hal.return %c8, %c16, %c1 : index, index, index | |
} | |
builtin.module { | |
// Dispatch body: for every element of this workgroup's 32x32 output tile,
// sums the 128 i32 input values along the last input dimension and adds
// that sum to the value already stored in the output element.
func.func @_split_reduction_pass2_dispatch_0_generic_512x256x128() { | |
%cst = arith.constant dense<0> : vector<4xi32> | |
%c0 = arith.constant 0 : index | |
%cst_0 = arith.constant dense<0> : vector<1xi32> | |
%c4 = arith.constant 4 : index | |
%c1 = arith.constant 1 : index | |
%c32 = arith.constant 32 : index | |
// Binding 0 is the 512x256x128 i32 input, binding 1 the 512x256 i32 output.
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : memref<512x256x128xi32> | |
memref.assume_alignment %0, 64 : memref<512x256x128xi32> | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : memref<512x256xi32> | |
memref.assume_alignment %1, 64 : memref<512x256xi32> | |
// Tile origin: #map0 multiplies a workgroup id by 32. id[1] gives the
// offset in dimension 0 (%2), id[0] the offset in dimension 1 (%3).
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%2 = affine.apply #map0()[%workgroup_id_y] | |
%3 = affine.apply #map0()[%workgroup_id_x] | |
// %4 holds the zero vector %cst; it is the source for seeding each
// per-element accumulator below.
%4 = memref.alloca() {alignment = 128 : i64} : memref<1x1x4xi32> | |
vector.store %cst, %4[%c0, %c0, %c0] : memref<1x1x4xi32>, vector<4xi32> | |
// %5: 32x32 output tile; %6: matching 32x32x128 input tile.
%5 = memref.subview %1[%2, %3] [32, 32] [1, 1] : memref<512x256xi32> to memref<32x32xi32, strided<[256, 1], offset: ?>> | |
%6 = memref.subview %0[%2, %3, 0] [32, 32, 128] [1, 1, 1] : memref<512x256x128xi32> to memref<32x32x128xi32, strided<[32768, 128, 1], offset: ?>> | |
// Loop nest: %arg0 walks the 32 tile rows, %arg1 the tile columns in steps
// of 4, %arg2 the 4 columns inside each step.
scf.for %arg0 = %c0 to %c32 step %c1 { | |
scf.for %arg1 = %c0 to %c32 step %c4 { | |
%7 = memref.subview %6[%arg0, %arg1, 0] [1, 4, 128] [1, 1, 1] : memref<32x32x128xi32, strided<[32768, 128, 1], offset: ?>> to memref<4x128xi32, strided<[128, 1], offset: ?>> | |
%8 = memref.expand_shape %7 [[0, 1], [2]] : memref<4x128xi32, strided<[128, 1], offset: ?>> into memref<1x4x128xi32, strided<[512, 128, 1], offset: ?>> | |
%9 = memref.subview %5[%arg0, %arg1] [1, 4] [1, 1] : memref<32x32xi32, strided<[256, 1], offset: ?>> to memref<4xi32, strided<[1], offset: ?>> | |
%10 = memref.expand_shape %9 [[0, 1]] : memref<4xi32, strided<[1], offset: ?>> into memref<1x4xi32, strided<[4, 1], offset: ?>> | |
scf.for %arg2 = %c0 to %c4 step %c1 { | |
// %11: the 128 input values feeding one output element.
// %12: rank-0 view of that output element.
// %13: the same 128 values viewed as 32 groups of 4 lanes.
%11 = memref.subview %8[0, %arg2, 0] [1, 1, 128] [1, 1, 1] : memref<1x4x128xi32, strided<[512, 128, 1], offset: ?>> to memref<128xi32, strided<[1], offset: ?>> | |
%12 = memref.subview %10[0, %arg2] [1, 1] [1, 1] : memref<1x4xi32, strided<[4, 1], offset: ?>> to memref<i32, strided<[], offset: ?>> | |
%13 = memref.expand_shape %11 [[0, 1, 2, 3]] : memref<128xi32, strided<[1], offset: ?>> into memref<1x1x32x4xi32, strided<[128, 128, 4, 1], offset: ?>> | |
// Per-element accumulator buffer.
// NOTE(review): this alloca sits inside the innermost loop body, so it is
// executed once per output element — confirm it is hoisted or otherwise
// bounded by a later pass.
%14 = memref.alloca() {alignment = 128 : i64} : memref<1x1x4xi32> | |
// Copy of %4 into %14: the generic yields its input unchanged, so %14
// receives the zeros stored in %4.
linalg.generic {indexing_maps = [#map1, #map1], iterator_types = ["parallel", "parallel", "parallel"]} ins(%4 : memref<1x1x4xi32>) outs(%14 : memref<1x1x4xi32>) { | |
^bb0(%arg3: i32, %arg4: i32): | |
linalg.yield %arg3 : i32 | |
} | |
%15 = memref.collapse_shape %14 [[0, 1, 2]] : memref<1x1x4xi32> into memref<4xi32> | |
%16 = vector.load %15[%c0] : memref<4xi32>, vector<4xi32> | |
// Accumulate the 32 groups of 4 lanes into 4 partial sums, carried in %arg4.
%17 = scf.for %arg3 = %c0 to %c32 step %c1 iter_args(%arg4 = %16) -> (vector<4xi32>) { | |
%27 = vector.load %13[%c0, %c0, %arg3, %c0] : memref<1x1x32x4xi32, strided<[128, 128, 4, 1], offset: ?>>, vector<4xi32> | |
%28 = arith.addi %27, %arg4 : vector<4xi32> | |
scf.yield %28 : vector<4xi32> | |
} | |
// The partial sums are written to %15 and immediately read back as %18.
vector.store %17, %15[%c0] : memref<4xi32>, vector<4xi32> | |
%18 = vector.load %15[%c0] : memref<4xi32>, vector<4xi32> | |
// %19 is the value currently stored in the output element; it is passed
// (via %21) as the initial accumulator of the horizontal add below.
// NOTE(review): nothing in this function writes %12 before this load, so
// the result depends on what the caller left in the output buffer —
// confirm that buffer is initialized before the dispatch.
%19 = memref.load %12[] : memref<i32, strided<[], offset: ?>> | |
%20 = vector.broadcast %19 : i32 to vector<i32> | |
%21 = vector.extractelement %20[] : vector<i32> | |
// Horizontal add of the 4 partial sums plus %21.
%22 = vector.reduction <add>, %18, %21 : vector<4xi32> into i32 | |
// %23..%26 round-trip the scalar through 1-element and 0-d vectors; %26
// carries the same value as %22.
%23 = vector.insertelement %22, %cst_0[%c0 : index] : vector<1xi32> | |
%24 = vector.extract %23[0] : vector<1xi32> | |
%25 = vector.broadcast %24 : i32 to vector<i32> | |
%26 = vector.extractelement %25[] : vector<i32> | |
memref.store %26, %12[] : memref<i32, strided<[], offset: ?>> | |
} | |
} | |
} | |
return | |
} | |
} | |
} | |
} | |
// Host-side test driver expressed as stream-dialect commands: fills an input
// resource with the i32 value 1, runs dispatch 0 over it, fills a second
// region with 128, and checks that the two 512x256xi32 results are equal.
func.func private @_split_reduction_pass2() { | |
// Byte sizes used below: 67108864 = 512*256*128 i32 elements * 4 bytes,
// 524288 = 512*256 i32 elements * 4 bytes, 1048576 = 2 * 524288.
%c1048576 = arith.constant 1048576 : index | |
%c0 = arith.constant 0 : index | |
%c524288 = arith.constant 524288 : index | |
%c67108864 = arith.constant 67108864 : index | |
%c1_i32 = arith.constant 1 : i32 | |
%c128_i32 = arith.constant 128 : i32 | |
// %c512, %c256, %c1 form the workload passed to the dispatch.
%c512 = arith.constant 512 : index | |
%c256 = arith.constant 256 : index | |
%c1 = arith.constant 1 : index | |
// Input: a transient resource whose full 67108864 bytes are filled with 1.
%0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c67108864} | |
%1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c67108864}) { | |
stream.cmd.fill %c1_i32, %arg0[%c0 for %c67108864] : i32 -> !stream.resource<transient>{%c67108864} | |
} => !stream.timepoint | |
%2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c67108864} | |
// %3 is %2 passed through util.do_not_optimize; its size is then re-queried
// as %4 instead of reusing %c67108864.
%3 = util.do_not_optimize(%2) : !stream.resource<transient> | |
%4 = stream.resource.size %3 : !stream.resource<transient> | |
// One external allocation that holds both tensors compared at the end.
%5 = stream.resource.alloc uninitialized : !stream.resource<external>{%c1048576} | |
%6 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%4}, %5 as %arg1: !stream.resource<external>{%c1048576}) { | |
// The dispatch and the fill below sit in the same stream.cmd.concurrent region.
stream.cmd.concurrent { | |
// Dispatch 0 reads all of %arg0 and is bound write-only to the full
// 1048576 bytes of %arg1.
// NOTE(review): that declared range also spans [524288, 1048576), which the
// sibling fill writes — confirm the overlap is intended.
stream.cmd.dispatch @_split_reduction_pass2_dispatch_0::@_split_reduction_pass2_dispatch_0_generic_512x256x128[%c512, %c256, %c1] { | |
ro %arg0[%c0 for %4] : !stream.resource<transient>{%4}, | |
wo %arg1[%c0 for %c1048576] : !stream.resource<external>{%c1048576} | |
} attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>]} | |
// Expected value: 128 written over the second half, [524288, 1048576).
stream.cmd.fill %c128_i32, %arg1[%c524288 for %c524288] : i32 -> !stream.resource<external>{%c1048576} | |
} | |
} => !stream.timepoint | |
%7 = stream.timepoint.await %6 => %5 : !stream.resource<external>{%c1048576} | |
// %8 is the first 524288 bytes (where the dispatch's 512x256 output binding
// starts, at offset 0); %9 is the second half (the 128 fill).
%8 = stream.resource.subview %7[%c0] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288} | |
%9 = stream.resource.subview %7[%c524288] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288} | |
%10 = stream.tensor.export %8 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32> | |
%11 = stream.tensor.export %9 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32> | |
// NOTE(review): nothing in this function initializes bytes [0, 524288) of %5
// (allocated `uninitialized`) before the dispatch runs, while the dispatch
// body loads its output element before storing to it — confirm the compared
// result cannot depend on uninitialized memory.
check.expect_eq(%10, %11) : tensor<512x256xi32> | |
return | |
} | |
} | |
// -----// IR Dump After LinalgLowerToLoops (convert-linalg-to-loops) ('func.func' operation: @_split_reduction_pass2_dispatch_0_generic_512x256x128) //----- // | |
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}> | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#map = affine_map<()[s0] -> (s0 * 32)> | |
#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]> | |
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert> | |
module attributes {hal.device.targets = [#device_target_llvm_cpu]} { | |
// Entry point tagged `iree.abi.stub`: takes no arguments, returns nothing,
// and only forwards to the private @_split_reduction_pass2.
func.func @split_reduction_pass2() attributes {iree.abi.stub} { | |
call @_split_reduction_pass2() : () -> () | |
return | |
} | |
// Executable for dispatch 0 with a single variant, @embedded_elf_x86_64.
// State after LinalgLowerToLoops: the accumulator-seeding copy is now an
// explicit three-deep scf.for nest of scalar load/store.
hal.executable private @_split_reduction_pass2_dispatch_0 { | |
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ { | |
// Export region: returns the constant triple (8, 16, 1) and ignores
// %arg1..%arg3. Presumably the workgroup count (x=8, y=16, z=1): with the
// 32-wide tiles used in the body, 8*32 = 256 and 16*32 = 512 cover the
// 512x256 output.
hal.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 ordinal(0) layout(#pipeline_layout) attributes {translation_info = #translation} { | |
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index): | |
%c1 = arith.constant 1 : index | |
%c16 = arith.constant 16 : index | |
%c8 = arith.constant 8 : index | |
hal.return %c8, %c16, %c1 : index, index, index | |
} | |
builtin.module { | |
// Dispatch body: for every element of this workgroup's 32x32 output tile,
// sums the 128 i32 input values along the last input dimension and adds
// that sum to the value already stored in the output element.
func.func @_split_reduction_pass2_dispatch_0_generic_512x256x128() { | |
%cst = arith.constant dense<0> : vector<4xi32> | |
%c0 = arith.constant 0 : index | |
%cst_0 = arith.constant dense<0> : vector<1xi32> | |
%c4 = arith.constant 4 : index | |
%c1 = arith.constant 1 : index | |
%c32 = arith.constant 32 : index | |
// Binding 0 is the 512x256x128 i32 input, binding 1 the 512x256 i32 output.
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : memref<512x256x128xi32> | |
memref.assume_alignment %0, 64 : memref<512x256x128xi32> | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : memref<512x256xi32> | |
memref.assume_alignment %1, 64 : memref<512x256xi32> | |
// Tile origin: #map multiplies a workgroup id by 32. id[1] gives the
// offset in dimension 0 (%2), id[0] the offset in dimension 1 (%3).
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%2 = affine.apply #map()[%workgroup_id_y] | |
%3 = affine.apply #map()[%workgroup_id_x] | |
// %4 holds the zero vector %cst; it is the source for seeding each
// per-element accumulator below.
%4 = memref.alloca() {alignment = 128 : i64} : memref<1x1x4xi32> | |
vector.store %cst, %4[%c0, %c0, %c0] : memref<1x1x4xi32>, vector<4xi32> | |
// %5: 32x32 output tile; %6: matching 32x32x128 input tile.
%5 = memref.subview %1[%2, %3] [32, 32] [1, 1] : memref<512x256xi32> to memref<32x32xi32, strided<[256, 1], offset: ?>> | |
%6 = memref.subview %0[%2, %3, 0] [32, 32, 128] [1, 1, 1] : memref<512x256x128xi32> to memref<32x32x128xi32, strided<[32768, 128, 1], offset: ?>> | |
// Loop nest: %arg0 walks the 32 tile rows, %arg1 the tile columns in steps
// of 4, %arg2 the 4 columns inside each step.
scf.for %arg0 = %c0 to %c32 step %c1 { | |
scf.for %arg1 = %c0 to %c32 step %c4 { | |
%7 = memref.subview %6[%arg0, %arg1, 0] [1, 4, 128] [1, 1, 1] : memref<32x32x128xi32, strided<[32768, 128, 1], offset: ?>> to memref<4x128xi32, strided<[128, 1], offset: ?>> | |
%8 = memref.expand_shape %7 [[0, 1], [2]] : memref<4x128xi32, strided<[128, 1], offset: ?>> into memref<1x4x128xi32, strided<[512, 128, 1], offset: ?>> | |
%9 = memref.subview %5[%arg0, %arg1] [1, 4] [1, 1] : memref<32x32xi32, strided<[256, 1], offset: ?>> to memref<4xi32, strided<[1], offset: ?>> | |
%10 = memref.expand_shape %9 [[0, 1]] : memref<4xi32, strided<[1], offset: ?>> into memref<1x4xi32, strided<[4, 1], offset: ?>> | |
scf.for %arg2 = %c0 to %c4 step %c1 { | |
// %11: the 128 input values feeding one output element.
// %12: rank-0 view of that output element.
// %13: the same 128 values viewed as 32 groups of 4 lanes.
%11 = memref.subview %8[0, %arg2, 0] [1, 1, 128] [1, 1, 1] : memref<1x4x128xi32, strided<[512, 128, 1], offset: ?>> to memref<128xi32, strided<[1], offset: ?>> | |
%12 = memref.subview %10[0, %arg2] [1, 1] [1, 1] : memref<1x4xi32, strided<[4, 1], offset: ?>> to memref<i32, strided<[], offset: ?>> | |
%13 = memref.expand_shape %11 [[0, 1, 2, 3]] : memref<128xi32, strided<[1], offset: ?>> into memref<1x1x32x4xi32, strided<[128, 128, 4, 1], offset: ?>> | |
// Per-element accumulator buffer.
// NOTE(review): this alloca sits inside the innermost loop body, so it is
// executed once per output element — confirm it is hoisted or otherwise
// bounded by a later pass.
%14 = memref.alloca() {alignment = 128 : i64} : memref<1x1x4xi32> | |
// Element-wise copy of %4 (zeros) into %14. The two outer loops run from
// 0 to 1 and therefore execute a single iteration each.
scf.for %arg3 = %c0 to %c1 step %c1 { | |
scf.for %arg4 = %c0 to %c1 step %c1 { | |
scf.for %arg5 = %c0 to %c4 step %c1 { | |
%27 = memref.load %4[%arg3, %arg4, %arg5] : memref<1x1x4xi32> | |
memref.store %27, %14[%arg3, %arg4, %arg5] : memref<1x1x4xi32> | |
} | |
} | |
} | |
%15 = memref.collapse_shape %14 [[0, 1, 2]] : memref<1x1x4xi32> into memref<4xi32> | |
%16 = vector.load %15[%c0] : memref<4xi32>, vector<4xi32> | |
// Accumulate the 32 groups of 4 lanes into 4 partial sums, carried in %arg4.
%17 = scf.for %arg3 = %c0 to %c32 step %c1 iter_args(%arg4 = %16) -> (vector<4xi32>) { | |
%27 = vector.load %13[%c0, %c0, %arg3, %c0] : memref<1x1x32x4xi32, strided<[128, 128, 4, 1], offset: ?>>, vector<4xi32> | |
%28 = arith.addi %27, %arg4 : vector<4xi32> | |
scf.yield %28 : vector<4xi32> | |
} | |
// The partial sums are written to %15 and immediately read back as %18.
vector.store %17, %15[%c0] : memref<4xi32>, vector<4xi32> | |
%18 = vector.load %15[%c0] : memref<4xi32>, vector<4xi32> | |
// %19 is the value currently stored in the output element; it is passed
// (via %21) as the initial accumulator of the horizontal add below.
// NOTE(review): nothing in this function writes %12 before this load, so
// the result depends on what the caller left in the output buffer —
// confirm that buffer is initialized before the dispatch.
%19 = memref.load %12[] : memref<i32, strided<[], offset: ?>> | |
%20 = vector.broadcast %19 : i32 to vector<i32> | |
%21 = vector.extractelement %20[] : vector<i32> | |
// Horizontal add of the 4 partial sums plus %21.
%22 = vector.reduction <add>, %18, %21 : vector<4xi32> into i32 | |
// %23..%26 round-trip the scalar through 1-element and 0-d vectors; %26
// carries the same value as %22.
%23 = vector.insertelement %22, %cst_0[%c0 : index] : vector<1xi32> | |
%24 = vector.extract %23[0] : vector<1xi32> | |
%25 = vector.broadcast %24 : i32 to vector<i32> | |
%26 = vector.extractelement %25[] : vector<i32> | |
memref.store %26, %12[] : memref<i32, strided<[], offset: ?>> | |
} | |
} | |
} | |
return | |
} | |
} | |
} | |
} | |
// Host-side test driver expressed as stream-dialect commands: fills an input
// resource with the i32 value 1, runs dispatch 0 over it, fills a second
// region with 128, and checks that the two 512x256xi32 results are equal.
func.func private @_split_reduction_pass2() { | |
// Byte sizes used below: 67108864 = 512*256*128 i32 elements * 4 bytes,
// 524288 = 512*256 i32 elements * 4 bytes, 1048576 = 2 * 524288.
%c1048576 = arith.constant 1048576 : index | |
%c0 = arith.constant 0 : index | |
%c524288 = arith.constant 524288 : index | |
%c67108864 = arith.constant 67108864 : index | |
%c1_i32 = arith.constant 1 : i32 | |
%c128_i32 = arith.constant 128 : i32 | |
// %c512, %c256, %c1 form the workload passed to the dispatch.
%c512 = arith.constant 512 : index | |
%c256 = arith.constant 256 : index | |
%c1 = arith.constant 1 : index | |
// Input: a transient resource whose full 67108864 bytes are filled with 1.
%0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c67108864} | |
%1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c67108864}) { | |
stream.cmd.fill %c1_i32, %arg0[%c0 for %c67108864] : i32 -> !stream.resource<transient>{%c67108864} | |
} => !stream.timepoint | |
%2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c67108864} | |
// %3 is %2 passed through util.do_not_optimize; its size is then re-queried
// as %4 instead of reusing %c67108864.
%3 = util.do_not_optimize(%2) : !stream.resource<transient> | |
%4 = stream.resource.size %3 : !stream.resource<transient> | |
// One external allocation that holds both tensors compared at the end.
%5 = stream.resource.alloc uninitialized : !stream.resource<external>{%c1048576} | |
%6 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%4}, %5 as %arg1: !stream.resource<external>{%c1048576}) { | |
// The dispatch and the fill below sit in the same stream.cmd.concurrent region.
stream.cmd.concurrent { | |
// Dispatch 0 reads all of %arg0 and is bound write-only to the full
// 1048576 bytes of %arg1.
// NOTE(review): that declared range also spans [524288, 1048576), which the
// sibling fill writes — confirm the overlap is intended.
stream.cmd.dispatch @_split_reduction_pass2_dispatch_0::@_split_reduction_pass2_dispatch_0_generic_512x256x128[%c512, %c256, %c1] { | |
ro %arg0[%c0 for %4] : !stream.resource<transient>{%4}, | |
wo %arg1[%c0 for %c1048576] : !stream.resource<external>{%c1048576} | |
} attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>]} | |
// Expected value: 128 written over the second half, [524288, 1048576).
stream.cmd.fill %c128_i32, %arg1[%c524288 for %c524288] : i32 -> !stream.resource<external>{%c1048576} | |
} | |
} => !stream.timepoint | |
%7 = stream.timepoint.await %6 => %5 : !stream.resource<external>{%c1048576} | |
// %8 is the first 524288 bytes (where the dispatch's 512x256 output binding
// starts, at offset 0); %9 is the second half (the 128 fill).
%8 = stream.resource.subview %7[%c0] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288} | |
%9 = stream.resource.subview %7[%c524288] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288} | |
%10 = stream.tensor.export %8 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32> | |
%11 = stream.tensor.export %9 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32> | |
// NOTE(review): nothing in this function initializes bytes [0, 524288) of %5
// (allocated `uninitialized`) before the dispatch runs, while the dispatch
// body loads its output element before storing to it — confirm the compared
// result cannot depend on uninitialized memory.
check.expect_eq(%10, %11) : tensor<512x256xi32> | |
return | |
} | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) ('func.func' operation: @_split_reduction_pass2_dispatch_0_generic_512x256x128) //----- // | |
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}> | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#map = affine_map<()[s0] -> (s0 * 32)> | |
#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]> | |
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert> | |
module attributes {hal.device.targets = [#device_target_llvm_cpu]} { | |
// Entry point tagged `iree.abi.stub`: takes no arguments, returns nothing,
// and only forwards to the private @_split_reduction_pass2.
func.func @split_reduction_pass2() attributes {iree.abi.stub} { | |
call @_split_reduction_pass2() : () -> () | |
return | |
} | |
// Executable for dispatch 0 with a single variant, @embedded_elf_x86_64.
// State after Canonicalizer: the accumulator-seeding copy is a single
// scf.for over the 4 lanes, with the two leading indices fixed to %c0.
hal.executable private @_split_reduction_pass2_dispatch_0 { | |
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ { | |
// Export region: returns the constant triple (8, 16, 1) and ignores
// %arg1..%arg3. Presumably the workgroup count (x=8, y=16, z=1): with the
// 32-wide tiles used in the body, 8*32 = 256 and 16*32 = 512 cover the
// 512x256 output.
hal.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 ordinal(0) layout(#pipeline_layout) attributes {translation_info = #translation} { | |
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index): | |
%c1 = arith.constant 1 : index | |
%c16 = arith.constant 16 : index | |
%c8 = arith.constant 8 : index | |
hal.return %c8, %c16, %c1 : index, index, index | |
} | |
builtin.module { | |
// Dispatch body: for every element of this workgroup's 32x32 output tile,
// sums the 128 i32 input values along the last input dimension and adds
// that sum to the value already stored in the output element.
func.func @_split_reduction_pass2_dispatch_0_generic_512x256x128() { | |
%cst = arith.constant dense<0> : vector<4xi32> | |
%c0 = arith.constant 0 : index | |
%cst_0 = arith.constant dense<0> : vector<1xi32> | |
%c4 = arith.constant 4 : index | |
%c1 = arith.constant 1 : index | |
%c32 = arith.constant 32 : index | |
// Binding 0 is the 512x256x128 i32 input, binding 1 the 512x256 i32 output.
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : memref<512x256x128xi32> | |
memref.assume_alignment %0, 64 : memref<512x256x128xi32> | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : memref<512x256xi32> | |
memref.assume_alignment %1, 64 : memref<512x256xi32> | |
// Tile origin: #map multiplies a workgroup id by 32. id[1] gives the
// offset in dimension 0 (%2), id[0] the offset in dimension 1 (%3).
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%2 = affine.apply #map()[%workgroup_id_y] | |
%3 = affine.apply #map()[%workgroup_id_x] | |
// %4 holds the zero vector %cst; it is the source for seeding each
// per-element accumulator below.
%4 = memref.alloca() {alignment = 128 : i64} : memref<1x1x4xi32> | |
vector.store %cst, %4[%c0, %c0, %c0] : memref<1x1x4xi32>, vector<4xi32> | |
// %5: 32x32 output tile; %6: matching 32x32x128 input tile.
%5 = memref.subview %1[%2, %3] [32, 32] [1, 1] : memref<512x256xi32> to memref<32x32xi32, strided<[256, 1], offset: ?>> | |
%6 = memref.subview %0[%2, %3, 0] [32, 32, 128] [1, 1, 1] : memref<512x256x128xi32> to memref<32x32x128xi32, strided<[32768, 128, 1], offset: ?>> | |
// Loop nest: %arg0 walks the 32 tile rows, %arg1 the tile columns in steps
// of 4, %arg2 the 4 columns inside each step.
scf.for %arg0 = %c0 to %c32 step %c1 { | |
scf.for %arg1 = %c0 to %c32 step %c4 { | |
%7 = memref.subview %6[%arg0, %arg1, 0] [1, 4, 128] [1, 1, 1] : memref<32x32x128xi32, strided<[32768, 128, 1], offset: ?>> to memref<4x128xi32, strided<[128, 1], offset: ?>> | |
%8 = memref.expand_shape %7 [[0, 1], [2]] : memref<4x128xi32, strided<[128, 1], offset: ?>> into memref<1x4x128xi32, strided<[512, 128, 1], offset: ?>> | |
%9 = memref.subview %5[%arg0, %arg1] [1, 4] [1, 1] : memref<32x32xi32, strided<[256, 1], offset: ?>> to memref<4xi32, strided<[1], offset: ?>> | |
%10 = memref.expand_shape %9 [[0, 1]] : memref<4xi32, strided<[1], offset: ?>> into memref<1x4xi32, strided<[4, 1], offset: ?>> | |
scf.for %arg2 = %c0 to %c4 step %c1 { | |
// %11: the 128 input values feeding one output element.
// %12: rank-0 view of that output element.
// %13: the same 128 values viewed as 32 groups of 4 lanes.
%11 = memref.subview %8[0, %arg2, 0] [1, 1, 128] [1, 1, 1] : memref<1x4x128xi32, strided<[512, 128, 1], offset: ?>> to memref<128xi32, strided<[1], offset: ?>> | |
%12 = memref.subview %10[0, %arg2] [1, 1] [1, 1] : memref<1x4xi32, strided<[4, 1], offset: ?>> to memref<i32, strided<[], offset: ?>> | |
%13 = memref.expand_shape %11 [[0, 1, 2, 3]] : memref<128xi32, strided<[1], offset: ?>> into memref<1x1x32x4xi32, strided<[128, 128, 4, 1], offset: ?>> | |
// Per-element accumulator buffer.
// NOTE(review): this alloca sits inside the innermost loop body, so it is
// executed once per output element — confirm it is hoisted or otherwise
// bounded by a later pass.
%14 = memref.alloca() {alignment = 128 : i64} : memref<1x1x4xi32> | |
// Lane-by-lane copy of %4 (zeros) into %14.
scf.for %arg3 = %c0 to %c4 step %c1 { | |
%27 = memref.load %4[%c0, %c0, %arg3] : memref<1x1x4xi32> | |
memref.store %27, %14[%c0, %c0, %arg3] : memref<1x1x4xi32> | |
} | |
%15 = memref.collapse_shape %14 [[0, 1, 2]] : memref<1x1x4xi32> into memref<4xi32> | |
%16 = vector.load %15[%c0] : memref<4xi32>, vector<4xi32> | |
// Accumulate the 32 groups of 4 lanes into 4 partial sums, carried in %arg4.
%17 = scf.for %arg3 = %c0 to %c32 step %c1 iter_args(%arg4 = %16) -> (vector<4xi32>) { | |
%27 = vector.load %13[%c0, %c0, %arg3, %c0] : memref<1x1x32x4xi32, strided<[128, 128, 4, 1], offset: ?>>, vector<4xi32> | |
%28 = arith.addi %27, %arg4 : vector<4xi32> | |
scf.yield %28 : vector<4xi32> | |
} | |
// The partial sums are written to %15 and immediately read back as %18.
vector.store %17, %15[%c0] : memref<4xi32>, vector<4xi32> | |
%18 = vector.load %15[%c0] : memref<4xi32>, vector<4xi32> | |
// %19 is the value currently stored in the output element; it is passed
// (via %21) as the initial accumulator of the horizontal add below.
// NOTE(review): nothing in this function writes %12 before this load, so
// the result depends on what the caller left in the output buffer —
// confirm that buffer is initialized before the dispatch.
%19 = memref.load %12[] : memref<i32, strided<[], offset: ?>> | |
%20 = vector.broadcast %19 : i32 to vector<i32> | |
%21 = vector.extractelement %20[] : vector<i32> | |
// Horizontal add of the 4 partial sums plus %21.
%22 = vector.reduction <add>, %18, %21 : vector<4xi32> into i32 | |
// %23..%26 round-trip the scalar through 1-element and 0-d vectors; %26
// carries the same value as %22.
%23 = vector.insertelement %22, %cst_0[%c0 : index] : vector<1xi32> | |
%24 = vector.extract %23[0] : vector<1xi32> | |
%25 = vector.broadcast %24 : i32 to vector<i32> | |
%26 = vector.extractelement %25[] : vector<i32> | |
memref.store %26, %12[] : memref<i32, strided<[], offset: ?>> | |
} | |
} | |
} | |
return | |
} | |
} | |
} | |
} | |
// Host-side test driver expressed as stream-dialect commands: fills an input
// resource with the i32 value 1, runs dispatch 0 over it, fills a second
// region with 128, and checks that the two 512x256xi32 results are equal.
func.func private @_split_reduction_pass2() { | |
// Byte sizes used below: 67108864 = 512*256*128 i32 elements * 4 bytes,
// 524288 = 512*256 i32 elements * 4 bytes, 1048576 = 2 * 524288.
%c1048576 = arith.constant 1048576 : index | |
%c0 = arith.constant 0 : index | |
%c524288 = arith.constant 524288 : index | |
%c67108864 = arith.constant 67108864 : index | |
%c1_i32 = arith.constant 1 : i32 | |
%c128_i32 = arith.constant 128 : i32 | |
// %c512, %c256, %c1 form the workload passed to the dispatch.
%c512 = arith.constant 512 : index | |
%c256 = arith.constant 256 : index | |
%c1 = arith.constant 1 : index | |
// Input: a transient resource whose full 67108864 bytes are filled with 1.
%0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c67108864} | |
%1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c67108864}) { | |
stream.cmd.fill %c1_i32, %arg0[%c0 for %c67108864] : i32 -> !stream.resource<transient>{%c67108864} | |
} => !stream.timepoint | |
%2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c67108864} | |
// %3 is %2 passed through util.do_not_optimize; its size is then re-queried
// as %4 instead of reusing %c67108864.
%3 = util.do_not_optimize(%2) : !stream.resource<transient> | |
%4 = stream.resource.size %3 : !stream.resource<transient> | |
// One external allocation that holds both tensors compared at the end.
%5 = stream.resource.alloc uninitialized : !stream.resource<external>{%c1048576} | |
%6 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%4}, %5 as %arg1: !stream.resource<external>{%c1048576}) { | |
// The dispatch and the fill below sit in the same stream.cmd.concurrent region.
stream.cmd.concurrent { | |
// Dispatch 0 reads all of %arg0 and is bound write-only to the full
// 1048576 bytes of %arg1.
// NOTE(review): that declared range also spans [524288, 1048576), which the
// sibling fill writes — confirm the overlap is intended.
stream.cmd.dispatch @_split_reduction_pass2_dispatch_0::@_split_reduction_pass2_dispatch_0_generic_512x256x128[%c512, %c256, %c1] { | |
ro %arg0[%c0 for %4] : !stream.resource<transient>{%4}, | |
wo %arg1[%c0 for %c1048576] : !stream.resource<external>{%c1048576} | |
} attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>]} | |
// Expected value: 128 written over the second half, [524288, 1048576).
stream.cmd.fill %c128_i32, %arg1[%c524288 for %c524288] : i32 -> !stream.resource<external>{%c1048576} | |
} | |
} => !stream.timepoint | |
%7 = stream.timepoint.await %6 => %5 : !stream.resource<external>{%c1048576} | |
// %8 is the first 524288 bytes (where the dispatch's 512x256 output binding
// starts, at offset 0); %9 is the second half (the 128 fill).
%8 = stream.resource.subview %7[%c0] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288} | |
%9 = stream.resource.subview %7[%c524288] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288} | |
%10 = stream.tensor.export %8 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32> | |
%11 = stream.tensor.export %9 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32> | |
// NOTE(review): nothing in this function initializes bytes [0, 524288) of %5
// (allocated `uninitialized`) before the dispatch runs, while the dispatch
// body loads its output element before storing to it — confirm the compared
// result cannot depend on uninitialized memory.
check.expect_eq(%10, %11) : tensor<512x256xi32> | |
return | |
} | |
} | |
// -----// IR Dump After SCFToControlFlow (convert-scf-to-cf) ('func.func' operation: @_split_reduction_pass2_dispatch_0_generic_512x256x128) //----- // | |
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}> | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#map = affine_map<()[s0] -> (s0 * 32)> | |
#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]> | |
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert> | |
module attributes {hal.device.targets = [#device_target_llvm_cpu]} { | |
// Entry point tagged `iree.abi.stub`: takes no arguments, returns nothing,
// and only forwards to the private @_split_reduction_pass2.
func.func @split_reduction_pass2() attributes {iree.abi.stub} { | |
call @_split_reduction_pass2() : () -> () | |
return | |
} | |
// Executable for dispatch 0 with a single variant, @embedded_elf_x86_64.
// State after SCFToControlFlow: every loop is now a set of basic blocks
// linked by cf.br / cf.cond_br, with induction variables as block arguments.
hal.executable private @_split_reduction_pass2_dispatch_0 { | |
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ { | |
// Export region: returns the constant triple (8, 16, 1) and ignores
// %arg1..%arg3. Presumably the workgroup count (x=8, y=16, z=1): with the
// 32-wide tiles used in the body, 8*32 = 256 and 16*32 = 512 cover the
// 512x256 output.
hal.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 ordinal(0) layout(#pipeline_layout) attributes {translation_info = #translation} { | |
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index): | |
%c1 = arith.constant 1 : index | |
%c16 = arith.constant 16 : index | |
%c8 = arith.constant 8 : index | |
hal.return %c8, %c16, %c1 : index, index, index | |
} | |
builtin.module { | |
// Dispatch body: for every element of this workgroup's 32x32 output tile,
// sums the 128 i32 input values along the last input dimension and adds
// that sum to the value already stored in the output element.
func.func @_split_reduction_pass2_dispatch_0_generic_512x256x128() { | |
%cst = arith.constant dense<0> : vector<4xi32> | |
%c0 = arith.constant 0 : index | |
%cst_0 = arith.constant dense<0> : vector<1xi32> | |
%c4 = arith.constant 4 : index | |
%c1 = arith.constant 1 : index | |
%c32 = arith.constant 32 : index | |
// Binding 0 is the 512x256x128 i32 input, binding 1 the 512x256 i32 output.
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : memref<512x256x128xi32> | |
memref.assume_alignment %0, 64 : memref<512x256x128xi32> | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : memref<512x256xi32> | |
memref.assume_alignment %1, 64 : memref<512x256xi32> | |
// Tile origin: #map multiplies a workgroup id by 32. id[1] gives the
// offset in dimension 0 (%2), id[0] the offset in dimension 1 (%3).
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%2 = affine.apply #map()[%workgroup_id_y] | |
%3 = affine.apply #map()[%workgroup_id_x] | |
// %4 holds the zero vector %cst; it is the source for seeding each
// per-element accumulator below.
%4 = memref.alloca() {alignment = 128 : i64} : memref<1x1x4xi32> | |
vector.store %cst, %4[%c0, %c0, %c0] : memref<1x1x4xi32>, vector<4xi32> | |
// %5: 32x32 output tile; %6: matching 32x32x128 input tile.
%5 = memref.subview %1[%2, %3] [32, 32] [1, 1] : memref<512x256xi32> to memref<32x32xi32, strided<[256, 1], offset: ?>> | |
%6 = memref.subview %0[%2, %3, 0] [32, 32, 128] [1, 1, 1] : memref<512x256x128xi32> to memref<32x32x128xi32, strided<[32768, 128, 1], offset: ?>> | |
cf.br ^bb1(%c0 : index) | |
// ^bb1: header of the row loop; %7 runs from 0 while %7 < 32 (step 1, see ^bb14).
^bb1(%7: index): // 2 preds: ^bb0, ^bb14 | |
%8 = arith.cmpi slt, %7, %c32 : index | |
cf.cond_br %8, ^bb2, ^bb15 | |
^bb2: // pred: ^bb1 | |
cf.br ^bb3(%c0 : index) | |
// ^bb3: header of the column loop; %9 runs from 0 while %9 < 32 (step 4, see ^bb13).
^bb3(%9: index): // 2 preds: ^bb2, ^bb13 | |
%10 = arith.cmpi slt, %9, %c32 : index | |
cf.cond_br %10, ^bb4, ^bb14 | |
// ^bb4: views of the 4 output elements at (%7, %9..%9+3) and their inputs.
^bb4: // pred: ^bb3 | |
%11 = memref.subview %6[%7, %9, 0] [1, 4, 128] [1, 1, 1] : memref<32x32x128xi32, strided<[32768, 128, 1], offset: ?>> to memref<4x128xi32, strided<[128, 1], offset: ?>> | |
%12 = memref.expand_shape %11 [[0, 1], [2]] : memref<4x128xi32, strided<[128, 1], offset: ?>> into memref<1x4x128xi32, strided<[512, 128, 1], offset: ?>> | |
%13 = memref.subview %5[%7, %9] [1, 4] [1, 1] : memref<32x32xi32, strided<[256, 1], offset: ?>> to memref<4xi32, strided<[1], offset: ?>> | |
%14 = memref.expand_shape %13 [[0, 1]] : memref<4xi32, strided<[1], offset: ?>> into memref<1x4xi32, strided<[4, 1], offset: ?>> | |
cf.br ^bb5(%c0 : index) | |
// ^bb5: header of the loop over the 4 columns of this step; %15 in [0, 4).
^bb5(%15: index): // 2 preds: ^bb4, ^bb12 | |
%16 = arith.cmpi slt, %15, %c4 : index | |
cf.cond_br %16, ^bb6, ^bb13 | |
// ^bb6: %17 = the 128 inputs of one output element, %18 = rank-0 view of
// that output element, %19 = the inputs viewed as 32 groups of 4 lanes.
^bb6: // pred: ^bb5 | |
%17 = memref.subview %12[0, %15, 0] [1, 1, 128] [1, 1, 1] : memref<1x4x128xi32, strided<[512, 128, 1], offset: ?>> to memref<128xi32, strided<[1], offset: ?>> | |
%18 = memref.subview %14[0, %15] [1, 1] [1, 1] : memref<1x4xi32, strided<[4, 1], offset: ?>> to memref<i32, strided<[], offset: ?>> | |
%19 = memref.expand_shape %17 [[0, 1, 2, 3]] : memref<128xi32, strided<[1], offset: ?>> into memref<1x1x32x4xi32, strided<[128, 128, 4, 1], offset: ?>> | |
// Per-element accumulator buffer.
// NOTE(review): this alloca is in a block that is re-entered on every
// iteration of the ^bb5 loop — confirm it is hoisted or otherwise bounded
// by a later pass.
%20 = memref.alloca() {alignment = 128 : i64} : memref<1x1x4xi32> | |
cf.br ^bb7(%c0 : index) | |
// ^bb7/^bb8: lane-by-lane copy of %4 (zeros) into %20; %21 in [0, 4).
^bb7(%21: index): // 2 preds: ^bb6, ^bb8 | |
%22 = arith.cmpi slt, %21, %c4 : index | |
cf.cond_br %22, ^bb8, ^bb9 | |
^bb8: // pred: ^bb7 | |
%23 = memref.load %4[%c0, %c0, %21] : memref<1x1x4xi32> | |
memref.store %23, %20[%c0, %c0, %21] : memref<1x1x4xi32> | |
%24 = arith.addi %21, %c1 : index | |
cf.br ^bb7(%24 : index) | |
// ^bb9: load the seeded accumulator and enter the accumulation loop.
^bb9: // pred: ^bb7 | |
%25 = memref.collapse_shape %20 [[0, 1, 2]] : memref<1x1x4xi32> into memref<4xi32> | |
%26 = vector.load %25[%c0] : memref<4xi32>, vector<4xi32> | |
cf.br ^bb10(%c0, %26 : index, vector<4xi32>) | |
// ^bb10/^bb11: add the 32 groups of 4 lanes into the carried vector %28;
// %27 is the group index in [0, 32).
^bb10(%27: index, %28: vector<4xi32>): // 2 preds: ^bb9, ^bb11 | |
%29 = arith.cmpi slt, %27, %c32 : index | |
cf.cond_br %29, ^bb11, ^bb12 | |
^bb11: // pred: ^bb10 | |
%30 = vector.load %19[%c0, %c0, %27, %c0] : memref<1x1x32x4xi32, strided<[128, 128, 4, 1], offset: ?>>, vector<4xi32> | |
%31 = arith.addi %30, %28 : vector<4xi32> | |
%32 = arith.addi %27, %c1 : index | |
cf.br ^bb10(%32, %31 : index, vector<4xi32>) | |
// ^bb12: loop exit. The partial sums %28 are written to %25 and read back
// as %33, then reduced to a scalar and stored to the output element.
^bb12: // pred: ^bb10 | |
vector.store %28, %25[%c0] : memref<4xi32>, vector<4xi32> | |
%33 = vector.load %25[%c0] : memref<4xi32>, vector<4xi32> | |
// %34 is the value currently stored in the output element; it is passed
// (via %36) as the initial accumulator of the horizontal add below.
// NOTE(review): nothing in this function writes %18 before this load, so
// the result depends on what the caller left in the output buffer —
// confirm that buffer is initialized before the dispatch.
%34 = memref.load %18[] : memref<i32, strided<[], offset: ?>> | |
%35 = vector.broadcast %34 : i32 to vector<i32> | |
%36 = vector.extractelement %35[] : vector<i32> | |
// Horizontal add of the 4 partial sums plus %36.
%37 = vector.reduction <add>, %33, %36 : vector<4xi32> into i32 | |
// %38..%41 round-trip the scalar through 1-element and 0-d vectors; %41
// carries the same value as %37.
%38 = vector.insertelement %37, %cst_0[%c0 : index] : vector<1xi32> | |
%39 = vector.extract %38[0] : vector<1xi32> | |
%40 = vector.broadcast %39 : i32 to vector<i32> | |
%41 = vector.extractelement %40[] : vector<i32> | |
memref.store %41, %18[] : memref<i32, strided<[], offset: ?>> | |
%42 = arith.addi %15, %c1 : index | |
cf.br ^bb5(%42 : index) | |
// ^bb13: latch of the column loop (advance %9 by 4).
^bb13: // pred: ^bb5 | |
%43 = arith.addi %9, %c4 : index | |
cf.br ^bb3(%43 : index) | |
// ^bb14: latch of the row loop (advance %7 by 1).
^bb14: // pred: ^bb3 | |
%44 = arith.addi %7, %c1 : index | |
cf.br ^bb1(%44 : index) | |
// ^bb15: all rows done.
^bb15: // pred: ^bb1 | |
return | |
} | |
} | |
} | |
} | |
// Host-side test driver expressed as stream-dialect commands: fills an input
// resource with the i32 value 1, runs dispatch 0 over it, fills a second
// region with 128, and checks that the two 512x256xi32 results are equal.
func.func private @_split_reduction_pass2() { | |
// Byte sizes used below: 67108864 = 512*256*128 i32 elements * 4 bytes,
// 524288 = 512*256 i32 elements * 4 bytes, 1048576 = 2 * 524288.
%c1048576 = arith.constant 1048576 : index | |
%c0 = arith.constant 0 : index | |
%c524288 = arith.constant 524288 : index | |
%c67108864 = arith.constant 67108864 : index | |
%c1_i32 = arith.constant 1 : i32 | |
%c128_i32 = arith.constant 128 : i32 | |
// %c512, %c256, %c1 form the workload passed to the dispatch.
%c512 = arith.constant 512 : index | |
%c256 = arith.constant 256 : index | |
%c1 = arith.constant 1 : index | |
// Input: a transient resource whose full 67108864 bytes are filled with 1.
%0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c67108864} | |
%1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c67108864}) { | |
stream.cmd.fill %c1_i32, %arg0[%c0 for %c67108864] : i32 -> !stream.resource<transient>{%c67108864} | |
} => !stream.timepoint | |
%2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c67108864} | |
// %3 is %2 passed through util.do_not_optimize; its size is then re-queried
// as %4 instead of reusing %c67108864.
%3 = util.do_not_optimize(%2) : !stream.resource<transient> | |
%4 = stream.resource.size %3 : !stream.resource<transient> | |
// One external allocation that holds both tensors compared at the end.
%5 = stream.resource.alloc uninitialized : !stream.resource<external>{%c1048576} | |
%6 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%4}, %5 as %arg1: !stream.resource<external>{%c1048576}) { | |
// The dispatch and the fill below sit in the same stream.cmd.concurrent region.
stream.cmd.concurrent { | |
// Dispatch 0 reads all of %arg0 and is bound write-only to the full
// 1048576 bytes of %arg1.
// NOTE(review): that declared range also spans [524288, 1048576), which the
// sibling fill writes — confirm the overlap is intended.
stream.cmd.dispatch @_split_reduction_pass2_dispatch_0::@_split_reduction_pass2_dispatch_0_generic_512x256x128[%c512, %c256, %c1] { | |
ro %arg0[%c0 for %4] : !stream.resource<transient>{%4}, | |
wo %arg1[%c0 for %c1048576] : !stream.resource<external>{%c1048576} | |
} attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>]} | |
// Expected value: 128 written over the second half, [524288, 1048576).
stream.cmd.fill %c128_i32, %arg1[%c524288 for %c524288] : i32 -> !stream.resource<external>{%c1048576} | |
} | |
} => !stream.timepoint | |
%7 = stream.timepoint.await %6 => %5 : !stream.resource<external>{%c1048576} | |
// %8 is the first 524288 bytes (where the dispatch's 512x256 output binding
// starts, at offset 0); %9 is the second half (the 128 fill).
%8 = stream.resource.subview %7[%c0] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288} | |
%9 = stream.resource.subview %7[%c524288] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288} | |
%10 = stream.tensor.export %8 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32> | |
%11 = stream.tensor.export %9 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32> | |
// NOTE(review): nothing in this function initializes bytes [0, 524288) of %5
// (allocated `uninitialized`) before the dispatch runs, while the dispatch
// body loads its output element before storing to it — confirm the compared
// result cannot depend on uninitialized memory.
check.expect_eq(%10, %11) : tensor<512x256xi32> | |
return | |
} | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) ('func.func' operation: @_split_reduction_pass2_dispatch_0_generic_512x256x128) //----- // | |
// Attribute aliases referenced by the module that follows.
// Device target: "llvm-cpu" with one executable target (embedded-elf-x86_64) and the legacy_sync flag.
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}> | |
// Executable target used by the hal.executable.variant below; same configuration as the entry embedded in the device target.
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
// Multiplies its single symbol by 32; the dispatch function applies it to the workgroup ids to obtain tile offsets.
#map = affine_map<()[s0] -> (s0 * 32)> | |
// Pipeline layout: no push constants, one set (0) holding binding 0 (storage_buffer, ReadOnly) and binding 1 (storage_buffer).
#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]> | |
// Translation info attached to the executable export (CPUDoubleTilingExpert).
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert> | |
module attributes {hal.device.targets = [#device_target_llvm_cpu]} { | |
// Public entry point carrying the iree.abi.stub attribute. It takes no arguments,
// calls the private @_split_reduction_pass2 function, and returns no results.
func.func @split_reduction_pass2() attributes {iree.abi.stub} { | |
call @_split_reduction_pass2() : () -> () | |
return | |
} | |
// Executable holding the single dispatch used by @_split_reduction_pass2.
// It has one variant (embedded_elf_x86_64) with one export and one inner module
// containing the dispatch function, already lowered to unstructured control flow (cf dialect).
hal.executable private @_split_reduction_pass2_dispatch_0 { | |
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ { | |
// Export region: receives a device and three index values and returns the constants (8, 16, 1).
// NOTE(review): presumably these are the X/Y/Z workgroup counts (256/32 = 8, 512/32 = 16) - confirm against the HAL export semantics.
hal.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 ordinal(0) layout(#pipeline_layout) attributes {translation_info = #translation} { | |
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index): | |
%c1 = arith.constant 1 : index | |
%c16 = arith.constant 16 : index | |
%c8 = arith.constant 8 : index | |
hal.return %c8, %c16, %c1 : index, index, index | |
} | |
builtin.module { | |
// Dispatch function: for one 32x32 tile of the 512x256 output, adds the 128 innermost
// elements of the 512x256x128 input to the value already stored in each output element.
func.func @_split_reduction_pass2_dispatch_0_generic_512x256x128() { | |
%cst = arith.constant dense<0> : vector<4xi32> | |
%c0 = arith.constant 0 : index | |
%cst_0 = arith.constant dense<0> : vector<1xi32> | |
%c4 = arith.constant 4 : index | |
%c1 = arith.constant 1 : index | |
%c32 = arith.constant 32 : index | |
// %0: binding 0 viewed as the 512x256x128 input; %1: binding 1 viewed as the 512x256 output.
// Both are taken at offset 0 and asserted to be 64-byte aligned.
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : memref<512x256x128xi32> | |
memref.assume_alignment %0, 64 : memref<512x256x128xi32> | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : memref<512x256xi32> | |
memref.assume_alignment %1, 64 : memref<512x256xi32> | |
// Tile origin: %2 = 32 * workgroup_id_y indexes dimension 0, %3 = 32 * workgroup_id_x indexes dimension 1.
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%2 = affine.apply #map()[%workgroup_id_y] | |
%3 = affine.apply #map()[%workgroup_id_x] | |
// %4: 4-element stack buffer set to zero once; it is only read afterwards (source of the initial accumulator).
%4 = memref.alloca() {alignment = 128 : i64} : memref<1x1x4xi32> | |
vector.store %cst, %4[%c0, %c0, %c0] : memref<1x1x4xi32>, vector<4xi32> | |
// %5: 32x32 output tile; %6: matching 32x32x128 input tile.
%5 = memref.subview %1[%2, %3] [32, 32] [1, 1] : memref<512x256xi32> to memref<32x32xi32, strided<[256, 1], offset: ?>> | |
%6 = memref.subview %0[%2, %3, 0] [32, 32, 128] [1, 1, 1] : memref<512x256x128xi32> to memref<32x32x128xi32, strided<[32768, 128, 1], offset: ?>> | |
cf.br ^bb1(%c0 : index) | |
// Loop 1 header: %7 walks tile rows 0..31 in steps of 1 (incremented in ^bb13).
^bb1(%7: index): // 2 preds: ^bb0, ^bb13 | |
%8 = arith.cmpi slt, %7, %c32 : index | |
cf.cond_br %8, ^bb2(%c0 : index), ^bb14 | |
// Loop 2 header: %9 walks tile columns 0..31 in steps of 4 (incremented in ^bb12).
^bb2(%9: index): // 2 preds: ^bb1, ^bb12 | |
%10 = arith.cmpi slt, %9, %c32 : index | |
cf.cond_br %10, ^bb3, ^bb13 | |
// Loop 2 body: %11/%12 select the 1x4x128 input slice at (row %7, columns %9..%9+3);
// %13/%14 select the matching 1x4 slice of the output tile.
^bb3: // pred: ^bb2 | |
%11 = memref.subview %6[%7, %9, 0] [1, 4, 128] [1, 1, 1] : memref<32x32x128xi32, strided<[32768, 128, 1], offset: ?>> to memref<4x128xi32, strided<[128, 1], offset: ?>> | |
%12 = memref.expand_shape %11 [[0, 1], [2]] : memref<4x128xi32, strided<[128, 1], offset: ?>> into memref<1x4x128xi32, strided<[512, 128, 1], offset: ?>> | |
%13 = memref.subview %5[%7, %9] [1, 4] [1, 1] : memref<32x32xi32, strided<[256, 1], offset: ?>> to memref<4xi32, strided<[1], offset: ?>> | |
%14 = memref.expand_shape %13 [[0, 1]] : memref<4xi32, strided<[1], offset: ?>> into memref<1x4xi32, strided<[4, 1], offset: ?>> | |
cf.br ^bb4(%c0 : index) | |
// Loop 3 header: %15 walks the 4 columns of the current slice in steps of 1 (incremented in ^bb11).
^bb4(%15: index): // 2 preds: ^bb3, ^bb11 | |
%16 = arith.cmpi slt, %15, %c4 : index | |
cf.cond_br %16, ^bb5, ^bb12 | |
// Loop 3 body: %17 is the 128-element reduction row, %18 the scalar output element,
// %19 the same row viewed as 32 groups of 4 elements.
// NOTE(review): the memref.alloca for %20 sits inside this loop body, so it is executed once per
// iteration; this may grow the stack per iteration after lowering - confirm / consider hoisting upstream.
^bb5: // pred: ^bb4 | |
%17 = memref.subview %12[0, %15, 0] [1, 1, 128] [1, 1, 1] : memref<1x4x128xi32, strided<[512, 128, 1], offset: ?>> to memref<128xi32, strided<[1], offset: ?>> | |
%18 = memref.subview %14[0, %15] [1, 1] [1, 1] : memref<1x4xi32, strided<[4, 1], offset: ?>> to memref<i32, strided<[], offset: ?>> | |
%19 = memref.expand_shape %17 [[0, 1, 2, 3]] : memref<128xi32, strided<[1], offset: ?>> into memref<1x1x32x4xi32, strided<[128, 128, 4, 1], offset: ?>> | |
%20 = memref.alloca() {alignment = 128 : i64} : memref<1x1x4xi32> | |
cf.br ^bb6(%c0 : index) | |
// Loop 4 (^bb6/^bb7): copies the 4 zero elements of %4 into %20 one scalar at a time.
^bb6(%21: index): // 2 preds: ^bb5, ^bb7 | |
%22 = arith.cmpi slt, %21, %c4 : index | |
cf.cond_br %22, ^bb7, ^bb8 | |
^bb7: // pred: ^bb6 | |
%23 = memref.load %4[%c0, %c0, %21] : memref<1x1x4xi32> | |
memref.store %23, %20[%c0, %c0, %21] : memref<1x1x4xi32> | |
%24 = arith.addi %21, %c1 : index | |
cf.br ^bb6(%24 : index) | |
// %25 flattens %20 to 4 elements; %26 (the copied zeros) becomes the initial vector accumulator.
^bb8: // pred: ^bb6 | |
%25 = memref.collapse_shape %20 [[0, 1, 2]] : memref<1x1x4xi32> into memref<4xi32> | |
%26 = vector.load %25[%c0] : memref<4xi32>, vector<4xi32> | |
cf.br ^bb9(%c0, %26 : index, vector<4xi32>) | |
// Loop 5 header: %27 walks the 32 groups of %19; %28 carries the running vector<4xi32> sum.
^bb9(%27: index, %28: vector<4xi32>): // 2 preds: ^bb8, ^bb10 | |
%29 = arith.cmpi slt, %27, %c32 : index | |
cf.cond_br %29, ^bb10, ^bb11 | |
// Loop 5 body: load group %27 (4 elements) and add it lane-wise to the accumulator.
^bb10: // pred: ^bb9 | |
%30 = vector.load %19[%c0, %c0, %27, %c0] : memref<1x1x32x4xi32, strided<[128, 128, 4, 1], offset: ?>>, vector<4xi32> | |
%31 = arith.addi %30, %28 : vector<4xi32> | |
%32 = arith.addi %27, %c1 : index | |
cf.br ^bb9(%32, %31 : index, vector<4xi32>) | |
// Loop 5 exit: the partial sums are stored to %25 and immediately reloaded (%33 holds the same values as %28).
// %34 is the value currently in the output element; %35/%36 round-trip it through a 0-d vector.
// %37 = horizontal add of the 4 lanes plus %36. %38..%41 round-trip %37 through 1-element and 0-d
// vectors, so the value stored back to the output element is %37.
// NOTE(review): %34 reads binding 1 before this function has written it, while the caller
// @_split_reduction_pass2 in this module allocates that resource `uninitialized` and binds it `wo`;
// looks like the result depends on the prior buffer contents - confirm.
^bb11: // pred: ^bb9 | |
vector.store %28, %25[%c0] : memref<4xi32>, vector<4xi32> | |
%33 = vector.load %25[%c0] : memref<4xi32>, vector<4xi32> | |
%34 = memref.load %18[] : memref<i32, strided<[], offset: ?>> | |
%35 = vector.broadcast %34 : i32 to vector<i32> | |
%36 = vector.extractelement %35[] : vector<i32> | |
%37 = vector.reduction <add>, %33, %36 : vector<4xi32> into i32 | |
%38 = vector.insertelement %37, %cst_0[%c0 : index] : vector<1xi32> | |
%39 = vector.extract %38[0] : vector<1xi32> | |
%40 = vector.broadcast %39 : i32 to vector<i32> | |
%41 = vector.extractelement %40[] : vector<i32> | |
memref.store %41, %18[] : memref<i32, strided<[], offset: ?>> | |
%42 = arith.addi %15, %c1 : index | |
cf.br ^bb4(%42 : index) | |
// Loop 2 latch: advance the column index by 4.
^bb12: // pred: ^bb4 | |
%43 = arith.addi %9, %c4 : index | |
cf.br ^bb2(%43 : index) | |
// Loop 1 latch: advance the row index by 1.
^bb13: // pred: ^bb2 | |
%44 = arith.addi %7, %c1 : index | |
cf.br ^bb1(%44 : index) | |
// All 32 rows of the tile processed.
^bb14: // pred: ^bb1 | |
return | |
} | |
} | |
} | |
} | |
// Test driver: fills an input buffer with ones, runs the reduction dispatch, and compares the
// dispatch output with a buffer region filled with the constant 128 via check.expect_eq.
func.func private @_split_reduction_pass2() { | |
// Byte sizes: 67108864 = 512*256*128*4 (input), 524288 = 512*256*4 (one output tensor),
// 1048576 = 2 * 524288 (combined external resource).
%c1048576 = arith.constant 1048576 : index | |
%c0 = arith.constant 0 : index | |
%c524288 = arith.constant 524288 : index | |
%c67108864 = arith.constant 67108864 : index | |
%c1_i32 = arith.constant 1 : i32 | |
%c128_i32 = arith.constant 128 : i32 | |
%c512 = arith.constant 512 : index | |
%c256 = arith.constant 256 : index | |
%c1 = arith.constant 1 : index | |
// Input: transient resource, entirely filled with the i32 value 1.
%0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c67108864} | |
%1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c67108864}) { | |
stream.cmd.fill %c1_i32, %arg0[%c0 for %c67108864] : i32 -> !stream.resource<transient>{%c67108864} | |
} => !stream.timepoint | |
// Wait for the fill, then pass the resource through util.do_not_optimize and re-query its size (%4).
%2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c67108864} | |
%3 = util.do_not_optimize(%2) : !stream.resource<transient> | |
%4 = stream.resource.size %3 : !stream.resource<transient> | |
// Output: one external resource of 1048576 bytes, later split into two 524288-byte halves.
// NOTE(review): the resource is allocated `uninitialized` and nothing here writes bytes [0, 524288)
// before the dispatch, yet the dispatch function in this module loads from its output binding
// before storing to it - looks like the accumulator init is missing; confirm.
%5 = stream.resource.alloc uninitialized : !stream.resource<external>{%c1048576} | |
%6 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%4}, %5 as %arg1: !stream.resource<external>{%c1048576}) { | |
// Both commands are grouped in one stream.cmd.concurrent region.
stream.cmd.concurrent { | |
// Dispatch with workload [512, 256, 1]: input bound read-only, the whole output resource bound write-only.
stream.cmd.dispatch @_split_reduction_pass2_dispatch_0::@_split_reduction_pass2_dispatch_0_generic_512x256x128[%c512, %c256, %c1] { | |
ro %arg0[%c0 for %4] : !stream.resource<transient>{%4}, | |
wo %arg1[%c0 for %c1048576] : !stream.resource<external>{%c1048576} | |
} attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>]} | |
// Second half of the output resource (offset 524288, length 524288) is filled with the i32 value 128.
stream.cmd.fill %c128_i32, %arg1[%c524288 for %c524288] : i32 -> !stream.resource<external>{%c1048576} | |
} | |
} => !stream.timepoint | |
%7 = stream.timepoint.await %6 => %5 : !stream.resource<external>{%c1048576} | |
// %8: first half of the resource (offset 0); %9: second half (offset 524288, the region filled with 128).
// NOTE(review): presumably the dispatch result lives in the first half, since the kernel views
// binding 1 as memref<512x256xi32> at offset 0 - confirm.
%8 = stream.resource.subview %7[%c0] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288} | |
%9 = stream.resource.subview %7[%c524288] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288} | |
// Export both halves as tensor<512x256xi32> and require them to be equal.
%10 = stream.tensor.export %8 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32> | |
%11 = stream.tensor.export %9 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32> | |
check.expect_eq(%10, %11) : tensor<512x256xi32> | |
return | |
} | |
} | |
// -----// IR Dump After ConvertToLLVM (iree-convert-to-llvm) ('builtin.module' operation) //----- // | |
// Attribute aliases referenced by the module that follows.
// Device target: "llvm-cpu" with one executable target (embedded-elf-x86_64) and the legacy_sync flag.
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}> | |
// Executable target used by the hal.executable.variant below; same configuration as the entry embedded in the device target.
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
// Pipeline layout: no push constants, one set (0) holding binding 0 (storage_buffer, ReadOnly) and binding 1 (storage_buffer).
#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]> | |
// Translation info attached to the executable export (CPUDoubleTilingExpert).
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert> | |
module attributes {hal.device.targets = [#device_target_llvm_cpu]} { | |
// Public entry point carrying the iree.abi.stub attribute. It takes no arguments,
// calls the private @_split_reduction_pass2 function, and returns no results.
func.func @split_reduction_pass2() attributes {iree.abi.stub} { | |
call @_split_reduction_pass2() : () -> () | |
return | |
} | |
hal.executable private @_split_reduction_pass2_dispatch_0 { | |
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ { | |
// Export entry (ordinal 0) for the dispatch function, using #pipeline_layout and #translation.
// Its region receives a device and three index values and returns the constants (8, 16, 1).
// NOTE(review): presumably these are the X/Y/Z workgroup counts - confirm against the HAL export semantics.
hal.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 ordinal(0) layout(#pipeline_layout) attributes {translation_info = #translation} { | |
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index): | |
%c1 = arith.constant 1 : index | |
%c16 = arith.constant 16 : index | |
%c8 = arith.constant 8 : index | |
hal.return %c8, %c16, %c1 : index, index, index | |
} | |
builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} { | |
llvm.func internal @_split_reduction_pass2_dispatch_0_generic_512x256x128(%arg0: !llvm.ptr<struct<"iree_hal_executable_environment_v0_t", (ptr<i32>, ptr<func<i32 (ptr<func<i32 (ptr<i8>)>>, ptr<i8>)>>, ptr<ptr<func<i32 (ptr<i8>)>>>, struct<"iree_hal_processor_v0_t", (array<8 x i64>)>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>> {llvm.align = 16 : i64, llvm.noalias}) -> i32 { | |
%0 = llvm.mlir.constant(dense<0> : vector<4xi32>) : vector<4xi32> | |
%1 = llvm.mlir.constant(0 : index) : i64 | |
%2 = llvm.mlir.constant(dense<0> : vector<1xi32>) : vector<1xi32> | |
%3 = llvm.mlir.constant(4 : index) : i64 | |
%4 = llvm.mlir.constant(1 : index) : i64 | |
%5 = llvm.mlir.constant(32 : index) : i64 | |
%6 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> | |
%7 = llvm.extractvalue %6[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)> | |
%8 = llvm.mlir.constant(0 : i64) : i64 | |
%9 = llvm.load %7 : !llvm.ptr<ptr<i8>> | |
%10 = llvm.bitcast %9 : !llvm.ptr<i8> to !llvm.ptr<i32> | |
%11 = llvm.mlir.undef : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)> | |
%12 = llvm.insertvalue %10, %11[0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)> | |
%13 = llvm.insertvalue %10, %12[1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)> | |
%14 = llvm.mlir.constant(0 : index) : i64 | |
%15 = llvm.insertvalue %14, %13[2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)> | |
%16 = llvm.mlir.constant(512 : index) : i64 | |
%17 = llvm.insertvalue %16, %15[3, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)> | |
%18 = llvm.mlir.constant(32768 : index) : i64 | |
%19 = llvm.insertvalue %18, %17[4, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)> | |
%20 = llvm.mlir.constant(256 : index) : i64 | |
%21 = llvm.insertvalue %20, %19[3, 1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)> | |
%22 = llvm.mlir.constant(128 : index) : i64 | |
%23 = llvm.insertvalue %22, %21[4, 1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)> | |
%24 = llvm.mlir.constant(128 : index) : i64 | |
%25 = llvm.insertvalue %24, %23[3, 2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)> | |
%26 = llvm.mlir.constant(1 : index) : i64 | |
%27 = llvm.insertvalue %26, %25[4, 2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)> | |
%28 = llvm.extractvalue %27[1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)> | |
%29 = llvm.mlir.constant(0 : index) : i64 | |
%30 = llvm.mlir.constant(63 : index) : i64 | |
%31 = llvm.ptrtoint %28 : !llvm.ptr<i32> to i64 | |
%32 = llvm.and %31, %30 : i64 | |
%33 = llvm.icmp "eq" %32, %29 : i64 | |
"llvm.intr.assume"(%33) : (i1) -> () | |
%34 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> | |
%35 = llvm.extractvalue %34[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)> | |
%36 = llvm.mlir.constant(1 : i64) : i64 | |
%37 = llvm.getelementptr %35[1] : (!llvm.ptr<ptr<i8>>) -> !llvm.ptr<ptr<i8>> | |
%38 = llvm.load %37 : !llvm.ptr<ptr<i8>> | |
%39 = llvm.bitcast %38 : !llvm.ptr<i8> to !llvm.ptr<i32> | |
%40 = llvm.mlir.undef : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)> | |
%41 = llvm.insertvalue %39, %40[0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)> | |
%42 = llvm.insertvalue %39, %41[1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)> | |
%43 = llvm.mlir.constant(0 : index) : i64 | |
%44 = llvm.insertvalue %43, %42[2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)> | |
%45 = llvm.mlir.constant(512 : index) : i64 | |
%46 = llvm.insertvalue %45, %44[3, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)> | |
%47 = llvm.mlir.constant(256 : index) : i64 | |
%48 = llvm.insertvalue %47, %46[4, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)> | |
%49 = llvm.mlir.constant(256 : index) : i64 | |
%50 = llvm.insertvalue %49, %48[3, 1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)> | |
%51 = llvm.mlir.constant(1 : index) : i64 | |
%52 = llvm.insertvalue %51, %50[4, 1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)> | |
%53 = llvm.extractvalue %52[1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)> | |
%54 = llvm.mlir.constant(0 : index) : i64 | |
%55 = llvm.mlir.constant(63 : index) : i64 | |
%56 = llvm.ptrtoint %53 : !llvm.ptr<i32> to i64 | |
%57 = llvm.and %56, %55 : i64 | |
%58 = llvm.icmp "eq" %57, %54 : i64 | |
"llvm.intr.assume"(%58) : (i1) -> () | |
%59 = llvm.load %arg2 : !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>> | |
%60 = llvm.extractvalue %59[0] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)> | |
%61 = llvm.zext %60 : i32 to i64 | |
%62 = llvm.load %arg2 : !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>> | |
%63 = llvm.extractvalue %62[1] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)> | |
%64 = llvm.zext %63 : i32 to i64 | |
%65 = llvm.mlir.constant(32 : index) : i64 | |
%66 = llvm.mul %64, %65 : i64 | |
%67 = llvm.mlir.constant(32 : index) : i64 | |
%68 = llvm.mul %61, %67 : i64 | |
%69 = llvm.mlir.constant(1 : index) : i64 | |
%70 = llvm.mlir.constant(1 : index) : i64 | |
%71 = llvm.mlir.constant(4 : index) : i64 | |
%72 = llvm.mlir.constant(1 : index) : i64 | |
%73 = llvm.mlir.constant(4 : index) : i64 | |
%74 = llvm.mlir.constant(4 : index) : i64 | |
%75 = llvm.mlir.null : !llvm.ptr<i32> | |
%76 = llvm.getelementptr %75[%74] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32> | |
%77 = llvm.ptrtoint %76 : !llvm.ptr<i32> to i64 | |
%78 = llvm.alloca %77 x i32 {alignment = 128 : i64} : (i64) -> !llvm.ptr<i32> | |
%79 = llvm.mlir.undef : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)> | |
%80 = llvm.insertvalue %78, %79[0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)> | |
%81 = llvm.insertvalue %78, %80[1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)> | |
%82 = llvm.mlir.constant(0 : index) : i64 | |
%83 = llvm.insertvalue %82, %81[2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)> | |
%84 = llvm.insertvalue %69, %83[3, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)> | |
%85 = llvm.insertvalue %70, %84[3, 1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)> | |
%86 = llvm.insertvalue %71, %85[3, 2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)> | |
%87 = llvm.insertvalue %73, %86[4, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)> | |
%88 = llvm.insertvalue %71, %87[4, 1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)> | |
%89 = llvm.insertvalue %72, %88[4, 2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)> | |
%90 = llvm.extractvalue %89[1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)> | |
%91 = llvm.mlir.constant(4 : index) : i64 | |
%92 = llvm.mul %1, %91 : i64 | |
%93 = llvm.mlir.constant(4 : index) : i64 | |
%94 = llvm.mul %1, %93 : i64 | |
%95 = llvm.add %92, %94 : i64 | |
%96 = llvm.add %95, %1 : i64 | |
%97 = llvm.getelementptr %90[%96] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32> | |
%98 = llvm.bitcast %97 : !llvm.ptr<i32> to !llvm.ptr<vector<4xi32>> | |
llvm.store %0, %98 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>> | |
%99 = llvm.mlir.undef : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)> | |
%100 = llvm.extractvalue %52[0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)> | |
%101 = llvm.bitcast %100 : !llvm.ptr<i32> to !llvm.ptr<i32> | |
%102 = llvm.insertvalue %101, %99[0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)> | |
%103 = llvm.extractvalue %52[1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)> | |
%104 = llvm.bitcast %103 : !llvm.ptr<i32> to !llvm.ptr<i32> | |
%105 = llvm.insertvalue %104, %102[1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)> | |
%106 = llvm.extractvalue %52[4, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)> | |
%107 = llvm.extractvalue %52[4, 1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)> | |
%108 = llvm.extractvalue %52[2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)> | |
%109 = llvm.mul %66, %106 : i64 | |
%110 = llvm.add %108, %109 : i64 | |
%111 = llvm.mul %68, %107 : i64 | |
%112 = llvm.add %110, %111 : i64 | |
%113 = llvm.insertvalue %112, %105[2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)> | |
%114 = llvm.mlir.constant(32 : i64) : i64 | |
%115 = llvm.mlir.constant(1 : i64) : i64 | |
%116 = llvm.insertvalue %114, %113[3, 1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)> | |
%117 = llvm.insertvalue %115, %116[4, 1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)> | |
%118 = llvm.mlir.constant(32 : i64) : i64 | |
%119 = llvm.mlir.constant(256 : i64) : i64 | |
%120 = llvm.insertvalue %118, %117[3, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)> | |
%121 = llvm.insertvalue %119, %120[4, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)> | |
%122 = llvm.mlir.undef : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)> | |
%123 = llvm.extractvalue %27[0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)> | |
%124 = llvm.bitcast %123 : !llvm.ptr<i32> to !llvm.ptr<i32> | |
%125 = llvm.insertvalue %124, %122[0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)> | |
%126 = llvm.extractvalue %27[1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)> | |
%127 = llvm.bitcast %126 : !llvm.ptr<i32> to !llvm.ptr<i32> | |
%128 = llvm.insertvalue %127, %125[1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)> | |
%129 = llvm.extractvalue %27[4, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)> | |
%130 = llvm.extractvalue %27[4, 1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)> | |
%131 = llvm.extractvalue %27[4, 2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)> | |
%132 = llvm.extractvalue %27[2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)> | |
%133 = llvm.mul %66, %129 : i64 | |
%134 = llvm.add %132, %133 : i64 | |
%135 = llvm.mul %68, %130 : i64 | |
%136 = llvm.add %134, %135 : i64 | |
%137 = llvm.mlir.constant(0 : i64) : i64 | |
%138 = llvm.mul %137, %131 : i64 | |
%139 = llvm.add %136, %138 : i64 | |
%140 = llvm.insertvalue %139, %128[2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)> | |
%141 = llvm.mlir.constant(128 : i64) : i64 | |
%142 = llvm.mlir.constant(1 : i64) : i64 | |
%143 = llvm.insertvalue %141, %140[3, 2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)> | |
%144 = llvm.insertvalue %142, %143[4, 2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)> | |
%145 = llvm.mlir.constant(32 : i64) : i64 | |
%146 = llvm.mlir.constant(128 : i64) : i64 | |
%147 = llvm.insertvalue %145, %144[3, 1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)> | |
%148 = llvm.insertvalue %146, %147[4, 1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)> | |
%149 = llvm.mlir.constant(32 : i64) : i64 | |
%150 = llvm.mlir.constant(32768 : i64) : i64 | |
%151 = llvm.insertvalue %149, %148[3, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)> | |
%152 = llvm.insertvalue %150, %151[4, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)> | |
llvm.br ^bb1(%1 : i64) | |
^bb1(%153: i64): // 2 preds: ^bb0, ^bb13 | |
%154 = llvm.icmp "slt" %153, %5 : i64 | |
llvm.cond_br %154, ^bb2(%1 : i64), ^bb14 | |
^bb2(%155: i64): // 2 preds: ^bb1, ^bb12 | |
%156 = llvm.icmp "slt" %155, %5 : i64 | |
llvm.cond_br %156, ^bb3, ^bb13 | |
^bb3: // pred: ^bb2 | |
%157 = llvm.mlir.undef : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)> | |
%158 = llvm.extractvalue %152[0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)> | |
%159 = llvm.bitcast %158 : !llvm.ptr<i32> to !llvm.ptr<i32> | |
%160 = llvm.insertvalue %159, %157[0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)> | |
%161 = llvm.extractvalue %152[1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)> | |
%162 = llvm.bitcast %161 : !llvm.ptr<i32> to !llvm.ptr<i32> | |
%163 = llvm.insertvalue %162, %160[1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)> | |
%164 = llvm.extractvalue %152[4, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)> | |
%165 = llvm.extractvalue %152[4, 1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)> | |
%166 = llvm.extractvalue %152[4, 2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)> | |
%167 = llvm.extractvalue %152[2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)> | |
%168 = llvm.mul %153, %164 : i64 | |
%169 = llvm.add %167, %168 : i64 | |
%170 = llvm.mul %155, %165 : i64 | |
%171 = llvm.add %169, %170 : i64 | |
%172 = llvm.mlir.constant(0 : i64) : i64 | |
%173 = llvm.mul %172, %166 : i64 | |
%174 = llvm.add %171, %173 : i64 | |
%175 = llvm.insertvalue %174, %163[2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)> | |
%176 = llvm.mlir.constant(128 : i64) : i64 | |
%177 = llvm.mlir.constant(1 : i64) : i64 | |
%178 = llvm.insertvalue %176, %175[3, 1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)> | |
%179 = llvm.insertvalue %177, %178[4, 1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)> | |
%180 = llvm.mlir.constant(4 : i64) : i64 | |
%181 = llvm.mlir.constant(128 : i64) : i64 | |
%182 = llvm.insertvalue %180, %179[3, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)> | |
%183 = llvm.insertvalue %181, %182[4, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)> | |
%184 = llvm.mlir.undef : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)> | |
%185 = llvm.extractvalue %183[0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)> | |
%186 = llvm.insertvalue %185, %184[0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)> | |
%187 = llvm.extractvalue %183[1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)> | |
%188 = llvm.insertvalue %187, %186[1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)> | |
%189 = llvm.extractvalue %183[2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)> | |
%190 = llvm.insertvalue %189, %188[2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)> | |
%191 = llvm.mlir.constant(1 : index) : i64 | |
%192 = llvm.mlir.constant(4 : index) : i64 | |
%193 = llvm.mlir.constant(128 : index) : i64 | |
%194 = llvm.insertvalue %191, %190[3, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)> | |
%195 = llvm.insertvalue %192, %194[3, 1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)> | |
%196 = llvm.insertvalue %193, %195[3, 2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)> | |
%197 = llvm.mlir.constant(512 : index) : i64 | |
%198 = llvm.insertvalue %197, %196[4, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)> | |
%199 = llvm.mlir.constant(128 : index) : i64 | |
%200 = llvm.insertvalue %199, %198[4, 1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)> | |
%201 = llvm.mlir.constant(1 : index) : i64 | |
%202 = llvm.insertvalue %201, %200[4, 2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)> | |
%203 = llvm.mlir.undef : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<1 x i64>, array<1 x i64>)> | |
%204 = llvm.extractvalue %121[0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)> | |
%205 = llvm.bitcast %204 : !llvm.ptr<i32> to !llvm.ptr<i32> | |
%206 = llvm.insertvalue %205, %203[0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<1 x i64>, array<1 x i64>)> | |
%207 = llvm.extractvalue %121[1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)> | |
%208 = llvm.bitcast %207 : !llvm.ptr<i32> to !llvm.ptr<i32> | |
%209 = llvm.insertvalue %208, %206[1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<1 x i64>, array<1 x i64>)> | |
%210 = llvm.extractvalue %121[4, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)> | |
%211 = llvm.extractvalue %121[4, 1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)> | |
%212 = llvm.extractvalue %121[2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)> | |
%213 = llvm.mul %153, %210 : i64 | |
%214 = llvm.add %212, %213 : i64 | |
%215 = llvm.mul %155, %211 : i64 | |
%216 = llvm.add %214, %215 : i64 | |
%217 = llvm.insertvalue %216, %209[2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<1 x i64>, array<1 x i64>)> | |
%218 = llvm.mlir.constant(4 : i64) : i64 | |
%219 = llvm.mlir.constant(1 : i64) : i64 | |
%220 = llvm.insertvalue %218, %217[3, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<1 x i64>, array<1 x i64>)> | |
%221 = llvm.insertvalue %219, %220[4, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<1 x i64>, array<1 x i64>)> | |
%222 = llvm.mlir.undef : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)> | |
%223 = llvm.extractvalue %221[0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<1 x i64>, array<1 x i64>)> | |
%224 = llvm.insertvalue %223, %222[0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)> | |
%225 = llvm.extractvalue %221[1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<1 x i64>, array<1 x i64>)> | |
%226 = llvm.insertvalue %225, %224[1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)> | |
%227 = llvm.extractvalue %221[2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<1 x i64>, array<1 x i64>)> | |
%228 = llvm.insertvalue %227, %226[2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)> | |
%229 = llvm.mlir.constant(1 : index) : i64 | |
%230 = llvm.mlir.constant(4 : index) : i64 | |
%231 = llvm.insertvalue %229, %228[3, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)> | |
%232 = llvm.insertvalue %230, %231[3, 1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)> | |
%233 = llvm.mlir.constant(4 : index) : i64 | |
%234 = llvm.insertvalue %233, %232[4, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)> | |
%235 = llvm.mlir.constant(1 : index) : i64 | |
%236 = llvm.insertvalue %235, %234[4, 1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)> | |
llvm.br ^bb4(%1 : i64) | |
^bb4(%237: i64): // 2 preds: ^bb3, ^bb11 | |
%238 = llvm.icmp "slt" %237, %3 : i64 | |
llvm.cond_br %238, ^bb5, ^bb12 | |
^bb5: // pred: ^bb4 | |
%239 = llvm.mlir.undef : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<1 x i64>, array<1 x i64>)> | |
%240 = llvm.extractvalue %202[0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)> | |
%241 = llvm.bitcast %240 : !llvm.ptr<i32> to !llvm.ptr<i32> | |
%242 = llvm.insertvalue %241, %239[0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<1 x i64>, array<1 x i64>)> | |
%243 = llvm.extractvalue %202[1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)> | |
%244 = llvm.bitcast %243 : !llvm.ptr<i32> to !llvm.ptr<i32> | |
%245 = llvm.insertvalue %244, %242[1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<1 x i64>, array<1 x i64>)> | |
%246 = llvm.extractvalue %202[4, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)> | |
%247 = llvm.extractvalue %202[4, 1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)> | |
%248 = llvm.extractvalue %202[4, 2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)> | |
%249 = llvm.extractvalue %202[2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)> | |
%250 = llvm.mlir.constant(0 : i64) : i64 | |
%251 = llvm.mul %250, %246 : i64 | |
%252 = llvm.add %249, %251 : i64 | |
%253 = llvm.mul %237, %247 : i64 | |
%254 = llvm.add %252, %253 : i64 | |
%255 = llvm.mlir.constant(0 : i64) : i64 | |
%256 = llvm.mul %255, %248 : i64 | |
%257 = llvm.add %254, %256 : i64 | |
%258 = llvm.insertvalue %257, %245[2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<1 x i64>, array<1 x i64>)> | |
%259 = llvm.mlir.constant(128 : i64) : i64 | |
%260 = llvm.mlir.constant(1 : i64) : i64 | |
%261 = llvm.insertvalue %259, %258[3, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<1 x i64>, array<1 x i64>)> | |
%262 = llvm.insertvalue %260, %261[4, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<1 x i64>, array<1 x i64>)> | |
%263 = llvm.mlir.undef : !llvm.struct<(ptr<i32>, ptr<i32>, i64)> | |
%264 = llvm.extractvalue %236[0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)> | |
%265 = llvm.bitcast %264 : !llvm.ptr<i32> to !llvm.ptr<i32> | |
%266 = llvm.insertvalue %265, %263[0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64)> | |
%267 = llvm.extractvalue %236[1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)> | |
%268 = llvm.bitcast %267 : !llvm.ptr<i32> to !llvm.ptr<i32> | |
%269 = llvm.insertvalue %268, %266[1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64)> | |
%270 = llvm.extractvalue %236[4, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)> | |
%271 = llvm.extractvalue %236[4, 1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)> | |
%272 = llvm.extractvalue %236[2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)> | |
%273 = llvm.mlir.constant(0 : i64) : i64 | |
%274 = llvm.mul %273, %270 : i64 | |
%275 = llvm.add %272, %274 : i64 | |
%276 = llvm.mul %237, %271 : i64 | |
%277 = llvm.add %275, %276 : i64 | |
%278 = llvm.insertvalue %277, %269[2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64)> | |
%279 = llvm.mlir.undef : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<4 x i64>, array<4 x i64>)> | |
%280 = llvm.extractvalue %262[0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<1 x i64>, array<1 x i64>)> | |
%281 = llvm.insertvalue %280, %279[0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<4 x i64>, array<4 x i64>)> | |
%282 = llvm.extractvalue %262[1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<1 x i64>, array<1 x i64>)> | |
%283 = llvm.insertvalue %282, %281[1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<4 x i64>, array<4 x i64>)> | |
%284 = llvm.extractvalue %262[2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<1 x i64>, array<1 x i64>)> | |
%285 = llvm.insertvalue %284, %283[2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<4 x i64>, array<4 x i64>)> | |
%286 = llvm.mlir.constant(1 : index) : i64 | |
%287 = llvm.mlir.constant(1 : index) : i64 | |
%288 = llvm.mlir.constant(32 : index) : i64 | |
%289 = llvm.mlir.constant(4 : index) : i64 | |
%290 = llvm.insertvalue %286, %285[3, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<4 x i64>, array<4 x i64>)> | |
%291 = llvm.insertvalue %287, %290[3, 1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<4 x i64>, array<4 x i64>)> | |
%292 = llvm.insertvalue %288, %291[3, 2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<4 x i64>, array<4 x i64>)> | |
%293 = llvm.insertvalue %289, %292[3, 3] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<4 x i64>, array<4 x i64>)> | |
%294 = llvm.mlir.constant(128 : index) : i64 | |
%295 = llvm.insertvalue %294, %293[4, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<4 x i64>, array<4 x i64>)> | |
%296 = llvm.mlir.constant(128 : index) : i64 | |
%297 = llvm.insertvalue %296, %295[4, 1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<4 x i64>, array<4 x i64>)> | |
%298 = llvm.mlir.constant(4 : index) : i64 | |
%299 = llvm.insertvalue %298, %297[4, 2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<4 x i64>, array<4 x i64>)> | |
%300 = llvm.mlir.constant(1 : index) : i64 | |
%301 = llvm.insertvalue %300, %299[4, 3] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<4 x i64>, array<4 x i64>)> | |
%302 = llvm.mlir.constant(1 : index) : i64 | |
%303 = llvm.mlir.constant(1 : index) : i64 | |
%304 = llvm.mlir.constant(4 : index) : i64 | |
%305 = llvm.mlir.constant(1 : index) : i64 | |
%306 = llvm.mlir.constant(4 : index) : i64 | |
%307 = llvm.mlir.constant(4 : index) : i64 | |
%308 = llvm.mlir.null : !llvm.ptr<i32> | |
%309 = llvm.getelementptr %308[%307] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32> | |
%310 = llvm.ptrtoint %309 : !llvm.ptr<i32> to i64 | |
%311 = llvm.alloca %310 x i32 {alignment = 128 : i64} : (i64) -> !llvm.ptr<i32> | |
%312 = llvm.mlir.undef : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)> | |
%313 = llvm.insertvalue %311, %312[0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)> | |
%314 = llvm.insertvalue %311, %313[1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)> | |
%315 = llvm.mlir.constant(0 : index) : i64 | |
%316 = llvm.insertvalue %315, %314[2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)> | |
%317 = llvm.insertvalue %302, %316[3, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)> | |
%318 = llvm.insertvalue %303, %317[3, 1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)> | |
%319 = llvm.insertvalue %304, %318[3, 2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)> | |
%320 = llvm.insertvalue %306, %319[4, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)> | |
%321 = llvm.insertvalue %304, %320[4, 1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)> | |
%322 = llvm.insertvalue %305, %321[4, 2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)> | |
llvm.br ^bb6(%1 : i64) | |
^bb6(%323: i64): // 2 preds: ^bb5, ^bb7 | |
%324 = llvm.icmp "slt" %323, %3 : i64 | |
llvm.cond_br %324, ^bb7, ^bb8 | |
^bb7: // pred: ^bb6 | |
%325 = llvm.extractvalue %89[1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)> | |
%326 = llvm.mlir.constant(4 : index) : i64 | |
%327 = llvm.mul %1, %326 : i64 | |
%328 = llvm.mlir.constant(4 : index) : i64 | |
%329 = llvm.mul %1, %328 : i64 | |
%330 = llvm.add %327, %329 : i64 | |
%331 = llvm.add %330, %323 : i64 | |
%332 = llvm.getelementptr %325[%331] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32> | |
%333 = llvm.load %332 : !llvm.ptr<i32> | |
%334 = llvm.extractvalue %322[1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)> | |
%335 = llvm.mlir.constant(4 : index) : i64 | |
%336 = llvm.mul %1, %335 : i64 | |
%337 = llvm.mlir.constant(4 : index) : i64 | |
%338 = llvm.mul %1, %337 : i64 | |
%339 = llvm.add %336, %338 : i64 | |
%340 = llvm.add %339, %323 : i64 | |
%341 = llvm.getelementptr %334[%340] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32> | |
llvm.store %333, %341 : !llvm.ptr<i32> | |
%342 = llvm.add %323, %4 : i64 | |
llvm.br ^bb6(%342 : i64) | |
^bb8: // pred: ^bb6 | |
%343 = llvm.mlir.undef : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<1 x i64>, array<1 x i64>)> | |
%344 = llvm.extractvalue %322[0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)> | |
%345 = llvm.insertvalue %344, %343[0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<1 x i64>, array<1 x i64>)> | |
%346 = llvm.extractvalue %322[1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)> | |
%347 = llvm.insertvalue %346, %345[1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<1 x i64>, array<1 x i64>)> | |
%348 = llvm.extractvalue %322[2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)> | |
%349 = llvm.insertvalue %348, %347[2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<1 x i64>, array<1 x i64>)> | |
%350 = llvm.mlir.constant(4 : index) : i64 | |
%351 = llvm.insertvalue %350, %349[3, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<1 x i64>, array<1 x i64>)> | |
%352 = llvm.mlir.constant(1 : index) : i64 | |
%353 = llvm.insertvalue %352, %351[4, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<1 x i64>, array<1 x i64>)> | |
%354 = llvm.extractvalue %353[1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<1 x i64>, array<1 x i64>)> | |
%355 = llvm.getelementptr %354[%1] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32> | |
%356 = llvm.bitcast %355 : !llvm.ptr<i32> to !llvm.ptr<vector<4xi32>> | |
%357 = llvm.load %356 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>> | |
llvm.br ^bb9(%1, %357 : i64, vector<4xi32>) | |
^bb9(%358: i64, %359: vector<4xi32>): // 2 preds: ^bb8, ^bb10 | |
%360 = llvm.icmp "slt" %358, %5 : i64 | |
llvm.cond_br %360, ^bb10, ^bb11 | |
^bb10: // pred: ^bb9 | |
%361 = llvm.extractvalue %301[1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<4 x i64>, array<4 x i64>)> | |
%362 = llvm.extractvalue %301[2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<4 x i64>, array<4 x i64>)> | |
%363 = llvm.mlir.constant(128 : index) : i64 | |
%364 = llvm.mul %1, %363 : i64 | |
%365 = llvm.add %362, %364 : i64 | |
%366 = llvm.mlir.constant(128 : index) : i64 | |
%367 = llvm.mul %1, %366 : i64 | |
%368 = llvm.add %365, %367 : i64 | |
%369 = llvm.mlir.constant(4 : index) : i64 | |
%370 = llvm.mul %358, %369 : i64 | |
%371 = llvm.add %368, %370 : i64 | |
%372 = llvm.add %371, %1 : i64 | |
%373 = llvm.getelementptr %361[%372] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32> | |
%374 = llvm.bitcast %373 : !llvm.ptr<i32> to !llvm.ptr<vector<4xi32>> | |
%375 = llvm.load %374 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>> | |
%376 = llvm.add %375, %359 : vector<4xi32> | |
%377 = llvm.add %358, %4 : i64 | |
llvm.br ^bb9(%377, %376 : i64, vector<4xi32>) | |
^bb11: // pred: ^bb9 | |
%378 = llvm.extractvalue %353[1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<1 x i64>, array<1 x i64>)> | |
%379 = llvm.getelementptr %378[%1] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32> | |
%380 = llvm.bitcast %379 : !llvm.ptr<i32> to !llvm.ptr<vector<4xi32>> | |
llvm.store %359, %380 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>> | |
%381 = llvm.extractvalue %353[1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<1 x i64>, array<1 x i64>)> | |
%382 = llvm.getelementptr %381[%1] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32> | |
%383 = llvm.bitcast %382 : !llvm.ptr<i32> to !llvm.ptr<vector<4xi32>> | |
%384 = llvm.load %383 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>> | |
%385 = llvm.extractvalue %278[1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64)> | |
%386 = llvm.extractvalue %278[2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64)> | |
%387 = llvm.getelementptr %385[%386] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32> | |
%388 = llvm.load %387 : !llvm.ptr<i32> | |
%389 = llvm.mlir.undef : vector<1xi32> | |
%390 = llvm.mlir.constant(0 : i32) : i32 | |
%391 = llvm.insertelement %388, %389[%390 : i32] : vector<1xi32> | |
%392 = llvm.mlir.constant(0 : index) : i64 | |
%393 = llvm.extractelement %391[%392 : i64] : vector<1xi32> | |
%394 = "llvm.intr.vector.reduce.add"(%384) : (vector<4xi32>) -> i32 | |
%395 = llvm.add %393, %394 : i32 | |
%396 = llvm.insertelement %395, %2[%1 : i64] : vector<1xi32> | |
%397 = llvm.mlir.constant(0 : i64) : i64 | |
%398 = llvm.extractelement %396[%397 : i64] : vector<1xi32> | |
%399 = llvm.mlir.undef : vector<1xi32> | |
%400 = llvm.mlir.constant(0 : i32) : i32 | |
%401 = llvm.insertelement %398, %399[%400 : i32] : vector<1xi32> | |
%402 = llvm.mlir.constant(0 : index) : i64 | |
%403 = llvm.extractelement %401[%402 : i64] : vector<1xi32> | |
%404 = llvm.extractvalue %278[1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64)> | |
%405 = llvm.extractvalue %278[2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64)> | |
%406 = llvm.getelementptr %404[%405] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32> | |
llvm.store %403, %406 : !llvm.ptr<i32> | |
%407 = llvm.add %237, %4 : i64 | |
llvm.br ^bb4(%407 : i64) | |
^bb12: // pred: ^bb4 | |
%408 = llvm.add %155, %3 : i64 | |
llvm.br ^bb2(%408 : i64) | |
^bb13: // pred: ^bb2 | |
%409 = llvm.add %153, %4 : i64 | |
llvm.br ^bb1(%409 : i64) | |
^bb14: // pred: ^bb1 | |
%410 = llvm.mlir.constant(0 : i32) : i32 | |
llvm.return %410 : i32 | |
} | |
} | |
} | |
} | |
// Host-side body of the test after lowering to the stream dialect.
// It fills an input resource with the i32 value 1, runs the reduction dispatch
// into the lower half of an output resource, fills the upper half with the
// i32 value 128, and finally checks that both halves compare equal.
func.func private @_split_reduction_pass2() { | |
// Size of the whole output resource; equals 2 * 524288.
%c1048576 = arith.constant 1048576 : index | |
%c0 = arith.constant 0 : index | |
// Size of one tensor<512x256xi32> region (see the stream.tensor.export ops below);
// also used as the offset of the upper half of the output resource.
%c524288 = arith.constant 524288 : index | |
// Size of the input resource. 67108864 == 512 * 256 * 128 * 4, which presumably is
// the byte size of the 512x256x128 i32 input seen earlier in the dump -- TODO confirm.
%c67108864 = arith.constant 67108864 : index | |
// Fill pattern for the input resource.
%c1_i32 = arith.constant 1 : i32 | |
// Fill pattern for the upper half of the output resource (the value the result is compared to).
%c128_i32 = arith.constant 128 : i32 | |
// Workload operands passed to the dispatch.
%c512 = arith.constant 512 : index | |
%c256 = arith.constant 256 : index | |
%c1 = arith.constant 1 : index | |
// Allocate the input as an uninitialized transient resource and fill all of it with 1.
%0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c67108864} | |
%1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c67108864}) { | |
stream.cmd.fill %c1_i32, %arg0[%c0 for %c67108864] : i32 -> !stream.resource<transient>{%c67108864} | |
} => !stream.timepoint | |
// Wait for the fill to complete before the resource is used.
%2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c67108864} | |
// NOTE(review): presumably the lowered form of the util.unfoldable_constant in the
// original test, keeping the compiler from folding the filled input -- confirm.
%3 = util.do_not_optimize(%2) : !stream.resource<transient> | |
// The size is re-queried from the value returned by util.do_not_optimize and used below.
%4 = stream.resource.size %3 : !stream.resource<transient> | |
// Single external resource holding two 524288-sized regions: result and expected value.
%5 = stream.resource.alloc uninitialized : !stream.resource<external>{%c1048576} | |
%6 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%4}, %5 as %arg1: !stream.resource<external>{%c1048576}) { | |
// The dispatch and the fill are grouped in one stream.cmd.concurrent region.
stream.cmd.concurrent { | |
// Reduction dispatch with workload [512, 256, 1]; reads %arg0, writes %arg1.
stream.cmd.dispatch @_split_reduction_pass2_dispatch_0::@_split_reduction_pass2_dispatch_0_generic_512x256x128[%c512, %c256, %c1] { | |
ro %arg0[%c0 for %4] : !stream.resource<transient>{%4}, | |
// NOTE(review): the write-only range spans the full 1048576 resource, which overlaps
// the [524288, 1048576) region written by the concurrent fill below. Presumably the
// dispatch only touches the first 524288 -- confirm.
wo %arg1[%c0 for %c1048576] : !stream.resource<external>{%c1048576} | |
} attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>]} | |
// Write the i32 value 128 into the upper half of the output resource.
stream.cmd.fill %c128_i32, %arg1[%c524288 for %c524288] : i32 -> !stream.resource<external>{%c1048576} | |
} | |
} => !stream.timepoint | |
%7 = stream.timepoint.await %6 => %5 : !stream.resource<external>{%c1048576} | |
// Split the output resource into its lower half (offset 0) and upper half (offset 524288).
%8 = stream.resource.subview %7[%c0] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288} | |
%9 = stream.resource.subview %7[%c524288] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288} | |
// Export both halves as tensor<512x256xi32> values and compare them.
%10 = stream.tensor.export %8 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32> | |
%11 = stream.tensor.export %9 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32> | |
check.expect_eq(%10, %11) : tensor<512x256xi32> | |
return | |
} | |
} | |
// -----// IR Dump After ReconcileUnrealizedCasts (reconcile-unrealized-casts) ('builtin.module' operation) //----- // | |
// Attribute aliases referenced by the module that follows.
// Device target "llvm-cpu": lists a single executable target and carries the legacy_sync flag.
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}> | |
// Stand-alone alias with the same contents as the executable target listed inside the
// device target above; used as the target of the executable variant.
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
// Pipeline layout: no push constants; one descriptor set (0) with two storage-buffer
// bindings, of which binding 0 is marked ReadOnly.
#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]> | |
// Translation info attached to the exported entry point; names the CPUDoubleTilingExpert pipeline.
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert> | |
module attributes {hal.device.targets = [#device_target_llvm_cpu]} { | |
// Public entry point, tagged with the iree.abi.stub attribute. It takes no arguments,
// returns nothing, and only forwards to the private @_split_reduction_pass2.
func.func @split_reduction_pass2() attributes {iree.abi.stub} { | |
call @_split_reduction_pass2() : () -> () | |
return | |
} | |
hal.executable private @_split_reduction_pass2_dispatch_0 { | |
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ { | |
// Export record for the dispatch entry point (ordinal 0), using #pipeline_layout and the
// #translation info. The attached region takes the device plus three index arguments but
// uses none of them; it always returns the constants (8, 16, 1).
// NOTE(review): presumably the x/y/z workgroup count -- confirm.
hal.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 ordinal(0) layout(#pipeline_layout) attributes {translation_info = #translation} { | |
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index): | |
%c1 = arith.constant 1 : index | |
%c16 = arith.constant 16 : index | |
%c8 = arith.constant 8 : index | |
hal.return %c8, %c16, %c1 : index, index, index | |
} | |
builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} { | |
llvm.func internal @_split_reduction_pass2_dispatch_0_generic_512x256x128(%arg0: !llvm.ptr<struct<"iree_hal_executable_environment_v0_t", (ptr<i32>, ptr<func<i32 (ptr<func<i32 (ptr<i8>)>>, ptr<i8>)>>, ptr<ptr<func<i32 (ptr<i8>)>>>, struct<"iree_hal_processor_v0_t", (array<8 x i64>)>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>> {llvm.align = 16 : i64, llvm.noalias}) -> i32 { | |
%0 = llvm.mlir.constant(dense<0> : vector<4xi32>) : vector<4xi32> | |
%1 = llvm.mlir.constant(0 : index) : i64 | |
%2 = llvm.mlir.constant(dense<0> : vector<1xi32>) : vector<1xi32> | |
%3 = llvm.mlir.constant(4 : index) : i64 | |
%4 = llvm.mlir.constant(1 : index) : i64 | |
%5 = llvm.mlir.constant(32 : index) : i64 | |
%6 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> | |
%7 = llvm.extractvalue %6[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)> | |
%8 = llvm.mlir.constant(0 : i64) : i64 | |
%9 = llvm.load %7 : !llvm.ptr<ptr<i8>> | |
%10 = llvm.bitcast %9 : !llvm.ptr<i8> to !llvm.ptr<i32> | |
%11 = llvm.mlir.undef : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)> | |
%12 = llvm.insertvalue %10, %11[0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)> | |
%13 = llvm.insertvalue %10, %12[1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)> | |
%14 = llvm.mlir.constant(0 : index) : i64 | |
%15 = llvm.insertvalue %14, %13[2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)> | |
%16 = llvm.mlir.constant(512 : index) : i64 | |
%17 = llvm.insertvalue %16, %15[3, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)> | |
%18 = llvm.mlir.constant(32768 : index) : i64 | |
%19 = llvm.insertvalue %18, %17[4, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)> | |
%20 = llvm.mlir.constant(256 : index) : i64 | |
%21 = llvm.insertvalue %20, %19[3, 1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)> | |
%22 = llvm.mlir.constant(128 : index) : i64 | |
%23 = llvm.insertvalue %22, %21[4, 1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)> | |
%24 = llvm.mlir.constant(128 : index) : i64 | |
%25 = llvm.insertvalue %24, %23[3, 2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)> | |
%26 = llvm.mlir.constant(1 : index) : i64 | |
%27 = llvm.insertvalue %26, %25[4, 2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)> | |
%28 = llvm.mlir.constant(0 : index) : i64 | |
%29 = llvm.mlir.constant(63 : index) : i64 | |
%30 = llvm.ptrtoint %10 : !llvm.ptr<i32> to i64 | |
%31 = llvm.and %30, %29 : i64 | |
%32 = llvm.icmp "eq" %31, %28 : i64 | |
"llvm.intr.assume"(%32) : (i1) -> () | |
%33 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> | |
%34 = llvm.extractvalue %33[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)> | |
%35 = llvm.mlir.constant(1 : i64) : i64 | |
%36 = llvm.getelementptr %34[1] : (!llvm.ptr<ptr<i8>>) -> !llvm.ptr<ptr<i8>> | |
%37 = llvm.load %36 : !llvm.ptr<ptr<i8>> | |
%38 = llvm.bitcast %37 : !llvm.ptr<i8> to !llvm.ptr<i32> | |
%39 = llvm.mlir.undef : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)> | |
%40 = llvm.insertvalue %38, %39[0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)> | |
%41 = llvm.insertvalue %38, %40[1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)> | |
%42 = llvm.mlir.constant(0 : index) : i64 | |
%43 = llvm.insertvalue %42, %41[2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)> | |
%44 = llvm.mlir.constant(512 : index) : i64 | |
%45 = llvm.insertvalue %44, %43[3, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)> | |
%46 = llvm.mlir.constant(256 : index) : i64 | |
%47 = llvm.insertvalue %46, %45[4, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)> | |
%48 = llvm.mlir.constant(256 : index) : i64 | |
%49 = llvm.insertvalue %48, %47[3, 1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)> | |
%50 = llvm.mlir.constant(1 : index) : i64 | |
%51 = llvm.insertvalue %50, %49[4, 1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)> | |
%52 = llvm.mlir.constant(0 : index) : i64 | |
%53 = llvm.mlir.constant(63 : index) : i64 | |
%54 = llvm.ptrtoint %38 : !llvm.ptr<i32> to i64 | |
%55 = llvm.and %54, %53 : i64 | |
%56 = llvm.icmp "eq" %55, %52 : i64 | |
"llvm.intr.assume"(%56) : (i1) -> () | |
%57 = llvm.load %arg2 : !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>> | |
%58 = llvm.extractvalue %57[0] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)> | |
%59 = llvm.zext %58 : i32 to i64 | |
%60 = llvm.load %arg2 : !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>> | |
%61 = llvm.extractvalue %60[1] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)> | |
%62 = llvm.zext %61 : i32 to i64 | |
%63 = llvm.mlir.constant(32 : index) : i64 | |
%64 = llvm.mul %62, %63 : i64 | |
%65 = llvm.mlir.constant(32 : index) : i64 | |
%66 = llvm.mul %59, %65 : i64 | |
%67 = llvm.mlir.constant(1 : index) : i64 | |
%68 = llvm.mlir.constant(1 : index) : i64 | |
%69 = llvm.mlir.constant(4 : index) : i64 | |
%70 = llvm.mlir.constant(1 : index) : i64 | |
%71 = llvm.mlir.constant(4 : index) : i64 | |
%72 = llvm.mlir.constant(4 : index) : i64 | |
%73 = llvm.mlir.null : !llvm.ptr<i32> | |
%74 = llvm.getelementptr %73[4] : (!llvm.ptr<i32>) -> !llvm.ptr<i32> | |
%75 = llvm.ptrtoint %74 : !llvm.ptr<i32> to i64 | |
%76 = llvm.alloca %75 x i32 {alignment = 128 : i64} : (i64) -> !llvm.ptr<i32> | |
%77 = llvm.mlir.undef : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)> | |
%78 = llvm.insertvalue %76, %77[0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)> | |
%79 = llvm.insertvalue %76, %78[1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)> | |
%80 = llvm.mlir.constant(0 : index) : i64 | |
%81 = llvm.insertvalue %80, %79[2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)> | |
%82 = llvm.insertvalue %67, %81[3, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)> | |
%83 = llvm.insertvalue %68, %82[3, 1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)> | |
%84 = llvm.insertvalue %69, %83[3, 2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)> | |
%85 = llvm.insertvalue %71, %84[4, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)> | |
%86 = llvm.insertvalue %69, %85[4, 1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)> | |
%87 = llvm.insertvalue %70, %86[4, 2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)> | |
%88 = llvm.mlir.constant(4 : index) : i64 | |
%89 = llvm.mul %1, %88 : i64 | |
%90 = llvm.mlir.constant(4 : index) : i64 | |
%91 = llvm.mul %1, %90 : i64 | |
%92 = llvm.add %89, %91 : i64 | |
%93 = llvm.add %92, %1 : i64 | |
%94 = llvm.getelementptr %76[%93] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32> | |
%95 = llvm.bitcast %94 : !llvm.ptr<i32> to !llvm.ptr<vector<4xi32>> | |
llvm.store %0, %95 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>> | |
%96 = llvm.mlir.undef : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)> | |
%97 = llvm.insertvalue %38, %96[0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)> | |
%98 = llvm.insertvalue %38, %97[1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)> | |
%99 = llvm.mul %64, %46 : i64 | |
%100 = llvm.add %42, %99 : i64 | |
%101 = llvm.mul %66, %50 : i64 | |
%102 = llvm.add %100, %101 : i64 | |
%103 = llvm.insertvalue %102, %98[2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)> | |
%104 = llvm.mlir.constant(32 : i64) : i64 | |
%105 = llvm.mlir.constant(1 : i64) : i64 | |
%106 = llvm.insertvalue %104, %103[3, 1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)> | |
%107 = llvm.insertvalue %105, %106[4, 1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)> | |
%108 = llvm.mlir.constant(32 : i64) : i64 | |
%109 = llvm.mlir.constant(256 : i64) : i64 | |
%110 = llvm.insertvalue %108, %107[3, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)> | |
%111 = llvm.insertvalue %109, %110[4, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)> | |
%112 = llvm.mlir.undef : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)> | |
%113 = llvm.insertvalue %10, %112[0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)> | |
%114 = llvm.insertvalue %10, %113[1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)> | |
%115 = llvm.mul %64, %18 : i64 | |
%116 = llvm.add %14, %115 : i64 | |
%117 = llvm.mul %66, %22 : i64 | |
%118 = llvm.add %116, %117 : i64 | |
%119 = llvm.mlir.constant(0 : i64) : i64 | |
%120 = llvm.mul %119, %26 : i64 | |
%121 = llvm.add %118, %120 : i64 | |
%122 = llvm.insertvalue %121, %114[2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)> | |
%123 = llvm.mlir.constant(128 : i64) : i64 | |
%124 = llvm.mlir.constant(1 : i64) : i64 | |
%125 = llvm.insertvalue %123, %122[3, 2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)> | |
%126 = llvm.insertvalue %124, %125[4, 2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)> | |
%127 = llvm.mlir.constant(32 : i64) : i64 | |
%128 = llvm.mlir.constant(128 : i64) : i64 | |
%129 = llvm.insertvalue %127, %126[3, 1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)> | |
%130 = llvm.insertvalue %128, %129[4, 1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)> | |
%131 = llvm.mlir.constant(32 : i64) : i64 | |
%132 = llvm.mlir.constant(32768 : i64) : i64 | |
%133 = llvm.insertvalue %131, %130[3, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)> | |
%134 = llvm.insertvalue %132, %133[4, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)> | |
llvm.br ^bb1(%1 : i64) | |
^bb1(%135: i64): // 2 preds: ^bb0, ^bb13 | |
%136 = llvm.icmp "slt" %135, %5 : i64 | |
llvm.cond_br %136, ^bb2(%1 : i64), ^bb14 | |
^bb2(%137: i64): // 2 preds: ^bb1, ^bb12 | |
%138 = llvm.icmp "slt" %137, %5 : i64 | |
llvm.cond_br %138, ^bb3, ^bb13 | |
^bb3: // pred: ^bb2 | |
%139 = llvm.mlir.undef : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)> | |
%140 = llvm.insertvalue %10, %139[0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)> | |
%141 = llvm.insertvalue %10, %140[1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)> | |
%142 = llvm.mul %135, %132 : i64 | |
%143 = llvm.add %121, %142 : i64 | |
%144 = llvm.mul %137, %128 : i64 | |
%145 = llvm.add %143, %144 : i64 | |
%146 = llvm.mlir.constant(0 : i64) : i64 | |
%147 = llvm.mul %146, %124 : i64 | |
%148 = llvm.add %145, %147 : i64 | |
%149 = llvm.insertvalue %148, %141[2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)> | |
%150 = llvm.mlir.constant(128 : i64) : i64 | |
%151 = llvm.mlir.constant(1 : i64) : i64 | |
%152 = llvm.insertvalue %150, %149[3, 1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)> | |
%153 = llvm.insertvalue %151, %152[4, 1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)> | |
%154 = llvm.mlir.constant(4 : i64) : i64 | |
%155 = llvm.mlir.constant(128 : i64) : i64 | |
%156 = llvm.insertvalue %154, %153[3, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)> | |
%157 = llvm.insertvalue %155, %156[4, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)> | |
%158 = llvm.mlir.undef : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)> | |
%159 = llvm.insertvalue %10, %158[0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)> | |
%160 = llvm.insertvalue %10, %159[1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)> | |
%161 = llvm.insertvalue %148, %160[2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)> | |
%162 = llvm.mlir.constant(1 : index) : i64 | |
%163 = llvm.mlir.constant(4 : index) : i64 | |
%164 = llvm.mlir.constant(128 : index) : i64 | |
%165 = llvm.insertvalue %162, %161[3, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)> | |
%166 = llvm.insertvalue %163, %165[3, 1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)> | |
%167 = llvm.insertvalue %164, %166[3, 2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)> | |
%168 = llvm.mlir.constant(512 : index) : i64 | |
%169 = llvm.insertvalue %168, %167[4, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)> | |
%170 = llvm.mlir.constant(128 : index) : i64 | |
%171 = llvm.insertvalue %170, %169[4, 1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)> | |
%172 = llvm.mlir.constant(1 : index) : i64 | |
%173 = llvm.insertvalue %172, %171[4, 2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)> | |
%174 = llvm.mlir.undef : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<1 x i64>, array<1 x i64>)> | |
%175 = llvm.insertvalue %38, %174[0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<1 x i64>, array<1 x i64>)> | |
%176 = llvm.insertvalue %38, %175[1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<1 x i64>, array<1 x i64>)> | |
%177 = llvm.mul %135, %109 : i64 | |
%178 = llvm.add %102, %177 : i64 | |
%179 = llvm.mul %137, %105 : i64 | |
%180 = llvm.add %178, %179 : i64 | |
%181 = llvm.insertvalue %180, %176[2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<1 x i64>, array<1 x i64>)> | |
%182 = llvm.mlir.constant(4 : i64) : i64 | |
%183 = llvm.mlir.constant(1 : i64) : i64 | |
%184 = llvm.insertvalue %182, %181[3, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<1 x i64>, array<1 x i64>)> | |
%185 = llvm.insertvalue %183, %184[4, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<1 x i64>, array<1 x i64>)> | |
%186 = llvm.mlir.undef : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)> | |
%187 = llvm.insertvalue %38, %186[0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)> | |
%188 = llvm.insertvalue %38, %187[1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)> | |
%189 = llvm.insertvalue %180, %188[2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)> | |
%190 = llvm.mlir.constant(1 : index) : i64 | |
%191 = llvm.mlir.constant(4 : index) : i64 | |
%192 = llvm.insertvalue %190, %189[3, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)> | |
%193 = llvm.insertvalue %191, %192[3, 1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)> | |
%194 = llvm.mlir.constant(4 : index) : i64 | |
%195 = llvm.insertvalue %194, %193[4, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)> | |
%196 = llvm.mlir.constant(1 : index) : i64 | |
%197 = llvm.insertvalue %196, %195[4, 1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)> | |
llvm.br ^bb4(%1 : i64) | |
^bb4(%198: i64): // 2 preds: ^bb3, ^bb11 | |
%199 = llvm.icmp "slt" %198, %3 : i64 | |
llvm.cond_br %199, ^bb5, ^bb12 | |
^bb5: // pred: ^bb4 | |
%200 = llvm.mlir.undef : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<1 x i64>, array<1 x i64>)> | |
%201 = llvm.insertvalue %10, %200[0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<1 x i64>, array<1 x i64>)> | |
%202 = llvm.insertvalue %10, %201[1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<1 x i64>, array<1 x i64>)> | |
%203 = llvm.mlir.constant(0 : i64) : i64 | |
%204 = llvm.mul %203, %168 : i64 | |
%205 = llvm.add %148, %204 : i64 | |
%206 = llvm.mul %198, %170 : i64 | |
%207 = llvm.add %205, %206 : i64 | |
%208 = llvm.mlir.constant(0 : i64) : i64 | |
%209 = llvm.mul %208, %172 : i64 | |
%210 = llvm.add %207, %209 : i64 | |
%211 = llvm.insertvalue %210, %202[2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<1 x i64>, array<1 x i64>)> | |
%212 = llvm.mlir.constant(128 : i64) : i64 | |
%213 = llvm.mlir.constant(1 : i64) : i64 | |
%214 = llvm.insertvalue %212, %211[3, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<1 x i64>, array<1 x i64>)> | |
%215 = llvm.insertvalue %213, %214[4, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<1 x i64>, array<1 x i64>)> | |
%216 = llvm.mlir.undef : !llvm.struct<(ptr<i32>, ptr<i32>, i64)> | |
%217 = llvm.insertvalue %38, %216[0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64)> | |
%218 = llvm.insertvalue %38, %217[1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64)> | |
%219 = llvm.mlir.constant(0 : i64) : i64 | |
%220 = llvm.mul %219, %194 : i64 | |
%221 = llvm.add %180, %220 : i64 | |
%222 = llvm.mul %198, %196 : i64 | |
%223 = llvm.add %221, %222 : i64 | |
%224 = llvm.insertvalue %223, %218[2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64)> | |
%225 = llvm.mlir.undef : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<4 x i64>, array<4 x i64>)> | |
%226 = llvm.insertvalue %10, %225[0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<4 x i64>, array<4 x i64>)> | |
%227 = llvm.insertvalue %10, %226[1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<4 x i64>, array<4 x i64>)> | |
%228 = llvm.insertvalue %210, %227[2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<4 x i64>, array<4 x i64>)> | |
%229 = llvm.mlir.constant(1 : index) : i64 | |
%230 = llvm.mlir.constant(1 : index) : i64 | |
%231 = llvm.mlir.constant(32 : index) : i64 | |
%232 = llvm.mlir.constant(4 : index) : i64 | |
%233 = llvm.insertvalue %229, %228[3, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<4 x i64>, array<4 x i64>)> | |
%234 = llvm.insertvalue %230, %233[3, 1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<4 x i64>, array<4 x i64>)> | |
%235 = llvm.insertvalue %231, %234[3, 2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<4 x i64>, array<4 x i64>)> | |
%236 = llvm.insertvalue %232, %235[3, 3] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<4 x i64>, array<4 x i64>)> | |
%237 = llvm.mlir.constant(128 : index) : i64 | |
%238 = llvm.insertvalue %237, %236[4, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<4 x i64>, array<4 x i64>)> | |
%239 = llvm.mlir.constant(128 : index) : i64 | |
%240 = llvm.insertvalue %239, %238[4, 1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<4 x i64>, array<4 x i64>)> | |
%241 = llvm.mlir.constant(4 : index) : i64 | |
%242 = llvm.insertvalue %241, %240[4, 2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<4 x i64>, array<4 x i64>)> | |
%243 = llvm.mlir.constant(1 : index) : i64 | |
%244 = llvm.insertvalue %243, %242[4, 3] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<4 x i64>, array<4 x i64>)> | |
%245 = llvm.mlir.constant(1 : index) : i64 | |
%246 = llvm.mlir.constant(1 : index) : i64 | |
%247 = llvm.mlir.constant(4 : index) : i64 | |
%248 = llvm.mlir.constant(1 : index) : i64 | |
%249 = llvm.mlir.constant(4 : index) : i64 | |
%250 = llvm.mlir.constant(4 : index) : i64 | |
%251 = llvm.mlir.null : !llvm.ptr<i32> | |
%252 = llvm.getelementptr %251[4] : (!llvm.ptr<i32>) -> !llvm.ptr<i32> | |
%253 = llvm.ptrtoint %252 : !llvm.ptr<i32> to i64 | |
%254 = llvm.alloca %253 x i32 {alignment = 128 : i64} : (i64) -> !llvm.ptr<i32> | |
%255 = llvm.mlir.undef : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)> | |
%256 = llvm.insertvalue %254, %255[0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)> | |
%257 = llvm.insertvalue %254, %256[1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)> | |
%258 = llvm.mlir.constant(0 : index) : i64 | |
%259 = llvm.insertvalue %258, %257[2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)> | |
%260 = llvm.insertvalue %245, %259[3, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)> | |
%261 = llvm.insertvalue %246, %260[3, 1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)> | |
%262 = llvm.insertvalue %247, %261[3, 2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)> | |
%263 = llvm.insertvalue %249, %262[4, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)> | |
%264 = llvm.insertvalue %247, %263[4, 1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)> | |
%265 = llvm.insertvalue %248, %264[4, 2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)> | |
llvm.br ^bb6(%1 : i64) | |
^bb6(%266: i64): // 2 preds: ^bb5, ^bb7 | |
%267 = llvm.icmp "slt" %266, %3 : i64 | |
llvm.cond_br %267, ^bb7, ^bb8 | |
^bb7: // pred: ^bb6 | |
%268 = llvm.mlir.constant(4 : index) : i64 | |
%269 = llvm.mul %1, %268 : i64 | |
%270 = llvm.mlir.constant(4 : index) : i64 | |
%271 = llvm.mul %1, %270 : i64 | |
%272 = llvm.add %269, %271 : i64 | |
%273 = llvm.add %272, %266 : i64 | |
%274 = llvm.getelementptr %76[%273] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32> | |
%275 = llvm.load %274 : !llvm.ptr<i32> | |
%276 = llvm.mlir.constant(4 : index) : i64 | |
%277 = llvm.mul %1, %276 : i64 | |
%278 = llvm.mlir.constant(4 : index) : i64 | |
%279 = llvm.mul %1, %278 : i64 | |
%280 = llvm.add %277, %279 : i64 | |
%281 = llvm.add %280, %266 : i64 | |
%282 = llvm.getelementptr %254[%281] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32> | |
llvm.store %275, %282 : !llvm.ptr<i32> | |
%283 = llvm.add %266, %4 : i64 | |
llvm.br ^bb6(%283 : i64) | |
^bb8: // pred: ^bb6 | |
%284 = llvm.mlir.undef : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<1 x i64>, array<1 x i64>)> | |
%285 = llvm.insertvalue %254, %284[0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<1 x i64>, array<1 x i64>)> | |
%286 = llvm.insertvalue %254, %285[1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<1 x i64>, array<1 x i64>)> | |
%287 = llvm.insertvalue %258, %286[2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<1 x i64>, array<1 x i64>)> | |
%288 = llvm.mlir.constant(4 : index) : i64 | |
%289 = llvm.insertvalue %288, %287[3, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<1 x i64>, array<1 x i64>)> | |
%290 = llvm.mlir.constant(1 : index) : i64 | |
%291 = llvm.insertvalue %290, %289[4, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<1 x i64>, array<1 x i64>)> | |
%292 = llvm.bitcast %254 : !llvm.ptr<i32> to !llvm.ptr<vector<4xi32>> | |
%293 = llvm.load %292 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>> | |
llvm.br ^bb9(%1, %293 : i64, vector<4xi32>) | |
^bb9(%294: i64, %295: vector<4xi32>): // 2 preds: ^bb8, ^bb10 | |
%296 = llvm.icmp "slt" %294, %5 : i64 | |
llvm.cond_br %296, ^bb10, ^bb11 | |
^bb10: // pred: ^bb9 | |
%297 = llvm.mlir.constant(128 : index) : i64 | |
%298 = llvm.mul %1, %297 : i64 | |
%299 = llvm.add %210, %298 : i64 | |
%300 = llvm.mlir.constant(128 : index) : i64 | |
%301 = llvm.mul %1, %300 : i64 | |
%302 = llvm.add %299, %301 : i64 | |
%303 = llvm.mlir.constant(4 : index) : i64 | |
%304 = llvm.mul %294, %303 : i64 | |
%305 = llvm.add %302, %304 : i64 | |
%306 = llvm.add %305, %1 : i64 | |
%307 = llvm.getelementptr %10[%306] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32> | |
%308 = llvm.bitcast %307 : !llvm.ptr<i32> to !llvm.ptr<vector<4xi32>> | |
%309 = llvm.load %308 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>> | |
%310 = llvm.add %309, %295 : vector<4xi32> | |
%311 = llvm.add %294, %4 : i64 | |
llvm.br ^bb9(%311, %310 : i64, vector<4xi32>) | |
^bb11: // pred: ^bb9 | |
%312 = llvm.bitcast %254 : !llvm.ptr<i32> to !llvm.ptr<vector<4xi32>> | |
llvm.store %295, %312 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>> | |
%313 = llvm.bitcast %254 : !llvm.ptr<i32> to !llvm.ptr<vector<4xi32>> | |
%314 = llvm.load %313 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>> | |
%315 = llvm.getelementptr %38[%223] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32> | |
%316 = llvm.load %315 : !llvm.ptr<i32> | |
%317 = llvm.mlir.undef : vector<1xi32> | |
%318 = llvm.mlir.constant(0 : i32) : i32 | |
%319 = llvm.insertelement %316, %317[%318 : i32] : vector<1xi32> | |
%320 = llvm.mlir.constant(0 : index) : i64 | |
%321 = llvm.extractelement %319[%320 : i64] : vector<1xi32> | |
%322 = "llvm.intr.vector.reduce.add"(%314) : (vector<4xi32>) -> i32 | |
%323 = llvm.add %321, %322 : i32 | |
%324 = llvm.insertelement %323, %2[%1 : i64] : vector<1xi32> | |
%325 = llvm.mlir.constant(0 : i64) : i64 | |
%326 = llvm.extractelement %324[%325 : i64] : vector<1xi32> | |
%327 = llvm.mlir.undef : vector<1xi32> | |
%328 = llvm.mlir.constant(0 : i32) : i32 | |
%329 = llvm.insertelement %326, %327[%328 : i32] : vector<1xi32> | |
%330 = llvm.mlir.constant(0 : index) : i64 | |
%331 = llvm.extractelement %329[%330 : i64] : vector<1xi32> | |
%332 = llvm.getelementptr %38[%223] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32> | |
llvm.store %331, %332 : !llvm.ptr<i32> | |
%333 = llvm.add %198, %4 : i64 | |
llvm.br ^bb4(%333 : i64) | |
^bb12: // pred: ^bb4 | |
%334 = llvm.add %137, %3 : i64 | |
llvm.br ^bb2(%334 : i64) | |
^bb13: // pred: ^bb2 | |
%335 = llvm.add %135, %4 : i64 | |
llvm.br ^bb1(%335 : i64) | |
^bb14: // pred: ^bb1 | |
%336 = llvm.mlir.constant(0 : i32) : i32 | |
llvm.return %336 : i32 | |
} | |
} | |
} | |
} | |
// Private test body: fills an input resource with i32 1, dispatches
// @_split_reduction_pass2_dispatch_0_generic_512x256x128 into the first half
// of an output resource, fills the second half with i32 128, and then checks
// that the two halves are equal when viewed as tensor<512x256xi32>.
func.func private @_split_reduction_pass2() { | |
// Size/offset constants used below. 67108864 = 512*256*128*4 and
// 524288 = 512*256*4; presumably these are byte sizes of i32 tensors of those
// shapes (the 512x256 case matches the exports at the end of this function).
%c1048576 = arith.constant 1048576 : index | |
%c0 = arith.constant 0 : index | |
%c524288 = arith.constant 524288 : index | |
%c67108864 = arith.constant 67108864 : index | |
%c1_i32 = arith.constant 1 : i32 | |
%c128_i32 = arith.constant 128 : i32 | |
%c512 = arith.constant 512 : index | |
%c256 = arith.constant 256 : index | |
%c1 = arith.constant 1 : index | |
// Input: transient resource of size %c67108864, left uninitialized by the
// alloc and then filled over its whole range [0, 67108864) with the i32 value 1.
%0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c67108864} | |
%1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c67108864}) { | |
stream.cmd.fill %c1_i32, %arg0[%c0 for %c67108864] : i32 -> !stream.resource<transient>{%c67108864} | |
} => !stream.timepoint | |
// Wait for the fill to complete before the resource is used.
%2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c67108864} | |
// The filled resource is routed through util.do_not_optimize; its size is
// re-queried afterwards (%4) instead of reusing %c67108864.
%3 = util.do_not_optimize(%2) : !stream.resource<transient> | |
%4 = stream.resource.size %3 : !stream.resource<transient> | |
// Output: one external resource of size %c1048576 (= 2 * 524288) that holds
// both the dispatch result and the expected values.
%5 = stream.resource.alloc uninitialized : !stream.resource<external>{%c1048576} | |
%6 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%4}, %5 as %arg1: !stream.resource<external>{%c1048576}) { | |
// The dispatch and the fill are grouped in a single stream.cmd.concurrent region.
stream.cmd.concurrent { | |
// Dispatch with workload [%c512, %c256, %c1]: reads %arg0 over [0, %4) and
// has a write-only binding on %arg1 over [0, 1048576).
// NOTE(review): the write-only range spans the whole output resource,
// including the half targeted by the fill below; presumably the dispatch only
// writes the first 524288 — confirm.
stream.cmd.dispatch @_split_reduction_pass2_dispatch_0::@_split_reduction_pass2_dispatch_0_generic_512x256x128[%c512, %c256, %c1] { | |
ro %arg0[%c0 for %4] : !stream.resource<transient>{%4}, | |
wo %arg1[%c0 for %c1048576] : !stream.resource<external>{%c1048576} | |
} attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>]} | |
// Expected values: fill the second half [524288, 1048576) with the i32 value 128.
stream.cmd.fill %c128_i32, %arg1[%c524288 for %c524288] : i32 -> !stream.resource<external>{%c1048576} | |
} | |
} => !stream.timepoint | |
// Wait for the dispatch and the fill before reading the output resource.
%7 = stream.timepoint.await %6 => %5 : !stream.resource<external>{%c1048576} | |
// Split the output into two subviews of size 524288: %8 at offset 0 and
// %9 at offset 524288 (the region filled with 128).
%8 = stream.resource.subview %7[%c0] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288} | |
%9 = stream.resource.subview %7[%c524288] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288} | |
// Export both subviews as tensor<512x256xi32> and require them to be equal.
%10 = stream.tensor.export %8 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32> | |
%11 = stream.tensor.export %9 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32> | |
check.expect_eq(%10, %11) : tensor<512x256xi32> | |
return | |
} | |
} | |
// -----// IR Dump After LLVMCPUSynchronizeSymbolVisibility (iree-llvmcpu-synchronize-symbol-visibility) ('builtin.module' operation) //----- // | |
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}> | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]> | |
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert> | |
module attributes {hal.device.targets = [#device_target_llvm_cpu]} { | |
// Public function tagged with the iree.abi.stub attribute. It takes no
// arguments, forwards to the private @_split_reduction_pass2, and returns
// no results.
func.func @split_reduction_pass2() attributes {iree.abi.stub} { | |
call @_split_reduction_pass2() : () -> () | |
return | |
} | |
hal.executable private @_split_reduction_pass2_dispatch_0 { | |
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ { | |
// Export record for ordinal 0 of this executable variant, using
// #pipeline_layout and carrying #translation as its translation_info.
// The attached region accepts a device and three index arguments, uses none
// of them, and returns the constant triple (8, 16, 1).
// NOTE(review): presumably this triple is the workgroup count along x, y, z
// for the dispatch — confirm against the HAL dialect documentation.
hal.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 ordinal(0) layout(#pipeline_layout) attributes {translation_info = #translation} { | |
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index): | |
%c1 = arith.constant 1 : index | |
%c16 = arith.constant 16 : index | |
%c8 = arith.constant 8 : index | |
hal.return %c8, %c16, %c1 : index, index, index | |
} | |
builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} { | |
llvm.func internal @_split_reduction_pass2_dispatch_0_generic_512x256x128(%arg0: !llvm.ptr<struct<"iree_hal_executable_environment_v0_t", (ptr<i32>, ptr<func<i32 (ptr<func<i32 (ptr<i8>)>>, ptr<i8>)>>, ptr<ptr<func<i32 (ptr<i8>)>>>, struct<"iree_hal_processor_v0_t", (array<8 x i64>)>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>> {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {sym_visibility = "private"} { | |
%0 = llvm.mlir.constant(dense<0> : vector<4xi32>) : vector<4xi32> | |
%1 = llvm.mlir.constant(0 : index) : i64 | |
%2 = llvm.mlir.constant(dense<0> : vector<1xi32>) : vector<1xi32> | |
%3 = llvm.mlir.constant(4 : index) : i64 | |
%4 = llvm.mlir.constant(1 : index) : i64 | |
%5 = llvm.mlir.constant(32 : index) : i64 | |
%6 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> | |
%7 = llvm.extractvalue %6[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)> | |
%8 = llvm.mlir.constant(0 : i64) : i64 | |
%9 = llvm.load %7 : !llvm.ptr<ptr<i8>> | |
%10 = llvm.bitcast %9 : !llvm.ptr<i8> to !llvm.ptr<i32> | |
%11 = llvm.mlir.undef : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)> | |
%12 = llvm.insertvalue %10, %11[0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)> | |
%13 = llvm.insertvalue %10, %12[1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)> | |
%14 = llvm.mlir.constant(0 : index) : i64 | |
%15 = llvm.insertvalue %14, %13[2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)> | |
%16 = llvm.mlir.constant(512 : index) : i64 | |
%17 = llvm.insertvalue %16, %15[3, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)> | |
%18 = llvm.mlir.constant(32768 : index) : i64 | |
%19 = llvm.insertvalue %18, %17[4, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)> | |
%20 = llvm.mlir.constant(256 : index) : i64 | |
%21 = llvm.insertvalue %20, %19[3, 1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)> | |
%22 = llvm.mlir.constant(128 : index) : i64 | |
%23 = llvm.insertvalue %22, %21[4, 1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)> | |
%24 = llvm.mlir.constant(128 : index) : i64 | |
%25 = llvm.insertvalue %24, %23[3, 2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)> | |
%26 = llvm.mlir.constant(1 : index) : i64 | |
%27 = llvm.insertvalue %26, %25[4, 2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)> | |
%28 = llvm.mlir.constant(0 : index) : i64 | |
%29 = llvm.mlir.constant(63 : index) : i64 | |
%30 = llvm.ptrtoint %10 : !llvm.ptr<i32> to i64 | |
%31 = llvm.and %30, %29 : i64 | |
%32 = llvm.icmp "eq" %31, %28 : i64 | |
"llvm.intr.assume"(%32) : (i1) -> () | |
%33 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> | |
%34 = llvm.extractvalue %33[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)> | |
%35 = llvm.mlir.constant(1 : i64) : i64 | |
%36 = llvm.getelementptr %34[1] : (!llvm.ptr<ptr<i8>>) -> !llvm.ptr<ptr<i8>> | |
%37 = llvm.load %36 : !llvm.ptr<ptr<i8>> | |
%38 = llvm.bitcast %37 : !llvm.ptr<i8> to !llvm.ptr<i32> | |
%39 = llvm.mlir.undef : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)> | |
%40 = llvm.insertvalue %38, %39[0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)> | |
%41 = llvm.insertvalue %38, %40[1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)> | |
%42 = llvm.mlir.constant(0 : index) : i64 | |
%43 = llvm.insertvalue %42, %41[2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)> | |
%44 = llvm.mlir.constant(512 : index) : i64 | |
%45 = llvm.insertvalue %44, %43[3, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)> | |
%46 = llvm.mlir.constant(256 : index) : i64 | |
%47 = llvm.insertvalue %46, %45[4, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)> | |
%48 = llvm.mlir.constant(256 : index) : i64 | |
%49 = llvm.insertvalue %48, %47[3, 1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)> | |
%50 = llvm.mlir.constant(1 : index) : i64 | |
%51 = llvm.insertvalue %50, %49[4, 1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)> | |
%52 = llvm.mlir.constant(0 : index) : i64 | |
%53 = llvm.mlir.constant(63 : index) : i64 | |
%54 = llvm.ptrtoint %38 : !llvm.ptr<i32> to i64 | |
%55 = llvm.and %54, %53 : i64 | |
%56 = llvm.icmp "eq" %55, %52 : i64 | |
"llvm.intr.assume"(%56) : (i1) -> () | |
%57 = llvm.load %arg2 : !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>> | |
%58 = llvm.extractvalue %57[0] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)> | |
%59 = llvm.zext %58 : i32 to i64 | |
%60 = llvm.load %arg2 : !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>> | |
%61 = llvm.extractvalue %60[1] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)> | |
%62 = llvm.zext %61 : i32 to i64 | |
%63 = llvm.mlir.constant(32 : index) : i64 | |
%64 = llvm.mul %62, %63 : i64 | |
%65 = llvm.mlir.constant(32 : index) : i64 | |
%66 = llvm.mul %59, %65 : i64 | |
%67 = llvm.mlir.constant(1 : index) : i64 | |
%68 = llvm.mlir.constant(1 : index) : i64 | |
%69 = llvm.mlir.constant(4 : index) : i64 | |
%70 = llvm.mlir.constant(1 : index) : i64 | |
%71 = llvm.mlir.constant(4 : index) : i64 | |
%72 = llvm.mlir.constant(4 : index) : i64 | |
%73 = llvm.mlir.null : !llvm.ptr<i32> | |
%74 = llvm.getelementptr %73[4] : (!llvm.ptr<i32>) -> !llvm.ptr<i32> | |
%75 = llvm.ptrtoint %74 : !llvm.ptr<i32> to i64 | |
%76 = llvm.alloca %75 x i32 {alignment = 128 : i64} : (i64) -> !llvm.ptr<i32> | |
%77 = llvm.mlir.undef : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)> | |
%78 = llvm.insertvalue %76, %77[0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)> | |
%79 = llvm.insertvalue %76, %78[1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)> | |
%80 = llvm.mlir.constant(0 : index) : i64 | |
%81 = llvm.insertvalue %80, %79[2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)> | |
%82 = llvm.insertvalue %67, %81[3, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)> | |
%83 = llvm.insertvalue %68, %82[3, 1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)> | |
%84 = llvm.insertvalue %69, %83[3, 2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)> | |
%85 = llvm.insertvalue %71, %84[4, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)> | |
%86 = llvm.insertvalue %69, %85[4, 1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)> | |
%87 = llvm.insertvalue %70, %86[4, 2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)> | |
%88 = llvm.mlir.constant(4 : index) : i64 | |
%89 = llvm.mul %1, %88 : i64 | |
%90 = llvm.mlir.constant(4 : index) : i64 | |
%91 = llvm.mul %1, %90 : i64 | |
%92 = llvm.add %89, %91 : i64 | |
%93 = llvm.add %92, %1 : i64 | |
%94 = llvm.getelementptr %76[%93] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32> | |
%95 = llvm.bitcast %94 : !llvm.ptr<i32> to !llvm.ptr<vector<4xi32>> | |
llvm.store %0, %95 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>> | |
%96 = llvm.mlir.undef : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)> | |
%97 = llvm.insertvalue %38, %96[0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)> | |
%98 = llvm.insertvalue %38, %97[1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)> | |
%99 = llvm.mul %64, %46 : i64 | |
%100 = llvm.add %42, %99 : i64 | |
%101 = llvm.mul %66, %50 : i64 | |
%102 = llvm.add %100, %101 : i64 | |
%103 = llvm.insertvalue %102, %98[2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)> | |
%104 = llvm.mlir.constant(32 : i64) : i64 | |
%105 = llvm.mlir.constant(1 : i64) : i64 | |
%106 = llvm.insertvalue %104, %103[3, 1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)> | |
%107 = llvm.insertvalue %105, %106[4, 1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)> | |
%108 = llvm.mlir.constant(32 : i64) : i64 | |
%109 = llvm.mlir.constant(256 : i64) : i64 | |
%110 = llvm.insertvalue %108, %107[3, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)> | |
%111 = llvm.insertvalue %109, %110[4, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)> | |
%112 = llvm.mlir.undef : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)> | |
%113 = llvm.insertvalue %10, %112[0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)> | |
%114 = llvm.insertvalue %10, %113[1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)> | |
%115 = llvm.mul %64, %18 : i64 | |
%116 = llvm.add %14, %115 : i64 | |
%117 = llvm.mul %66, %22 : i64 | |
%118 = llvm.add %116, %117 : i64 | |
%119 = llvm.mlir.constant(0 : i64) : i64 | |
%120 = llvm.mul %119, %26 : i64 | |
%121 = llvm.add %118, %120 : i64 | |
%122 = llvm.insertvalue %121, %114[2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)> | |
%123 = llvm.mlir.constant(128 : i64) : i64 | |
%124 = llvm.mlir.constant(1 : i64) : i64 | |
%125 = llvm.insertvalue %123, %122[3, 2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)> | |
%126 = llvm.insertvalue %124, %125[4, 2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)> | |
%127 = llvm.mlir.constant(32 : i64) : i64 | |
%128 = llvm.mlir.constant(128 : i64) : i64 | |
%129 = llvm.insertvalue %127, %126[3, 1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)> | |
%130 = llvm.insertvalue %128, %129[4, 1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)> | |
%131 = llvm.mlir.constant(32 : i64) : i64 | |
%132 = llvm.mlir.constant(32768 : i64) : i64 | |
%133 = llvm.insertvalue %131, %130[3, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)> | |
%134 = llvm.insertvalue %132, %133[4, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)> | |
llvm.br ^bb1(%1 : i64) | |
^bb1(%135: i64): // 2 preds: ^bb0, ^bb13 | |
%136 = llvm.icmp "slt" %135, %5 : i64 | |
llvm.cond_br %136, ^bb2(%1 : i64), ^bb14 | |
^bb2(%137: i64): // 2 preds: ^bb1, ^bb12 | |
%138 = llvm.icmp "slt" %137, %5 : i64 | |
llvm.cond_br %138, ^bb3, ^bb13 | |
^bb3: // pred: ^bb2 | |
%139 = llvm.mlir.undef : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)> | |
%140 = llvm.insertvalue %10, %139[0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)> | |
%141 = llvm.insertvalue %10, %140[1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)> | |
%142 = llvm.mul %135, %132 : i64 | |
%143 = llvm.add %121, %142 : i64 | |
%144 = llvm.mul %137, %128 : i64 | |
%145 = llvm.add %143, %144 : i64 | |
%146 = llvm.mlir.constant(0 : i64) : i64 | |
%147 = llvm.mul %146, %124 : i64 | |
%148 = llvm.add %145, %147 : i64 | |
%149 = llvm.insertvalue %148, %141[2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)> | |
%150 = llvm.mlir.constant(128 : i64) : i64 | |
%151 = llvm.mlir.constant(1 : i64) : i64 | |
%152 = llvm.insertvalue %150, %149[3, 1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)> | |
%153 = llvm.insertvalue %151, %152[4, 1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)> | |
%154 = llvm.mlir.constant(4 : i64) : i64 | |
%155 = llvm.mlir.constant(128 : i64) : i64 | |
%156 = llvm.insertvalue %154, %153[3, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)> | |
%157 = llvm.insertvalue %155, %156[4, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)> | |
%158 = llvm.mlir.undef : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)> | |
%159 = llvm.insertvalue %10, %158[0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)> | |
%160 = llvm.insertvalue %10, %159[1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)> | |
%161 = llvm.insertvalue %148, %160[2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)> | |
%162 = llvm.mlir.constant(1 : index) : i64 | |
%163 = llvm.mlir.constant(4 : index) : i64 | |
%164 = llvm.mlir.constant(128 : index) : i64 | |
%165 = llvm.insertvalue %162, %161[3, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)> | |
%166 = llvm.insertvalue %163, %165[3, 1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)> | |
%167 = llvm.insertvalue %164, %166[3, 2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)> | |
%168 = llvm.mlir.constant(512 : index) : i64 | |
%169 = llvm.insertvalue %168, %167[4, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)> | |
%170 = llvm.mlir.constant(128 : index) : i64 | |
%171 = llvm.insertvalue %170, %169[4, 1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)> | |
%172 = llvm.mlir.constant(1 : index) : i64 | |
%173 = llvm.insertvalue %172, %171[4, 2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)> | |
%174 = llvm.mlir.undef : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<1 x i64>, array<1 x i64>)> | |
%175 = llvm.insertvalue %38, %174[0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<1 x i64>, array<1 x i64>)> | |
%176 = llvm.insertvalue %38, %175[1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<1 x i64>, array<1 x i64>)> | |
%177 = llvm.mul %135, %109 : i64 | |
%178 = llvm.add %102, %177 : i64 | |
%179 = llvm.mul %137, %105 : i64 | |
%180 = llvm.add %178, %179 : i64 | |
%181 = llvm.insertvalue %180, %176[2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<1 x i64>, array<1 x i64>)> | |
%182 = llvm.mlir.constant(4 : i64) : i64 | |
%183 = llvm.mlir.constant(1 : i64) : i64 | |
%184 = llvm.insertvalue %182, %181[3, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<1 x i64>, array<1 x i64>)> | |
%185 = llvm.insertvalue %183, %184[4, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<1 x i64>, array<1 x i64>)> | |
%186 = llvm.mlir.undef : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)> | |
%187 = llvm.insertvalue %38, %186[0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)> | |
%188 = llvm.insertvalue %38, %187[1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)> | |
%189 = llvm.insertvalue %180, %188[2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)> | |
%190 = llvm.mlir.constant(1 : index) : i64 | |
%191 = llvm.mlir.constant(4 : index) : i64 | |
%192 = llvm.insertvalue %190, %189[3, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)> | |
%193 = llvm.insertvalue %191, %192[3, 1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)> | |
%194 = llvm.mlir.constant(4 : index) : i64 | |
%195 = llvm.insertvalue %194, %193[4, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)> | |
%196 = llvm.mlir.constant(1 : index) : i64 | |
%197 = llvm.insertvalue %196, %195[4, 1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)> | |
llvm.br ^bb4(%1 : i64) | |
^bb4(%198: i64): // 2 preds: ^bb3, ^bb11 | |
%199 = llvm.icmp "slt" %198, %3 : i64 | |
llvm.cond_br %199, ^bb5, ^bb12 | |
^bb5: // pred: ^bb4 | |
%200 = llvm.mlir.undef : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<1 x i64>, array<1 x i64>)> | |
%201 = llvm.insertvalue %10, %200[0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<1 x i64>, array<1 x i64>)> | |
%202 = llvm.insertvalue %10, %201[1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<1 x i64>, array<1 x i64>)> | |
%203 = llvm.mlir.constant(0 : i64) : i64 | |
%204 = llvm.mul %203, %168 : i64 | |
%205 = llvm.add %148, %204 : i64 | |
%206 = llvm.mul %198, %170 : i64 | |
%207 = llvm.add %205, %206 : i64 | |
%208 = llvm.mlir.constant(0 : i64) : i64 | |
%209 = llvm.mul %208, %172 : i64 | |
%210 = llvm.add %207, %209 : i64 | |
%211 = llvm.insertvalue %210, %202[2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<1 x i64>, array<1 x i64>)> | |
%212 = llvm.mlir.constant(128 : i64) : i64 | |
%213 = llvm.mlir.constant(1 : i64) : i64 | |
%214 = llvm.insertvalue %212, %211[3, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<1 x i64>, array<1 x i64>)> | |
%215 = llvm.insertvalue %213, %214[4, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<1 x i64>, array<1 x i64>)> | |
%216 = llvm.mlir.undef : !llvm.struct<(ptr<i32>, ptr<i32>, i64)> | |
%217 = llvm.insertvalue %38, %216[0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64)> | |
%218 = llvm.insertvalue %38, %217[1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64)> | |
%219 = llvm.mlir.constant(0 : i64) : i64 | |
%220 = llvm.mul %219, %194 : i64 | |
%221 = llvm.add %180, %220 : i64 | |
%222 = llvm.mul %198, %196 : i64 | |
%223 = llvm.add %221, %222 : i64 | |
%224 = llvm.insertvalue %223, %218[2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64)> | |
%225 = llvm.mlir.undef : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<4 x i64>, array<4 x i64>)> | |
%226 = llvm.insertvalue %10, %225[0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<4 x i64>, array<4 x i64>)> | |
%227 = llvm.insertvalue %10, %226[1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<4 x i64>, array<4 x i64>)> | |
%228 = llvm.insertvalue %210, %227[2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<4 x i64>, array<4 x i64>)> | |
%229 = llvm.mlir.constant(1 : index) : i64 | |
%230 = llvm.mlir.constant(1 : index) : i64 | |
%231 = llvm.mlir.constant(32 : index) : i64 | |
%232 = llvm.mlir.constant(4 : index) : i64 | |
%233 = llvm.insertvalue %229, %228[3, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<4 x i64>, array<4 x i64>)> | |
%234 = llvm.insertvalue %230, %233[3, 1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<4 x i64>, array<4 x i64>)> | |
%235 = llvm.insertvalue %231, %234[3, 2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<4 x i64>, array<4 x i64>)> | |
%236 = llvm.insertvalue %232, %235[3, 3] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<4 x i64>, array<4 x i64>)> | |
%237 = llvm.mlir.constant(128 : index) : i64 | |
%238 = llvm.insertvalue %237, %236[4, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<4 x i64>, array<4 x i64>)> | |
%239 = llvm.mlir.constant(128 : index) : i64 | |
%240 = llvm.insertvalue %239, %238[4, 1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<4 x i64>, array<4 x i64>)> | |
%241 = llvm.mlir.constant(4 : index) : i64 | |
%242 = llvm.insertvalue %241, %240[4, 2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<4 x i64>, array<4 x i64>)> | |
%243 = llvm.mlir.constant(1 : index) : i64 | |
%244 = llvm.insertvalue %243, %242[4, 3] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<4 x i64>, array<4 x i64>)> | |
%245 = llvm.mlir.constant(1 : index) : i64 | |
%246 = llvm.mlir.constant(1 : index) : i64 | |
%247 = llvm.mlir.constant(4 : index) : i64 | |
%248 = llvm.mlir.constant(1 : index) : i64 | |
%249 = llvm.mlir.constant(4 : index) : i64 | |
%250 = llvm.mlir.constant(4 : index) : i64 | |
%251 = llvm.mlir.null : !llvm.ptr<i32> | |
%252 = llvm.getelementptr %251[4] : (!llvm.ptr<i32>) -> !llvm.ptr<i32> | |
%253 = llvm.ptrtoint %252 : !llvm.ptr<i32> to i64 | |
%254 = llvm.alloca %253 x i32 {alignment = 128 : i64} : (i64) -> !llvm.ptr<i32> | |
%255 = llvm.mlir.undef : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)> | |
%256 = llvm.insertvalue %254, %255[0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)> | |
%257 = llvm.insertvalue %254, %256[1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)> | |
%258 = llvm.mlir.constant(0 : index) : i64 | |
%259 = llvm.insertvalue %258, %257[2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)> | |
%260 = llvm.insertvalue %245, %259[3, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)> | |
%261 = llvm.insertvalue %246, %260[3, 1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)> | |
%262 = llvm.insertvalue %247, %261[3, 2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)> | |
%263 = llvm.insertvalue %249, %262[4, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)> | |
%264 = llvm.insertvalue %247, %263[4, 1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)> | |
%265 = llvm.insertvalue %248, %264[4, 2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)> | |
llvm.br ^bb6(%1 : i64) | |
^bb6(%266: i64): // 2 preds: ^bb5, ^bb7 | |
%267 = llvm.icmp "slt" %266, %3 : i64 | |
llvm.cond_br %267, ^bb7, ^bb8 | |
^bb7: // pred: ^bb6 | |
%268 = llvm.mlir.constant(4 : index) : i64 | |
%269 = llvm.mul %1, %268 : i64 | |
%270 = llvm.mlir.constant(4 : index) : i64 | |
%271 = llvm.mul %1, %270 : i64 | |
%272 = llvm.add %269, %271 : i64 | |
%273 = llvm.add %272, %266 : i64 | |
%274 = llvm.getelementptr %76[%273] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32> | |
%275 = llvm.load %274 : !llvm.ptr<i32> | |
%276 = llvm.mlir.constant(4 : index) : i64 | |
%277 = llvm.mul %1, %276 : i64 | |
%278 = llvm.mlir.constant(4 : index) : i64 | |
%279 = llvm.mul %1, %278 : i64 | |
%280 = llvm.add %277, %279 : i64 | |
%281 = llvm.add %280, %266 : i64 | |
%282 = llvm.getelementptr %254[%281] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32> | |
llvm.store %275, %282 : !llvm.ptr<i32> | |
%283 = llvm.add %266, %4 : i64 | |
llvm.br ^bb6(%283 : i64) | |
^bb8: // pred: ^bb6 | |
%284 = llvm.mlir.undef : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<1 x i64>, array<1 x i64>)> | |
%285 = llvm.insertvalue %254, %284[0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<1 x i64>, array<1 x i64>)> | |
%286 = llvm.insertvalue %254, %285[1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<1 x i64>, array<1 x i64>)> | |
%287 = llvm.insertvalue %258, %286[2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<1 x i64>, array<1 x i64>)> | |
%288 = llvm.mlir.constant(4 : index) : i64 | |
%289 = llvm.insertvalue %288, %287[3, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<1 x i64>, array<1 x i64>)> | |
%290 = llvm.mlir.constant(1 : index) : i64 | |
%291 = llvm.insertvalue %290, %289[4, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<1 x i64>, array<1 x i64>)> | |
%292 = llvm.bitcast %254 : !llvm.ptr<i32> to !llvm.ptr<vector<4xi32>> | |
%293 = llvm.load %292 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>> | |
llvm.br ^bb9(%1, %293 : i64, vector<4xi32>) | |
^bb9(%294: i64, %295: vector<4xi32>): // 2 preds: ^bb8, ^bb10 | |
%296 = llvm.icmp "slt" %294, %5 : i64 | |
llvm.cond_br %296, ^bb10, ^bb11 | |
^bb10: // pred: ^bb9 | |
%297 = llvm.mlir.constant(128 : index) : i64 | |
%298 = llvm.mul %1, %297 : i64 | |
%299 = llvm.add %210, %298 : i64 | |
%300 = llvm.mlir.constant(128 : index) : i64 | |
%301 = llvm.mul %1, %300 : i64 | |
%302 = llvm.add %299, %301 : i64 | |
%303 = llvm.mlir.constant(4 : index) : i64 | |
%304 = llvm.mul %294, %303 : i64 | |
%305 = llvm.add %302, %304 : i64 | |
%306 = llvm.add %305, %1 : i64 | |
%307 = llvm.getelementptr %10[%306] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32> | |
%308 = llvm.bitcast %307 : !llvm.ptr<i32> to !llvm.ptr<vector<4xi32>> | |
%309 = llvm.load %308 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>> | |
%310 = llvm.add %309, %295 : vector<4xi32> | |
%311 = llvm.add %294, %4 : i64 | |
llvm.br ^bb9(%311, %310 : i64, vector<4xi32>) | |
^bb11: // pred: ^bb9 | |
%312 = llvm.bitcast %254 : !llvm.ptr<i32> to !llvm.ptr<vector<4xi32>> | |
llvm.store %295, %312 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>> | |
%313 = llvm.bitcast %254 : !llvm.ptr<i32> to !llvm.ptr<vector<4xi32>> | |
%314 = llvm.load %313 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>> | |
%315 = llvm.getelementptr %38[%223] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32> | |
%316 = llvm.load %315 : !llvm.ptr<i32> | |
%317 = llvm.mlir.undef : vector<1xi32> | |
%318 = llvm.mlir.constant(0 : i32) : i32 | |
%319 = llvm.insertelement %316, %317[%318 : i32] : vector<1xi32> | |
%320 = llvm.mlir.constant(0 : index) : i64 | |
%321 = llvm.extractelement %319[%320 : i64] : vector<1xi32> | |
%322 = "llvm.intr.vector.reduce.add"(%314) : (vector<4xi32>) -> i32 | |
%323 = llvm.add %321, %322 : i32 | |
%324 = llvm.insertelement %323, %2[%1 : i64] : vector<1xi32> | |
%325 = llvm.mlir.constant(0 : i64) : i64 | |
%326 = llvm.extractelement %324[%325 : i64] : vector<1xi32> | |
%327 = llvm.mlir.undef : vector<1xi32> | |
%328 = llvm.mlir.constant(0 : i32) : i32 | |
%329 = llvm.insertelement %326, %327[%328 : i32] : vector<1xi32> | |
%330 = llvm.mlir.constant(0 : index) : i64 | |
%331 = llvm.extractelement %329[%330 : i64] : vector<1xi32> | |
%332 = llvm.getelementptr %38[%223] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32> | |
llvm.store %331, %332 : !llvm.ptr<i32> | |
%333 = llvm.add %198, %4 : i64 | |
llvm.br ^bb4(%333 : i64) | |
^bb12: // pred: ^bb4 | |
%334 = llvm.add %137, %3 : i64 | |
llvm.br ^bb2(%334 : i64) | |
^bb13: // pred: ^bb2 | |
%335 = llvm.add %135, %4 : i64 | |
llvm.br ^bb1(%335 : i64) | |
^bb14: // pred: ^bb1 | |
%336 = llvm.mlir.constant(0 : i32) : i32 | |
llvm.return %336 : i32 | |
} | |
} | |
} | |
} | |
// Host-side body of the test: builds an input resource filled with 1s, runs
// the reduction dispatch into one half of an output resource, fills the other
// half with the constant 128, and checks that both halves compare equal.
func.func private @_split_reduction_pass2() { | |
%c1048576 = arith.constant 1048576 : index | |
%c0 = arith.constant 0 : index | |
%c524288 = arith.constant 524288 : index | |
%c67108864 = arith.constant 67108864 : index | |
%c1_i32 = arith.constant 1 : i32 | |
%c128_i32 = arith.constant 128 : i32 | |
%c512 = arith.constant 512 : index | |
%c256 = arith.constant 256 : index | |
%c1 = arith.constant 1 : index | |
// Input: a transient resource of size 67108864, entirely filled with i32 1.
// (Sizes look like byte counts: the 512x256xi32 exports below use a
// 524288-sized resource, i.e. 4 per element - TODO confirm.)
%0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c67108864} | |
%1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c67108864}) { | |
stream.cmd.fill %c1_i32, %arg0[%c0 for %c67108864] : i32 -> !stream.resource<transient>{%c67108864} | |
} => !stream.timepoint | |
%2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c67108864} | |
// The filled input is passed through util.do_not_optimize, so its size is
// re-queried as an SSA value (%4) instead of reusing the constant.
%3 = util.do_not_optimize(%2) : !stream.resource<transient> | |
%4 = stream.resource.size %3 : !stream.resource<transient> | |
// Output: one external resource of size 1048576, later split into two halves
// of 524288 each (dispatch result and expected value).
// NOTE(review): the resource is allocated `uninitialized` and only the upper
// half is filled in this function - confirm the dispatch does not depend on
// the prior contents of the lower half.
%5 = stream.resource.alloc uninitialized : !stream.resource<external>{%c1048576} | |
%6 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%4}, %5 as %arg1: !stream.resource<external>{%c1048576}) { | |
stream.cmd.concurrent { | |
// Dispatch reads all of %arg0 and is bound write-only to %arg1[0, 1048576).
// NOTE(review): that write binding range overlaps the [524288, 1048576)
// range filled below inside the same stream.cmd.concurrent region -
// confirm the dispatch only writes within the first 524288.
stream.cmd.dispatch @_split_reduction_pass2_dispatch_0::@_split_reduction_pass2_dispatch_0_generic_512x256x128[%c512, %c256, %c1] { | |
ro %arg0[%c0 for %4] : !stream.resource<transient>{%4}, | |
wo %arg1[%c0 for %c1048576] : !stream.resource<external>{%c1048576} | |
} attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>]} | |
// Expected value: fill the upper half of the output resource with i32 128.
stream.cmd.fill %c128_i32, %arg1[%c524288 for %c524288] : i32 -> !stream.resource<external>{%c1048576} | |
} | |
} => !stream.timepoint | |
%7 = stream.timepoint.await %6 => %5 : !stream.resource<external>{%c1048576} | |
// Split the output resource at offsets 0 and 524288, export both halves as
// tensor<512x256xi32>, and compare them.
%8 = stream.resource.subview %7[%c0] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288} | |
%9 = stream.resource.subview %7[%c524288] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288} | |
%10 = stream.tensor.export %8 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32> | |
%11 = stream.tensor.export %9 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32> | |
check.expect_eq(%10, %11) : tensor<512x256xi32> | |
return | |
} | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) ('builtin.module' operation) //----- // | |
// Attribute aliases referenced by the module below.
// Device target "llvm-cpu" carrying a single executable target (embedded ELF, x86_64).
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}> | |
// The same executable target on its own; used as the `target` of the executable variant.
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
// Pipeline layout: no push constants; set 0 has binding 0 (storage buffer, ReadOnly) and binding 1 (storage buffer).
#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]> | |
// Translation info attached to the exported entry point.
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert> | |
module attributes {hal.device.targets = [#device_target_llvm_cpu]} { | |
// Public entry point marked with the iree.abi.stub attribute; it takes no
// arguments and only forwards to the private @_split_reduction_pass2.
func.func @split_reduction_pass2() attributes {iree.abi.stub} { | |
call @_split_reduction_pass2() : () -> () | |
return | |
} | |
hal.executable private @_split_reduction_pass2_dispatch_0 { | |
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ { | |
// Export entry for the dispatch function (ordinal 0, #pipeline_layout,
// #translation). The region ignores its device and three index arguments and
// always returns the constants (8, 16, 1).
// NOTE(review): presumably these are the workgroup counts along x, y, z for the
// [512, 256, 1] workload used at the dispatch site - confirm.
hal.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 ordinal(0) layout(#pipeline_layout) attributes {translation_info = #translation} { | |
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index): | |
%c1 = arith.constant 1 : index | |
%c16 = arith.constant 16 : index | |
%c8 = arith.constant 8 : index | |
hal.return %c8, %c16, %c1 : index, index, index | |
} | |
builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} { | |
// Dispatch body lowered to the LLVM dialect.
// Arguments: %arg0 = executable environment, %arg1 = dispatch state,
// %arg2 = workgroup state (all noalias, 16-byte aligned pointers).
// Reads i32 data through the first binding pointer (%20), accumulates 32
// vector<4xi32> loads per output element, reduces the 4 lanes and adds the sum
// into one i32 reached through the second binding pointer (%28).
// Always returns the i32 constant 0.
llvm.func internal @_split_reduction_pass2_dispatch_0_generic_512x256x128(%arg0: !llvm.ptr<struct<"iree_hal_executable_environment_v0_t", (ptr<i32>, ptr<func<i32 (ptr<func<i32 (ptr<i8>)>>, ptr<i8>)>>, ptr<ptr<func<i32 (ptr<i8>)>>>, struct<"iree_hal_processor_v0_t", (array<8 x i64>)>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>> {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {sym_visibility = "private"} { | |
%0 = llvm.mlir.constant(0 : i32) : i32 | |
%1 = llvm.mlir.constant(32768 : i64) : i64 | |
%2 = llvm.mlir.constant(128 : i64) : i64 | |
%3 = llvm.mlir.constant(256 : i64) : i64 | |
%4 = llvm.mlir.constant(1 : i64) : i64 | |
%5 = llvm.mlir.constant(63 : index) : i64 | |
%6 = llvm.mlir.constant(128 : index) : i64 | |
%7 = llvm.mlir.constant(256 : index) : i64 | |
%8 = llvm.mlir.constant(32768 : index) : i64 | |
%9 = llvm.mlir.constant(512 : index) : i64 | |
%10 = llvm.mlir.constant(0 : i64) : i64 | |
%11 = llvm.mlir.constant(dense<0> : vector<4xi32>) : vector<4xi32> | |
%12 = llvm.mlir.constant(0 : index) : i64 | |
%13 = llvm.mlir.constant(dense<0> : vector<1xi32>) : vector<1xi32> | |
%14 = llvm.mlir.constant(4 : index) : i64 | |
%15 = llvm.mlir.constant(1 : index) : i64 | |
%16 = llvm.mlir.constant(32 : index) : i64 | |
// Binding 0: field 10 of the dispatch state is a pointer table; entry 0 is
// cast to an i32 pointer (%20) and assumed to have its low 6 bits clear
// (i.e. 64-byte aligned).
%17 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> | |
%18 = llvm.extractvalue %17[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)> | |
%19 = llvm.load %18 : !llvm.ptr<ptr<i8>> | |
%20 = llvm.bitcast %19 : !llvm.ptr<i8> to !llvm.ptr<i32> | |
%21 = llvm.ptrtoint %20 : !llvm.ptr<i32> to i64 | |
%22 = llvm.and %21, %5 : i64 | |
%23 = llvm.icmp "eq" %22, %12 : i64 | |
"llvm.intr.assume"(%23) : (i1) -> () | |
// Binding 1: entry 1 of the same pointer table, cast to an i32 pointer (%28)
// with the same alignment assumption.
%24 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> | |
%25 = llvm.extractvalue %24[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)> | |
%26 = llvm.getelementptr %25[1] : (!llvm.ptr<ptr<i8>>) -> !llvm.ptr<ptr<i8>> | |
%27 = llvm.load %26 : !llvm.ptr<ptr<i8>> | |
%28 = llvm.bitcast %27 : !llvm.ptr<i8> to !llvm.ptr<i32> | |
%29 = llvm.ptrtoint %28 : !llvm.ptr<i32> to i64 | |
%30 = llvm.and %29, %5 : i64 | |
%31 = llvm.icmp "eq" %30, %12 : i64 | |
"llvm.intr.assume"(%31) : (i1) -> () | |
// Workgroup state fields 0 and 1, zero-extended to i64 (%34, %37) and each
// scaled by 32 (%39, %38) to form this workgroup's tile origin.
// NOTE(review): presumably fields 0/1 are the workgroup x/y ids - confirm.
%32 = llvm.load %arg2 : !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>> | |
%33 = llvm.extractvalue %32[0] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)> | |
%34 = llvm.zext %33 : i32 to i64 | |
%35 = llvm.load %arg2 : !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>> | |
%36 = llvm.extractvalue %35[1] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)> | |
%37 = llvm.zext %36 : i32 to i64 | |
%38 = llvm.mul %37, %16 : i64 | |
%39 = llvm.mul %34, %16 : i64 | |
// Stack buffer %43, zero-initialized below with a vector<4xi32> store.
// NOTE(review): the alloca count %42 is the result of the null-GEP[4] +
// ptrtoint idiom on an i32 pointer, which looks like a byte size rather than
// an element count, so more than the 4 used i32 slots may be reserved - confirm.
%40 = llvm.mlir.null : !llvm.ptr<i32> | |
%41 = llvm.getelementptr %40[4] : (!llvm.ptr<i32>) -> !llvm.ptr<i32> | |
%42 = llvm.ptrtoint %41 : !llvm.ptr<i32> to i64 | |
%43 = llvm.alloca %42 x i32 {alignment = 128 : i64} : (i64) -> !llvm.ptr<i32> | |
%44 = llvm.mul %12, %14 : i64 | |
%45 = llvm.mul %12, %14 : i64 | |
%46 = llvm.add %44, %45 : i64 | |
%47 = llvm.add %46, %12 : i64 | |
%48 = llvm.getelementptr %43[%47] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32> | |
%49 = llvm.bitcast %48 : !llvm.ptr<i32> to !llvm.ptr<vector<4xi32>> | |
llvm.store %11, %49 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>> | |
// %53: tile base offset used for binding 1 = %38 * 256 + %39.
%50 = llvm.mul %38, %7 : i64 | |
%51 = llvm.add %50, %12 : i64 | |
%52 = llvm.mul %39, %15 : i64 | |
%53 = llvm.add %51, %52 : i64 | |
// %59: tile base offset used for binding 0 = %38 * 32768 + %39 * 128.
%54 = llvm.mul %38, %8 : i64 | |
%55 = llvm.add %54, %12 : i64 | |
%56 = llvm.mul %39, %6 : i64 | |
%57 = llvm.add %55, %56 : i64 | |
%58 = llvm.mul %10, %15 : i64 | |
%59 = llvm.add %57, %58 : i64 | |
llvm.br ^bb1(%12 : i64) | |
// Loop 1: %60 runs from 0 while < 32, step 1 (incremented in ^bb13).
^bb1(%60: i64): // 2 preds: ^bb0, ^bb13 | |
%61 = llvm.icmp "slt" %60, %16 : i64 | |
llvm.cond_br %61, ^bb2(%12 : i64), ^bb14 | |
// Loop 2: %62 runs from 0 while < 32, step 4 (incremented in ^bb12).
^bb2(%62: i64): // 2 preds: ^bb1, ^bb12 | |
%63 = llvm.icmp "slt" %62, %16 : i64 | |
llvm.cond_br %63, ^bb3, ^bb13 | |
^bb3: // pred: ^bb2 | |
// %69 = %59 + %60 * 32768 + %62 * 128 (binding 0 offset for this step).
%64 = llvm.mul %60, %1 : i64 | |
%65 = llvm.add %59, %64 : i64 | |
%66 = llvm.mul %62, %2 : i64 | |
%67 = llvm.add %65, %66 : i64 | |
%68 = llvm.mul %10, %4 : i64 | |
%69 = llvm.add %67, %68 : i64 | |
// %73 = %53 + %60 * 256 + %62 (binding 1 offset for this step).
%70 = llvm.mul %60, %3 : i64 | |
%71 = llvm.add %53, %70 : i64 | |
%72 = llvm.mul %62, %4 : i64 | |
%73 = llvm.add %71, %72 : i64 | |
llvm.br ^bb4(%12 : i64) | |
// Loop 3: %74 runs from 0 while < 4, step 1 (incremented in ^bb11).
^bb4(%74: i64): // 2 preds: ^bb3, ^bb11 | |
%75 = llvm.icmp "slt" %74, %14 : i64 | |
llvm.cond_br %75, ^bb5, ^bb12 | |
^bb5: // pred: ^bb4 | |
// %81 = %69 + %74 * 128: start of the data read through binding 0.
%76 = llvm.mul %10, %9 : i64 | |
%77 = llvm.add %69, %76 : i64 | |
%78 = llvm.mul %74, %6 : i64 | |
%79 = llvm.add %77, %78 : i64 | |
%80 = llvm.mul %10, %15 : i64 | |
%81 = llvm.add %79, %80 : i64 | |
// %85 = %73 + %74: index of the single i32 updated through binding 1.
%82 = llvm.mul %10, %14 : i64 | |
%83 = llvm.add %73, %82 : i64 | |
%84 = llvm.mul %74, %15 : i64 | |
%85 = llvm.add %83, %84 : i64 | |
// Scratch buffer %89 for the vector accumulator.
// NOTE(review): this alloca executes on every iteration of the three
// enclosing loops and no stack save/restore is visible in this function -
// confirm stack growth is acceptable or that a later pass hoists it.
%86 = llvm.mlir.null : !llvm.ptr<i32> | |
%87 = llvm.getelementptr %86[4] : (!llvm.ptr<i32>) -> !llvm.ptr<i32> | |
%88 = llvm.ptrtoint %87 : !llvm.ptr<i32> to i64 | |
%89 = llvm.alloca %88 x i32 {alignment = 128 : i64} : (i64) -> !llvm.ptr<i32> | |
llvm.br ^bb6(%12 : i64) | |
// Loop 4: copy 4 i32 values, one at a time, from the zeroed buffer %43 into %89.
^bb6(%90: i64): // 2 preds: ^bb5, ^bb7 | |
%91 = llvm.icmp "slt" %90, %14 : i64 | |
llvm.cond_br %91, ^bb7, ^bb8 | |
^bb7: // pred: ^bb6 | |
%92 = llvm.mul %12, %14 : i64 | |
%93 = llvm.mul %12, %14 : i64 | |
%94 = llvm.add %92, %93 : i64 | |
%95 = llvm.add %94, %90 : i64 | |
%96 = llvm.getelementptr %43[%95] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32> | |
%97 = llvm.load %96 : !llvm.ptr<i32> | |
%98 = llvm.mul %12, %14 : i64 | |
%99 = llvm.mul %12, %14 : i64 | |
%100 = llvm.add %98, %99 : i64 | |
%101 = llvm.add %100, %90 : i64 | |
%102 = llvm.getelementptr %89[%101] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32> | |
llvm.store %97, %102 : !llvm.ptr<i32> | |
%103 = llvm.add %90, %15 : i64 | |
llvm.br ^bb6(%103 : i64) | |
^bb8: // pred: ^bb6 | |
// Load the initial accumulator vector from the scratch buffer.
%104 = llvm.bitcast %89 : !llvm.ptr<i32> to !llvm.ptr<vector<4xi32>> | |
%105 = llvm.load %104 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>> | |
llvm.br ^bb9(%12, %105 : i64, vector<4xi32>) | |
// Loop 5: %106 runs from 0 while < 32, step 1; %107 carries the
// vector<4xi32> accumulator across iterations.
^bb9(%106: i64, %107: vector<4xi32>): // 2 preds: ^bb8, ^bb10 | |
%108 = llvm.icmp "slt" %106, %16 : i64 | |
llvm.cond_br %108, ^bb10, ^bb11 | |
^bb10: // pred: ^bb9 | |
// Load 4 consecutive i32 from binding 0 at %81 + %106 * 4 and add them
// lane-wise into the accumulator.
%109 = llvm.mul %12, %6 : i64 | |
%110 = llvm.add %81, %109 : i64 | |
%111 = llvm.mul %12, %6 : i64 | |
%112 = llvm.add %110, %111 : i64 | |
%113 = llvm.mul %106, %14 : i64 | |
%114 = llvm.add %112, %113 : i64 | |
%115 = llvm.add %114, %12 : i64 | |
%116 = llvm.getelementptr %20[%115] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32> | |
%117 = llvm.bitcast %116 : !llvm.ptr<i32> to !llvm.ptr<vector<4xi32>> | |
%118 = llvm.load %117 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>> | |
%119 = llvm.add %118, %107 : vector<4xi32> | |
%120 = llvm.add %106, %15 : i64 | |
llvm.br ^bb9(%120, %119 : i64, vector<4xi32>) | |
^bb11: // pred: ^bb9 | |
// Spill the final accumulator to the scratch buffer and reload it.
%121 = llvm.bitcast %89 : !llvm.ptr<i32> to !llvm.ptr<vector<4xi32>> | |
llvm.store %107, %121 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>> | |
%122 = llvm.bitcast %89 : !llvm.ptr<i32> to !llvm.ptr<vector<4xi32>> | |
%123 = llvm.load %122 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>> | |
// Read the current i32 at binding 1 index %85 (round-tripped through a
// vector<1xi32>), add the horizontal sum of the 4 lanes, and store the
// result back to the same location.
// NOTE(review): the existing value at %28[%85] is read before it is
// written; whether that buffer is initialized beforehand is not visible in
// this function - confirm against the caller.
%124 = llvm.getelementptr %28[%85] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32> | |
%125 = llvm.load %124 : !llvm.ptr<i32> | |
%126 = llvm.mlir.undef : vector<1xi32> | |
%127 = llvm.insertelement %125, %126[%0 : i32] : vector<1xi32> | |
%128 = llvm.extractelement %127[%12 : i64] : vector<1xi32> | |
%129 = "llvm.intr.vector.reduce.add"(%123) : (vector<4xi32>) -> i32 | |
%130 = llvm.add %128, %129 : i32 | |
%131 = llvm.insertelement %130, %13[%12 : i64] : vector<1xi32> | |
%132 = llvm.extractelement %131[%10 : i64] : vector<1xi32> | |
%133 = llvm.mlir.undef : vector<1xi32> | |
%134 = llvm.insertelement %132, %133[%0 : i32] : vector<1xi32> | |
%135 = llvm.extractelement %134[%12 : i64] : vector<1xi32> | |
%136 = llvm.getelementptr %28[%85] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32> | |
llvm.store %135, %136 : !llvm.ptr<i32> | |
%137 = llvm.add %74, %15 : i64 | |
llvm.br ^bb4(%137 : i64) | |
// Latch of loop 2: advance %62 by 4.
^bb12: // pred: ^bb4 | |
%138 = llvm.add %62, %14 : i64 | |
llvm.br ^bb2(%138 : i64) | |
// Latch of loop 1: advance %60 by 1.
^bb13: // pred: ^bb2 | |
%139 = llvm.add %60, %15 : i64 | |
llvm.br ^bb1(%139 : i64) | |
// Exit: return the i32 constant 0.
^bb14: // pred: ^bb1 | |
llvm.return %0 : i32 | |
} | |
} | |
} | |
} | |
// Host-side body of the test: builds an input resource filled with 1s, runs
// the reduction dispatch into one half of an output resource, fills the other
// half with the constant 128, and checks that both halves compare equal.
func.func private @_split_reduction_pass2() { | |
%c1048576 = arith.constant 1048576 : index | |
%c0 = arith.constant 0 : index | |
%c524288 = arith.constant 524288 : index | |
%c67108864 = arith.constant 67108864 : index | |
%c1_i32 = arith.constant 1 : i32 | |
%c128_i32 = arith.constant 128 : i32 | |
%c512 = arith.constant 512 : index | |
%c256 = arith.constant 256 : index | |
%c1 = arith.constant 1 : index | |
// Input: a transient resource of size 67108864, entirely filled with i32 1.
// (Sizes look like byte counts: the 512x256xi32 exports below use a
// 524288-sized resource, i.e. 4 per element - TODO confirm.)
%0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c67108864} | |
%1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c67108864}) { | |
stream.cmd.fill %c1_i32, %arg0[%c0 for %c67108864] : i32 -> !stream.resource<transient>{%c67108864} | |
} => !stream.timepoint | |
%2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c67108864} | |
// The filled input is passed through util.do_not_optimize, so its size is
// re-queried as an SSA value (%4) instead of reusing the constant.
%3 = util.do_not_optimize(%2) : !stream.resource<transient> | |
%4 = stream.resource.size %3 : !stream.resource<transient> | |
// Output: one external resource of size 1048576, later split into two halves
// of 524288 each (dispatch result and expected value).
// NOTE(review): the resource is allocated `uninitialized` and only the upper
// half is filled in this function - confirm the dispatch does not depend on
// the prior contents of the lower half.
%5 = stream.resource.alloc uninitialized : !stream.resource<external>{%c1048576} | |
%6 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%4}, %5 as %arg1: !stream.resource<external>{%c1048576}) { | |
stream.cmd.concurrent { | |
// Dispatch reads all of %arg0 and is bound write-only to %arg1[0, 1048576).
// NOTE(review): that write binding range overlaps the [524288, 1048576)
// range filled below inside the same stream.cmd.concurrent region -
// confirm the dispatch only writes within the first 524288.
stream.cmd.dispatch @_split_reduction_pass2_dispatch_0::@_split_reduction_pass2_dispatch_0_generic_512x256x128[%c512, %c256, %c1] { | |
ro %arg0[%c0 for %4] : !stream.resource<transient>{%4}, | |
wo %arg1[%c0 for %c1048576] : !stream.resource<external>{%c1048576} | |
} attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>]} | |
// Expected value: fill the upper half of the output resource with i32 128.
stream.cmd.fill %c128_i32, %arg1[%c524288 for %c524288] : i32 -> !stream.resource<external>{%c1048576} | |
} | |
} => !stream.timepoint | |
%7 = stream.timepoint.await %6 => %5 : !stream.resource<external>{%c1048576} | |
// Split the output resource at offsets 0 and 524288, export both halves as
// tensor<512x256xi32>, and compare them.
%8 = stream.resource.subview %7[%c0] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288} | |
%9 = stream.resource.subview %7[%c524288] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288} | |
%10 = stream.tensor.export %8 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32> | |
%11 = stream.tensor.export %9 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32> | |
check.expect_eq(%10, %11) : tensor<512x256xi32> | |
return | |
} | |
} | |
// -----// IR Dump After CSE (cse) ('builtin.module' operation) //----- // | |
// Attribute aliases referenced by the module below.
// Device target "llvm-cpu" carrying a single executable target (embedded ELF, x86_64).
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}> | |
// The same executable target on its own; used as the `target` of the executable variant.
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
// Pipeline layout: no push constants; set 0 has binding 0 (storage buffer, ReadOnly) and binding 1 (storage buffer).
#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]> | |
// Translation info attached to the exported entry point.
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert> | |
module attributes {hal.device.targets = [#device_target_llvm_cpu]} { | |
// Public entry point, tagged `iree.abi.stub`. It takes no arguments, returns nothing, and
// does nothing except call the private @_split_reduction_pass2.
func.func @split_reduction_pass2() attributes {iree.abi.stub} { | |
call @_split_reduction_pass2() : () -> () | |
return | |
} | |
hal.executable private @_split_reduction_pass2_dispatch_0 { | |
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ { | |
// Export record (ordinal 0) for the dispatch function, bound to #pipeline_layout and #translation.
// The region takes a device and three index arguments, uses none of them, and returns the
// constant triple (8, 16, 1).
// NOTE(review): presumably this triple is the workgroup count in x/y/z — confirm against the HAL dialect docs.
hal.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 ordinal(0) layout(#pipeline_layout) attributes {translation_info = #translation} { | |
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index): | |
%c1 = arith.constant 1 : index | |
%c16 = arith.constant 16 : index | |
%c8 = arith.constant 8 : index | |
// Returned in the order 8, 16, 1 (not the order the constants were defined in).
hal.return %c8, %c16, %c1 : index, index, index | |
} | |
builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} { | |
// Dispatch body in the LLVM dialect. It receives three `noalias`, 16-byte-aligned pointers
// (executable environment, dispatch state, workgroup state) and always returns the i32
// constant 0 (%0). %arg0 is never used in the body.
// Shape of the computation, as established by the loops below:
//   for %58 in [0, 32) step 1, for %60 in [0, 32) step 4, for %72 in [0, 4) step 1:
//     acc = zero vector<4xi32>
//     for %93 in [0, 32) step 1: acc += 4 consecutive i32 loaded from buffer %20
//     %28[%82] = %28[%82] + reduce.add(acc)
llvm.func internal @_split_reduction_pass2_dispatch_0_generic_512x256x128(%arg0: !llvm.ptr<struct<"iree_hal_executable_environment_v0_t", (ptr<i32>, ptr<func<i32 (ptr<func<i32 (ptr<i8>)>>, ptr<i8>)>>, ptr<ptr<func<i32 (ptr<i8>)>>>, struct<"iree_hal_processor_v0_t", (array<8 x i64>)>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>> {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {sym_visibility = "private"} { | |
// Constants. Several values are materialized twice, once from an `i64` attribute and once
// from an `index` attribute (0: %10/%12, 1: %4/%15, 128: %2/%6, 256: %3/%7, 32768: %1/%8).
%0 = llvm.mlir.constant(0 : i32) : i32 | |
%1 = llvm.mlir.constant(32768 : i64) : i64 | |
%2 = llvm.mlir.constant(128 : i64) : i64 | |
%3 = llvm.mlir.constant(256 : i64) : i64 | |
%4 = llvm.mlir.constant(1 : i64) : i64 | |
%5 = llvm.mlir.constant(63 : index) : i64 | |
%6 = llvm.mlir.constant(128 : index) : i64 | |
%7 = llvm.mlir.constant(256 : index) : i64 | |
%8 = llvm.mlir.constant(32768 : index) : i64 | |
%9 = llvm.mlir.constant(512 : index) : i64 | |
%10 = llvm.mlir.constant(0 : i64) : i64 | |
%11 = llvm.mlir.constant(dense<0> : vector<4xi32>) : vector<4xi32> | |
%12 = llvm.mlir.constant(0 : index) : i64 | |
%13 = llvm.mlir.constant(dense<0> : vector<1xi32>) : vector<1xi32> | |
%14 = llvm.mlir.constant(4 : index) : i64 | |
%15 = llvm.mlir.constant(1 : index) : i64 | |
%16 = llvm.mlir.constant(32 : index) : i64 | |
// Load the dispatch state, take struct field 10 (a ptr<ptr<i8>>), and read its entry 0 as an
// i32 pointer (%20). %20 is only ever loaded from in this function.
// NOTE(review): presumably field 10 is the binding-pointer table — confirm against the
// iree_hal_executable_dispatch_state_v0_t definition.
%17 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> | |
%18 = llvm.extractvalue %17[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)> | |
%19 = llvm.load %18 : !llvm.ptr<ptr<i8>> | |
%20 = llvm.bitcast %19 : !llvm.ptr<i8> to !llvm.ptr<i32> | |
// Tell the optimizer that (%20 & 63) == 0, i.e. the pointer is a multiple of 64.
%21 = llvm.ptrtoint %20 : !llvm.ptr<i32> to i64 | |
%22 = llvm.and %21, %5 : i64 | |
%23 = llvm.icmp "eq" %22, %12 : i64 | |
"llvm.intr.assume"(%23) : (i1) -> () | |
// Same sequence for entry 1 of that table (%28), which is both loaded from and stored to
// below. The dispatch state is loaded a second time here rather than reusing %17.
%24 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> | |
%25 = llvm.extractvalue %24[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)> | |
%26 = llvm.getelementptr %25[1] : (!llvm.ptr<ptr<i8>>) -> !llvm.ptr<ptr<i8>> | |
%27 = llvm.load %26 : !llvm.ptr<ptr<i8>> | |
%28 = llvm.bitcast %27 : !llvm.ptr<i8> to !llvm.ptr<i32> | |
%29 = llvm.ptrtoint %28 : !llvm.ptr<i32> to i64 | |
%30 = llvm.and %29, %5 : i64 | |
%31 = llvm.icmp "eq" %30, %12 : i64 | |
"llvm.intr.assume"(%31) : (i1) -> () | |
// Workgroup state fields 0 and 1, zero-extended to i64 and scaled by 32 (%16):
// %38 comes from field 0, %37 from field 1.
// NOTE(review): presumably these are the workgroup x/y ids and 32 is the per-workgroup tile
// extent — confirm against the workgroup state struct definition.
%32 = llvm.load %arg2 : !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>> | |
%33 = llvm.extractvalue %32[0] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)> | |
%34 = llvm.zext %33 : i32 to i64 | |
%35 = llvm.extractvalue %32[1] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)> | |
%36 = llvm.zext %35 : i32 to i64 | |
%37 = llvm.mul %36, %16 : i64 | |
%38 = llvm.mul %34, %16 : i64 | |
// %41 = ptrtoint(gep null, 4) on an i32 pointer, i.e. the byte offset of 4 i32 elements.
// NOTE(review): %41 is then used as the *element count* of an i32 alloca, which looks like
// more space than the single vector<4xi32> stored below needs — confirm this is intended.
%39 = llvm.mlir.null : !llvm.ptr<i32> | |
%40 = llvm.getelementptr %39[4] : (!llvm.ptr<i32>) -> !llvm.ptr<i32> | |
%41 = llvm.ptrtoint %40 : !llvm.ptr<i32> to i64 | |
%42 = llvm.alloca %41 x i32 {alignment = 128 : i64} : (i64) -> !llvm.ptr<i32> | |
// %43, %44 and %45 are all built from the zero constant %12, so %46 is %42 at offset 0.
// Store the zero vector %11 there.
%43 = llvm.mul %12, %14 : i64 | |
%44 = llvm.add %43, %43 : i64 | |
%45 = llvm.add %44, %12 : i64 | |
%46 = llvm.getelementptr %42[%45] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32> | |
%47 = llvm.bitcast %46 : !llvm.ptr<i32> to !llvm.ptr<vector<4xi32>> | |
llvm.store %11, %47 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>> | |
// Loop-invariant base offsets, counted in i32 elements because they feed GEPs on i32 pointers:
//   %51 = %37*256 + %38          (ends up indexing %28)
//   %57 = %37*32768 + %38*128    (ends up indexing %20)
// The `+ %12` terms and %56 = %10 * %15 contribute zero.
%48 = llvm.mul %37, %7 : i64 | |
%49 = llvm.add %48, %12 : i64 | |
%50 = llvm.mul %38, %15 : i64 | |
%51 = llvm.add %49, %50 : i64 | |
%52 = llvm.mul %37, %8 : i64 | |
%53 = llvm.add %52, %12 : i64 | |
%54 = llvm.mul %38, %6 : i64 | |
%55 = llvm.add %53, %54 : i64 | |
%56 = llvm.mul %10, %15 : i64 | |
%57 = llvm.add %55, %56 : i64 | |
llvm.br ^bb1(%12 : i64) | |
// Loop 1 header: %58 starts at 0, continues while %58 < 32, and is incremented by 1 in ^bb13.
^bb1(%58: i64): // 2 preds: ^bb0, ^bb13 | |
%59 = llvm.icmp "slt" %58, %16 : i64 | |
llvm.cond_br %59, ^bb2(%12 : i64), ^bb14 | |
// Loop 2 header: %60 starts at 0, continues while %60 < 32, and is incremented by 4 in ^bb12.
^bb2(%60: i64): // 2 preds: ^bb1, ^bb12 | |
%61 = llvm.icmp "slt" %60, %16 : i64 | |
llvm.cond_br %61, ^bb3, ^bb13 | |
// Fold both outer induction variables into the two offsets:
//   %67 = %57 + %58*32768 + %60*128   (%66 is zero)
//   %71 = %51 + %58*256 + %60
^bb3: // pred: ^bb2 | |
%62 = llvm.mul %58, %1 : i64 | |
%63 = llvm.add %57, %62 : i64 | |
%64 = llvm.mul %60, %2 : i64 | |
%65 = llvm.add %63, %64 : i64 | |
%66 = llvm.mul %10, %4 : i64 | |
%67 = llvm.add %65, %66 : i64 | |
%68 = llvm.mul %58, %3 : i64 | |
%69 = llvm.add %51, %68 : i64 | |
%70 = llvm.mul %60, %4 : i64 | |
%71 = llvm.add %69, %70 : i64 | |
llvm.br ^bb4(%12 : i64) | |
// Loop 3 header: %72 starts at 0, continues while %72 < 4, and is incremented by 1 in ^bb11.
^bb4(%72: i64): // 2 preds: ^bb3, ^bb11 | |
%73 = llvm.icmp "slt" %72, %14 : i64 | |
llvm.cond_br %73, ^bb5, ^bb12 | |
// %78 = %67 + %72*128 and %82 = %71 + %72; %74, %56 and %79 are all zero.
^bb5: // pred: ^bb4 | |
%74 = llvm.mul %10, %9 : i64 | |
%75 = llvm.add %67, %74 : i64 | |
%76 = llvm.mul %72, %6 : i64 | |
%77 = llvm.add %75, %76 : i64 | |
%78 = llvm.add %77, %56 : i64 | |
%79 = llvm.mul %10, %14 : i64 | |
%80 = llvm.add %71, %79 : i64 | |
%81 = llvm.mul %72, %15 : i64 | |
%82 = llvm.add %80, %81 : i64 | |
// A second scratch buffer is allocated each time this block runs.
// NOTE(review): this alloca sits inside the loop nest rather than in the entry block;
// presumably a later stage hoists or bounds it — confirm stack usage.
%83 = llvm.alloca %41 x i32 {alignment = 128 : i64} : (i64) -> !llvm.ptr<i32> | |
llvm.br ^bb6(%12 : i64) | |
// Loop 4: for %84 in [0, 4) step 1, copy one i32 from %42[%44 + %84] to %83[%44 + %84]
// (%44 is zero), i.e. seed %83 with the zeroed contents of %42.
^bb6(%84: i64): // 2 preds: ^bb5, ^bb7 | |
%85 = llvm.icmp "slt" %84, %14 : i64 | |
llvm.cond_br %85, ^bb7, ^bb8 | |
^bb7: // pred: ^bb6 | |
%86 = llvm.add %44, %84 : i64 | |
%87 = llvm.getelementptr %42[%86] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32> | |
%88 = llvm.load %87 : !llvm.ptr<i32> | |
%89 = llvm.getelementptr %83[%86] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32> | |
llvm.store %88, %89 : !llvm.ptr<i32> | |
%90 = llvm.add %84, %15 : i64 | |
llvm.br ^bb6(%90 : i64) | |
// Read %83 back as one vector<4xi32>; this becomes the initial accumulator for loop 5.
^bb8: // pred: ^bb6 | |
%91 = llvm.bitcast %83 : !llvm.ptr<i32> to !llvm.ptr<vector<4xi32>> | |
%92 = llvm.load %91 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>> | |
llvm.br ^bb9(%12, %92 : i64, vector<4xi32>) | |
// Loop 5 header: %93 starts at 0, continues while %93 < 32, step 1; %94 carries the accumulator.
^bb9(%93: i64, %94: vector<4xi32>): // 2 preds: ^bb8, ^bb10 | |
%95 = llvm.icmp "slt" %93, %16 : i64 | |
llvm.cond_br %95, ^bb10, ^bb11 | |
// Element index %101 = %78 + %93*4 (%96 and %12 add zero). Load 4 consecutive i32 from %20
// with a stated alignment of 4 and add them lane-wise into the accumulator.
^bb10: // pred: ^bb9 | |
%96 = llvm.mul %12, %6 : i64 | |
%97 = llvm.add %78, %96 : i64 | |
%98 = llvm.add %97, %96 : i64 | |
%99 = llvm.mul %93, %14 : i64 | |
%100 = llvm.add %98, %99 : i64 | |
%101 = llvm.add %100, %12 : i64 | |
%102 = llvm.getelementptr %20[%101] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32> | |
%103 = llvm.bitcast %102 : !llvm.ptr<i32> to !llvm.ptr<vector<4xi32>> | |
%104 = llvm.load %103 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>> | |
%105 = llvm.add %104, %94 : vector<4xi32> | |
%106 = llvm.add %93, %15 : i64 | |
llvm.br ^bb9(%106, %105 : i64, vector<4xi32>) | |
// Loop 5 exit: spill the accumulator to %83 and reload it (%107).
^bb11: // pred: ^bb9 | |
llvm.store %94, %91 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>> | |
%107 = llvm.load %91 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>> | |
// Read the current value at %28[%82]. The insertelement/extractelement pairs below only
// round-trip a scalar through vector<1xi32>, so %112 == %109 and %118 == %114.
// NOTE(review): this is a read-modify-write, so the stored result depends on what %28[%82]
// held on entry — confirm the caller initializes that memory.
%108 = llvm.getelementptr %28[%82] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32> | |
%109 = llvm.load %108 : !llvm.ptr<i32> | |
%110 = llvm.mlir.undef : vector<1xi32> | |
%111 = llvm.insertelement %109, %110[%0 : i32] : vector<1xi32> | |
%112 = llvm.extractelement %111[%12 : i64] : vector<1xi32> | |
// Horizontal sum of the four accumulator lanes, added to the previously stored value.
%113 = "llvm.intr.vector.reduce.add"(%107) : (vector<4xi32>) -> i32 | |
%114 = llvm.add %112, %113 : i32 | |
%115 = llvm.insertelement %114, %13[%12 : i64] : vector<1xi32> | |
%116 = llvm.extractelement %115[%10 : i64] : vector<1xi32> | |
%117 = llvm.insertelement %116, %110[%0 : i32] : vector<1xi32> | |
%118 = llvm.extractelement %117[%12 : i64] : vector<1xi32> | |
llvm.store %118, %108 : !llvm.ptr<i32> | |
// Loop 3 latch: %72 += 1.
%119 = llvm.add %72, %15 : i64 | |
llvm.br ^bb4(%119 : i64) | |
// Loop 2 latch: %60 += 4.
^bb12: // pred: ^bb4 | |
%120 = llvm.add %60, %14 : i64 | |
llvm.br ^bb2(%120 : i64) | |
// Loop 1 latch: %58 += 1.
^bb13: // pred: ^bb2 | |
%121 = llvm.add %58, %15 : i64 | |
llvm.br ^bb1(%121 : i64) | |
// Function exit: return the i32 constant 0.
^bb14: // pred: ^bb1 | |
llvm.return %0 : i32 | |
} | |
} | |
} | |
} | |
// Private driver for the test. It fills an input resource with the i32 value 1, runs the
// dispatch and a constant fill into two halves of one output resource, and then checks that
// the two halves compare equal as tensor<512x256xi32>.
func.func private @_split_reduction_pass2() { | |
// Size/offset constants. 67108864 = 512*256*128*4 and 524288 = 512*256*4; 1048576 = 2*524288.
%c1048576 = arith.constant 1048576 : index | |
%c0 = arith.constant 0 : index | |
%c524288 = arith.constant 524288 : index | |
%c67108864 = arith.constant 67108864 : index | |
%c1_i32 = arith.constant 1 : i32 | |
%c128_i32 = arith.constant 128 : i32 | |
%c512 = arith.constant 512 : index | |
%c256 = arith.constant 256 : index | |
%c1 = arith.constant 1 : index | |
// Input: an uninitialized transient resource of size 67108864, filled over its whole range
// [0 for 67108864] with the i32 pattern 1, then awaited.
%0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c67108864} | |
%1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c67108864}) { | |
stream.cmd.fill %c1_i32, %arg0[%c0 for %c67108864] : i32 -> !stream.resource<transient>{%c67108864} | |
} => !stream.timepoint | |
%2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c67108864} | |
// The input is passed through util.do_not_optimize, after which its size is re-queried
// dynamically (%4) instead of reusing the constant.
%3 = util.do_not_optimize(%2) : !stream.resource<transient> | |
%4 = stream.resource.size %3 : !stream.resource<transient> | |
// Output: one uninitialized external resource of size 1048576 that holds both compared halves.
%5 = stream.resource.alloc uninitialized : !stream.resource<external>{%c1048576} | |
%6 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%4}, %5 as %arg1: !stream.resource<external>{%c1048576}) { | |
// Both commands below are issued inside one stream.cmd.concurrent region.
stream.cmd.concurrent { | |
// Dispatch with workload [512, 256, 1]: the input is bound read-only over [0 for %4], the
// output write-only over [0 for 1048576].
// NOTE(review): the declared write-only range covers the whole output and so overlaps the
// fill range below; presumably the dispatch only touches the first 524288 — confirm.
// NOTE(review): nothing visible in this function initializes [0 for 524288) of %5 before the
// dispatch, and the binding is `wo` — confirm the dispatch does not depend on prior contents.
stream.cmd.dispatch @_split_reduction_pass2_dispatch_0::@_split_reduction_pass2_dispatch_0_generic_512x256x128[%c512, %c256, %c1] { | |
ro %arg0[%c0 for %4] : !stream.resource<transient>{%4}, | |
wo %arg1[%c0 for %c1048576] : !stream.resource<external>{%c1048576} | |
} attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>]} | |
// Expected values: fill the range [524288 for 524288] of the output with the i32 pattern 128.
stream.cmd.fill %c128_i32, %arg1[%c524288 for %c524288] : i32 -> !stream.resource<external>{%c1048576} | |
} | |
} => !stream.timepoint | |
%7 = stream.timepoint.await %6 => %5 : !stream.resource<external>{%c1048576} | |
// Split the output into two subviews of size 524288 at offsets 0 and 524288, export each as
// tensor<512x256xi32>, and assert they are equal.
%8 = stream.resource.subview %7[%c0] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288} | |
%9 = stream.resource.subview %7[%c524288] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288} | |
%10 = stream.tensor.export %8 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32> | |
%11 = stream.tensor.export %9 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32> | |
check.expect_eq(%10, %11) : tensor<512x256xi32> | |
return | |
} | |
} | |
// -----// IR Dump After mlir::iree_compiler::IREE::HAL::TranslateTargetExecutableVariantsPass (iree-hal-translate-target-executable-variants) ('hal.executable.variant' operation: @embedded_elf_x86_64) //----- // | |
// Attribute aliases referenced by the module that follows.
// #device_target_llvm_cpu: an "llvm-cpu" device target whose only listed executable target is
// "embedded-elf-x86_64"; the dictionary also carries the `legacy_sync` unit entry.
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}> | |
// #executable_target_embedded_elf_x86_64_: the same executable target spelled out on its own
// (empty cpu_features, native_vector_size = 16, x86_64 ELF target triple).
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
// #pipeline_layout: zero push constants and one set (0) with two storage-buffer bindings;
// binding 0 is marked ReadOnly, binding 1 has no access qualifier.
#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]> | |
// #translation: codegen translation info naming the CPUDoubleTilingExpert pipeline.
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert> | |
module attributes {hal.device.targets = [#device_target_llvm_cpu]} { | |
// Public entry point, tagged `iree.abi.stub`. It takes no arguments, returns nothing, and
// does nothing except call the private @_split_reduction_pass2.
func.func @split_reduction_pass2() attributes {iree.abi.stub} { | |
call @_split_reduction_pass2() : () -> () | |
return | |
} | |
hal.executable private @_split_reduction_pass2_dispatch_0 { | |
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ { | |
// Export record (ordinal 0) for the dispatch function, bound to #pipeline_layout and #translation.
// The region takes a device and three index arguments, uses none of them, and returns the
// constant triple (8, 16, 1).
// NOTE(review): presumably this triple is the workgroup count in x/y/z — confirm against the HAL dialect docs.
hal.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 ordinal(0) layout(#pipeline_layout) attributes {translation_info = #translation} { | |
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index): | |
%c1 = arith.constant 1 : index | |
%c16 = arith.constant 16 : index | |
%c8 = arith.constant 8 : index | |
// Returned in the order 8, 16, 1 (not the order the constants were defined in).
hal.return %c8, %c16, %c1 : index, index, index | |
} | |
builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} { | |
// Dispatch body in the LLVM dialect. It receives three `noalias`, 16-byte-aligned pointers
// (executable environment, dispatch state, workgroup state) and always returns the i32
// constant 0 (%0). %arg0 is never used in the body.
// Shape of the computation, as established by the loops below:
//   for %58 in [0, 32) step 1, for %60 in [0, 32) step 4, for %72 in [0, 4) step 1:
//     acc = zero vector<4xi32>
//     for %93 in [0, 32) step 1: acc += 4 consecutive i32 loaded from buffer %20
//     %28[%82] = %28[%82] + reduce.add(acc)
llvm.func internal @_split_reduction_pass2_dispatch_0_generic_512x256x128(%arg0: !llvm.ptr<struct<"iree_hal_executable_environment_v0_t", (ptr<i32>, ptr<func<i32 (ptr<func<i32 (ptr<i8>)>>, ptr<i8>)>>, ptr<ptr<func<i32 (ptr<i8>)>>>, struct<"iree_hal_processor_v0_t", (array<8 x i64>)>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>> {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {sym_visibility = "private"} { | |
// Constants. Several values are materialized twice, once from an `i64` attribute and once
// from an `index` attribute (0: %10/%12, 1: %4/%15, 128: %2/%6, 256: %3/%7, 32768: %1/%8).
%0 = llvm.mlir.constant(0 : i32) : i32 | |
%1 = llvm.mlir.constant(32768 : i64) : i64 | |
%2 = llvm.mlir.constant(128 : i64) : i64 | |
%3 = llvm.mlir.constant(256 : i64) : i64 | |
%4 = llvm.mlir.constant(1 : i64) : i64 | |
%5 = llvm.mlir.constant(63 : index) : i64 | |
%6 = llvm.mlir.constant(128 : index) : i64 | |
%7 = llvm.mlir.constant(256 : index) : i64 | |
%8 = llvm.mlir.constant(32768 : index) : i64 | |
%9 = llvm.mlir.constant(512 : index) : i64 | |
%10 = llvm.mlir.constant(0 : i64) : i64 | |
%11 = llvm.mlir.constant(dense<0> : vector<4xi32>) : vector<4xi32> | |
%12 = llvm.mlir.constant(0 : index) : i64 | |
%13 = llvm.mlir.constant(dense<0> : vector<1xi32>) : vector<1xi32> | |
%14 = llvm.mlir.constant(4 : index) : i64 | |
%15 = llvm.mlir.constant(1 : index) : i64 | |
%16 = llvm.mlir.constant(32 : index) : i64 | |
// Load the dispatch state, take struct field 10 (a ptr<ptr<i8>>), and read its entry 0 as an
// i32 pointer (%20). %20 is only ever loaded from in this function.
// NOTE(review): presumably field 10 is the binding-pointer table — confirm against the
// iree_hal_executable_dispatch_state_v0_t definition.
%17 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> | |
%18 = llvm.extractvalue %17[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)> | |
%19 = llvm.load %18 : !llvm.ptr<ptr<i8>> | |
%20 = llvm.bitcast %19 : !llvm.ptr<i8> to !llvm.ptr<i32> | |
// Tell the optimizer that (%20 & 63) == 0, i.e. the pointer is a multiple of 64.
%21 = llvm.ptrtoint %20 : !llvm.ptr<i32> to i64 | |
%22 = llvm.and %21, %5 : i64 | |
%23 = llvm.icmp "eq" %22, %12 : i64 | |
"llvm.intr.assume"(%23) : (i1) -> () | |
// Same sequence for entry 1 of that table (%28), which is both loaded from and stored to
// below. The dispatch state is loaded a second time here rather than reusing %17.
%24 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> | |
%25 = llvm.extractvalue %24[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)> | |
%26 = llvm.getelementptr %25[1] : (!llvm.ptr<ptr<i8>>) -> !llvm.ptr<ptr<i8>> | |
%27 = llvm.load %26 : !llvm.ptr<ptr<i8>> | |
%28 = llvm.bitcast %27 : !llvm.ptr<i8> to !llvm.ptr<i32> | |
%29 = llvm.ptrtoint %28 : !llvm.ptr<i32> to i64 | |
%30 = llvm.and %29, %5 : i64 | |
%31 = llvm.icmp "eq" %30, %12 : i64 | |
"llvm.intr.assume"(%31) : (i1) -> () | |
// Workgroup state fields 0 and 1, zero-extended to i64 and scaled by 32 (%16):
// %38 comes from field 0, %37 from field 1.
// NOTE(review): presumably these are the workgroup x/y ids and 32 is the per-workgroup tile
// extent — confirm against the workgroup state struct definition.
%32 = llvm.load %arg2 : !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>> | |
%33 = llvm.extractvalue %32[0] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)> | |
%34 = llvm.zext %33 : i32 to i64 | |
%35 = llvm.extractvalue %32[1] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)> | |
%36 = llvm.zext %35 : i32 to i64 | |
%37 = llvm.mul %36, %16 : i64 | |
%38 = llvm.mul %34, %16 : i64 | |
// %41 = ptrtoint(gep null, 4) on an i32 pointer, i.e. the byte offset of 4 i32 elements.
// NOTE(review): %41 is then used as the *element count* of an i32 alloca, which looks like
// more space than the single vector<4xi32> stored below needs — confirm this is intended.
%39 = llvm.mlir.null : !llvm.ptr<i32> | |
%40 = llvm.getelementptr %39[4] : (!llvm.ptr<i32>) -> !llvm.ptr<i32> | |
%41 = llvm.ptrtoint %40 : !llvm.ptr<i32> to i64 | |
%42 = llvm.alloca %41 x i32 {alignment = 128 : i64} : (i64) -> !llvm.ptr<i32> | |
// %43, %44 and %45 are all built from the zero constant %12, so %46 is %42 at offset 0.
// Store the zero vector %11 there.
%43 = llvm.mul %12, %14 : i64 | |
%44 = llvm.add %43, %43 : i64 | |
%45 = llvm.add %44, %12 : i64 | |
%46 = llvm.getelementptr %42[%45] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32> | |
%47 = llvm.bitcast %46 : !llvm.ptr<i32> to !llvm.ptr<vector<4xi32>> | |
llvm.store %11, %47 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>> | |
// Loop-invariant base offsets, counted in i32 elements because they feed GEPs on i32 pointers:
//   %51 = %37*256 + %38          (ends up indexing %28)
//   %57 = %37*32768 + %38*128    (ends up indexing %20)
// The `+ %12` terms and %56 = %10 * %15 contribute zero.
%48 = llvm.mul %37, %7 : i64 | |
%49 = llvm.add %48, %12 : i64 | |
%50 = llvm.mul %38, %15 : i64 | |
%51 = llvm.add %49, %50 : i64 | |
%52 = llvm.mul %37, %8 : i64 | |
%53 = llvm.add %52, %12 : i64 | |
%54 = llvm.mul %38, %6 : i64 | |
%55 = llvm.add %53, %54 : i64 | |
%56 = llvm.mul %10, %15 : i64 | |
%57 = llvm.add %55, %56 : i64 | |
llvm.br ^bb1(%12 : i64) | |
// Loop 1 header: %58 starts at 0, continues while %58 < 32, and is incremented by 1 in ^bb13.
^bb1(%58: i64): // 2 preds: ^bb0, ^bb13 | |
%59 = llvm.icmp "slt" %58, %16 : i64 | |
llvm.cond_br %59, ^bb2(%12 : i64), ^bb14 | |
// Loop 2 header: %60 starts at 0, continues while %60 < 32, and is incremented by 4 in ^bb12.
^bb2(%60: i64): // 2 preds: ^bb1, ^bb12 | |
%61 = llvm.icmp "slt" %60, %16 : i64 | |
llvm.cond_br %61, ^bb3, ^bb13 | |
// Fold both outer induction variables into the two offsets:
//   %67 = %57 + %58*32768 + %60*128   (%66 is zero)
//   %71 = %51 + %58*256 + %60
^bb3: // pred: ^bb2 | |
%62 = llvm.mul %58, %1 : i64 | |
%63 = llvm.add %57, %62 : i64 | |
%64 = llvm.mul %60, %2 : i64 | |
%65 = llvm.add %63, %64 : i64 | |
%66 = llvm.mul %10, %4 : i64 | |
%67 = llvm.add %65, %66 : i64 | |
%68 = llvm.mul %58, %3 : i64 | |
%69 = llvm.add %51, %68 : i64 | |
%70 = llvm.mul %60, %4 : i64 | |
%71 = llvm.add %69, %70 : i64 | |
llvm.br ^bb4(%12 : i64) | |
// Loop 3 header: %72 starts at 0, continues while %72 < 4, and is incremented by 1 in ^bb11.
^bb4(%72: i64): // 2 preds: ^bb3, ^bb11 | |
%73 = llvm.icmp "slt" %72, %14 : i64 | |
llvm.cond_br %73, ^bb5, ^bb12 | |
// %78 = %67 + %72*128 and %82 = %71 + %72; %74, %56 and %79 are all zero.
^bb5: // pred: ^bb4 | |
%74 = llvm.mul %10, %9 : i64 | |
%75 = llvm.add %67, %74 : i64 | |
%76 = llvm.mul %72, %6 : i64 | |
%77 = llvm.add %75, %76 : i64 | |
%78 = llvm.add %77, %56 : i64 | |
%79 = llvm.mul %10, %14 : i64 | |
%80 = llvm.add %71, %79 : i64 | |
%81 = llvm.mul %72, %15 : i64 | |
%82 = llvm.add %80, %81 : i64 | |
// A second scratch buffer is allocated each time this block runs.
// NOTE(review): this alloca sits inside the loop nest rather than in the entry block;
// presumably a later stage hoists or bounds it — confirm stack usage.
%83 = llvm.alloca %41 x i32 {alignment = 128 : i64} : (i64) -> !llvm.ptr<i32> | |
llvm.br ^bb6(%12 : i64) | |
// Loop 4: for %84 in [0, 4) step 1, copy one i32 from %42[%44 + %84] to %83[%44 + %84]
// (%44 is zero), i.e. seed %83 with the zeroed contents of %42.
^bb6(%84: i64): // 2 preds: ^bb5, ^bb7 | |
%85 = llvm.icmp "slt" %84, %14 : i64 | |
llvm.cond_br %85, ^bb7, ^bb8 | |
^bb7: // pred: ^bb6 | |
%86 = llvm.add %44, %84 : i64 | |
%87 = llvm.getelementptr %42[%86] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32> | |
%88 = llvm.load %87 : !llvm.ptr<i32> | |
%89 = llvm.getelementptr %83[%86] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32> | |
llvm.store %88, %89 : !llvm.ptr<i32> | |
%90 = llvm.add %84, %15 : i64 | |
llvm.br ^bb6(%90 : i64) | |
// Read %83 back as one vector<4xi32>; this becomes the initial accumulator for loop 5.
^bb8: // pred: ^bb6 | |
%91 = llvm.bitcast %83 : !llvm.ptr<i32> to !llvm.ptr<vector<4xi32>> | |
%92 = llvm.load %91 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>> | |
llvm.br ^bb9(%12, %92 : i64, vector<4xi32>) | |
// Loop 5 header: %93 starts at 0, continues while %93 < 32, step 1; %94 carries the accumulator.
^bb9(%93: i64, %94: vector<4xi32>): // 2 preds: ^bb8, ^bb10 | |
%95 = llvm.icmp "slt" %93, %16 : i64 | |
llvm.cond_br %95, ^bb10, ^bb11 | |
// Element index %101 = %78 + %93*4 (%96 and %12 add zero). Load 4 consecutive i32 from %20
// with a stated alignment of 4 and add them lane-wise into the accumulator.
^bb10: // pred: ^bb9 | |
%96 = llvm.mul %12, %6 : i64 | |
%97 = llvm.add %78, %96 : i64 | |
%98 = llvm.add %97, %96 : i64 | |
%99 = llvm.mul %93, %14 : i64 | |
%100 = llvm.add %98, %99 : i64 | |
%101 = llvm.add %100, %12 : i64 | |
%102 = llvm.getelementptr %20[%101] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32> | |
%103 = llvm.bitcast %102 : !llvm.ptr<i32> to !llvm.ptr<vector<4xi32>> | |
%104 = llvm.load %103 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>> | |
%105 = llvm.add %104, %94 : vector<4xi32> | |
%106 = llvm.add %93, %15 : i64 | |
llvm.br ^bb9(%106, %105 : i64, vector<4xi32>) | |
// Loop 5 exit: spill the accumulator to %83 and reload it (%107).
^bb11: // pred: ^bb9 | |
llvm.store %94, %91 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>> | |
%107 = llvm.load %91 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>> | |
// Read the current value at %28[%82]. The insertelement/extractelement pairs below only
// round-trip a scalar through vector<1xi32>, so %112 == %109 and %118 == %114.
// NOTE(review): this is a read-modify-write, so the stored result depends on what %28[%82]
// held on entry — confirm the caller initializes that memory.
%108 = llvm.getelementptr %28[%82] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32> | |
%109 = llvm.load %108 : !llvm.ptr<i32> | |
%110 = llvm.mlir.undef : vector<1xi32> | |
%111 = llvm.insertelement %109, %110[%0 : i32] : vector<1xi32> | |
%112 = llvm.extractelement %111[%12 : i64] : vector<1xi32> | |
// Horizontal sum of the four accumulator lanes, added to the previously stored value.
%113 = "llvm.intr.vector.reduce.add"(%107) : (vector<4xi32>) -> i32 | |
%114 = llvm.add %112, %113 : i32 | |
%115 = llvm.insertelement %114, %13[%12 : i64] : vector<1xi32> | |
%116 = llvm.extractelement %115[%10 : i64] : vector<1xi32> | |
%117 = llvm.insertelement %116, %110[%0 : i32] : vector<1xi32> | |
%118 = llvm.extractelement %117[%12 : i64] : vector<1xi32> | |
llvm.store %118, %108 : !llvm.ptr<i32> | |
// Loop 3 latch: %72 += 1.
%119 = llvm.add %72, %15 : i64 | |
llvm.br ^bb4(%119 : i64) | |
// Loop 2 latch: %60 += 4.
^bb12: // pred: ^bb4 | |
%120 = llvm.add %60, %14 : i64 | |
llvm.br ^bb2(%120 : i64) | |
// Loop 1 latch: %58 += 1.
^bb13: // pred: ^bb2 | |
%121 = llvm.add %58, %15 : i64 | |
llvm.br ^bb1(%121 : i64) | |
// Function exit: return the i32 constant 0.
^bb14: // pred: ^bb1 | |
llvm.return %0 : i32 | |
} | |
} | |
} | |
} | |
// Private driver for the test. It fills an input resource with the i32 value 1, runs the
// dispatch and a constant fill into two halves of one output resource, and then checks that
// the two halves compare equal as tensor<512x256xi32>.
func.func private @_split_reduction_pass2() { | |
// Size/offset constants. 67108864 = 512*256*128*4 and 524288 = 512*256*4; 1048576 = 2*524288.
%c1048576 = arith.constant 1048576 : index | |
%c0 = arith.constant 0 : index | |
%c524288 = arith.constant 524288 : index | |
%c67108864 = arith.constant 67108864 : index | |
%c1_i32 = arith.constant 1 : i32 | |
%c128_i32 = arith.constant 128 : i32 | |
%c512 = arith.constant 512 : index | |
%c256 = arith.constant 256 : index | |
%c1 = arith.constant 1 : index | |
// Input: an uninitialized transient resource of size 67108864, filled over its whole range
// [0 for 67108864] with the i32 pattern 1, then awaited.
%0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c67108864} | |
%1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c67108864}) { | |
stream.cmd.fill %c1_i32, %arg0[%c0 for %c67108864] : i32 -> !stream.resource<transient>{%c67108864} | |
} => !stream.timepoint | |
%2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c67108864} | |
// The input is passed through util.do_not_optimize, after which its size is re-queried
// dynamically (%4) instead of reusing the constant.
%3 = util.do_not_optimize(%2) : !stream.resource<transient> | |
%4 = stream.resource.size %3 : !stream.resource<transient> | |
// Output: one uninitialized external resource of size 1048576 that holds both compared halves.
%5 = stream.resource.alloc uninitialized : !stream.resource<external>{%c1048576} | |
%6 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%4}, %5 as %arg1: !stream.resource<external>{%c1048576}) { | |
// Both commands below are issued inside one stream.cmd.concurrent region.
stream.cmd.concurrent { | |
// Dispatch with workload [512, 256, 1]: the input is bound read-only over [0 for %4], the
// output write-only over [0 for 1048576].
// NOTE(review): the declared write-only range covers the whole output and so overlaps the
// fill range below; presumably the dispatch only touches the first 524288 — confirm.
// NOTE(review): nothing visible in this function initializes [0 for 524288) of %5 before the
// dispatch, and the binding is `wo` — confirm the dispatch does not depend on prior contents.
stream.cmd.dispatch @_split_reduction_pass2_dispatch_0::@_split_reduction_pass2_dispatch_0_generic_512x256x128[%c512, %c256, %c1] { | |
ro %arg0[%c0 for %4] : !stream.resource<transient>{%4}, | |
wo %arg1[%c0 for %c1048576] : !stream.resource<external>{%c1048576} | |
} attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>]} | |
// Expected values: fill the range [524288 for 524288] of the output with the i32 pattern 128.
stream.cmd.fill %c128_i32, %arg1[%c524288 for %c524288] : i32 -> !stream.resource<external>{%c1048576} | |
} | |
} => !stream.timepoint | |
%7 = stream.timepoint.await %6 => %5 : !stream.resource<external>{%c1048576} | |
// Split the output into two subviews of size 524288 at offsets 0 and 524288, export each as
// tensor<512x256xi32>, and assert they are equal.
%8 = stream.resource.subview %7[%c0] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288} | |
%9 = stream.resource.subview %7[%c524288] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288} | |
%10 = stream.tensor.export %8 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32> | |
%11 = stream.tensor.export %9 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32> | |
check.expect_eq(%10, %11) : tensor<512x256xi32> | |
return | |
} | |
} | |
// -----// IR Dump After mlir::iree_compiler::IREE::HAL::TranslateExecutablesPass (iree-hal-translate-executables) ('hal.executable' operation: @_split_reduction_pass2_dispatch_0) //----- // | |
// Attribute aliases referenced by the module that follows.
// #device_target_llvm_cpu: an "llvm-cpu" device target whose only listed executable target is
// "embedded-elf-x86_64"; the dictionary also carries the `legacy_sync` unit entry.
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}> | |
// #executable_target_embedded_elf_x86_64_: the same executable target spelled out on its own
// (empty cpu_features, native_vector_size = 16, x86_64 ELF target triple).
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
// #pipeline_layout: zero push constants and one set (0) with two storage-buffer bindings;
// binding 0 is marked ReadOnly, binding 1 has no access qualifier.
#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]> | |
// #translation: codegen translation info naming the CPUDoubleTilingExpert pipeline.
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert> | |
module attributes {hal.device.targets = [#device_target_llvm_cpu]} { | |
// Public entry point, tagged `iree.abi.stub`. It takes no arguments, returns nothing, and
// does nothing except call the private @_split_reduction_pass2.
func.func @split_reduction_pass2() attributes {iree.abi.stub} { | |
call @_split_reduction_pass2() : () -> () | |
return | |
} | |
hal.executable private @_split_reduction_pass2_dispatch_0 { | |
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ { | |
// Export record (ordinal 0) for the dispatch function, bound to #pipeline_layout and #translation.
// The region takes a device and three index arguments, uses none of them, and returns the
// constant triple (8, 16, 1).
// NOTE(review): presumably this triple is the workgroup count in x/y/z — confirm against the HAL dialect docs.
hal.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 ordinal(0) layout(#pipeline_layout) attributes {translation_info = #translation} { | |
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index): | |
%c1 = arith.constant 1 : index | |
%c16 = arith.constant 16 : index | |
%c8 = arith.constant 8 : index | |
// Returned in the order 8, 16, 1 (not the order the constants were defined in).
hal.return %c8, %c16, %c1 : index, index, index | |
} | |
builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} { | |
llvm.func internal @_split_reduction_pass2_dispatch_0_generic_512x256x128(%arg0: !llvm.ptr<struct<"iree_hal_executable_environment_v0_t", (ptr<i32>, ptr<func<i32 (ptr<func<i32 (ptr<i8>)>>, ptr<i8>)>>, ptr<ptr<func<i32 (ptr<i8>)>>>, struct<"iree_hal_processor_v0_t", (array<8 x i64>)>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>> {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {sym_visibility = "private"} { | |
%0 = llvm.mlir.constant(0 : i32) : i32 | |
%1 = llvm.mlir.constant(32768 : i64) : i64 | |
%2 = llvm.mlir.constant(128 : i64) : i64 | |
%3 = llvm.mlir.constant(256 : i64) : i64 | |
%4 = llvm.mlir.constant(1 : i64) : i64 | |
%5 = llvm.mlir.constant(63 : index) : i64 | |
%6 = llvm.mlir.constant(128 : index) : i64 | |
%7 = llvm.mlir.constant(256 : index) : i64 | |
%8 = llvm.mlir.constant(32768 : index) : i64 | |
%9 = llvm.mlir.constant(512 : index) : i64 | |
%10 = llvm.mlir.constant(0 : i64) : i64 | |
%11 = llvm.mlir.constant(dense<0> : vector<4xi32>) : vector<4xi32> | |
%12 = llvm.mlir.constant(0 : index) : i64 | |
%13 = llvm.mlir.constant(dense<0> : vector<1xi32>) : vector<1xi32> | |
%14 = llvm.mlir.constant(4 : index) : i64 | |
%15 = llvm.mlir.constant(1 : index) : i64 | |
%16 = llvm.mlir.constant(32 : index) : i64 | |
%17 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> | |
%18 = llvm.extractvalue %17[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)> | |
%19 = llvm.load %18 : !llvm.ptr<ptr<i8>> | |
%20 = llvm.bitcast %19 : !llvm.ptr<i8> to !llvm.ptr<i32> | |
%21 = llvm.ptrtoint %20 : !llvm.ptr<i32> to i64 | |
%22 = llvm.and %21, %5 : i64 | |
%23 = llvm.icmp "eq" %22, %12 : i64 | |
"llvm.intr.assume"(%23) : (i1) -> () | |
%24 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> | |
%25 = llvm.extractvalue %24[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)> | |
%26 = llvm.getelementptr %25[1] : (!llvm.ptr<ptr<i8>>) -> !llvm.ptr<ptr<i8>> | |
%27 = llvm.load %26 : !llvm.ptr<ptr<i8>> | |
%28 = llvm.bitcast %27 : !llvm.ptr<i8> to !llvm.ptr<i32> | |
%29 = llvm.ptrtoint %28 : !llvm.ptr<i32> to i64 | |
%30 = llvm.and %29, %5 : i64 | |
%31 = llvm.icmp "eq" %30, %12 : i64 | |
"llvm.intr.assume"(%31) : (i1) -> () | |
%32 = llvm.load %arg2 : !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>> | |
%33 = llvm.extractvalue %32[0] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)> | |
%34 = llvm.zext %33 : i32 to i64 | |
%35 = llvm.extractvalue %32[1] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)> | |
%36 = llvm.zext %35 : i32 to i64 | |
%37 = llvm.mul %36, %16 : i64 | |
%38 = llvm.mul %34, %16 : i64 | |
%39 = llvm.mlir.null : !llvm.ptr<i32> | |
%40 = llvm.getelementptr %39[4] : (!llvm.ptr<i32>) -> !llvm.ptr<i32> | |
%41 = llvm.ptrtoint %40 : !llvm.ptr<i32> to i64 | |
%42 = llvm.alloca %41 x i32 {alignment = 128 : i64} : (i64) -> !llvm.ptr<i32> | |
%43 = llvm.mul %12, %14 : i64 | |
%44 = llvm.add %43, %43 : i64 | |
%45 = llvm.add %44, %12 : i64 | |
%46 = llvm.getelementptr %42[%45] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32> | |
%47 = llvm.bitcast %46 : !llvm.ptr<i32> to !llvm.ptr<vector<4xi32>> | |
llvm.store %11, %47 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>> | |
%48 = llvm.mul %37, %7 : i64 | |
%49 = llvm.add %48, %12 : i64 | |
%50 = llvm.mul %38, %15 : i64 | |
%51 = llvm.add %49, %50 : i64 | |
%52 = llvm.mul %37, %8 : i64 | |
%53 = llvm.add %52, %12 : i64 | |
%54 = llvm.mul %38, %6 : i64 | |
%55 = llvm.add %53, %54 : i64 | |
%56 = llvm.mul %10, %15 : i64 | |
%57 = llvm.add %55, %56 : i64 | |
llvm.br ^bb1(%12 : i64) | |
^bb1(%58: i64): // 2 preds: ^bb0, ^bb13 | |
%59 = llvm.icmp "slt" %58, %16 : i64 | |
llvm.cond_br %59, ^bb2(%12 : i64), ^bb14 | |
^bb2(%60: i64): // 2 preds: ^bb1, ^bb12 | |
%61 = llvm.icmp "slt" %60, %16 : i64 | |
llvm.cond_br %61, ^bb3, ^bb13 | |
^bb3: // pred: ^bb2 | |
%62 = llvm.mul %58, %1 : i64 | |
%63 = llvm.add %57, %62 : i64 | |
%64 = llvm.mul %60, %2 : i64 | |
%65 = llvm.add %63, %64 : i64 | |
%66 = llvm.mul %10, %4 : i64 | |
%67 = llvm.add %65, %66 : i64 | |
%68 = llvm.mul %58, %3 : i64 | |
%69 = llvm.add %51, %68 : i64 | |
%70 = llvm.mul %60, %4 : i64 | |
%71 = llvm.add %69, %70 : i64 | |
llvm.br ^bb4(%12 : i64) | |
^bb4(%72: i64): // 2 preds: ^bb3, ^bb11 | |
%73 = llvm.icmp "slt" %72, %14 : i64 | |
llvm.cond_br %73, ^bb5, ^bb12 | |
^bb5: // pred: ^bb4 | |
%74 = llvm.mul %10, %9 : i64 | |
%75 = llvm.add %67, %74 : i64 | |
%76 = llvm.mul %72, %6 : i64 | |
%77 = llvm.add %75, %76 : i64 | |
%78 = llvm.add %77, %56 : i64 | |
%79 = llvm.mul %10, %14 : i64 | |
%80 = llvm.add %71, %79 : i64 | |
%81 = llvm.mul %72, %15 : i64 | |
%82 = llvm.add %80, %81 : i64 | |
%83 = llvm.alloca %41 x i32 {alignment = 128 : i64} : (i64) -> !llvm.ptr<i32> | |
llvm.br ^bb6(%12 : i64) | |
^bb6(%84: i64): // 2 preds: ^bb5, ^bb7 | |
%85 = llvm.icmp "slt" %84, %14 : i64 | |
llvm.cond_br %85, ^bb7, ^bb8 | |
^bb7: // pred: ^bb6 | |
%86 = llvm.add %44, %84 : i64 | |
%87 = llvm.getelementptr %42[%86] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32> | |
%88 = llvm.load %87 : !llvm.ptr<i32> | |
%89 = llvm.getelementptr %83[%86] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32> | |
llvm.store %88, %89 : !llvm.ptr<i32> | |
%90 = llvm.add %84, %15 : i64 | |
llvm.br ^bb6(%90 : i64) | |
^bb8: // pred: ^bb6 | |
%91 = llvm.bitcast %83 : !llvm.ptr<i32> to !llvm.ptr<vector<4xi32>> | |
%92 = llvm.load %91 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>> | |
llvm.br ^bb9(%12, %92 : i64, vector<4xi32>) | |
^bb9(%93: i64, %94: vector<4xi32>): // 2 preds: ^bb8, ^bb10 | |
%95 = llvm.icmp "slt" %93, %16 : i64 | |
llvm.cond_br %95, ^bb10, ^bb11 | |
^bb10: // pred: ^bb9 | |
%96 = llvm.mul %12, %6 : i64 | |
%97 = llvm.add %78, %96 : i64 | |
%98 = llvm.add %97, %96 : i64 | |
%99 = llvm.mul %93, %14 : i64 | |
%100 = llvm.add %98, %99 : i64 | |
%101 = llvm.add %100, %12 : i64 | |
%102 = llvm.getelementptr %20[%101] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32> | |
%103 = llvm.bitcast %102 : !llvm.ptr<i32> to !llvm.ptr<vector<4xi32>> | |
%104 = llvm.load %103 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>> | |
%105 = llvm.add %104, %94 : vector<4xi32> | |
%106 = llvm.add %93, %15 : i64 | |
llvm.br ^bb9(%106, %105 : i64, vector<4xi32>) | |
^bb11: // pred: ^bb9 | |
llvm.store %94, %91 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>> | |
%107 = llvm.load %91 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>> | |
%108 = llvm.getelementptr %28[%82] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32> | |
%109 = llvm.load %108 : !llvm.ptr<i32> | |
%110 = llvm.mlir.undef : vector<1xi32> | |
%111 = llvm.insertelement %109, %110[%0 : i32] : vector<1xi32> | |
%112 = llvm.extractelement %111[%12 : i64] : vector<1xi32> | |
%113 = "llvm.intr.vector.reduce.add"(%107) : (vector<4xi32>) -> i32 | |
%114 = llvm.add %112, %113 : i32 | |
%115 = llvm.insertelement %114, %13[%12 : i64] : vector<1xi32> | |
%116 = llvm.extractelement %115[%10 : i64] : vector<1xi32> | |
%117 = llvm.insertelement %116, %110[%0 : i32] : vector<1xi32> | |
%118 = llvm.extractelement %117[%12 : i64] : vector<1xi32> | |
llvm.store %118, %108 : !llvm.ptr<i32> | |
%119 = llvm.add %72, %15 : i64 | |
llvm.br ^bb4(%119 : i64) | |
^bb12: // pred: ^bb4 | |
%120 = llvm.add %60, %14 : i64 | |
llvm.br ^bb2(%120 : i64) | |
^bb13: // pred: ^bb2 | |
%121 = llvm.add %58, %15 : i64 | |
llvm.br ^bb1(%121 : i64) | |
^bb14: // pred: ^bb1 | |
llvm.return %0 : i32 | |
} | |
} | |
} | |
} | |
// Host-side body of the split-reduction check, expressed in the stream dialect.
// It fills an input resource with the i32 value 1, dispatches the reduction
// kernel into the first half of a result resource, fills the second half of the
// same resource with the i32 value 128, and compares the two halves.
func.func private @_split_reduction_pass2() { | |
// Sizes and offsets used below (presumably byte counts — confirm):
//   67108864 = 512*256*128*4, 524288 = 512*256*4, 1048576 = 2*524288.
%c1048576 = arith.constant 1048576 : index | |
%c0 = arith.constant 0 : index | |
%c524288 = arith.constant 524288 : index | |
%c67108864 = arith.constant 67108864 : index | |
%c1_i32 = arith.constant 1 : i32 | |
%c128_i32 = arith.constant 128 : i32 | |
%c512 = arith.constant 512 : index | |
%c256 = arith.constant 256 : index | |
%c1 = arith.constant 1 : index | |
// Input: transient resource, allocated uninitialized and then filled with 1
// over the whole range [0, 67108864).
%0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c67108864} | |
%1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c67108864}) { | |
stream.cmd.fill %c1_i32, %arg0[%c0 for %c67108864] : i32 -> !stream.resource<transient>{%c67108864} | |
} => !stream.timepoint | |
// Wait for the fill, then hide the resource behind do_not_optimize and
// re-query its size dynamically.
%2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c67108864} | |
%3 = util.do_not_optimize(%2) : !stream.resource<transient> | |
%4 = stream.resource.size %3 : !stream.resource<transient> | |
// Result: one external resource holding both the computed tensor (first
// 524288) and the expected tensor (second 524288).
// NOTE(review): this resource is allocated uninitialized and its first half is
// not filled before the dispatch; the dispatch binds it as `wo`. The lowered
// kernel elsewhere in this dump loads from this binding before storing —
// confirm whether the result depends on uninitialized memory.
%5 = stream.resource.alloc uninitialized : !stream.resource<external>{%c1048576} | |
%6 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%4}, %5 as %arg1: !stream.resource<external>{%c1048576}) { | |
// The dispatch and the reference fill are placed in one concurrent region.
stream.cmd.concurrent { | |
// Workload operands are [512, 256, 1]; input is read-only, output is
// write-only and bound over the full [0, 1048576) range.
stream.cmd.dispatch @_split_reduction_pass2_dispatch_0::@_split_reduction_pass2_dispatch_0_generic_512x256x128[%c512, %c256, %c1] { | |
ro %arg0[%c0 for %4] : !stream.resource<transient>{%4}, | |
wo %arg1[%c0 for %c1048576] : !stream.resource<external>{%c1048576} | |
} attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>]} | |
// Expected value: 128 written to [524288, 1048576).
stream.cmd.fill %c128_i32, %arg1[%c524288 for %c524288] : i32 -> !stream.resource<external>{%c1048576} | |
} | |
} => !stream.timepoint | |
%7 = stream.timepoint.await %6 => %5 : !stream.resource<external>{%c1048576} | |
// Split the result resource into computed (%8) and expected (%9) halves and
// export each as tensor<512x256xi32> for comparison.
%8 = stream.resource.subview %7[%c0] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288} | |
%9 = stream.resource.subview %7[%c524288] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288} | |
%10 = stream.tensor.export %8 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32> | |
%11 = stream.tensor.export %9 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32> | |
check.expect_eq(%10, %11) : tensor<512x256xi32> | |
return | |
} | |
} | |
// -----// IR Dump After mlir::iree_compiler::IREE::HAL::(anonymous namespace)::ConvertToHALPass (iree-hal-conversion) ('builtin.module' operation) //----- // | |
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}> | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]> | |
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert> | |
module attributes {hal.device.targets = [#device_target_llvm_cpu]} { | |
// Public entry point carrying the iree.abi.stub attribute; it takes no
// arguments, returns nothing, and only forwards to the private
// @_split_reduction_pass2 implementation.
func.func @split_reduction_pass2() attributes {iree.abi.stub} { | |
call @_split_reduction_pass2() : () -> () | |
return | |
} | |
// Executable holding the single reduction dispatch, with one variant for the
// embedded-elf-x86_64 target. It contains an export region that yields three
// index constants and an LLVM-dialect function implementing the kernel.
hal.executable private @_split_reduction_pass2_dispatch_0 { | |
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ { | |
// Export region: ignores its device/index arguments and returns the
// constants (8, 16, 1). These match the workgroups([8, 16, 1]) operand at the
// dispatch site in the host function of this module.
hal.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 ordinal(0) layout(#pipeline_layout) attributes {translation_info = #translation} { | |
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index): | |
%c1 = arith.constant 1 : index | |
%c16 = arith.constant 16 : index | |
%c8 = arith.constant 8 : index | |
hal.return %c8, %c16, %c1 : index, index, index | |
} | |
builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} { | |
// Kernel body. %arg0 is unused in the code below; %arg1 is the dispatch state
// (its field 10 is loaded as a table of buffer pointers) and %arg2 is the
// workgroup state (fields 0 and 1 are read). Always returns the i32 constant 0.
llvm.func internal @_split_reduction_pass2_dispatch_0_generic_512x256x128(%arg0: !llvm.ptr<struct<"iree_hal_executable_environment_v0_t", (ptr<i32>, ptr<func<i32 (ptr<func<i32 (ptr<i8>)>>, ptr<i8>)>>, ptr<ptr<func<i32 (ptr<i8>)>>>, struct<"iree_hal_processor_v0_t", (array<8 x i64>)>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>> {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {sym_visibility = "private"} { | |
// Constant pool. Several values are duplicated because some were created
// with an i64 attribute and others with an index attribute.
%0 = llvm.mlir.constant(0 : i32) : i32 | |
%1 = llvm.mlir.constant(32768 : i64) : i64 | |
%2 = llvm.mlir.constant(128 : i64) : i64 | |
%3 = llvm.mlir.constant(256 : i64) : i64 | |
%4 = llvm.mlir.constant(1 : i64) : i64 | |
%5 = llvm.mlir.constant(63 : index) : i64 | |
%6 = llvm.mlir.constant(128 : index) : i64 | |
%7 = llvm.mlir.constant(256 : index) : i64 | |
%8 = llvm.mlir.constant(32768 : index) : i64 | |
%9 = llvm.mlir.constant(512 : index) : i64 | |
%10 = llvm.mlir.constant(0 : i64) : i64 | |
%11 = llvm.mlir.constant(dense<0> : vector<4xi32>) : vector<4xi32> | |
%12 = llvm.mlir.constant(0 : index) : i64 | |
%13 = llvm.mlir.constant(dense<0> : vector<1xi32>) : vector<1xi32> | |
%14 = llvm.mlir.constant(4 : index) : i64 | |
%15 = llvm.mlir.constant(1 : index) : i64 | |
%16 = llvm.mlir.constant(32 : index) : i64 | |
// Pointer #0 from the dispatch-state pointer table, viewed as i32*. It is
// only loaded from below (the input). The assume states its low 6 bits are
// zero, i.e. 64-byte alignment.
%17 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> | |
%18 = llvm.extractvalue %17[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)> | |
%19 = llvm.load %18 : !llvm.ptr<ptr<i8>> | |
%20 = llvm.bitcast %19 : !llvm.ptr<i8> to !llvm.ptr<i32> | |
%21 = llvm.ptrtoint %20 : !llvm.ptr<i32> to i64 | |
%22 = llvm.and %21, %5 : i64 | |
%23 = llvm.icmp "eq" %22, %12 : i64 | |
"llvm.intr.assume"(%23) : (i1) -> () | |
// Pointer #1 from the same table, viewed as i32*. It is both loaded from
// and stored to below (the output). Same 64-byte alignment assumption.
%24 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> | |
%25 = llvm.extractvalue %24[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)> | |
%26 = llvm.getelementptr %25[1] : (!llvm.ptr<ptr<i8>>) -> !llvm.ptr<ptr<i8>> | |
%27 = llvm.load %26 : !llvm.ptr<ptr<i8>> | |
%28 = llvm.bitcast %27 : !llvm.ptr<i8> to !llvm.ptr<i32> | |
%29 = llvm.ptrtoint %28 : !llvm.ptr<i32> to i64 | |
%30 = llvm.and %29, %5 : i64 | |
%31 = llvm.icmp "eq" %30, %12 : i64 | |
"llvm.intr.assume"(%31) : (i1) -> () | |
// Workgroup-state fields 0 and 1, zero-extended and scaled by 32
// (presumably workgroup id x / y times a 32-wide tile — confirm against the
// iree_hal_executable_workgroup_state_v0_t definition).
%32 = llvm.load %arg2 : !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>> | |
%33 = llvm.extractvalue %32[0] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)> | |
%34 = llvm.zext %33 : i32 to i64 | |
%35 = llvm.extractvalue %32[1] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)> | |
%36 = llvm.zext %35 : i32 to i64 | |
%37 = llvm.mul %36, %16 : i64 | |
%38 = llvm.mul %34, %16 : i64 | |
// %41 is the integer value of a null-based GEP at index 4 over i32; it is
// used as the element count of both allocas in this function.
%39 = llvm.mlir.null : !llvm.ptr<i32> | |
%40 = llvm.getelementptr %39[4] : (!llvm.ptr<i32>) -> !llvm.ptr<i32> | |
%41 = llvm.ptrtoint %40 : !llvm.ptr<i32> to i64 | |
// Stack buffer %42 holds the zero vector used to seed each accumulator;
// %43..%45 all evaluate to 0 (0*4, 0+0, 0+0).
%42 = llvm.alloca %41 x i32 {alignment = 128 : i64} : (i64) -> !llvm.ptr<i32> | |
%43 = llvm.mul %12, %14 : i64 | |
%44 = llvm.add %43, %43 : i64 | |
%45 = llvm.add %44, %12 : i64 | |
%46 = llvm.getelementptr %42[%45] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32> | |
%47 = llvm.bitcast %46 : !llvm.ptr<i32> to !llvm.ptr<vector<4xi32>> | |
llvm.store %11, %47 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>> | |
// Output base element offset for this workgroup: %51 = %37*256 + %38.
%48 = llvm.mul %37, %7 : i64 | |
%49 = llvm.add %48, %12 : i64 | |
%50 = llvm.mul %38, %15 : i64 | |
%51 = llvm.add %49, %50 : i64 | |
// Input base element offset for this workgroup: %57 = %37*32768 + %38*128
// (%56 is 0*1 and contributes nothing).
%52 = llvm.mul %37, %8 : i64 | |
%53 = llvm.add %52, %12 : i64 | |
%54 = llvm.mul %38, %6 : i64 | |
%55 = llvm.add %53, %54 : i64 | |
%56 = llvm.mul %10, %15 : i64 | |
%57 = llvm.add %55, %56 : i64 | |
llvm.br ^bb1(%12 : i64) | |
// Loop 1: %58 runs from 0 while < 32, incremented by 1 in ^bb13.
^bb1(%58: i64): // 2 preds: ^bb0, ^bb13 | |
%59 = llvm.icmp "slt" %58, %16 : i64 | |
llvm.cond_br %59, ^bb2(%12 : i64), ^bb14 | |
// Loop 2: %60 runs from 0 while < 32, incremented by 4 in ^bb12.
^bb2(%60: i64): // 2 preds: ^bb1, ^bb12 | |
%61 = llvm.icmp "slt" %60, %16 : i64 | |
llvm.cond_br %61, ^bb3, ^bb13 | |
^bb3: // pred: ^bb2 | |
// Input offset for this (%58, %60) tile: %67 = %57 + %58*32768 + %60*128.
%62 = llvm.mul %58, %1 : i64 | |
%63 = llvm.add %57, %62 : i64 | |
%64 = llvm.mul %60, %2 : i64 | |
%65 = llvm.add %63, %64 : i64 | |
%66 = llvm.mul %10, %4 : i64 | |
%67 = llvm.add %65, %66 : i64 | |
// Output offset for this tile: %71 = %51 + %58*256 + %60.
%68 = llvm.mul %58, %3 : i64 | |
%69 = llvm.add %51, %68 : i64 | |
%70 = llvm.mul %60, %4 : i64 | |
%71 = llvm.add %69, %70 : i64 | |
llvm.br ^bb4(%12 : i64) | |
// Loop 3: %72 runs from 0 while < 4, incremented by 1 in ^bb11. Each
// iteration produces one output element.
^bb4(%72: i64): // 2 preds: ^bb3, ^bb11 | |
%73 = llvm.icmp "slt" %72, %14 : i64 | |
llvm.cond_br %73, ^bb5, ^bb12 | |
^bb5: // pred: ^bb4 | |
// %78 = %67 + %72*128 is the input offset of the row reduced here;
// %82 = %71 + %72 is the output element index.
%74 = llvm.mul %10, %9 : i64 | |
%75 = llvm.add %67, %74 : i64 | |
%76 = llvm.mul %72, %6 : i64 | |
%77 = llvm.add %75, %76 : i64 | |
%78 = llvm.add %77, %56 : i64 | |
%79 = llvm.mul %10, %14 : i64 | |
%80 = llvm.add %71, %79 : i64 | |
%81 = llvm.mul %72, %15 : i64 | |
%82 = llvm.add %80, %81 : i64 | |
// NOTE(review): this alloca sits inside the loop body, so it executes on
// every iteration of loops 1-3 — confirm that later passes hoist it or
// that the resulting stack growth is acceptable.
%83 = llvm.alloca %41 x i32 {alignment = 128 : i64} : (i64) -> !llvm.ptr<i32> | |
llvm.br ^bb6(%12 : i64) | |
// Loop 4: copies four i32 values from %42 into %83, one element at a time.
^bb6(%84: i64): // 2 preds: ^bb5, ^bb7 | |
%85 = llvm.icmp "slt" %84, %14 : i64 | |
llvm.cond_br %85, ^bb7, ^bb8 | |
^bb7: // pred: ^bb6 | |
%86 = llvm.add %44, %84 : i64 | |
%87 = llvm.getelementptr %42[%86] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32> | |
%88 = llvm.load %87 : !llvm.ptr<i32> | |
%89 = llvm.getelementptr %83[%86] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32> | |
llvm.store %88, %89 : !llvm.ptr<i32> | |
%90 = llvm.add %84, %15 : i64 | |
llvm.br ^bb6(%90 : i64) | |
^bb8: // pred: ^bb6 | |
// Load the seeded buffer as the initial vector<4xi32> accumulator.
%91 = llvm.bitcast %83 : !llvm.ptr<i32> to !llvm.ptr<vector<4xi32>> | |
%92 = llvm.load %91 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>> | |
llvm.br ^bb9(%12, %92 : i64, vector<4xi32>) | |
// Loop 5: %93 runs from 0 while < 32 (step 1); %94 carries the accumulator.
// Each iteration adds one vector<4xi32> loaded from the input at element
// offset %78 + %93*4, so 32*4 = 128 input elements are consumed in total.
^bb9(%93: i64, %94: vector<4xi32>): // 2 preds: ^bb8, ^bb10 | |
%95 = llvm.icmp "slt" %93, %16 : i64 | |
llvm.cond_br %95, ^bb10, ^bb11 | |
^bb10: // pred: ^bb9 | |
// %96 is 0*128, so %97 and %98 equal %78; %101 = %78 + %93*4.
%96 = llvm.mul %12, %6 : i64 | |
%97 = llvm.add %78, %96 : i64 | |
%98 = llvm.add %97, %96 : i64 | |
%99 = llvm.mul %93, %14 : i64 | |
%100 = llvm.add %98, %99 : i64 | |
%101 = llvm.add %100, %12 : i64 | |
%102 = llvm.getelementptr %20[%101] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32> | |
%103 = llvm.bitcast %102 : !llvm.ptr<i32> to !llvm.ptr<vector<4xi32>> | |
%104 = llvm.load %103 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>> | |
%105 = llvm.add %104, %94 : vector<4xi32> | |
%106 = llvm.add %93, %15 : i64 | |
llvm.br ^bb9(%106, %105 : i64, vector<4xi32>) | |
^bb11: // pred: ^bb9 | |
// Spill the accumulator to %83 and reload it, then horizontally reduce the
// four lanes and add the sum to the value currently stored at output[%82].
// NOTE(review): the output element is loaded (%109) before it is written;
// the result therefore includes whatever the output buffer held on entry —
// confirm the caller initializes it.
llvm.store %94, %91 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>> | |
%107 = llvm.load %91 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>> | |
%108 = llvm.getelementptr %28[%82] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32> | |
%109 = llvm.load %108 : !llvm.ptr<i32> | |
// The insert/extract pairs on vector<1xi32> only wrap and unwrap a scalar;
// %112 carries the value of %109 and %118 carries the value of %114.
%110 = llvm.mlir.undef : vector<1xi32> | |
%111 = llvm.insertelement %109, %110[%0 : i32] : vector<1xi32> | |
%112 = llvm.extractelement %111[%12 : i64] : vector<1xi32> | |
%113 = "llvm.intr.vector.reduce.add"(%107) : (vector<4xi32>) -> i32 | |
%114 = llvm.add %112, %113 : i32 | |
%115 = llvm.insertelement %114, %13[%12 : i64] : vector<1xi32> | |
%116 = llvm.extractelement %115[%10 : i64] : vector<1xi32> | |
%117 = llvm.insertelement %116, %110[%0 : i32] : vector<1xi32> | |
%118 = llvm.extractelement %117[%12 : i64] : vector<1xi32> | |
llvm.store %118, %108 : !llvm.ptr<i32> | |
%119 = llvm.add %72, %15 : i64 | |
llvm.br ^bb4(%119 : i64) | |
// Latch of loop 2: advance %60 by 4.
^bb12: // pred: ^bb4 | |
%120 = llvm.add %60, %14 : i64 | |
llvm.br ^bb2(%120 : i64) | |
// Latch of loop 1: advance %58 by 1.
^bb13: // pred: ^bb2 | |
%121 = llvm.add %58, %15 : i64 | |
llvm.br ^bb1(%121 : i64) | |
// Exit: return the i32 constant 0.
^bb14: // pred: ^bb1 | |
llvm.return %0 : i32 | |
} | |
} | |
} | |
} | |
// Host-side body of the split-reduction check after conversion to the HAL
// dialect. It runs two command buffers: the first fills the input buffer with
// the i32 value 1, the second dispatches the reduction kernel and fills the
// second half of the result buffer with 128. The two halves of the result
// buffer are then wrapped in buffer views and compared.
func.func private @_split_reduction_pass2() { | |
// Sizes and offsets used below (presumably byte counts — confirm):
//   67108864 = 512*256*128*4, 524288 = 512*256*4, 1048576 = 2*524288.
%c1048576 = arith.constant 1048576 : index | |
%c0 = arith.constant 0 : index | |
%c524288 = arith.constant 524288 : index | |
%c67108864 = arith.constant 67108864 : index | |
%c1_i32 = arith.constant 1 : i32 | |
%c128_i32 = arith.constant 128 : i32 | |
%c512 = arith.constant 512 : index | |
%c256 = arith.constant 256 : index | |
%c1 = arith.constant 1 : index | |
// Input buffer: device-local, size %c67108864.
%device = hal.ex.shared_device : !hal.device | |
%allocator = hal.device.allocator<%device : !hal.device> : !hal.allocator | |
%buffer = hal.allocator.allocate<%allocator : !hal.allocator> type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c67108864} | |
// Command buffer #1 (Transfer only): fill the whole input buffer with 1.
%device_0 = hal.ex.shared_device : !hal.device | |
%cmd = hal.command_buffer.create device(%device_0 : !hal.device) mode("OneShot|AllowInlineExecution") categories(Transfer) : !hal.command_buffer | |
hal.command_buffer.fill_buffer<%cmd : !hal.command_buffer> target(%buffer : !hal.buffer)[%c0, %c67108864] pattern(%c1_i32 : i32) | |
hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") | |
hal.command_buffer.finalize<%cmd : !hal.command_buffer> | |
// Submit with a null wait fence and a fresh signal fence, then block on the
// signal fence (timeout operand is -1) and check the returned status.
%0 = util.null : !hal.fence | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%c-1_i64 = arith.constant -1 : i64 | |
hal.device.queue.execute<%device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) commands([%cmd]) | |
%c-1_i32 = arith.constant -1 : i32 | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32 | |
util.status.check_ok %status, "failed to wait on timepoint" | |
// Hide the input buffer behind do_not_optimize and re-query its length.
%1 = util.do_not_optimize(%buffer) : !hal.buffer | |
%len = hal.buffer.length<%1 : !hal.buffer> : index | |
// Result buffer: host-visible and mappable, size %c1048576. The first half
// receives the kernel output, the second half the expected value.
// NOTE(review): nothing in this function writes the first half before the
// dispatch, and the kernel in this module loads from binding 1 before storing
// — confirm whether the comparison depends on uninitialized memory.
%device_1 = hal.ex.shared_device : !hal.device | |
%allocator_2 = hal.device.allocator<%device_1 : !hal.device> : !hal.allocator | |
%buffer_3 = hal.allocator.allocate<%allocator_2 : !hal.allocator> type("HostVisible|DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage|MappingScoped|MappingAccessRandom|Mapping") : !hal.buffer{%c1048576} | |
// Command buffer #2 (Transfer|Dispatch): kernel dispatch plus reference fill.
%device_4 = hal.ex.shared_device : !hal.device | |
%cmd_5 = hal.command_buffer.create device(%device_4 : !hal.device) mode("OneShot|AllowInlineExecution") categories("Transfer|Dispatch") : !hal.command_buffer | |
%2 = hal.command_buffer.device<%cmd_5 : !hal.command_buffer> : !hal.device | |
// The only switch case matches the "embedded-elf-x86_64" executable format.
hal.device.switch<%2 : !hal.device> | |
#hal.device.match.executable.format<"embedded-elf-x86_64"> { | |
%pipeline_layout = hal.pipeline_layout.lookup device(%2 : !hal.device) layout(#pipeline_layout) : !hal.pipeline_layout | |
%c0_22 = arith.constant 0 : index | |
%c1_23 = arith.constant 1 : index | |
%c0_24 = arith.constant 0 : index | |
// Set 0: binding 0 = input buffer [0, %len), binding 1 = result buffer
// [0, 1048576).
hal.command_buffer.push_descriptor_set<%cmd_5 : !hal.command_buffer> layout(%pipeline_layout : !hal.pipeline_layout)[%c0_24] bindings([ | |
%c0_22 = (%1 : !hal.buffer)[%c0, %len], | |
%c1_23 = (%buffer_3 : !hal.buffer)[%c0, %c1048576] | |
]) | |
// Dispatch the kernel with workgroups [8, 16, 1].
%c1_25 = arith.constant 1 : index | |
%c16 = arith.constant 16 : index | |
%c8 = arith.constant 8 : index | |
hal.command_buffer.dispatch.symbol<%cmd_5 : !hal.command_buffer> target(@_split_reduction_pass2_dispatch_0::@embedded_elf_x86_64::@_split_reduction_pass2_dispatch_0_generic_512x256x128) workgroups([%c8, %c16, %c1_25]) | |
hal.return | |
} | |
// Expected value: fill [524288, 1048576) of the result buffer with 128. No
// barrier is recorded between the dispatch and this fill.
hal.command_buffer.fill_buffer<%cmd_5 : !hal.command_buffer> target(%buffer_3 : !hal.buffer)[%c524288, %c524288] pattern(%c128_i32 : i32) | |
hal.command_buffer.execution_barrier<%cmd_5 : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") | |
hal.command_buffer.finalize<%cmd_5 : !hal.command_buffer> | |
// Submit and wait, same pattern as for command buffer #1.
%3 = util.null : !hal.fence | |
%fence_6 = hal.fence.create device(%device_4 : !hal.device) flags("None") : !hal.fence | |
%c-1_i64_7 = arith.constant -1 : i64 | |
hal.device.queue.execute<%device_4 : !hal.device> affinity(%c-1_i64_7) wait(%3) signal(%fence_6) commands([%cmd_5]) | |
%c-1_i32_8 = arith.constant -1 : i32 | |
%status_9 = hal.fence.await until([%fence_6]) timeout_millis(%c-1_i32_8) : i32 | |
util.status.check_ok %status_9, "failed to wait on timepoint" | |
// Split the result buffer into computed (%buffer_10) and expected
// (%buffer_11) halves.
%buffer_10 = hal.buffer.subspan<%buffer_3 : !hal.buffer>[%c0, %c524288] : !hal.buffer | |
%buffer_11 = hal.buffer.subspan<%buffer_3 : !hal.buffer>[%c524288, %c524288] : !hal.buffer | |
// Wrap each half in a 512x256 buffer view. The type operand 268435488 and
// encoding operand 1 are opaque here (presumably i32 element type and a dense
// row-major encoding — TODO confirm against the HAL enum definitions).
%c512_12 = arith.constant 512 : index | |
%c256_13 = arith.constant 256 : index | |
%c0_14 = arith.constant 0 : index | |
%c268435488_i32 = arith.constant 268435488 : i32 | |
%c1_i32_15 = arith.constant 1 : i32 | |
%view = hal.buffer_view.create buffer(%buffer_10 : !hal.buffer)[%c0_14, %c524288] shape([%c512_12, %c256_13]) type(%c268435488_i32) encoding(%c1_i32_15) : !hal.buffer_view | |
%c512_16 = arith.constant 512 : index | |
%c256_17 = arith.constant 256 : index | |
%c0_18 = arith.constant 0 : index | |
%c268435488_i32_19 = arith.constant 268435488 : i32 | |
%c1_i32_20 = arith.constant 1 : i32 | |
%view_21 = hal.buffer_view.create buffer(%buffer_11 : !hal.buffer)[%c0_18, %c524288] shape([%c512_16, %c256_17]) type(%c268435488_i32_19) encoding(%c1_i32_20) : !hal.buffer_view | |
// Compare the computed view against the expected view.
check.expect_eq(%view, %view_21) : !hal.buffer_view | |
return | |
} | |
} | |
// -----// IR Dump After mlir::iree_compiler::IREE::HAL::FixupLegacySyncPass (iree-hal-fixup-legacy-sync) ('builtin.module' operation) //----- // | |
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}> | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]> | |
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert> | |
module attributes {hal.device.targets = [#device_target_llvm_cpu]} { | |
// Public entry point carrying the iree.abi.stub attribute; it takes no
// arguments, returns nothing, and only forwards to the private
// @_split_reduction_pass2 implementation.
func.func @split_reduction_pass2() attributes {iree.abi.stub} { | |
call @_split_reduction_pass2() : () -> () | |
return | |
} | |
// Executable holding the single reduction dispatch, with one variant for the
// embedded-elf-x86_64 target. It contains an export region that yields three
// index constants and an LLVM-dialect function implementing the kernel.
hal.executable private @_split_reduction_pass2_dispatch_0 { | |
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ { | |
// Export region: ignores its device/index arguments and returns the
// constants (8, 16, 1). These match the workgroups([8, 16, 1]) operand at the
// dispatch site in the host function of this module.
hal.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 ordinal(0) layout(#pipeline_layout) attributes {translation_info = #translation} { | |
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index): | |
%c1 = arith.constant 1 : index | |
%c16 = arith.constant 16 : index | |
%c8 = arith.constant 8 : index | |
hal.return %c8, %c16, %c1 : index, index, index | |
} | |
builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} { | |
// Kernel body. %arg0 is unused in the code below; %arg1 is the dispatch state
// (its field 10 is loaded as a table of buffer pointers) and %arg2 is the
// workgroup state (fields 0 and 1 are read). Always returns the i32 constant 0.
llvm.func internal @_split_reduction_pass2_dispatch_0_generic_512x256x128(%arg0: !llvm.ptr<struct<"iree_hal_executable_environment_v0_t", (ptr<i32>, ptr<func<i32 (ptr<func<i32 (ptr<i8>)>>, ptr<i8>)>>, ptr<ptr<func<i32 (ptr<i8>)>>>, struct<"iree_hal_processor_v0_t", (array<8 x i64>)>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>> {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {sym_visibility = "private"} { | |
// Constant pool. Several values are duplicated because some were created
// with an i64 attribute and others with an index attribute.
%0 = llvm.mlir.constant(0 : i32) : i32 | |
%1 = llvm.mlir.constant(32768 : i64) : i64 | |
%2 = llvm.mlir.constant(128 : i64) : i64 | |
%3 = llvm.mlir.constant(256 : i64) : i64 | |
%4 = llvm.mlir.constant(1 : i64) : i64 | |
%5 = llvm.mlir.constant(63 : index) : i64 | |
%6 = llvm.mlir.constant(128 : index) : i64 | |
%7 = llvm.mlir.constant(256 : index) : i64 | |
%8 = llvm.mlir.constant(32768 : index) : i64 | |
%9 = llvm.mlir.constant(512 : index) : i64 | |
%10 = llvm.mlir.constant(0 : i64) : i64 | |
%11 = llvm.mlir.constant(dense<0> : vector<4xi32>) : vector<4xi32> | |
%12 = llvm.mlir.constant(0 : index) : i64 | |
%13 = llvm.mlir.constant(dense<0> : vector<1xi32>) : vector<1xi32> | |
%14 = llvm.mlir.constant(4 : index) : i64 | |
%15 = llvm.mlir.constant(1 : index) : i64 | |
%16 = llvm.mlir.constant(32 : index) : i64 | |
// Pointer #0 from the dispatch-state pointer table, viewed as i32*. It is
// only loaded from below (the input). The assume states its low 6 bits are
// zero, i.e. 64-byte alignment.
%17 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> | |
%18 = llvm.extractvalue %17[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)> | |
%19 = llvm.load %18 : !llvm.ptr<ptr<i8>> | |
%20 = llvm.bitcast %19 : !llvm.ptr<i8> to !llvm.ptr<i32> | |
%21 = llvm.ptrtoint %20 : !llvm.ptr<i32> to i64 | |
%22 = llvm.and %21, %5 : i64 | |
%23 = llvm.icmp "eq" %22, %12 : i64 | |
"llvm.intr.assume"(%23) : (i1) -> () | |
// Pointer #1 from the same table, viewed as i32*. It is both loaded from
// and stored to below (the output). Same 64-byte alignment assumption.
%24 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> | |
%25 = llvm.extractvalue %24[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)> | |
%26 = llvm.getelementptr %25[1] : (!llvm.ptr<ptr<i8>>) -> !llvm.ptr<ptr<i8>> | |
%27 = llvm.load %26 : !llvm.ptr<ptr<i8>> | |
%28 = llvm.bitcast %27 : !llvm.ptr<i8> to !llvm.ptr<i32> | |
%29 = llvm.ptrtoint %28 : !llvm.ptr<i32> to i64 | |
%30 = llvm.and %29, %5 : i64 | |
%31 = llvm.icmp "eq" %30, %12 : i64 | |
"llvm.intr.assume"(%31) : (i1) -> () | |
// Workgroup-state fields 0 and 1, zero-extended and scaled by 32
// (presumably workgroup id x / y times a 32-wide tile — confirm against the
// iree_hal_executable_workgroup_state_v0_t definition).
%32 = llvm.load %arg2 : !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>> | |
%33 = llvm.extractvalue %32[0] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)> | |
%34 = llvm.zext %33 : i32 to i64 | |
%35 = llvm.extractvalue %32[1] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)> | |
%36 = llvm.zext %35 : i32 to i64 | |
%37 = llvm.mul %36, %16 : i64 | |
%38 = llvm.mul %34, %16 : i64 | |
// %41 is the integer value of a null-based GEP at index 4 over i32; it is
// used as the element count of both allocas in this function.
%39 = llvm.mlir.null : !llvm.ptr<i32> | |
%40 = llvm.getelementptr %39[4] : (!llvm.ptr<i32>) -> !llvm.ptr<i32> | |
%41 = llvm.ptrtoint %40 : !llvm.ptr<i32> to i64 | |
// Stack buffer %42 holds the zero vector used to seed each accumulator;
// %43..%45 all evaluate to 0 (0*4, 0+0, 0+0).
%42 = llvm.alloca %41 x i32 {alignment = 128 : i64} : (i64) -> !llvm.ptr<i32> | |
%43 = llvm.mul %12, %14 : i64 | |
%44 = llvm.add %43, %43 : i64 | |
%45 = llvm.add %44, %12 : i64 | |
%46 = llvm.getelementptr %42[%45] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32> | |
%47 = llvm.bitcast %46 : !llvm.ptr<i32> to !llvm.ptr<vector<4xi32>> | |
llvm.store %11, %47 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>> | |
// Output base element offset for this workgroup: %51 = %37*256 + %38.
%48 = llvm.mul %37, %7 : i64 | |
%49 = llvm.add %48, %12 : i64 | |
%50 = llvm.mul %38, %15 : i64 | |
%51 = llvm.add %49, %50 : i64 | |
// Input base element offset for this workgroup: %57 = %37*32768 + %38*128
// (%56 is 0*1 and contributes nothing).
%52 = llvm.mul %37, %8 : i64 | |
%53 = llvm.add %52, %12 : i64 | |
%54 = llvm.mul %38, %6 : i64 | |
%55 = llvm.add %53, %54 : i64 | |
%56 = llvm.mul %10, %15 : i64 | |
%57 = llvm.add %55, %56 : i64 | |
llvm.br ^bb1(%12 : i64) | |
// Loop 1: %58 runs from 0 while < 32, incremented by 1 in ^bb13.
^bb1(%58: i64): // 2 preds: ^bb0, ^bb13 | |
%59 = llvm.icmp "slt" %58, %16 : i64 | |
llvm.cond_br %59, ^bb2(%12 : i64), ^bb14 | |
// Loop 2: %60 runs from 0 while < 32, incremented by 4 in ^bb12.
^bb2(%60: i64): // 2 preds: ^bb1, ^bb12 | |
%61 = llvm.icmp "slt" %60, %16 : i64 | |
llvm.cond_br %61, ^bb3, ^bb13 | |
^bb3: // pred: ^bb2 | |
// Input offset for this (%58, %60) tile: %67 = %57 + %58*32768 + %60*128.
%62 = llvm.mul %58, %1 : i64 | |
%63 = llvm.add %57, %62 : i64 | |
%64 = llvm.mul %60, %2 : i64 | |
%65 = llvm.add %63, %64 : i64 | |
%66 = llvm.mul %10, %4 : i64 | |
%67 = llvm.add %65, %66 : i64 | |
// Output offset for this tile: %71 = %51 + %58*256 + %60.
%68 = llvm.mul %58, %3 : i64 | |
%69 = llvm.add %51, %68 : i64 | |
%70 = llvm.mul %60, %4 : i64 | |
%71 = llvm.add %69, %70 : i64 | |
llvm.br ^bb4(%12 : i64) | |
// Loop 3: %72 runs from 0 while < 4, incremented by 1 in ^bb11. Each
// iteration produces one output element.
^bb4(%72: i64): // 2 preds: ^bb3, ^bb11 | |
%73 = llvm.icmp "slt" %72, %14 : i64 | |
llvm.cond_br %73, ^bb5, ^bb12 | |
^bb5: // pred: ^bb4 | |
// %78 = %67 + %72*128 is the input offset of the row reduced here;
// %82 = %71 + %72 is the output element index.
%74 = llvm.mul %10, %9 : i64 | |
%75 = llvm.add %67, %74 : i64 | |
%76 = llvm.mul %72, %6 : i64 | |
%77 = llvm.add %75, %76 : i64 | |
%78 = llvm.add %77, %56 : i64 | |
%79 = llvm.mul %10, %14 : i64 | |
%80 = llvm.add %71, %79 : i64 | |
%81 = llvm.mul %72, %15 : i64 | |
%82 = llvm.add %80, %81 : i64 | |
// NOTE(review): this alloca sits inside the loop body, so it executes on
// every iteration of loops 1-3 — confirm that later passes hoist it or
// that the resulting stack growth is acceptable.
%83 = llvm.alloca %41 x i32 {alignment = 128 : i64} : (i64) -> !llvm.ptr<i32> | |
llvm.br ^bb6(%12 : i64) | |
// Loop 4: copies four i32 values from %42 into %83, one element at a time.
^bb6(%84: i64): // 2 preds: ^bb5, ^bb7 | |
%85 = llvm.icmp "slt" %84, %14 : i64 | |
llvm.cond_br %85, ^bb7, ^bb8 | |
^bb7: // pred: ^bb6 | |
%86 = llvm.add %44, %84 : i64 | |
%87 = llvm.getelementptr %42[%86] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32> | |
%88 = llvm.load %87 : !llvm.ptr<i32> | |
%89 = llvm.getelementptr %83[%86] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32> | |
llvm.store %88, %89 : !llvm.ptr<i32> | |
%90 = llvm.add %84, %15 : i64 | |
llvm.br ^bb6(%90 : i64) | |
^bb8: // pred: ^bb6 | |
// Load the seeded buffer as the initial vector<4xi32> accumulator.
%91 = llvm.bitcast %83 : !llvm.ptr<i32> to !llvm.ptr<vector<4xi32>> | |
%92 = llvm.load %91 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>> | |
llvm.br ^bb9(%12, %92 : i64, vector<4xi32>) | |
// Loop 5: %93 runs from 0 while < 32 (step 1); %94 carries the accumulator.
// Each iteration adds one vector<4xi32> loaded from the input at element
// offset %78 + %93*4, so 32*4 = 128 input elements are consumed in total.
^bb9(%93: i64, %94: vector<4xi32>): // 2 preds: ^bb8, ^bb10 | |
%95 = llvm.icmp "slt" %93, %16 : i64 | |
llvm.cond_br %95, ^bb10, ^bb11 | |
^bb10: // pred: ^bb9 | |
// %96 is 0*128, so %97 and %98 equal %78; %101 = %78 + %93*4.
%96 = llvm.mul %12, %6 : i64 | |
%97 = llvm.add %78, %96 : i64 | |
%98 = llvm.add %97, %96 : i64 | |
%99 = llvm.mul %93, %14 : i64 | |
%100 = llvm.add %98, %99 : i64 | |
%101 = llvm.add %100, %12 : i64 | |
%102 = llvm.getelementptr %20[%101] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32> | |
%103 = llvm.bitcast %102 : !llvm.ptr<i32> to !llvm.ptr<vector<4xi32>> | |
%104 = llvm.load %103 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>> | |
%105 = llvm.add %104, %94 : vector<4xi32> | |
%106 = llvm.add %93, %15 : i64 | |
llvm.br ^bb9(%106, %105 : i64, vector<4xi32>) | |
^bb11: // pred: ^bb9 | |
// Spill the accumulator to %83 and reload it, then horizontally reduce the
// four lanes and add the sum to the value currently stored at output[%82].
// NOTE(review): the output element is loaded (%109) before it is written;
// the result therefore includes whatever the output buffer held on entry —
// confirm the caller initializes it.
llvm.store %94, %91 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>> | |
%107 = llvm.load %91 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>> | |
%108 = llvm.getelementptr %28[%82] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32> | |
%109 = llvm.load %108 : !llvm.ptr<i32> | |
// The insert/extract pairs on vector<1xi32> only wrap and unwrap a scalar;
// %112 carries the value of %109 and %118 carries the value of %114.
%110 = llvm.mlir.undef : vector<1xi32> | |
%111 = llvm.insertelement %109, %110[%0 : i32] : vector<1xi32> | |
%112 = llvm.extractelement %111[%12 : i64] : vector<1xi32> | |
%113 = "llvm.intr.vector.reduce.add"(%107) : (vector<4xi32>) -> i32 | |
%114 = llvm.add %112, %113 : i32 | |
%115 = llvm.insertelement %114, %13[%12 : i64] : vector<1xi32> | |
%116 = llvm.extractelement %115[%10 : i64] : vector<1xi32> | |
%117 = llvm.insertelement %116, %110[%0 : i32] : vector<1xi32> | |
%118 = llvm.extractelement %117[%12 : i64] : vector<1xi32> | |
llvm.store %118, %108 : !llvm.ptr<i32> | |
%119 = llvm.add %72, %15 : i64 | |
llvm.br ^bb4(%119 : i64) | |
// Latch of loop 2: advance %60 by 4.
^bb12: // pred: ^bb4 | |
%120 = llvm.add %60, %14 : i64 | |
llvm.br ^bb2(%120 : i64) | |
// Latch of loop 1: advance %58 by 1.
^bb13: // pred: ^bb2 | |
%121 = llvm.add %58, %15 : i64 | |
llvm.br ^bb1(%121 : i64) | |
// Exit: return the i32 constant 0.
^bb14: // pred: ^bb1 | |
llvm.return %0 : i32 | |
} | |
} | |
} | |
} | |
func.func private @_split_reduction_pass2() { | |
%c1048576 = arith.constant 1048576 : index | |
%c0 = arith.constant 0 : index | |
%c524288 = arith.constant 524288 : index | |
%c67108864 = arith.constant 67108864 : index | |
%c1_i32 = arith.constant 1 : i32 | |
%c128_i32 = arith.constant 128 : i32 | |
%c512 = arith.constant 512 : index | |
%c256 = arith.constant 256 : index | |
%c1 = arith.constant 1 : index | |
%device = hal.ex.shared_device : !hal.device | |
%allocator = hal.device.allocator<%device : !hal.device> : !hal.allocator | |
%buffer = hal.allocator.allocate<%allocator : !hal.allocator> type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c67108864} | |
%device_0 = hal.ex.shared_device : !hal.device | |
%cmd = hal.command_buffer.create device(%device_0 : !hal.device) mode("OneShot|AllowInlineExecution") categories(Transfer) : !hal.command_buffer | |
hal.command_buffer.fill_buffer<%cmd : !hal.command_buffer> target(%buffer : !hal.buffer)[%c0, %c67108864] pattern(%c1_i32 : i32) | |
hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") | |
hal.command_buffer.finalize<%cmd : !hal.command_buffer> | |
%0 = util.null : !hal.fence | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%c-1_i64 = arith.constant -1 : i64 | |
%c-1_i32 = arith.constant -1 : i32 | |
%status = hal.fence.await until([%0]) timeout_millis(%c-1_i32) : i32 | |
hal.device.queue.execute<%device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) commands([%cmd]) | |
%c-1_i32_1 = arith.constant -1 : i32 | |
%status_2 = hal.fence.await until([%fence]) timeout_millis(%c-1_i32_1) : i32 | |
util.status.check_ok %status_2, "failed to wait on timepoint" | |
%1 = util.do_not_optimize(%buffer) : !hal.buffer | |
%len = hal.buffer.length<%1 : !hal.buffer> : index | |
%device_3 = hal.ex.shared_device : !hal.device | |
%allocator_4 = hal.device.allocator<%device_3 : !hal.device> : !hal.allocator | |
%buffer_5 = hal.allocator.allocate<%allocator_4 : !hal.allocator> type("HostVisible|DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage|MappingScoped|MappingAccessRandom|Mapping") : !hal.buffer{%c1048576} | |
%device_6 = hal.ex.shared_device : !hal.device | |
%cmd_7 = hal.command_buffer.create device(%device_6 : !hal.device) mode("OneShot|AllowInlineExecution") categories("Transfer|Dispatch") : !hal.command_buffer | |
%2 = hal.command_buffer.device<%cmd_7 : !hal.command_buffer> : !hal.device | |
hal.device.switch<%2 : !hal.device> | |
#hal.device.match.executable.format<"embedded-elf-x86_64"> { | |
%pipeline_layout = hal.pipeline_layout.lookup device(%2 : !hal.device) layout(#pipeline_layout) : !hal.pipeline_layout | |
%c0_26 = arith.constant 0 : index | |
%c1_27 = arith.constant 1 : index | |
%c0_28 = arith.constant 0 : index | |
hal.command_buffer.push_descriptor_set<%cmd_7 : !hal.command_buffer> layout(%pipeline_layout : !hal.pipeline_layout)[%c0_28] bindings([ | |
%c0_26 = (%1 : !hal.buffer)[%c0, %len], | |
%c1_27 = (%buffer_5 : !hal.buffer)[%c0, %c1048576] | |
]) | |
%c1_29 = arith.constant 1 : index | |
%c16 = arith.constant 16 : index | |
%c8 = arith.constant 8 : index | |
hal.command_buffer.dispatch.symbol<%cmd_7 : !hal.command_buffer> target(@_split_reduction_pass2_dispatch_0::@embedded_elf_x86_64::@_split_reduction_pass2_dispatch_0_generic_512x256x128) workgroups([%c8, %c16, %c1_29]) | |
hal.return | |
} | |
hal.command_buffer.fill_buffer<%cmd_7 : !hal.command_buffer> target(%buffer_5 : !hal.buffer)[%c524288, %c524288] pattern(%c128_i32 : i32) | |
hal.command_buffer.execution_barrier<%cmd_7 : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") | |
hal.command_buffer.finalize<%cmd_7 : !hal.command_buffer> | |
%3 = util.null : !hal.fence | |
%fence_8 = hal.fence.create device(%device_6 : !hal.device) flags("None") : !hal.fence | |
%c-1_i64_9 = arith.constant -1 : i64 | |
%c-1_i32_10 = arith.constant -1 : i32 | |
%status_11 = hal.fence.await until([%3]) timeout_millis(%c-1_i32_10) : i32 | |
hal.device.queue.execute<%device_6 : !hal.device> affinity(%c-1_i64_9) wait(%3) signal(%fence_8) commands([%cmd_7]) | |
%c-1_i32_12 = arith.constant -1 : i32 | |
%status_13 = hal.fence.await until([%fence_8]) timeout_millis(%c-1_i32_12) : i32 | |
util.status.check_ok %status_13, "failed to wait on timepoint" | |
%buffer_14 = hal.buffer.subspan<%buffer_5 : !hal.buffer>[%c0, %c524288] : !hal.buffer | |
%buffer_15 = hal.buffer.subspan<%buffer_5 : !hal.buffer>[%c524288, %c524288] : !hal.buffer | |
%c512_16 = arith.constant 512 : index | |
%c256_17 = arith.constant 256 : index | |
%c0_18 = arith.constant 0 : index | |
%c268435488_i32 = arith.constant 268435488 : i32 | |
%c1_i32_19 = arith.constant 1 : i32 | |
%view = hal.buffer_view.create buffer(%buffer_14 : !hal.buffer)[%c0_18, %c524288] shape([%c512_16, %c256_17]) type(%c268435488_i32) encoding(%c1_i32_19) : !hal.buffer_view | |
%c512_20 = arith.constant 512 : index | |
%c256_21 = arith.constant 256 : index | |
%c0_22 = arith.constant 0 : index | |
%c268435488_i32_23 = arith.constant 268435488 : i32 | |
%c1_i32_24 = arith.constant 1 : i32 | |
%view_25 = hal.buffer_view.create buffer(%buffer_15 : !hal.buffer)[%c0_22, %c524288] shape([%c512_20, %c256_21]) type(%c268435488_i32_23) encoding(%c1_i32_24) : !hal.buffer_view | |
check.expect_eq(%view, %view_25) : !hal.buffer_view | |
return | |
} | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) ('builtin.module' operation) //----- // | |
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}> | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]> | |
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert> | |
module attributes {hal.device.targets = [#device_target_llvm_cpu]} { | |
func.func @split_reduction_pass2() attributes {iree.abi.stub} { | |
call @_split_reduction_pass2() : () -> () | |
return | |
} | |
hal.executable private @_split_reduction_pass2_dispatch_0 { | |
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ { | |
hal.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 ordinal(0) layout(#pipeline_layout) attributes {translation_info = #translation} { | |
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index): | |
%c1 = arith.constant 1 : index | |
%c16 = arith.constant 16 : index | |
%c8 = arith.constant 8 : index | |
hal.return %c8, %c16, %c1 : index, index, index | |
} | |
builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} { | |
llvm.func internal @_split_reduction_pass2_dispatch_0_generic_512x256x128(%arg0: !llvm.ptr<struct<"iree_hal_executable_environment_v0_t", (ptr<i32>, ptr<func<i32 (ptr<func<i32 (ptr<i8>)>>, ptr<i8>)>>, ptr<ptr<func<i32 (ptr<i8>)>>>, struct<"iree_hal_processor_v0_t", (array<8 x i64>)>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>> {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {sym_visibility = "private"} { | |
%0 = llvm.mlir.constant(0 : i32) : i32 | |
%1 = llvm.mlir.constant(32768 : i64) : i64 | |
%2 = llvm.mlir.constant(128 : i64) : i64 | |
%3 = llvm.mlir.constant(256 : i64) : i64 | |
%4 = llvm.mlir.constant(1 : i64) : i64 | |
%5 = llvm.mlir.constant(63 : index) : i64 | |
%6 = llvm.mlir.constant(128 : index) : i64 | |
%7 = llvm.mlir.constant(256 : index) : i64 | |
%8 = llvm.mlir.constant(32768 : index) : i64 | |
%9 = llvm.mlir.constant(512 : index) : i64 | |
%10 = llvm.mlir.constant(0 : i64) : i64 | |
%11 = llvm.mlir.constant(dense<0> : vector<4xi32>) : vector<4xi32> | |
%12 = llvm.mlir.constant(0 : index) : i64 | |
%13 = llvm.mlir.constant(dense<0> : vector<1xi32>) : vector<1xi32> | |
%14 = llvm.mlir.constant(4 : index) : i64 | |
%15 = llvm.mlir.constant(1 : index) : i64 | |
%16 = llvm.mlir.constant(32 : index) : i64 | |
%17 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> | |
%18 = llvm.extractvalue %17[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)> | |
%19 = llvm.load %18 : !llvm.ptr<ptr<i8>> | |
%20 = llvm.bitcast %19 : !llvm.ptr<i8> to !llvm.ptr<i32> | |
%21 = llvm.ptrtoint %20 : !llvm.ptr<i32> to i64 | |
%22 = llvm.and %21, %5 : i64 | |
%23 = llvm.icmp "eq" %22, %12 : i64 | |
"llvm.intr.assume"(%23) : (i1) -> () | |
%24 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> | |
%25 = llvm.extractvalue %24[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)> | |
%26 = llvm.getelementptr %25[1] : (!llvm.ptr<ptr<i8>>) -> !llvm.ptr<ptr<i8>> | |
%27 = llvm.load %26 : !llvm.ptr<ptr<i8>> | |
%28 = llvm.bitcast %27 : !llvm.ptr<i8> to !llvm.ptr<i32> | |
%29 = llvm.ptrtoint %28 : !llvm.ptr<i32> to i64 | |
%30 = llvm.and %29, %5 : i64 | |
%31 = llvm.icmp "eq" %30, %12 : i64 | |
"llvm.intr.assume"(%31) : (i1) -> () | |
%32 = llvm.load %arg2 : !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>> | |
%33 = llvm.extractvalue %32[0] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)> | |
%34 = llvm.zext %33 : i32 to i64 | |
%35 = llvm.extractvalue %32[1] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)> | |
%36 = llvm.zext %35 : i32 to i64 | |
%37 = llvm.mul %36, %16 : i64 | |
%38 = llvm.mul %34, %16 : i64 | |
%39 = llvm.mlir.null : !llvm.ptr<i32> | |
%40 = llvm.getelementptr %39[4] : (!llvm.ptr<i32>) -> !llvm.ptr<i32> | |
%41 = llvm.ptrtoint %40 : !llvm.ptr<i32> to i64 | |
%42 = llvm.alloca %41 x i32 {alignment = 128 : i64} : (i64) -> !llvm.ptr<i32> | |
%43 = llvm.mul %12, %14 : i64 | |
%44 = llvm.add %43, %43 : i64 | |
%45 = llvm.add %44, %12 : i64 | |
%46 = llvm.getelementptr %42[%45] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32> | |
%47 = llvm.bitcast %46 : !llvm.ptr<i32> to !llvm.ptr<vector<4xi32>> | |
llvm.store %11, %47 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>> | |
%48 = llvm.mul %37, %7 : i64 | |
%49 = llvm.add %48, %12 : i64 | |
%50 = llvm.mul %38, %15 : i64 | |
%51 = llvm.add %49, %50 : i64 | |
%52 = llvm.mul %37, %8 : i64 | |
%53 = llvm.add %52, %12 : i64 | |
%54 = llvm.mul %38, %6 : i64 | |
%55 = llvm.add %53, %54 : i64 | |
%56 = llvm.mul %10, %15 : i64 | |
%57 = llvm.add %55, %56 : i64 | |
llvm.br ^bb1(%12 : i64) | |
^bb1(%58: i64): // 2 preds: ^bb0, ^bb13 | |
%59 = llvm.icmp "slt" %58, %16 : i64 | |
llvm.cond_br %59, ^bb2(%12 : i64), ^bb14 | |
^bb2(%60: i64): // 2 preds: ^bb1, ^bb12 | |
%61 = llvm.icmp "slt" %60, %16 : i64 | |
llvm.cond_br %61, ^bb3, ^bb13 | |
^bb3: // pred: ^bb2 | |
%62 = llvm.mul %58, %1 : i64 | |
%63 = llvm.add %57, %62 : i64 | |
%64 = llvm.mul %60, %2 : i64 | |
%65 = llvm.add %63, %64 : i64 | |
%66 = llvm.mul %10, %4 : i64 | |
%67 = llvm.add %65, %66 : i64 | |
%68 = llvm.mul %58, %3 : i64 | |
%69 = llvm.add %51, %68 : i64 | |
%70 = llvm.mul %60, %4 : i64 | |
%71 = llvm.add %69, %70 : i64 | |
llvm.br ^bb4(%12 : i64) | |
^bb4(%72: i64): // 2 preds: ^bb3, ^bb11 | |
%73 = llvm.icmp "slt" %72, %14 : i64 | |
llvm.cond_br %73, ^bb5, ^bb12 | |
^bb5: // pred: ^bb4 | |
%74 = llvm.mul %10, %9 : i64 | |
%75 = llvm.add %67, %74 : i64 | |
%76 = llvm.mul %72, %6 : i64 | |
%77 = llvm.add %75, %76 : i64 | |
%78 = llvm.add %77, %56 : i64 | |
%79 = llvm.mul %10, %14 : i64 | |
%80 = llvm.add %71, %79 : i64 | |
%81 = llvm.mul %72, %15 : i64 | |
%82 = llvm.add %80, %81 : i64 | |
%83 = llvm.alloca %41 x i32 {alignment = 128 : i64} : (i64) -> !llvm.ptr<i32> | |
llvm.br ^bb6(%12 : i64) | |
^bb6(%84: i64): // 2 preds: ^bb5, ^bb7 | |
%85 = llvm.icmp "slt" %84, %14 : i64 | |
llvm.cond_br %85, ^bb7, ^bb8 | |
^bb7: // pred: ^bb6 | |
%86 = llvm.add %44, %84 : i64 | |
%87 = llvm.getelementptr %42[%86] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32> | |
%88 = llvm.load %87 : !llvm.ptr<i32> | |
%89 = llvm.getelementptr %83[%86] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32> | |
llvm.store %88, %89 : !llvm.ptr<i32> | |
%90 = llvm.add %84, %15 : i64 | |
llvm.br ^bb6(%90 : i64) | |
^bb8: // pred: ^bb6 | |
%91 = llvm.bitcast %83 : !llvm.ptr<i32> to !llvm.ptr<vector<4xi32>> | |
%92 = llvm.load %91 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>> | |
llvm.br ^bb9(%12, %92 : i64, vector<4xi32>) | |
^bb9(%93: i64, %94: vector<4xi32>): // 2 preds: ^bb8, ^bb10 | |
%95 = llvm.icmp "slt" %93, %16 : i64 | |
llvm.cond_br %95, ^bb10, ^bb11 | |
^bb10: // pred: ^bb9 | |
%96 = llvm.mul %12, %6 : i64 | |
%97 = llvm.add %78, %96 : i64 | |
%98 = llvm.add %97, %96 : i64 | |
%99 = llvm.mul %93, %14 : i64 | |
%100 = llvm.add %98, %99 : i64 | |
%101 = llvm.add %100, %12 : i64 | |
%102 = llvm.getelementptr %20[%101] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32> | |
%103 = llvm.bitcast %102 : !llvm.ptr<i32> to !llvm.ptr<vector<4xi32>> | |
%104 = llvm.load %103 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>> | |
%105 = llvm.add %104, %94 : vector<4xi32> | |
%106 = llvm.add %93, %15 : i64 | |
llvm.br ^bb9(%106, %105 : i64, vector<4xi32>) | |
^bb11: // pred: ^bb9 | |
llvm.store %94, %91 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>> | |
%107 = llvm.load %91 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>> | |
%108 = llvm.getelementptr %28[%82] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32> | |
%109 = llvm.load %108 : !llvm.ptr<i32> | |
%110 = llvm.mlir.undef : vector<1xi32> | |
%111 = llvm.insertelement %109, %110[%0 : i32] : vector<1xi32> | |
%112 = llvm.extractelement %111[%12 : i64] : vector<1xi32> | |
%113 = "llvm.intr.vector.reduce.add"(%107) : (vector<4xi32>) -> i32 | |
%114 = llvm.add %112, %113 : i32 | |
%115 = llvm.insertelement %114, %13[%12 : i64] : vector<1xi32> | |
%116 = llvm.extractelement %115[%10 : i64] : vector<1xi32> | |
%117 = llvm.insertelement %116, %110[%0 : i32] : vector<1xi32> | |
%118 = llvm.extractelement %117[%12 : i64] : vector<1xi32> | |
llvm.store %118, %108 : !llvm.ptr<i32> | |
%119 = llvm.add %72, %15 : i64 | |
llvm.br ^bb4(%119 : i64) | |
^bb12: // pred: ^bb4 | |
%120 = llvm.add %60, %14 : i64 | |
llvm.br ^bb2(%120 : i64) | |
^bb13: // pred: ^bb2 | |
%121 = llvm.add %58, %15 : i64 | |
llvm.br ^bb1(%121 : i64) | |
^bb14: // pred: ^bb1 | |
llvm.return %0 : i32 | |
} | |
} | |
} | |
} | |
func.func private @_split_reduction_pass2() { | |
%c268435488_i32 = arith.constant 268435488 : i32 | |
%c8 = arith.constant 8 : index | |
%c16 = arith.constant 16 : index | |
%c-1_i32 = arith.constant -1 : i32 | |
%c-1_i64 = arith.constant -1 : i64 | |
%c1048576 = arith.constant 1048576 : index | |
%c0 = arith.constant 0 : index | |
%c524288 = arith.constant 524288 : index | |
%c67108864 = arith.constant 67108864 : index | |
%c1_i32 = arith.constant 1 : i32 | |
%c128_i32 = arith.constant 128 : i32 | |
%c512 = arith.constant 512 : index | |
%c256 = arith.constant 256 : index | |
%c1 = arith.constant 1 : index | |
%device = hal.ex.shared_device : !hal.device | |
%allocator = hal.device.allocator<%device : !hal.device> : !hal.allocator | |
%buffer = hal.allocator.allocate<%allocator : !hal.allocator> type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c67108864} | |
%device_0 = hal.ex.shared_device : !hal.device | |
%cmd = hal.command_buffer.create device(%device_0 : !hal.device) mode("OneShot|AllowInlineExecution") categories(Transfer) : !hal.command_buffer | |
hal.command_buffer.fill_buffer<%cmd : !hal.command_buffer> target(%buffer : !hal.buffer)[%c0, %c67108864] pattern(%c1_i32 : i32) | |
hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") | |
hal.command_buffer.finalize<%cmd : !hal.command_buffer> | |
%0 = util.null : !hal.fence | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
hal.device.queue.execute<%device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) commands([%cmd]) | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32 | |
util.status.check_ok %status, "failed to wait on timepoint" | |
%1 = util.do_not_optimize(%buffer) : !hal.buffer | |
%len = hal.buffer.length<%1 : !hal.buffer> : index | |
%device_1 = hal.ex.shared_device : !hal.device | |
%allocator_2 = hal.device.allocator<%device_1 : !hal.device> : !hal.allocator | |
%buffer_3 = hal.allocator.allocate<%allocator_2 : !hal.allocator> type("HostVisible|DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage|MappingScoped|MappingAccessRandom|Mapping") : !hal.buffer{%c1048576} | |
%device_4 = hal.ex.shared_device : !hal.device | |
%cmd_5 = hal.command_buffer.create device(%device_4 : !hal.device) mode("OneShot|AllowInlineExecution") categories("Transfer|Dispatch") : !hal.command_buffer | |
hal.device.switch<%device_4 : !hal.device> | |
#hal.device.match.executable.format<"embedded-elf-x86_64"> { | |
%pipeline_layout = hal.pipeline_layout.lookup device(%device_4 : !hal.device) layout(#pipeline_layout) : !hal.pipeline_layout | |
hal.command_buffer.push_descriptor_set<%cmd_5 : !hal.command_buffer> layout(%pipeline_layout : !hal.pipeline_layout)[%c0] bindings([ | |
%c0 = (%1 : !hal.buffer)[%c0, %len], | |
%c1 = (%buffer_3 : !hal.buffer)[%c0, %c1048576] | |
]) | |
hal.command_buffer.dispatch.symbol<%cmd_5 : !hal.command_buffer> target(@_split_reduction_pass2_dispatch_0::@embedded_elf_x86_64::@_split_reduction_pass2_dispatch_0_generic_512x256x128) workgroups([%c8, %c16, %c1]) | |
hal.return | |
} | |
hal.command_buffer.fill_buffer<%cmd_5 : !hal.command_buffer> target(%buffer_3 : !hal.buffer)[%c524288, %c524288] pattern(%c128_i32 : i32) | |
hal.command_buffer.execution_barrier<%cmd_5 : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") | |
hal.command_buffer.finalize<%cmd_5 : !hal.command_buffer> | |
%2 = util.null : !hal.fence | |
%fence_6 = hal.fence.create device(%device_4 : !hal.device) flags("None") : !hal.fence | |
hal.device.queue.execute<%device_4 : !hal.device> affinity(%c-1_i64) wait(%2) signal(%fence_6) commands([%cmd_5]) | |
%status_7 = hal.fence.await until([%fence_6]) timeout_millis(%c-1_i32) : i32 | |
util.status.check_ok %status_7, "failed to wait on timepoint" | |
%view = hal.buffer_view.create buffer(%buffer_3 : !hal.buffer)[%c0, %c524288] shape([%c512, %c256]) type(%c268435488_i32) encoding(%c1_i32) : !hal.buffer_view | |
%view_8 = hal.buffer_view.create buffer(%buffer_3 : !hal.buffer)[%c524288, %c524288] shape([%c512, %c256]) type(%c268435488_i32) encoding(%c1_i32) : !hal.buffer_view | |
check.expect_eq(%view, %view_8) : !hal.buffer_view | |
return | |
} | |
} | |
// -----// IR Dump After CSE (cse) ('builtin.module' operation) //----- // | |
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}> | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]> | |
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert> | |
module attributes {hal.device.targets = [#device_target_llvm_cpu]} { | |
func.func @split_reduction_pass2() attributes {iree.abi.stub} { | |
call @_split_reduction_pass2() : () -> () | |
return | |
} | |
hal.executable private @_split_reduction_pass2_dispatch_0 { | |
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ { | |
hal.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 ordinal(0) layout(#pipeline_layout) attributes {translation_info = #translation} { | |
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index): | |
%c1 = arith.constant 1 : index | |
%c16 = arith.constant 16 : index | |
%c8 = arith.constant 8 : index | |
hal.return %c8, %c16, %c1 : index, index, index | |
} | |
builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} { | |
llvm.func internal @_split_reduction_pass2_dispatch_0_generic_512x256x128(%arg0: !llvm.ptr<struct<"iree_hal_executable_environment_v0_t", (ptr<i32>, ptr<func<i32 (ptr<func<i32 (ptr<i8>)>>, ptr<i8>)>>, ptr<ptr<func<i32 (ptr<i8>)>>>, struct<"iree_hal_processor_v0_t", (array<8 x i64>)>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>> {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {sym_visibility = "private"} { | |
%0 = llvm.mlir.constant(0 : i32) : i32 | |
%1 = llvm.mlir.constant(32768 : i64) : i64 | |
%2 = llvm.mlir.constant(128 : i64) : i64 | |
%3 = llvm.mlir.constant(256 : i64) : i64 | |
%4 = llvm.mlir.constant(1 : i64) : i64 | |
%5 = llvm.mlir.constant(63 : index) : i64 | |
%6 = llvm.mlir.constant(128 : index) : i64 | |
%7 = llvm.mlir.constant(256 : index) : i64 | |
%8 = llvm.mlir.constant(32768 : index) : i64 | |
%9 = llvm.mlir.constant(512 : index) : i64 | |
%10 = llvm.mlir.constant(0 : i64) : i64 | |
%11 = llvm.mlir.constant(dense<0> : vector<4xi32>) : vector<4xi32> | |
%12 = llvm.mlir.constant(0 : index) : i64 | |
%13 = llvm.mlir.constant(dense<0> : vector<1xi32>) : vector<1xi32> | |
%14 = llvm.mlir.constant(4 : index) : i64 | |
%15 = llvm.mlir.constant(1 : index) : i64 | |
%16 = llvm.mlir.constant(32 : index) : i64 | |
%17 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> | |
%18 = llvm.extractvalue %17[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)> | |
%19 = llvm.load %18 : !llvm.ptr<ptr<i8>> | |
%20 = llvm.bitcast %19 : !llvm.ptr<i8> to !llvm.ptr<i32> | |
%21 = llvm.ptrtoint %20 : !llvm.ptr<i32> to i64 | |
%22 = llvm.and %21, %5 : i64 | |
%23 = llvm.icmp "eq" %22, %12 : i64 | |
"llvm.intr.assume"(%23) : (i1) -> () | |
%24 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> | |
%25 = llvm.extractvalue %24[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)> | |
%26 = llvm.getelementptr %25[1] : (!llvm.ptr<ptr<i8>>) -> !llvm.ptr<ptr<i8>> | |
%27 = llvm.load %26 : !llvm.ptr<ptr<i8>> | |
%28 = llvm.bitcast %27 : !llvm.ptr<i8> to !llvm.ptr<i32> | |
%29 = llvm.ptrtoint %28 : !llvm.ptr<i32> to i64 | |
%30 = llvm.and %29, %5 : i64 | |
%31 = llvm.icmp "eq" %30, %12 : i64 | |
"llvm.intr.assume"(%31) : (i1) -> () | |
%32 = llvm.load %arg2 : !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>> | |
%33 = llvm.extractvalue %32[0] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)> | |
%34 = llvm.zext %33 : i32 to i64 | |
%35 = llvm.extractvalue %32[1] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)> | |
%36 = llvm.zext %35 : i32 to i64 | |
%37 = llvm.mul %36, %16 : i64 | |
%38 = llvm.mul %34, %16 : i64 | |
%39 = llvm.mlir.null : !llvm.ptr<i32> | |
%40 = llvm.getelementptr %39[4] : (!llvm.ptr<i32>) -> !llvm.ptr<i32> | |
%41 = llvm.ptrtoint %40 : !llvm.ptr<i32> to i64 | |
%42 = llvm.alloca %41 x i32 {alignment = 128 : i64} : (i64) -> !llvm.ptr<i32> | |
%43 = llvm.mul %12, %14 : i64 | |
%44 = llvm.add %43, %43 : i64 | |
%45 = llvm.add %44, %12 : i64 | |
%46 = llvm.getelementptr %42[%45] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32> | |
%47 = llvm.bitcast %46 : !llvm.ptr<i32> to !llvm.ptr<vector<4xi32>> | |
llvm.store %11, %47 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>> | |
%48 = llvm.mul %37, %7 : i64 | |
%49 = llvm.add %48, %12 : i64 | |
%50 = llvm.mul %38, %15 : i64 | |
%51 = llvm.add %49, %50 : i64 | |
%52 = llvm.mul %37, %8 : i64 | |
%53 = llvm.add %52, %12 : i64 | |
%54 = llvm.mul %38, %6 : i64 | |
%55 = llvm.add %53, %54 : i64 | |
%56 = llvm.mul %10, %15 : i64 | |
%57 = llvm.add %55, %56 : i64 | |
llvm.br ^bb1(%12 : i64) | |
^bb1(%58: i64): // 2 preds: ^bb0, ^bb13 | |
%59 = llvm.icmp "slt" %58, %16 : i64 | |
llvm.cond_br %59, ^bb2(%12 : i64), ^bb14 | |
^bb2(%60: i64): // 2 preds: ^bb1, ^bb12 | |
%61 = llvm.icmp "slt" %60, %16 : i64 | |
llvm.cond_br %61, ^bb3, ^bb13 | |
^bb3: // pred: ^bb2 | |
%62 = llvm.mul %58, %1 : i64 | |
%63 = llvm.add %57, %62 : i64 | |
%64 = llvm.mul %60, %2 : i64 | |
%65 = llvm.add %63, %64 : i64 | |
%66 = llvm.mul %10, %4 : i64 | |
%67 = llvm.add %65, %66 : i64 | |
%68 = llvm.mul %58, %3 : i64 | |
%69 = llvm.add %51, %68 : i64 | |
%70 = llvm.mul %60, %4 : i64 | |
%71 = llvm.add %69, %70 : i64 | |
llvm.br ^bb4(%12 : i64) | |
^bb4(%72: i64): // 2 preds: ^bb3, ^bb11 | |
%73 = llvm.icmp "slt" %72, %14 : i64 | |
llvm.cond_br %73, ^bb5, ^bb12 | |
^bb5: // pred: ^bb4 | |
%74 = llvm.mul %10, %9 : i64 | |
%75 = llvm.add %67, %74 : i64 | |
%76 = llvm.mul %72, %6 : i64 | |
%77 = llvm.add %75, %76 : i64 | |
%78 = llvm.add %77, %56 : i64 | |
%79 = llvm.mul %10, %14 : i64 | |
%80 = llvm.add %71, %79 : i64 | |
%81 = llvm.mul %72, %15 : i64 | |
%82 = llvm.add %80, %81 : i64 | |
%83 = llvm.alloca %41 x i32 {alignment = 128 : i64} : (i64) -> !llvm.ptr<i32> | |
llvm.br ^bb6(%12 : i64) | |
^bb6(%84: i64): // 2 preds: ^bb5, ^bb7 | |
%85 = llvm.icmp "slt" %84, %14 : i64 | |
llvm.cond_br %85, ^bb7, ^bb8 | |
^bb7: // pred: ^bb6 | |
%86 = llvm.add %44, %84 : i64 | |
%87 = llvm.getelementptr %42[%86] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32> | |
%88 = llvm.load %87 : !llvm.ptr<i32> | |
%89 = llvm.getelementptr %83[%86] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32> | |
llvm.store %88, %89 : !llvm.ptr<i32> | |
%90 = llvm.add %84, %15 : i64 | |
llvm.br ^bb6(%90 : i64) | |
^bb8: // pred: ^bb6 | |
%91 = llvm.bitcast %83 : !llvm.ptr<i32> to !llvm.ptr<vector<4xi32>> | |
%92 = llvm.load %91 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>> | |
llvm.br ^bb9(%12, %92 : i64, vector<4xi32>) | |
^bb9(%93: i64, %94: vector<4xi32>): // 2 preds: ^bb8, ^bb10 | |
%95 = llvm.icmp "slt" %93, %16 : i64 | |
llvm.cond_br %95, ^bb10, ^bb11 | |
^bb10: // pred: ^bb9 | |
%96 = llvm.mul %12, %6 : i64 | |
%97 = llvm.add %78, %96 : i64 | |
%98 = llvm.add %97, %96 : i64 | |
%99 = llvm.mul %93, %14 : i64 | |
%100 = llvm.add %98, %99 : i64 | |
%101 = llvm.add %100, %12 : i64 | |
%102 = llvm.getelementptr %20[%101] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32> | |
%103 = llvm.bitcast %102 : !llvm.ptr<i32> to !llvm.ptr<vector<4xi32>> | |
%104 = llvm.load %103 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>> | |
%105 = llvm.add %104, %94 : vector<4xi32> | |
%106 = llvm.add %93, %15 : i64 | |
llvm.br ^bb9(%106, %105 : i64, vector<4xi32>) | |
^bb11: // pred: ^bb9 | |
llvm.store %94, %91 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>> | |
%107 = llvm.load %91 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>> | |
%108 = llvm.getelementptr %28[%82] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32> | |
%109 = llvm.load %108 : !llvm.ptr<i32> | |
%110 = llvm.mlir.undef : vector<1xi32> | |
%111 = llvm.insertelement %109, %110[%0 : i32] : vector<1xi32> | |
%112 = llvm.extractelement %111[%12 : i64] : vector<1xi32> | |
%113 = "llvm.intr.vector.reduce.add"(%107) : (vector<4xi32>) -> i32 | |
%114 = llvm.add %112, %113 : i32 | |
%115 = llvm.insertelement %114, %13[%12 : i64] : vector<1xi32> | |
%116 = llvm.extractelement %115[%10 : i64] : vector<1xi32> | |
%117 = llvm.insertelement %116, %110[%0 : i32] : vector<1xi32> | |
%118 = llvm.extractelement %117[%12 : i64] : vector<1xi32> | |
llvm.store %118, %108 : !llvm.ptr<i32> | |
%119 = llvm.add %72, %15 : i64 | |
llvm.br ^bb4(%119 : i64) | |
^bb12: // pred: ^bb4 | |
%120 = llvm.add %60, %14 : i64 | |
llvm.br ^bb2(%120 : i64) | |
^bb13: // pred: ^bb2 | |
%121 = llvm.add %58, %15 : i64 | |
llvm.br ^bb1(%121 : i64) | |
^bb14: // pred: ^bb1 | |
llvm.return %0 : i32 | |
} | |
} | |
} | |
} | |
func.func private @_split_reduction_pass2() { | |
%c268435488_i32 = arith.constant 268435488 : i32 | |
%c8 = arith.constant 8 : index | |
%c16 = arith.constant 16 : index | |
%c-1_i32 = arith.constant -1 : i32 | |
%c-1_i64 = arith.constant -1 : i64 | |
%c1048576 = arith.constant 1048576 : index | |
%c0 = arith.constant 0 : index | |
%c524288 = arith.constant 524288 : index | |
%c67108864 = arith.constant 67108864 : index | |
%c1_i32 = arith.constant 1 : i32 | |
%c128_i32 = arith.constant 128 : i32 | |
%c512 = arith.constant 512 : index | |
%c256 = arith.constant 256 : index | |
%c1 = arith.constant 1 : index | |
%device = hal.ex.shared_device : !hal.device | |
%allocator = hal.device.allocator<%device : !hal.device> : !hal.allocator | |
%buffer = hal.allocator.allocate<%allocator : !hal.allocator> type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c67108864} | |
%cmd = hal.command_buffer.create device(%device : !hal.device) mode("OneShot|AllowInlineExecution") categories(Transfer) : !hal.command_buffer | |
hal.command_buffer.fill_buffer<%cmd : !hal.command_buffer> target(%buffer : !hal.buffer)[%c0, %c67108864] pattern(%c1_i32 : i32) | |
hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") | |
hal.command_buffer.finalize<%cmd : !hal.command_buffer> | |
%0 = util.null : !hal.fence | |
%fence = hal.fence.create device(%device : !hal.device) flags("None") : !hal.fence | |
hal.device.queue.execute<%device : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) commands([%cmd]) | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32 | |
util.status.check_ok %status, "failed to wait on timepoint" | |
%1 = util.do_not_optimize(%buffer) : !hal.buffer | |
%len = hal.buffer.length<%1 : !hal.buffer> : index | |
%buffer_0 = hal.allocator.allocate<%allocator : !hal.allocator> type("HostVisible|DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage|MappingScoped|MappingAccessRandom|Mapping") : !hal.buffer{%c1048576} | |
%cmd_1 = hal.command_buffer.create device(%device : !hal.device) mode("OneShot|AllowInlineExecution") categories("Transfer|Dispatch") : !hal.command_buffer | |
hal.device.switch<%device : !hal.device> | |
#hal.device.match.executable.format<"embedded-elf-x86_64"> { | |
%pipeline_layout = hal.pipeline_layout.lookup device(%device : !hal.device) layout(#pipeline_layout) : !hal.pipeline_layout | |
hal.command_buffer.push_descriptor_set<%cmd_1 : !hal.command_buffer> layout(%pipeline_layout : !hal.pipeline_layout)[%c0] bindings([ | |
%c0 = (%1 : !hal.buffer)[%c0, %len], | |
%c1 = (%buffer_0 : !hal.buffer)[%c0, %c1048576] | |
]) | |
hal.command_buffer.dispatch.symbol<%cmd_1 : !hal.command_buffer> target(@_split_reduction_pass2_dispatch_0::@embedded_elf_x86_64::@_split_reduction_pass2_dispatch_0_generic_512x256x128) workgroups([%c8, %c16, %c1]) | |
hal.return | |
} | |
hal.command_buffer.fill_buffer<%cmd_1 : !hal.command_buffer> target(%buffer_0 : !hal.buffer)[%c524288, %c524288] pattern(%c128_i32 : i32) | |
hal.command_buffer.execution_barrier<%cmd_1 : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") | |
hal.command_buffer.finalize<%cmd_1 : !hal.command_buffer> | |
%fence_2 = hal.fence.create device(%device : !hal.device) flags("None") : !hal.fence | |
hal.device.queue.execute<%device : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence_2) commands([%cmd_1]) | |
%status_3 = hal.fence.await until([%fence_2]) timeout_millis(%c-1_i32) : i32 | |
util.status.check_ok %status_3, "failed to wait on timepoint" | |
%view = hal.buffer_view.create buffer(%buffer_0 : !hal.buffer)[%c0, %c524288] shape([%c512, %c256]) type(%c268435488_i32) encoding(%c1_i32) : !hal.buffer_view | |
%view_4 = hal.buffer_view.create buffer(%buffer_0 : !hal.buffer)[%c524288, %c524288] shape([%c512, %c256]) type(%c268435488_i32) encoding(%c1_i32) : !hal.buffer_view | |
check.expect_eq(%view, %view_4) : !hal.buffer_view | |
return | |
} | |
} | |
// -----// IR Dump After mlir::iree_compiler::IREE::HAL::ResolveExportOrdinalsPass (iree-hal-resolve-export-ordinals) ('builtin.module' operation) //----- // | |
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}> | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]> | |
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert> | |
module attributes {hal.device.targets = [#device_target_llvm_cpu]} { | |
// Public entry point tagged with the `iree.abi.stub` attribute. It takes no
// arguments, produces no results, and only forwards to the private
// @_split_reduction_pass2 function before returning.
func.func @split_reduction_pass2() attributes {iree.abi.stub} { | |
call @_split_reduction_pass2() : () -> () | |
return | |
} | |
hal.executable private @_split_reduction_pass2_dispatch_0 { | |
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ { | |
// Export record for the dispatch function: ordinal 0, using #pipeline_layout
// and the #translation attribute. The attached region receives a device and
// three index arguments (none of which are read here) and returns the three
// constants 8, 16, 1. The host function in this module dispatches with the
// same workgroups([%c8, %c16, %c1]) triple.
hal.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 ordinal(0) layout(#pipeline_layout) attributes {translation_info = #translation} { | |
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index): | |
%c1 = arith.constant 1 : index | |
%c16 = arith.constant 16 : index | |
%c8 = arith.constant 8 : index | |
hal.return %c8, %c16, %c1 : index, index, index | |
} | |
builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} { | |
// Dispatch body in the LLVM dialect.
//
// Arguments: %arg0 (environment struct pointer, not read in this body),
// %arg1 (dispatch-state struct pointer), %arg2 (workgroup-state struct
// pointer). Returns the i32 constant 0 (%0) from ^bb14 on every path.
//
// Summary of what the ops below do:
//   * %20 / %28 are i32 pointers loaded from entries 0 and 1 of the
//     ptr<ptr<i8>> table held in field 10 of the dispatch state.
//     (Presumably these are the two descriptor bindings - confirm against the
//     IREE executable ABI.)
//   * %34 / %36 are fields 0 and 1 of the workgroup state, zero-extended;
//     each is multiplied by 32 to give %38 / %37.
//   * Loop nest: ^bb1 (%58 in [0,32) step 1) -> ^bb2 (%60 in [0,32) step 4)
//     -> ^bb4 (%72 in [0,4) step 1) -> ^bb9 (%93 in [0,32) step 1).
//   * The innermost loop sums 32 vector<4xi32> loads from %20; ^bb11 reduces
//     that vector to one i32, adds it to the i32 already stored at %28[%82],
//     and stores the sum back (read-modify-write of %28).
llvm.func internal @_split_reduction_pass2_dispatch_0_generic_512x256x128(%arg0: !llvm.ptr<struct<"iree_hal_executable_environment_v0_t", (ptr<i32>, ptr<func<i32 (ptr<func<i32 (ptr<i8>)>>, ptr<i8>)>>, ptr<ptr<func<i32 (ptr<i8>)>>>, struct<"iree_hal_processor_v0_t", (array<8 x i64>)>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>> {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {sym_visibility = "private"} { | |
// Constant pool. Several values appear twice, once with an i64 attribute and
// once with an index attribute (e.g. %2/%6 = 128, %3/%7 = 256, %1/%8 = 32768,
// %10/%12 = 0, %4/%15 = 1); all results are i64.
%0 = llvm.mlir.constant(0 : i32) : i32 | |
%1 = llvm.mlir.constant(32768 : i64) : i64 | |
%2 = llvm.mlir.constant(128 : i64) : i64 | |
%3 = llvm.mlir.constant(256 : i64) : i64 | |
%4 = llvm.mlir.constant(1 : i64) : i64 | |
%5 = llvm.mlir.constant(63 : index) : i64 | |
%6 = llvm.mlir.constant(128 : index) : i64 | |
%7 = llvm.mlir.constant(256 : index) : i64 | |
%8 = llvm.mlir.constant(32768 : index) : i64 | |
%9 = llvm.mlir.constant(512 : index) : i64 | |
%10 = llvm.mlir.constant(0 : i64) : i64 | |
%11 = llvm.mlir.constant(dense<0> : vector<4xi32>) : vector<4xi32> | |
%12 = llvm.mlir.constant(0 : index) : i64 | |
%13 = llvm.mlir.constant(dense<0> : vector<1xi32>) : vector<1xi32> | |
%14 = llvm.mlir.constant(4 : index) : i64 | |
%15 = llvm.mlir.constant(1 : index) : i64 | |
%16 = llvm.mlir.constant(32 : index) : i64 | |
// Entry 0 of the pointer table in dispatch-state field 10 -> %20 (i32*).
// The assume states that the low 6 bits of the address are zero, i.e. the
// pointer is 64-byte aligned.
%17 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> | |
%18 = llvm.extractvalue %17[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)> | |
%19 = llvm.load %18 : !llvm.ptr<ptr<i8>> | |
%20 = llvm.bitcast %19 : !llvm.ptr<i8> to !llvm.ptr<i32> | |
%21 = llvm.ptrtoint %20 : !llvm.ptr<i32> to i64 | |
%22 = llvm.and %21, %5 : i64 | |
%23 = llvm.icmp "eq" %22, %12 : i64 | |
"llvm.intr.assume"(%23) : (i1) -> () | |
// Entry 1 of the same table -> %28 (i32*), with the same 64-byte alignment
// assumption. The dispatch-state struct is loaded a second time here.
%24 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> | |
%25 = llvm.extractvalue %24[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)> | |
%26 = llvm.getelementptr %25[1] : (!llvm.ptr<ptr<i8>>) -> !llvm.ptr<ptr<i8>> | |
%27 = llvm.load %26 : !llvm.ptr<ptr<i8>> | |
%28 = llvm.bitcast %27 : !llvm.ptr<i8> to !llvm.ptr<i32> | |
%29 = llvm.ptrtoint %28 : !llvm.ptr<i32> to i64 | |
%30 = llvm.and %29, %5 : i64 | |
%31 = llvm.icmp "eq" %30, %12 : i64 | |
"llvm.intr.assume"(%31) : (i1) -> () | |
// Workgroup-state fields 0 and 1 (presumably workgroup id x / y - confirm),
// each scaled by 32: %38 = field0 * 32, %37 = field1 * 32.
%32 = llvm.load %arg2 : !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>> | |
%33 = llvm.extractvalue %32[0] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)> | |
%34 = llvm.zext %33 : i32 to i64 | |
%35 = llvm.extractvalue %32[1] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)> | |
%36 = llvm.zext %35 : i32 to i64 | |
%37 = llvm.mul %36, %16 : i64 | |
%38 = llvm.mul %34, %16 : i64 | |
// %41 is the integer address of element 4 of an i32 array based at null.
// It is passed as the element COUNT of the i32 allocas %42 and %83, while
// only a vector<4xi32> is ever stored to / loaded from those allocas below.
%39 = llvm.mlir.null : !llvm.ptr<i32> | |
%40 = llvm.getelementptr %39[4] : (!llvm.ptr<i32>) -> !llvm.ptr<i32> | |
%41 = llvm.ptrtoint %40 : !llvm.ptr<i32> to i64 | |
%42 = llvm.alloca %41 x i32 {alignment = 128 : i64} : (i64) -> !llvm.ptr<i32> | |
// %43, %44 and %45 are all built from the zero constant %12; the store
// writes the all-zero vector %11 at %42[%45].
%43 = llvm.mul %12, %14 : i64 | |
%44 = llvm.add %43, %43 : i64 | |
%45 = llvm.add %44, %12 : i64 | |
%46 = llvm.getelementptr %42[%45] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32> | |
%47 = llvm.bitcast %46 : !llvm.ptr<i32> to !llvm.ptr<vector<4xi32>> | |
llvm.store %11, %47 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>> | |
// Base element offset used for %28: %51 = %37 * 256 + %38.
%48 = llvm.mul %37, %7 : i64 | |
%49 = llvm.add %48, %12 : i64 | |
%50 = llvm.mul %38, %15 : i64 | |
%51 = llvm.add %49, %50 : i64 | |
// Base element offset used for %20: %57 = %37 * 32768 + %38 * 128
// (%56 is 0 * 1).
%52 = llvm.mul %37, %8 : i64 | |
%53 = llvm.add %52, %12 : i64 | |
%54 = llvm.mul %38, %6 : i64 | |
%55 = llvm.add %53, %54 : i64 | |
%56 = llvm.mul %10, %15 : i64 | |
%57 = llvm.add %55, %56 : i64 | |
llvm.br ^bb1(%12 : i64) | |
// Outer loop: %58 counts from 0 while %58 < 32, incremented by 1 in ^bb13.
^bb1(%58: i64): // 2 preds: ^bb0, ^bb13 | |
%59 = llvm.icmp "slt" %58, %16 : i64 | |
llvm.cond_br %59, ^bb2(%12 : i64), ^bb14 | |
// Second loop: %60 counts from 0 while %60 < 32, incremented by 4 in ^bb12.
^bb2(%60: i64): // 2 preds: ^bb1, ^bb12 | |
%61 = llvm.icmp "slt" %60, %16 : i64 | |
llvm.cond_br %61, ^bb3, ^bb13 | |
// Per-(%58, %60) offsets:
//   %67 = %57 + %58 * 32768 + %60 * 128   (index into %20)
//   %71 = %51 + %58 * 256   + %60         (index into %28)
^bb3: // pred: ^bb2 | |
%62 = llvm.mul %58, %1 : i64 | |
%63 = llvm.add %57, %62 : i64 | |
%64 = llvm.mul %60, %2 : i64 | |
%65 = llvm.add %63, %64 : i64 | |
%66 = llvm.mul %10, %4 : i64 | |
%67 = llvm.add %65, %66 : i64 | |
%68 = llvm.mul %58, %3 : i64 | |
%69 = llvm.add %51, %68 : i64 | |
%70 = llvm.mul %60, %4 : i64 | |
%71 = llvm.add %69, %70 : i64 | |
llvm.br ^bb4(%12 : i64) | |
// Third loop: %72 counts from 0 while %72 < 4, incremented by 1 in ^bb11.
^bb4(%72: i64): // 2 preds: ^bb3, ^bb11 | |
%73 = llvm.icmp "slt" %72, %14 : i64 | |
llvm.cond_br %73, ^bb5, ^bb12 | |
// %78 = %67 + %72 * 128 and %82 = %71 + %72 (the other terms multiply by the
// zero constant %10).
// NOTE(review): the alloca %83 below is in ^bb5, so it executes once per
// iteration of the ^bb4 loop rather than once per function call.
^bb5: // pred: ^bb4 | |
%74 = llvm.mul %10, %9 : i64 | |
%75 = llvm.add %67, %74 : i64 | |
%76 = llvm.mul %72, %6 : i64 | |
%77 = llvm.add %75, %76 : i64 | |
%78 = llvm.add %77, %56 : i64 | |
%79 = llvm.mul %10, %14 : i64 | |
%80 = llvm.add %71, %79 : i64 | |
%81 = llvm.mul %72, %15 : i64 | |
%82 = llvm.add %80, %81 : i64 | |
%83 = llvm.alloca %41 x i32 {alignment = 128 : i64} : (i64) -> !llvm.ptr<i32> | |
llvm.br ^bb6(%12 : i64) | |
// Scalar copy loop: for %84 in [0, 4), copy %42[%44 + %84] to %83[%44 + %84].
// %42 was initialised with the zero vector above, so this seeds %83 with it.
^bb6(%84: i64): // 2 preds: ^bb5, ^bb7 | |
%85 = llvm.icmp "slt" %84, %14 : i64 | |
llvm.cond_br %85, ^bb7, ^bb8 | |
^bb7: // pred: ^bb6 | |
%86 = llvm.add %44, %84 : i64 | |
%87 = llvm.getelementptr %42[%86] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32> | |
%88 = llvm.load %87 : !llvm.ptr<i32> | |
%89 = llvm.getelementptr %83[%86] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32> | |
llvm.store %88, %89 : !llvm.ptr<i32> | |
%90 = llvm.add %84, %15 : i64 | |
llvm.br ^bb6(%90 : i64) | |
// Load the seeded scratch vector as the initial accumulator for ^bb9.
^bb8: // pred: ^bb6 | |
%91 = llvm.bitcast %83 : !llvm.ptr<i32> to !llvm.ptr<vector<4xi32>> | |
%92 = llvm.load %91 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>> | |
llvm.br ^bb9(%12, %92 : i64, vector<4xi32>) | |
// Innermost loop: %93 counts from 0 while %93 < 32 (step 1), carrying the
// vector<4xi32> accumulator %94.
^bb9(%93: i64, %94: vector<4xi32>): // 2 preds: ^bb8, ^bb10 | |
%95 = llvm.icmp "slt" %93, %16 : i64 | |
llvm.cond_br %95, ^bb10, ^bb11 | |
// Load four consecutive i32 from %20 at element index %78 + %93 * 4 (%96 is
// 0 * 128) and add them lane-wise into the accumulator.
^bb10: // pred: ^bb9 | |
%96 = llvm.mul %12, %6 : i64 | |
%97 = llvm.add %78, %96 : i64 | |
%98 = llvm.add %97, %96 : i64 | |
%99 = llvm.mul %93, %14 : i64 | |
%100 = llvm.add %98, %99 : i64 | |
%101 = llvm.add %100, %12 : i64 | |
%102 = llvm.getelementptr %20[%101] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32> | |
%103 = llvm.bitcast %102 : !llvm.ptr<i32> to !llvm.ptr<vector<4xi32>> | |
%104 = llvm.load %103 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>> | |
%105 = llvm.add %104, %94 : vector<4xi32> | |
%106 = llvm.add %93, %15 : i64 | |
llvm.br ^bb9(%106, %105 : i64, vector<4xi32>) | |
// Spill the accumulator to %83 and reload it (%107), sum its four lanes
// (%113), add that to the i32 currently stored at %28[%82] (%109), and write
// the result back to the same address. The insertelement/extractelement pairs
// on vector<1xi32> only move a single scalar in and out of a 1-element vector.
^bb11: // pred: ^bb9 | |
llvm.store %94, %91 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>> | |
%107 = llvm.load %91 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>> | |
%108 = llvm.getelementptr %28[%82] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32> | |
%109 = llvm.load %108 : !llvm.ptr<i32> | |
%110 = llvm.mlir.undef : vector<1xi32> | |
%111 = llvm.insertelement %109, %110[%0 : i32] : vector<1xi32> | |
%112 = llvm.extractelement %111[%12 : i64] : vector<1xi32> | |
%113 = "llvm.intr.vector.reduce.add"(%107) : (vector<4xi32>) -> i32 | |
%114 = llvm.add %112, %113 : i32 | |
%115 = llvm.insertelement %114, %13[%12 : i64] : vector<1xi32> | |
%116 = llvm.extractelement %115[%10 : i64] : vector<1xi32> | |
%117 = llvm.insertelement %116, %110[%0 : i32] : vector<1xi32> | |
%118 = llvm.extractelement %117[%12 : i64] : vector<1xi32> | |
llvm.store %118, %108 : !llvm.ptr<i32> | |
%119 = llvm.add %72, %15 : i64 | |
llvm.br ^bb4(%119 : i64) | |
// Latch of the ^bb2 loop: %60 += 4.
^bb12: // pred: ^bb4 | |
%120 = llvm.add %60, %14 : i64 | |
llvm.br ^bb2(%120 : i64) | |
// Latch of the ^bb1 loop: %58 += 1.
^bb13: // pred: ^bb2 | |
%121 = llvm.add %58, %15 : i64 | |
llvm.br ^bb1(%121 : i64) | |
// Exit: return the i32 constant 0.
^bb14: // pred: ^bb1 | |
llvm.return %0 : i32 | |
} | |
} | |
} | |
} | |
// Host-side driver (private; called by @split_reduction_pass2 in this module).
// Sequence visible below:
//   1. Allocate %buffer with size 67108864 and fill all of it with the i32
//      pattern 1 through a Transfer-only command buffer; submit and wait.
//   2. Allocate %buffer_0 with size 1048576, record a dispatch of export
//      ordinal 0 of @_split_reduction_pass2_dispatch_0 with %1 as binding 0
//      and %buffer_0 as binding 1, then fill range [524288, +524288) of
//      %buffer_0 with the i32 pattern 128; submit and wait.
//   3. Build two 512x256 buffer views over the two 524288-sized halves of
//      %buffer_0 and compare them with check.expect_eq.
// (67108864 = 512*256*128*4 and 524288 = 512*256*4, so the sizes are
// presumably byte counts of i32 tensors - confirm.)
func.func private @_split_reduction_pass2() { | |
%c268435488_i32 = arith.constant 268435488 : i32 | |
%c8 = arith.constant 8 : index | |
%c16 = arith.constant 16 : index | |
%c-1_i32 = arith.constant -1 : i32 | |
%c-1_i64 = arith.constant -1 : i64 | |
%c1048576 = arith.constant 1048576 : index | |
%c0 = arith.constant 0 : index | |
%c524288 = arith.constant 524288 : index | |
%c67108864 = arith.constant 67108864 : index | |
%c1_i32 = arith.constant 1 : i32 | |
%c128_i32 = arith.constant 128 : i32 | |
%c512 = arith.constant 512 : index | |
%c256 = arith.constant 256 : index | |
%c1 = arith.constant 1 : index | |
%device = hal.ex.shared_device : !hal.device | |
%allocator = hal.device.allocator<%device : !hal.device> : !hal.allocator | |
// Step 1: input buffer, filled with 1s by its own command buffer.
%buffer = hal.allocator.allocate<%allocator : !hal.allocator> type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c67108864} | |
%cmd = hal.command_buffer.create device(%device : !hal.device) mode("OneShot|AllowInlineExecution") categories(Transfer) : !hal.command_buffer | |
hal.command_buffer.fill_buffer<%cmd : !hal.command_buffer> target(%buffer : !hal.buffer)[%c0, %c67108864] pattern(%c1_i32 : i32) | |
hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") | |
hal.command_buffer.finalize<%cmd : !hal.command_buffer> | |
// %0 is a null fence used as the wait operand of both queue submissions.
%0 = util.null : !hal.fence | |
%fence = hal.fence.create device(%device : !hal.device) flags("None") : !hal.fence | |
hal.device.queue.execute<%device : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) commands([%cmd]) | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32 | |
util.status.check_ok %status, "failed to wait on timepoint" | |
// The filled buffer is passed through util.do_not_optimize and its length is
// re-queried from the result rather than reusing %c67108864.
%1 = util.do_not_optimize(%buffer) : !hal.buffer | |
%len = hal.buffer.length<%1 : !hal.buffer> : index | |
// Step 2: result/expected buffer and the dispatch command buffer.
// NOTE(review): no fill of range [0, 524288) of %buffer_0 is recorded before
// the dispatch, while the dispatched llvm.func in this module loads from its
// second pointer before storing to it (read-modify-write in its ^bb11).
// Confirm whether the result is meant to depend on the prior buffer contents.
%buffer_0 = hal.allocator.allocate<%allocator : !hal.allocator> type("HostVisible|DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage|MappingScoped|MappingAccessRandom|Mapping") : !hal.buffer{%c1048576} | |
%cmd_1 = hal.command_buffer.create device(%device : !hal.device) mode("OneShot|AllowInlineExecution") categories("Transfer|Dispatch") : !hal.command_buffer | |
// Only one switch case is present: devices matching the
// "embedded-elf-x86_64" executable format.
hal.device.switch<%device : !hal.device> | |
#hal.device.match.executable.format<"embedded-elf-x86_64"> { | |
%pipeline_layout = hal.pipeline_layout.lookup device(%device : !hal.device) layout(#pipeline_layout) : !hal.pipeline_layout | |
hal.command_buffer.push_descriptor_set<%cmd_1 : !hal.command_buffer> layout(%pipeline_layout : !hal.pipeline_layout)[%c0] bindings([ | |
%c0 = (%1 : !hal.buffer)[%c0, %len], | |
%c1 = (%buffer_0 : !hal.buffer)[%c0, %c1048576] | |
]) | |
// The dispatch target is given as an executable value plus export ordinal
// [0]; the executable is looked up on the command buffer's device.
%2 = hal.command_buffer.device<%cmd_1 : !hal.command_buffer> : !hal.device | |
%exe = hal.executable.lookup device(%2 : !hal.device) executable(@_split_reduction_pass2_dispatch_0) : !hal.executable | |
hal.command_buffer.dispatch<%cmd_1 : !hal.command_buffer> target(%exe : !hal.executable)[0] workgroups([%c8, %c16, %c1]) | |
hal.return | |
} | |
// Expected values: the second half of %buffer_0 is filled with 128.
hal.command_buffer.fill_buffer<%cmd_1 : !hal.command_buffer> target(%buffer_0 : !hal.buffer)[%c524288, %c524288] pattern(%c128_i32 : i32) | |
hal.command_buffer.execution_barrier<%cmd_1 : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") | |
hal.command_buffer.finalize<%cmd_1 : !hal.command_buffer> | |
%fence_2 = hal.fence.create device(%device : !hal.device) flags("None") : !hal.fence | |
hal.device.queue.execute<%device : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence_2) commands([%cmd_1]) | |
%status_3 = hal.fence.await until([%fence_2]) timeout_millis(%c-1_i32) : i32 | |
util.status.check_ok %status_3, "failed to wait on timepoint" | |
// Step 3: %view covers [0, +524288) (dispatch output), %view_4 covers
// [524288, +524288) (the 128 fill); both are shaped 512x256.
%view = hal.buffer_view.create buffer(%buffer_0 : !hal.buffer)[%c0, %c524288] shape([%c512, %c256]) type(%c268435488_i32) encoding(%c1_i32) : !hal.buffer_view | |
%view_4 = hal.buffer_view.create buffer(%buffer_0 : !hal.buffer)[%c524288, %c524288] shape([%c512, %c256]) type(%c268435488_i32) encoding(%c1_i32) : !hal.buffer_view | |
check.expect_eq(%view, %view_4) : !hal.buffer_view | |
return | |
} | |
} | |
// -----// IR Dump After mlir::iree_compiler::IREE::HAL::MaterializeResourceCachesPass (iree-hal-materialize-resource-caches) ('builtin.module' operation) //----- // | |
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}> | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]> | |
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert> | |
module attributes {hal.device.targets = [#device_target_llvm_cpu]} { | |
// Module-level cache slot for a descriptor set layout, plus the initializer
// that fills it: the layout is created on the shared device with two
// storage_buffer bindings (binding 0 marked ReadOnly, binding 1 unmarked) and
// stored into the global.
util.global private @_descriptor_set_layout_0 : !hal.descriptor_set_layout | |
util.initializer { | |
%device = hal.ex.shared_device : !hal.device | |
%descriptor_set_layout = hal.descriptor_set_layout.create device(%device : !hal.device) flags("None") bindings([#hal.descriptor_set.binding<0, storage_buffer, ReadOnly>, #hal.descriptor_set.binding<1, storage_buffer>]) : !hal.descriptor_set_layout | |
util.global.store %descriptor_set_layout, @_descriptor_set_layout_0 : !hal.descriptor_set_layout | |
util.initializer.return | |
} | |
// Module-level cache slot for a pipeline layout, plus its initializer: it
// loads the @_descriptor_set_layout_0 global, creates a pipeline layout on the
// shared device with push_constants(0) and that single set layout, and stores
// the result into the global.
util.global private @_pipeline_layout_0 : !hal.pipeline_layout | |
util.initializer { | |
%_descriptor_set_layout_0 = util.global.load @_descriptor_set_layout_0 : !hal.descriptor_set_layout | |
%device = hal.ex.shared_device : !hal.device | |
%pipeline_layout = hal.pipeline_layout.create device(%device : !hal.device) push_constants(0) layouts([%_descriptor_set_layout_0]) : !hal.pipeline_layout | |
util.global.store %pipeline_layout, @_pipeline_layout_0 : !hal.pipeline_layout | |
util.initializer.return | |
} | |
// Module-level cache slot for the dispatch executable, plus its initializer.
// The device switch yields a !hal.executable:
//   * if the device matches executable format "embedded-elf-x86_64", the
//     @embedded_elf_x86_64 variant is created with @_pipeline_layout_0;
//   * otherwise the #hal.match.always case yields a null executable.
// Whichever value is produced is stored into the global.
util.global private @_executable__split_reduction_pass2_dispatch_0 : !hal.executable | |
util.initializer { | |
%device = hal.ex.shared_device : !hal.device | |
%0 = hal.device.switch<%device : !hal.device> -> !hal.executable | |
#hal.device.match.executable.format<"embedded-elf-x86_64"> { | |
%_pipeline_layout_0 = util.global.load @_pipeline_layout_0 : !hal.pipeline_layout | |
%exe = hal.executable.create device(%device : !hal.device) target(@_split_reduction_pass2_dispatch_0::@embedded_elf_x86_64) layouts([%_pipeline_layout_0]) : !hal.executable | |
hal.return %exe : !hal.executable | |
}, | |
#hal.match.always { | |
%1 = util.null : !hal.executable | |
hal.return %1 : !hal.executable | |
} | |
util.global.store %0, @_executable__split_reduction_pass2_dispatch_0 : !hal.executable | |
util.initializer.return | |
} | |
// Public entry point tagged with the `iree.abi.stub` attribute. It takes no
// arguments, produces no results, and only forwards to the private
// @_split_reduction_pass2 function before returning.
func.func @split_reduction_pass2() attributes {iree.abi.stub} { | |
call @_split_reduction_pass2() : () -> () | |
return | |
} | |
hal.executable private @_split_reduction_pass2_dispatch_0 { | |
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ { | |
// Export record for the dispatch function: ordinal 0, using #pipeline_layout
// and the #translation attribute. The attached region receives a device and
// three index arguments (none of which are read here) and returns the three
// constants 8, 16, 1. The host function in this module dispatches with the
// same workgroups([%c8, %c16, %c1]) triple.
hal.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 ordinal(0) layout(#pipeline_layout) attributes {translation_info = #translation} { | |
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index): | |
%c1 = arith.constant 1 : index | |
%c16 = arith.constant 16 : index | |
%c8 = arith.constant 8 : index | |
hal.return %c8, %c16, %c1 : index, index, index | |
} | |
builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} { | |
// Dispatch body in the LLVM dialect.
//
// Arguments: %arg0 (environment struct pointer, not read in this body),
// %arg1 (dispatch-state struct pointer), %arg2 (workgroup-state struct
// pointer). Returns the i32 constant 0 (%0) from ^bb14 on every path.
//
// Summary of what the ops below do:
//   * %20 / %28 are i32 pointers loaded from entries 0 and 1 of the
//     ptr<ptr<i8>> table held in field 10 of the dispatch state.
//     (Presumably these are the two descriptor bindings - confirm against the
//     IREE executable ABI.)
//   * %34 / %36 are fields 0 and 1 of the workgroup state, zero-extended;
//     each is multiplied by 32 to give %38 / %37.
//   * Loop nest: ^bb1 (%58 in [0,32) step 1) -> ^bb2 (%60 in [0,32) step 4)
//     -> ^bb4 (%72 in [0,4) step 1) -> ^bb9 (%93 in [0,32) step 1).
//   * The innermost loop sums 32 vector<4xi32> loads from %20; ^bb11 reduces
//     that vector to one i32, adds it to the i32 already stored at %28[%82],
//     and stores the sum back (read-modify-write of %28).
llvm.func internal @_split_reduction_pass2_dispatch_0_generic_512x256x128(%arg0: !llvm.ptr<struct<"iree_hal_executable_environment_v0_t", (ptr<i32>, ptr<func<i32 (ptr<func<i32 (ptr<i8>)>>, ptr<i8>)>>, ptr<ptr<func<i32 (ptr<i8>)>>>, struct<"iree_hal_processor_v0_t", (array<8 x i64>)>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>> {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {sym_visibility = "private"} { | |
// Constant pool. Several values appear twice, once with an i64 attribute and
// once with an index attribute (e.g. %2/%6 = 128, %3/%7 = 256, %1/%8 = 32768,
// %10/%12 = 0, %4/%15 = 1); all results are i64.
%0 = llvm.mlir.constant(0 : i32) : i32 | |
%1 = llvm.mlir.constant(32768 : i64) : i64 | |
%2 = llvm.mlir.constant(128 : i64) : i64 | |
%3 = llvm.mlir.constant(256 : i64) : i64 | |
%4 = llvm.mlir.constant(1 : i64) : i64 | |
%5 = llvm.mlir.constant(63 : index) : i64 | |
%6 = llvm.mlir.constant(128 : index) : i64 | |
%7 = llvm.mlir.constant(256 : index) : i64 | |
%8 = llvm.mlir.constant(32768 : index) : i64 | |
%9 = llvm.mlir.constant(512 : index) : i64 | |
%10 = llvm.mlir.constant(0 : i64) : i64 | |
%11 = llvm.mlir.constant(dense<0> : vector<4xi32>) : vector<4xi32> | |
%12 = llvm.mlir.constant(0 : index) : i64 | |
%13 = llvm.mlir.constant(dense<0> : vector<1xi32>) : vector<1xi32> | |
%14 = llvm.mlir.constant(4 : index) : i64 | |
%15 = llvm.mlir.constant(1 : index) : i64 | |
%16 = llvm.mlir.constant(32 : index) : i64 | |
// Entry 0 of the pointer table in dispatch-state field 10 -> %20 (i32*).
// The assume states that the low 6 bits of the address are zero, i.e. the
// pointer is 64-byte aligned.
%17 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> | |
%18 = llvm.extractvalue %17[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)> | |
%19 = llvm.load %18 : !llvm.ptr<ptr<i8>> | |
%20 = llvm.bitcast %19 : !llvm.ptr<i8> to !llvm.ptr<i32> | |
%21 = llvm.ptrtoint %20 : !llvm.ptr<i32> to i64 | |
%22 = llvm.and %21, %5 : i64 | |
%23 = llvm.icmp "eq" %22, %12 : i64 | |
"llvm.intr.assume"(%23) : (i1) -> () | |
// Entry 1 of the same table -> %28 (i32*), with the same 64-byte alignment
// assumption. The dispatch-state struct is loaded a second time here.
%24 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> | |
%25 = llvm.extractvalue %24[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)> | |
%26 = llvm.getelementptr %25[1] : (!llvm.ptr<ptr<i8>>) -> !llvm.ptr<ptr<i8>> | |
%27 = llvm.load %26 : !llvm.ptr<ptr<i8>> | |
%28 = llvm.bitcast %27 : !llvm.ptr<i8> to !llvm.ptr<i32> | |
%29 = llvm.ptrtoint %28 : !llvm.ptr<i32> to i64 | |
%30 = llvm.and %29, %5 : i64 | |
%31 = llvm.icmp "eq" %30, %12 : i64 | |
"llvm.intr.assume"(%31) : (i1) -> () | |
// Workgroup-state fields 0 and 1 (presumably workgroup id x / y - confirm),
// each scaled by 32: %38 = field0 * 32, %37 = field1 * 32.
%32 = llvm.load %arg2 : !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>> | |
%33 = llvm.extractvalue %32[0] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)> | |
%34 = llvm.zext %33 : i32 to i64 | |
%35 = llvm.extractvalue %32[1] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)> | |
%36 = llvm.zext %35 : i32 to i64 | |
%37 = llvm.mul %36, %16 : i64 | |
%38 = llvm.mul %34, %16 : i64 | |
// %41 is the integer address of element 4 of an i32 array based at null.
// It is passed as the element COUNT of the i32 allocas %42 and %83, while
// only a vector<4xi32> is ever stored to / loaded from those allocas below.
%39 = llvm.mlir.null : !llvm.ptr<i32> | |
%40 = llvm.getelementptr %39[4] : (!llvm.ptr<i32>) -> !llvm.ptr<i32> | |
%41 = llvm.ptrtoint %40 : !llvm.ptr<i32> to i64 | |
%42 = llvm.alloca %41 x i32 {alignment = 128 : i64} : (i64) -> !llvm.ptr<i32> | |
// %43, %44 and %45 are all built from the zero constant %12; the store
// writes the all-zero vector %11 at %42[%45].
%43 = llvm.mul %12, %14 : i64 | |
%44 = llvm.add %43, %43 : i64 | |
%45 = llvm.add %44, %12 : i64 | |
%46 = llvm.getelementptr %42[%45] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32> | |
%47 = llvm.bitcast %46 : !llvm.ptr<i32> to !llvm.ptr<vector<4xi32>> | |
llvm.store %11, %47 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>> | |
// Base element offset used for %28: %51 = %37 * 256 + %38.
%48 = llvm.mul %37, %7 : i64 | |
%49 = llvm.add %48, %12 : i64 | |
%50 = llvm.mul %38, %15 : i64 | |
%51 = llvm.add %49, %50 : i64 | |
// Base element offset used for %20: %57 = %37 * 32768 + %38 * 128
// (%56 is 0 * 1).
%52 = llvm.mul %37, %8 : i64 | |
%53 = llvm.add %52, %12 : i64 | |
%54 = llvm.mul %38, %6 : i64 | |
%55 = llvm.add %53, %54 : i64 | |
%56 = llvm.mul %10, %15 : i64 | |
%57 = llvm.add %55, %56 : i64 | |
llvm.br ^bb1(%12 : i64) | |
// Outer loop: %58 counts from 0 while %58 < 32, incremented by 1 in ^bb13.
^bb1(%58: i64): // 2 preds: ^bb0, ^bb13 | |
%59 = llvm.icmp "slt" %58, %16 : i64 | |
llvm.cond_br %59, ^bb2(%12 : i64), ^bb14 | |
// Second loop: %60 counts from 0 while %60 < 32, incremented by 4 in ^bb12.
^bb2(%60: i64): // 2 preds: ^bb1, ^bb12 | |
%61 = llvm.icmp "slt" %60, %16 : i64 | |
llvm.cond_br %61, ^bb3, ^bb13 | |
// Per-(%58, %60) offsets:
//   %67 = %57 + %58 * 32768 + %60 * 128   (index into %20)
//   %71 = %51 + %58 * 256   + %60         (index into %28)
^bb3: // pred: ^bb2 | |
%62 = llvm.mul %58, %1 : i64 | |
%63 = llvm.add %57, %62 : i64 | |
%64 = llvm.mul %60, %2 : i64 | |
%65 = llvm.add %63, %64 : i64 | |
%66 = llvm.mul %10, %4 : i64 | |
%67 = llvm.add %65, %66 : i64 | |
%68 = llvm.mul %58, %3 : i64 | |
%69 = llvm.add %51, %68 : i64 | |
%70 = llvm.mul %60, %4 : i64 | |
%71 = llvm.add %69, %70 : i64 | |
llvm.br ^bb4(%12 : i64) | |
// Third loop: %72 counts from 0 while %72 < 4, incremented by 1 in ^bb11.
^bb4(%72: i64): // 2 preds: ^bb3, ^bb11 | |
%73 = llvm.icmp "slt" %72, %14 : i64 | |
llvm.cond_br %73, ^bb5, ^bb12 | |
// %78 = %67 + %72 * 128 and %82 = %71 + %72 (the other terms multiply by the
// zero constant %10).
// NOTE(review): the alloca %83 below is in ^bb5, so it executes once per
// iteration of the ^bb4 loop rather than once per function call.
^bb5: // pred: ^bb4 | |
%74 = llvm.mul %10, %9 : i64 | |
%75 = llvm.add %67, %74 : i64 | |
%76 = llvm.mul %72, %6 : i64 | |
%77 = llvm.add %75, %76 : i64 | |
%78 = llvm.add %77, %56 : i64 | |
%79 = llvm.mul %10, %14 : i64 | |
%80 = llvm.add %71, %79 : i64 | |
%81 = llvm.mul %72, %15 : i64 | |
%82 = llvm.add %80, %81 : i64 | |
%83 = llvm.alloca %41 x i32 {alignment = 128 : i64} : (i64) -> !llvm.ptr<i32> | |
llvm.br ^bb6(%12 : i64) | |
// Scalar copy loop: for %84 in [0, 4), copy %42[%44 + %84] to %83[%44 + %84].
// %42 was initialised with the zero vector above, so this seeds %83 with it.
^bb6(%84: i64): // 2 preds: ^bb5, ^bb7 | |
%85 = llvm.icmp "slt" %84, %14 : i64 | |
llvm.cond_br %85, ^bb7, ^bb8 | |
^bb7: // pred: ^bb6 | |
%86 = llvm.add %44, %84 : i64 | |
%87 = llvm.getelementptr %42[%86] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32> | |
%88 = llvm.load %87 : !llvm.ptr<i32> | |
%89 = llvm.getelementptr %83[%86] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32> | |
llvm.store %88, %89 : !llvm.ptr<i32> | |
%90 = llvm.add %84, %15 : i64 | |
llvm.br ^bb6(%90 : i64) | |
// Load the seeded scratch vector as the initial accumulator for ^bb9.
^bb8: // pred: ^bb6 | |
%91 = llvm.bitcast %83 : !llvm.ptr<i32> to !llvm.ptr<vector<4xi32>> | |
%92 = llvm.load %91 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>> | |
llvm.br ^bb9(%12, %92 : i64, vector<4xi32>) | |
// Innermost loop: %93 counts from 0 while %93 < 32 (step 1), carrying the
// vector<4xi32> accumulator %94.
^bb9(%93: i64, %94: vector<4xi32>): // 2 preds: ^bb8, ^bb10 | |
%95 = llvm.icmp "slt" %93, %16 : i64 | |
llvm.cond_br %95, ^bb10, ^bb11 | |
// Load four consecutive i32 from %20 at element index %78 + %93 * 4 (%96 is
// 0 * 128) and add them lane-wise into the accumulator.
^bb10: // pred: ^bb9 | |
%96 = llvm.mul %12, %6 : i64 | |
%97 = llvm.add %78, %96 : i64 | |
%98 = llvm.add %97, %96 : i64 | |
%99 = llvm.mul %93, %14 : i64 | |
%100 = llvm.add %98, %99 : i64 | |
%101 = llvm.add %100, %12 : i64 | |
%102 = llvm.getelementptr %20[%101] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32> | |
%103 = llvm.bitcast %102 : !llvm.ptr<i32> to !llvm.ptr<vector<4xi32>> | |
%104 = llvm.load %103 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>> | |
%105 = llvm.add %104, %94 : vector<4xi32> | |
%106 = llvm.add %93, %15 : i64 | |
llvm.br ^bb9(%106, %105 : i64, vector<4xi32>) | |
// Spill the accumulator to %83 and reload it (%107), sum its four lanes
// (%113), add that to the i32 currently stored at %28[%82] (%109), and write
// the result back to the same address. The insertelement/extractelement pairs
// on vector<1xi32> only move a single scalar in and out of a 1-element vector.
^bb11: // pred: ^bb9 | |
llvm.store %94, %91 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>> | |
%107 = llvm.load %91 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>> | |
%108 = llvm.getelementptr %28[%82] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32> | |
%109 = llvm.load %108 : !llvm.ptr<i32> | |
%110 = llvm.mlir.undef : vector<1xi32> | |
%111 = llvm.insertelement %109, %110[%0 : i32] : vector<1xi32> | |
%112 = llvm.extractelement %111[%12 : i64] : vector<1xi32> | |
%113 = "llvm.intr.vector.reduce.add"(%107) : (vector<4xi32>) -> i32 | |
%114 = llvm.add %112, %113 : i32 | |
%115 = llvm.insertelement %114, %13[%12 : i64] : vector<1xi32> | |
%116 = llvm.extractelement %115[%10 : i64] : vector<1xi32> | |
%117 = llvm.insertelement %116, %110[%0 : i32] : vector<1xi32> | |
%118 = llvm.extractelement %117[%12 : i64] : vector<1xi32> | |
llvm.store %118, %108 : !llvm.ptr<i32> | |
%119 = llvm.add %72, %15 : i64 | |
llvm.br ^bb4(%119 : i64) | |
// Latch of the ^bb2 loop: %60 += 4.
^bb12: // pred: ^bb4 | |
%120 = llvm.add %60, %14 : i64 | |
llvm.br ^bb2(%120 : i64) | |
// Latch of the ^bb1 loop: %58 += 1.
^bb13: // pred: ^bb2 | |
%121 = llvm.add %58, %15 : i64 | |
llvm.br ^bb1(%121 : i64) | |
// Exit: return the i32 constant 0.
^bb14: // pred: ^bb1 | |
llvm.return %0 : i32 | |
} | |
} | |
} | |
} | |
// Host-side driver (private; called by @split_reduction_pass2 in this module).
// Sequence visible below:
//   1. Allocate %buffer with size 67108864 and fill all of it with the i32
//      pattern 1 through a Transfer-only command buffer; submit and wait.
//   2. Allocate %buffer_0 with size 1048576, record a dispatch of export
//      ordinal 0 of the executable held in the
//      @_executable__split_reduction_pass2_dispatch_0 global with %1 as
//      binding 0 and %buffer_0 as binding 1, then fill range
//      [524288, +524288) of %buffer_0 with the i32 pattern 128; submit, wait.
//   3. Build two 512x256 buffer views over the two 524288-sized halves of
//      %buffer_0 and compare them with check.expect_eq.
// (67108864 = 512*256*128*4 and 524288 = 512*256*4, so the sizes are
// presumably byte counts of i32 tensors - confirm.)
func.func private @_split_reduction_pass2() { | |
%c268435488_i32 = arith.constant 268435488 : i32 | |
%c8 = arith.constant 8 : index | |
%c16 = arith.constant 16 : index | |
%c-1_i32 = arith.constant -1 : i32 | |
%c-1_i64 = arith.constant -1 : i64 | |
%c1048576 = arith.constant 1048576 : index | |
%c0 = arith.constant 0 : index | |
%c524288 = arith.constant 524288 : index | |
%c67108864 = arith.constant 67108864 : index | |
%c1_i32 = arith.constant 1 : i32 | |
%c128_i32 = arith.constant 128 : i32 | |
%c512 = arith.constant 512 : index | |
%c256 = arith.constant 256 : index | |
%c1 = arith.constant 1 : index | |
%device = hal.ex.shared_device : !hal.device | |
%allocator = hal.device.allocator<%device : !hal.device> : !hal.allocator | |
// Step 1: input buffer, filled with 1s by its own command buffer.
%buffer = hal.allocator.allocate<%allocator : !hal.allocator> type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c67108864} | |
%cmd = hal.command_buffer.create device(%device : !hal.device) mode("OneShot|AllowInlineExecution") categories(Transfer) : !hal.command_buffer | |
hal.command_buffer.fill_buffer<%cmd : !hal.command_buffer> target(%buffer : !hal.buffer)[%c0, %c67108864] pattern(%c1_i32 : i32) | |
hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") | |
hal.command_buffer.finalize<%cmd : !hal.command_buffer> | |
// %0 is a null fence used as the wait operand of both queue submissions.
%0 = util.null : !hal.fence | |
%fence = hal.fence.create device(%device : !hal.device) flags("None") : !hal.fence | |
hal.device.queue.execute<%device : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) commands([%cmd]) | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32 | |
util.status.check_ok %status, "failed to wait on timepoint" | |
// The filled buffer is passed through util.do_not_optimize and its length is
// re-queried from the result rather than reusing %c67108864.
%1 = util.do_not_optimize(%buffer) : !hal.buffer | |
%len = hal.buffer.length<%1 : !hal.buffer> : index | |
// Step 2: result/expected buffer and the dispatch command buffer.
// NOTE(review): no fill of range [0, 524288) of %buffer_0 is recorded before
// the dispatch, while the dispatched llvm.func in this module loads from its
// second pointer before storing to it (read-modify-write in its ^bb11).
// Confirm whether the result is meant to depend on the prior buffer contents.
%buffer_0 = hal.allocator.allocate<%allocator : !hal.allocator> type("HostVisible|DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage|MappingScoped|MappingAccessRandom|Mapping") : !hal.buffer{%c1048576} | |
%cmd_1 = hal.command_buffer.create device(%device : !hal.device) mode("OneShot|AllowInlineExecution") categories("Transfer|Dispatch") : !hal.command_buffer | |
// Only one switch case is present: devices matching the
// "embedded-elf-x86_64" executable format. The pipeline layout and the
// executable are read from module globals here.
hal.device.switch<%device : !hal.device> | |
#hal.device.match.executable.format<"embedded-elf-x86_64"> { | |
%_pipeline_layout_0 = util.global.load @_pipeline_layout_0 : !hal.pipeline_layout | |
hal.command_buffer.push_descriptor_set<%cmd_1 : !hal.command_buffer> layout(%_pipeline_layout_0 : !hal.pipeline_layout)[%c0] bindings([ | |
%c0 = (%1 : !hal.buffer)[%c0, %len], | |
%c1 = (%buffer_0 : !hal.buffer)[%c0, %c1048576] | |
]) | |
// NOTE(review): %2 is not referenced by any other op in this function.
%2 = hal.command_buffer.device<%cmd_1 : !hal.command_buffer> : !hal.device | |
%_executable__split_reduction_pass2_dispatch_0 = util.global.load @_executable__split_reduction_pass2_dispatch_0 : !hal.executable | |
hal.command_buffer.dispatch<%cmd_1 : !hal.command_buffer> target(%_executable__split_reduction_pass2_dispatch_0 : !hal.executable)[0] workgroups([%c8, %c16, %c1]) | |
hal.return | |
} | |
// Expected values: the second half of %buffer_0 is filled with 128.
hal.command_buffer.fill_buffer<%cmd_1 : !hal.command_buffer> target(%buffer_0 : !hal.buffer)[%c524288, %c524288] pattern(%c128_i32 : i32) | |
hal.command_buffer.execution_barrier<%cmd_1 : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") | |
hal.command_buffer.finalize<%cmd_1 : !hal.command_buffer> | |
%fence_2 = hal.fence.create device(%device : !hal.device) flags("None") : !hal.fence | |
hal.device.queue.execute<%device : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence_2) commands([%cmd_1]) | |
%status_3 = hal.fence.await until([%fence_2]) timeout_millis(%c-1_i32) : i32 | |
util.status.check_ok %status_3, "failed to wait on timepoint" | |
// Step 3: %view covers [0, +524288) (dispatch output), %view_4 covers
// [524288, +524288) (the 128 fill); both are shaped 512x256.
%view = hal.buffer_view.create buffer(%buffer_0 : !hal.buffer)[%c0, %c524288] shape([%c512, %c256]) type(%c268435488_i32) encoding(%c1_i32) : !hal.buffer_view | |
%view_4 = hal.buffer_view.create buffer(%buffer_0 : !hal.buffer)[%c524288, %c524288] shape([%c512, %c256]) type(%c268435488_i32) encoding(%c1_i32) : !hal.buffer_view | |
check.expect_eq(%view, %view_4) : !hal.buffer_view | |
return | |
} | |
} | |
// -----// IR Dump After mlir::iree_compiler::IREE::HAL::InlineDeviceSwitchesPass (iree-hal-inline-device-switches) ('util.initializer' operation) //----- // | |
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}> | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]> | |
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert> | |
module attributes {hal.device.targets = [#device_target_llvm_cpu]} { | |
util.global private @_descriptor_set_layout_0 : !hal.descriptor_set_layout | |
util.initializer { | |
%device = hal.ex.shared_device : !hal.device | |
%descriptor_set_layout = hal.descriptor_set_layout.create device(%device : !hal.device) flags("None") bindings([#hal.descriptor_set.binding<0, storage_buffer, ReadOnly>, #hal.descriptor_set.binding<1, storage_buffer>]) : !hal.descriptor_set_layout | |
util.global.store %descriptor_set_layout, @_descriptor_set_layout_0 : !hal.descriptor_set_layout | |
util.initializer.return | |
} | |
util.global private @_pipeline_layout_0 : !hal.pipeline_layout | |
util.initializer { | |
%_descriptor_set_layout_0 = util.global.load @_descriptor_set_layout_0 : !hal.descriptor_set_layout | |
%device = hal.ex.shared_device : !hal.device | |
%pipeline_layout = hal.pipeline_layout.create device(%device : !hal.device) push_constants(0) layouts([%_descriptor_set_layout_0]) : !hal.pipeline_layout | |
util.global.store %pipeline_layout, @_pipeline_layout_0 : !hal.pipeline_layout | |
util.initializer.return | |
} | |
util.global private @_executable__split_reduction_pass2_dispatch_0 : !hal.executable | |
util.initializer { | |
%device = hal.ex.shared_device : !hal.device | |
%ok, %value = hal.device.query<%device : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false | |
cf.cond_br %value, ^bb1, ^bb2 | |
^bb1: // pred: ^bb0 | |
%_pipeline_layout_0 = util.global.load @_pipeline_layout_0 : !hal.pipeline_layout | |
%exe = hal.executable.create device(%device : !hal.device) target(@_split_reduction_pass2_dispatch_0::@embedded_elf_x86_64) layouts([%_pipeline_layout_0]) : !hal.executable | |
cf.br ^bb5(%exe : !hal.executable) | |
^bb2: // pred: ^bb0 | |
%true = arith.constant true | |
cf.cond_br %true, ^bb3, ^bb4 | |
^bb3: // pred: ^bb2 | |
%0 = util.null : !hal.executable | |
cf.br ^bb5(%0 : !hal.executable) | |
^bb4: // pred: ^bb2 | |
util.unreachable "device not supported in the compiled configuration" | |
^bb5(%1: !hal.executable): // 2 preds: ^bb1, ^bb3 | |
util.global.store %1, @_executable__split_reduction_pass2_dispatch_0 : !hal.executable | |
util.initializer.return | |
} | |
func.func @split_reduction_pass2() attributes {iree.abi.stub} { | |
call @_split_reduction_pass2() : () -> () | |
return | |
} | |
hal.executable private @_split_reduction_pass2_dispatch_0 { | |
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ { | |
hal.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 ordinal(0) layout(#pipeline_layout) attributes {translation_info = #translation} { | |
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index): | |
%c1 = arith.constant 1 : index | |
%c16 = arith.constant 16 : index | |
%c8 = arith.constant 8 : index | |
hal.return %c8, %c16, %c1 : index, index, index | |
} | |
builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} { | |
llvm.func internal @_split_reduction_pass2_dispatch_0_generic_512x256x128(%arg0: !llvm.ptr<struct<"iree_hal_executable_environment_v0_t", (ptr<i32>, ptr<func<i32 (ptr<func<i32 (ptr<i8>)>>, ptr<i8>)>>, ptr<ptr<func<i32 (ptr<i8>)>>>, struct<"iree_hal_processor_v0_t", (array<8 x i64>)>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>> {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {sym_visibility = "private"} { | |
%0 = llvm.mlir.constant(0 : i32) : i32 | |
%1 = llvm.mlir.constant(32768 : i64) : i64 | |
%2 = llvm.mlir.constant(128 : i64) : i64 | |
%3 = llvm.mlir.constant(256 : i64) : i64 | |
%4 = llvm.mlir.constant(1 : i64) : i64 | |
%5 = llvm.mlir.constant(63 : index) : i64 | |
%6 = llvm.mlir.constant(128 : index) : i64 | |
%7 = llvm.mlir.constant(256 : index) : i64 | |
%8 = llvm.mlir.constant(32768 : index) : i64 | |
%9 = llvm.mlir.constant(512 : index) : i64 | |
%10 = llvm.mlir.constant(0 : i64) : i64 | |
%11 = llvm.mlir.constant(dense<0> : vector<4xi32>) : vector<4xi32> | |
%12 = llvm.mlir.constant(0 : index) : i64 | |
%13 = llvm.mlir.constant(dense<0> : vector<1xi32>) : vector<1xi32> | |
%14 = llvm.mlir.constant(4 : index) : i64 | |
%15 = llvm.mlir.constant(1 : index) : i64 | |
%16 = llvm.mlir.constant(32 : index) : i64 | |
%17 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> | |
%18 = llvm.extractvalue %17[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)> | |
%19 = llvm.load %18 : !llvm.ptr<ptr<i8>> | |
%20 = llvm.bitcast %19 : !llvm.ptr<i8> to !llvm.ptr<i32> | |
%21 = llvm.ptrtoint %20 : !llvm.ptr<i32> to i64 | |
%22 = llvm.and %21, %5 : i64 | |
%23 = llvm.icmp "eq" %22, %12 : i64 | |
"llvm.intr.assume"(%23) : (i1) -> () | |
%24 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> | |
%25 = llvm.extractvalue %24[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)> | |
%26 = llvm.getelementptr %25[1] : (!llvm.ptr<ptr<i8>>) -> !llvm.ptr<ptr<i8>> | |
%27 = llvm.load %26 : !llvm.ptr<ptr<i8>> | |
%28 = llvm.bitcast %27 : !llvm.ptr<i8> to !llvm.ptr<i32> | |
%29 = llvm.ptrtoint %28 : !llvm.ptr<i32> to i64 | |
%30 = llvm.and %29, %5 : i64 | |
%31 = llvm.icmp "eq" %30, %12 : i64 | |
"llvm.intr.assume"(%31) : (i1) -> () | |
%32 = llvm.load %arg2 : !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>> | |
%33 = llvm.extractvalue %32[0] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)> | |
%34 = llvm.zext %33 : i32 to i64 | |
%35 = llvm.extractvalue %32[1] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)> | |
%36 = llvm.zext %35 : i32 to i64 | |
%37 = llvm.mul %36, %16 : i64 | |
%38 = llvm.mul %34, %16 : i64 | |
%39 = llvm.mlir.null : !llvm.ptr<i32> | |
%40 = llvm.getelementptr %39[4] : (!llvm.ptr<i32>) -> !llvm.ptr<i32> | |
%41 = llvm.ptrtoint %40 : !llvm.ptr<i32> to i64 | |
%42 = llvm.alloca %41 x i32 {alignment = 128 : i64} : (i64) -> !llvm.ptr<i32> | |
%43 = llvm.mul %12, %14 : i64 | |
%44 = llvm.add %43, %43 : i64 | |
%45 = llvm.add %44, %12 : i64 | |
%46 = llvm.getelementptr %42[%45] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32> | |
%47 = llvm.bitcast %46 : !llvm.ptr<i32> to !llvm.ptr<vector<4xi32>> | |
llvm.store %11, %47 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>> | |
%48 = llvm.mul %37, %7 : i64 | |
%49 = llvm.add %48, %12 : i64 | |
%50 = llvm.mul %38, %15 : i64 | |
%51 = llvm.add %49, %50 : i64 | |
%52 = llvm.mul %37, %8 : i64 | |
%53 = llvm.add %52, %12 : i64 | |
%54 = llvm.mul %38, %6 : i64 | |
%55 = llvm.add %53, %54 : i64 | |
%56 = llvm.mul %10, %15 : i64 | |
%57 = llvm.add %55, %56 : i64 | |
llvm.br ^bb1(%12 : i64) | |
^bb1(%58: i64): // 2 preds: ^bb0, ^bb13 | |
%59 = llvm.icmp "slt" %58, %16 : i64 | |
llvm.cond_br %59, ^bb2(%12 : i64), ^bb14 | |
^bb2(%60: i64): // 2 preds: ^bb1, ^bb12 | |
%61 = llvm.icmp "slt" %60, %16 : i64 | |
llvm.cond_br %61, ^bb3, ^bb13 | |
^bb3: // pred: ^bb2 | |
%62 = llvm.mul %58, %1 : i64 | |
%63 = llvm.add %57, %62 : i64 | |
%64 = llvm.mul %60, %2 : i64 | |
%65 = llvm.add %63, %64 : i64 | |
%66 = llvm.mul %10, %4 : i64 | |
%67 = llvm.add %65, %66 : i64 | |
%68 = llvm.mul %58, %3 : i64 | |
%69 = llvm.add %51, %68 : i64 | |
%70 = llvm.mul %60, %4 : i64 | |
%71 = llvm.add %69, %70 : i64 | |
llvm.br ^bb4(%12 : i64) | |
^bb4(%72: i64): // 2 preds: ^bb3, ^bb11 | |
%73 = llvm.icmp "slt" %72, %14 : i64 | |
llvm.cond_br %73, ^bb5, ^bb12 | |
^bb5: // pred: ^bb4 | |
%74 = llvm.mul %10, %9 : i64 | |
%75 = llvm.add %67, %74 : i64 | |
%76 = llvm.mul %72, %6 : i64 | |
%77 = llvm.add %75, %76 : i64 | |
%78 = llvm.add %77, %56 : i64 | |
%79 = llvm.mul %10, %14 : i64 | |
%80 = llvm.add %71, %79 : i64 | |
%81 = llvm.mul %72, %15 : i64 | |
%82 = llvm.add %80, %81 : i64 | |
%83 = llvm.alloca %41 x i32 {alignment = 128 : i64} : (i64) -> !llvm.ptr<i32> | |
llvm.br ^bb6(%12 : i64) | |
^bb6(%84: i64): // 2 preds: ^bb5, ^bb7 | |
%85 = llvm.icmp "slt" %84, %14 : i64 | |
llvm.cond_br %85, ^bb7, ^bb8 | |
^bb7: // pred: ^bb6 | |
%86 = llvm.add %44, %84 : i64 | |
%87 = llvm.getelementptr %42[%86] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32> | |
%88 = llvm.load %87 : !llvm.ptr<i32> | |
%89 = llvm.getelementptr %83[%86] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32> | |
llvm.store %88, %89 : !llvm.ptr<i32> | |
%90 = llvm.add %84, %15 : i64 | |
llvm.br ^bb6(%90 : i64) | |
^bb8: // pred: ^bb6 | |
%91 = llvm.bitcast %83 : !llvm.ptr<i32> to !llvm.ptr<vector<4xi32>> | |
%92 = llvm.load %91 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>> | |
llvm.br ^bb9(%12, %92 : i64, vector<4xi32>) | |
^bb9(%93: i64, %94: vector<4xi32>): // 2 preds: ^bb8, ^bb10 | |
%95 = llvm.icmp "slt" %93, %16 : i64 | |
llvm.cond_br %95, ^bb10, ^bb11 | |
^bb10: // pred: ^bb9 | |
%96 = llvm.mul %12, %6 : i64 | |
%97 = llvm.add %78, %96 : i64 | |
%98 = llvm.add %97, %96 : i64 | |
%99 = llvm.mul %93, %14 : i64 | |
%100 = llvm.add %98, %99 : i64 | |
%101 = llvm.add %100, %12 : i64 | |
%102 = llvm.getelementptr %20[%101] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32> | |
%103 = llvm.bitcast %102 : !llvm.ptr<i32> to !llvm.ptr<vector<4xi32>> | |
%104 = llvm.load %103 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>> | |
%105 = llvm.add %104, %94 : vector<4xi32> | |
%106 = llvm.add %93, %15 : i64 | |
llvm.br ^bb9(%106, %105 : i64, vector<4xi32>) | |
^bb11: // pred: ^bb9 | |
llvm.store %94, %91 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>> | |
%107 = llvm.load %91 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>> | |
%108 = llvm.getelementptr %28[%82] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32> | |
%109 = llvm.load %108 : !llvm.ptr<i32> | |
%110 = llvm.mlir.undef : vector<1xi32> | |
%111 = llvm.insertelement %109, %110[%0 : i32] : vector<1xi32> | |
%112 = llvm.extractelement %111[%12 : i64] : vector<1xi32> | |
%113 = "llvm.intr.vector.reduce.add"(%107) : (vector<4xi32>) -> i32 | |
%114 = llvm.add %112, %113 : i32 | |
%115 = llvm.insertelement %114, %13[%12 : i64] : vector<1xi32> | |
%116 = llvm.extractelement %115[%10 : i64] : vector<1xi32> | |
%117 = llvm.insertelement %116, %110[%0 : i32] : vector<1xi32> | |
%118 = llvm.extractelement %117[%12 : i64] : vector<1xi32> | |
llvm.store %118, %108 : !llvm.ptr<i32> | |
%119 = llvm.add %72, %15 : i64 | |
llvm.br ^bb4(%119 : i64) | |
^bb12: // pred: ^bb4 | |
%120 = llvm.add %60, %14 : i64 | |
llvm.br ^bb2(%120 : i64) | |
^bb13: // pred: ^bb2 | |
%121 = llvm.add %58, %15 : i64 | |
llvm.br ^bb1(%121 : i64) | |
^bb14: // pred: ^bb1 | |
llvm.return %0 : i32 | |
} | |
} | |
} | |
} | |
func.func private @_split_reduction_pass2() { | |
%c268435488_i32 = arith.constant 268435488 : i32 | |
%c8 = arith.constant 8 : index | |
%c16 = arith.constant 16 : index | |
%c-1_i32 = arith.constant -1 : i32 | |
%c-1_i64 = arith.constant -1 : i64 | |
%c1048576 = arith.constant 1048576 : index | |
%c0 = arith.constant 0 : index | |
%c524288 = arith.constant 524288 : index | |
%c67108864 = arith.constant 67108864 : index | |
%c1_i32 = arith.constant 1 : i32 | |
%c128_i32 = arith.constant 128 : i32 | |
%c512 = arith.constant 512 : index | |
%c256 = arith.constant 256 : index | |
%c1 = arith.constant 1 : index | |
%device = hal.ex.shared_device : !hal.device | |
%allocator = hal.device.allocator<%device : !hal.device> : !hal.allocator | |
%buffer = hal.allocator.allocate<%allocator : !hal.allocator> type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c67108864} | |
%cmd = hal.command_buffer.create device(%device : !hal.device) mode("OneShot|AllowInlineExecution") categories(Transfer) : !hal.command_buffer | |
hal.command_buffer.fill_buffer<%cmd : !hal.command_buffer> target(%buffer : !hal.buffer)[%c0, %c67108864] pattern(%c1_i32 : i32) | |
hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") | |
hal.command_buffer.finalize<%cmd : !hal.command_buffer> | |
%0 = util.null : !hal.fence | |
%fence = hal.fence.create device(%device : !hal.device) flags("None") : !hal.fence | |
hal.device.queue.execute<%device : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) commands([%cmd]) | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32 | |
util.status.check_ok %status, "failed to wait on timepoint" | |
%1 = util.do_not_optimize(%buffer) : !hal.buffer | |
%len = hal.buffer.length<%1 : !hal.buffer> : index | |
%buffer_0 = hal.allocator.allocate<%allocator : !hal.allocator> type("HostVisible|DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage|MappingScoped|MappingAccessRandom|Mapping") : !hal.buffer{%c1048576} | |
%cmd_1 = hal.command_buffer.create device(%device : !hal.device) mode("OneShot|AllowInlineExecution") categories("Transfer|Dispatch") : !hal.command_buffer | |
hal.device.switch<%device : !hal.device> | |
#hal.device.match.executable.format<"embedded-elf-x86_64"> { | |
%_pipeline_layout_0 = util.global.load @_pipeline_layout_0 : !hal.pipeline_layout | |
hal.command_buffer.push_descriptor_set<%cmd_1 : !hal.command_buffer> layout(%_pipeline_layout_0 : !hal.pipeline_layout)[%c0] bindings([ | |
%c0 = (%1 : !hal.buffer)[%c0, %len], | |
%c1 = (%buffer_0 : !hal.buffer)[%c0, %c1048576] | |
]) | |
%2 = hal.command_buffer.device<%cmd_1 : !hal.command_buffer> : !hal.device | |
%_executable__split_reduction_pass2_dispatch_0 = util.global.load @_executable__split_reduction_pass2_dispatch_0 : !hal.executable | |
hal.command_buffer.dispatch<%cmd_1 : !hal.command_buffer> target(%_executable__split_reduction_pass2_dispatch_0 : !hal.executable)[0] workgroups([%c8, %c16, %c1]) | |
hal.return | |
} | |
hal.command_buffer.fill_buffer<%cmd_1 : !hal.command_buffer> target(%buffer_0 : !hal.buffer)[%c524288, %c524288] pattern(%c128_i32 : i32) | |
hal.command_buffer.execution_barrier<%cmd_1 : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") | |
hal.command_buffer.finalize<%cmd_1 : !hal.command_buffer> | |
%fence_2 = hal.fence.create device(%device : !hal.device) flags("None") : !hal.fence | |
hal.device.queue.execute<%device : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence_2) commands([%cmd_1]) | |
%status_3 = hal.fence.await until([%fence_2]) timeout_millis(%c-1_i32) : i32 | |
util.status.check_ok %status_3, "failed to wait on timepoint" | |
%view = hal.buffer_view.create buffer(%buffer_0 : !hal.buffer)[%c0, %c524288] shape([%c512, %c256]) type(%c268435488_i32) encoding(%c1_i32) : !hal.buffer_view | |
%view_4 = hal.buffer_view.create buffer(%buffer_0 : !hal.buffer)[%c524288, %c524288] shape([%c512, %c256]) type(%c268435488_i32) encoding(%c1_i32) : !hal.buffer_view | |
check.expect_eq(%view, %view_4) : !hal.buffer_view | |
return | |
} | |
} | |
// -----// IR Dump After mlir::iree_compiler::IREE::HAL::InlineDeviceSwitchesPass (iree-hal-inline-device-switches) ('func.func' operation: @_split_reduction_pass2) //----- // | |
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}> | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]> | |
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert> | |
module attributes {hal.device.targets = [#device_target_llvm_cpu]} { | |
util.global private @_descriptor_set_layout_0 : !hal.descriptor_set_layout | |
util.initializer { | |
%device = hal.ex.shared_device : !hal.device | |
%descriptor_set_layout = hal.descriptor_set_layout.create device(%device : !hal.device) flags("None") bindings([#hal.descriptor_set.binding<0, storage_buffer, ReadOnly>, #hal.descriptor_set.binding<1, storage_buffer>]) : !hal.descriptor_set_layout | |
util.global.store %descriptor_set_layout, @_descriptor_set_layout_0 : !hal.descriptor_set_layout | |
util.initializer.return | |
} | |
util.global private @_pipeline_layout_0 : !hal.pipeline_layout | |
util.initializer { | |
%_descriptor_set_layout_0 = util.global.load @_descriptor_set_layout_0 : !hal.descriptor_set_layout | |
%device = hal.ex.shared_device : !hal.device | |
%pipeline_layout = hal.pipeline_layout.create device(%device : !hal.device) push_constants(0) layouts([%_descriptor_set_layout_0]) : !hal.pipeline_layout | |
util.global.store %pipeline_layout, @_pipeline_layout_0 : !hal.pipeline_layout | |
util.initializer.return | |
} | |
util.global private @_executable__split_reduction_pass2_dispatch_0 : !hal.executable | |
util.initializer { | |
%device = hal.ex.shared_device : !hal.device | |
%ok, %value = hal.device.query<%device : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false | |
cf.cond_br %value, ^bb1, ^bb2 | |
^bb1: // pred: ^bb0 | |
%_pipeline_layout_0 = util.global.load @_pipeline_layout_0 : !hal.pipeline_layout | |
%exe = hal.executable.create device(%device : !hal.device) target(@_split_reduction_pass2_dispatch_0::@embedded_elf_x86_64) layouts([%_pipeline_layout_0]) : !hal.executable | |
cf.br ^bb5(%exe : !hal.executable) | |
^bb2: // pred: ^bb0 | |
%true = arith.constant true | |
cf.cond_br %true, ^bb3, ^bb4 | |
^bb3: // pred: ^bb2 | |
%0 = util.null : !hal.executable | |
cf.br ^bb5(%0 : !hal.executable) | |
^bb4: // pred: ^bb2 | |
util.unreachable "device not supported in the compiled configuration" | |
^bb5(%1: !hal.executable): // 2 preds: ^bb1, ^bb3 | |
util.global.store %1, @_executable__split_reduction_pass2_dispatch_0 : !hal.executable | |
util.initializer.return | |
} | |
func.func @split_reduction_pass2() attributes {iree.abi.stub} { | |
call @_split_reduction_pass2() : () -> () | |
return | |
} | |
hal.executable private @_split_reduction_pass2_dispatch_0 { | |
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ { | |
hal.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 ordinal(0) layout(#pipeline_layout) attributes {translation_info = #translation} { | |
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index): | |
%c1 = arith.constant 1 : index | |
%c16 = arith.constant 16 : index | |
%c8 = arith.constant 8 : index | |
hal.return %c8, %c16, %c1 : index, index, index | |
} | |
builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} { | |
llvm.func internal @_split_reduction_pass2_dispatch_0_generic_512x256x128(%arg0: !llvm.ptr<struct<"iree_hal_executable_environment_v0_t", (ptr<i32>, ptr<func<i32 (ptr<func<i32 (ptr<i8>)>>, ptr<i8>)>>, ptr<ptr<func<i32 (ptr<i8>)>>>, struct<"iree_hal_processor_v0_t", (array<8 x i64>)>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>> {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {sym_visibility = "private"} { | |
%0 = llvm.mlir.constant(0 : i32) : i32 | |
%1 = llvm.mlir.constant(32768 : i64) : i64 | |
%2 = llvm.mlir.constant(128 : i64) : i64 | |
%3 = llvm.mlir.constant(256 : i64) : i64 | |
%4 = llvm.mlir.constant(1 : i64) : i64 | |
%5 = llvm.mlir.constant(63 : index) : i64 | |
%6 = llvm.mlir.constant(128 : index) : i64 | |
%7 = llvm.mlir.constant(256 : index) : i64 | |
%8 = llvm.mlir.constant(32768 : index) : i64 | |
%9 = llvm.mlir.constant(512 : index) : i64 | |
%10 = llvm.mlir.constant(0 : i64) : i64 | |
%11 = llvm.mlir.constant(dense<0> : vector<4xi32>) : vector<4xi32> | |
%12 = llvm.mlir.constant(0 : index) : i64 | |
%13 = llvm.mlir.constant(dense<0> : vector<1xi32>) : vector<1xi32> | |
%14 = llvm.mlir.constant(4 : index) : i64 | |
%15 = llvm.mlir.constant(1 : index) : i64 | |
%16 = llvm.mlir.constant(32 : index) : i64 | |
%17 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> | |
%18 = llvm.extractvalue %17[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)> | |
%19 = llvm.load %18 : !llvm.ptr<ptr<i8>> | |
%20 = llvm.bitcast %19 : !llvm.ptr<i8> to !llvm.ptr<i32> | |
%21 = llvm.ptrtoint %20 : !llvm.ptr<i32> to i64 | |
%22 = llvm.and %21, %5 : i64 | |
%23 = llvm.icmp "eq" %22, %12 : i64 | |
"llvm.intr.assume"(%23) : (i1) -> () | |
%24 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> | |
%25 = llvm.extractvalue %24[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)> | |
%26 = llvm.getelementptr %25[1] : (!llvm.ptr<ptr<i8>>) -> !llvm.ptr<ptr<i8>> | |
%27 = llvm.load %26 : !llvm.ptr<ptr<i8>> | |
%28 = llvm.bitcast %27 : !llvm.ptr<i8> to !llvm.ptr<i32> | |
%29 = llvm.ptrtoint %28 : !llvm.ptr<i32> to i64 | |
%30 = llvm.and %29, %5 : i64 | |
%31 = llvm.icmp "eq" %30, %12 : i64 | |
"llvm.intr.assume"(%31) : (i1) -> () | |
%32 = llvm.load %arg2 : !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>> | |
%33 = llvm.extractvalue %32[0] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)> | |
%34 = llvm.zext %33 : i32 to i64 | |
%35 = llvm.extractvalue %32[1] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)> | |
%36 = llvm.zext %35 : i32 to i64 | |
%37 = llvm.mul %36, %16 : i64 | |
%38 = llvm.mul %34, %16 : i64 | |
%39 = llvm.mlir.null : !llvm.ptr<i32> | |
%40 = llvm.getelementptr %39[4] : (!llvm.ptr<i32>) -> !llvm.ptr<i32> | |
%41 = llvm.ptrtoint %40 : !llvm.ptr<i32> to i64 | |
%42 = llvm.alloca %41 x i32 {alignment = 128 : i64} : (i64) -> !llvm.ptr<i32> | |
%43 = llvm.mul %12, %14 : i64 | |
%44 = llvm.add %43, %43 : i64 | |
%45 = llvm.add %44, %12 : i64 | |
%46 = llvm.getelementptr %42[%45] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32> | |
%47 = llvm.bitcast %46 : !llvm.ptr<i32> to !llvm.ptr<vector<4xi32>> | |
llvm.store %11, %47 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>> | |
%48 = llvm.mul %37, %7 : i64 | |
%49 = llvm.add %48, %12 : i64 | |
%50 = llvm.mul %38, %15 : i64 | |
%51 = llvm.add %49, %50 : i64 | |
%52 = llvm.mul %37, %8 : i64 | |
%53 = llvm.add %52, %12 : i64 | |
%54 = llvm.mul %38, %6 : i64 | |
%55 = llvm.add %53, %54 : i64 | |
%56 = llvm.mul %10, %15 : i64 | |
%57 = llvm.add %55, %56 : i64 | |
llvm.br ^bb1(%12 : i64) | |
^bb1(%58: i64): // 2 preds: ^bb0, ^bb13 | |
%59 = llvm.icmp "slt" %58, %16 : i64 | |
llvm.cond_br %59, ^bb2(%12 : i64), ^bb14 | |
^bb2(%60: i64): // 2 preds: ^bb1, ^bb12 | |
%61 = llvm.icmp "slt" %60, %16 : i64 | |
llvm.cond_br %61, ^bb3, ^bb13 | |
^bb3: // pred: ^bb2 | |
%62 = llvm.mul %58, %1 : i64 | |
%63 = llvm.add %57, %62 : i64 | |
%64 = llvm.mul %60, %2 : i64 | |
%65 = llvm.add %63, %64 : i64 | |
%66 = llvm.mul %10, %4 : i64 | |
%67 = llvm.add %65, %66 : i64 | |
%68 = llvm.mul %58, %3 : i64 | |
%69 = llvm.add %51, %68 : i64 | |
%70 = llvm.mul %60, %4 : i64 | |
%71 = llvm.add %69, %70 : i64 | |
llvm.br ^bb4(%12 : i64) | |
^bb4(%72: i64): // 2 preds: ^bb3, ^bb11 | |
%73 = llvm.icmp "slt" %72, %14 : i64 | |
llvm.cond_br %73, ^bb5, ^bb12 | |
^bb5: // pred: ^bb4 | |
%74 = llvm.mul %10, %9 : i64 | |
%75 = llvm.add %67, %74 : i64 | |
%76 = llvm.mul %72, %6 : i64 | |
%77 = llvm.add %75, %76 : i64 | |
%78 = llvm.add %77, %56 : i64 | |
%79 = llvm.mul %10, %14 : i64 | |
%80 = llvm.add %71, %79 : i64 | |
%81 = llvm.mul %72, %15 : i64 | |
%82 = llvm.add %80, %81 : i64 | |
%83 = llvm.alloca %41 x i32 {alignment = 128 : i64} : (i64) -> !llvm.ptr<i32> | |
llvm.br ^bb6(%12 : i64) | |
^bb6(%84: i64): // 2 preds: ^bb5, ^bb7 | |
%85 = llvm.icmp "slt" %84, %14 : i64 | |
llvm.cond_br %85, ^bb7, ^bb8 | |
^bb7: // pred: ^bb6 | |
%86 = llvm.add %44, %84 : i64 | |
%87 = llvm.getelementptr %42[%86] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32> | |
%88 = llvm.load %87 : !llvm.ptr<i32> | |
%89 = llvm.getelementptr %83[%86] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32> | |
llvm.store %88, %89 : !llvm.ptr<i32> | |
%90 = llvm.add %84, %15 : i64 | |
llvm.br ^bb6(%90 : i64) | |
^bb8: // pred: ^bb6 | |
%91 = llvm.bitcast %83 : !llvm.ptr<i32> to !llvm.ptr<vector<4xi32>> | |
%92 = llvm.load %91 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>> | |
llvm.br ^bb9(%12, %92 : i64, vector<4xi32>) | |
^bb9(%93: i64, %94: vector<4xi32>): // 2 preds: ^bb8, ^bb10 | |
%95 = llvm.icmp "slt" %93, %16 : i64 | |
llvm.cond_br %95, ^bb10, ^bb11 | |
^bb10: // pred: ^bb9 | |
%96 = llvm.mul %12, %6 : i64 | |
%97 = llvm.add %78, %96 : i64 | |
%98 = llvm.add %97, %96 : i64 | |
%99 = llvm.mul %93, %14 : i64 | |
%100 = llvm.add %98, %99 : i64 | |
%101 = llvm.add %100, %12 : i64 | |
%102 = llvm.getelementptr %20[%101] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32> | |
%103 = llvm.bitcast %102 : !llvm.ptr<i32> to !llvm.ptr<vector<4xi32>> | |
%104 = llvm.load %103 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>> | |
%105 = llvm.add %104, %94 : vector<4xi32> | |
%106 = llvm.add %93, %15 : i64 | |
llvm.br ^bb9(%106, %105 : i64, vector<4xi32>) | |
^bb11: // pred: ^bb9 | |
llvm.store %94, %91 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>> | |
%107 = llvm.load %91 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>> | |
%108 = llvm.getelementptr %28[%82] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32> | |
%109 = llvm.load %108 : !llvm.ptr<i32> | |
%110 = llvm.mlir.undef : vector<1xi32> | |
%111 = llvm.insertelement %109, %110[%0 : i32] : vector<1xi32> | |
%112 = llvm.extractelement %111[%12 : i64] : vector<1xi32> | |
%113 = "llvm.intr.vector.reduce.add"(%107) : (vector<4xi32>) -> i32 | |
%114 = llvm.add %112, %113 : i32 | |
%115 = llvm.insertelement %114, %13[%12 : i64] : vector<1xi32> | |
%116 = llvm.extractelement %115[%10 : i64] : vector<1xi32> | |
%117 = llvm.insertelement %116, %110[%0 : i32] : vector<1xi32> | |
%118 = llvm.extractelement %117[%12 : i64] : vector<1xi32> | |
llvm.store %118, %108 : !llvm.ptr<i32> | |
%119 = llvm.add %72, %15 : i64 | |
llvm.br ^bb4(%119 : i64) | |
^bb12: // pred: ^bb4 | |
%120 = llvm.add %60, %14 : i64 | |
llvm.br ^bb2(%120 : i64) | |
^bb13: // pred: ^bb2 | |
%121 = llvm.add %58, %15 : i64 | |
llvm.br ^bb1(%121 : i64) | |
^bb14: // pred: ^bb1 | |
llvm.return %0 : i32 | |
} | |
} | |
} | |
} | |
func.func private @_split_reduction_pass2() { | |
%c268435488_i32 = arith.constant 268435488 : i32 | |
%c8 = arith.constant 8 : index | |
%c16 = arith.constant 16 : index | |
%c-1_i32 = arith.constant -1 : i32 | |
%c-1_i64 = arith.constant -1 : i64 | |
%c1048576 = arith.constant 1048576 : index | |
%c0 = arith.constant 0 : index | |
%c524288 = arith.constant 524288 : index | |
%c67108864 = arith.constant 67108864 : index | |
%c1_i32 = arith.constant 1 : i32 | |
%c128_i32 = arith.constant 128 : i32 | |
%c512 = arith.constant 512 : index | |
%c256 = arith.constant 256 : index | |
%c1 = arith.constant 1 : index | |
%device = hal.ex.shared_device : !hal.device | |
%allocator = hal.device.allocator<%device : !hal.device> : !hal.allocator | |
%buffer = hal.allocator.allocate<%allocator : !hal.allocator> type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c67108864} | |
%cmd = hal.command_buffer.create device(%device : !hal.device) mode("OneShot|AllowInlineExecution") categories(Transfer) : !hal.command_buffer | |
hal.command_buffer.fill_buffer<%cmd : !hal.command_buffer> target(%buffer : !hal.buffer)[%c0, %c67108864] pattern(%c1_i32 : i32) | |
hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") | |
hal.command_buffer.finalize<%cmd : !hal.command_buffer> | |
%0 = util.null : !hal.fence | |
%fence = hal.fence.create device(%device : !hal.device) flags("None") : !hal.fence | |
hal.device.queue.execute<%device : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) commands([%cmd]) | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32 | |
util.status.check_ok %status, "failed to wait on timepoint" | |
%1 = util.do_not_optimize(%buffer) : !hal.buffer | |
%len = hal.buffer.length<%1 : !hal.buffer> : index | |
%buffer_0 = hal.allocator.allocate<%allocator : !hal.allocator> type("HostVisible|DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage|MappingScoped|MappingAccessRandom|Mapping") : !hal.buffer{%c1048576} | |
%cmd_1 = hal.command_buffer.create device(%device : !hal.device) mode("OneShot|AllowInlineExecution") categories("Transfer|Dispatch") : !hal.command_buffer | |
%ok, %value = hal.device.query<%device : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false | |
cf.cond_br %value, ^bb1, ^bb2 | |
^bb1: // pred: ^bb0 | |
%_pipeline_layout_0 = util.global.load @_pipeline_layout_0 : !hal.pipeline_layout | |
hal.command_buffer.push_descriptor_set<%cmd_1 : !hal.command_buffer> layout(%_pipeline_layout_0 : !hal.pipeline_layout)[%c0] bindings([ | |
%c0 = (%1 : !hal.buffer)[%c0, %len], | |
%c1 = (%buffer_0 : !hal.buffer)[%c0, %c1048576] | |
]) | |
%2 = hal.command_buffer.device<%cmd_1 : !hal.command_buffer> : !hal.device | |
%_executable__split_reduction_pass2_dispatch_0 = util.global.load @_executable__split_reduction_pass2_dispatch_0 : !hal.executable | |
hal.command_buffer.dispatch<%cmd_1 : !hal.command_buffer> target(%_executable__split_reduction_pass2_dispatch_0 : !hal.executable)[0] workgroups([%c8, %c16, %c1]) | |
cf.br ^bb3 | |
^bb2: // pred: ^bb0 | |
util.unreachable "device not supported in the compiled configuration" | |
^bb3: // pred: ^bb1 | |
hal.command_buffer.fill_buffer<%cmd_1 : !hal.command_buffer> target(%buffer_0 : !hal.buffer)[%c524288, %c524288] pattern(%c128_i32 : i32) | |
hal.command_buffer.execution_barrier<%cmd_1 : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") | |
hal.command_buffer.finalize<%cmd_1 : !hal.command_buffer> | |
%fence_2 = hal.fence.create device(%device : !hal.device) flags("None") : !hal.fence | |
hal.device.queue.execute<%device : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence_2) commands([%cmd_1]) | |
%status_3 = hal.fence.await until([%fence_2]) timeout_millis(%c-1_i32) : i32 | |
util.status.check_ok %status_3, "failed to wait on timepoint" | |
%view = hal.buffer_view.create buffer(%buffer_0 : !hal.buffer)[%c0, %c524288] shape([%c512, %c256]) type(%c268435488_i32) encoding(%c1_i32) : !hal.buffer_view | |
%view_4 = hal.buffer_view.create buffer(%buffer_0 : !hal.buffer)[%c524288, %c524288] shape([%c512, %c256]) type(%c268435488_i32) encoding(%c1_i32) : !hal.buffer_view | |
check.expect_eq(%view, %view_4) : !hal.buffer_view | |
return | |
} | |
} | |
// -----// IR Dump After mlir::iree_compiler::IREE::HAL::MemoizeDeviceQueriesPass (iree-hal-memoize-device-queries) ('builtin.module' operation) //----- // | |
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}> | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]> | |
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert> | |
module attributes {hal.device.targets = [#device_target_llvm_cpu]} { | |
util.global private @_device_query_0 : i1 | |
util.global private @_device_query_0_ok : i1 | |
util.initializer { | |
%device = hal.ex.shared_device : !hal.device | |
%ok, %value = hal.device.query<%device : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false | |
util.global.store %ok, @_device_query_0_ok : i1 | |
util.global.store %value, @_device_query_0 : i1 | |
util.initializer.return | |
} | |
util.global private @_descriptor_set_layout_0 : !hal.descriptor_set_layout | |
util.initializer { | |
%device = hal.ex.shared_device : !hal.device | |
%descriptor_set_layout = hal.descriptor_set_layout.create device(%device : !hal.device) flags("None") bindings([#hal.descriptor_set.binding<0, storage_buffer, ReadOnly>, #hal.descriptor_set.binding<1, storage_buffer>]) : !hal.descriptor_set_layout | |
util.global.store %descriptor_set_layout, @_descriptor_set_layout_0 : !hal.descriptor_set_layout | |
util.initializer.return | |
} | |
util.global private @_pipeline_layout_0 : !hal.pipeline_layout | |
util.initializer { | |
%_descriptor_set_layout_0 = util.global.load @_descriptor_set_layout_0 : !hal.descriptor_set_layout | |
%device = hal.ex.shared_device : !hal.device | |
%pipeline_layout = hal.pipeline_layout.create device(%device : !hal.device) push_constants(0) layouts([%_descriptor_set_layout_0]) : !hal.pipeline_layout | |
util.global.store %pipeline_layout, @_pipeline_layout_0 : !hal.pipeline_layout | |
util.initializer.return | |
} | |
util.global private @_executable__split_reduction_pass2_dispatch_0 : !hal.executable | |
util.initializer { | |
%device = hal.ex.shared_device : !hal.device | |
%_device_query_0_ok = util.global.load @_device_query_0_ok : i1 | |
%_device_query_0 = util.global.load @_device_query_0 : i1 | |
cf.cond_br %_device_query_0, ^bb1, ^bb2 | |
^bb1: // pred: ^bb0 | |
%_pipeline_layout_0 = util.global.load @_pipeline_layout_0 : !hal.pipeline_layout | |
%exe = hal.executable.create device(%device : !hal.device) target(@_split_reduction_pass2_dispatch_0::@embedded_elf_x86_64) layouts([%_pipeline_layout_0]) : !hal.executable | |
cf.br ^bb5(%exe : !hal.executable) | |
^bb2: // pred: ^bb0 | |
%true = arith.constant true | |
cf.cond_br %true, ^bb3, ^bb4 | |
^bb3: // pred: ^bb2 | |
%0 = util.null : !hal.executable | |
cf.br ^bb5(%0 : !hal.executable) | |
^bb4: // pred: ^bb2 | |
util.unreachable "device not supported in the compiled configuration" | |
^bb5(%1: !hal.executable): // 2 preds: ^bb1, ^bb3 | |
util.global.store %1, @_executable__split_reduction_pass2_dispatch_0 : !hal.executable | |
util.initializer.return | |
} | |
func.func @split_reduction_pass2() attributes {iree.abi.stub} { | |
call @_split_reduction_pass2() : () -> () | |
return | |
} | |
hal.executable private @_split_reduction_pass2_dispatch_0 { | |
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ { | |
hal.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 ordinal(0) layout(#pipeline_layout) attributes {translation_info = #translation} { | |
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index): | |
%c1 = arith.constant 1 : index | |
%c16 = arith.constant 16 : index | |
%c8 = arith.constant 8 : index | |
hal.return %c8, %c16, %c1 : index, index, index | |
} | |
builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} { | |
llvm.func internal @_split_reduction_pass2_dispatch_0_generic_512x256x128(%arg0: !llvm.ptr<struct<"iree_hal_executable_environment_v0_t", (ptr<i32>, ptr<func<i32 (ptr<func<i32 (ptr<i8>)>>, ptr<i8>)>>, ptr<ptr<func<i32 (ptr<i8>)>>>, struct<"iree_hal_processor_v0_t", (array<8 x i64>)>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>> {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {sym_visibility = "private"} { | |
%0 = llvm.mlir.constant(0 : i32) : i32 | |
%1 = llvm.mlir.constant(32768 : i64) : i64 | |
%2 = llvm.mlir.constant(128 : i64) : i64 | |
%3 = llvm.mlir.constant(256 : i64) : i64 | |
%4 = llvm.mlir.constant(1 : i64) : i64 | |
%5 = llvm.mlir.constant(63 : index) : i64 | |
%6 = llvm.mlir.constant(128 : index) : i64 | |
%7 = llvm.mlir.constant(256 : index) : i64 | |
%8 = llvm.mlir.constant(32768 : index) : i64 | |
%9 = llvm.mlir.constant(512 : index) : i64 | |
%10 = llvm.mlir.constant(0 : i64) : i64 | |
%11 = llvm.mlir.constant(dense<0> : vector<4xi32>) : vector<4xi32> | |
%12 = llvm.mlir.constant(0 : index) : i64 | |
%13 = llvm.mlir.constant(dense<0> : vector<1xi32>) : vector<1xi32> | |
%14 = llvm.mlir.constant(4 : index) : i64 | |
%15 = llvm.mlir.constant(1 : index) : i64 | |
%16 = llvm.mlir.constant(32 : index) : i64 | |
%17 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> | |
%18 = llvm.extractvalue %17[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)> | |
%19 = llvm.load %18 : !llvm.ptr<ptr<i8>> | |
%20 = llvm.bitcast %19 : !llvm.ptr<i8> to !llvm.ptr<i32> | |
%21 = llvm.ptrtoint %20 : !llvm.ptr<i32> to i64 | |
%22 = llvm.and %21, %5 : i64 | |
%23 = llvm.icmp "eq" %22, %12 : i64 | |
"llvm.intr.assume"(%23) : (i1) -> () | |
%24 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> | |
%25 = llvm.extractvalue %24[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)> | |
%26 = llvm.getelementptr %25[1] : (!llvm.ptr<ptr<i8>>) -> !llvm.ptr<ptr<i8>> | |
%27 = llvm.load %26 : !llvm.ptr<ptr<i8>> | |
%28 = llvm.bitcast %27 : !llvm.ptr<i8> to !llvm.ptr<i32> | |
%29 = llvm.ptrtoint %28 : !llvm.ptr<i32> to i64 | |
%30 = llvm.and %29, %5 : i64 | |
%31 = llvm.icmp "eq" %30, %12 : i64 | |
"llvm.intr.assume"(%31) : (i1) -> () | |
%32 = llvm.load %arg2 : !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>> | |
%33 = llvm.extractvalue %32[0] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)> | |
%34 = llvm.zext %33 : i32 to i64 | |
%35 = llvm.extractvalue %32[1] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)> | |
%36 = llvm.zext %35 : i32 to i64 | |
%37 = llvm.mul %36, %16 : i64 | |
%38 = llvm.mul %34, %16 : i64 | |
%39 = llvm.mlir.null : !llvm.ptr<i32> | |
%40 = llvm.getelementptr %39[4] : (!llvm.ptr<i32>) -> !llvm.ptr<i32> | |
%41 = llvm.ptrtoint %40 : !llvm.ptr<i32> to i64 | |
%42 = llvm.alloca %41 x i32 {alignment = 128 : i64} : (i64) -> !llvm.ptr<i32> | |
%43 = llvm.mul %12, %14 : i64 | |
%44 = llvm.add %43, %43 : i64 | |
%45 = llvm.add %44, %12 : i64 | |
%46 = llvm.getelementptr %42[%45] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32> | |
%47 = llvm.bitcast %46 : !llvm.ptr<i32> to !llvm.ptr<vector<4xi32>> | |
llvm.store %11, %47 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>> | |
%48 = llvm.mul %37, %7 : i64 | |
%49 = llvm.add %48, %12 : i64 | |
%50 = llvm.mul %38, %15 : i64 | |
%51 = llvm.add %49, %50 : i64 | |
%52 = llvm.mul %37, %8 : i64 | |
%53 = llvm.add %52, %12 : i64 | |
%54 = llvm.mul %38, %6 : i64 | |
%55 = llvm.add %53, %54 : i64 | |
%56 = llvm.mul %10, %15 : i64 | |
%57 = llvm.add %55, %56 : i64 | |
llvm.br ^bb1(%12 : i64) | |
^bb1(%58: i64): // 2 preds: ^bb0, ^bb13 | |
%59 = llvm.icmp "slt" %58, %16 : i64 | |
llvm.cond_br %59, ^bb2(%12 : i64), ^bb14 | |
^bb2(%60: i64): // 2 preds: ^bb1, ^bb12 | |
%61 = llvm.icmp "slt" %60, %16 : i64 | |
llvm.cond_br %61, ^bb3, ^bb13 | |
^bb3: // pred: ^bb2 | |
%62 = llvm.mul %58, %1 : i64 | |
%63 = llvm.add %57, %62 : i64 | |
%64 = llvm.mul %60, %2 : i64 | |
%65 = llvm.add %63, %64 : i64 | |
%66 = llvm.mul %10, %4 : i64 | |
%67 = llvm.add %65, %66 : i64 | |
%68 = llvm.mul %58, %3 : i64 | |
%69 = llvm.add %51, %68 : i64 | |
%70 = llvm.mul %60, %4 : i64 | |
%71 = llvm.add %69, %70 : i64 | |
llvm.br ^bb4(%12 : i64) | |
^bb4(%72: i64): // 2 preds: ^bb3, ^bb11 | |
%73 = llvm.icmp "slt" %72, %14 : i64 | |
llvm.cond_br %73, ^bb5, ^bb12 | |
^bb5: // pred: ^bb4 | |
%74 = llvm.mul %10, %9 : i64 | |
%75 = llvm.add %67, %74 : i64 | |
%76 = llvm.mul %72, %6 : i64 | |
%77 = llvm.add %75, %76 : i64 | |
%78 = llvm.add %77, %56 : i64 | |
%79 = llvm.mul %10, %14 : i64 | |
%80 = llvm.add %71, %79 : i64 | |
%81 = llvm.mul %72, %15 : i64 | |
%82 = llvm.add %80, %81 : i64 | |
%83 = llvm.alloca %41 x i32 {alignment = 128 : i64} : (i64) -> !llvm.ptr<i32> | |
llvm.br ^bb6(%12 : i64) | |
^bb6(%84: i64): // 2 preds: ^bb5, ^bb7 | |
%85 = llvm.icmp "slt" %84, %14 : i64 | |
llvm.cond_br %85, ^bb7, ^bb8 | |
^bb7: // pred: ^bb6 | |
%86 = llvm.add %44, %84 : i64 | |
%87 = llvm.getelementptr %42[%86] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32> | |
%88 = llvm.load %87 : !llvm.ptr<i32> | |
%89 = llvm.getelementptr %83[%86] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32> | |
llvm.store %88, %89 : !llvm.ptr<i32> | |
%90 = llvm.add %84, %15 : i64 | |
llvm.br ^bb6(%90 : i64) | |
^bb8: // pred: ^bb6 | |
%91 = llvm.bitcast %83 : !llvm.ptr<i32> to !llvm.ptr<vector<4xi32>> | |
%92 = llvm.load %91 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>> | |
llvm.br ^bb9(%12, %92 : i64, vector<4xi32>) | |
^bb9(%93: i64, %94: vector<4xi32>): // 2 preds: ^bb8, ^bb10 | |
%95 = llvm.icmp "slt" %93, %16 : i64 | |
llvm.cond_br %95, ^bb10, ^bb11 | |
^bb10: // pred: ^bb9 | |
%96 = llvm.mul %12, %6 : i64 | |
%97 = llvm.add %78, %96 : i64 | |
%98 = llvm.add %97, %96 : i64 | |
%99 = llvm.mul %93, %14 : i64 | |
%100 = llvm.add %98, %99 : i64 | |
%101 = llvm.add %100, %12 : i64 | |
%102 = llvm.getelementptr %20[%101] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32> | |
%103 = llvm.bitcast %102 : !llvm.ptr<i32> to !llvm.ptr<vector<4xi32>> | |
%104 = llvm.load %103 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>> | |
%105 = llvm.add %104, %94 : vector<4xi32> | |
%106 = llvm.add %93, %15 : i64 | |
llvm.br ^bb9(%106, %105 : i64, vector<4xi32>) | |
^bb11: // pred: ^bb9 | |
llvm.store %94, %91 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>> | |
%107 = llvm.load %91 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>> | |
%108 = llvm.getelementptr %28[%82] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32> | |
%109 = llvm.load %108 : !llvm.ptr<i32> | |
%110 = llvm.mlir.undef : vector<1xi32> | |
%111 = llvm.insertelement %109, %110[%0 : i32] : vector<1xi32> | |
%112 = llvm.extractelement %111[%12 : i64] : vector<1xi32> | |
%113 = "llvm.intr.vector.reduce.add"(%107) : (vector<4xi32>) -> i32 | |
%114 = llvm.add %112, %113 : i32 | |
%115 = llvm.insertelement %114, %13[%12 : i64] : vector<1xi32> | |
%116 = llvm.extractelement %115[%10 : i64] : vector<1xi32> | |
%117 = llvm.insertelement %116, %110[%0 : i32] : vector<1xi32> | |
%118 = llvm.extractelement %117[%12 : i64] : vector<1xi32> | |
llvm.store %118, %108 : !llvm.ptr<i32> | |
%119 = llvm.add %72, %15 : i64 | |
llvm.br ^bb4(%119 : i64) | |
^bb12: // pred: ^bb4 | |
%120 = llvm.add %60, %14 : i64 | |
llvm.br ^bb2(%120 : i64) | |
^bb13: // pred: ^bb2 | |
%121 = llvm.add %58, %15 : i64 | |
llvm.br ^bb1(%121 : i64) | |
^bb14: // pred: ^bb1 | |
llvm.return %0 : i32 | |
} | |
} | |
} | |
} | |
func.func private @_split_reduction_pass2() { | |
%c268435488_i32 = arith.constant 268435488 : i32 | |
%c8 = arith.constant 8 : index | |
%c16 = arith.constant 16 : index | |
%c-1_i32 = arith.constant -1 : i32 | |
%c-1_i64 = arith.constant -1 : i64 | |
%c1048576 = arith.constant 1048576 : index | |
%c0 = arith.constant 0 : index | |
%c524288 = arith.constant 524288 : index | |
%c67108864 = arith.constant 67108864 : index | |
%c1_i32 = arith.constant 1 : i32 | |
%c128_i32 = arith.constant 128 : i32 | |
%c512 = arith.constant 512 : index | |
%c256 = arith.constant 256 : index | |
%c1 = arith.constant 1 : index | |
%device = hal.ex.shared_device : !hal.device | |
%allocator = hal.device.allocator<%device : !hal.device> : !hal.allocator | |
%buffer = hal.allocator.allocate<%allocator : !hal.allocator> type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c67108864} | |
%cmd = hal.command_buffer.create device(%device : !hal.device) mode("OneShot|AllowInlineExecution") categories(Transfer) : !hal.command_buffer | |
hal.command_buffer.fill_buffer<%cmd : !hal.command_buffer> target(%buffer : !hal.buffer)[%c0, %c67108864] pattern(%c1_i32 : i32) | |
hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") | |
hal.command_buffer.finalize<%cmd : !hal.command_buffer> | |
%0 = util.null : !hal.fence | |
%fence = hal.fence.create device(%device : !hal.device) flags("None") : !hal.fence | |
hal.device.queue.execute<%device : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) commands([%cmd]) | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32 | |
util.status.check_ok %status, "failed to wait on timepoint" | |
%1 = util.do_not_optimize(%buffer) : !hal.buffer | |
%len = hal.buffer.length<%1 : !hal.buffer> : index | |
%buffer_0 = hal.allocator.allocate<%allocator : !hal.allocator> type("HostVisible|DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage|MappingScoped|MappingAccessRandom|Mapping") : !hal.buffer{%c1048576} | |
%cmd_1 = hal.command_buffer.create device(%device : !hal.device) mode("OneShot|AllowInlineExecution") categories("Transfer|Dispatch") : !hal.command_buffer | |
%_device_query_0_ok = util.global.load @_device_query_0_ok : i1 | |
%_device_query_0 = util.global.load @_device_query_0 : i1 | |
cf.cond_br %_device_query_0, ^bb1, ^bb2 | |
^bb1: // pred: ^bb0 | |
%_pipeline_layout_0 = util.global.load @_pipeline_layout_0 : !hal.pipeline_layout | |
hal.command_buffer.push_descriptor_set<%cmd_1 : !hal.command_buffer> layout(%_pipeline_layout_0 : !hal.pipeline_layout)[%c0] bindings([ | |
%c0 = (%1 : !hal.buffer)[%c0, %len], | |
%c1 = (%buffer_0 : !hal.buffer)[%c0, %c1048576] | |
]) | |
%2 = hal.command_buffer.device<%cmd_1 : !hal.command_buffer> : !hal.device | |
%_executable__split_reduction_pass2_dispatch_0 = util.global.load @_executable__split_reduction_pass2_dispatch_0 : !hal.executable | |
hal.command_buffer.dispatch<%cmd_1 : !hal.command_buffer> target(%_executable__split_reduction_pass2_dispatch_0 : !hal.executable)[0] workgroups([%c8, %c16, %c1]) | |
cf.br ^bb3 | |
^bb2: // pred: ^bb0 | |
util.unreachable "device not supported in the compiled configuration" | |
^bb3: // pred: ^bb1 | |
hal.command_buffer.fill_buffer<%cmd_1 : !hal.command_buffer> target(%buffer_0 : !hal.buffer)[%c524288, %c524288] pattern(%c128_i32 : i32) | |
hal.command_buffer.execution_barrier<%cmd_1 : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") | |
hal.command_buffer.finalize<%cmd_1 : !hal.command_buffer> | |
%fence_2 = hal.fence.create device(%device : !hal.device) flags("None") : !hal.fence | |
hal.device.queue.execute<%device : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence_2) commands([%cmd_1]) | |
%status_3 = hal.fence.await until([%fence_2]) timeout_millis(%c-1_i32) : i32 | |
util.status.check_ok %status_3, "failed to wait on timepoint" | |
%view = hal.buffer_view.create buffer(%buffer_0 : !hal.buffer)[%c0, %c524288] shape([%c512, %c256]) type(%c268435488_i32) encoding(%c1_i32) : !hal.buffer_view | |
%view_4 = hal.buffer_view.create buffer(%buffer_0 : !hal.buffer)[%c524288, %c524288] shape([%c512, %c256]) type(%c268435488_i32) encoding(%c1_i32) : !hal.buffer_view | |
check.expect_eq(%view, %view_4) : !hal.buffer_view | |
return | |
} | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) ('builtin.module' operation) //----- // | |
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}> | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]> | |
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert> | |
module attributes {hal.device.targets = [#device_target_llvm_cpu]} { | |
util.global private @_device_query_0 : i1 | |
util.global private @_device_query_0_ok : i1 | |
util.initializer { | |
%device = hal.ex.shared_device : !hal.device | |
%ok, %value = hal.device.query<%device : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false | |
util.global.store %ok, @_device_query_0_ok : i1 | |
util.global.store %value, @_device_query_0 : i1 | |
util.initializer.return | |
} | |
util.global private @_descriptor_set_layout_0 : !hal.descriptor_set_layout | |
util.initializer { | |
%device = hal.ex.shared_device : !hal.device | |
%descriptor_set_layout = hal.descriptor_set_layout.create device(%device : !hal.device) flags("None") bindings([#hal.descriptor_set.binding<0, storage_buffer, ReadOnly>, #hal.descriptor_set.binding<1, storage_buffer>]) : !hal.descriptor_set_layout | |
util.global.store %descriptor_set_layout, @_descriptor_set_layout_0 : !hal.descriptor_set_layout | |
util.initializer.return | |
} | |
util.global private @_pipeline_layout_0 : !hal.pipeline_layout | |
util.initializer { | |
%_descriptor_set_layout_0 = util.global.load @_descriptor_set_layout_0 : !hal.descriptor_set_layout | |
%device = hal.ex.shared_device : !hal.device | |
%pipeline_layout = hal.pipeline_layout.create device(%device : !hal.device) push_constants(0) layouts([%_descriptor_set_layout_0]) : !hal.pipeline_layout | |
util.global.store %pipeline_layout, @_pipeline_layout_0 : !hal.pipeline_layout | |
util.initializer.return | |
} | |
util.global private @_executable__split_reduction_pass2_dispatch_0 : !hal.executable | |
util.initializer { | |
%device = hal.ex.shared_device : !hal.device | |
%_device_query_0 = util.global.load @_device_query_0 : i1 | |
cf.cond_br %_device_query_0, ^bb1, ^bb2 | |
^bb1: // pred: ^bb0 | |
%_pipeline_layout_0 = util.global.load @_pipeline_layout_0 : !hal.pipeline_layout | |
%exe = hal.executable.create device(%device : !hal.device) target(@_split_reduction_pass2_dispatch_0::@embedded_elf_x86_64) layouts([%_pipeline_layout_0]) : !hal.executable | |
cf.br ^bb3(%exe : !hal.executable) | |
^bb2: // pred: ^bb0 | |
%0 = util.null : !hal.executable | |
cf.br ^bb3(%0 : !hal.executable) | |
^bb3(%1: !hal.executable): // 2 preds: ^bb1, ^bb2 | |
util.global.store %1, @_executable__split_reduction_pass2_dispatch_0 : !hal.executable | |
util.initializer.return | |
} | |
func.func @split_reduction_pass2() attributes {iree.abi.stub} { | |
call @_split_reduction_pass2() : () -> () | |
return | |
} | |
hal.executable private @_split_reduction_pass2_dispatch_0 { | |
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ { | |
hal.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 ordinal(0) layout(#pipeline_layout) attributes {translation_info = #translation} { | |
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index): | |
%c1 = arith.constant 1 : index | |
%c16 = arith.constant 16 : index | |
%c8 = arith.constant 8 : index | |
hal.return %c8, %c16, %c1 : index, index, index | |
} | |
builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} { | |
llvm.func internal @_split_reduction_pass2_dispatch_0_generic_512x256x128(%arg0: !llvm.ptr<struct<"iree_hal_executable_environment_v0_t", (ptr<i32>, ptr<func<i32 (ptr<func<i32 (ptr<i8>)>>, ptr<i8>)>>, ptr<ptr<func<i32 (ptr<i8>)>>>, struct<"iree_hal_processor_v0_t", (array<8 x i64>)>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>> {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {sym_visibility = "private"} { | |
%0 = llvm.mlir.constant(0 : i32) : i32 | |
%1 = llvm.mlir.constant(32768 : i64) : i64 | |
%2 = llvm.mlir.constant(128 : i64) : i64 | |
%3 = llvm.mlir.constant(256 : i64) : i64 | |
%4 = llvm.mlir.constant(1 : i64) : i64 | |
%5 = llvm.mlir.constant(63 : index) : i64 | |
%6 = llvm.mlir.constant(128 : index) : i64 | |
%7 = llvm.mlir.constant(256 : index) : i64 | |
%8 = llvm.mlir.constant(32768 : index) : i64 | |
%9 = llvm.mlir.constant(512 : index) : i64 | |
%10 = llvm.mlir.constant(0 : i64) : i64 | |
%11 = llvm.mlir.constant(dense<0> : vector<4xi32>) : vector<4xi32> | |
%12 = llvm.mlir.constant(0 : index) : i64 | |
%13 = llvm.mlir.constant(dense<0> : vector<1xi32>) : vector<1xi32> | |
%14 = llvm.mlir.constant(4 : index) : i64 | |
%15 = llvm.mlir.constant(1 : index) : i64 | |
%16 = llvm.mlir.constant(32 : index) : i64 | |
%17 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> | |
%18 = llvm.extractvalue %17[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)> | |
%19 = llvm.load %18 : !llvm.ptr<ptr<i8>> | |
%20 = llvm.bitcast %19 : !llvm.ptr<i8> to !llvm.ptr<i32> | |
%21 = llvm.ptrtoint %20 : !llvm.ptr<i32> to i64 | |
%22 = llvm.and %21, %5 : i64 | |
%23 = llvm.icmp "eq" %22, %12 : i64 | |
"llvm.intr.assume"(%23) : (i1) -> () | |
%24 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> | |
%25 = llvm.extractvalue %24[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)> | |
%26 = llvm.getelementptr %25[1] : (!llvm.ptr<ptr<i8>>) -> !llvm.ptr<ptr<i8>> | |
%27 = llvm.load %26 : !llvm.ptr<ptr<i8>> | |
%28 = llvm.bitcast %27 : !llvm.ptr<i8> to !llvm.ptr<i32> | |
%29 = llvm.ptrtoint %28 : !llvm.ptr<i32> to i64 | |
%30 = llvm.and %29, %5 : i64 | |
%31 = llvm.icmp "eq" %30, %12 : i64 | |
"llvm.intr.assume"(%31) : (i1) -> () | |
%32 = llvm.load %arg2 : !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>> | |
%33 = llvm.extractvalue %32[0] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)> | |
%34 = llvm.zext %33 : i32 to i64 | |
%35 = llvm.extractvalue %32[1] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)> | |
%36 = llvm.zext %35 : i32 to i64 | |
%37 = llvm.mul %36, %16 : i64 | |
%38 = llvm.mul %34, %16 : i64 | |
%39 = llvm.mlir.null : !llvm.ptr<i32> | |
%40 = llvm.getelementptr %39[4] : (!llvm.ptr<i32>) -> !llvm.ptr<i32> | |
%41 = llvm.ptrtoint %40 : !llvm.ptr<i32> to i64 | |
%42 = llvm.alloca %41 x i32 {alignment = 128 : i64} : (i64) -> !llvm.ptr<i32> | |
%43 = llvm.mul %12, %14 : i64 | |
%44 = llvm.add %43, %43 : i64 | |
%45 = llvm.add %44, %12 : i64 | |
%46 = llvm.getelementptr %42[%45] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32> | |
%47 = llvm.bitcast %46 : !llvm.ptr<i32> to !llvm.ptr<vector<4xi32>> | |
llvm.store %11, %47 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>> | |
%48 = llvm.mul %37, %7 : i64 | |
%49 = llvm.add %48, %12 : i64 | |
%50 = llvm.mul %38, %15 : i64 | |
%51 = llvm.add %49, %50 : i64 | |
%52 = llvm.mul %37, %8 : i64 | |
%53 = llvm.add %52, %12 : i64 | |
%54 = llvm.mul %38, %6 : i64 | |
%55 = llvm.add %53, %54 : i64 | |
%56 = llvm.mul %10, %15 : i64 | |
%57 = llvm.add %55, %56 : i64 | |
llvm.br ^bb1(%12 : i64) | |
^bb1(%58: i64): // 2 preds: ^bb0, ^bb13 | |
%59 = llvm.icmp "slt" %58, %16 : i64 | |
llvm.cond_br %59, ^bb2(%12 : i64), ^bb14 | |
^bb2(%60: i64): // 2 preds: ^bb1, ^bb12 | |
%61 = llvm.icmp "slt" %60, %16 : i64 | |
llvm.cond_br %61, ^bb3, ^bb13 | |
^bb3: // pred: ^bb2 | |
%62 = llvm.mul %58, %1 : i64 | |
%63 = llvm.add %57, %62 : i64 | |
%64 = llvm.mul %60, %2 : i64 | |
%65 = llvm.add %63, %64 : i64 | |
%66 = llvm.mul %10, %4 : i64 | |
%67 = llvm.add %65, %66 : i64 | |
%68 = llvm.mul %58, %3 : i64 | |
%69 = llvm.add %51, %68 : i64 | |
%70 = llvm.mul %60, %4 : i64 | |
%71 = llvm.add %69, %70 : i64 | |
llvm.br ^bb4(%12 : i64) | |
^bb4(%72: i64): // 2 preds: ^bb3, ^bb11 | |
%73 = llvm.icmp "slt" %72, %14 : i64 | |
llvm.cond_br %73, ^bb5, ^bb12 | |
^bb5: // pred: ^bb4 | |
%74 = llvm.mul %10, %9 : i64 | |
%75 = llvm.add %67, %74 : i64 | |
%76 = llvm.mul %72, %6 : i64 | |
%77 = llvm.add %75, %76 : i64 | |
%78 = llvm.add %77, %56 : i64 | |
%79 = llvm.mul %10, %14 : i64 | |
%80 = llvm.add %71, %79 : i64 | |
%81 = llvm.mul %72, %15 : i64 | |
%82 = llvm.add %80, %81 : i64 | |
%83 = llvm.alloca %41 x i32 {alignment = 128 : i64} : (i64) -> !llvm.ptr<i32> | |
llvm.br ^bb6(%12 : i64) | |
^bb6(%84: i64): // 2 preds: ^bb5, ^bb7 | |
%85 = llvm.icmp "slt" %84, %14 : i64 | |
llvm.cond_br %85, ^bb7, ^bb8 | |
^bb7: // pred: ^bb6 | |
%86 = llvm.add %44, %84 : i64 | |
%87 = llvm.getelementptr %42[%86] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32> | |
%88 = llvm.load %87 : !llvm.ptr<i32> | |
%89 = llvm.getelementptr %83[%86] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32> | |
llvm.store %88, %89 : !llvm.ptr<i32> | |
%90 = llvm.add %84, %15 : i64 | |
llvm.br ^bb6(%90 : i64) | |
^bb8: // pred: ^bb6 | |
%91 = llvm.bitcast %83 : !llvm.ptr<i32> to !llvm.ptr<vector<4xi32>> | |
%92 = llvm.load %91 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>> | |
llvm.br ^bb9(%12, %92 : i64, vector<4xi32>) | |
^bb9(%93: i64, %94: vector<4xi32>): // 2 preds: ^bb8, ^bb10 | |
%95 = llvm.icmp "slt" %93, %16 : i64 | |
llvm.cond_br %95, ^bb10, ^bb11 | |
^bb10: // pred: ^bb9 | |
%96 = llvm.mul %12, %6 : i64 | |
%97 = llvm.add %78, %96 : i64 | |
%98 = llvm.add %97, %96 : i64 | |
%99 = llvm.mul %93, %14 : i64 | |
%100 = llvm.add %98, %99 : i64 | |
%101 = llvm.add %100, %12 : i64 | |
%102 = llvm.getelementptr %20[%101] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32> | |
%103 = llvm.bitcast %102 : !llvm.ptr<i32> to !llvm.ptr<vector<4xi32>> | |
%104 = llvm.load %103 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>> | |
%105 = llvm.add %104, %94 : vector<4xi32> | |
%106 = llvm.add %93, %15 : i64 | |
llvm.br ^bb9(%106, %105 : i64, vector<4xi32>) | |
^bb11: // pred: ^bb9 | |
llvm.store %94, %91 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>> | |
%107 = llvm.load %91 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>> | |
%108 = llvm.getelementptr %28[%82] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32> | |
%109 = llvm.load %108 : !llvm.ptr<i32> | |
%110 = llvm.mlir.undef : vector<1xi32> | |
%111 = llvm.insertelement %109, %110[%0 : i32] : vector<1xi32> | |
%112 = llvm.extractelement %111[%12 : i64] : vector<1xi32> | |
%113 = "llvm.intr.vector.reduce.add"(%107) : (vector<4xi32>) -> i32 | |
%114 = llvm.add %112, %113 : i32 | |
%115 = llvm.insertelement %114, %13[%12 : i64] : vector<1xi32> | |
%116 = llvm.extractelement %115[%10 : i64] : vector<1xi32> | |
%117 = llvm.insertelement %116, %110[%0 : i32] : vector<1xi32> | |
%118 = llvm.extractelement %117[%12 : i64] : vector<1xi32> | |
llvm.store %118, %108 : !llvm.ptr<i32> | |
%119 = llvm.add %72, %15 : i64 | |
llvm.br ^bb4(%119 : i64) | |
^bb12: // pred: ^bb4 | |
%120 = llvm.add %60, %14 : i64 | |
llvm.br ^bb2(%120 : i64) | |
^bb13: // pred: ^bb2 | |
%121 = llvm.add %58, %15 : i64 | |
llvm.br ^bb1(%121 : i64) | |
^bb14: // pred: ^bb1 | |
llvm.return %0 : i32 | |
} | |
} | |
} | |
} | |
func.func private @_split_reduction_pass2() { | |
%c268435488_i32 = arith.constant 268435488 : i32 | |
%c8 = arith.constant 8 : index | |
%c16 = arith.constant 16 : index | |
%c-1_i32 = arith.constant -1 : i32 | |
%c-1_i64 = arith.constant -1 : i64 | |
%c1048576 = arith.constant 1048576 : index | |
%c0 = arith.constant 0 : index | |
%c524288 = arith.constant 524288 : index | |
%c67108864 = arith.constant 67108864 : index | |
%c1_i32 = arith.constant 1 : i32 | |
%c128_i32 = arith.constant 128 : i32 | |
%c512 = arith.constant 512 : index | |
%c256 = arith.constant 256 : index | |
%c1 = arith.constant 1 : index | |
%device = hal.ex.shared_device : !hal.device | |
%allocator = hal.device.allocator<%device : !hal.device> : !hal.allocator | |
%buffer = hal.allocator.allocate<%allocator : !hal.allocator> type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c67108864} | |
%cmd = hal.command_buffer.create device(%device : !hal.device) mode("OneShot|AllowInlineExecution") categories(Transfer) : !hal.command_buffer | |
hal.command_buffer.fill_buffer<%cmd : !hal.command_buffer> target(%buffer : !hal.buffer)[%c0, %c67108864] pattern(%c1_i32 : i32) | |
hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") | |
hal.command_buffer.finalize<%cmd : !hal.command_buffer> | |
%0 = util.null : !hal.fence | |
%fence = hal.fence.create device(%device : !hal.device) flags("None") : !hal.fence | |
hal.device.queue.execute<%device : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) commands([%cmd]) | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32 | |
util.status.check_ok %status, "failed to wait on timepoint" | |
%1 = util.do_not_optimize(%buffer) : !hal.buffer | |
%len = hal.buffer.length<%1 : !hal.buffer> : index | |
%buffer_0 = hal.allocator.allocate<%allocator : !hal.allocator> type("HostVisible|DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage|MappingScoped|MappingAccessRandom|Mapping") : !hal.buffer{%c1048576} | |
%cmd_1 = hal.command_buffer.create device(%device : !hal.device) mode("OneShot|AllowInlineExecution") categories("Transfer|Dispatch") : !hal.command_buffer | |
%_device_query_0 = util.global.load @_device_query_0 : i1 | |
cf.cond_br %_device_query_0, ^bb1, ^bb2 | |
^bb1: // pred: ^bb0 | |
%_pipeline_layout_0 = util.global.load @_pipeline_layout_0 : !hal.pipeline_layout | |
hal.command_buffer.push_descriptor_set<%cmd_1 : !hal.command_buffer> layout(%_pipeline_layout_0 : !hal.pipeline_layout)[%c0] bindings([ | |
%c0 = (%1 : !hal.buffer)[%c0, %len], | |
%c1 = (%buffer_0 : !hal.buffer)[%c0, %c1048576] | |
]) | |
%_executable__split_reduction_pass2_dispatch_0 = util.global.load @_executable__split_reduction_pass2_dispatch_0 : !hal.executable | |
hal.command_buffer.dispatch<%cmd_1 : !hal.command_buffer> target(%_executable__split_reduction_pass2_dispatch_0 : !hal.executable)[0] workgroups([%c8, %c16, %c1]) | |
hal.command_buffer.fill_buffer<%cmd_1 : !hal.command_buffer> target(%buffer_0 : !hal.buffer)[%c524288, %c524288] pattern(%c128_i32 : i32) | |
hal.command_buffer.execution_barrier<%cmd_1 : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") | |
hal.command_buffer.finalize<%cmd_1 : !hal.command_buffer> | |
%fence_2 = hal.fence.create device(%device : !hal.device) flags("None") : !hal.fence | |
hal.device.queue.execute<%device : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence_2) commands([%cmd_1]) | |
%status_3 = hal.fence.await until([%fence_2]) timeout_millis(%c-1_i32) : i32 | |
util.status.check_ok %status_3, "failed to wait on timepoint" | |
%view = hal.buffer_view.create buffer(%buffer_0 : !hal.buffer)[%c0, %c524288] shape([%c512, %c256]) type(%c268435488_i32) encoding(%c1_i32) : !hal.buffer_view | |
%view_4 = hal.buffer_view.create buffer(%buffer_0 : !hal.buffer)[%c524288, %c524288] shape([%c512, %c256]) type(%c268435488_i32) encoding(%c1_i32) : !hal.buffer_view | |
check.expect_eq(%view, %view_4) : !hal.buffer_view | |
return | |
^bb2: // pred: ^bb0 | |
util.unreachable "device not supported in the compiled configuration" | |
} | |
} | |
// -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) ('util.initializer' operation) //----- // | |
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}> | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]> | |
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert> | |
module attributes {hal.device.targets = [#device_target_llvm_cpu]} { | |
util.global private @_device_query_0 : i1 | |
util.global private @_device_query_0_ok : i1 | |
util.initializer { | |
%device = hal.ex.shared_device : !hal.device | |
%ok, %value = hal.device.query<%device : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false | |
util.global.store %value, @_device_query_0 : i1 | |
util.global.store %ok, @_device_query_0_ok : i1 | |
util.initializer.return | |
} | |
util.global private @_descriptor_set_layout_0 : !hal.descriptor_set_layout | |
util.initializer { | |
%device = hal.ex.shared_device : !hal.device | |
%descriptor_set_layout = hal.descriptor_set_layout.create device(%device : !hal.device) flags("None") bindings([#hal.descriptor_set.binding<0, storage_buffer, ReadOnly>, #hal.descriptor_set.binding<1, storage_buffer>]) : !hal.descriptor_set_layout | |
util.global.store %descriptor_set_layout, @_descriptor_set_layout_0 : !hal.descriptor_set_layout | |
util.initializer.return | |
} | |
util.global private @_pipeline_layout_0 : !hal.pipeline_layout | |
util.initializer { | |
%_descriptor_set_layout_0 = util.global.load @_descriptor_set_layout_0 : !hal.descriptor_set_layout | |
%device = hal.ex.shared_device : !hal.device | |
%pipeline_layout = hal.pipeline_layout.create device(%device : !hal.device) push_constants(0) layouts([%_descriptor_set_layout_0]) : !hal.pipeline_layout | |
util.global.store %pipeline_layout, @_pipeline_layout_0 : !hal.pipeline_layout | |
util.initializer.return | |
} | |
util.global private @_executable__split_reduction_pass2_dispatch_0 : !hal.executable | |
util.initializer { | |
%device = hal.ex.shared_device : !hal.device | |
%_device_query_0 = util.global.load @_device_query_0 : i1 | |
cf.cond_br %_device_query_0, ^bb1, ^bb2 | |
^bb1: // pred: ^bb0 | |
%_pipeline_layout_0 = util.global.load @_pipeline_layout_0 : !hal.pipeline_layout | |
%exe = hal.executable.create device(%device : !hal.device) target(@_split_reduction_pass2_dispatch_0::@embedded_elf_x86_64) layouts([%_pipeline_layout_0]) : !hal.executable | |
cf.br ^bb3(%exe : !hal.executable) | |
^bb2: // pred: ^bb0 | |
%0 = util.null : !hal.executable | |
cf.br ^bb3(%0 : !hal.executable) | |
^bb3(%1: !hal.executable): // 2 preds: ^bb1, ^bb2 | |
util.global.store %1, @_executable__split_reduction_pass2_dispatch_0 : !hal.executable | |
util.initializer.return | |
} | |
func.func @split_reduction_pass2() attributes {iree.abi.stub} { | |
call @_split_reduction_pass2() : () -> () | |
return | |
} | |
hal.executable private @_split_reduction_pass2_dispatch_0 { | |
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ { | |
hal.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 ordinal(0) layout(#pipeline_layout) attributes {translation_info = #translation} { | |
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index): | |
%c1 = arith.constant 1 : index | |
%c16 = arith.constant 16 : index | |
%c8 = arith.constant 8 : index | |
hal.return %c8, %c16, %c1 : index, index, index | |
} | |
builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} { | |
llvm.func internal @_split_reduction_pass2_dispatch_0_generic_512x256x128(%arg0: !llvm.ptr<struct<"iree_hal_executable_environment_v0_t", (ptr<i32>, ptr<func<i32 (ptr<func<i32 (ptr<i8>)>>, ptr<i8>)>>, ptr<ptr<func<i32 (ptr<i8>)>>>, struct<"iree_hal_processor_v0_t", (array<8 x i64>)>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>> {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {sym_visibility = "private"} { | |
%0 = llvm.mlir.constant(0 : i32) : i32 | |
%1 = llvm.mlir.constant(32768 : i64) : i64 | |
%2 = llvm.mlir.constant(128 : i64) : i64 | |
%3 = llvm.mlir.constant(256 : i64) : i64 | |
%4 = llvm.mlir.constant(1 : i64) : i64 | |
%5 = llvm.mlir.constant(63 : index) : i64 | |
%6 = llvm.mlir.constant(128 : index) : i64 | |
%7 = llvm.mlir.constant(256 : index) : i64 | |
%8 = llvm.mlir.constant(32768 : index) : i64 | |
%9 = llvm.mlir.constant(512 : index) : i64 | |
%10 = llvm.mlir.constant(0 : i64) : i64 | |
%11 = llvm.mlir.constant(dense<0> : vector<4xi32>) : vector<4xi32> | |
%12 = llvm.mlir.constant(0 : index) : i64 | |
%13 = llvm.mlir.constant(dense<0> : vector<1xi32>) : vector<1xi32> | |
%14 = llvm.mlir.constant(4 : index) : i64 | |
%15 = llvm.mlir.constant(1 : index) : i64 | |
%16 = llvm.mlir.constant(32 : index) : i64 | |
%17 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> | |
%18 = llvm.extractvalue %17[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)> | |
%19 = llvm.load %18 : !llvm.ptr<ptr<i8>> | |
%20 = llvm.bitcast %19 : !llvm.ptr<i8> to !llvm.ptr<i32> | |
%21 = llvm.ptrtoint %20 : !llvm.ptr<i32> to i64 | |
%22 = llvm.and %21, %5 : i64 | |
%23 = llvm.icmp "eq" %22, %12 : i64 | |
"llvm.intr.assume"(%23) : (i1) -> () | |
%24 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> | |
%25 = llvm.extractvalue %24[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)> | |
%26 = llvm.getelementptr %25[1] : (!llvm.ptr<ptr<i8>>) -> !llvm.ptr<ptr<i8>> | |
%27 = llvm.load %26 : !llvm.ptr<ptr<i8>> | |
%28 = llvm.bitcast %27 : !llvm.ptr<i8> to !llvm.ptr<i32> | |
%29 = llvm.ptrtoint %28 : !llvm.ptr<i32> to i64 | |
%30 = llvm.and %29, %5 : i64 | |
%31 = llvm.icmp "eq" %30, %12 : i64 | |
"llvm.intr.assume"(%31) : (i1) -> () | |
%32 = llvm.load %arg2 : !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>> | |
%33 = llvm.extractvalue %32[0] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)> | |
%34 = llvm.zext %33 : i32 to i64 | |
%35 = llvm.extractvalue %32[1] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)> | |
%36 = llvm.zext %35 : i32 to i64 | |
%37 = llvm.mul %36, %16 : i64 | |
%38 = llvm.mul %34, %16 : i64 | |
%39 = llvm.mlir.null : !llvm.ptr<i32> | |
%40 = llvm.getelementptr %39[4] : (!llvm.ptr<i32>) -> !llvm.ptr<i32> | |
%41 = llvm.ptrtoint %40 : !llvm.ptr<i32> to i64 | |
%42 = llvm.alloca %41 x i32 {alignment = 128 : i64} : (i64) -> !llvm.ptr<i32> | |
%43 = llvm.mul %12, %14 : i64 | |
%44 = llvm.add %43, %43 : i64 | |
%45 = llvm.add %44, %12 : i64 | |
%46 = llvm.getelementptr %42[%45] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32> | |
%47 = llvm.bitcast %46 : !llvm.ptr<i32> to !llvm.ptr<vector<4xi32>> | |
llvm.store %11, %47 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>> | |
%48 = llvm.mul %37, %7 : i64 | |
%49 = llvm.add %48, %12 : i64 | |
%50 = llvm.mul %38, %15 : i64 | |
%51 = llvm.add %49, %50 : i64 | |
%52 = llvm.mul %37, %8 : i64 | |
%53 = llvm.add %52, %12 : i64 | |
%54 = llvm.mul %38, %6 : i64 | |
%55 = llvm.add %53, %54 : i64 | |
%56 = llvm.mul %10, %15 : i64 | |
%57 = llvm.add %55, %56 : i64 | |
llvm.br ^bb1(%12 : i64) | |
^bb1(%58: i64): // 2 preds: ^bb0, ^bb13 | |
%59 = llvm.icmp "slt" %58, %16 : i64 | |
llvm.cond_br %59, ^bb2(%12 : i64), ^bb14 | |
^bb2(%60: i64): // 2 preds: ^bb1, ^bb12 | |
%61 = llvm.icmp "slt" %60, %16 : i64 | |
llvm.cond_br %61, ^bb3, ^bb13 | |
^bb3: // pred: ^bb2 | |
%62 = llvm.mul %58, %1 : i64 | |
%63 = llvm.add %57, %62 : i64 | |
%64 = llvm.mul %60, %2 : i64 | |
%65 = llvm.add %63, %64 : i64 | |
%66 = llvm.mul %10, %4 : i64 | |
%67 = llvm.add %65, %66 : i64 | |
%68 = llvm.mul %58, %3 : i64 | |
%69 = llvm.add %51, %68 : i64 | |
%70 = llvm.mul %60, %4 : i64 | |
%71 = llvm.add %69, %70 : i64 | |
llvm.br ^bb4(%12 : i64) | |
^bb4(%72: i64): // 2 preds: ^bb3, ^bb11 | |
%73 = llvm.icmp "slt" %72, %14 : i64 | |
llvm.cond_br %73, ^bb5, ^bb12 | |
^bb5: // pred: ^bb4 | |
%74 = llvm.mul %10, %9 : i64 | |
%75 = llvm.add %67, %74 : i64 | |
%76 = llvm.mul %72, %6 : i64 | |
%77 = llvm.add %75, %76 : i64 | |
%78 = llvm.add %77, %56 : i64 | |
%79 = llvm.mul %10, %14 : i64 | |
%80 = llvm.add %71, %79 : i64 | |
%81 = llvm.mul %72, %15 : i64 | |
%82 = llvm.add %80, %81 : i64 | |
%83 = llvm.alloca %41 x i32 {alignment = 128 : i64} : (i64) -> !llvm.ptr<i32> | |
llvm.br ^bb6(%12 : i64) | |
^bb6(%84: i64): // 2 preds: ^bb5, ^bb7 | |
%85 = llvm.icmp "slt" %84, %14 : i64 | |
llvm.cond_br %85, ^bb7, ^bb8 | |
^bb7: // pred: ^bb6 | |
%86 = llvm.add %44, %84 : i64 | |
%87 = llvm.getelementptr %42[%86] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32> | |
%88 = llvm.load %87 : !llvm.ptr<i32> | |
%89 = llvm.getelementptr %83[%86] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32> | |
llvm.store %88, %89 : !llvm.ptr<i32> | |
%90 = llvm.add %84, %15 : i64 | |
llvm.br ^bb6(%90 : i64) | |
^bb8: // pred: ^bb6 | |
%91 = llvm.bitcast %83 : !llvm.ptr<i32> to !llvm.ptr<vector<4xi32>> | |
%92 = llvm.load %91 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>> | |
llvm.br ^bb9(%12, %92 : i64, vector<4xi32>) | |
^bb9(%93: i64, %94: vector<4xi32>): // 2 preds: ^bb8, ^bb10 | |
%95 = llvm.icmp "slt" %93, %16 : i64 | |
llvm.cond_br %95, ^bb10, ^bb11 | |
^bb10: // pred: ^bb9 | |
%96 = llvm.mul %12, %6 : i64 | |
%97 = llvm.add %78, %96 : i64 | |
%98 = llvm.add %97, %96 : i64 | |
%99 = llvm.mul %93, %14 : i64 | |
%100 = llvm.add %98, %99 : i64 | |
%101 = llvm.add %100, %12 : i64 | |
%102 = llvm.getelementptr %20[%101] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32> | |
%103 = llvm.bitcast %102 : !llvm.ptr<i32> to !llvm.ptr<vector<4xi32>> | |
%104 = llvm.load %103 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>> | |
%105 = llvm.add %104, %94 : vector<4xi32> | |
%106 = llvm.add %93, %15 : i64 | |
llvm.br ^bb9(%106, %105 : i64, vector<4xi32>) | |
^bb11: // pred: ^bb9 | |
llvm.store %94, %91 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>> | |
%107 = llvm.load %91 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>> | |
%108 = llvm.getelementptr %28[%82] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32> | |
%109 = llvm.load %108 : !llvm.ptr<i32> | |
%110 = llvm.mlir.undef : vector<1xi32> | |
%111 = llvm.insertelement %109, %110[%0 : i32] : vector<1xi32> | |
%112 = llvm.extractelement %111[%12 : i64] : vector<1xi32> | |
%113 = "llvm.intr.vector.reduce.add"(%107) : (vector<4xi32>) -> i32 | |
%114 = llvm.add %112, %113 : i32 | |
%115 = llvm.insertelement %114, %13[%12 : i64] : vector<1xi32> | |
%116 = llvm.extractelement %115[%10 : i64] : vector<1xi32> | |
%117 = llvm.insertelement %116, %110[%0 : i32] : vector<1xi32> | |
%118 = llvm.extractelement %117[%12 : i64] : vector<1xi32> | |
llvm.store %118, %108 : !llvm.ptr<i32> | |
%119 = llvm.add %72, %15 : i64 | |
llvm.br ^bb4(%119 : i64) | |
^bb12: // pred: ^bb4 | |
%120 = llvm.add %60, %14 : i64 | |
llvm.br ^bb2(%120 : i64) | |
^bb13: // pred: ^bb2 | |
%121 = llvm.add %58, %15 : i64 | |
llvm.br ^bb1(%121 : i64) | |
^bb14: // pred: ^bb1 | |
llvm.return %0 : i32 | |
} | |
} | |
} | |
} | |
func.func private @_split_reduction_pass2() { | |
%c268435488_i32 = arith.constant 268435488 : i32 | |
%c8 = arith.constant 8 : index | |
%c16 = arith.constant 16 : index | |
%c-1_i32 = arith.constant -1 : i32 | |
%c-1_i64 = arith.constant -1 : i64 | |
%c1048576 = arith.constant 1048576 : index | |
%c0 = arith.constant 0 : index | |
%c524288 = arith.constant 524288 : index | |
%c67108864 = arith.constant 67108864 : index | |
%c1_i32 = arith.constant 1 : i32 | |
%c128_i32 = arith.constant 128 : i32 | |
%c512 = arith.constant 512 : index | |
%c256 = arith.constant 256 : index | |
%c1 = arith.constant 1 : index | |
%device = hal.ex.shared_device : !hal.device | |
%allocator = hal.device.allocator<%device : !hal.device> : !hal.allocator | |
%buffer = hal.allocator.allocate<%allocator : !hal.allocator> type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c67108864} | |
%cmd = hal.command_buffer.create device(%device : !hal.device) mode("OneShot|AllowInlineExecution") categories(Transfer) : !hal.command_buffer | |
hal.command_buffer.fill_buffer<%cmd : !hal.command_buffer> target(%buffer : !hal.buffer)[%c0, %c67108864] pattern(%c1_i32 : i32) | |
hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") | |
hal.command_buffer.finalize<%cmd : !hal.command_buffer> | |
%0 = util.null : !hal.fence | |
%fence = hal.fence.create device(%device : !hal.device) flags("None") : !hal.fence | |
hal.device.queue.execute<%device : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) commands([%cmd]) | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32 | |
util.status.check_ok %status, "failed to wait on timepoint" | |
%1 = util.do_not_optimize(%buffer) : !hal.buffer | |
%len = hal.buffer.length<%1 : !hal.buffer> : index | |
%buffer_0 = hal.allocator.allocate<%allocator : !hal.allocator> type("HostVisible|DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage|MappingScoped|MappingAccessRandom|Mapping") : !hal.buffer{%c1048576} | |
%cmd_1 = hal.command_buffer.create device(%device : !hal.device) mode("OneShot|AllowInlineExecution") categories("Transfer|Dispatch") : !hal.command_buffer | |
%_device_query_0 = util.global.load @_device_query_0 : i1 | |
cf.cond_br %_device_query_0, ^bb1, ^bb2 | |
^bb1: // pred: ^bb0 | |
%_pipeline_layout_0 = util.global.load @_pipeline_layout_0 : !hal.pipeline_layout | |
hal.command_buffer.push_descriptor_set<%cmd_1 : !hal.command_buffer> layout(%_pipeline_layout_0 : !hal.pipeline_layout)[%c0] bindings([ | |
%c0 = (%1 : !hal.buffer)[%c0, %len], | |
%c1 = (%buffer_0 : !hal.buffer)[%c0, %c1048576] | |
]) | |
%_executable__split_reduction_pass2_dispatch_0 = util.global.load @_executable__split_reduction_pass2_dispatch_0 : !hal.executable | |
hal.command_buffer.dispatch<%cmd_1 : !hal.command_buffer> target(%_executable__split_reduction_pass2_dispatch_0 : !hal.executable)[0] workgroups([%c8, %c16, %c1]) | |
hal.command_buffer.fill_buffer<%cmd_1 : !hal.command_buffer> target(%buffer_0 : !hal.buffer)[%c524288, %c524288] pattern(%c128_i32 : i32) | |
hal.command_buffer.execution_barrier<%cmd_1 : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") | |
hal.command_buffer.finalize<%cmd_1 : !hal.command_buffer> | |
%fence_2 = hal.fence.create device(%device : !hal.device) flags("None") : !hal.fence | |
hal.device.queue.execute<%device : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence_2) commands([%cmd_1]) | |
%status_3 = hal.fence.await until([%fence_2]) timeout_millis(%c-1_i32) : i32 | |
util.status.check_ok %status_3, "failed to wait on timepoint" | |
%view = hal.buffer_view.create buffer(%buffer_0 : !hal.buffer)[%c0, %c524288] shape([%c512, %c256]) type(%c268435488_i32) encoding(%c1_i32) : !hal.buffer_view | |
%view_4 = hal.buffer_view.create buffer(%buffer_0 : !hal.buffer)[%c524288, %c524288] shape([%c512, %c256]) type(%c268435488_i32) encoding(%c1_i32) : !hal.buffer_view | |
check.expect_eq(%view, %view_4) : !hal.buffer_view | |
return | |
^bb2: // pred: ^bb0 | |
util.unreachable "device not supported in the compiled configuration" | |
} | |
} | |
// -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) ('util.initializer' operation) //----- // | |
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}> | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]> | |
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert> | |
module attributes {hal.device.targets = [#device_target_llvm_cpu]} { | |
util.global private @_device_query_0 : i1 | |
util.global private @_device_query_0_ok : i1 | |
util.initializer { | |
%device = hal.ex.shared_device : !hal.device | |
%ok, %value = hal.device.query<%device : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false | |
util.global.store %value, @_device_query_0 : i1 | |
util.global.store %ok, @_device_query_0_ok : i1 | |
util.initializer.return | |
} | |
util.global private @_descriptor_set_layout_0 : !hal.descriptor_set_layout | |
util.initializer { | |
%device = hal.ex.shared_device : !hal.device | |
%descriptor_set_layout = hal.descriptor_set_layout.create device(%device : !hal.device) flags("None") bindings([#hal.descriptor_set.binding<0, storage_buffer, ReadOnly>, #hal.descriptor_set.binding<1, storage_buffer>]) : !hal.descriptor_set_layout | |
util.global.store %descriptor_set_layout, @_descriptor_set_layout_0 : !hal.descriptor_set_layout | |
util.initializer.return | |
} | |
util.global private @_pipeline_layout_0 : !hal.pipeline_layout | |
util.initializer { | |
%_descriptor_set_layout_0 = util.global.load @_descriptor_set_layout_0 : !hal.descriptor_set_layout | |
%device = hal.ex.shared_device : !hal.device | |
%pipeline_layout = hal.pipeline_layout.create device(%device : !hal.device) push_constants(0) layouts([%_descriptor_set_layout_0]) : !hal.pipeline_layout | |
util.global.store %pipeline_layout, @_pipeline_layout_0 : !hal.pipeline_layout | |
util.initializer.return | |
} | |
util.global private @_executable__split_reduction_pass2_dispatch_0 : !hal.executable | |
util.initializer { | |
%_device_query_0 = util.global.load @_device_query_0 : i1 | |
%device = hal.ex.shared_device : !hal.device | |
cf.cond_br %_device_query_0, ^bb1, ^bb2 | |
^bb1: // pred: ^bb0 | |
%_pipeline_layout_0 = util.global.load @_pipeline_layout_0 : !hal.pipeline_layout | |
%exe = hal.executable.create device(%device : !hal.device) target(@_split_reduction_pass2_dispatch_0::@embedded_elf_x86_64) layouts([%_pipeline_layout_0]) : !hal.executable | |
cf.br ^bb3(%exe : !hal.executable) | |
^bb2: // pred: ^bb0 | |
%0 = util.null : !hal.executable | |
cf.br ^bb3(%0 : !hal.executable) | |
^bb3(%1: !hal.executable): // 2 preds: ^bb1, ^bb2 | |
util.global.store %1, @_executable__split_reduction_pass2_dispatch_0 : !hal.executable | |
util.initializer.return | |
} | |
func.func @split_reduction_pass2() attributes {iree.abi.stub} { | |
call @_split_reduction_pass2() : () -> () | |
return | |
} | |
hal.executable private @_split_reduction_pass2_dispatch_0 { | |
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ { | |
hal.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 ordinal(0) layout(#pipeline_layout) attributes {translation_info = #translation} { | |
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index): | |
%c1 = arith.constant 1 : index | |
%c16 = arith.constant 16 : index | |
%c8 = arith.constant 8 : index | |
hal.return %c8, %c16, %c1 : index, index, index | |
} | |
builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} { | |
llvm.func internal @_split_reduction_pass2_dispatch_0_generic_512x256x128(%arg0: !llvm.ptr<struct<"iree_hal_executable_environment_v0_t", (ptr<i32>, ptr<func<i32 (ptr<func<i32 (ptr<i8>)>>, ptr<i8>)>>, ptr<ptr<func<i32 (ptr<i8>)>>>, struct<"iree_hal_processor_v0_t", (array<8 x i64>)>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>> {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {sym_visibility = "private"} { | |
%0 = llvm.mlir.constant(0 : i32) : i32 | |
%1 = llvm.mlir.constant(32768 : i64) : i64 | |
%2 = llvm.mlir.constant(128 : i64) : i64 | |
%3 = llvm.mlir.constant(256 : i64) : i64 | |
%4 = llvm.mlir.constant(1 : i64) : i64 | |
%5 = llvm.mlir.constant(63 : index) : i64 | |
%6 = llvm.mlir.constant(128 : index) : i64 | |
%7 = llvm.mlir.constant(256 : index) : i64 | |
%8 = llvm.mlir.constant(32768 : index) : i64 | |
%9 = llvm.mlir.constant(512 : index) : i64 | |
%10 = llvm.mlir.constant(0 : i64) : i64 | |
%11 = llvm.mlir.constant(dense<0> : vector<4xi32>) : vector<4xi32> | |
%12 = llvm.mlir.constant(0 : index) : i64 | |
%13 = llvm.mlir.constant(dense<0> : vector<1xi32>) : vector<1xi32> | |
%14 = llvm.mlir.constant(4 : index) : i64 | |
%15 = llvm.mlir.constant(1 : index) : i64 | |
%16 = llvm.mlir.constant(32 : index) : i64 | |
%17 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> | |
%18 = llvm.extractvalue %17[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)> | |
%19 = llvm.load %18 : !llvm.ptr<ptr<i8>> | |
%20 = llvm.bitcast %19 : !llvm.ptr<i8> to !llvm.ptr<i32> | |
%21 = llvm.ptrtoint %20 : !llvm.ptr<i32> to i64 | |
%22 = llvm.and %21, %5 : i64 | |
%23 = llvm.icmp "eq" %22, %12 : i64 | |
"llvm.intr.assume"(%23) : (i1) -> () | |
%24 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> | |
%25 = llvm.extractvalue %24[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)> | |
%26 = llvm.getelementptr %25[1] : (!llvm.ptr<ptr<i8>>) -> !llvm.ptr<ptr<i8>> | |
%27 = llvm.load %26 : !llvm.ptr<ptr<i8>> | |
%28 = llvm.bitcast %27 : !llvm.ptr<i8> to !llvm.ptr<i32> | |
%29 = llvm.ptrtoint %28 : !llvm.ptr<i32> to i64 | |
%30 = llvm.and %29, %5 : i64 | |
%31 = llvm.icmp "eq" %30, %12 : i64 | |
"llvm.intr.assume"(%31) : (i1) -> () | |
%32 = llvm.load %arg2 : !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>> | |
%33 = llvm.extractvalue %32[0] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)> | |
%34 = llvm.zext %33 : i32 to i64 | |
%35 = llvm.extractvalue %32[1] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)> | |
%36 = llvm.zext %35 : i32 to i64 | |
%37 = llvm.mul %36, %16 : i64 | |
%38 = llvm.mul %34, %16 : i64 | |
%39 = llvm.mlir.null : !llvm.ptr<i32> | |
%40 = llvm.getelementptr %39[4] : (!llvm.ptr<i32>) -> !llvm.ptr<i32> | |
%41 = llvm.ptrtoint %40 : !llvm.ptr<i32> to i64 | |
%42 = llvm.alloca %41 x i32 {alignment = 128 : i64} : (i64) -> !llvm.ptr<i32> | |
%43 = llvm.mul %12, %14 : i64 | |
%44 = llvm.add %43, %43 : i64 | |
%45 = llvm.add %44, %12 : i64 | |
%46 = llvm.getelementptr %42[%45] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32> | |
%47 = llvm.bitcast %46 : !llvm.ptr<i32> to !llvm.ptr<vector<4xi32>> | |
llvm.store %11, %47 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>> | |
%48 = llvm.mul %37, %7 : i64 | |
%49 = llvm.add %48, %12 : i64 | |
%50 = llvm.mul %38, %15 : i64 | |
%51 = llvm.add %49, %50 : i64 | |
%52 = llvm.mul %37, %8 : i64 | |
%53 = llvm.add %52, %12 : i64 | |
%54 = llvm.mul %38, %6 : i64 | |
%55 = llvm.add %53, %54 : i64 | |
%56 = llvm.mul %10, %15 : i64 | |
%57 = llvm.add %55, %56 : i64 | |
llvm.br ^bb1(%12 : i64) | |
^bb1(%58: i64): // 2 preds: ^bb0, ^bb13 | |
%59 = llvm.icmp "slt" %58, %16 : i64 | |
llvm.cond_br %59, ^bb2(%12 : i64), ^bb14 | |
^bb2(%60: i64): // 2 preds: ^bb1, ^bb12 | |
%61 = llvm.icmp "slt" %60, %16 : i64 | |
llvm.cond_br %61, ^bb3, ^bb13 | |
^bb3: // pred: ^bb2 | |
%62 = llvm.mul %58, %1 : i64 | |
%63 = llvm.add %57, %62 : i64 | |
%64 = llvm.mul %60, %2 : i64 | |
%65 = llvm.add %63, %64 : i64 | |
%66 = llvm.mul %10, %4 : i64 | |
%67 = llvm.add %65, %66 : i64 | |
%68 = llvm.mul %58, %3 : i64 | |
%69 = llvm.add %51, %68 : i64 | |
%70 = llvm.mul %60, %4 : i64 | |
%71 = llvm.add %69, %70 : i64 | |
llvm.br ^bb4(%12 : i64) | |
^bb4(%72: i64): // 2 preds: ^bb3, ^bb11 | |
%73 = llvm.icmp "slt" %72, %14 : i64 | |
llvm.cond_br %73, ^bb5, ^bb12 | |
^bb5: // pred: ^bb4 | |
%74 = llvm.mul %10, %9 : i64 | |
%75 = llvm.add %67, %74 : i64 | |
%76 = llvm.mul %72, %6 : i64 | |
%77 = llvm.add %75, %76 : i64 | |
%78 = llvm.add %77, %56 : i64 | |
%79 = llvm.mul %10, %14 : i64 | |
%80 = llvm.add %71, %79 : i64 | |
%81 = llvm.mul %72, %15 : i64 | |
%82 = llvm.add %80, %81 : i64 | |
%83 = llvm.alloca %41 x i32 {alignment = 128 : i64} : (i64) -> !llvm.ptr<i32> | |
llvm.br ^bb6(%12 : i64) | |
^bb6(%84: i64): // 2 preds: ^bb5, ^bb7 | |
%85 = llvm.icmp "slt" %84, %14 : i64 | |
llvm.cond_br %85, ^bb7, ^bb8 | |
^bb7: // pred: ^bb6 | |
%86 = llvm.add %44, %84 : i64 | |
%87 = llvm.getelementptr %42[%86] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32> | |
%88 = llvm.load %87 : !llvm.ptr<i32> | |
%89 = llvm.getelementptr %83[%86] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32> | |
llvm.store %88, %89 : !llvm.ptr<i32> | |
%90 = llvm.add %84, %15 : i64 | |
llvm.br ^bb6(%90 : i64) | |
^bb8: // pred: ^bb6 | |
%91 = llvm.bitcast %83 : !llvm.ptr<i32> to !llvm.ptr<vector<4xi32>> | |
%92 = llvm.load %91 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>> | |
llvm.br ^bb9(%12, %92 : i64, vector<4xi32>) | |
^bb9(%93: i64, %94: vector<4xi32>): // 2 preds: ^bb8, ^bb10 | |
%95 = llvm.icmp "slt" %93, %16 : i64 | |
llvm.cond_br %95, ^bb10, ^bb11 | |
^bb10: // pred: ^bb9 | |
%96 = llvm.mul %12, %6 : i64 | |
%97 = llvm.add %78, %96 : i64 | |
%98 = llvm.add %97, %96 : i64 | |
%99 = llvm.mul %93, %14 : i64 | |
%100 = llvm.add %98, %99 : i64 | |
%101 = llvm.add %100, %12 : i64 | |
%102 = llvm.getelementptr %20[%101] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32> | |
%103 = llvm.bitcast %102 : !llvm.ptr<i32> to !llvm.ptr<vector<4xi32>> | |
%104 = llvm.load %103 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>> | |
%105 = llvm.add %104, %94 : vector<4xi32> | |
%106 = llvm.add %93, %15 : i64 | |
llvm.br ^bb9(%106, %105 : i64, vector<4xi32>) | |
^bb11: // pred: ^bb9 | |
llvm.store %94, %91 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>> | |
%107 = llvm.load %91 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>> | |
%108 = llvm.getelementptr %28[%82] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32> | |
%109 = llvm.load %108 : !llvm.ptr<i32> | |
%110 = llvm.mlir.undef : vector<1xi32> | |
%111 = llvm.insertelement %109, %110[%0 : i32] : vector<1xi32> | |
%112 = llvm.extractelement %111[%12 : i64] : vector<1xi32> | |
%113 = "llvm.intr.vector.reduce.add"(%107) : (vector<4xi32>) -> i32 | |
%114 = llvm.add %112, %113 : i32 | |
%115 = llvm.insertelement %114, %13[%12 : i64] : vector<1xi32> | |
%116 = llvm.extractelement %115[%10 : i64] : vector<1xi32> | |
%117 = llvm.insertelement %116, %110[%0 : i32] : vector<1xi32> | |
%118 = llvm.extractelement %117[%12 : i64] : vector<1xi32> | |
llvm.store %118, %108 : !llvm.ptr<i32> | |
%119 = llvm.add %72, %15 : i64 | |
llvm.br ^bb4(%119 : i64) | |
^bb12: // pred: ^bb4 | |
%120 = llvm.add %60, %14 : i64 | |
llvm.br ^bb2(%120 : i64) | |
^bb13: // pred: ^bb2 | |
%121 = llvm.add %58, %15 : i64 | |
llvm.br ^bb1(%121 : i64) | |
^bb14: // pred: ^bb1 | |
llvm.return %0 : i32 | |
} | |
} | |
} | |
} | |
func.func private @_split_reduction_pass2() { | |
%c268435488_i32 = arith.constant 268435488 : i32 | |
%c8 = arith.constant 8 : index | |
%c16 = arith.constant 16 : index | |
%c-1_i32 = arith.constant -1 : i32 | |
%c-1_i64 = arith.constant -1 : i64 | |
%c1048576 = arith.constant 1048576 : index | |
%c0 = arith.constant 0 : index | |
%c524288 = arith.constant 524288 : index | |
%c67108864 = arith.constant 67108864 : index | |
%c1_i32 = arith.constant 1 : i32 | |
%c128_i32 = arith.constant 128 : i32 | |
%c512 = arith.constant 512 : index | |
%c256 = arith.constant 256 : index | |
%c1 = arith.constant 1 : index | |
%device = hal.ex.shared_device : !hal.device | |
%allocator = hal.device.allocator<%device : !hal.device> : !hal.allocator | |
%buffer = hal.allocator.allocate<%allocator : !hal.allocator> type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c67108864} | |
%cmd = hal.command_buffer.create device(%device : !hal.device) mode("OneShot|AllowInlineExecution") categories(Transfer) : !hal.command_buffer | |
hal.command_buffer.fill_buffer<%cmd : !hal.command_buffer> target(%buffer : !hal.buffer)[%c0, %c67108864] pattern(%c1_i32 : i32) | |
hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") | |
hal.command_buffer.finalize<%cmd : !hal.command_buffer> | |
%0 = util.null : !hal.fence | |
%fence = hal.fence.create device(%device : !hal.device) flags("None") : !hal.fence | |
hal.device.queue.execute<%device : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) commands([%cmd]) | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32 | |
util.status.check_ok %status, "failed to wait on timepoint" | |
%1 = util.do_not_optimize(%buffer) : !hal.buffer | |
%len = hal.buffer.length<%1 : !hal.buffer> : index | |
%buffer_0 = hal.allocator.allocate<%allocator : !hal.allocator> type("HostVisible|DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage|MappingScoped|MappingAccessRandom|Mapping") : !hal.buffer{%c1048576} | |
%cmd_1 = hal.command_buffer.create device(%device : !hal.device) mode("OneShot|AllowInlineExecution") categories("Transfer|Dispatch") : !hal.command_buffer | |
%_device_query_0 = util.global.load @_device_query_0 : i1 | |
cf.cond_br %_device_query_0, ^bb1, ^bb2 | |
^bb1: // pred: ^bb0 | |
%_pipeline_layout_0 = util.global.load @_pipeline_layout_0 : !hal.pipeline_layout | |
hal.command_buffer.push_descriptor_set<%cmd_1 : !hal.command_buffer> layout(%_pipeline_layout_0 : !hal.pipeline_layout)[%c0] bindings([ | |
%c0 = (%1 : !hal.buffer)[%c0, %len], | |
%c1 = (%buffer_0 : !hal.buffer)[%c0, %c1048576] | |
]) | |
%_executable__split_reduction_pass2_dispatch_0 = util.global.load @_executable__split_reduction_pass2_dispatch_0 : !hal.executable | |
hal.command_buffer.dispatch<%cmd_1 : !hal.command_buffer> target(%_executable__split_reduction_pass2_dispatch_0 : !hal.executable)[0] workgroups([%c8, %c16, %c1]) | |
hal.command_buffer.fill_buffer<%cmd_1 : !hal.command_buffer> target(%buffer_0 : !hal.buffer)[%c524288, %c524288] pattern(%c128_i32 : i32) | |
hal.command_buffer.execution_barrier<%cmd_1 : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") | |
hal.command_buffer.finalize<%cmd_1 : !hal.command_buffer> | |
%fence_2 = hal.fence.create device(%device : !hal.device) flags("None") : !hal.fence | |
hal.device.queue.execute<%device : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence_2) commands([%cmd_1]) | |
%status_3 = hal.fence.await until([%fence_2]) timeout_millis(%c-1_i32) : i32 | |
util.status.check_ok %status_3, "failed to wait on timepoint" | |
%view = hal.buffer_view.create buffer(%buffer_0 : !hal.buffer)[%c0, %c524288] shape([%c512, %c256]) type(%c268435488_i32) encoding(%c1_i32) : !hal.buffer_view | |
%view_4 = hal.buffer_view.create buffer(%buffer_0 : !hal.buffer)[%c524288, %c524288] shape([%c512, %c256]) type(%c268435488_i32) encoding(%c1_i32) : !hal.buffer_view | |
check.expect_eq(%view, %view_4) : !hal.buffer_view | |
return | |
^bb2: // pred: ^bb0 | |
util.unreachable "device not supported in the compiled configuration" | |
} | |
} | |
// -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) ('func.func' operation: @_split_reduction_pass2) //----- // | |
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}> | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]> | |
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert> | |
module attributes {hal.device.targets = [#device_target_llvm_cpu]} { | |
util.global private @_device_query_0 : i1 | |
util.global private @_device_query_0_ok : i1 | |
util.initializer { | |
%device = hal.ex.shared_device : !hal.device | |
%ok, %value = hal.device.query<%device : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false | |
util.global.store %value, @_device_query_0 : i1 | |
util.global.store %ok, @_device_query_0_ok : i1 | |
util.initializer.return | |
} | |
util.global private @_descriptor_set_layout_0 : !hal.descriptor_set_layout | |
util.initializer { | |
%device = hal.ex.shared_device : !hal.device | |
%descriptor_set_layout = hal.descriptor_set_layout.create device(%device : !hal.device) flags("None") bindings([#hal.descriptor_set.binding<0, storage_buffer, ReadOnly>, #hal.descriptor_set.binding<1, storage_buffer>]) : !hal.descriptor_set_layout | |
util.global.store %descriptor_set_layout, @_descriptor_set_layout_0 : !hal.descriptor_set_layout | |
util.initializer.return | |
} | |
util.global private @_pipeline_layout_0 : !hal.pipeline_layout | |
util.initializer { | |
%_descriptor_set_layout_0 = util.global.load @_descriptor_set_layout_0 : !hal.descriptor_set_layout | |
%device = hal.ex.shared_device : !hal.device | |
%pipeline_layout = hal.pipeline_layout.create device(%device : !hal.device) push_constants(0) layouts([%_descriptor_set_layout_0]) : !hal.pipeline_layout | |
util.global.store %pipeline_layout, @_pipeline_layout_0 : !hal.pipeline_layout | |
util.initializer.return | |
} | |
util.global private @_executable__split_reduction_pass2_dispatch_0 : !hal.executable | |
util.initializer { | |
%_device_query_0 = util.global.load @_device_query_0 : i1 | |
%device = hal.ex.shared_device : !hal.device | |
cf.cond_br %_device_query_0, ^bb1, ^bb2 | |
^bb1: // pred: ^bb0 | |
%_pipeline_layout_0 = util.global.load @_pipeline_layout_0 : !hal.pipeline_layout | |
%exe = hal.executable.create device(%device : !hal.device) target(@_split_reduction_pass2_dispatch_0::@embedded_elf_x86_64) layouts([%_pipeline_layout_0]) : !hal.executable | |
cf.br ^bb3(%exe : !hal.executable) | |
^bb2: // pred: ^bb0 | |
%0 = util.null : !hal.executable | |
cf.br ^bb3(%0 : !hal.executable) | |
^bb3(%1: !hal.executable): // 2 preds: ^bb1, ^bb2 | |
util.global.store %1, @_executable__split_reduction_pass2_dispatch_0 : !hal.executable | |
util.initializer.return | |
} | |
func.func @split_reduction_pass2() attributes {iree.abi.stub} { | |
call @_split_reduction_pass2() : () -> () | |
return | |
} | |
hal.executable private @_split_reduction_pass2_dispatch_0 { | |
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ { | |
hal.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 ordinal(0) layout(#pipeline_layout) attributes {translation_info = #translation} { | |
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index): | |
%c1 = arith.constant 1 : index | |
%c16 = arith.constant 16 : index | |
%c8 = arith.constant 8 : index | |
hal.return %c8, %c16, %c1 : index, index, index | |
} | |
builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} { | |
llvm.func internal @_split_reduction_pass2_dispatch_0_generic_512x256x128(%arg0: !llvm.ptr<struct<"iree_hal_executable_environment_v0_t", (ptr<i32>, ptr<func<i32 (ptr<func<i32 (ptr<i8>)>>, ptr<i8>)>>, ptr<ptr<func<i32 (ptr<i8>)>>>, struct<"iree_hal_processor_v0_t", (array<8 x i64>)>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>> {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {sym_visibility = "private"} { | |
%0 = llvm.mlir.constant(0 : i32) : i32 | |
%1 = llvm.mlir.constant(32768 : i64) : i64 | |
%2 = llvm.mlir.constant(128 : i64) : i64 | |
%3 = llvm.mlir.constant(256 : i64) : i64 | |
%4 = llvm.mlir.constant(1 : i64) : i64 | |
%5 = llvm.mlir.constant(63 : index) : i64 | |
%6 = llvm.mlir.constant(128 : index) : i64 | |
%7 = llvm.mlir.constant(256 : index) : i64 | |
%8 = llvm.mlir.constant(32768 : index) : i64 | |
%9 = llvm.mlir.constant(512 : index) : i64 | |
%10 = llvm.mlir.constant(0 : i64) : i64 | |
%11 = llvm.mlir.constant(dense<0> : vector<4xi32>) : vector<4xi32> | |
%12 = llvm.mlir.constant(0 : index) : i64 | |
%13 = llvm.mlir.constant(dense<0> : vector<1xi32>) : vector<1xi32> | |
%14 = llvm.mlir.constant(4 : index) : i64 | |
%15 = llvm.mlir.constant(1 : index) : i64 | |
%16 = llvm.mlir.constant(32 : index) : i64 | |
%17 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> | |
%18 = llvm.extractvalue %17[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)> | |
%19 = llvm.load %18 : !llvm.ptr<ptr<i8>> | |
%20 = llvm.bitcast %19 : !llvm.ptr<i8> to !llvm.ptr<i32> | |
%21 = llvm.ptrtoint %20 : !llvm.ptr<i32> to i64 | |
%22 = llvm.and %21, %5 : i64 | |
%23 = llvm.icmp "eq" %22, %12 : i64 | |
"llvm.intr.assume"(%23) : (i1) -> () | |
%24 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> | |
%25 = llvm.extractvalue %24[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)> | |
%26 = llvm.getelementptr %25[1] : (!llvm.ptr<ptr<i8>>) -> !llvm.ptr<ptr<i8>> | |
%27 = llvm.load %26 : !llvm.ptr<ptr<i8>> | |
%28 = llvm.bitcast %27 : !llvm.ptr<i8> to !llvm.ptr<i32> | |
%29 = llvm.ptrtoint %28 : !llvm.ptr<i32> to i64 | |
%30 = llvm.and %29, %5 : i64 | |
%31 = llvm.icmp "eq" %30, %12 : i64 | |
"llvm.intr.assume"(%31) : (i1) -> () | |
%32 = llvm.load %arg2 : !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>> | |
%33 = llvm.extractvalue %32[0] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)> | |
%34 = llvm.zext %33 : i32 to i64 | |
%35 = llvm.extractvalue %32[1] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)> | |
%36 = llvm.zext %35 : i32 to i64 | |
%37 = llvm.mul %36, %16 : i64 | |
%38 = llvm.mul %34, %16 : i64 | |
%39 = llvm.mlir.null : !llvm.ptr<i32> | |
%40 = llvm.getelementptr %39[4] : (!llvm.ptr<i32>) -> !llvm.ptr<i32> | |
%41 = llvm.ptrtoint %40 : !llvm.ptr<i32> to i64 | |
%42 = llvm.alloca %41 x i32 {alignment = 128 : i64} : (i64) -> !llvm.ptr<i32> | |
%43 = llvm.mul %12, %14 : i64 | |
%44 = llvm.add %43, %43 : i64 | |
%45 = llvm.add %44, %12 : i64 | |
%46 = llvm.getelementptr %42[%45] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32> | |
%47 = llvm.bitcast %46 : !llvm.ptr<i32> to !llvm.ptr<vector<4xi32>> | |
llvm.store %11, %47 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>> | |
%48 = llvm.mul %37, %7 : i64 | |
%49 = llvm.add %48, %12 : i64 | |
%50 = llvm.mul %38, %15 : i64 | |
%51 = llvm.add %49, %50 : i64 | |
%52 = llvm.mul %37, %8 : i64 | |
%53 = llvm.add %52, %12 : i64 | |
%54 = llvm.mul %38, %6 : i64 | |
%55 = llvm.add %53, %54 : i64 | |
%56 = llvm.mul %10, %15 : i64 | |
%57 = llvm.add %55, %56 : i64 | |
llvm.br ^bb1(%12 : i64) | |
^bb1(%58: i64): // 2 preds: ^bb0, ^bb13 | |
%59 = llvm.icmp "slt" %58, %16 : i64 | |
llvm.cond_br %59, ^bb2(%12 : i64), ^bb14 | |
^bb2(%60: i64): // 2 preds: ^bb1, ^bb12 | |
%61 = llvm.icmp "slt" %60, %16 : i64 | |
llvm.cond_br %61, ^bb3, ^bb13 | |
^bb3: // pred: ^bb2 | |
%62 = llvm.mul %58, %1 : i64 | |
%63 = llvm.add %57, %62 : i64 | |
%64 = llvm.mul %60, %2 : i64 | |
%65 = llvm.add %63, %64 : i64 | |
%66 = llvm.mul %10, %4 : i64 | |
%67 = llvm.add %65, %66 : i64 | |
%68 = llvm.mul %58, %3 : i64 | |
%69 = llvm.add %51, %68 : i64 | |
%70 = llvm.mul %60, %4 : i64 | |
%71 = llvm.add %69, %70 : i64 | |
llvm.br ^bb4(%12 : i64) | |
^bb4(%72: i64): // 2 preds: ^bb3, ^bb11 | |
%73 = llvm.icmp "slt" %72, %14 : i64 | |
llvm.cond_br %73, ^bb5, ^bb12 | |
^bb5: // pred: ^bb4 | |
%74 = llvm.mul %10, %9 : i64 | |
%75 = llvm.add %67, %74 : i64 | |
%76 = llvm.mul %72, %6 : i64 | |
%77 = llvm.add %75, %76 : i64 | |
%78 = llvm.add %77, %56 : i64 | |
%79 = llvm.mul %10, %14 : i64 | |
%80 = llvm.add %71, %79 : i64 | |
%81 = llvm.mul %72, %15 : i64 | |
%82 = llvm.add %80, %81 : i64 | |
%83 = llvm.alloca %41 x i32 {alignment = 128 : i64} : (i64) -> !llvm.ptr<i32> | |
llvm.br ^bb6(%12 : i64) | |
^bb6(%84: i64): // 2 preds: ^bb5, ^bb7 | |
%85 = llvm.icmp "slt" %84, %14 : i64 | |
llvm.cond_br %85, ^bb7, ^bb8 | |
^bb7: // pred: ^bb6 | |
%86 = llvm.add %44, %84 : i64 | |
%87 = llvm.getelementptr %42[%86] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32> | |
%88 = llvm.load %87 : !llvm.ptr<i32> | |
%89 = llvm.getelementptr %83[%86] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32> | |
llvm.store %88, %89 : !llvm.ptr<i32> | |
%90 = llvm.add %84, %15 : i64 | |
llvm.br ^bb6(%90 : i64) | |
^bb8: // pred: ^bb6 | |
%91 = llvm.bitcast %83 : !llvm.ptr<i32> to !llvm.ptr<vector<4xi32>> | |
%92 = llvm.load %91 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>> | |
llvm.br ^bb9(%12, %92 : i64, vector<4xi32>) | |
^bb9(%93: i64, %94: vector<4xi32>): // 2 preds: ^bb8, ^bb10 | |
%95 = llvm.icmp "slt" %93, %16 : i64 | |
llvm.cond_br %95, ^bb10, ^bb11 | |
^bb10: // pred: ^bb9 | |
%96 = llvm.mul %12, %6 : i64 | |
%97 = llvm.add %78, %96 : i64 | |
%98 = llvm.add %97, %96 : i64 | |
%99 = llvm.mul %93, %14 : i64 | |
%100 = llvm.add %98, %99 : i64 | |
%101 = llvm.add %100, %12 : i64 | |
%102 = llvm.getelementptr %20[%101] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32> | |
%103 = llvm.bitcast %102 : !llvm.ptr<i32> to !llvm.ptr<vector<4xi32>> | |
%104 = llvm.load %103 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>> | |
%105 = llvm.add %104, %94 : vector<4xi32> | |
%106 = llvm.add %93, %15 : i64 | |
llvm.br ^bb9(%106, %105 : i64, vector<4xi32>) | |
^bb11: // pred: ^bb9 | |
llvm.store %94, %91 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>> | |
%107 = llvm.load %91 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>> | |
%108 = llvm.getelementptr %28[%82] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32> | |
%109 = llvm.load %108 : !llvm.ptr<i32> | |
%110 = llvm.mlir.undef : vector<1xi32> | |
%111 = llvm.insertelement %109, %110[%0 : i32] : vector<1xi32> | |
%112 = llvm.extractelement %111[%12 : i64] : vector<1xi32> | |
%113 = "llvm.intr.vector.reduce.add"(%107) : (vector<4xi32>) -> i32 | |
%114 = llvm.add %112, %113 : i32 | |
%115 = llvm.insertelement %114, %13[%12 : i64] : vector<1xi32> | |
%116 = llvm.extractelement %115[%10 : i64] : vector<1xi32> | |
%117 = llvm.insertelement %116, %110[%0 : i32] : vector<1xi32> | |
%118 = llvm.extractelement %117[%12 : i64] : vector<1xi32> | |
llvm.store %118, %108 : !llvm.ptr<i32> | |
%119 = llvm.add %72, %15 : i64 | |
llvm.br ^bb4(%119 : i64) | |
^bb12: // pred: ^bb4 | |
%120 = llvm.add %60, %14 : i64 | |
llvm.br ^bb2(%120 : i64) | |
^bb13: // pred: ^bb2 | |
%121 = llvm.add %58, %15 : i64 | |
llvm.br ^bb1(%121 : i64) | |
^bb14: // pred: ^bb1 | |
llvm.return %0 : i32 | |
} | |
} | |
} | |
} | |
func.func private @_split_reduction_pass2() { | |
%_device_query_0 = util.global.load @_device_query_0 : i1 | |
%_pipeline_layout_0 = util.global.load @_pipeline_layout_0 : !hal.pipeline_layout | |
%_executable__split_reduction_pass2_dispatch_0 = util.global.load @_executable__split_reduction_pass2_dispatch_0 : !hal.executable | |
%c268435488_i32 = arith.constant 268435488 : i32 | |
%c8 = arith.constant 8 : index | |
%c16 = arith.constant 16 : index | |
%c-1_i32 = arith.constant -1 : i32 | |
%c-1_i64 = arith.constant -1 : i64 | |
%c1048576 = arith.constant 1048576 : index | |
%c0 = arith.constant 0 : index | |
%c524288 = arith.constant 524288 : index | |
%c67108864 = arith.constant 67108864 : index | |
%c1_i32 = arith.constant 1 : i32 | |
%c128_i32 = arith.constant 128 : i32 | |
%c512 = arith.constant 512 : index | |
%c256 = arith.constant 256 : index | |
%c1 = arith.constant 1 : index | |
%device = hal.ex.shared_device : !hal.device | |
%allocator = hal.device.allocator<%device : !hal.device> : !hal.allocator | |
%buffer = hal.allocator.allocate<%allocator : !hal.allocator> type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c67108864} | |
%cmd = hal.command_buffer.create device(%device : !hal.device) mode("OneShot|AllowInlineExecution") categories(Transfer) : !hal.command_buffer | |
hal.command_buffer.fill_buffer<%cmd : !hal.command_buffer> target(%buffer : !hal.buffer)[%c0, %c67108864] pattern(%c1_i32 : i32) | |
hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") | |
hal.command_buffer.finalize<%cmd : !hal.command_buffer> | |
%0 = util.null : !hal.fence | |
%fence = hal.fence.create device(%device : !hal.device) flags("None") : !hal.fence | |
hal.device.queue.execute<%device : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) commands([%cmd]) | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32 | |
util.status.check_ok %status, "failed to wait on timepoint" | |
%1 = util.do_not_optimize(%buffer) : !hal.buffer | |
%len = hal.buffer.length<%1 : !hal.buffer> : index | |
%buffer_0 = hal.allocator.allocate<%allocator : !hal.allocator> type("HostVisible|DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage|MappingScoped|MappingAccessRandom|Mapping") : !hal.buffer{%c1048576} | |
%cmd_1 = hal.command_buffer.create device(%device : !hal.device) mode("OneShot|AllowInlineExecution") categories("Transfer|Dispatch") : !hal.command_buffer | |
cf.cond_br %_device_query_0, ^bb1, ^bb2 | |
^bb1: // pred: ^bb0 | |
hal.command_buffer.push_descriptor_set<%cmd_1 : !hal.command_buffer> layout(%_pipeline_layout_0 : !hal.pipeline_layout)[%c0] bindings([ | |
%c0 = (%1 : !hal.buffer)[%c0, %len], | |
%c1 = (%buffer_0 : !hal.buffer)[%c0, %c1048576] | |
]) | |
hal.command_buffer.dispatch<%cmd_1 : !hal.command_buffer> target(%_executable__split_reduction_pass2_dispatch_0 : !hal.executable)[0] workgroups([%c8, %c16, %c1]) | |
hal.command_buffer.fill_buffer<%cmd_1 : !hal.command_buffer> target(%buffer_0 : !hal.buffer)[%c524288, %c524288] pattern(%c128_i32 : i32) | |
hal.command_buffer.execution_barrier<%cmd_1 : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") | |
hal.command_buffer.finalize<%cmd_1 : !hal.command_buffer> | |
%fence_2 = hal.fence.create device(%device : !hal.device) flags("None") : !hal.fence | |
hal.device.queue.execute<%device : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence_2) commands([%cmd_1]) | |
%status_3 = hal.fence.await until([%fence_2]) timeout_millis(%c-1_i32) : i32 | |
util.status.check_ok %status_3, "failed to wait on timepoint" | |
%view = hal.buffer_view.create buffer(%buffer_0 : !hal.buffer)[%c0, %c524288] shape([%c512, %c256]) type(%c268435488_i32) encoding(%c1_i32) : !hal.buffer_view | |
%view_4 = hal.buffer_view.create buffer(%buffer_0 : !hal.buffer)[%c524288, %c524288] shape([%c512, %c256]) type(%c268435488_i32) encoding(%c1_i32) : !hal.buffer_view | |
check.expect_eq(%view, %view_4) : !hal.buffer_view | |
return | |
^bb2: // pred: ^bb0 | |
util.unreachable "device not supported in the compiled configuration" | |
} | |
} | |
// -----// IR Dump After ApplyPatterns (iree-util-apply-patterns) ('builtin.module' operation) //----- // | |
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}> | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]> | |
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert> | |
module attributes {hal.device.targets = [#device_target_llvm_cpu]} { | |
util.global private @_device_query_0 : i1 | |
util.global private @_device_query_0_ok : i1 | |
util.initializer { | |
%device = hal.ex.shared_device : !hal.device | |
%ok, %value = hal.device.query<%device : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false | |
util.global.store %value, @_device_query_0 : i1 | |
util.global.store %ok, @_device_query_0_ok : i1 | |
util.initializer.return | |
} | |
util.global private @_descriptor_set_layout_0 : !hal.descriptor_set_layout | |
util.initializer { | |
%device = hal.ex.shared_device : !hal.device | |
%descriptor_set_layout = hal.descriptor_set_layout.create device(%device : !hal.device) flags("None") bindings([#hal.descriptor_set.binding<0, storage_buffer, ReadOnly>, #hal.descriptor_set.binding<1, storage_buffer>]) : !hal.descriptor_set_layout | |
util.global.store %descriptor_set_layout, @_descriptor_set_layout_0 : !hal.descriptor_set_layout | |
util.initializer.return | |
} | |
util.global private @_pipeline_layout_0 : !hal.pipeline_layout | |
util.initializer { | |
%_descriptor_set_layout_0 = util.global.load @_descriptor_set_layout_0 : !hal.descriptor_set_layout | |
%device = hal.ex.shared_device : !hal.device | |
%pipeline_layout = hal.pipeline_layout.create device(%device : !hal.device) push_constants(0) layouts([%_descriptor_set_layout_0]) : !hal.pipeline_layout | |
util.global.store %pipeline_layout, @_pipeline_layout_0 : !hal.pipeline_layout | |
util.initializer.return | |
} | |
util.global private @_executable__split_reduction_pass2_dispatch_0 : !hal.executable | |
util.initializer { | |
%_device_query_0 = util.global.load @_device_query_0 : i1 | |
%device = hal.ex.shared_device : !hal.device | |
cf.cond_br %_device_query_0, ^bb1, ^bb2 | |
^bb1: // pred: ^bb0 | |
%_pipeline_layout_0 = util.global.load @_pipeline_layout_0 : !hal.pipeline_layout | |
%exe = hal.executable.create device(%device : !hal.device) target(@_split_reduction_pass2_dispatch_0::@embedded_elf_x86_64) layouts([%_pipeline_layout_0]) : !hal.executable | |
cf.br ^bb3(%exe : !hal.executable) | |
^bb2: // pred: ^bb0 | |
%0 = util.null : !hal.executable | |
cf.br ^bb3(%0 : !hal.executable) | |
^bb3(%1: !hal.executable): // 2 preds: ^bb1, ^bb2 | |
util.global.store %1, @_executable__split_reduction_pass2_dispatch_0 : !hal.executable | |
util.initializer.return | |
} | |
func.func @split_reduction_pass2() attributes {iree.abi.stub} { | |
call @_split_reduction_pass2() : () -> () | |
return | |
} | |
hal.executable private @_split_reduction_pass2_dispatch_0 { | |
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ { | |
hal.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 ordinal(0) layout(#pipeline_layout) attributes {translation_info = #translation} { | |
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index): | |
%c1 = arith.constant 1 : index | |
%c16 = arith.constant 16 : index | |
%c8 = arith.constant 8 : index | |
hal.return %c8, %c16, %c1 : index, index, index | |
} | |
builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} { | |
llvm.func internal @_split_reduction_pass2_dispatch_0_generic_512x256x128(%arg0: !llvm.ptr<struct<"iree_hal_executable_environment_v0_t", (ptr<i32>, ptr<func<i32 (ptr<func<i32 (ptr<i8>)>>, ptr<i8>)>>, ptr<ptr<func<i32 (ptr<i8>)>>>, struct<"iree_hal_processor_v0_t", (array<8 x i64>)>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>> {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {sym_visibility = "private"} { | |
%0 = llvm.mlir.constant(0 : i32) : i32 | |
%1 = llvm.mlir.constant(32768 : i64) : i64 | |
%2 = llvm.mlir.constant(128 : i64) : i64 | |
%3 = llvm.mlir.constant(256 : i64) : i64 | |
%4 = llvm.mlir.constant(1 : i64) : i64 | |
%5 = llvm.mlir.constant(63 : index) : i64 | |
%6 = llvm.mlir.constant(128 : index) : i64 | |
%7 = llvm.mlir.constant(256 : index) : i64 | |
%8 = llvm.mlir.constant(32768 : index) : i64 | |
%9 = llvm.mlir.constant(512 : index) : i64 | |
%10 = llvm.mlir.constant(0 : i64) : i64 | |
%11 = llvm.mlir.constant(dense<0> : vector<4xi32>) : vector<4xi32> | |
%12 = llvm.mlir.constant(0 : index) : i64 | |
%13 = llvm.mlir.constant(dense<0> : vector<1xi32>) : vector<1xi32> | |
%14 = llvm.mlir.constant(4 : index) : i64 | |
%15 = llvm.mlir.constant(1 : index) : i64 | |
%16 = llvm.mlir.constant(32 : index) : i64 | |
%17 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> | |
%18 = llvm.extractvalue %17[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)> | |
%19 = llvm.load %18 : !llvm.ptr<ptr<i8>> | |
%20 = llvm.bitcast %19 : !llvm.ptr<i8> to !llvm.ptr<i32> | |
%21 = llvm.ptrtoint %20 : !llvm.ptr<i32> to i64 | |
%22 = llvm.and %21, %5 : i64 | |
%23 = llvm.icmp "eq" %22, %12 : i64 | |
"llvm.intr.assume"(%23) : (i1) -> () | |
%24 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> | |
%25 = llvm.extractvalue %24[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)> | |
%26 = llvm.getelementptr %25[1] : (!llvm.ptr<ptr<i8>>) -> !llvm.ptr<ptr<i8>> | |
%27 = llvm.load %26 : !llvm.ptr<ptr<i8>> | |
%28 = llvm.bitcast %27 : !llvm.ptr<i8> to !llvm.ptr<i32> | |
%29 = llvm.ptrtoint %28 : !llvm.ptr<i32> to i64 | |
%30 = llvm.and %29, %5 : i64 | |
%31 = llvm.icmp "eq" %30, %12 : i64 | |
"llvm.intr.assume"(%31) : (i1) -> () | |
%32 = llvm.load %arg2 : !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>> | |
%33 = llvm.extractvalue %32[0] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)> | |
%34 = llvm.zext %33 : i32 to i64 | |
%35 = llvm.extractvalue %32[1] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)> | |
%36 = llvm.zext %35 : i32 to i64 | |
%37 = llvm.mul %36, %16 : i64 | |
%38 = llvm.mul %34, %16 : i64 | |
%39 = llvm.mlir.null : !llvm.ptr<i32> | |
%40 = llvm.getelementptr %39[4] : (!llvm.ptr<i32>) -> !llvm.ptr<i32> | |
%41 = llvm.ptrtoint %40 : !llvm.ptr<i32> to i64 | |
%42 = llvm.alloca %41 x i32 {alignment = 128 : i64} : (i64) -> !llvm.ptr<i32> | |
%43 = llvm.mul %12, %14 : i64 | |
%44 = llvm.add %43, %43 : i64 | |
%45 = llvm.add %44, %12 : i64 | |
%46 = llvm.getelementptr %42[%45] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32> | |
%47 = llvm.bitcast %46 : !llvm.ptr<i32> to !llvm.ptr<vector<4xi32>> | |
llvm.store %11, %47 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>> | |
%48 = llvm.mul %37, %7 : i64 | |
%49 = llvm.add %48, %12 : i64 | |
%50 = llvm.mul %38, %15 : i64 | |
%51 = llvm.add %49, %50 : i64 | |
%52 = llvm.mul %37, %8 : i64 | |
%53 = llvm.add %52, %12 : i64 | |
%54 = llvm.mul %38, %6 : i64 | |
%55 = llvm.add %53, %54 : i64 | |
%56 = llvm.mul %10, %15 : i64 | |
%57 = llvm.add %55, %56 : i64 | |
llvm.br ^bb1(%12 : i64) | |
^bb1(%58: i64): // 2 preds: ^bb0, ^bb13 | |
%59 = llvm.icmp "slt" %58, %16 : i64 | |
llvm.cond_br %59, ^bb2(%12 : i64), ^bb14 | |
^bb2(%60: i64): // 2 preds: ^bb1, ^bb12 | |
%61 = llvm.icmp "slt" %60, %16 : i64 | |
llvm.cond_br %61, ^bb3, ^bb13 | |
^bb3: // pred: ^bb2 | |
%62 = llvm.mul %58, %1 : i64 | |
%63 = llvm.add %57, %62 : i64 | |
%64 = llvm.mul %60, %2 : i64 | |
%65 = llvm.add %63, %64 : i64 | |
%66 = llvm.mul %10, %4 : i64 | |
%67 = llvm.add %65, %66 : i64 | |
%68 = llvm.mul %58, %3 : i64 | |
%69 = llvm.add %51, %68 : i64 | |
%70 = llvm.mul %60, %4 : i64 | |
%71 = llvm.add %69, %70 : i64 | |
llvm.br ^bb4(%12 : i64) | |
^bb4(%72: i64): // 2 preds: ^bb3, ^bb11 | |
%73 = llvm.icmp "slt" %72, %14 : i64 | |
llvm.cond_br %73, ^bb5, ^bb12 | |
^bb5: // pred: ^bb4 | |
%74 = llvm.mul %10, %9 : i64 | |
%75 = llvm.add %67, %74 : i64 | |
%76 = llvm.mul %72, %6 : i64 | |
%77 = llvm.add %75, %76 : i64 | |
%78 = llvm.add %77, %56 : i64 | |
%79 = llvm.mul %10, %14 : i64 | |
%80 = llvm.add %71, %79 : i64 | |
%81 = llvm.mul %72, %15 : i64 | |
%82 = llvm.add %80, %81 : i64 | |
%83 = llvm.alloca %41 x i32 {alignment = 128 : i64} : (i64) -> !llvm.ptr<i32> | |
llvm.br ^bb6(%12 : i64) | |
^bb6(%84: i64): // 2 preds: ^bb5, ^bb7 | |
%85 = llvm.icmp "slt" %84, %14 : i64 | |
llvm.cond_br %85, ^bb7, ^bb8 | |
^bb7: // pred: ^bb6 | |
%86 = llvm.add %44, %84 : i64 | |
%87 = llvm.getelementptr %42[%86] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32> | |
%88 = llvm.load %87 : !llvm.ptr<i32> | |
%89 = llvm.getelementptr %83[%86] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32> | |
llvm.store %88, %89 : !llvm.ptr<i32> | |
%90 = llvm.add %84, %15 : i64 | |
llvm.br ^bb6(%90 : i64) | |
^bb8: // pred: ^bb6 | |
%91 = llvm.bitcast %83 : !llvm.ptr<i32> to !llvm.ptr<vector<4xi32>> | |
%92 = llvm.load %91 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>> | |
llvm.br ^bb9(%12, %92 : i64, vector<4xi32>) | |
^bb9(%93: i64, %94: vector<4xi32>): // 2 preds: ^bb8, ^bb10 | |
%95 = llvm.icmp "slt" %93, %16 : i64 | |
llvm.cond_br %95, ^bb10, ^bb11 | |
^bb10: // pred: ^bb9 | |
%96 = llvm.mul %12, %6 : i64 | |
%97 = llvm.add %78, %96 : i64 | |
%98 = llvm.add %97, %96 : i64 | |
%99 = llvm.mul %93, %14 : i64 | |
%100 = llvm.add %98, %99 : i64 | |
%101 = llvm.add %100, %12 : i64 | |
%102 = llvm.getelementptr %20[%101] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32> | |
%103 = llvm.bitcast %102 : !llvm.ptr<i32> to !llvm.ptr<vector<4xi32>> | |
%104 = llvm.load %103 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>> | |
%105 = llvm.add %104, %94 : vector<4xi32> | |
%106 = llvm.add %93, %15 : i64 | |
llvm.br ^bb9(%106, %105 : i64, vector<4xi32>) | |
^bb11: // pred: ^bb9 | |
llvm.store %94, %91 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>> | |
%107 = llvm.load %91 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>> | |
%108 = llvm.getelementptr %28[%82] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32> | |
%109 = llvm.load %108 : !llvm.ptr<i32> | |
%110 = llvm.mlir.undef : vector<1xi32> | |
%111 = llvm.insertelement %109, %110[%0 : i32] : vector<1xi32> | |
%112 = llvm.extractelement %111[%12 : i64] : vector<1xi32> | |
%113 = "llvm.intr.vector.reduce.add"(%107) : (vector<4xi32>) -> i32 | |
%114 = llvm.add %112, %113 : i32 | |
%115 = llvm.insertelement %114, %13[%12 : i64] : vector<1xi32> | |
%116 = llvm.extractelement %115[%10 : i64] : vector<1xi32> | |
%117 = llvm.insertelement %116, %110[%0 : i32] : vector<1xi32> | |
%118 = llvm.extractelement %117[%12 : i64] : vector<1xi32> | |
llvm.store %118, %108 : !llvm.ptr<i32> | |
%119 = llvm.add %72, %15 : i64 | |
llvm.br ^bb4(%119 : i64) | |
^bb12: // pred: ^bb4 | |
%120 = llvm.add %60, %14 : i64 | |
llvm.br ^bb2(%120 : i64) | |
^bb13: // pred: ^bb2 | |
%121 = llvm.add %58, %15 : i64 | |
llvm.br ^bb1(%121 : i64) | |
^bb14: // pred: ^bb1 | |
llvm.return %0 : i32 | |
} | |
} | |
} | |
} | |
func.func private @_split_reduction_pass2() { | |
%c1 = arith.constant 1 : index | |
%c256 = arith.constant 256 : index | |
%c512 = arith.constant 512 : index | |
%c128_i32 = arith.constant 128 : i32 | |
%c1_i32 = arith.constant 1 : i32 | |
%c67108864 = arith.constant 67108864 : index | |
%c524288 = arith.constant 524288 : index | |
%c0 = arith.constant 0 : index | |
%c1048576 = arith.constant 1048576 : index | |
%c-1_i64 = arith.constant -1 : i64 | |
%c-1_i32 = arith.constant -1 : i32 | |
%c16 = arith.constant 16 : index | |
%c8 = arith.constant 8 : index | |
%c268435488_i32 = arith.constant 268435488 : i32 | |
%_device_query_0 = util.global.load @_device_query_0 : i1 | |
%_pipeline_layout_0 = util.global.load @_pipeline_layout_0 : !hal.pipeline_layout | |
%_executable__split_reduction_pass2_dispatch_0 = util.global.load @_executable__split_reduction_pass2_dispatch_0 : !hal.executable | |
%device = hal.ex.shared_device : !hal.device | |
%allocator = hal.device.allocator<%device : !hal.device> : !hal.allocator | |
%buffer = hal.allocator.allocate<%allocator : !hal.allocator> type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c67108864} | |
%cmd = hal.command_buffer.create device(%device : !hal.device) mode("OneShot|AllowInlineExecution") categories(Transfer) : !hal.command_buffer | |
hal.command_buffer.fill_buffer<%cmd : !hal.command_buffer> target(%buffer : !hal.buffer)[%c0, %c67108864] pattern(%c1_i32 : i32) | |
hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") | |
hal.command_buffer.finalize<%cmd : !hal.command_buffer> | |
%0 = util.null : !hal.fence | |
%fence = hal.fence.create device(%device : !hal.device) flags("None") : !hal.fence | |
hal.device.queue.execute<%device : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) commands([%cmd]) | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32 | |
util.status.check_ok %status, "failed to wait on timepoint" | |
%1 = util.do_not_optimize(%buffer) : !hal.buffer | |
%len = hal.buffer.length<%1 : !hal.buffer> : index | |
%buffer_0 = hal.allocator.allocate<%allocator : !hal.allocator> type("HostVisible|DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage|MappingScoped|MappingAccessRandom|Mapping") : !hal.buffer{%c1048576} | |
%cmd_1 = hal.command_buffer.create device(%device : !hal.device) mode("OneShot|AllowInlineExecution") categories("Transfer|Dispatch") : !hal.command_buffer | |
cf.cond_br %_device_query_0, ^bb1, ^bb2 | |
^bb1: // pred: ^bb0 | |
hal.command_buffer.push_descriptor_set<%cmd_1 : !hal.command_buffer> layout(%_pipeline_layout_0 : !hal.pipeline_layout)[%c0] bindings([ | |
%c0 = (%1 : !hal.buffer)[%c0, %len], | |
%c1 = (%buffer_0 : !hal.buffer)[%c0, %c1048576] | |
]) | |
hal.command_buffer.dispatch<%cmd_1 : !hal.command_buffer> target(%_executable__split_reduction_pass2_dispatch_0 : !hal.executable)[0] workgroups([%c8, %c16, %c1]) | |
hal.command_buffer.fill_buffer<%cmd_1 : !hal.command_buffer> target(%buffer_0 : !hal.buffer)[%c524288, %c524288] pattern(%c128_i32 : i32) | |
hal.command_buffer.execution_barrier<%cmd_1 : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") | |
hal.command_buffer.finalize<%cmd_1 : !hal.command_buffer> | |
%fence_2 = hal.fence.create device(%device : !hal.device) flags("None") : !hal.fence | |
hal.device.queue.execute<%device : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence_2) commands([%cmd_1]) | |
%status_3 = hal.fence.await until([%fence_2]) timeout_millis(%c-1_i32) : i32 | |
util.status.check_ok %status_3, "failed to wait on timepoint" | |
%view = hal.buffer_view.create buffer(%buffer_0 : !hal.buffer)[%c0, %c524288] shape([%c512, %c256]) type(%c268435488_i32) encoding(%c1_i32) : !hal.buffer_view | |
%view_4 = hal.buffer_view.create buffer(%buffer_0 : !hal.buffer)[%c524288, %c524288] shape([%c512, %c256]) type(%c268435488_i32) encoding(%c1_i32) : !hal.buffer_view | |
check.expect_eq(%view, %view_4) : !hal.buffer_view | |
return | |
^bb2: // pred: ^bb0 | |
util.unreachable "device not supported in the compiled configuration" | |
} | |
} | |
// -----// IR Dump After FoldGlobals (iree-util-fold-globals) ('builtin.module' operation) //----- // | |
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}> | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]> | |
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert> | |
module attributes {hal.device.targets = [#device_target_llvm_cpu]} { | |
// Module-level flag holding the result of the executable-format query below.
util.global private @_device_query_0 : i1 | |
// Initializer: asks the shared device about key "embedded-elf-x86_64" in the
// "hal.executable.format" category and stores the queried value in
// @_device_query_0. The %ok result is not used in this region.
util.initializer { | |
%device = hal.ex.shared_device : !hal.device | |
// The trailing `= false` is presumably the default used when the query is not
// answered -- confirm against the hal.device.query op documentation.
%ok, %value = hal.device.query<%device : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false | |
util.global.store %value, @_device_query_0 : i1 | |
util.initializer.return | |
} | |
// Module-level handle to the descriptor set layout created by the initializer below.
util.global private @_descriptor_set_layout_0 : !hal.descriptor_set_layout | |
// Initializer: creates a descriptor set layout on the shared device with two
// storage-buffer bindings (binding 0 marked ReadOnly, binding 1 unmarked) and
// stores it in @_descriptor_set_layout_0.
util.initializer { | |
%device = hal.ex.shared_device : !hal.device | |
%descriptor_set_layout = hal.descriptor_set_layout.create device(%device : !hal.device) flags("None") bindings([#hal.descriptor_set.binding<0, storage_buffer, ReadOnly>, #hal.descriptor_set.binding<1, storage_buffer>]) : !hal.descriptor_set_layout | |
util.global.store %descriptor_set_layout, @_descriptor_set_layout_0 : !hal.descriptor_set_layout | |
util.initializer.return | |
} | |
// Module-level handle to the pipeline layout created by the initializer below.
util.global private @_pipeline_layout_0 : !hal.pipeline_layout | |
// Initializer: loads @_descriptor_set_layout_0 and builds a pipeline layout with
// zero push constants and that single set layout, then stores the result in
// @_pipeline_layout_0. It reads a global written by another initializer, so it
// relies on that initializer having run first.
util.initializer { | |
%_descriptor_set_layout_0 = util.global.load @_descriptor_set_layout_0 : !hal.descriptor_set_layout | |
%device = hal.ex.shared_device : !hal.device | |
%pipeline_layout = hal.pipeline_layout.create device(%device : !hal.device) push_constants(0) layouts([%_descriptor_set_layout_0]) : !hal.pipeline_layout | |
util.global.store %pipeline_layout, @_pipeline_layout_0 : !hal.pipeline_layout | |
util.initializer.return | |
} | |
// Module-level handle to the loaded dispatch executable (null when the device
// query reported the format as unsupported).
util.global private @_executable__split_reduction_pass2_dispatch_0 : !hal.executable | |
// Initializer: branches on @_device_query_0. When true, creates the executable
// from the @embedded_elf_x86_64 variant using @_pipeline_layout_0; otherwise
// produces a null executable. Either value is stored in the global above.
util.initializer { | |
%_device_query_0 = util.global.load @_device_query_0 : i1 | |
%device = hal.ex.shared_device : !hal.device | |
cf.cond_br %_device_query_0, ^bb1, ^bb2 | |
// Supported path: create the executable for the embedded ELF variant.
^bb1: // pred: ^bb0 | |
%_pipeline_layout_0 = util.global.load @_pipeline_layout_0 : !hal.pipeline_layout | |
%exe = hal.executable.create device(%device : !hal.device) target(@_split_reduction_pass2_dispatch_0::@embedded_elf_x86_64) layouts([%_pipeline_layout_0]) : !hal.executable | |
cf.br ^bb3(%exe : !hal.executable) | |
// Unsupported path: fall through with a null handle.
^bb2: // pred: ^bb0 | |
%0 = util.null : !hal.executable | |
cf.br ^bb3(%0 : !hal.executable) | |
// Join: %1 is either the created executable or null.
^bb3(%1: !hal.executable): // 2 preds: ^bb1, ^bb2 | |
util.global.store %1, @_executable__split_reduction_pass2_dispatch_0 : !hal.executable | |
util.initializer.return | |
} | |
// Public entry point (tagged iree.abi.stub): takes no arguments, forwards to the
// private @_split_reduction_pass2 implementation, and returns nothing.
func.func @split_reduction_pass2() attributes {iree.abi.stub} { | |
call @_split_reduction_pass2() : () -> () | |
return | |
} | |
// Executable containing the single dispatch entry point used by the test,
// provided as one variant (@embedded_elf_x86_64) already lowered to the LLVM dialect.
hal.executable private @_split_reduction_pass2_dispatch_0 { | |
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ { | |
// Export for ordinal 0. The region returns the constant triple (8, 16, 1);
// its block arguments are not used.
hal.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 ordinal(0) layout(#pipeline_layout) attributes {translation_info = #translation} { | |
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index): | |
%c1 = arith.constant 1 : index | |
%c16 = arith.constant 16 : index | |
%c8 = arith.constant 8 : index | |
hal.return %c8, %c16, %c1 : index, index, index | |
} | |
builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} { | |
// Dispatch body. For every element of a 32x32 tile it sums 128 consecutive i32
// values read through pointer %20 (32 loads of vector<4xi32> plus a horizontal
// add) and adds that sum to the i32 already stored at the matching position
// behind pointer %28. Always returns 0 (%0).
// %arg0 (environment) is not read; %arg1 is the dispatch state; %arg2 is the
// workgroup state.
llvm.func internal @_split_reduction_pass2_dispatch_0_generic_512x256x128(%arg0: !llvm.ptr<struct<"iree_hal_executable_environment_v0_t", (ptr<i32>, ptr<func<i32 (ptr<func<i32 (ptr<i8>)>>, ptr<i8>)>>, ptr<ptr<func<i32 (ptr<i8>)>>>, struct<"iree_hal_processor_v0_t", (array<8 x i64>)>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>> {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {sym_visibility = "private"} { | |
%0 = llvm.mlir.constant(0 : i32) : i32 | |
%1 = llvm.mlir.constant(32768 : i64) : i64 | |
%2 = llvm.mlir.constant(128 : i64) : i64 | |
%3 = llvm.mlir.constant(256 : i64) : i64 | |
%4 = llvm.mlir.constant(1 : i64) : i64 | |
%5 = llvm.mlir.constant(63 : index) : i64 | |
%6 = llvm.mlir.constant(128 : index) : i64 | |
%7 = llvm.mlir.constant(256 : index) : i64 | |
%8 = llvm.mlir.constant(32768 : index) : i64 | |
%9 = llvm.mlir.constant(512 : index) : i64 | |
%10 = llvm.mlir.constant(0 : i64) : i64 | |
%11 = llvm.mlir.constant(dense<0> : vector<4xi32>) : vector<4xi32> | |
%12 = llvm.mlir.constant(0 : index) : i64 | |
%13 = llvm.mlir.constant(dense<0> : vector<1xi32>) : vector<1xi32> | |
%14 = llvm.mlir.constant(4 : index) : i64 | |
%15 = llvm.mlir.constant(1 : index) : i64 | |
%16 = llvm.mlir.constant(32 : index) : i64 | |
// Entry 0 of the pointer table in dispatch-state field 10 (presumably the
// binding base pointers -- confirm against the HAL executable ABI), viewed as
// i32*. The assume states its low 6 bits are zero (64-byte aligned).
%17 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> | |
%18 = llvm.extractvalue %17[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)> | |
%19 = llvm.load %18 : !llvm.ptr<ptr<i8>> | |
%20 = llvm.bitcast %19 : !llvm.ptr<i8> to !llvm.ptr<i32> | |
%21 = llvm.ptrtoint %20 : !llvm.ptr<i32> to i64 | |
%22 = llvm.and %21, %5 : i64 | |
%23 = llvm.icmp "eq" %22, %12 : i64 | |
"llvm.intr.assume"(%23) : (i1) -> () | |
// Entry 1 of the same pointer table, with the same alignment assumption.
%24 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> | |
%25 = llvm.extractvalue %24[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)> | |
%26 = llvm.getelementptr %25[1] : (!llvm.ptr<ptr<i8>>) -> !llvm.ptr<ptr<i8>> | |
%27 = llvm.load %26 : !llvm.ptr<ptr<i8>> | |
%28 = llvm.bitcast %27 : !llvm.ptr<i8> to !llvm.ptr<i32> | |
%29 = llvm.ptrtoint %28 : !llvm.ptr<i32> to i64 | |
%30 = llvm.and %29, %5 : i64 | |
%31 = llvm.icmp "eq" %30, %12 : i64 | |
"llvm.intr.assume"(%31) : (i1) -> () | |
// Workgroup-state fields 0 and 1 (presumably the workgroup id along x and y --
// TODO confirm), zero-extended and scaled by 32 to give the tile origin:
// %38 = field0 * 32, %37 = field1 * 32.
%32 = llvm.load %arg2 : !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>> | |
%33 = llvm.extractvalue %32[0] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)> | |
%34 = llvm.zext %33 : i32 to i64 | |
%35 = llvm.extractvalue %32[1] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)> | |
%36 = llvm.zext %35 : i32 to i64 | |
%37 = llvm.mul %36, %16 : i64 | |
%38 = llvm.mul %34, %16 : i64 | |
// %41 is the address of element 4 of a null i32*, i.e. the byte size of four
// i32 values; it is then passed to alloca as the element count. The first four
// lanes of the scratch buffer %42 are zeroed with a vector store.
%39 = llvm.mlir.null : !llvm.ptr<i32> | |
%40 = llvm.getelementptr %39[4] : (!llvm.ptr<i32>) -> !llvm.ptr<i32> | |
%41 = llvm.ptrtoint %40 : !llvm.ptr<i32> to i64 | |
%42 = llvm.alloca %41 x i32 {alignment = 128 : i64} : (i64) -> !llvm.ptr<i32> | |
%43 = llvm.mul %12, %14 : i64 | |
%44 = llvm.add %43, %43 : i64 | |
%45 = llvm.add %44, %12 : i64 | |
%46 = llvm.getelementptr %42[%45] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32> | |
%47 = llvm.bitcast %46 : !llvm.ptr<i32> to !llvm.ptr<vector<4xi32>> | |
llvm.store %11, %47 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>> | |
// Tile-origin element offsets (terms multiplied by the zero constants vanish):
//   %51 = %37 * 256 + %38          -> later used to index %28
//   %57 = %37 * 32768 + %38 * 128  -> later used to index %20
%48 = llvm.mul %37, %7 : i64 | |
%49 = llvm.add %48, %12 : i64 | |
%50 = llvm.mul %38, %15 : i64 | |
%51 = llvm.add %49, %50 : i64 | |
%52 = llvm.mul %37, %8 : i64 | |
%53 = llvm.add %52, %12 : i64 | |
%54 = llvm.mul %38, %6 : i64 | |
%55 = llvm.add %53, %54 : i64 | |
%56 = llvm.mul %10, %15 : i64 | |
%57 = llvm.add %55, %56 : i64 | |
llvm.br ^bb1(%12 : i64) | |
// Loop 1: %58 runs over [0, 32) in steps of 1 (incremented in ^bb13).
^bb1(%58: i64): // 2 preds: ^bb0, ^bb13 | |
%59 = llvm.icmp "slt" %58, %16 : i64 | |
llvm.cond_br %59, ^bb2(%12 : i64), ^bb14 | |
// Loop 2: %60 runs over [0, 32) in steps of 4 (incremented in ^bb12).
^bb2(%60: i64): // 2 preds: ^bb1, ^bb12 | |
%61 = llvm.icmp "slt" %60, %16 : i64 | |
llvm.cond_br %61, ^bb3, ^bb13 | |
// Offsets for the current (%58, %60) pair:
//   %67 = %57 + %58 * 32768 + %60 * 128   (into %20)
//   %71 = %51 + %58 * 256 + %60           (into %28)
^bb3: // pred: ^bb2 | |
%62 = llvm.mul %58, %1 : i64 | |
%63 = llvm.add %57, %62 : i64 | |
%64 = llvm.mul %60, %2 : i64 | |
%65 = llvm.add %63, %64 : i64 | |
%66 = llvm.mul %10, %4 : i64 | |
%67 = llvm.add %65, %66 : i64 | |
%68 = llvm.mul %58, %3 : i64 | |
%69 = llvm.add %51, %68 : i64 | |
%70 = llvm.mul %60, %4 : i64 | |
%71 = llvm.add %69, %70 : i64 | |
llvm.br ^bb4(%12 : i64) | |
// Loop 3: %72 runs over [0, 4) in steps of 1 (incremented in ^bb11).
^bb4(%72: i64): // 2 preds: ^bb3, ^bb11 | |
%73 = llvm.icmp "slt" %72, %14 : i64 | |
llvm.cond_br %73, ^bb5, ^bb12 | |
// %78 = %67 + %72 * 128 is the first input element for this output element;
// %82 = %71 + %72 is the output element index.
// NOTE(review): the alloca below sits inside the three enclosing loops, so a
// fresh scratch buffer is requested on every iteration; nothing in this
// function releases it before the return.
^bb5: // pred: ^bb4 | |
%74 = llvm.mul %10, %9 : i64 | |
%75 = llvm.add %67, %74 : i64 | |
%76 = llvm.mul %72, %6 : i64 | |
%77 = llvm.add %75, %76 : i64 | |
%78 = llvm.add %77, %56 : i64 | |
%79 = llvm.mul %10, %14 : i64 | |
%80 = llvm.add %71, %79 : i64 | |
%81 = llvm.mul %72, %15 : i64 | |
%82 = llvm.add %80, %81 : i64 | |
%83 = llvm.alloca %41 x i32 {alignment = 128 : i64} : (i64) -> !llvm.ptr<i32> | |
llvm.br ^bb6(%12 : i64) | |
// Loop 4: copy four i32 values (the zeros stored earlier) from %42 into %83.
^bb6(%84: i64): // 2 preds: ^bb5, ^bb7 | |
%85 = llvm.icmp "slt" %84, %14 : i64 | |
llvm.cond_br %85, ^bb7, ^bb8 | |
^bb7: // pred: ^bb6 | |
%86 = llvm.add %44, %84 : i64 | |
%87 = llvm.getelementptr %42[%86] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32> | |
%88 = llvm.load %87 : !llvm.ptr<i32> | |
%89 = llvm.getelementptr %83[%86] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32> | |
llvm.store %88, %89 : !llvm.ptr<i32> | |
%90 = llvm.add %84, %15 : i64 | |
llvm.br ^bb6(%90 : i64) | |
// Load the initial vector accumulator from the freshly copied scratch buffer.
^bb8: // pred: ^bb6 | |
%91 = llvm.bitcast %83 : !llvm.ptr<i32> to !llvm.ptr<vector<4xi32>> | |
%92 = llvm.load %91 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>> | |
llvm.br ^bb9(%12, %92 : i64, vector<4xi32>) | |
// Loop 5: %93 runs over [0, 32) in steps of 1, carrying the accumulator %94.
^bb9(%93: i64, %94: vector<4xi32>): // 2 preds: ^bb8, ^bb10 | |
%95 = llvm.icmp "slt" %93, %16 : i64 | |
llvm.cond_br %95, ^bb10, ^bb11 | |
// Load vector<4xi32> at %20[%78 + %93 * 4] and add it lane-wise to %94.
^bb10: // pred: ^bb9 | |
%96 = llvm.mul %12, %6 : i64 | |
%97 = llvm.add %78, %96 : i64 | |
%98 = llvm.add %97, %96 : i64 | |
%99 = llvm.mul %93, %14 : i64 | |
%100 = llvm.add %98, %99 : i64 | |
%101 = llvm.add %100, %12 : i64 | |
%102 = llvm.getelementptr %20[%101] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32> | |
%103 = llvm.bitcast %102 : !llvm.ptr<i32> to !llvm.ptr<vector<4xi32>> | |
%104 = llvm.load %103 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>> | |
%105 = llvm.add %104, %94 : vector<4xi32> | |
%106 = llvm.add %93, %15 : i64 | |
llvm.br ^bb9(%106, %105 : i64, vector<4xi32>) | |
// Spill the accumulator to %83 and reload it, reduce its four lanes to a
// scalar, add the scalar to the value currently at %28[%82], and store the sum
// back. The insertelement/extractelement chain on vector<1xi32> only
// round-trips the scalar.
// NOTE(review): this is a read-modify-write of %28[%82]; the stored result
// includes whatever that location held before the dispatch ran.
^bb11: // pred: ^bb9 | |
llvm.store %94, %91 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>> | |
%107 = llvm.load %91 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>> | |
%108 = llvm.getelementptr %28[%82] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32> | |
%109 = llvm.load %108 : !llvm.ptr<i32> | |
%110 = llvm.mlir.undef : vector<1xi32> | |
%111 = llvm.insertelement %109, %110[%0 : i32] : vector<1xi32> | |
%112 = llvm.extractelement %111[%12 : i64] : vector<1xi32> | |
%113 = "llvm.intr.vector.reduce.add"(%107) : (vector<4xi32>) -> i32 | |
%114 = llvm.add %112, %113 : i32 | |
%115 = llvm.insertelement %114, %13[%12 : i64] : vector<1xi32> | |
%116 = llvm.extractelement %115[%10 : i64] : vector<1xi32> | |
%117 = llvm.insertelement %116, %110[%0 : i32] : vector<1xi32> | |
%118 = llvm.extractelement %117[%12 : i64] : vector<1xi32> | |
llvm.store %118, %108 : !llvm.ptr<i32> | |
%119 = llvm.add %72, %15 : i64 | |
llvm.br ^bb4(%119 : i64) | |
// Latch of loop 2: advance %60 by 4.
^bb12: // pred: ^bb4 | |
%120 = llvm.add %60, %14 : i64 | |
llvm.br ^bb2(%120 : i64) | |
// Latch of loop 1: advance %58 by 1.
^bb13: // pred: ^bb2 | |
%121 = llvm.add %58, %15 : i64 | |
llvm.br ^bb1(%121 : i64) | |
// Exit: return 0.
^bb14: // pred: ^bb1 | |
llvm.return %0 : i32 | |
} | |
} | |
} | |
} | |
// Host-side test body. It:
//   1. allocates a 67108864-byte buffer and fills it with the i32 pattern 1,
//   2. allocates a 1048576-byte buffer, binds both buffers, and records a
//      dispatch of export ordinal 0 with workgroup count (8, 16, 1),
//   3. fills bytes [524288, 1048576) of the second buffer with the i32
//      pattern 128,
//   4. compares the two 524288-byte halves of the second buffer as 512x256
//      buffer views with check.expect_eq.
// If @_device_query_0 is false it reaches util.unreachable instead.
// NOTE(review): no command recorded here writes bytes [0, 524288) of %buffer_0
// before the dispatch, and the dispatch body in
// @_split_reduction_pass2_dispatch_0 loads from its second pointer before
// storing to it. If that pointer is binding 1 (presumably -- confirm), the
// comparison depends on the initial contents of the allocation.
func.func private @_split_reduction_pass2() { | |
%c1 = arith.constant 1 : index | |
%c256 = arith.constant 256 : index | |
%c512 = arith.constant 512 : index | |
%c128_i32 = arith.constant 128 : i32 | |
%c1_i32 = arith.constant 1 : i32 | |
%c67108864 = arith.constant 67108864 : index | |
%c524288 = arith.constant 524288 : index | |
%c0 = arith.constant 0 : index | |
%c1048576 = arith.constant 1048576 : index | |
%c-1_i64 = arith.constant -1 : i64 | |
%c-1_i32 = arith.constant -1 : i32 | |
%c16 = arith.constant 16 : index | |
%c8 = arith.constant 8 : index | |
// Passed as the buffer-view element type below; 268435488 is 0x10000020
// (presumably the encoding of a 32-bit integer element type -- TODO confirm).
%c268435488_i32 = arith.constant 268435488 : i32 | |
// Cached device resources produced by the module initializers.
%_device_query_0 = util.global.load @_device_query_0 : i1 | |
%_pipeline_layout_0 = util.global.load @_pipeline_layout_0 : !hal.pipeline_layout | |
%_executable__split_reduction_pass2_dispatch_0 = util.global.load @_executable__split_reduction_pass2_dispatch_0 : !hal.executable | |
%device = hal.ex.shared_device : !hal.device | |
%allocator = hal.device.allocator<%device : !hal.device> : !hal.allocator | |
// First submission: fill the whole 67108864-byte buffer with i32 1, then block
// on the fence until it completes.
%buffer = hal.allocator.allocate<%allocator : !hal.allocator> type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c67108864} | |
%cmd = hal.command_buffer.create device(%device : !hal.device) mode("OneShot|AllowInlineExecution") categories(Transfer) : !hal.command_buffer | |
hal.command_buffer.fill_buffer<%cmd : !hal.command_buffer> target(%buffer : !hal.buffer)[%c0, %c67108864] pattern(%c1_i32 : i32) | |
hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") | |
hal.command_buffer.finalize<%cmd : !hal.command_buffer> | |
// Null fence used as the wait operand of both queue submissions.
%0 = util.null : !hal.fence | |
%fence = hal.fence.create device(%device : !hal.device) flags("None") : !hal.fence | |
hal.device.queue.execute<%device : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) commands([%cmd]) | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32 | |
util.status.check_ok %status, "failed to wait on timepoint" | |
// The filled buffer is passed through util.do_not_optimize and its length is
// queried at runtime for the binding range.
%1 = util.do_not_optimize(%buffer) : !hal.buffer | |
%len = hal.buffer.length<%1 : !hal.buffer> : index | |
// Host-visible 1048576-byte buffer: the first half receives the dispatch
// output, the second half the expected pattern.
%buffer_0 = hal.allocator.allocate<%allocator : !hal.allocator> type("HostVisible|DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage|MappingScoped|MappingAccessRandom|Mapping") : !hal.buffer{%c1048576} | |
%cmd_1 = hal.command_buffer.create device(%device : !hal.device) mode("OneShot|AllowInlineExecution") categories("Transfer|Dispatch") : !hal.command_buffer | |
cf.cond_br %_device_query_0, ^bb1, ^bb2 | |
// Supported path: bind the buffers, dispatch, and record the expected-value fill.
^bb1: // pred: ^bb0 | |
hal.command_buffer.push_descriptor_set<%cmd_1 : !hal.command_buffer> layout(%_pipeline_layout_0 : !hal.pipeline_layout)[%c0] bindings([ | |
%c0 = (%1 : !hal.buffer)[%c0, %len], | |
%c1 = (%buffer_0 : !hal.buffer)[%c0, %c1048576] | |
]) | |
hal.command_buffer.dispatch<%cmd_1 : !hal.command_buffer> target(%_executable__split_reduction_pass2_dispatch_0 : !hal.executable)[0] workgroups([%c8, %c16, %c1]) | |
// Expected data: i32 128 across the second 524288 bytes of %buffer_0.
hal.command_buffer.fill_buffer<%cmd_1 : !hal.command_buffer> target(%buffer_0 : !hal.buffer)[%c524288, %c524288] pattern(%c128_i32 : i32) | |
hal.command_buffer.execution_barrier<%cmd_1 : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") | |
hal.command_buffer.finalize<%cmd_1 : !hal.command_buffer> | |
%fence_2 = hal.fence.create device(%device : !hal.device) flags("None") : !hal.fence | |
hal.device.queue.execute<%device : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence_2) commands([%cmd_1]) | |
%status_3 = hal.fence.await until([%fence_2]) timeout_millis(%c-1_i32) : i32 | |
util.status.check_ok %status_3, "failed to wait on timepoint" | |
// %view covers bytes [0, 524288) (dispatch output); %view_4 covers bytes
// [524288, 1048576) (expected values). Both are shaped 512x256.
%view = hal.buffer_view.create buffer(%buffer_0 : !hal.buffer)[%c0, %c524288] shape([%c512, %c256]) type(%c268435488_i32) encoding(%c1_i32) : !hal.buffer_view | |
%view_4 = hal.buffer_view.create buffer(%buffer_0 : !hal.buffer)[%c524288, %c524288] shape([%c512, %c256]) type(%c268435488_i32) encoding(%c1_i32) : !hal.buffer_view | |
check.expect_eq(%view, %view_4) : !hal.buffer_view | |
return | |
// Unsupported path: the device query reported the executable format as unavailable.
^bb2: // pred: ^bb0 | |
util.unreachable "device not supported in the compiled configuration" | |
} | |
} | |
// -----// IR Dump After CombineInitializers (iree-util-combine-initializers) ('builtin.module' operation) //----- // | |
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}> | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]> | |
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert> | |
module attributes {hal.device.targets = [#device_target_llvm_cpu]} { | |
// Module-level device resources, all written by the single initializer below.
util.global private @_device_query_0 : i1 | |
util.global private @_descriptor_set_layout_0 : !hal.descriptor_set_layout | |
util.global private @_pipeline_layout_0 : !hal.pipeline_layout | |
util.global private @_executable__split_reduction_pass2_dispatch_0 : !hal.executable | |
// Combined initializer. In order, it:
//   1. queries the device for the "embedded-elf-x86_64" executable format and
//      stores the value in @_device_query_0,
//   2. creates the two-binding descriptor set layout and stores it,
//   3. reloads that layout, creates the pipeline layout (0 push constants) and
//      stores it,
//   4. reloads the query flag and either creates the executable or produces
//      null, storing the result in @_executable__split_reduction_pass2_dispatch_0.
// Each step stores to a global and the next step loads it back; the shared
// device is also fetched again for every step (%device, %device_0, ...).
util.initializer { | |
%device = hal.ex.shared_device : !hal.device | |
%ok, %value = hal.device.query<%device : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false | |
util.global.store %value, @_device_query_0 : i1 | |
%device_0 = hal.ex.shared_device : !hal.device | |
%descriptor_set_layout = hal.descriptor_set_layout.create device(%device_0 : !hal.device) flags("None") bindings([#hal.descriptor_set.binding<0, storage_buffer, ReadOnly>, #hal.descriptor_set.binding<1, storage_buffer>]) : !hal.descriptor_set_layout | |
util.global.store %descriptor_set_layout, @_descriptor_set_layout_0 : !hal.descriptor_set_layout | |
%_descriptor_set_layout_0 = util.global.load @_descriptor_set_layout_0 : !hal.descriptor_set_layout | |
%device_1 = hal.ex.shared_device : !hal.device | |
%pipeline_layout = hal.pipeline_layout.create device(%device_1 : !hal.device) push_constants(0) layouts([%_descriptor_set_layout_0]) : !hal.pipeline_layout | |
util.global.store %pipeline_layout, @_pipeline_layout_0 : !hal.pipeline_layout | |
%_device_query_0 = util.global.load @_device_query_0 : i1 | |
%device_2 = hal.ex.shared_device : !hal.device | |
cf.cond_br %_device_query_0, ^bb1, ^bb2 | |
// Supported path: create the executable for the embedded ELF variant.
^bb1: // pred: ^bb0 | |
%_pipeline_layout_0 = util.global.load @_pipeline_layout_0 : !hal.pipeline_layout | |
%exe = hal.executable.create device(%device_2 : !hal.device) target(@_split_reduction_pass2_dispatch_0::@embedded_elf_x86_64) layouts([%_pipeline_layout_0]) : !hal.executable | |
cf.br ^bb3(%exe : !hal.executable) | |
// Unsupported path: use a null executable handle.
^bb2: // pred: ^bb0 | |
%0 = util.null : !hal.executable | |
cf.br ^bb3(%0 : !hal.executable) | |
// Join: %1 is either the created executable or null.
^bb3(%1: !hal.executable): // 2 preds: ^bb1, ^bb2 | |
util.global.store %1, @_executable__split_reduction_pass2_dispatch_0 : !hal.executable | |
cf.br ^bb4 | |
// Single exit block for the combined initializer.
^bb4: // pred: ^bb3 | |
util.initializer.return | |
} | |
// Public entry point (tagged iree.abi.stub): takes no arguments, forwards to the
// private @_split_reduction_pass2 implementation, and returns nothing.
func.func @split_reduction_pass2() attributes {iree.abi.stub} { | |
call @_split_reduction_pass2() : () -> () | |
return | |
} | |
// Executable containing the single dispatch entry point used by the test,
// provided as one variant (@embedded_elf_x86_64) already lowered to the LLVM dialect.
hal.executable private @_split_reduction_pass2_dispatch_0 { | |
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ { | |
// Export for ordinal 0. The region returns the constant triple (8, 16, 1);
// its block arguments are not used.
hal.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 ordinal(0) layout(#pipeline_layout) attributes {translation_info = #translation} { | |
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index): | |
%c1 = arith.constant 1 : index | |
%c16 = arith.constant 16 : index | |
%c8 = arith.constant 8 : index | |
hal.return %c8, %c16, %c1 : index, index, index | |
} | |
builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} { | |
// Dispatch body. For every element of a 32x32 tile it sums 128 consecutive i32
// values read through pointer %20 (32 loads of vector<4xi32> plus a horizontal
// add) and adds that sum to the i32 already stored at the matching position
// behind pointer %28. Always returns 0 (%0).
// %arg0 (environment) is not read; %arg1 is the dispatch state; %arg2 is the
// workgroup state.
llvm.func internal @_split_reduction_pass2_dispatch_0_generic_512x256x128(%arg0: !llvm.ptr<struct<"iree_hal_executable_environment_v0_t", (ptr<i32>, ptr<func<i32 (ptr<func<i32 (ptr<i8>)>>, ptr<i8>)>>, ptr<ptr<func<i32 (ptr<i8>)>>>, struct<"iree_hal_processor_v0_t", (array<8 x i64>)>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>> {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {sym_visibility = "private"} { | |
%0 = llvm.mlir.constant(0 : i32) : i32 | |
%1 = llvm.mlir.constant(32768 : i64) : i64 | |
%2 = llvm.mlir.constant(128 : i64) : i64 | |
%3 = llvm.mlir.constant(256 : i64) : i64 | |
%4 = llvm.mlir.constant(1 : i64) : i64 | |
%5 = llvm.mlir.constant(63 : index) : i64 | |
%6 = llvm.mlir.constant(128 : index) : i64 | |
%7 = llvm.mlir.constant(256 : index) : i64 | |
%8 = llvm.mlir.constant(32768 : index) : i64 | |
%9 = llvm.mlir.constant(512 : index) : i64 | |
%10 = llvm.mlir.constant(0 : i64) : i64 | |
%11 = llvm.mlir.constant(dense<0> : vector<4xi32>) : vector<4xi32> | |
%12 = llvm.mlir.constant(0 : index) : i64 | |
%13 = llvm.mlir.constant(dense<0> : vector<1xi32>) : vector<1xi32> | |
%14 = llvm.mlir.constant(4 : index) : i64 | |
%15 = llvm.mlir.constant(1 : index) : i64 | |
%16 = llvm.mlir.constant(32 : index) : i64 | |
// Entry 0 of the pointer table in dispatch-state field 10 (presumably the
// binding base pointers -- confirm against the HAL executable ABI), viewed as
// i32*. The assume states its low 6 bits are zero (64-byte aligned).
%17 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> | |
%18 = llvm.extractvalue %17[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)> | |
%19 = llvm.load %18 : !llvm.ptr<ptr<i8>> | |
%20 = llvm.bitcast %19 : !llvm.ptr<i8> to !llvm.ptr<i32> | |
%21 = llvm.ptrtoint %20 : !llvm.ptr<i32> to i64 | |
%22 = llvm.and %21, %5 : i64 | |
%23 = llvm.icmp "eq" %22, %12 : i64 | |
"llvm.intr.assume"(%23) : (i1) -> () | |
// Entry 1 of the same pointer table, with the same alignment assumption.
%24 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> | |
%25 = llvm.extractvalue %24[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)> | |
%26 = llvm.getelementptr %25[1] : (!llvm.ptr<ptr<i8>>) -> !llvm.ptr<ptr<i8>> | |
%27 = llvm.load %26 : !llvm.ptr<ptr<i8>> | |
%28 = llvm.bitcast %27 : !llvm.ptr<i8> to !llvm.ptr<i32> | |
%29 = llvm.ptrtoint %28 : !llvm.ptr<i32> to i64 | |
%30 = llvm.and %29, %5 : i64 | |
%31 = llvm.icmp "eq" %30, %12 : i64 | |
"llvm.intr.assume"(%31) : (i1) -> () | |
// Workgroup-state fields 0 and 1 (presumably the workgroup id along x and y --
// TODO confirm), zero-extended and scaled by 32 to give the tile origin:
// %38 = field0 * 32, %37 = field1 * 32.
%32 = llvm.load %arg2 : !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>> | |
%33 = llvm.extractvalue %32[0] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)> | |
%34 = llvm.zext %33 : i32 to i64 | |
%35 = llvm.extractvalue %32[1] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)> | |
%36 = llvm.zext %35 : i32 to i64 | |
%37 = llvm.mul %36, %16 : i64 | |
%38 = llvm.mul %34, %16 : i64 | |
// %41 is the address of element 4 of a null i32*, i.e. the byte size of four
// i32 values; it is then passed to alloca as the element count. The first four
// lanes of the scratch buffer %42 are zeroed with a vector store.
%39 = llvm.mlir.null : !llvm.ptr<i32> | |
%40 = llvm.getelementptr %39[4] : (!llvm.ptr<i32>) -> !llvm.ptr<i32> | |
%41 = llvm.ptrtoint %40 : !llvm.ptr<i32> to i64 | |
%42 = llvm.alloca %41 x i32 {alignment = 128 : i64} : (i64) -> !llvm.ptr<i32> | |
%43 = llvm.mul %12, %14 : i64 | |
%44 = llvm.add %43, %43 : i64 | |
%45 = llvm.add %44, %12 : i64 | |
%46 = llvm.getelementptr %42[%45] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32> | |
%47 = llvm.bitcast %46 : !llvm.ptr<i32> to !llvm.ptr<vector<4xi32>> | |
llvm.store %11, %47 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>> | |
// Tile-origin element offsets (terms multiplied by the zero constants vanish):
//   %51 = %37 * 256 + %38          -> later used to index %28
//   %57 = %37 * 32768 + %38 * 128  -> later used to index %20
%48 = llvm.mul %37, %7 : i64 | |
%49 = llvm.add %48, %12 : i64 | |
%50 = llvm.mul %38, %15 : i64 | |
%51 = llvm.add %49, %50 : i64 | |
%52 = llvm.mul %37, %8 : i64 | |
%53 = llvm.add %52, %12 : i64 | |
%54 = llvm.mul %38, %6 : i64 | |
%55 = llvm.add %53, %54 : i64 | |
%56 = llvm.mul %10, %15 : i64 | |
%57 = llvm.add %55, %56 : i64 | |
llvm.br ^bb1(%12 : i64) | |
// Loop 1: %58 runs over [0, 32) in steps of 1 (incremented in ^bb13).
^bb1(%58: i64): // 2 preds: ^bb0, ^bb13 | |
%59 = llvm.icmp "slt" %58, %16 : i64 | |
llvm.cond_br %59, ^bb2(%12 : i64), ^bb14 | |
// Loop 2: %60 runs over [0, 32) in steps of 4 (incremented in ^bb12).
^bb2(%60: i64): // 2 preds: ^bb1, ^bb12 | |
%61 = llvm.icmp "slt" %60, %16 : i64 | |
llvm.cond_br %61, ^bb3, ^bb13 | |
// Offsets for the current (%58, %60) pair:
//   %67 = %57 + %58 * 32768 + %60 * 128   (into %20)
//   %71 = %51 + %58 * 256 + %60           (into %28)
^bb3: // pred: ^bb2 | |
%62 = llvm.mul %58, %1 : i64 | |
%63 = llvm.add %57, %62 : i64 | |
%64 = llvm.mul %60, %2 : i64 | |
%65 = llvm.add %63, %64 : i64 | |
%66 = llvm.mul %10, %4 : i64 | |
%67 = llvm.add %65, %66 : i64 | |
%68 = llvm.mul %58, %3 : i64 | |
%69 = llvm.add %51, %68 : i64 | |
%70 = llvm.mul %60, %4 : i64 | |
%71 = llvm.add %69, %70 : i64 | |
llvm.br ^bb4(%12 : i64) | |
// Loop 3: %72 runs over [0, 4) in steps of 1 (incremented in ^bb11).
^bb4(%72: i64): // 2 preds: ^bb3, ^bb11 | |
%73 = llvm.icmp "slt" %72, %14 : i64 | |
llvm.cond_br %73, ^bb5, ^bb12 | |
// %78 = %67 + %72 * 128 is the first input element for this output element;
// %82 = %71 + %72 is the output element index.
// NOTE(review): the alloca below sits inside the three enclosing loops, so a
// fresh scratch buffer is requested on every iteration; nothing in this
// function releases it before the return.
^bb5: // pred: ^bb4 | |
%74 = llvm.mul %10, %9 : i64 | |
%75 = llvm.add %67, %74 : i64 | |
%76 = llvm.mul %72, %6 : i64 | |
%77 = llvm.add %75, %76 : i64 | |
%78 = llvm.add %77, %56 : i64 | |
%79 = llvm.mul %10, %14 : i64 | |
%80 = llvm.add %71, %79 : i64 | |
%81 = llvm.mul %72, %15 : i64 | |
%82 = llvm.add %80, %81 : i64 | |
%83 = llvm.alloca %41 x i32 {alignment = 128 : i64} : (i64) -> !llvm.ptr<i32> | |
llvm.br ^bb6(%12 : i64) | |
// Loop 4: copy four i32 values (the zeros stored earlier) from %42 into %83.
^bb6(%84: i64): // 2 preds: ^bb5, ^bb7 | |
%85 = llvm.icmp "slt" %84, %14 : i64 | |
llvm.cond_br %85, ^bb7, ^bb8 | |
^bb7: // pred: ^bb6 | |
%86 = llvm.add %44, %84 : i64 | |
%87 = llvm.getelementptr %42[%86] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32> | |
%88 = llvm.load %87 : !llvm.ptr<i32> | |
%89 = llvm.getelementptr %83[%86] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32> | |
llvm.store %88, %89 : !llvm.ptr<i32> | |
%90 = llvm.add %84, %15 : i64 | |
llvm.br ^bb6(%90 : i64) | |
// Load the initial vector accumulator from the freshly copied scratch buffer.
^bb8: // pred: ^bb6 | |
%91 = llvm.bitcast %83 : !llvm.ptr<i32> to !llvm.ptr<vector<4xi32>> | |
%92 = llvm.load %91 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>> | |
llvm.br ^bb9(%12, %92 : i64, vector<4xi32>) | |
// Loop 5: %93 runs over [0, 32) in steps of 1, carrying the accumulator %94.
^bb9(%93: i64, %94: vector<4xi32>): // 2 preds: ^bb8, ^bb10 | |
%95 = llvm.icmp "slt" %93, %16 : i64 | |
llvm.cond_br %95, ^bb10, ^bb11 | |
// Load vector<4xi32> at %20[%78 + %93 * 4] and add it lane-wise to %94.
^bb10: // pred: ^bb9 | |
%96 = llvm.mul %12, %6 : i64 | |
%97 = llvm.add %78, %96 : i64 | |
%98 = llvm.add %97, %96 : i64 | |
%99 = llvm.mul %93, %14 : i64 | |
%100 = llvm.add %98, %99 : i64 | |
%101 = llvm.add %100, %12 : i64 | |
%102 = llvm.getelementptr %20[%101] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32> | |
%103 = llvm.bitcast %102 : !llvm.ptr<i32> to !llvm.ptr<vector<4xi32>> | |
%104 = llvm.load %103 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>> | |
%105 = llvm.add %104, %94 : vector<4xi32> | |
%106 = llvm.add %93, %15 : i64 | |
llvm.br ^bb9(%106, %105 : i64, vector<4xi32>) | |
// Spill the accumulator to %83 and reload it, reduce its four lanes to a
// scalar, add the scalar to the value currently at %28[%82], and store the sum
// back. The insertelement/extractelement chain on vector<1xi32> only
// round-trips the scalar.
// NOTE(review): this is a read-modify-write of %28[%82]; the stored result
// includes whatever that location held before the dispatch ran.
^bb11: // pred: ^bb9 | |
llvm.store %94, %91 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>> | |
%107 = llvm.load %91 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>> | |
%108 = llvm.getelementptr %28[%82] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32> | |
%109 = llvm.load %108 : !llvm.ptr<i32> | |
%110 = llvm.mlir.undef : vector<1xi32> | |
%111 = llvm.insertelement %109, %110[%0 : i32] : vector<1xi32> | |
%112 = llvm.extractelement %111[%12 : i64] : vector<1xi32> | |
%113 = "llvm.intr.vector.reduce.add"(%107) : (vector<4xi32>) -> i32 | |
%114 = llvm.add %112, %113 : i32 | |
%115 = llvm.insertelement %114, %13[%12 : i64] : vector<1xi32> | |
%116 = llvm.extractelement %115[%10 : i64] : vector<1xi32> | |
%117 = llvm.insertelement %116, %110[%0 : i32] : vector<1xi32> | |
%118 = llvm.extractelement %117[%12 : i64] : vector<1xi32> | |
llvm.store %118, %108 : !llvm.ptr<i32> | |
%119 = llvm.add %72, %15 : i64 | |
llvm.br ^bb4(%119 : i64) | |
// Latch of loop 2: advance %60 by 4.
^bb12: // pred: ^bb4 | |
%120 = llvm.add %60, %14 : i64 | |
llvm.br ^bb2(%120 : i64) | |
// Latch of loop 1: advance %58 by 1.
^bb13: // pred: ^bb2 | |
%121 = llvm.add %58, %15 : i64 | |
llvm.br ^bb1(%121 : i64) | |
// Exit: return 0.
^bb14: // pred: ^bb1 | |
llvm.return %0 : i32 | |
} | |
} | |
} | |
} | |
// Host-side test body. It:
//   1. allocates a 67108864-byte buffer and fills it with the i32 pattern 1,
//   2. allocates a 1048576-byte buffer, binds both buffers, and records a
//      dispatch of export ordinal 0 with workgroup count (8, 16, 1),
//   3. fills bytes [524288, 1048576) of the second buffer with the i32
//      pattern 128,
//   4. compares the two 524288-byte halves of the second buffer as 512x256
//      buffer views with check.expect_eq.
// If @_device_query_0 is false it reaches util.unreachable instead.
// NOTE(review): no command recorded here writes bytes [0, 524288) of %buffer_0
// before the dispatch, and the dispatch body in
// @_split_reduction_pass2_dispatch_0 loads from its second pointer before
// storing to it. If that pointer is binding 1 (presumably -- confirm), the
// comparison depends on the initial contents of the allocation.
func.func private @_split_reduction_pass2() { | |
%c1 = arith.constant 1 : index | |
%c256 = arith.constant 256 : index | |
%c512 = arith.constant 512 : index | |
%c128_i32 = arith.constant 128 : i32 | |
%c1_i32 = arith.constant 1 : i32 | |
%c67108864 = arith.constant 67108864 : index | |
%c524288 = arith.constant 524288 : index | |
%c0 = arith.constant 0 : index | |
%c1048576 = arith.constant 1048576 : index | |
%c-1_i64 = arith.constant -1 : i64 | |
%c-1_i32 = arith.constant -1 : i32 | |
%c16 = arith.constant 16 : index | |
%c8 = arith.constant 8 : index | |
// Passed as the buffer-view element type below; 268435488 is 0x10000020
// (presumably the encoding of a 32-bit integer element type -- TODO confirm).
%c268435488_i32 = arith.constant 268435488 : i32 | |
// Cached device resources produced by the module initializer.
%_device_query_0 = util.global.load @_device_query_0 : i1 | |
%_pipeline_layout_0 = util.global.load @_pipeline_layout_0 : !hal.pipeline_layout | |
%_executable__split_reduction_pass2_dispatch_0 = util.global.load @_executable__split_reduction_pass2_dispatch_0 : !hal.executable | |
%device = hal.ex.shared_device : !hal.device | |
%allocator = hal.device.allocator<%device : !hal.device> : !hal.allocator | |
// First submission: fill the whole 67108864-byte buffer with i32 1, then block
// on the fence until it completes.
%buffer = hal.allocator.allocate<%allocator : !hal.allocator> type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c67108864} | |
%cmd = hal.command_buffer.create device(%device : !hal.device) mode("OneShot|AllowInlineExecution") categories(Transfer) : !hal.command_buffer | |
hal.command_buffer.fill_buffer<%cmd : !hal.command_buffer> target(%buffer : !hal.buffer)[%c0, %c67108864] pattern(%c1_i32 : i32) | |
hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") | |
hal.command_buffer.finalize<%cmd : !hal.command_buffer> | |
// Null fence used as the wait operand of both queue submissions.
%0 = util.null : !hal.fence | |
%fence = hal.fence.create device(%device : !hal.device) flags("None") : !hal.fence | |
hal.device.queue.execute<%device : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) commands([%cmd]) | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32 | |
util.status.check_ok %status, "failed to wait on timepoint" | |
// The filled buffer is passed through util.do_not_optimize and its length is
// queried at runtime for the binding range.
%1 = util.do_not_optimize(%buffer) : !hal.buffer | |
%len = hal.buffer.length<%1 : !hal.buffer> : index | |
// Host-visible 1048576-byte buffer: the first half receives the dispatch
// output, the second half the expected pattern.
%buffer_0 = hal.allocator.allocate<%allocator : !hal.allocator> type("HostVisible|DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage|MappingScoped|MappingAccessRandom|Mapping") : !hal.buffer{%c1048576} | |
%cmd_1 = hal.command_buffer.create device(%device : !hal.device) mode("OneShot|AllowInlineExecution") categories("Transfer|Dispatch") : !hal.command_buffer | |
cf.cond_br %_device_query_0, ^bb1, ^bb2 | |
// Supported path: bind the buffers, dispatch, and record the expected-value fill.
^bb1: // pred: ^bb0 | |
hal.command_buffer.push_descriptor_set<%cmd_1 : !hal.command_buffer> layout(%_pipeline_layout_0 : !hal.pipeline_layout)[%c0] bindings([ | |
%c0 = (%1 : !hal.buffer)[%c0, %len], | |
%c1 = (%buffer_0 : !hal.buffer)[%c0, %c1048576] | |
]) | |
hal.command_buffer.dispatch<%cmd_1 : !hal.command_buffer> target(%_executable__split_reduction_pass2_dispatch_0 : !hal.executable)[0] workgroups([%c8, %c16, %c1]) | |
// Expected data: i32 128 across the second 524288 bytes of %buffer_0.
hal.command_buffer.fill_buffer<%cmd_1 : !hal.command_buffer> target(%buffer_0 : !hal.buffer)[%c524288, %c524288] pattern(%c128_i32 : i32) | |
hal.command_buffer.execution_barrier<%cmd_1 : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") | |
hal.command_buffer.finalize<%cmd_1 : !hal.command_buffer> | |
%fence_2 = hal.fence.create device(%device : !hal.device) flags("None") : !hal.fence | |
hal.device.queue.execute<%device : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence_2) commands([%cmd_1]) | |
%status_3 = hal.fence.await until([%fence_2]) timeout_millis(%c-1_i32) : i32 | |
util.status.check_ok %status_3, "failed to wait on timepoint" | |
// %view covers bytes [0, 524288) (dispatch output); %view_4 covers bytes
// [524288, 1048576) (expected values). Both are shaped 512x256.
%view = hal.buffer_view.create buffer(%buffer_0 : !hal.buffer)[%c0, %c524288] shape([%c512, %c256]) type(%c268435488_i32) encoding(%c1_i32) : !hal.buffer_view | |
%view_4 = hal.buffer_view.create buffer(%buffer_0 : !hal.buffer)[%c524288, %c524288] shape([%c512, %c256]) type(%c268435488_i32) encoding(%c1_i32) : !hal.buffer_view | |
check.expect_eq(%view, %view_4) : !hal.buffer_view | |
return | |
// Unsupported path: the device query reported the executable format as unavailable.
^bb2: // pred: ^bb0 | |
util.unreachable "device not supported in the compiled configuration" | |
} | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) ('builtin.module' operation) //----- // | |
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}> | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]> | |
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert> | |
module attributes {hal.device.targets = [#device_target_llvm_cpu]} { | |
util.global private @_device_query_0 : i1 | |
util.global private @_descriptor_set_layout_0 : !hal.descriptor_set_layout | |
util.global private @_pipeline_layout_0 : !hal.pipeline_layout | |
util.global private @_executable__split_reduction_pass2_dispatch_0 : !hal.executable | |
util.initializer { | |
%device = hal.ex.shared_device : !hal.device | |
%ok, %value = hal.device.query<%device : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false | |
util.global.store %value, @_device_query_0 : i1 | |
%device_0 = hal.ex.shared_device : !hal.device | |
%descriptor_set_layout = hal.descriptor_set_layout.create device(%device_0 : !hal.device) flags("None") bindings([#hal.descriptor_set.binding<0, storage_buffer, ReadOnly>, #hal.descriptor_set.binding<1, storage_buffer>]) : !hal.descriptor_set_layout | |
util.global.store %descriptor_set_layout, @_descriptor_set_layout_0 : !hal.descriptor_set_layout | |
%_descriptor_set_layout_0 = util.global.load @_descriptor_set_layout_0 : !hal.descriptor_set_layout | |
%device_1 = hal.ex.shared_device : !hal.device | |
%pipeline_layout = hal.pipeline_layout.create device(%device_1 : !hal.device) push_constants(0) layouts([%_descriptor_set_layout_0]) : !hal.pipeline_layout | |
util.global.store %pipeline_layout, @_pipeline_layout_0 : !hal.pipeline_layout | |
%_device_query_0 = util.global.load @_device_query_0 : i1 | |
%device_2 = hal.ex.shared_device : !hal.device | |
cf.cond_br %_device_query_0, ^bb1, ^bb2 | |
^bb1: // pred: ^bb0 | |
%_pipeline_layout_0 = util.global.load @_pipeline_layout_0 : !hal.pipeline_layout | |
%exe = hal.executable.create device(%device_2 : !hal.device) target(@_split_reduction_pass2_dispatch_0::@embedded_elf_x86_64) layouts([%_pipeline_layout_0]) : !hal.executable | |
cf.br ^bb3(%exe : !hal.executable) | |
^bb2: // pred: ^bb0 | |
%0 = util.null : !hal.executable | |
cf.br ^bb3(%0 : !hal.executable) | |
^bb3(%1: !hal.executable): // 2 preds: ^bb1, ^bb2 | |
util.global.store %1, @_executable__split_reduction_pass2_dispatch_0 : !hal.executable | |
util.initializer.return | |
} | |
func.func @split_reduction_pass2() attributes {iree.abi.stub} { | |
call @_split_reduction_pass2() : () -> () | |
return | |
} | |
hal.executable private @_split_reduction_pass2_dispatch_0 { | |
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ { | |
hal.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 ordinal(0) layout(#pipeline_layout) attributes {translation_info = #translation} { | |
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index): | |
%c1 = arith.constant 1 : index | |
%c16 = arith.constant 16 : index | |
%c8 = arith.constant 8 : index | |
hal.return %c8, %c16, %c1 : index, index, index | |
} | |
builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} { | |
llvm.func internal @_split_reduction_pass2_dispatch_0_generic_512x256x128(%arg0: !llvm.ptr<struct<"iree_hal_executable_environment_v0_t", (ptr<i32>, ptr<func<i32 (ptr<func<i32 (ptr<i8>)>>, ptr<i8>)>>, ptr<ptr<func<i32 (ptr<i8>)>>>, struct<"iree_hal_processor_v0_t", (array<8 x i64>)>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>> {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {sym_visibility = "private"} { | |
%0 = llvm.mlir.constant(0 : i32) : i32 | |
%1 = llvm.mlir.constant(32768 : i64) : i64 | |
%2 = llvm.mlir.constant(128 : i64) : i64 | |
%3 = llvm.mlir.constant(256 : i64) : i64 | |
%4 = llvm.mlir.constant(1 : i64) : i64 | |
%5 = llvm.mlir.constant(63 : index) : i64 | |
%6 = llvm.mlir.constant(128 : index) : i64 | |
%7 = llvm.mlir.constant(256 : index) : i64 | |
%8 = llvm.mlir.constant(32768 : index) : i64 | |
%9 = llvm.mlir.constant(512 : index) : i64 | |
%10 = llvm.mlir.constant(0 : i64) : i64 | |
%11 = llvm.mlir.constant(dense<0> : vector<4xi32>) : vector<4xi32> | |
%12 = llvm.mlir.constant(0 : index) : i64 | |
%13 = llvm.mlir.constant(dense<0> : vector<1xi32>) : vector<1xi32> | |
%14 = llvm.mlir.constant(4 : index) : i64 | |
%15 = llvm.mlir.constant(1 : index) : i64 | |
%16 = llvm.mlir.constant(32 : index) : i64 | |
%17 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> | |
%18 = llvm.extractvalue %17[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)> | |
%19 = llvm.load %18 : !llvm.ptr<ptr<i8>> | |
%20 = llvm.bitcast %19 : !llvm.ptr<i8> to !llvm.ptr<i32> | |
%21 = llvm.ptrtoint %20 : !llvm.ptr<i32> to i64 | |
%22 = llvm.and %21, %5 : i64 | |
%23 = llvm.icmp "eq" %22, %12 : i64 | |
"llvm.intr.assume"(%23) : (i1) -> () | |
%24 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> | |
%25 = llvm.extractvalue %24[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)> | |
%26 = llvm.getelementptr %25[1] : (!llvm.ptr<ptr<i8>>) -> !llvm.ptr<ptr<i8>> | |
%27 = llvm.load %26 : !llvm.ptr<ptr<i8>> | |
%28 = llvm.bitcast %27 : !llvm.ptr<i8> to !llvm.ptr<i32> | |
%29 = llvm.ptrtoint %28 : !llvm.ptr<i32> to i64 | |
%30 = llvm.and %29, %5 : i64 | |
%31 = llvm.icmp "eq" %30, %12 : i64 | |
"llvm.intr.assume"(%31) : (i1) -> () | |
%32 = llvm.load %arg2 : !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>> | |
%33 = llvm.extractvalue %32[0] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)> | |
%34 = llvm.zext %33 : i32 to i64 | |
%35 = llvm.extractvalue %32[1] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)> | |
%36 = llvm.zext %35 : i32 to i64 | |
%37 = llvm.mul %36, %16 : i64 | |
%38 = llvm.mul %34, %16 : i64 | |
%39 = llvm.mlir.null : !llvm.ptr<i32> | |
%40 = llvm.getelementptr %39[4] : (!llvm.ptr<i32>) -> !llvm.ptr<i32> | |
%41 = llvm.ptrtoint %40 : !llvm.ptr<i32> to i64 | |
%42 = llvm.alloca %41 x i32 {alignment = 128 : i64} : (i64) -> !llvm.ptr<i32> | |
%43 = llvm.mul %12, %14 : i64 | |
%44 = llvm.add %43, %43 : i64 | |
%45 = llvm.add %44, %12 : i64 | |
%46 = llvm.getelementptr %42[%45] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32> | |
%47 = llvm.bitcast %46 : !llvm.ptr<i32> to !llvm.ptr<vector<4xi32>> | |
llvm.store %11, %47 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>> | |
%48 = llvm.mul %37, %7 : i64 | |
%49 = llvm.add %48, %12 : i64 | |
%50 = llvm.mul %38, %15 : i64 | |
%51 = llvm.add %49, %50 : i64 | |
%52 = llvm.mul %37, %8 : i64 | |
%53 = llvm.add %52, %12 : i64 | |
%54 = llvm.mul %38, %6 : i64 | |
%55 = llvm.add %53, %54 : i64 | |
%56 = llvm.mul %10, %15 : i64 | |
%57 = llvm.add %55, %56 : i64 | |
llvm.br ^bb1(%12 : i64) | |
^bb1(%58: i64): // 2 preds: ^bb0, ^bb13 | |
%59 = llvm.icmp "slt" %58, %16 : i64 | |
llvm.cond_br %59, ^bb2(%12 : i64), ^bb14 | |
^bb2(%60: i64): // 2 preds: ^bb1, ^bb12 | |
%61 = llvm.icmp "slt" %60, %16 : i64 | |
llvm.cond_br %61, ^bb3, ^bb13 | |
^bb3: // pred: ^bb2 | |
%62 = llvm.mul %58, %1 : i64 | |
%63 = llvm.add %57, %62 : i64 | |
%64 = llvm.mul %60, %2 : i64 | |
%65 = llvm.add %63, %64 : i64 | |
%66 = llvm.mul %10, %4 : i64 | |
%67 = llvm.add %65, %66 : i64 | |
%68 = llvm.mul %58, %3 : i64 | |
%69 = llvm.add %51, %68 : i64 | |
%70 = llvm.mul %60, %4 : i64 | |
%71 = llvm.add %69, %70 : i64 | |
llvm.br ^bb4(%12 : i64) | |
^bb4(%72: i64): // 2 preds: ^bb3, ^bb11 | |
%73 = llvm.icmp "slt" %72, %14 : i64 | |
llvm.cond_br %73, ^bb5, ^bb12 | |
^bb5: // pred: ^bb4 | |
%74 = llvm.mul %10, %9 : i64 | |
%75 = llvm.add %67, %74 : i64 | |
%76 = llvm.mul %72, %6 : i64 | |
%77 = llvm.add %75, %76 : i64 | |
%78 = llvm.add %77, %56 : i64 | |
%79 = llvm.mul %10, %14 : i64 | |
%80 = llvm.add %71, %79 : i64 | |
%81 = llvm.mul %72, %15 : i64 | |
%82 = llvm.add %80, %81 : i64 | |
%83 = llvm.alloca %41 x i32 {alignment = 128 : i64} : (i64) -> !llvm.ptr<i32> | |
llvm.br ^bb6(%12 : i64) | |
^bb6(%84: i64): // 2 preds: ^bb5, ^bb7 | |
%85 = llvm.icmp "slt" %84, %14 : i64 | |
llvm.cond_br %85, ^bb7, ^bb8 | |
^bb7: // pred: ^bb6 | |
%86 = llvm.add %44, %84 : i64 | |
%87 = llvm.getelementptr %42[%86] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32> | |
%88 = llvm.load %87 : !llvm.ptr<i32> | |
%89 = llvm.getelementptr %83[%86] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32> | |
llvm.store %88, %89 : !llvm.ptr<i32> | |
%90 = llvm.add %84, %15 : i64 | |
llvm.br ^bb6(%90 : i64) | |
^bb8: // pred: ^bb6 | |
%91 = llvm.bitcast %83 : !llvm.ptr<i32> to !llvm.ptr<vector<4xi32>> | |
%92 = llvm.load %91 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>> | |
llvm.br ^bb9(%12, %92 : i64, vector<4xi32>) | |
^bb9(%93: i64, %94: vector<4xi32>): // 2 preds: ^bb8, ^bb10 | |
%95 = llvm.icmp "slt" %93, %16 : i64 | |
llvm.cond_br %95, ^bb10, ^bb11 | |
^bb10: // pred: ^bb9 | |
%96 = llvm.mul %12, %6 : i64 | |
%97 = llvm.add %78, %96 : i64 | |
%98 = llvm.add %97, %96 : i64 | |
%99 = llvm.mul %93, %14 : i64 | |
%100 = llvm.add %98, %99 : i64 | |
%101 = llvm.add %100, %12 : i64 | |
%102 = llvm.getelementptr %20[%101] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32> | |
%103 = llvm.bitcast %102 : !llvm.ptr<i32> to !llvm.ptr<vector<4xi32>> | |
%104 = llvm.load %103 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>> | |
%105 = llvm.add %104, %94 : vector<4xi32> | |
%106 = llvm.add %93, %15 : i64 | |
llvm.br ^bb9(%106, %105 : i64, vector<4xi32>) | |
^bb11: // pred: ^bb9 | |
llvm.store %94, %91 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>> | |
%107 = llvm.load %91 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>> | |
%108 = llvm.getelementptr %28[%82] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32> | |
%109 = llvm.load %108 : !llvm.ptr<i32> | |
%110 = llvm.mlir.undef : vector<1xi32> | |
%111 = llvm.insertelement %109, %110[%0 : i32] : vector<1xi32> | |
%112 = llvm.extractelement %111[%12 : i64] : vector<1xi32> | |
%113 = "llvm.intr.vector.reduce.add"(%107) : (vector<4xi32>) -> i32 | |
%114 = llvm.add %112, %113 : i32 | |
%115 = llvm.insertelement %114, %13[%12 : i64] : vector<1xi32> | |
%116 = llvm.extractelement %115[%10 : i64] : vector<1xi32> | |
%117 = llvm.insertelement %116, %110[%0 : i32] : vector<1xi32> | |
%118 = llvm.extractelement %117[%12 : i64] : vector<1xi32> | |
llvm.store %118, %108 : !llvm.ptr<i32> | |
%119 = llvm.add %72, %15 : i64 | |
llvm.br ^bb4(%119 : i64) | |
^bb12: // pred: ^bb4 | |
%120 = llvm.add %60, %14 : i64 | |
llvm.br ^bb2(%120 : i64) | |
^bb13: // pred: ^bb2 | |
%121 = llvm.add %58, %15 : i64 | |
llvm.br ^bb1(%121 : i64) | |
^bb14: // pred: ^bb1 | |
llvm.return %0 : i32 | |
} | |
} | |
} | |
} | |
func.func private @_split_reduction_pass2() { | |
%c1 = arith.constant 1 : index | |
%c256 = arith.constant 256 : index | |
%c512 = arith.constant 512 : index | |
%c128_i32 = arith.constant 128 : i32 | |
%c1_i32 = arith.constant 1 : i32 | |
%c67108864 = arith.constant 67108864 : index | |
%c524288 = arith.constant 524288 : index | |
%c0 = arith.constant 0 : index | |
%c1048576 = arith.constant 1048576 : index | |
%c-1_i64 = arith.constant -1 : i64 | |
%c-1_i32 = arith.constant -1 : i32 | |
%c16 = arith.constant 16 : index | |
%c8 = arith.constant 8 : index | |
%c268435488_i32 = arith.constant 268435488 : i32 | |
%_device_query_0 = util.global.load @_device_query_0 : i1 | |
%_pipeline_layout_0 = util.global.load @_pipeline_layout_0 : !hal.pipeline_layout | |
%_executable__split_reduction_pass2_dispatch_0 = util.global.load @_executable__split_reduction_pass2_dispatch_0 : !hal.executable | |
%device = hal.ex.shared_device : !hal.device | |
%allocator = hal.device.allocator<%device : !hal.device> : !hal.allocator | |
%buffer = hal.allocator.allocate<%allocator : !hal.allocator> type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c67108864} | |
%cmd = hal.command_buffer.create device(%device : !hal.device) mode("OneShot|AllowInlineExecution") categories(Transfer) : !hal.command_buffer | |
hal.command_buffer.fill_buffer<%cmd : !hal.command_buffer> target(%buffer : !hal.buffer)[%c0, %c67108864] pattern(%c1_i32 : i32) | |
hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") | |
hal.command_buffer.finalize<%cmd : !hal.command_buffer> | |
%0 = util.null : !hal.fence | |
%fence = hal.fence.create device(%device : !hal.device) flags("None") : !hal.fence | |
hal.device.queue.execute<%device : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) commands([%cmd]) | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32 | |
util.status.check_ok %status, "failed to wait on timepoint" | |
%1 = util.do_not_optimize(%buffer) : !hal.buffer | |
%len = hal.buffer.length<%1 : !hal.buffer> : index | |
%buffer_0 = hal.allocator.allocate<%allocator : !hal.allocator> type("HostVisible|DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage|MappingScoped|MappingAccessRandom|Mapping") : !hal.buffer{%c1048576} | |
%cmd_1 = hal.command_buffer.create device(%device : !hal.device) mode("OneShot|AllowInlineExecution") categories("Transfer|Dispatch") : !hal.command_buffer | |
cf.cond_br %_device_query_0, ^bb1, ^bb2 | |
^bb1: // pred: ^bb0 | |
hal.command_buffer.push_descriptor_set<%cmd_1 : !hal.command_buffer> layout(%_pipeline_layout_0 : !hal.pipeline_layout)[%c0] bindings([ | |
%c0 = (%1 : !hal.buffer)[%c0, %len], | |
%c1 = (%buffer_0 : !hal.buffer)[%c0, %c1048576] | |
]) | |
hal.command_buffer.dispatch<%cmd_1 : !hal.command_buffer> target(%_executable__split_reduction_pass2_dispatch_0 : !hal.executable)[0] workgroups([%c8, %c16, %c1]) | |
hal.command_buffer.fill_buffer<%cmd_1 : !hal.command_buffer> target(%buffer_0 : !hal.buffer)[%c524288, %c524288] pattern(%c128_i32 : i32) | |
hal.command_buffer.execution_barrier<%cmd_1 : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") | |
hal.command_buffer.finalize<%cmd_1 : !hal.command_buffer> | |
%fence_2 = hal.fence.create device(%device : !hal.device) flags("None") : !hal.fence | |
hal.device.queue.execute<%device : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence_2) commands([%cmd_1]) | |
%status_3 = hal.fence.await until([%fence_2]) timeout_millis(%c-1_i32) : i32 | |
util.status.check_ok %status_3, "failed to wait on timepoint" | |
%view = hal.buffer_view.create buffer(%buffer_0 : !hal.buffer)[%c0, %c524288] shape([%c512, %c256]) type(%c268435488_i32) encoding(%c1_i32) : !hal.buffer_view | |
%view_4 = hal.buffer_view.create buffer(%buffer_0 : !hal.buffer)[%c524288, %c524288] shape([%c512, %c256]) type(%c268435488_i32) encoding(%c1_i32) : !hal.buffer_view | |
check.expect_eq(%view, %view_4) : !hal.buffer_view | |
return | |
^bb2: // pred: ^bb0 | |
util.unreachable "device not supported in the compiled configuration" | |
} | |
} | |
// -----// IR Dump After CSE (cse) ('builtin.module' operation) //----- // | |
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}> | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]> | |
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert> | |
module attributes {hal.device.targets = [#device_target_llvm_cpu]} { | |
util.global private @_device_query_0 : i1 | |
util.global private @_descriptor_set_layout_0 : !hal.descriptor_set_layout | |
util.global private @_pipeline_layout_0 : !hal.pipeline_layout | |
util.global private @_executable__split_reduction_pass2_dispatch_0 : !hal.executable | |
util.initializer { | |
%device = hal.ex.shared_device : !hal.device | |
%ok, %value = hal.device.query<%device : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false | |
util.global.store %value, @_device_query_0 : i1 | |
%descriptor_set_layout = hal.descriptor_set_layout.create device(%device : !hal.device) flags("None") bindings([#hal.descriptor_set.binding<0, storage_buffer, ReadOnly>, #hal.descriptor_set.binding<1, storage_buffer>]) : !hal.descriptor_set_layout | |
util.global.store %descriptor_set_layout, @_descriptor_set_layout_0 : !hal.descriptor_set_layout | |
%_descriptor_set_layout_0 = util.global.load @_descriptor_set_layout_0 : !hal.descriptor_set_layout | |
%pipeline_layout = hal.pipeline_layout.create device(%device : !hal.device) push_constants(0) layouts([%_descriptor_set_layout_0]) : !hal.pipeline_layout | |
util.global.store %pipeline_layout, @_pipeline_layout_0 : !hal.pipeline_layout | |
%_device_query_0 = util.global.load @_device_query_0 : i1 | |
cf.cond_br %_device_query_0, ^bb1, ^bb2 | |
^bb1: // pred: ^bb0 | |
%_pipeline_layout_0 = util.global.load @_pipeline_layout_0 : !hal.pipeline_layout | |
%exe = hal.executable.create device(%device : !hal.device) target(@_split_reduction_pass2_dispatch_0::@embedded_elf_x86_64) layouts([%_pipeline_layout_0]) : !hal.executable | |
cf.br ^bb3(%exe : !hal.executable) | |
^bb2: // pred: ^bb0 | |
%0 = util.null : !hal.executable | |
cf.br ^bb3(%0 : !hal.executable) | |
^bb3(%1: !hal.executable): // 2 preds: ^bb1, ^bb2 | |
util.global.store %1, @_executable__split_reduction_pass2_dispatch_0 : !hal.executable | |
util.initializer.return | |
} | |
func.func @split_reduction_pass2() attributes {iree.abi.stub} { | |
call @_split_reduction_pass2() : () -> () | |
return | |
} | |
hal.executable private @_split_reduction_pass2_dispatch_0 { | |
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ { | |
hal.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 ordinal(0) layout(#pipeline_layout) attributes {translation_info = #translation} { | |
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index): | |
%c1 = arith.constant 1 : index | |
%c16 = arith.constant 16 : index | |
%c8 = arith.constant 8 : index | |
hal.return %c8, %c16, %c1 : index, index, index | |
} | |
builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} { | |
llvm.func internal @_split_reduction_pass2_dispatch_0_generic_512x256x128(%arg0: !llvm.ptr<struct<"iree_hal_executable_environment_v0_t", (ptr<i32>, ptr<func<i32 (ptr<func<i32 (ptr<i8>)>>, ptr<i8>)>>, ptr<ptr<func<i32 (ptr<i8>)>>>, struct<"iree_hal_processor_v0_t", (array<8 x i64>)>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>> {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {sym_visibility = "private"} { | |
%0 = llvm.mlir.constant(0 : i32) : i32 | |
%1 = llvm.mlir.constant(32768 : i64) : i64 | |
%2 = llvm.mlir.constant(128 : i64) : i64 | |
%3 = llvm.mlir.constant(256 : i64) : i64 | |
%4 = llvm.mlir.constant(1 : i64) : i64 | |
%5 = llvm.mlir.constant(63 : index) : i64 | |
%6 = llvm.mlir.constant(128 : index) : i64 | |
%7 = llvm.mlir.constant(256 : index) : i64 | |
%8 = llvm.mlir.constant(32768 : index) : i64 | |
%9 = llvm.mlir.constant(512 : index) : i64 | |
%10 = llvm.mlir.constant(0 : i64) : i64 | |
%11 = llvm.mlir.constant(dense<0> : vector<4xi32>) : vector<4xi32> | |
%12 = llvm.mlir.constant(0 : index) : i64 | |
%13 = llvm.mlir.constant(dense<0> : vector<1xi32>) : vector<1xi32> | |
%14 = llvm.mlir.constant(4 : index) : i64 | |
%15 = llvm.mlir.constant(1 : index) : i64 | |
%16 = llvm.mlir.constant(32 : index) : i64 | |
%17 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> | |
%18 = llvm.extractvalue %17[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)> | |
%19 = llvm.load %18 : !llvm.ptr<ptr<i8>> | |
%20 = llvm.bitcast %19 : !llvm.ptr<i8> to !llvm.ptr<i32> | |
%21 = llvm.ptrtoint %20 : !llvm.ptr<i32> to i64 | |
%22 = llvm.and %21, %5 : i64 | |
%23 = llvm.icmp "eq" %22, %12 : i64 | |
"llvm.intr.assume"(%23) : (i1) -> () | |
%24 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> | |
%25 = llvm.extractvalue %24[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)> | |
%26 = llvm.getelementptr %25[1] : (!llvm.ptr<ptr<i8>>) -> !llvm.ptr<ptr<i8>> | |
%27 = llvm.load %26 : !llvm.ptr<ptr<i8>> | |
%28 = llvm.bitcast %27 : !llvm.ptr<i8> to !llvm.ptr<i32> | |
%29 = llvm.ptrtoint %28 : !llvm.ptr<i32> to i64 | |
%30 = llvm.and %29, %5 : i64 | |
%31 = llvm.icmp "eq" %30, %12 : i64 | |
"llvm.intr.assume"(%31) : (i1) -> () | |
%32 = llvm.load %arg2 : !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>> | |
%33 = llvm.extractvalue %32[0] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)> | |
%34 = llvm.zext %33 : i32 to i64 | |
%35 = llvm.extractvalue %32[1] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)> | |
%36 = llvm.zext %35 : i32 to i64 | |
%37 = llvm.mul %36, %16 : i64 | |
%38 = llvm.mul %34, %16 : i64 | |
%39 = llvm.mlir.null : !llvm.ptr<i32> | |
%40 = llvm.getelementptr %39[4] : (!llvm.ptr<i32>) -> !llvm.ptr<i32> | |
%41 = llvm.ptrtoint %40 : !llvm.ptr<i32> to i64 | |
%42 = llvm.alloca %41 x i32 {alignment = 128 : i64} : (i64) -> !llvm.ptr<i32> | |
%43 = llvm.mul %12, %14 : i64 | |
%44 = llvm.add %43, %43 : i64 | |
%45 = llvm.add %44, %12 : i64 | |
%46 = llvm.getelementptr %42[%45] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32> | |
%47 = llvm.bitcast %46 : !llvm.ptr<i32> to !llvm.ptr<vector<4xi32>> | |
llvm.store %11, %47 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>> | |
%48 = llvm.mul %37, %7 : i64 | |
%49 = llvm.add %48, %12 : i64 | |
%50 = llvm.mul %38, %15 : i64 | |
%51 = llvm.add %49, %50 : i64 | |
%52 = llvm.mul %37, %8 : i64 | |
%53 = llvm.add %52, %12 : i64 | |
%54 = llvm.mul %38, %6 : i64 | |
%55 = llvm.add %53, %54 : i64 | |
%56 = llvm.mul %10, %15 : i64 | |
%57 = llvm.add %55, %56 : i64 | |
llvm.br ^bb1(%12 : i64) | |
^bb1(%58: i64): // 2 preds: ^bb0, ^bb13 | |
%59 = llvm.icmp "slt" %58, %16 : i64 | |
llvm.cond_br %59, ^bb2(%12 : i64), ^bb14 | |
^bb2(%60: i64): // 2 preds: ^bb1, ^bb12 | |
%61 = llvm.icmp "slt" %60, %16 : i64 | |
llvm.cond_br %61, ^bb3, ^bb13 | |
^bb3: // pred: ^bb2 | |
%62 = llvm.mul %58, %1 : i64 | |
%63 = llvm.add %57, %62 : i64 | |
%64 = llvm.mul %60, %2 : i64 | |
%65 = llvm.add %63, %64 : i64 | |
%66 = llvm.mul %10, %4 : i64 | |
%67 = llvm.add %65, %66 : i64 | |
%68 = llvm.mul %58, %3 : i64 | |
%69 = llvm.add %51, %68 : i64 | |
%70 = llvm.mul %60, %4 : i64 | |
%71 = llvm.add %69, %70 : i64 | |
llvm.br ^bb4(%12 : i64) | |
^bb4(%72: i64): // 2 preds: ^bb3, ^bb11 | |
%73 = llvm.icmp "slt" %72, %14 : i64 | |
llvm.cond_br %73, ^bb5, ^bb12 | |
^bb5: // pred: ^bb4 | |
%74 = llvm.mul %10, %9 : i64 | |
%75 = llvm.add %67, %74 : i64 | |
%76 = llvm.mul %72, %6 : i64 | |
%77 = llvm.add %75, %76 : i64 | |
%78 = llvm.add %77, %56 : i64 | |
%79 = llvm.mul %10, %14 : i64 | |
%80 = llvm.add %71, %79 : i64 | |
%81 = llvm.mul %72, %15 : i64 | |
%82 = llvm.add %80, %81 : i64 | |
%83 = llvm.alloca %41 x i32 {alignment = 128 : i64} : (i64) -> !llvm.ptr<i32> | |
llvm.br ^bb6(%12 : i64) | |
^bb6(%84: i64): // 2 preds: ^bb5, ^bb7 | |
%85 = llvm.icmp "slt" %84, %14 : i64 | |
llvm.cond_br %85, ^bb7, ^bb8 | |
^bb7: // pred: ^bb6 | |
%86 = llvm.add %44, %84 : i64 | |
%87 = llvm.getelementptr %42[%86] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32> | |
%88 = llvm.load %87 : !llvm.ptr<i32> | |
%89 = llvm.getelementptr %83[%86] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32> | |
llvm.store %88, %89 : !llvm.ptr<i32> | |
%90 = llvm.add %84, %15 : i64 | |
llvm.br ^bb6(%90 : i64) | |
^bb8: // pred: ^bb6 | |
%91 = llvm.bitcast %83 : !llvm.ptr<i32> to !llvm.ptr<vector<4xi32>> | |
%92 = llvm.load %91 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>> | |
llvm.br ^bb9(%12, %92 : i64, vector<4xi32>) | |
^bb9(%93: i64, %94: vector<4xi32>): // 2 preds: ^bb8, ^bb10 | |
%95 = llvm.icmp "slt" %93, %16 : i64 | |
llvm.cond_br %95, ^bb10, ^bb11 | |
^bb10: // pred: ^bb9 | |
%96 = llvm.mul %12, %6 : i64 | |
%97 = llvm.add %78, %96 : i64 | |
%98 = llvm.add %97, %96 : i64 | |
%99 = llvm.mul %93, %14 : i64 | |
%100 = llvm.add %98, %99 : i64 | |
%101 = llvm.add %100, %12 : i64 | |
%102 = llvm.getelementptr %20[%101] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32> | |
%103 = llvm.bitcast %102 : !llvm.ptr<i32> to !llvm.ptr<vector<4xi32>> | |
%104 = llvm.load %103 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>> | |
%105 = llvm.add %104, %94 : vector<4xi32> | |
%106 = llvm.add %93, %15 : i64 | |
llvm.br ^bb9(%106, %105 : i64, vector<4xi32>) | |
^bb11: // pred: ^bb9 | |
llvm.store %94, %91 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>> | |
%107 = llvm.load %91 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>> | |
%108 = llvm.getelementptr %28[%82] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32> | |
%109 = llvm.load %108 : !llvm.ptr<i32> | |
%110 = llvm.mlir.undef : vector<1xi32> | |
%111 = llvm.insertelement %109, %110[%0 : i32] : vector<1xi32> | |
%112 = llvm.extractelement %111[%12 : i64] : vector<1xi32> | |
%113 = "llvm.intr.vector.reduce.add"(%107) : (vector<4xi32>) -> i32 | |
%114 = llvm.add %112, %113 : i32 | |
%115 = llvm.insertelement %114, %13[%12 : i64] : vector<1xi32> | |
%116 = llvm.extractelement %115[%10 : i64] : vector<1xi32> | |
%117 = llvm.insertelement %116, %110[%0 : i32] : vector<1xi32> | |
%118 = llvm.extractelement %117[%12 : i64] : vector<1xi32> | |
llvm.store %118, %108 : !llvm.ptr<i32> | |
%119 = llvm.add %72, %15 : i64 | |
llvm.br ^bb4(%119 : i64) | |
^bb12: // pred: ^bb4 | |
%120 = llvm.add %60, %14 : i64 | |
llvm.br ^bb2(%120 : i64) | |
^bb13: // pred: ^bb2 | |
%121 = llvm.add %58, %15 : i64 | |
llvm.br ^bb1(%121 : i64) | |
^bb14: // pred: ^bb1 | |
llvm.return %0 : i32 | |
} | |
} | |
} | |
} | |
func.func private @_split_reduction_pass2() { | |
%c1 = arith.constant 1 : index | |
%c256 = arith.constant 256 : index | |
%c512 = arith.constant 512 : index | |
%c128_i32 = arith.constant 128 : i32 | |
%c1_i32 = arith.constant 1 : i32 | |
%c67108864 = arith.constant 67108864 : index | |
%c524288 = arith.constant 524288 : index | |
%c0 = arith.constant 0 : index | |
%c1048576 = arith.constant 1048576 : index | |
%c-1_i64 = arith.constant -1 : i64 | |
%c-1_i32 = arith.constant -1 : i32 | |
%c16 = arith.constant 16 : index | |
%c8 = arith.constant 8 : index | |
%c268435488_i32 = arith.constant 268435488 : i32 | |
%_device_query_0 = util.global.load @_device_query_0 : i1 | |
%_pipeline_layout_0 = util.global.load @_pipeline_layout_0 : !hal.pipeline_layout | |
%_executable__split_reduction_pass2_dispatch_0 = util.global.load @_executable__split_reduction_pass2_dispatch_0 : !hal.executable | |
%device = hal.ex.shared_device : !hal.device | |
%allocator = hal.device.allocator<%device : !hal.device> : !hal.allocator | |
%buffer = hal.allocator.allocate<%allocator : !hal.allocator> type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c67108864} | |
%cmd = hal.command_buffer.create device(%device : !hal.device) mode("OneShot|AllowInlineExecution") categories(Transfer) : !hal.command_buffer | |
hal.command_buffer.fill_buffer<%cmd : !hal.command_buffer> target(%buffer : !hal.buffer)[%c0, %c67108864] pattern(%c1_i32 : i32) | |
hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") | |
hal.command_buffer.finalize<%cmd : !hal.command_buffer> | |
%0 = util.null : !hal.fence | |
%fence = hal.fence.create device(%device : !hal.device) flags("None") : !hal.fence | |
hal.device.queue.execute<%device : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) commands([%cmd]) | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32 | |
util.status.check_ok %status, "failed to wait on timepoint" | |
%1 = util.do_not_optimize(%buffer) : !hal.buffer | |
%len = hal.buffer.length<%1 : !hal.buffer> : index | |
%buffer_0 = hal.allocator.allocate<%allocator : !hal.allocator> type("HostVisible|DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage|MappingScoped|MappingAccessRandom|Mapping") : !hal.buffer{%c1048576} | |
%cmd_1 = hal.command_buffer.create device(%device : !hal.device) mode("OneShot|AllowInlineExecution") categories("Transfer|Dispatch") : !hal.command_buffer | |
cf.cond_br %_device_query_0, ^bb1, ^bb2 | |
^bb1: // pred: ^bb0 | |
hal.command_buffer.push_descriptor_set<%cmd_1 : !hal.command_buffer> layout(%_pipeline_layout_0 : !hal.pipeline_layout)[%c0] bindings([ | |
%c0 = (%1 : !hal.buffer)[%c0, %len], | |
%c1 = (%buffer_0 : !hal.buffer)[%c0, %c1048576] | |
]) | |
hal.command_buffer.dispatch<%cmd_1 : !hal.command_buffer> target(%_executable__split_reduction_pass2_dispatch_0 : !hal.executable)[0] workgroups([%c8, %c16, %c1]) | |
hal.command_buffer.fill_buffer<%cmd_1 : !hal.command_buffer> target(%buffer_0 : !hal.buffer)[%c524288, %c524288] pattern(%c128_i32 : i32) | |
hal.command_buffer.execution_barrier<%cmd_1 : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") | |
hal.command_buffer.finalize<%cmd_1 : !hal.command_buffer> | |
%fence_2 = hal.fence.create device(%device : !hal.device) flags("None") : !hal.fence | |
hal.device.queue.execute<%device : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence_2) commands([%cmd_1]) | |
%status_3 = hal.fence.await until([%fence_2]) timeout_millis(%c-1_i32) : i32 | |
util.status.check_ok %status_3, "failed to wait on timepoint" | |
%view = hal.buffer_view.create buffer(%buffer_0 : !hal.buffer)[%c0, %c524288] shape([%c512, %c256]) type(%c268435488_i32) encoding(%c1_i32) : !hal.buffer_view | |
%view_4 = hal.buffer_view.create buffer(%buffer_0 : !hal.buffer)[%c524288, %c524288] shape([%c512, %c256]) type(%c268435488_i32) encoding(%c1_i32) : !hal.buffer_view | |
check.expect_eq(%view, %view_4) : !hal.buffer_view | |
return | |
^bb2: // pred: ^bb0 | |
util.unreachable "device not supported in the compiled configuration" | |
} | |
} | |
// -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) ('util.initializer' operation) //----- // | |
// Attribute aliases shared by the module that follows.
// Device target: "llvm-cpu" with a single executable target
// ("embedded-elf-x86_64"); also carries the `legacy_sync` entry.
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}> | |
// Executable target with the same fields as the entry nested in the device
// target above; referenced by the hal.executable.variant in the module.
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
// Pipeline layout: zero push constants and one descriptor set (0) holding
// binding 0 (storage_buffer, ReadOnly) and binding 1 (storage_buffer).
#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]> | |
// Translation info attached to the executable export (CPUDoubleTilingExpert).
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert> | |
module attributes {hal.device.targets = [#device_target_llvm_cpu]} { | |
// Module-private globals. All four are written by the util.initializer in this
// module; @_device_query_0, @_pipeline_layout_0 and
// @_executable__split_reduction_pass2_dispatch_0 are loaded again by
// @_split_reduction_pass2.
// Result of the "hal.executable.format" :: "embedded-elf-x86_64" device query.
util.global private @_device_query_0 : i1 | |
// Descriptor set layout created by the initializer.
util.global private @_descriptor_set_layout_0 : !hal.descriptor_set_layout | |
// Pipeline layout built from the descriptor set layout above.
util.global private @_pipeline_layout_0 : !hal.pipeline_layout | |
// Executable handle; the initializer stores util.null here when the device
// query is false.
util.global private @_executable__split_reduction_pass2_dispatch_0 : !hal.executable | |
// Module initializer: queries the shared device for embedded-elf-x86_64
// support, creates the descriptor-set and pipeline layouts, publishes them to
// the module globals, and then stores either a freshly created executable or a
// null handle into @_executable__split_reduction_pass2_dispatch_0.
util.initializer { | |
%device = hal.ex.shared_device : !hal.device | |
// Only %value is consumed below; %ok has no uses in this initializer.
// NOTE(review): the trailing `= false` presumably is the default for %value
// when the query key is unavailable -- confirm against the HAL dialect docs.
%ok, %value = hal.device.query<%device : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false | |
// Two bindings, matching #pipeline_layout: 0 = read-only storage buffer,
// 1 = storage buffer.
%descriptor_set_layout = hal.descriptor_set_layout.create device(%device : !hal.device) flags("None") bindings([#hal.descriptor_set.binding<0, storage_buffer, ReadOnly>, #hal.descriptor_set.binding<1, storage_buffer>]) : !hal.descriptor_set_layout | |
%pipeline_layout = hal.pipeline_layout.create device(%device : !hal.device) push_constants(0) layouts([%descriptor_set_layout]) : !hal.pipeline_layout | |
// Publish the query result and both layouts before branching.
util.global.store %descriptor_set_layout, @_descriptor_set_layout_0 : !hal.descriptor_set_layout | |
util.global.store %value, @_device_query_0 : i1 | |
util.global.store %pipeline_layout, @_pipeline_layout_0 : !hal.pipeline_layout | |
cf.cond_br %value, ^bb1, ^bb2 | |
// Supported path: create the executable for the embedded_elf_x86_64 variant.
^bb1: // pred: ^bb0 | |
// Reloads the global that was stored a few ops above instead of reusing
// %pipeline_layout directly.
%_pipeline_layout_0 = util.global.load @_pipeline_layout_0 : !hal.pipeline_layout | |
%exe = hal.executable.create device(%device : !hal.device) target(@_split_reduction_pass2_dispatch_0::@embedded_elf_x86_64) layouts([%_pipeline_layout_0]) : !hal.executable | |
cf.br ^bb3(%exe : !hal.executable) | |
// Unsupported path: forward a null executable handle.
^bb2: // pred: ^bb0 | |
%0 = util.null : !hal.executable | |
cf.br ^bb3(%0 : !hal.executable) | |
// Join point: %1 is the created executable (from ^bb1) or null (from ^bb2).
^bb3(%1: !hal.executable): // 2 preds: ^bb1, ^bb2 | |
util.global.store %1, @_executable__split_reduction_pass2_dispatch_0 : !hal.executable | |
util.initializer.return | |
} | |
// Public entry point (tagged iree.abi.stub). Takes no arguments, returns no
// results, and only forwards to the private @_split_reduction_pass2.
func.func @split_reduction_pass2() attributes {iree.abi.stub} { | |
call @_split_reduction_pass2() : () -> () | |
return | |
} | |
// Executable holding the single dispatch of this module: one variant
// (@embedded_elf_x86_64) with one export (ordinal 0) whose body is the
// LLVM-dialect function of the same name in the nested builtin.module.
hal.executable private @_split_reduction_pass2_dispatch_0 { | |
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ { | |
// Export region: ignores its block arguments and returns the constant
// triple (8, 16, 1). The host function dispatches with the same
// workgroups([%c8, %c16, %c1]) values.
hal.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 ordinal(0) layout(#pipeline_layout) attributes {translation_info = #translation} { | |
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index): | |
%c1 = arith.constant 1 : index | |
%c16 = arith.constant 16 : index | |
%c8 = arith.constant 8 : index | |
hal.return %c8, %c16, %c1 : index, index, index | |
} | |
builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} { | |
// Kernel body. Arguments are pointers to the environment (%arg0, unused
// here), dispatch state (%arg1) and workgroup state (%arg2) structs.
// Always returns the i32 constant 0.
// Per invocation it walks a 32 x 32 tile: for each tile element it sums
// 32 vector<4xi32> loads (128 consecutive i32 values) from the first
// pointer, horizontally reduces them, and adds the result to the i32
// already stored at the matching position behind the second pointer.
llvm.func internal @_split_reduction_pass2_dispatch_0_generic_512x256x128(%arg0: !llvm.ptr<struct<"iree_hal_executable_environment_v0_t", (ptr<i32>, ptr<func<i32 (ptr<func<i32 (ptr<i8>)>>, ptr<i8>)>>, ptr<ptr<func<i32 (ptr<i8>)>>>, struct<"iree_hal_processor_v0_t", (array<8 x i64>)>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>> {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {sym_visibility = "private"} { | |
// Constants. Several values appear twice (once typed from i64, once from
// index); both forms are i64 after lowering.
%0 = llvm.mlir.constant(0 : i32) : i32 | |
%1 = llvm.mlir.constant(32768 : i64) : i64 | |
%2 = llvm.mlir.constant(128 : i64) : i64 | |
%3 = llvm.mlir.constant(256 : i64) : i64 | |
%4 = llvm.mlir.constant(1 : i64) : i64 | |
%5 = llvm.mlir.constant(63 : index) : i64 | |
%6 = llvm.mlir.constant(128 : index) : i64 | |
%7 = llvm.mlir.constant(256 : index) : i64 | |
%8 = llvm.mlir.constant(32768 : index) : i64 | |
%9 = llvm.mlir.constant(512 : index) : i64 | |
%10 = llvm.mlir.constant(0 : i64) : i64 | |
%11 = llvm.mlir.constant(dense<0> : vector<4xi32>) : vector<4xi32> | |
%12 = llvm.mlir.constant(0 : index) : i64 | |
%13 = llvm.mlir.constant(dense<0> : vector<1xi32>) : vector<1xi32> | |
%14 = llvm.mlir.constant(4 : index) : i64 | |
%15 = llvm.mlir.constant(1 : index) : i64 | |
%16 = llvm.mlir.constant(32 : index) : i64 | |
// %20: entry 0 of the pointer table in dispatch-state field 10, viewed as
// i32*. The assume states that its low 6 bits are zero (64-byte aligned).
// NOTE(review): field 10 presumably is the binding pointer table, which
// would make this host binding 0 (the input) -- confirm against the
// iree_hal_executable_dispatch_state_v0_t definition.
%17 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> | |
%18 = llvm.extractvalue %17[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)> | |
%19 = llvm.load %18 : !llvm.ptr<ptr<i8>> | |
%20 = llvm.bitcast %19 : !llvm.ptr<i8> to !llvm.ptr<i32> | |
%21 = llvm.ptrtoint %20 : !llvm.ptr<i32> to i64 | |
%22 = llvm.and %21, %5 : i64 | |
%23 = llvm.icmp "eq" %22, %12 : i64 | |
"llvm.intr.assume"(%23) : (i1) -> () | |
// %28: entry 1 of the same pointer table, viewed as i32*, with the same
// 64-byte alignment assumption. This pointer is both loaded from and
// stored to in ^bb11.
%24 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> | |
%25 = llvm.extractvalue %24[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)> | |
%26 = llvm.getelementptr %25[1] : (!llvm.ptr<ptr<i8>>) -> !llvm.ptr<ptr<i8>> | |
%27 = llvm.load %26 : !llvm.ptr<ptr<i8>> | |
%28 = llvm.bitcast %27 : !llvm.ptr<i8> to !llvm.ptr<i32> | |
%29 = llvm.ptrtoint %28 : !llvm.ptr<i32> to i64 | |
%30 = llvm.and %29, %5 : i64 | |
%31 = llvm.icmp "eq" %30, %12 : i64 | |
"llvm.intr.assume"(%31) : (i1) -> () | |
// Workgroup-state fields 0 and 1, zero-extended and scaled by 32:
// %38 = field0 * 32, %37 = field1 * 32.
// NOTE(review): presumably these are the workgroup ids along x and y --
// confirm against iree_hal_executable_workgroup_state_v0_t.
%32 = llvm.load %arg2 : !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>> | |
%33 = llvm.extractvalue %32[0] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)> | |
%34 = llvm.zext %33 : i32 to i64 | |
%35 = llvm.extractvalue %32[1] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)> | |
%36 = llvm.zext %35 : i32 to i64 | |
%37 = llvm.mul %36, %16 : i64 | |
%38 = llvm.mul %34, %16 : i64 | |
// %41 is the address of element 4 of an i32 array based at null, i.e. the
// byte size of 4 x i32. It is then used as the *element count* of the
// i32 alloca below, so the stack slot holds %41 i32 values while only
// the first four are accessed in this function.
%39 = llvm.mlir.null : !llvm.ptr<i32> | |
%40 = llvm.getelementptr %39[4] : (!llvm.ptr<i32>) -> !llvm.ptr<i32> | |
%41 = llvm.ptrtoint %40 : !llvm.ptr<i32> to i64 | |
%42 = llvm.alloca %41 x i32 {alignment = 128 : i64} : (i64) -> !llvm.ptr<i32> | |
// %43, %44 and %45 are all built from the constant 0, so this stores the
// zero vector<4xi32> at the start of %42 (the accumulator seed).
%43 = llvm.mul %12, %14 : i64 | |
%44 = llvm.add %43, %43 : i64 | |
%45 = llvm.add %44, %12 : i64 | |
%46 = llvm.getelementptr %42[%45] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32> | |
%47 = llvm.bitcast %46 : !llvm.ptr<i32> to !llvm.ptr<vector<4xi32>> | |
llvm.store %11, %47 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>> | |
// Base element offset into %28: %51 = %37 * 256 + %38.
%48 = llvm.mul %37, %7 : i64 | |
%49 = llvm.add %48, %12 : i64 | |
%50 = llvm.mul %38, %15 : i64 | |
%51 = llvm.add %49, %50 : i64 | |
// Base element offset into %20: %57 = %37 * 32768 + %38 * 128
// (%56 is 0 * 1).
%52 = llvm.mul %37, %8 : i64 | |
%53 = llvm.add %52, %12 : i64 | |
%54 = llvm.mul %38, %6 : i64 | |
%55 = llvm.add %53, %54 : i64 | |
%56 = llvm.mul %10, %15 : i64 | |
%57 = llvm.add %55, %56 : i64 | |
llvm.br ^bb1(%12 : i64) | |
// Loop 1: %58 runs from 0 while %58 < 32, incremented by 1 in ^bb13.
^bb1(%58: i64): // 2 preds: ^bb0, ^bb13 | |
%59 = llvm.icmp "slt" %58, %16 : i64 | |
llvm.cond_br %59, ^bb2(%12 : i64), ^bb14 | |
// Loop 2: %60 runs from 0 while %60 < 32, incremented by 4 in ^bb12.
^bb2(%60: i64): // 2 preds: ^bb1, ^bb12 | |
%61 = llvm.icmp "slt" %60, %16 : i64 | |
llvm.cond_br %61, ^bb3, ^bb13 | |
// Offsets for the current (%58, %60) pair:
// %67 = %57 + %58 * 32768 + %60 * 128 (into %20)
// %71 = %51 + %58 * 256 + %60 (into %28)
^bb3: // pred: ^bb2 | |
%62 = llvm.mul %58, %1 : i64 | |
%63 = llvm.add %57, %62 : i64 | |
%64 = llvm.mul %60, %2 : i64 | |
%65 = llvm.add %63, %64 : i64 | |
%66 = llvm.mul %10, %4 : i64 | |
%67 = llvm.add %65, %66 : i64 | |
%68 = llvm.mul %58, %3 : i64 | |
%69 = llvm.add %51, %68 : i64 | |
%70 = llvm.mul %60, %4 : i64 | |
%71 = llvm.add %69, %70 : i64 | |
llvm.br ^bb4(%12 : i64) | |
// Loop 3: %72 runs from 0 while %72 < 4, incremented by 1 in ^bb11.
^bb4(%72: i64): // 2 preds: ^bb3, ^bb11 | |
%73 = llvm.icmp "slt" %72, %14 : i64 | |
llvm.cond_br %73, ^bb5, ^bb12 | |
// %78 = %67 + %72 * 128 (into %20); %82 = %71 + %72 (into %28).
// The terms built from %10 (constant 0) contribute nothing.
^bb5: // pred: ^bb4 | |
%74 = llvm.mul %10, %9 : i64 | |
%75 = llvm.add %67, %74 : i64 | |
%76 = llvm.mul %72, %6 : i64 | |
%77 = llvm.add %75, %76 : i64 | |
%78 = llvm.add %77, %56 : i64 | |
%79 = llvm.mul %10, %14 : i64 | |
%80 = llvm.add %71, %79 : i64 | |
%81 = llvm.mul %72, %15 : i64 | |
%82 = llvm.add %80, %81 : i64 | |
// NOTE(review): this alloca sits inside the loop body and therefore
// executes on every iteration of loops 1-3; whether later LLVM passes
// hoist or promote it is not visible at this stage.
%83 = llvm.alloca %41 x i32 {alignment = 128 : i64} : (i64) -> !llvm.ptr<i32> | |
llvm.br ^bb6(%12 : i64) | |
// Copy loop: %84 runs from 0 while %84 < 4; copies %42[%84] to %83[%84]
// one i32 at a time (%44 is 0), i.e. seeds %83 with the zeros stored in
// %42.
^bb6(%84: i64): // 2 preds: ^bb5, ^bb7 | |
%85 = llvm.icmp "slt" %84, %14 : i64 | |
llvm.cond_br %85, ^bb7, ^bb8 | |
^bb7: // pred: ^bb6 | |
%86 = llvm.add %44, %84 : i64 | |
%87 = llvm.getelementptr %42[%86] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32> | |
%88 = llvm.load %87 : !llvm.ptr<i32> | |
%89 = llvm.getelementptr %83[%86] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32> | |
llvm.store %88, %89 : !llvm.ptr<i32> | |
%90 = llvm.add %84, %15 : i64 | |
llvm.br ^bb6(%90 : i64) | |
// Read the seeded accumulator back as a vector<4xi32>.
^bb8: // pred: ^bb6 | |
%91 = llvm.bitcast %83 : !llvm.ptr<i32> to !llvm.ptr<vector<4xi32>> | |
%92 = llvm.load %91 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>> | |
llvm.br ^bb9(%12, %92 : i64, vector<4xi32>) | |
// Reduction loop: %93 runs from 0 while %93 < 32; %94 carries the running
// vector<4xi32> sum.
^bb9(%93: i64, %94: vector<4xi32>): // 2 preds: ^bb8, ^bb10 | |
%95 = llvm.icmp "slt" %93, %16 : i64 | |
llvm.cond_br %95, ^bb10, ^bb11 | |
// Load 4 consecutive i32 values from %20 at element offset %78 + %93 * 4
// (%96 is 0 * 128) and add them lane-wise to the accumulator.
^bb10: // pred: ^bb9 | |
%96 = llvm.mul %12, %6 : i64 | |
%97 = llvm.add %78, %96 : i64 | |
%98 = llvm.add %97, %96 : i64 | |
%99 = llvm.mul %93, %14 : i64 | |
%100 = llvm.add %98, %99 : i64 | |
%101 = llvm.add %100, %12 : i64 | |
%102 = llvm.getelementptr %20[%101] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32> | |
%103 = llvm.bitcast %102 : !llvm.ptr<i32> to !llvm.ptr<vector<4xi32>> | |
%104 = llvm.load %103 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>> | |
%105 = llvm.add %104, %94 : vector<4xi32> | |
%106 = llvm.add %93, %15 : i64 | |
llvm.br ^bb9(%106, %105 : i64, vector<4xi32>) | |
// Finish one element: spill the vector sum to %83 and reload it, read the
// current i32 at %28[%82], add the horizontal sum of the vector to it,
// and store the result back to %28[%82]. The insertelement /
// extractelement pairs move the scalar through vector<1xi32> values,
// always inserting at index 0 and extracting index 0 again.
^bb11: // pred: ^bb9 | |
llvm.store %94, %91 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>> | |
%107 = llvm.load %91 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>> | |
%108 = llvm.getelementptr %28[%82] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32> | |
// The existing value is loaded before any store to this address in this
// function, so the result includes whatever %28[%82] held on entry.
%109 = llvm.load %108 : !llvm.ptr<i32> | |
%110 = llvm.mlir.undef : vector<1xi32> | |
%111 = llvm.insertelement %109, %110[%0 : i32] : vector<1xi32> | |
%112 = llvm.extractelement %111[%12 : i64] : vector<1xi32> | |
%113 = "llvm.intr.vector.reduce.add"(%107) : (vector<4xi32>) -> i32 | |
%114 = llvm.add %112, %113 : i32 | |
%115 = llvm.insertelement %114, %13[%12 : i64] : vector<1xi32> | |
%116 = llvm.extractelement %115[%10 : i64] : vector<1xi32> | |
%117 = llvm.insertelement %116, %110[%0 : i32] : vector<1xi32> | |
%118 = llvm.extractelement %117[%12 : i64] : vector<1xi32> | |
llvm.store %118, %108 : !llvm.ptr<i32> | |
%119 = llvm.add %72, %15 : i64 | |
llvm.br ^bb4(%119 : i64) | |
// Loop 2 latch: advance %60 by 4.
^bb12: // pred: ^bb4 | |
%120 = llvm.add %60, %14 : i64 | |
llvm.br ^bb2(%120 : i64) | |
// Loop 1 latch: advance %58 by 1.
^bb13: // pred: ^bb2 | |
%121 = llvm.add %58, %15 : i64 | |
llvm.br ^bb1(%121 : i64) | |
// Exit: return the i32 constant 0.
^bb14: // pred: ^bb1 | |
llvm.return %0 : i32 | |
} | |
} | |
} | |
} | |
// Host-side driver for the test. In order it:
//   1. allocates a buffer of size 67108864 and fills it with the i32 pattern 1,
//   2. allocates a host-visible buffer of size 1048576,
//   3. records one command buffer that dispatches export 0 of the executable
//      over both buffers and fills the range [524288, +524288) of the second
//      buffer with the i32 pattern 128,
//   4. submits it, waits, and compares the two 512x256 views covering the first
//      and second 524288-sized ranges of that buffer with check.expect_eq.
// Takes no arguments and returns no results.
func.func private @_split_reduction_pass2() { | |
%c1 = arith.constant 1 : index | |
%c256 = arith.constant 256 : index | |
%c512 = arith.constant 512 : index | |
%c128_i32 = arith.constant 128 : i32 | |
%c1_i32 = arith.constant 1 : i32 | |
// 67108864 = 512 * 256 * 128 * 4; 524288 = 512 * 256 * 4; 1048576 = 2 * 524288.
%c67108864 = arith.constant 67108864 : index | |
%c524288 = arith.constant 524288 : index | |
%c0 = arith.constant 0 : index | |
%c1048576 = arith.constant 1048576 : index | |
%c-1_i64 = arith.constant -1 : i64 | |
%c-1_i32 = arith.constant -1 : i32 | |
%c16 = arith.constant 16 : index | |
%c8 = arith.constant 8 : index | |
// NOTE(review): 268435488 is 0x10000020; presumably the HAL element-type code
// for a 32-bit integer -- confirm against the HAL element type definitions.
%c268435488_i32 = arith.constant 268435488 : i32 | |
// State published by the module's util.initializer.
%_device_query_0 = util.global.load @_device_query_0 : i1 | |
%_pipeline_layout_0 = util.global.load @_pipeline_layout_0 : !hal.pipeline_layout | |
%_executable__split_reduction_pass2_dispatch_0 = util.global.load @_executable__split_reduction_pass2_dispatch_0 : !hal.executable | |
%device = hal.ex.shared_device : !hal.device | |
%allocator = hal.device.allocator<%device : !hal.device> : !hal.allocator | |
// Input buffer: device-visible/device-local, no mapping usage requested.
%buffer = hal.allocator.allocate<%allocator : !hal.allocator> type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c67108864} | |
// First command buffer (Transfer only): fill the whole input buffer with the
// i32 pattern 1, then submit it and block until its fence signals.
%cmd = hal.command_buffer.create device(%device : !hal.device) mode("OneShot|AllowInlineExecution") categories(Transfer) : !hal.command_buffer | |
hal.command_buffer.fill_buffer<%cmd : !hal.command_buffer> target(%buffer : !hal.buffer)[%c0, %c67108864] pattern(%c1_i32 : i32) | |
hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") | |
hal.command_buffer.finalize<%cmd : !hal.command_buffer> | |
// Null fence passed as the wait operand of both queue submissions below.
%0 = util.null : !hal.fence | |
%fence = hal.fence.create device(%device : !hal.device) flags("None") : !hal.fence | |
hal.device.queue.execute<%device : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) commands([%cmd]) | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32 | |
util.status.check_ok %status, "failed to wait on timepoint" | |
// The filled buffer goes through util.do_not_optimize and its length is then
// queried at runtime instead of reusing %c67108864.
%1 = util.do_not_optimize(%buffer) : !hal.buffer | |
%len = hal.buffer.length<%1 : !hal.buffer> : index | |
// Result buffer: host-visible with mapping usage; later viewed as two ranges
// of size 524288 each (dispatch output, then expected values).
%buffer_0 = hal.allocator.allocate<%allocator : !hal.allocator> type("HostVisible|DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage|MappingScoped|MappingAccessRandom|Mapping") : !hal.buffer{%c1048576} | |
%cmd_1 = hal.command_buffer.create device(%device : !hal.device) mode("OneShot|AllowInlineExecution") categories("Transfer|Dispatch") : !hal.command_buffer | |
// Only record and run the dispatch when the device query succeeded.
cf.cond_br %_device_query_0, ^bb1, ^bb2 | |
^bb1: // pred: ^bb0 | |
// Binding 0 = the filled input over [0, %len); binding 1 = the whole result
// buffer over [0, 1048576).
hal.command_buffer.push_descriptor_set<%cmd_1 : !hal.command_buffer> layout(%_pipeline_layout_0 : !hal.pipeline_layout)[%c0] bindings([ | |
%c0 = (%1 : !hal.buffer)[%c0, %len], | |
%c1 = (%buffer_0 : !hal.buffer)[%c0, %c1048576] | |
]) | |
// NOTE(review): no fill or copy into %buffer_0[0, 524288) is recorded in this
// function before the dispatch, while the kernel loads from its second
// pointer before storing to it. If that pointer is binding 1, the compared
// result depends on the initial contents of the allocation -- confirm.
hal.command_buffer.dispatch<%cmd_1 : !hal.command_buffer> target(%_executable__split_reduction_pass2_dispatch_0 : !hal.executable)[0] workgroups([%c8, %c16, %c1]) | |
// Expected values: range [524288, +524288) of the result buffer is filled
// with the i32 pattern 128. No barrier is recorded between the dispatch and
// this fill.
hal.command_buffer.fill_buffer<%cmd_1 : !hal.command_buffer> target(%buffer_0 : !hal.buffer)[%c524288, %c524288] pattern(%c128_i32 : i32) | |
hal.command_buffer.execution_barrier<%cmd_1 : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") | |
hal.command_buffer.finalize<%cmd_1 : !hal.command_buffer> | |
// Submit and block until the second fence signals.
%fence_2 = hal.fence.create device(%device : !hal.device) flags("None") : !hal.fence | |
hal.device.queue.execute<%device : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence_2) commands([%cmd_1]) | |
%status_3 = hal.fence.await until([%fence_2]) timeout_millis(%c-1_i32) : i32 | |
util.status.check_ok %status_3, "failed to wait on timepoint" | |
// %view covers [0, +524288) and %view_4 covers [524288, +524288) of the
// result buffer, both shaped 512x256, and are compared for equality.
%view = hal.buffer_view.create buffer(%buffer_0 : !hal.buffer)[%c0, %c524288] shape([%c512, %c256]) type(%c268435488_i32) encoding(%c1_i32) : !hal.buffer_view | |
%view_4 = hal.buffer_view.create buffer(%buffer_0 : !hal.buffer)[%c524288, %c524288] shape([%c512, %c256]) type(%c268435488_i32) encoding(%c1_i32) : !hal.buffer_view | |
check.expect_eq(%view, %view_4) : !hal.buffer_view | |
return | |
// Reached when the device query stored by the initializer is false.
^bb2: // pred: ^bb0 | |
util.unreachable "device not supported in the compiled configuration" | |
} | |
} | |
// -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) ('func.func' operation: @_split_reduction_pass2) //----- // | |
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}> | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]> | |
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert> | |
module attributes {hal.device.targets = [#device_target_llvm_cpu]} { | |
util.global private @_device_query_0 : i1 | |
util.global private @_descriptor_set_layout_0 : !hal.descriptor_set_layout | |
util.global private @_pipeline_layout_0 : !hal.pipeline_layout | |
util.global private @_executable__split_reduction_pass2_dispatch_0 : !hal.executable | |
util.initializer { | |
%device = hal.ex.shared_device : !hal.device | |
%ok, %value = hal.device.query<%device : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false | |
%descriptor_set_layout = hal.descriptor_set_layout.create device(%device : !hal.device) flags("None") bindings([#hal.descriptor_set.binding<0, storage_buffer, ReadOnly>, #hal.descriptor_set.binding<1, storage_buffer>]) : !hal.descriptor_set_layout | |
%pipeline_layout = hal.pipeline_layout.create device(%device : !hal.device) push_constants(0) layouts([%descriptor_set_layout]) : !hal.pipeline_layout | |
util.global.store %descriptor_set_layout, @_descriptor_set_layout_0 : !hal.descriptor_set_layout | |
util.global.store %value, @_device_query_0 : i1 | |
util.global.store %pipeline_layout, @_pipeline_layout_0 : !hal.pipeline_layout | |
cf.cond_br %value, ^bb1, ^bb2 | |
^bb1: // pred: ^bb0 | |
%_pipeline_layout_0 = util.global.load @_pipeline_layout_0 : !hal.pipeline_layout | |
%exe = hal.executable.create device(%device : !hal.device) target(@_split_reduction_pass2_dispatch_0::@embedded_elf_x86_64) layouts([%_pipeline_layout_0]) : !hal.executable | |
cf.br ^bb3(%exe : !hal.executable) | |
^bb2: // pred: ^bb0 | |
%0 = util.null : !hal.executable | |
cf.br ^bb3(%0 : !hal.executable) | |
^bb3(%1: !hal.executable): // 2 preds: ^bb1, ^bb2 | |
util.global.store %1, @_executable__split_reduction_pass2_dispatch_0 : !hal.executable | |
util.initializer.return | |
} | |
func.func @split_reduction_pass2() attributes {iree.abi.stub} { | |
call @_split_reduction_pass2() : () -> () | |
return | |
} | |
hal.executable private @_split_reduction_pass2_dispatch_0 { | |
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ { | |
hal.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 ordinal(0) layout(#pipeline_layout) attributes {translation_info = #translation} { | |
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index): | |
%c1 = arith.constant 1 : index | |
%c16 = arith.constant 16 : index | |
%c8 = arith.constant 8 : index | |
hal.return %c8, %c16, %c1 : index, index, index | |
} | |
builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} { | |
llvm.func internal @_split_reduction_pass2_dispatch_0_generic_512x256x128(%arg0: !llvm.ptr<struct<"iree_hal_executable_environment_v0_t", (ptr<i32>, ptr<func<i32 (ptr<func<i32 (ptr<i8>)>>, ptr<i8>)>>, ptr<ptr<func<i32 (ptr<i8>)>>>, struct<"iree_hal_processor_v0_t", (array<8 x i64>)>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>> {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {sym_visibility = "private"} { | |
%0 = llvm.mlir.constant(0 : i32) : i32 | |
%1 = llvm.mlir.constant(32768 : i64) : i64 | |
%2 = llvm.mlir.constant(128 : i64) : i64 | |
%3 = llvm.mlir.constant(256 : i64) : i64 | |
%4 = llvm.mlir.constant(1 : i64) : i64 | |
%5 = llvm.mlir.constant(63 : index) : i64 | |
%6 = llvm.mlir.constant(128 : index) : i64 | |
%7 = llvm.mlir.constant(256 : index) : i64 | |
%8 = llvm.mlir.constant(32768 : index) : i64 | |
%9 = llvm.mlir.constant(512 : index) : i64 | |
%10 = llvm.mlir.constant(0 : i64) : i64 | |
%11 = llvm.mlir.constant(dense<0> : vector<4xi32>) : vector<4xi32> | |
%12 = llvm.mlir.constant(0 : index) : i64 | |
%13 = llvm.mlir.constant(dense<0> : vector<1xi32>) : vector<1xi32> | |
%14 = llvm.mlir.constant(4 : index) : i64 | |
%15 = llvm.mlir.constant(1 : index) : i64 | |
%16 = llvm.mlir.constant(32 : index) : i64 | |
%17 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> | |
%18 = llvm.extractvalue %17[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)> | |
%19 = llvm.load %18 : !llvm.ptr<ptr<i8>> | |
%20 = llvm.bitcast %19 : !llvm.ptr<i8> to !llvm.ptr<i32> | |
%21 = llvm.ptrtoint %20 : !llvm.ptr<i32> to i64 | |
%22 = llvm.and %21, %5 : i64 | |
%23 = llvm.icmp "eq" %22, %12 : i64 | |
"llvm.intr.assume"(%23) : (i1) -> () | |
%24 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> | |
%25 = llvm.extractvalue %24[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)> | |
%26 = llvm.getelementptr %25[1] : (!llvm.ptr<ptr<i8>>) -> !llvm.ptr<ptr<i8>> | |
%27 = llvm.load %26 : !llvm.ptr<ptr<i8>> | |
%28 = llvm.bitcast %27 : !llvm.ptr<i8> to !llvm.ptr<i32> | |
%29 = llvm.ptrtoint %28 : !llvm.ptr<i32> to i64 | |
%30 = llvm.and %29, %5 : i64 | |
%31 = llvm.icmp "eq" %30, %12 : i64 | |
"llvm.intr.assume"(%31) : (i1) -> () | |
%32 = llvm.load %arg2 : !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>> | |
%33 = llvm.extractvalue %32[0] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)> | |
%34 = llvm.zext %33 : i32 to i64 | |
%35 = llvm.extractvalue %32[1] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)> | |
%36 = llvm.zext %35 : i32 to i64 | |
%37 = llvm.mul %36, %16 : i64 | |
%38 = llvm.mul %34, %16 : i64 | |
%39 = llvm.mlir.null : !llvm.ptr<i32> | |
%40 = llvm.getelementptr %39[4] : (!llvm.ptr<i32>) -> !llvm.ptr<i32> | |
%41 = llvm.ptrtoint %40 : !llvm.ptr<i32> to i64 | |
%42 = llvm.alloca %41 x i32 {alignment = 128 : i64} : (i64) -> !llvm.ptr<i32> | |
%43 = llvm.mul %12, %14 : i64 | |
%44 = llvm.add %43, %43 : i64 | |
%45 = llvm.add %44, %12 : i64 | |
%46 = llvm.getelementptr %42[%45] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32> | |
%47 = llvm.bitcast %46 : !llvm.ptr<i32> to !llvm.ptr<vector<4xi32>> | |
llvm.store %11, %47 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>> | |
%48 = llvm.mul %37, %7 : i64 | |
%49 = llvm.add %48, %12 : i64 | |
%50 = llvm.mul %38, %15 : i64 | |
%51 = llvm.add %49, %50 : i64 | |
%52 = llvm.mul %37, %8 : i64 | |
%53 = llvm.add %52, %12 : i64 | |
%54 = llvm.mul %38, %6 : i64 | |
%55 = llvm.add %53, %54 : i64 | |
%56 = llvm.mul %10, %15 : i64 | |
%57 = llvm.add %55, %56 : i64 | |
llvm.br ^bb1(%12 : i64) | |
^bb1(%58: i64): // 2 preds: ^bb0, ^bb13 | |
%59 = llvm.icmp "slt" %58, %16 : i64 | |
llvm.cond_br %59, ^bb2(%12 : i64), ^bb14 | |
^bb2(%60: i64): // 2 preds: ^bb1, ^bb12 | |
%61 = llvm.icmp "slt" %60, %16 : i64 | |
llvm.cond_br %61, ^bb3, ^bb13 | |
^bb3: // pred: ^bb2 | |
%62 = llvm.mul %58, %1 : i64 | |
%63 = llvm.add %57, %62 : i64 | |
%64 = llvm.mul %60, %2 : i64 | |
%65 = llvm.add %63, %64 : i64 | |
%66 = llvm.mul %10, %4 : i64 | |
%67 = llvm.add %65, %66 : i64 | |
%68 = llvm.mul %58, %3 : i64 | |
%69 = llvm.add %51, %68 : i64 | |
%70 = llvm.mul %60, %4 : i64 | |
%71 = llvm.add %69, %70 : i64 | |
llvm.br ^bb4(%12 : i64) | |
^bb4(%72: i64): // 2 preds: ^bb3, ^bb11 | |
%73 = llvm.icmp "slt" %72, %14 : i64 | |
llvm.cond_br %73, ^bb5, ^bb12 | |
^bb5: // pred: |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment