Skip to content

Instantly share code, notes, and snippets.

@vmurali
Created September 27, 2022 22:57
Show Gist options
  • Save vmurali/645daada9b21c253469018fa50a8a760 to your computer and use it in GitHub Desktop.
Save vmurali/645daada9b21c253469018fa50a8a760 to your computer and use it in GitHub Desktop.
This file has been truncated, but you can view the full file.
// -----// IR Dump After mlir::iree_compiler::IREE::ABI::WrapEntryPointsPass (iree-abi-wrap-entry-points) ('builtin.module' operation) //----- //
#map0 = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1)>
module {
// Public entry point (marked iree.abi.stub); its only action is to call the
// private function below.
func.func @split_reduction_pass2() attributes {iree.abi.stub} {
call @_split_reduction_pass2() : () -> ()
return
}
// Test body: adds up a 512x256x128 tensor of ones along d2 (#map0 -> #map1,
// third iterator is "reduction") and compares the 512x256 result with 128.
func.func private @_split_reduction_pass2() {
%0 = util.unfoldable_constant dense<1> : tensor<512x256x128xi32>
%c0_i32 = arith.constant 0 : i32
%1 = linalg.init_tensor [512, 256] : tensor<512x256xi32>
// NOTE(review): %2 (the zero-filled tensor) is not referenced by any later op
// in this function; the generic below takes the unfilled %1 as its output
// operand. Presumably outs(%2) was intended - confirm against the test source.
%2 = linalg.fill ins(%c0_i32 : i32) outs(%1 : tensor<512x256xi32>) -> tensor<512x256xi32>
%3 = linalg.generic {indexing_maps = [#map0, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%0 : tensor<512x256x128xi32>) outs(%1 : tensor<512x256xi32>) {
^bb0(%arg0: i32, %arg1: i32):
// %arg0 is the input element, %arg1 the current output element (accumulator).
%4 = arith.addi %arg0, %arg1 : i32
linalg.yield %4 : i32
} -> tensor<512x256xi32>
// 128 is the sum of 128 ones along d2, provided the accumulator starts at 0.
check.expect_eq_const(%3, dense<128> : tensor<512x256xi32>) : tensor<512x256xi32>
return
}
}
// -----// IR Dump After Canonicalizer (canonicalize) ('func.func' operation: @_split_reduction_pass2) //----- //
#map0 = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1)>
module {
// Public entry point; forwards to the private test body.
func.func @split_reduction_pass2() attributes {iree.abi.stub} {
call @_split_reduction_pass2() : () -> ()
return
}
// Test body after canonicalization. This dump contains no linalg.fill and no
// zero constant; the reduction's output operand is the bare init_tensor %1.
func.func private @_split_reduction_pass2() {
// Expected value, now a standalone constant compared via check.expect_eq.
%cst = arith.constant dense<128> : tensor<512x256xi32>
// Input of ones; passed through util.do_not_optimize before being used.
%cst_0 = arith.constant dense<1> : tensor<512x256x128xi32>
%0 = util.do_not_optimize(%cst_0) : tensor<512x256x128xi32>
%1 = linalg.init_tensor [512, 256] : tensor<512x256xi32>
%2 = linalg.generic {indexing_maps = [#map0, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%0 : tensor<512x256x128xi32>) outs(%1 : tensor<512x256xi32>) {
^bb0(%arg0: i32, %arg1: i32):
// input element + current output element
%3 = arith.addi %arg0, %arg1 : i32
linalg.yield %3 : i32
} -> tensor<512x256xi32>
check.expect_eq(%2, %cst) : tensor<512x256xi32>
return
}
}
// -----// IR Dump After Inliner (inline) ('builtin.module' operation) //----- //
#map0 = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1)>
module {
// Public entry point; the call to the private function is still present in
// this dump (both functions remain in the module).
func.func @split_reduction_pass2() attributes {iree.abi.stub} {
call @_split_reduction_pass2() : () -> ()
return
}
// Test body: reduce the tensor of ones over d2 and compare with 128.
func.func private @_split_reduction_pass2() {
%cst = arith.constant dense<128> : tensor<512x256xi32>
%cst_0 = arith.constant dense<1> : tensor<512x256x128xi32>
%0 = util.do_not_optimize(%cst_0) : tensor<512x256x128xi32>
// Output operand of the reduction; no fill of %1 appears in this dump.
%1 = linalg.init_tensor [512, 256] : tensor<512x256xi32>
%2 = linalg.generic {indexing_maps = [#map0, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%0 : tensor<512x256x128xi32>) outs(%1 : tensor<512x256xi32>) {
^bb0(%arg0: i32, %arg1: i32):
%3 = arith.addi %arg0, %arg1 : i32
linalg.yield %3 : i32
} -> tensor<512x256xi32>
check.expect_eq(%2, %cst) : tensor<512x256xi32>
return
}
}
// -----// IR Dump After DispatchLinalgOnTensors (iree-flow-dispatch-linalg-on-tensors-pass) ('func.func' operation: @_split_reduction_pass2) //----- //
#map0 = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1)>
module {
// Public entry point; forwards to the private test body.
func.func @split_reduction_pass2() attributes {iree.abi.stub} {
call @_split_reduction_pass2() : () -> ()
return
}
// Test body: the reduction now lives inside a flow.dispatch.workgroups region.
func.func private @_split_reduction_pass2() {
// Workload values handed to the dispatch: [512, 256, 1].
%c512 = arith.constant 512 : index
%c256 = arith.constant 256 : index
%c1 = arith.constant 1 : index
%cst = arith.constant dense<128> : tensor<512x256xi32>
%cst_0 = arith.constant dense<1> : tensor<512x256x128xi32>
%0 = util.do_not_optimize(%cst_0) : tensor<512x256x128xi32>
// Dispatch region: %arg0 is the read-only input, %arg1 the write-only result.
%1 = flow.dispatch.workgroups[%c512, %c256, %c1](%0) : (tensor<512x256x128xi32>) -> tensor<512x256xi32> =
(%arg0: !flow.dispatch.tensor<readonly:512x256x128xi32>, %arg1: !flow.dispatch.tensor<writeonly:512x256xi32>) {
// Load the whole input (zero offsets, full sizes, unit strides).
%2 = flow.dispatch.tensor.load %arg0, offsets = [0, 0, 0], sizes = [512, 256, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:512x256x128xi32> -> tensor<512x256x128xi32>
// Output operand of the reduction; no fill of %3 appears in this region.
%3 = linalg.init_tensor [512, 256] : tensor<512x256xi32>
%4 = linalg.generic {indexing_maps = [#map0, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%2 : tensor<512x256x128xi32>) outs(%3 : tensor<512x256xi32>) {
^bb0(%arg2: i32, %arg3: i32):
%5 = arith.addi %arg2, %arg3 : i32
linalg.yield %5 : i32
} -> tensor<512x256xi32>
// Store the whole 512x256 result into the output binding.
flow.dispatch.tensor.store %4, %arg1, offsets = [0, 0], sizes = [512, 256], strides = [1, 1] : tensor<512x256xi32> -> !flow.dispatch.tensor<writeonly:512x256xi32>
flow.return
// Count region: maps the three workload values to the x/y/z workgroup counts
// through flow.dispatch.workgroup_count_from_dag_root.
} count(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2
flow.return %x, %y, %z : index, index, index
}
check.expect_eq(%1, %cst) : tensor<512x256xi32>
return
}
}
// -----// IR Dump After OutlineDispatchRegions (iree-flow-outline-dispatch-regions) ('builtin.module' operation) //----- //
#map0 = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1)>
module {
// Public entry point; forwards to the private test body.
func.func @split_reduction_pass2() attributes {iree.abi.stub} {
call @_split_reduction_pass2() : () -> ()
return
}
// The dispatch body is now a separate flow.executable with one public export.
flow.executable private @_split_reduction_pass2_dispatch_0 {
// Export: maps the three workload values to the x/y/z workgroup counts.
flow.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2
flow.return %x, %y, %z : index, index, index
}
builtin.module {
// Kernel: load the full input, add along d2, store the 512x256 result.
func.func @_split_reduction_pass2_dispatch_0_generic_512x256x128(%arg0: !flow.dispatch.tensor<readonly:512x256x128xi32>, %arg1: !flow.dispatch.tensor<writeonly:512x256xi32>) {
%0 = flow.dispatch.tensor.load %arg0, offsets = [0, 0, 0], sizes = [512, 256, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:512x256x128xi32> -> tensor<512x256x128xi32>
// Output operand of the reduction; no fill of %1 appears in this kernel.
%1 = linalg.init_tensor [512, 256] : tensor<512x256xi32>
%2 = linalg.generic {indexing_maps = [#map0, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%0 : tensor<512x256x128xi32>) outs(%1 : tensor<512x256xi32>) {
^bb0(%arg2: i32, %arg3: i32):
%3 = arith.addi %arg2, %arg3 : i32
linalg.yield %3 : i32
} -> tensor<512x256xi32>
flow.dispatch.tensor.store %2, %arg1, offsets = [0, 0], sizes = [512, 256], strides = [1, 1] : tensor<512x256xi32> -> !flow.dispatch.tensor<writeonly:512x256xi32>
return
}
}
}
// Host-side test body: invokes the outlined export by symbol and checks the
// result against the splat of 128.
func.func private @_split_reduction_pass2() {
%c512 = arith.constant 512 : index
%c256 = arith.constant 256 : index
%c1 = arith.constant 1 : index
%cst = arith.constant dense<128> : tensor<512x256xi32>
%cst_0 = arith.constant dense<1> : tensor<512x256x128xi32>
%0 = util.do_not_optimize(%cst_0) : tensor<512x256x128xi32>
// Workload is [512, 256, 1]; the single operand is the tensor of ones.
%1 = flow.dispatch @_split_reduction_pass2_dispatch_0::@_split_reduction_pass2_dispatch_0_generic_512x256x128[%c512, %c256, %c1](%0) : (tensor<512x256x128xi32>) -> tensor<512x256xi32>
check.expect_eq(%1, %cst) : tensor<512x256xi32>
return
}
}
// -----// IR Dump After ConvertToStream (iree-stream-conversion) ('builtin.module' operation) //----- //
#map0 = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1)>
module {
// Public entry point; forwards to the private test body.
func.func @split_reduction_pass2() attributes {iree.abi.stub} {
call @_split_reduction_pass2() : () -> ()
return
}
// Executable in the stream dialect; the kernel now takes opaque bindings.
stream.executable private @_split_reduction_pass2_dispatch_0 {
// Export: maps the three workload values to the x/y/z workgroup counts.
stream.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2
stream.return %x, %y, %z : index, index, index
}
builtin.module {
// Kernel: %arg0 is viewed as the read-only input, %arg1 as the write-only
// output, both at offset %c0.
func.func @_split_reduction_pass2_dispatch_0_generic_512x256x128(%arg0: !stream.binding, %arg1: !stream.binding) {
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:512x256x128xi32>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:512x256xi32>
%2 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [512, 256, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:512x256x128xi32> -> tensor<512x256x128xi32>
// Output operand of the reduction; no fill of %3 appears in this kernel.
%3 = linalg.init_tensor [512, 256] : tensor<512x256xi32>
%4 = linalg.generic {indexing_maps = [#map0, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%2 : tensor<512x256x128xi32>) outs(%3 : tensor<512x256xi32>) {
^bb0(%arg2: i32, %arg3: i32):
%5 = arith.addi %arg2, %arg3 : i32
linalg.yield %5 : i32
} -> tensor<512x256xi32>
flow.dispatch.tensor.store %4, %1, offsets = [0, 0], sizes = [512, 256], strides = [1, 1] : tensor<512x256xi32> -> !flow.dispatch.tensor<writeonly:512x256xi32>
return
}
}
}
// Host-side test body expressed on sized stream resources.
func.func private @_split_reduction_pass2() {
%c512 = arith.constant 512 : index
%c256 = arith.constant 256 : index
%c1 = arith.constant 1 : index
// Expected value (128 splat): constant resource, its size, then a transfer to
// the unspecified `*` lifetime.
%cst = stream.tensor.constant : tensor<512x256xi32> in !stream.resource<constant> = dense<128> : tensor<512x256xi32>
%0 = stream.resource.size %cst : !stream.resource<constant>
%1 = stream.async.transfer %cst : !stream.resource<constant>{%0} -> !stream.resource<*>{%0}
// Input (ones): same constant / size / transfer sequence.
%cst_0 = stream.tensor.constant : tensor<512x256x128xi32> in !stream.resource<constant> = dense<1> : tensor<512x256x128xi32>
%2 = stream.resource.size %cst_0 : !stream.resource<constant>
%3 = stream.async.transfer %cst_0 : !stream.resource<constant>{%2} -> !stream.resource<*>{%2}
// The size is re-queried from the do_not_optimize result rather than reusing %2.
%4 = util.do_not_optimize(%3) : !stream.resource<*>
%5 = stream.resource.size %4 : !stream.resource<*>
// %6 is the size used for the dispatch result.
%6 = stream.tensor.sizeof tensor<512x256xi32> : index
%7 = stream.async.dispatch @_split_reduction_pass2_dispatch_0::@_split_reduction_pass2_dispatch_0_generic_512x256x128[%c512, %c256, %c1](%4) : (!stream.resource<*>{%5}) -> !stream.resource<*>{%6}
// Both operands of the check are transferred to `external` and exported back
// to tensors.
%8 = stream.async.transfer %7 : !stream.resource<*>{%6} -> !stream.resource<external>{%6}
%9 = stream.tensor.export %8 : tensor<512x256xi32> in !stream.resource<external>{%6} -> tensor<512x256xi32>
%10 = stream.async.transfer %1 : !stream.resource<*>{%0} -> !stream.resource<external>{%0}
%11 = stream.tensor.export %10 : tensor<512x256xi32> in !stream.resource<external>{%0} -> tensor<512x256xi32>
check.expect_eq(%9, %11) : tensor<512x256xi32>
return
}
}
// -----// IR Dump After Canonicalizer (canonicalize) ('func.func' operation: @_split_reduction_pass2) //----- //
#map0 = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1)>
module {
// Public entry point; forwards to the private test body.
func.func @split_reduction_pass2() attributes {iree.abi.stub} {
call @_split_reduction_pass2() : () -> ()
return
}
// Dispatch executable: one export plus the reduction kernel.
stream.executable private @_split_reduction_pass2_dispatch_0 {
// Export: maps the three workload values to the x/y/z workgroup counts.
stream.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2
stream.return %x, %y, %z : index, index, index
}
builtin.module {
// Kernel: view the bindings as tensors, load the input, add along d2, store.
func.func @_split_reduction_pass2_dispatch_0_generic_512x256x128(%arg0: !stream.binding, %arg1: !stream.binding) {
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:512x256x128xi32>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:512x256xi32>
%2 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [512, 256, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:512x256x128xi32> -> tensor<512x256x128xi32>
// Output operand of the reduction; no fill of %3 appears in this kernel.
%3 = linalg.init_tensor [512, 256] : tensor<512x256xi32>
%4 = linalg.generic {indexing_maps = [#map0, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%2 : tensor<512x256x128xi32>) outs(%3 : tensor<512x256xi32>) {
^bb0(%arg2: i32, %arg3: i32):
%5 = arith.addi %arg2, %arg3 : i32
linalg.yield %5 : i32
} -> tensor<512x256xi32>
flow.dispatch.tensor.store %4, %1, offsets = [0, 0], sizes = [512, 256], strides = [1, 1] : tensor<512x256xi32> -> !flow.dispatch.tensor<writeonly:512x256xi32>
return
}
}
}
// Host-side test body: both tensors are now produced by splatting a scalar
// (1 and 128) into a `*` resource; no stream.tensor.constant remains.
func.func private @_split_reduction_pass2() {
%c1_i32 = arith.constant 1 : i32
%c128_i32 = arith.constant 128 : i32
%c512 = arith.constant 512 : index
%c256 = arith.constant 256 : index
%c1 = arith.constant 1 : index
// Expected value: 128 splatted over tensor<512x256xi32>.
%0 = stream.tensor.sizeof tensor<512x256xi32> : index
%1 = stream.tensor.splat %c128_i32 : i32 -> tensor<512x256xi32> in !stream.resource<*>{%0}
// Input: 1 splatted over tensor<512x256x128xi32>.
%2 = stream.tensor.sizeof tensor<512x256x128xi32> : index
%3 = stream.tensor.splat %c1_i32 : i32 -> tensor<512x256x128xi32> in !stream.resource<*>{%2}
%4 = util.do_not_optimize(%3) : !stream.resource<*>
%5 = stream.resource.size %4 : !stream.resource<*>
// %6 is the same sizeof expression as %0.
%6 = stream.tensor.sizeof tensor<512x256xi32> : index
%7 = stream.async.dispatch @_split_reduction_pass2_dispatch_0::@_split_reduction_pass2_dispatch_0_generic_512x256x128[%c512, %c256, %c1](%4) : (!stream.resource<*>{%5}) -> !stream.resource<*>{%6}
%8 = stream.async.transfer %7 : !stream.resource<*>{%6} -> !stream.resource<external>{%6}
%9 = stream.tensor.export %8 : tensor<512x256xi32> in !stream.resource<external>{%6} -> tensor<512x256xi32>
%10 = stream.async.transfer %1 : !stream.resource<*>{%0} -> !stream.resource<external>{%0}
%11 = stream.tensor.export %10 : tensor<512x256xi32> in !stream.resource<external>{%0} -> tensor<512x256xi32>
check.expect_eq(%9, %11) : tensor<512x256xi32>
return
}
}
// -----// IR Dump After CSE (cse) ('func.func' operation: @_split_reduction_pass2) //----- //
#map0 = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1)>
module {
// Public entry point; forwards to the private test body.
func.func @split_reduction_pass2() attributes {iree.abi.stub} {
call @_split_reduction_pass2() : () -> ()
return
}
// Dispatch executable: one export plus the reduction kernel.
stream.executable private @_split_reduction_pass2_dispatch_0 {
// Export: maps the three workload values to the x/y/z workgroup counts.
stream.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2
stream.return %x, %y, %z : index, index, index
}
builtin.module {
// Kernel: view the bindings as tensors, load the input, add along d2, store.
func.func @_split_reduction_pass2_dispatch_0_generic_512x256x128(%arg0: !stream.binding, %arg1: !stream.binding) {
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:512x256x128xi32>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:512x256xi32>
%2 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [512, 256, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:512x256x128xi32> -> tensor<512x256x128xi32>
// Output operand of the reduction; no fill of %3 appears in this kernel.
%3 = linalg.init_tensor [512, 256] : tensor<512x256xi32>
%4 = linalg.generic {indexing_maps = [#map0, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%2 : tensor<512x256x128xi32>) outs(%3 : tensor<512x256xi32>) {
^bb0(%arg2: i32, %arg3: i32):
%5 = arith.addi %arg2, %arg3 : i32
linalg.yield %5 : i32
} -> tensor<512x256xi32>
flow.dispatch.tensor.store %4, %1, offsets = [0, 0], sizes = [512, 256], strides = [1, 1] : tensor<512x256xi32> -> !flow.dispatch.tensor<writeonly:512x256xi32>
return
}
}
}
// Host-side test body: a single sizeof of tensor<512x256xi32> (%0) now sizes
// both the expected splat and the dispatch result.
func.func private @_split_reduction_pass2() {
%c1_i32 = arith.constant 1 : i32
%c128_i32 = arith.constant 128 : i32
%c512 = arith.constant 512 : index
%c256 = arith.constant 256 : index
%c1 = arith.constant 1 : index
%0 = stream.tensor.sizeof tensor<512x256xi32> : index
// Expected value: 128 splat.
%1 = stream.tensor.splat %c128_i32 : i32 -> tensor<512x256xi32> in !stream.resource<*>{%0}
%2 = stream.tensor.sizeof tensor<512x256x128xi32> : index
// Input: splat of 1.
%3 = stream.tensor.splat %c1_i32 : i32 -> tensor<512x256x128xi32> in !stream.resource<*>{%2}
%4 = util.do_not_optimize(%3) : !stream.resource<*>
%5 = stream.resource.size %4 : !stream.resource<*>
%6 = stream.async.dispatch @_split_reduction_pass2_dispatch_0::@_split_reduction_pass2_dispatch_0_generic_512x256x128[%c512, %c256, %c1](%4) : (!stream.resource<*>{%5}) -> !stream.resource<*>{%0}
// Both operands of the check are transferred to `external` and exported.
%7 = stream.async.transfer %6 : !stream.resource<*>{%0} -> !stream.resource<external>{%0}
%8 = stream.tensor.export %7 : tensor<512x256xi32> in !stream.resource<external>{%0} -> tensor<512x256xi32>
%9 = stream.async.transfer %1 : !stream.resource<*>{%0} -> !stream.resource<external>{%0}
%10 = stream.tensor.export %9 : tensor<512x256xi32> in !stream.resource<external>{%0} -> tensor<512x256xi32>
check.expect_eq(%8, %10) : tensor<512x256xi32>
return
}
}
// -----// IR Dump After EncodeHostTensors (iree-stream-encode-host-tensors) ('func.func' operation: @_split_reduction_pass2) //----- //
#map0 = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1)>
module {
// Public entry point; forwards to the private test body.
func.func @split_reduction_pass2() attributes {iree.abi.stub} {
call @_split_reduction_pass2() : () -> ()
return
}
// Dispatch executable: one export plus the reduction kernel.
stream.executable private @_split_reduction_pass2_dispatch_0 {
// Export: maps the three workload values to the x/y/z workgroup counts.
stream.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2
stream.return %x, %y, %z : index, index, index
}
builtin.module {
// Kernel: view the bindings as tensors, load the input, add along d2, store.
func.func @_split_reduction_pass2_dispatch_0_generic_512x256x128(%arg0: !stream.binding, %arg1: !stream.binding) {
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:512x256x128xi32>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:512x256xi32>
%2 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [512, 256, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:512x256x128xi32> -> tensor<512x256x128xi32>
// Output operand of the reduction; no fill of %3 appears in this kernel.
%3 = linalg.init_tensor [512, 256] : tensor<512x256xi32>
%4 = linalg.generic {indexing_maps = [#map0, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%2 : tensor<512x256x128xi32>) outs(%3 : tensor<512x256xi32>) {
^bb0(%arg2: i32, %arg3: i32):
%5 = arith.addi %arg2, %arg3 : i32
linalg.yield %5 : i32
} -> tensor<512x256xi32>
flow.dispatch.tensor.store %4, %1, offsets = [0, 0], sizes = [512, 256], strides = [1, 1] : tensor<512x256xi32> -> !flow.dispatch.tensor<writeonly:512x256xi32>
return
}
}
}
// Host-side test body: resource sizes are now plain index constants and the
// splats are stream.async.splat ops with no tensor type attached.
func.func private @_split_reduction_pass2() {
// 524288 = 512*256*4 and 67108864 = 512*256*128*4, which matches the two
// tensor shapes at 4 bytes per i32 element.
%c524288 = arith.constant 524288 : index
%c67108864 = arith.constant 67108864 : index
%c1_i32 = arith.constant 1 : i32
%c128_i32 = arith.constant 128 : i32
%c512 = arith.constant 512 : index
%c256 = arith.constant 256 : index
%c1 = arith.constant 1 : index
// %0: expected value (128 splat); %1: input (splat of 1).
%0 = stream.async.splat %c128_i32 : i32 -> !stream.resource<*>{%c524288}
%1 = stream.async.splat %c1_i32 : i32 -> !stream.resource<*>{%c67108864}
%2 = util.do_not_optimize(%1) : !stream.resource<*>
// Operand size is still queried at runtime from the do_not_optimize result.
%3 = stream.resource.size %2 : !stream.resource<*>
%4 = stream.async.dispatch @_split_reduction_pass2_dispatch_0::@_split_reduction_pass2_dispatch_0_generic_512x256x128[%c512, %c256, %c1](%2) : (!stream.resource<*>{%3}) -> !stream.resource<*>{%c524288}
%5 = stream.async.transfer %4 : !stream.resource<*>{%c524288} -> !stream.resource<external>{%c524288}
%6 = stream.tensor.export %5 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
%7 = stream.async.transfer %0 : !stream.resource<*>{%c524288} -> !stream.resource<external>{%c524288}
%8 = stream.tensor.export %7 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
check.expect_eq(%6, %8) : tensor<512x256xi32>
return
}
}
// -----// IR Dump After Canonicalizer (canonicalize) ('func.func' operation: @_split_reduction_pass2) //----- //
#map0 = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1)>
module {
// Public entry point; forwards to the private test body.
func.func @split_reduction_pass2() attributes {iree.abi.stub} {
call @_split_reduction_pass2() : () -> ()
return
}
// Dispatch executable: one export plus the reduction kernel.
stream.executable private @_split_reduction_pass2_dispatch_0 {
// Export: maps the three workload values to the x/y/z workgroup counts.
stream.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2
stream.return %x, %y, %z : index, index, index
}
builtin.module {
// Kernel: view the bindings as tensors, load the input, add along d2, store.
func.func @_split_reduction_pass2_dispatch_0_generic_512x256x128(%arg0: !stream.binding, %arg1: !stream.binding) {
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:512x256x128xi32>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:512x256xi32>
%2 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [512, 256, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:512x256x128xi32> -> tensor<512x256x128xi32>
// Output operand of the reduction; no fill of %3 appears in this kernel.
%3 = linalg.init_tensor [512, 256] : tensor<512x256xi32>
%4 = linalg.generic {indexing_maps = [#map0, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%2 : tensor<512x256x128xi32>) outs(%3 : tensor<512x256xi32>) {
^bb0(%arg2: i32, %arg3: i32):
%5 = arith.addi %arg2, %arg3 : i32
linalg.yield %5 : i32
} -> tensor<512x256xi32>
flow.dispatch.tensor.store %4, %1, offsets = [0, 0], sizes = [512, 256], strides = [1, 1] : tensor<512x256xi32> -> !flow.dispatch.tensor<writeonly:512x256xi32>
return
}
}
}
// Host-side test body: in this dump the 128 splat is created after the
// dispatch result has been exported, immediately before its own transfer.
func.func private @_split_reduction_pass2() {
%c524288 = arith.constant 524288 : index
%c67108864 = arith.constant 67108864 : index
%c1_i32 = arith.constant 1 : i32
%c128_i32 = arith.constant 128 : i32
%c512 = arith.constant 512 : index
%c256 = arith.constant 256 : index
%c1 = arith.constant 1 : index
// Input: splat of 1 into a 67108864-byte resource.
%0 = stream.async.splat %c1_i32 : i32 -> !stream.resource<*>{%c67108864}
%1 = util.do_not_optimize(%0) : !stream.resource<*>
%2 = stream.resource.size %1 : !stream.resource<*>
%3 = stream.async.dispatch @_split_reduction_pass2_dispatch_0::@_split_reduction_pass2_dispatch_0_generic_512x256x128[%c512, %c256, %c1](%1) : (!stream.resource<*>{%2}) -> !stream.resource<*>{%c524288}
%4 = stream.async.transfer %3 : !stream.resource<*>{%c524288} -> !stream.resource<external>{%c524288}
%5 = stream.tensor.export %4 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
// Expected value: 128 splat, transferred to `external` and exported.
%6 = stream.async.splat %c128_i32 : i32 -> !stream.resource<*>{%c524288}
%7 = stream.async.transfer %6 : !stream.resource<*>{%c524288} -> !stream.resource<external>{%c524288}
%8 = stream.tensor.export %7 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
check.expect_eq(%5, %8) : tensor<512x256xi32>
return
}
}
// -----// IR Dump After RefineUsage (iree-stream-refine-usage) ('builtin.module' operation) //----- //
#map0 = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1)>
module {
// Public entry point; forwards to the private test body.
func.func @split_reduction_pass2() attributes {iree.abi.stub} {
call @_split_reduction_pass2() : () -> ()
return
}
// Dispatch executable: one export plus the reduction kernel.
stream.executable private @_split_reduction_pass2_dispatch_0 {
// Export: maps the three workload values to the x/y/z workgroup counts.
stream.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2
stream.return %x, %y, %z : index, index, index
}
builtin.module {
// Kernel: view the bindings as tensors, load the input, add along d2, store.
func.func @_split_reduction_pass2_dispatch_0_generic_512x256x128(%arg0: !stream.binding, %arg1: !stream.binding) {
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:512x256x128xi32>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:512x256xi32>
%2 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [512, 256, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:512x256x128xi32> -> tensor<512x256x128xi32>
// Output operand of the reduction; no fill of %3 appears in this kernel.
%3 = linalg.init_tensor [512, 256] : tensor<512x256xi32>
%4 = linalg.generic {indexing_maps = [#map0, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%2 : tensor<512x256x128xi32>) outs(%3 : tensor<512x256xi32>) {
^bb0(%arg2: i32, %arg3: i32):
%5 = arith.addi %arg2, %arg3 : i32
linalg.yield %5 : i32
} -> tensor<512x256xi32>
flow.dispatch.tensor.store %4, %1, offsets = [0, 0], sizes = [512, 256], strides = [1, 1] : tensor<512x256xi32> -> !flow.dispatch.tensor<writeonly:512x256xi32>
return
}
}
}
// Host-side test body: every resource now has a concrete lifetime (`transient`
// for the input, `external` for the two compared values) and no
// stream.async.transfer op remains.
func.func private @_split_reduction_pass2() {
%c524288 = arith.constant 524288 : index
%c67108864 = arith.constant 67108864 : index
%c1_i32 = arith.constant 1 : i32
%c128_i32 = arith.constant 128 : i32
%c512 = arith.constant 512 : index
%c256 = arith.constant 256 : index
%c1 = arith.constant 1 : index
// Input: splat of 1 into a transient resource.
%0 = stream.async.splat %c1_i32 : i32 -> !stream.resource<transient>{%c67108864}
%1 = util.do_not_optimize(%0) : !stream.resource<transient>
%2 = stream.resource.size %1 : !stream.resource<transient>
// The dispatch writes straight into an external resource, which is exported.
%3 = stream.async.dispatch @_split_reduction_pass2_dispatch_0::@_split_reduction_pass2_dispatch_0_generic_512x256x128[%c512, %c256, %c1](%1) : (!stream.resource<transient>{%2}) -> !stream.resource<external>{%c524288}
%4 = stream.tensor.export %3 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
// Expected value: 128 splat created directly as an external resource.
%5 = stream.async.splat %c128_i32 : i32 -> !stream.resource<external>{%c524288}
%6 = stream.tensor.export %5 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
check.expect_eq(%4, %6) : tensor<512x256xi32>
return
}
}
// -----// IR Dump After ScheduleExecution (iree-stream-schedule-execution) ('func.func' operation: @_split_reduction_pass2) //----- //
#map0 = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1)>
module {
// Public entry point; forwards to the private test body.
func.func @split_reduction_pass2() attributes {iree.abi.stub} {
call @_split_reduction_pass2() : () -> ()
return
}
// Dispatch executable: one export plus the reduction kernel.
stream.executable private @_split_reduction_pass2_dispatch_0 {
// Export: maps the three workload values to the x/y/z workgroup counts.
stream.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2
stream.return %x, %y, %z : index, index, index
}
builtin.module {
// Kernel: view the bindings as tensors, load the input, add along d2, store.
func.func @_split_reduction_pass2_dispatch_0_generic_512x256x128(%arg0: !stream.binding, %arg1: !stream.binding) {
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:512x256x128xi32>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:512x256xi32>
%2 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [512, 256, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:512x256x128xi32> -> tensor<512x256x128xi32>
// Output operand of the reduction; no fill of %3 appears in this kernel.
%3 = linalg.init_tensor [512, 256] : tensor<512x256xi32>
%4 = linalg.generic {indexing_maps = [#map0, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%2 : tensor<512x256x128xi32>) outs(%3 : tensor<512x256xi32>) {
^bb0(%arg2: i32, %arg3: i32):
%5 = arith.addi %arg2, %arg3 : i32
linalg.yield %5 : i32
} -> tensor<512x256xi32>
flow.dispatch.tensor.store %4, %1, offsets = [0, 0], sizes = [512, 256], strides = [1, 1] : tensor<512x256xi32> -> !flow.dispatch.tensor<writeonly:512x256xi32>
return
}
}
}
// Host-side test body: the async ops are grouped into two stream.async.execute
// regions, each yielding its results together with a !stream.timepoint that is
// awaited before the results are used.
func.func private @_split_reduction_pass2() {
%c524288 = arith.constant 524288 : index
%c67108864 = arith.constant 67108864 : index
%c1_i32 = arith.constant 1 : i32
%c128_i32 = arith.constant 128 : i32
%c512 = arith.constant 512 : index
%c256 = arith.constant 256 : index
%c1 = arith.constant 1 : index
// Region 1: produce the input (splat of 1); takes no captured operands.
%results, %result_timepoint = stream.async.execute with() -> !stream.resource<transient>{%c67108864} {
%6 = stream.async.splat %c1_i32 : i32 -> !stream.resource<transient>{%c67108864}
stream.yield %6 : !stream.resource<transient>{%c67108864}
} => !stream.timepoint
%0 = stream.timepoint.await %result_timepoint => %results : !stream.resource<transient>{%c67108864}
// util.do_not_optimize and the size query sit between the two regions.
%1 = util.do_not_optimize(%0) : !stream.resource<transient>
%2 = stream.resource.size %1 : !stream.resource<transient>
// Region 2: captures %1 as %arg0 and yields the dispatch result and the
// expected 128 splat.
%results_0:2, %result_timepoint_1 = stream.async.execute with(%1 as %arg0: !stream.resource<transient>{%2}) -> (!stream.resource<external>{%c524288}, !stream.resource<external>{%c524288}) {
%6 = stream.async.dispatch @_split_reduction_pass2_dispatch_0::@_split_reduction_pass2_dispatch_0_generic_512x256x128[%c512, %c256, %c1](%arg0) : (!stream.resource<transient>{%2}) -> !stream.resource<external>{%c524288}
%7 = stream.async.splat %c128_i32 : i32 -> !stream.resource<external>{%c524288}
stream.yield %6, %7 : !stream.resource<external>{%c524288}, !stream.resource<external>{%c524288}
} => !stream.timepoint
// %3#0 is the computed result, %3#1 the expected value.
%3:2 = stream.timepoint.await %result_timepoint_1 => %results_0#0, %results_0#1 : !stream.resource<external>{%c524288}, !stream.resource<external>{%c524288}
%4 = stream.tensor.export %3#0 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
%5 = stream.tensor.export %3#1 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
check.expect_eq(%4, %5) : tensor<512x256xi32>
return
}
}
// -----// IR Dump After ScheduleConcurrency (iree-stream-schedule-concurrency) ('func.func' operation: @_split_reduction_pass2) //----- //
#map0 = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1)>
module {
// Public entry point; forwards to the private test body.
func.func @split_reduction_pass2() attributes {iree.abi.stub} {
call @_split_reduction_pass2() : () -> ()
return
}
// Dispatch executable: one export plus the reduction kernel.
stream.executable private @_split_reduction_pass2_dispatch_0 {
// Export: maps the three workload values to the x/y/z workgroup counts.
stream.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2
stream.return %x, %y, %z : index, index, index
}
builtin.module {
// Kernel: view the bindings as tensors, load the input, add along d2, store.
func.func @_split_reduction_pass2_dispatch_0_generic_512x256x128(%arg0: !stream.binding, %arg1: !stream.binding) {
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:512x256x128xi32>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:512x256xi32>
%2 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [512, 256, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:512x256x128xi32> -> tensor<512x256x128xi32>
// Output operand of the reduction; no fill of %3 appears in this kernel.
%3 = linalg.init_tensor [512, 256] : tensor<512x256xi32>
%4 = linalg.generic {indexing_maps = [#map0, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%2 : tensor<512x256x128xi32>) outs(%3 : tensor<512x256xi32>) {
^bb0(%arg2: i32, %arg3: i32):
%5 = arith.addi %arg2, %arg3 : i32
linalg.yield %5 : i32
} -> tensor<512x256xi32>
flow.dispatch.tensor.store %4, %1, offsets = [0, 0], sizes = [512, 256], strides = [1, 1] : tensor<512x256xi32> -> !flow.dispatch.tensor<writeonly:512x256xi32>
return
}
}
}
// Host-side test body: inside the second execute region the dispatch and the
// 128 splat are now wrapped together in one stream.async.concurrent op.
func.func private @_split_reduction_pass2() {
%c524288 = arith.constant 524288 : index
%c67108864 = arith.constant 67108864 : index
%c1_i32 = arith.constant 1 : i32
%c128_i32 = arith.constant 128 : i32
%c512 = arith.constant 512 : index
%c256 = arith.constant 256 : index
%c1 = arith.constant 1 : index
// Region 1: produce the input (splat of 1).
%results, %result_timepoint = stream.async.execute with() -> !stream.resource<transient>{%c67108864} {
%6 = stream.async.splat %c1_i32 : i32 -> !stream.resource<transient>{%c67108864}
stream.yield %6 : !stream.resource<transient>{%c67108864}
} => !stream.timepoint
%0 = stream.timepoint.await %result_timepoint => %results : !stream.resource<transient>{%c67108864}
%1 = util.do_not_optimize(%0) : !stream.resource<transient>
%2 = stream.resource.size %1 : !stream.resource<transient>
// Region 2: captures %1 as %arg0; its only nested op besides the yield is the
// concurrent region below.
%results_0:2, %result_timepoint_1 = stream.async.execute with(%1 as %arg0: !stream.resource<transient>{%2}) -> (!stream.resource<external>{%c524288}, !stream.resource<external>{%c524288}) {
// Concurrent region: re-captures %arg0 as %arg1; yields the dispatch result
// (%6#0) and the expected 128 splat (%6#1).
%6:2 = stream.async.concurrent with(%arg0 as %arg1: !stream.resource<transient>{%2}) -> (!stream.resource<external>{%c524288}, !stream.resource<external>{%c524288}) {
%7 = stream.async.dispatch @_split_reduction_pass2_dispatch_0::@_split_reduction_pass2_dispatch_0_generic_512x256x128[%c512, %c256, %c1](%arg1) : (!stream.resource<transient>{%2}) -> !stream.resource<external>{%c524288}
%8 = stream.async.splat %c128_i32 : i32 -> !stream.resource<external>{%c524288}
stream.yield %7, %8 : !stream.resource<external>{%c524288}, !stream.resource<external>{%c524288}
}
stream.yield %6#0, %6#1 : !stream.resource<external>{%c524288}, !stream.resource<external>{%c524288}
} => !stream.timepoint
// %3#0 is the computed result, %3#1 the expected value.
%3:2 = stream.timepoint.await %result_timepoint_1 => %results_0#0, %results_0#1 : !stream.resource<external>{%c524288}, !stream.resource<external>{%c524288}
%4 = stream.tensor.export %3#0 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
%5 = stream.tensor.export %3#1 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
check.expect_eq(%4, %5) : tensor<512x256xi32>
return
}
}
// -----// IR Dump After PropagateTimepoints (iree-stream-propagate-timepoints) ('builtin.module' operation) //----- //
#map0 = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1)>
// Module as printed after iree-stream-propagate-timepoints.
// It holds three things: the public ABI stub, one stream executable containing
// the reduction kernel, and the private host function that schedules the
// work and checks the result.
module {
// Public entry point (tagged iree.abi.stub). It only forwards to the private
// @_split_reduction_pass2 and returns; no arguments, no results.
func.func @split_reduction_pass2() attributes {iree.abi.stub} {
call @_split_reduction_pass2() : () -> ()
return
}
// Executable wrapping the single dispatch used by the host function below.
stream.executable private @_split_reduction_pass2_dispatch_0 {
// Export: the workgroup count is derived from the three index workload
// operands via flow.dispatch.workgroup_count_from_dag_root.
stream.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2
stream.return %x, %y, %z : index, index, index
}
builtin.module {
// Kernel: %arg0 is bound as the read-only 512x256x128xi32 input and %arg1
// as the write-only 512x256xi32 output, both at byte offset 0.
func.func @_split_reduction_pass2_dispatch_0_generic_512x256x128(%arg0: !stream.binding, %arg1: !stream.binding) {
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:512x256x128xi32>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:512x256xi32>
// Load the whole input tensor.
%2 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [512, 256, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:512x256x128xi32> -> tensor<512x256x128xi32>
// NOTE(review): the accumulator (outs) of the reduction below is this raw
// linalg.init_tensor result; nothing in this function fills it, while the
// host compares against a splat of 128. Confirm whether a zero fill of the
// accumulator is intended.
%3 = linalg.init_tensor [512, 256] : tensor<512x256xi32>
// Sum over d2: #map0 reads (d0, d1, d2), #map1 writes (d0, d1); d2 is the
// only "reduction" iterator. The body adds the input element to the
// current output element.
%4 = linalg.generic {indexing_maps = [#map0, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%2 : tensor<512x256x128xi32>) outs(%3 : tensor<512x256xi32>) {
^bb0(%arg2: i32, %arg3: i32):
%5 = arith.addi %arg2, %arg3 : i32
linalg.yield %5 : i32
} -> tensor<512x256xi32>
// Store the full 512x256 result to the output binding.
flow.dispatch.tensor.store %4, %1, offsets = [0, 0], sizes = [512, 256], strides = [1, 1] : tensor<512x256xi32> -> !flow.dispatch.tensor<writeonly:512x256xi32>
return
}
}
}
// Host-side test body, still in stream.async form.
func.func private @_split_reduction_pass2() {
// 524288 = 512*256*4 bytes (a 512x256xi32 tensor);
// 67108864 = 512*256*128*4 bytes (a 512x256x128xi32 tensor).
%c524288 = arith.constant 524288 : index
%c67108864 = arith.constant 67108864 : index
%c1_i32 = arith.constant 1 : i32
%c128_i32 = arith.constant 128 : i32
%c512 = arith.constant 512 : index
%c256 = arith.constant 256 : index
%c1 = arith.constant 1 : index
// First execution region: splat the i32 value 1 into a transient resource
// of 67108864 bytes (the kernel input).
%results, %result_timepoint = stream.async.execute with() -> !stream.resource<transient>{%c67108864} {
%8 = stream.async.splat %c1_i32 : i32 -> !stream.resource<transient>{%c67108864}
stream.yield %8 : !stream.resource<transient>{%c67108864}
} => !stream.timepoint
%0 = stream.timepoint.await %result_timepoint => %results : !stream.resource<transient>{%c67108864}
// After util.do_not_optimize the resource type no longer carries the constant
// size, so the size is re-queried as %2 and used from here on.
%1 = util.do_not_optimize(%0) : !stream.resource<transient>
%2 = stream.resource.size %1 : !stream.resource<transient>
// Two immediate timepoints appear in this dump: %3 has no users in this
// function, %4 is awaited by the second execution region.
%3 = stream.timepoint.immediate => !stream.timepoint
%4 = stream.timepoint.immediate => !stream.timepoint
// Second execution region: awaits %4, captures the input, and produces two
// external 524288-byte resources.
%results_0:2, %result_timepoint_1 = stream.async.execute await(%4) => with(%1 as %arg0: !stream.resource<transient>{%2}) -> (!stream.resource<external>{%c524288}, !stream.resource<external>{%c524288}) {
// Inside a concurrent region: %9 is the kernel dispatch with workload
// [512, 256, 1]; %10 is a splat of the i32 value 128 (the expected result).
%8:2 = stream.async.concurrent with(%arg0 as %arg1: !stream.resource<transient>{%2}) -> (!stream.resource<external>{%c524288}, !stream.resource<external>{%c524288}) {
%9 = stream.async.dispatch @_split_reduction_pass2_dispatch_0::@_split_reduction_pass2_dispatch_0_generic_512x256x128[%c512, %c256, %c1](%arg1) : (!stream.resource<transient>{%2}) -> !stream.resource<external>{%c524288}
%10 = stream.async.splat %c128_i32 : i32 -> !stream.resource<external>{%c524288}
stream.yield %9, %10 : !stream.resource<external>{%c524288}, !stream.resource<external>{%c524288}
}
stream.yield %8#0, %8#1 : !stream.resource<external>{%c524288}, !stream.resource<external>{%c524288}
} => !stream.timepoint
// Wait for both results, export them as tensors, and compare the computed
// tensor (%6) with the expected one (%7).
%5:2 = stream.timepoint.await %result_timepoint_1 => %results_0#0, %results_0#1 : !stream.resource<external>{%c524288}, !stream.resource<external>{%c524288}
%6 = stream.tensor.export %5#0 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
%7 = stream.tensor.export %5#1 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
check.expect_eq(%6, %7) : tensor<512x256xi32>
return
}
}
// -----// IR Dump After Canonicalizer (canonicalize) ('func.func' operation: @_split_reduction_pass2) //----- //
#map0 = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1)>
// Module as printed after the canonicalizer ran on @_split_reduction_pass2.
// Relative to the PropagateTimepoints dump directly above it, the host
// function no longer contains the two stream.timepoint.immediate ops or the
// await(...) clause on the second stream.async.execute.
module {
// Public entry point (tagged iree.abi.stub); forwards to the private
// @_split_reduction_pass2 and returns.
func.func @split_reduction_pass2() attributes {iree.abi.stub} {
call @_split_reduction_pass2() : () -> ()
return
}
// Executable wrapping the single reduction dispatch.
stream.executable private @_split_reduction_pass2_dispatch_0 {
// Export: workgroup count is computed from the three index workload operands.
stream.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2
stream.return %x, %y, %z : index, index, index
}
builtin.module {
// Kernel: %arg0 -> read-only 512x256x128xi32 input, %arg1 -> write-only
// 512x256xi32 output, both bound at offset 0.
func.func @_split_reduction_pass2_dispatch_0_generic_512x256x128(%arg0: !stream.binding, %arg1: !stream.binding) {
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:512x256x128xi32>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:512x256xi32>
%2 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [512, 256, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:512x256x128xi32> -> tensor<512x256x128xi32>
// NOTE(review): the reduction accumulates into this unfilled
// linalg.init_tensor result; confirm a zero fill is not required.
%3 = linalg.init_tensor [512, 256] : tensor<512x256xi32>
// Add-reduction over d2 (the only "reduction" iterator): each input element
// is added to the corresponding (d0, d1) output element.
%4 = linalg.generic {indexing_maps = [#map0, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%2 : tensor<512x256x128xi32>) outs(%3 : tensor<512x256xi32>) {
^bb0(%arg2: i32, %arg3: i32):
%5 = arith.addi %arg2, %arg3 : i32
linalg.yield %5 : i32
} -> tensor<512x256xi32>
flow.dispatch.tensor.store %4, %1, offsets = [0, 0], sizes = [512, 256], strides = [1, 1] : tensor<512x256xi32> -> !flow.dispatch.tensor<writeonly:512x256xi32>
return
}
}
}
// Host-side test body in stream.async form.
func.func private @_split_reduction_pass2() {
// 524288 = 512*256*4 bytes; 67108864 = 512*256*128*4 bytes.
%c524288 = arith.constant 524288 : index
%c67108864 = arith.constant 67108864 : index
%c1_i32 = arith.constant 1 : i32
%c128_i32 = arith.constant 128 : i32
%c512 = arith.constant 512 : index
%c256 = arith.constant 256 : index
%c1 = arith.constant 1 : index
// Produce the kernel input: a transient resource splatted with the i32 value 1.
%results, %result_timepoint = stream.async.execute with() -> !stream.resource<transient>{%c67108864} {
%6 = stream.async.splat %c1_i32 : i32 -> !stream.resource<transient>{%c67108864}
stream.yield %6 : !stream.resource<transient>{%c67108864}
} => !stream.timepoint
%0 = stream.timepoint.await %result_timepoint => %results : !stream.resource<transient>{%c67108864}
// The size is re-queried after util.do_not_optimize because the result type
// there carries no size operand.
%1 = util.do_not_optimize(%0) : !stream.resource<transient>
%2 = stream.resource.size %1 : !stream.resource<transient>
// Second execution region: no await clause here; it captures %1 directly.
%results_0:2, %result_timepoint_1 = stream.async.execute with(%1 as %arg0: !stream.resource<transient>{%2}) -> (!stream.resource<external>{%c524288}, !stream.resource<external>{%c524288}) {
// Concurrent region: %7 is the kernel dispatch (workload [512, 256, 1]),
// %8 is the expected-value splat of 128.
%6:2 = stream.async.concurrent with(%arg0 as %arg1: !stream.resource<transient>{%2}) -> (!stream.resource<external>{%c524288}, !stream.resource<external>{%c524288}) {
%7 = stream.async.dispatch @_split_reduction_pass2_dispatch_0::@_split_reduction_pass2_dispatch_0_generic_512x256x128[%c512, %c256, %c1](%arg1) : (!stream.resource<transient>{%2}) -> !stream.resource<external>{%c524288}
%8 = stream.async.splat %c128_i32 : i32 -> !stream.resource<external>{%c524288}
stream.yield %7, %8 : !stream.resource<external>{%c524288}, !stream.resource<external>{%c524288}
}
stream.yield %6#0, %6#1 : !stream.resource<external>{%c524288}, !stream.resource<external>{%c524288}
} => !stream.timepoint
// Await, export both resources as tensors, and compare computed (%4) against
// expected (%5).
%3:2 = stream.timepoint.await %result_timepoint_1 => %results_0#0, %results_0#1 : !stream.resource<external>{%c524288}, !stream.resource<external>{%c524288}
%4 = stream.tensor.export %3#0 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
%5 = stream.tensor.export %3#1 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
check.expect_eq(%4, %5) : tensor<512x256xi32>
return
}
}
// -----// IR Dump After ScheduleAllocation (iree-stream-schedule-allocation) ('func.func' operation: @_split_reduction_pass2) //----- //
#map0 = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1)>
// Module as printed after iree-stream-schedule-allocation.
// In the host function the stream.async.* ops are gone: resources are now
// allocated explicitly with stream.resource.alloc and the work is expressed
// as stream.cmd.execute regions containing stream.cmd.fill / stream.cmd.dispatch.
module {
// Public entry point (tagged iree.abi.stub); forwards to the private
// @_split_reduction_pass2 and returns.
func.func @split_reduction_pass2() attributes {iree.abi.stub} {
call @_split_reduction_pass2() : () -> ()
return
}
// Executable wrapping the single reduction dispatch.
stream.executable private @_split_reduction_pass2_dispatch_0 {
// Export: workgroup count is computed from the three index workload operands.
stream.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2
stream.return %x, %y, %z : index, index, index
}
builtin.module {
// Kernel: %arg0 -> read-only 512x256x128xi32 input, %arg1 -> write-only
// 512x256xi32 output, both bound at offset 0.
func.func @_split_reduction_pass2_dispatch_0_generic_512x256x128(%arg0: !stream.binding, %arg1: !stream.binding) {
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:512x256x128xi32>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:512x256xi32>
%2 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [512, 256, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:512x256x128xi32> -> tensor<512x256x128xi32>
// NOTE(review): the reduction accumulates into this unfilled
// linalg.init_tensor result; confirm a zero fill is not required.
%3 = linalg.init_tensor [512, 256] : tensor<512x256xi32>
// Add-reduction over d2 (the only "reduction" iterator).
%4 = linalg.generic {indexing_maps = [#map0, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%2 : tensor<512x256x128xi32>) outs(%3 : tensor<512x256xi32>) {
^bb0(%arg2: i32, %arg3: i32):
%5 = arith.addi %arg2, %arg3 : i32
linalg.yield %5 : i32
} -> tensor<512x256xi32>
flow.dispatch.tensor.store %4, %1, offsets = [0, 0], sizes = [512, 256], strides = [1, 1] : tensor<512x256xi32> -> !flow.dispatch.tensor<writeonly:512x256xi32>
return
}
}
}
// Host-side test body in explicit-allocation / command form.
func.func private @_split_reduction_pass2() {
// 524288 = 512*256*4 bytes; 67108864 = 512*256*128*4 bytes.
%c524288 = arith.constant 524288 : index
%c67108864 = arith.constant 67108864 : index
%c1_i32 = arith.constant 1 : i32
%c128_i32 = arith.constant 128 : i32
%c512 = arith.constant 512 : index
%c256 = arith.constant 256 : index
%c1 = arith.constant 1 : index
// Input buffer: allocated uninitialized, then filled with the i32 value 1
// over its whole [0, 67108864) byte range.
%0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c67108864}
%c0 = arith.constant 0 : index
%1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c67108864}) {
stream.cmd.fill %c1_i32, %arg0[%c0 for %c67108864] : i32 -> !stream.resource<transient>{%c67108864}
} => !stream.timepoint
%2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c67108864}
// The size is re-queried after util.do_not_optimize because the result type
// there carries no size operand.
%3 = util.do_not_optimize(%2) : !stream.resource<transient>
%4 = stream.resource.size %3 : !stream.resource<transient>
%c0_0 = arith.constant 0 : index
// Two separate external 524288-byte resources come from one alloc op:
// %5#0 receives the kernel output, %5#1 receives the expected values.
%5:2 = stream.resource.alloc uninitialized : !stream.resource<external>{%c524288}, !stream.resource<external>{%c524288}
// Inside one concurrent command region: the dispatch reads the input
// (ro %arg0) and writes %arg1 (wo); the fill writes 128 into %arg2.
%6 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%4}, %5#0 as %arg1: !stream.resource<external>{%c524288}, %5#1 as %arg2: !stream.resource<external>{%c524288}) {
stream.cmd.concurrent {
stream.cmd.dispatch @_split_reduction_pass2_dispatch_0::@_split_reduction_pass2_dispatch_0_generic_512x256x128[%c512, %c256, %c1] {
ro %arg0[%c0_0 for %4] : !stream.resource<transient>{%4},
wo %arg1[%c0_0 for %c524288] : !stream.resource<external>{%c524288}
}
stream.cmd.fill %c128_i32, %arg2[%c0_0 for %c524288] : i32 -> !stream.resource<external>{%c524288}
}
} => !stream.timepoint
// Await, export both resources as tensors, and compare computed (%8) against
// expected (%9).
%7:2 = stream.timepoint.await %6 => %5#0, %5#1 : !stream.resource<external>{%c524288}, !stream.resource<external>{%c524288}
%8 = stream.tensor.export %7#0 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
%9 = stream.tensor.export %7#1 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
check.expect_eq(%8, %9) : tensor<512x256xi32>
return
}
}
// -----// IR Dump After PackAllocations (iree-stream-pack-allocations) ('func.func' operation: @_split_reduction_pass2) //----- //
#map0 = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1)>
// Module as printed after iree-stream-pack-allocations.
// In the host function the two-result external alloc has been replaced by a
// stream.resource.pack op, a single alloc of the packed size, and two
// stream.resource.subview ops.
module {
// Public entry point (tagged iree.abi.stub); forwards to the private
// @_split_reduction_pass2 and returns.
func.func @split_reduction_pass2() attributes {iree.abi.stub} {
call @_split_reduction_pass2() : () -> ()
return
}
// Executable wrapping the single reduction dispatch.
stream.executable private @_split_reduction_pass2_dispatch_0 {
// Export: workgroup count is computed from the three index workload operands.
stream.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2
stream.return %x, %y, %z : index, index, index
}
builtin.module {
// Kernel: %arg0 -> read-only 512x256x128xi32 input, %arg1 -> write-only
// 512x256xi32 output, both bound at offset 0.
func.func @_split_reduction_pass2_dispatch_0_generic_512x256x128(%arg0: !stream.binding, %arg1: !stream.binding) {
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:512x256x128xi32>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:512x256xi32>
%2 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [512, 256, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:512x256x128xi32> -> tensor<512x256x128xi32>
// NOTE(review): the reduction accumulates into this unfilled
// linalg.init_tensor result; confirm a zero fill is not required.
%3 = linalg.init_tensor [512, 256] : tensor<512x256xi32>
// Add-reduction over d2 (the only "reduction" iterator).
%4 = linalg.generic {indexing_maps = [#map0, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%2 : tensor<512x256x128xi32>) outs(%3 : tensor<512x256xi32>) {
^bb0(%arg2: i32, %arg3: i32):
%5 = arith.addi %arg2, %arg3 : i32
linalg.yield %5 : i32
} -> tensor<512x256xi32>
flow.dispatch.tensor.store %4, %1, offsets = [0, 0], sizes = [512, 256], strides = [1, 1] : tensor<512x256xi32> -> !flow.dispatch.tensor<writeonly:512x256xi32>
return
}
}
}
// Host-side test body with the two outputs packed into one allocation.
func.func private @_split_reduction_pass2() {
// 524288 = 512*256*4 bytes; 67108864 = 512*256*128*4 bytes.
%c524288 = arith.constant 524288 : index
%c67108864 = arith.constant 67108864 : index
%c1_i32 = arith.constant 1 : i32
%c128_i32 = arith.constant 128 : i32
%c512 = arith.constant 512 : index
%c256 = arith.constant 256 : index
%c1 = arith.constant 1 : index
// Input buffer: allocated uninitialized, then filled with the i32 value 1.
%0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c67108864}
%c0 = arith.constant 0 : index
%1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c67108864}) {
stream.cmd.fill %c1_i32, %arg0[%c0 for %c67108864] : i32 -> !stream.resource<transient>{%c67108864}
} => !stream.timepoint
%2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c67108864}
// The size is re-queried after util.do_not_optimize because the result type
// there carries no size operand.
%3 = util.do_not_optimize(%2) : !stream.resource<transient>
%4 = stream.resource.size %3 : !stream.resource<transient>
%c0_0 = arith.constant 0 : index
// Pack two 524288-byte slices. As used below, %5#0 is the size of the
// backing allocation and %5#1 / %5#2 are the offsets of the two subviews.
// NOTE(review): the [0, 0] pairs are presumably per-slice lifetime intervals;
// confirm against the Stream dialect documentation.
%5:3 = stream.resource.pack slices({
[0, 0] = %c524288,
[0, 0] = %c524288
}) : index
// One backing allocation; %7 (kernel output) and %8 (expected values) are
// 524288-byte views into it.
%6 = stream.resource.alloc uninitialized : !stream.resource<external>{%5#0}
%7 = stream.resource.subview %6[%5#1] : !stream.resource<external>{%5#0} -> !stream.resource<external>{%c524288}
%8 = stream.resource.subview %6[%5#2] : !stream.resource<external>{%5#0} -> !stream.resource<external>{%c524288}
// Concurrent command region: the dispatch reads the input and writes %arg1;
// the fill writes 128 into %arg2.
%9 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%4}, %7 as %arg1: !stream.resource<external>{%c524288}, %8 as %arg2: !stream.resource<external>{%c524288}) {
stream.cmd.concurrent {
stream.cmd.dispatch @_split_reduction_pass2_dispatch_0::@_split_reduction_pass2_dispatch_0_generic_512x256x128[%c512, %c256, %c1] {
ro %arg0[%c0_0 for %4] : !stream.resource<transient>{%4},
wo %arg1[%c0_0 for %c524288] : !stream.resource<external>{%c524288}
}
stream.cmd.fill %c128_i32, %arg2[%c0_0 for %c524288] : i32 -> !stream.resource<external>{%c524288}
}
} => !stream.timepoint
// Await, export both views as tensors, and compare computed (%11) against
// expected (%12).
%10:2 = stream.timepoint.await %9 => %7, %8 : !stream.resource<external>{%c524288}, !stream.resource<external>{%c524288}
%11 = stream.tensor.export %10#0 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
%12 = stream.tensor.export %10#1 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
check.expect_eq(%11, %12) : tensor<512x256xi32>
return
}
}
// -----// IR Dump After LayoutSlices (iree-stream-layout-slices) ('func.func' operation: @_split_reduction_pass2) //----- //
#map0 = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1)>
// Module as printed after iree-stream-layout-slices.
// In the host function the stream.resource.pack op has been replaced by
// constants: subview offsets 0 and 524288 and a backing size of 1048576
// (= 2 * 524288).
module {
// Public entry point (tagged iree.abi.stub); forwards to the private
// @_split_reduction_pass2 and returns.
func.func @split_reduction_pass2() attributes {iree.abi.stub} {
call @_split_reduction_pass2() : () -> ()
return
}
// Executable wrapping the single reduction dispatch.
stream.executable private @_split_reduction_pass2_dispatch_0 {
// Export: workgroup count is computed from the three index workload operands.
stream.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2
stream.return %x, %y, %z : index, index, index
}
builtin.module {
// Kernel: %arg0 -> read-only 512x256x128xi32 input, %arg1 -> write-only
// 512x256xi32 output, both bound at offset 0.
func.func @_split_reduction_pass2_dispatch_0_generic_512x256x128(%arg0: !stream.binding, %arg1: !stream.binding) {
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:512x256x128xi32>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:512x256xi32>
%2 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [512, 256, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:512x256x128xi32> -> tensor<512x256x128xi32>
// NOTE(review): the reduction accumulates into this unfilled
// linalg.init_tensor result; confirm a zero fill is not required.
%3 = linalg.init_tensor [512, 256] : tensor<512x256xi32>
// Add-reduction over d2 (the only "reduction" iterator).
%4 = linalg.generic {indexing_maps = [#map0, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%2 : tensor<512x256x128xi32>) outs(%3 : tensor<512x256xi32>) {
^bb0(%arg2: i32, %arg3: i32):
%5 = arith.addi %arg2, %arg3 : i32
linalg.yield %5 : i32
} -> tensor<512x256xi32>
flow.dispatch.tensor.store %4, %1, offsets = [0, 0], sizes = [512, 256], strides = [1, 1] : tensor<512x256xi32> -> !flow.dispatch.tensor<writeonly:512x256xi32>
return
}
}
}
// Host-side test body with a constant layout for the packed output buffer.
func.func private @_split_reduction_pass2() {
// 524288 = 512*256*4 bytes; 67108864 = 512*256*128*4 bytes.
%c524288 = arith.constant 524288 : index
%c67108864 = arith.constant 67108864 : index
%c1_i32 = arith.constant 1 : i32
%c128_i32 = arith.constant 128 : i32
%c512 = arith.constant 512 : index
%c256 = arith.constant 256 : index
%c1 = arith.constant 1 : index
// Input buffer: allocated uninitialized, then filled with the i32 value 1.
%0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c67108864}
%c0 = arith.constant 0 : index
%1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c67108864}) {
stream.cmd.fill %c1_i32, %arg0[%c0 for %c67108864] : i32 -> !stream.resource<transient>{%c67108864}
} => !stream.timepoint
%2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c67108864}
// The size is re-queried after util.do_not_optimize because the result type
// there carries no size operand.
%3 = util.do_not_optimize(%2) : !stream.resource<transient>
%4 = stream.resource.size %3 : !stream.resource<transient>
%c0_0 = arith.constant 0 : index
// Layout constants. Of these, %c0_1 (first offset), %c524288_3 (second
// offset) and %c1048576_4 (backing size) are used below; %c524288_2 and
// %c1048576 have no users in this function.
%c0_1 = arith.constant 0 : index
%c524288_2 = arith.constant 524288 : index
%c524288_3 = arith.constant 524288 : index
%c1048576 = arith.constant 1048576 : index
%c1048576_4 = arith.constant 1048576 : index
// One 1048576-byte backing allocation; %6 (kernel output) is the view at
// offset 0 and %7 (expected values) is the view at offset 524288.
%5 = stream.resource.alloc uninitialized : !stream.resource<external>{%c1048576_4}
%6 = stream.resource.subview %5[%c0_1] : !stream.resource<external>{%c1048576_4} -> !stream.resource<external>{%c524288}
%7 = stream.resource.subview %5[%c524288_3] : !stream.resource<external>{%c1048576_4} -> !stream.resource<external>{%c524288}
// Concurrent command region: the dispatch reads the input and writes %arg1;
// the fill writes 128 into %arg2.
%8 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%4}, %6 as %arg1: !stream.resource<external>{%c524288}, %7 as %arg2: !stream.resource<external>{%c524288}) {
stream.cmd.concurrent {
stream.cmd.dispatch @_split_reduction_pass2_dispatch_0::@_split_reduction_pass2_dispatch_0_generic_512x256x128[%c512, %c256, %c1] {
ro %arg0[%c0_0 for %4] : !stream.resource<transient>{%4},
wo %arg1[%c0_0 for %c524288] : !stream.resource<external>{%c524288}
}
stream.cmd.fill %c128_i32, %arg2[%c0_0 for %c524288] : i32 -> !stream.resource<external>{%c524288}
}
} => !stream.timepoint
// Await, export both views as tensors, and compare computed (%10) against
// expected (%11).
%9:2 = stream.timepoint.await %8 => %6, %7 : !stream.resource<external>{%c524288}, !stream.resource<external>{%c524288}
%10 = stream.tensor.export %9#0 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
%11 = stream.tensor.export %9#1 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
check.expect_eq(%10, %11) : tensor<512x256xi32>
return
}
}
// -----// IR Dump After PropagateSubranges (iree-util-propagate-subranges) ('builtin.module' operation) //----- //
#map0 = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1)>
// Module as printed after iree-util-propagate-subranges.
// The host function has the same structure as in the LayoutSlices dump above
// it; the visible differences are an additional %c0 constant at the top of
// the function and renumbered constant names.
module {
// Public entry point (tagged iree.abi.stub); forwards to the private
// @_split_reduction_pass2 and returns.
func.func @split_reduction_pass2() attributes {iree.abi.stub} {
call @_split_reduction_pass2() : () -> ()
return
}
// Executable wrapping the single reduction dispatch.
stream.executable private @_split_reduction_pass2_dispatch_0 {
// Export: workgroup count is computed from the three index workload operands.
stream.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2
stream.return %x, %y, %z : index, index, index
}
builtin.module {
// Kernel: %arg0 -> read-only 512x256x128xi32 input, %arg1 -> write-only
// 512x256xi32 output, both bound at offset 0.
func.func @_split_reduction_pass2_dispatch_0_generic_512x256x128(%arg0: !stream.binding, %arg1: !stream.binding) {
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:512x256x128xi32>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:512x256xi32>
%2 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [512, 256, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:512x256x128xi32> -> tensor<512x256x128xi32>
// NOTE(review): the reduction accumulates into this unfilled
// linalg.init_tensor result; confirm a zero fill is not required.
%3 = linalg.init_tensor [512, 256] : tensor<512x256xi32>
// Add-reduction over d2 (the only "reduction" iterator).
%4 = linalg.generic {indexing_maps = [#map0, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%2 : tensor<512x256x128xi32>) outs(%3 : tensor<512x256xi32>) {
^bb0(%arg2: i32, %arg3: i32):
%5 = arith.addi %arg2, %arg3 : i32
linalg.yield %5 : i32
} -> tensor<512x256xi32>
flow.dispatch.tensor.store %4, %1, offsets = [0, 0], sizes = [512, 256], strides = [1, 1] : tensor<512x256xi32> -> !flow.dispatch.tensor<writeonly:512x256xi32>
return
}
}
}
// Host-side test body; outputs still live in two subviews of one allocation.
func.func private @_split_reduction_pass2() {
// This leading %c0 has no users in this function; the offsets below use
// %c0_0, %c0_1 and %c0_2 instead.
%c0 = arith.constant 0 : index
// 524288 = 512*256*4 bytes; 67108864 = 512*256*128*4 bytes.
%c524288 = arith.constant 524288 : index
%c67108864 = arith.constant 67108864 : index
%c1_i32 = arith.constant 1 : i32
%c128_i32 = arith.constant 128 : i32
%c512 = arith.constant 512 : index
%c256 = arith.constant 256 : index
%c1 = arith.constant 1 : index
// Input buffer: allocated uninitialized, then filled with the i32 value 1.
%0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c67108864}
%c0_0 = arith.constant 0 : index
%1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c67108864}) {
stream.cmd.fill %c1_i32, %arg0[%c0_0 for %c67108864] : i32 -> !stream.resource<transient>{%c67108864}
} => !stream.timepoint
%2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c67108864}
// The size is re-queried after util.do_not_optimize because the result type
// there carries no size operand.
%3 = util.do_not_optimize(%2) : !stream.resource<transient>
%4 = stream.resource.size %3 : !stream.resource<transient>
%c0_1 = arith.constant 0 : index
// Layout constants. %c0_2 (first offset), %c524288_4 (second offset) and
// %c1048576_5 (backing size) are used below; %c524288_3 and %c1048576 have
// no users in this function.
%c0_2 = arith.constant 0 : index
%c524288_3 = arith.constant 524288 : index
%c524288_4 = arith.constant 524288 : index
%c1048576 = arith.constant 1048576 : index
%c1048576_5 = arith.constant 1048576 : index
// One 1048576-byte backing allocation; %6 (kernel output) is the view at
// offset 0 and %7 (expected values) is the view at offset 524288.
%5 = stream.resource.alloc uninitialized : !stream.resource<external>{%c1048576_5}
%6 = stream.resource.subview %5[%c0_2] : !stream.resource<external>{%c1048576_5} -> !stream.resource<external>{%c524288}
%7 = stream.resource.subview %5[%c524288_4] : !stream.resource<external>{%c1048576_5} -> !stream.resource<external>{%c524288}
// Concurrent command region: the dispatch reads the input and writes %arg1;
// the fill writes 128 into %arg2.
%8 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%4}, %6 as %arg1: !stream.resource<external>{%c524288}, %7 as %arg2: !stream.resource<external>{%c524288}) {
stream.cmd.concurrent {
stream.cmd.dispatch @_split_reduction_pass2_dispatch_0::@_split_reduction_pass2_dispatch_0_generic_512x256x128[%c512, %c256, %c1] {
ro %arg0[%c0_1 for %4] : !stream.resource<transient>{%4},
wo %arg1[%c0_1 for %c524288] : !stream.resource<external>{%c524288}
}
stream.cmd.fill %c128_i32, %arg2[%c0_1 for %c524288] : i32 -> !stream.resource<external>{%c524288}
}
} => !stream.timepoint
// Await, export both views as tensors, and compare computed (%10) against
// expected (%11).
%9:2 = stream.timepoint.await %8 => %6, %7 : !stream.resource<external>{%c524288}, !stream.resource<external>{%c524288}
%10 = stream.tensor.export %9#0 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
%11 = stream.tensor.export %9#1 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
check.expect_eq(%10, %11) : tensor<512x256xi32>
return
}
}
// -----// IR Dump After Canonicalizer (canonicalize) ('func.func' operation: @_split_reduction_pass2) //----- //
#map0 = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1)>
// Module as printed after the canonicalizer ran on @_split_reduction_pass2.
// In the host function the duplicate constants are gone, the command region
// now captures the whole 1048576-byte output resource once, and the two
// subviews are taken after the await instead of before the execute.
module {
// Public entry point (tagged iree.abi.stub); forwards to the private
// @_split_reduction_pass2 and returns.
func.func @split_reduction_pass2() attributes {iree.abi.stub} {
call @_split_reduction_pass2() : () -> ()
return
}
// Executable wrapping the single reduction dispatch.
stream.executable private @_split_reduction_pass2_dispatch_0 {
// Export: workgroup count is computed from the three index workload operands.
stream.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2
stream.return %x, %y, %z : index, index, index
}
builtin.module {
// Kernel: %arg0 -> read-only 512x256x128xi32 input, %arg1 -> write-only
// 512x256xi32 output, both bound at offset 0.
func.func @_split_reduction_pass2_dispatch_0_generic_512x256x128(%arg0: !stream.binding, %arg1: !stream.binding) {
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:512x256x128xi32>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:512x256xi32>
%2 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [512, 256, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:512x256x128xi32> -> tensor<512x256x128xi32>
// NOTE(review): the reduction accumulates into this unfilled
// linalg.init_tensor result; confirm a zero fill is not required.
%3 = linalg.init_tensor [512, 256] : tensor<512x256xi32>
// Add-reduction over d2 (the only "reduction" iterator).
%4 = linalg.generic {indexing_maps = [#map0, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%2 : tensor<512x256x128xi32>) outs(%3 : tensor<512x256xi32>) {
^bb0(%arg2: i32, %arg3: i32):
%5 = arith.addi %arg2, %arg3 : i32
linalg.yield %5 : i32
} -> tensor<512x256xi32>
flow.dispatch.tensor.store %4, %1, offsets = [0, 0], sizes = [512, 256], strides = [1, 1] : tensor<512x256xi32> -> !flow.dispatch.tensor<writeonly:512x256xi32>
return
}
}
}
// Host-side test body with both outputs addressed as ranges of one resource.
func.func private @_split_reduction_pass2() {
// 1048576 = 2 * 524288; 524288 = 512*256*4 bytes;
// 67108864 = 512*256*128*4 bytes.
%c1048576 = arith.constant 1048576 : index
%c0 = arith.constant 0 : index
%c524288 = arith.constant 524288 : index
%c67108864 = arith.constant 67108864 : index
%c1_i32 = arith.constant 1 : i32
%c128_i32 = arith.constant 128 : i32
%c512 = arith.constant 512 : index
%c256 = arith.constant 256 : index
%c1 = arith.constant 1 : index
// Input buffer: allocated uninitialized, then filled with the i32 value 1.
%0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c67108864}
%1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c67108864}) {
stream.cmd.fill %c1_i32, %arg0[%c0 for %c67108864] : i32 -> !stream.resource<transient>{%c67108864}
} => !stream.timepoint
%2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c67108864}
// The size is re-queried after util.do_not_optimize because the result type
// there carries no size operand.
%3 = util.do_not_optimize(%2) : !stream.resource<transient>
%4 = stream.resource.size %3 : !stream.resource<transient>
// Single 1048576-byte output resource shared by the kernel result and the
// expected values.
%5 = stream.resource.alloc uninitialized : !stream.resource<external>{%c1048576}
// Concurrent command region on that one resource (%arg1): the dispatch's
// write binding is the byte range [0 for 524288]; the fill of 128 targets
// the range [524288 for 524288].
%6 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%4}, %5 as %arg1: !stream.resource<external>{%c1048576}) {
stream.cmd.concurrent {
stream.cmd.dispatch @_split_reduction_pass2_dispatch_0::@_split_reduction_pass2_dispatch_0_generic_512x256x128[%c512, %c256, %c1] {
ro %arg0[%c0 for %4] : !stream.resource<transient>{%4},
wo %arg1[%c0 for %c524288] : !stream.resource<external>{%c1048576}
}
stream.cmd.fill %c128_i32, %arg1[%c524288 for %c524288] : i32 -> !stream.resource<external>{%c1048576}
}
} => !stream.timepoint
%7 = stream.timepoint.await %6 => %5 : !stream.resource<external>{%c1048576}
// Split the awaited resource: %8 is the kernel output at offset 0, %9 is the
// expected values at offset 524288.
%8 = stream.resource.subview %7[%c0] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288}
%9 = stream.resource.subview %7[%c524288] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288}
// Export both views as tensors and compare computed (%10) against
// expected (%11).
%10 = stream.tensor.export %8 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
%11 = stream.tensor.export %9 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
check.expect_eq(%10, %11) : tensor<512x256xi32>
return
}
}
// -----// IR Dump After FuseDispatchBindings (iree-stream-fuse-dispatch-bindings) ('builtin.module' operation) //----- //
#map0 = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1)>
// Module as printed after iree-stream-fuse-dispatch-bindings.
// The kernel gains two index operands that are added to its binding offsets,
// and the host-side dispatch now passes those operands explicitly (both %c0)
// and binds the whole output resource.
module {
// Public entry point (tagged iree.abi.stub); forwards to the private
// @_split_reduction_pass2 and returns.
func.func @split_reduction_pass2() attributes {iree.abi.stub} {
call @_split_reduction_pass2() : () -> ()
return
}
// Executable wrapping the single reduction dispatch.
stream.executable private @_split_reduction_pass2_dispatch_0 {
// Export: workgroup count is computed from the three index workload operands.
stream.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2
stream.return %x, %y, %z : index, index, index
}
builtin.module {
// Kernel: %arg0 / %arg1 are the input / output bindings; %arg2 and %arg3
// are index operands used as the byte offsets of those bindings.
func.func @_split_reduction_pass2_dispatch_0_generic_512x256x128(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: index, %arg3: index) {
%c0 = arith.constant 0 : index
// Input offset = 0 + %arg2.
%0 = arith.addi %c0, %arg2 : index
%1 = stream.binding.subspan %arg0[%0] : !stream.binding -> !flow.dispatch.tensor<readonly:512x256x128xi32>
// Output offset = 0 + %arg3.
%2 = arith.addi %c0, %arg3 : index
%3 = stream.binding.subspan %arg1[%2] : !stream.binding -> !flow.dispatch.tensor<writeonly:512x256xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [512, 256, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:512x256x128xi32> -> tensor<512x256x128xi32>
// NOTE(review): the reduction accumulates into this unfilled
// linalg.init_tensor result; confirm a zero fill is not required.
%5 = linalg.init_tensor [512, 256] : tensor<512x256xi32>
// Add-reduction over d2 (the only "reduction" iterator).
%6 = linalg.generic {indexing_maps = [#map0, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%4 : tensor<512x256x128xi32>) outs(%5 : tensor<512x256xi32>) {
^bb0(%arg4: i32, %arg5: i32):
%7 = arith.addi %arg4, %arg5 : i32
linalg.yield %7 : i32
} -> tensor<512x256xi32>
flow.dispatch.tensor.store %6, %3, offsets = [0, 0], sizes = [512, 256], strides = [1, 1] : tensor<512x256xi32> -> !flow.dispatch.tensor<writeonly:512x256xi32>
return
}
}
}
// Host-side test body; the dispatch now carries explicit offset operands.
func.func private @_split_reduction_pass2() {
// 1048576 = 2 * 524288; 524288 = 512*256*4 bytes;
// 67108864 = 512*256*128*4 bytes.
%c1048576 = arith.constant 1048576 : index
%c0 = arith.constant 0 : index
%c524288 = arith.constant 524288 : index
%c67108864 = arith.constant 67108864 : index
%c1_i32 = arith.constant 1 : i32
%c128_i32 = arith.constant 128 : i32
%c512 = arith.constant 512 : index
%c256 = arith.constant 256 : index
%c1 = arith.constant 1 : index
// Input buffer: allocated uninitialized, then filled with the i32 value 1.
%0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c67108864}
%1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c67108864}) {
stream.cmd.fill %c1_i32, %arg0[%c0 for %c67108864] : i32 -> !stream.resource<transient>{%c67108864}
} => !stream.timepoint
%2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c67108864}
// The size is re-queried after util.do_not_optimize because the result type
// there carries no size operand.
%3 = util.do_not_optimize(%2) : !stream.resource<transient>
%4 = stream.resource.size %3 : !stream.resource<transient>
// Single 1048576-byte output resource shared by the kernel result and the
// expected values.
%5 = stream.resource.alloc uninitialized : !stream.resource<external>{%c1048576}
%c0_0 = arith.constant 0 : index
// The dispatch passes (%c0, %c0) as the kernel's two offset operands.
// NOTE(review): the write binding is now [0 for 1048576], i.e. the whole
// resource, which includes the [524288 for 524288] range written by the
// fill in the same concurrent region. The kernel's own store covers a
// 512x256xi32 tensor starting at the offset operand. Confirm the widened
// binding range is the expected result of binding fusion.
%6 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%4}, %5 as %arg1: !stream.resource<external>{%c1048576}) {
stream.cmd.concurrent {
stream.cmd.dispatch @_split_reduction_pass2_dispatch_0::@_split_reduction_pass2_dispatch_0_generic_512x256x128[%c512, %c256, %c1](%c0, %c0 : index, index) {
ro %arg0[%c0_0 for %4] : !stream.resource<transient>{%4},
wo %arg1[%c0_0 for %c1048576] : !stream.resource<external>{%c1048576}
}
stream.cmd.fill %c128_i32, %arg1[%c524288 for %c524288] : i32 -> !stream.resource<external>{%c1048576}
}
} => !stream.timepoint
%7 = stream.timepoint.await %6 => %5 : !stream.resource<external>{%c1048576}
// Split the awaited resource: %8 is the kernel output at offset 0, %9 is the
// expected values at offset 524288.
%8 = stream.resource.subview %7[%c0] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288}
%9 = stream.resource.subview %7[%c524288] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288}
// Export both views as tensors and compare computed (%10) against
// expected (%11).
%10 = stream.tensor.export %8 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
%11 = stream.tensor.export %9 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
check.expect_eq(%10, %11) : tensor<512x256xi32>
return
}
}
// -----// IR Dump After PackDispatchOperands (iree-stream-pack-dispatch-operands) ('builtin.module' operation) //----- //
#map0 = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1)>
// Module state after iree-stream-pack-dispatch-operands.
// In this dump the dispatch passes two i32 operands (%c0_i32, %c0_i32_1) and the
// dispatch function receives them as %arg2/%arg3 : i32; the tail of the preceding
// dump (visible just above) passed (%c0, %c0 : index, index) at the same call site.
module {
// Public entry point: only forwards to the private implementation below.
func.func @split_reduction_pass2() attributes {iree.abi.stub} {
call @_split_reduction_pass2() : () -> ()
return
}
stream.executable private @_split_reduction_pass2_dispatch_0 {
// Export region: maps the three workload values to workgroup counts via
// flow.dispatch.workgroup_count_from_dag_root.
stream.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2
stream.return %x, %y, %z : index, index, index
}
builtin.module {
// Dispatch body: two bindings plus two i32 operands that are cast to index and
// used (after adding constant 0) as the offsets of the binding subspans.
func.func @_split_reduction_pass2_dispatch_0_generic_512x256x128(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: i32, %arg3: i32) {
%0 = arith.index_cast %arg2 : i32 to index
%1 = arith.index_cast %arg3 : i32 to index
%c0 = arith.constant 0 : index
%2 = arith.addi %c0, %0 : index
%3 = stream.binding.subspan %arg0[%2] : !stream.binding -> !flow.dispatch.tensor<readonly:512x256x128xi32>
%4 = arith.addi %c0, %1 : index
%5 = stream.binding.subspan %arg1[%4] : !stream.binding -> !flow.dispatch.tensor<writeonly:512x256xi32>
%6 = flow.dispatch.tensor.load %3, offsets = [0, 0, 0], sizes = [512, 256, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:512x256x128xi32> -> tensor<512x256x128xi32>
// NOTE(review): %7 comes from linalg.init_tensor and nothing in this function
// fills it before it is used as the reduction accumulator below - confirm that
// this is intended by the test input.
%7 = linalg.init_tensor [512, 256] : tensor<512x256xi32>
// Sums the input along d2 (the "reduction" iterator) into a 512x256 result.
%8 = linalg.generic {indexing_maps = [#map0, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%6 : tensor<512x256x128xi32>) outs(%7 : tensor<512x256xi32>) {
^bb0(%arg4: i32, %arg5: i32):
%9 = arith.addi %arg4, %arg5 : i32
linalg.yield %9 : i32
} -> tensor<512x256xi32>
flow.dispatch.tensor.store %8, %5, offsets = [0, 0], sizes = [512, 256], strides = [1, 1] : tensor<512x256xi32> -> !flow.dispatch.tensor<writeonly:512x256xi32>
return
}
}
}
// Host-side function: fills the input, runs the dispatch, and compares its result
// with a resource region filled with the constant 128.
func.func private @_split_reduction_pass2() {
// 1048576 = 2 * 524288; 524288 = 512*256*4 and 67108864 = 512*256*128*4, i.e. the
// sizes match 4 units per i32 element of the two tensor shapes used here.
%c1048576 = arith.constant 1048576 : index
%c0 = arith.constant 0 : index
%c524288 = arith.constant 524288 : index
%c67108864 = arith.constant 67108864 : index
%c1_i32 = arith.constant 1 : i32
%c128_i32 = arith.constant 128 : i32
%c512 = arith.constant 512 : index
%c256 = arith.constant 256 : index
%c1 = arith.constant 1 : index
// Input resource, filled entirely with the i32 value 1.
%0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c67108864}
%1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c67108864}) {
stream.cmd.fill %c1_i32, %arg0[%c0 for %c67108864] : i32 -> !stream.resource<transient>{%c67108864}
} => !stream.timepoint
%2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c67108864}
%3 = util.do_not_optimize(%2) : !stream.resource<transient>
%4 = stream.resource.size %3 : !stream.resource<transient>
// One external resource holds both tensors compared at the end.
%5 = stream.resource.alloc uninitialized : !stream.resource<external>{%c1048576}
// %c0_0 and %c0_i32_1 duplicate %c0 and %c0_i32; they are still separate values
// in this dump.
%c0_0 = arith.constant 0 : index
%c0_i32 = arith.constant 0 : i32
%c0_i32_1 = arith.constant 0 : i32
%6 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%4}, %5 as %arg1: !stream.resource<external>{%c1048576}) {
stream.cmd.concurrent {
// NOTE(review): the write-only binding below spans [0 for 1048576], which
// overlaps the [524288 for 524288] range written by the concurrent fill; the
// dispatch body itself stores only a 512x256xi32 tensor at the subspan offset.
// Whether the wide binding range is intentional is not visible here - confirm.
stream.cmd.dispatch @_split_reduction_pass2_dispatch_0::@_split_reduction_pass2_dispatch_0_generic_512x256x128[%c512, %c256, %c1](%c0_i32, %c0_i32_1 : i32, i32) {
ro %arg0[%c0_0 for %4] : !stream.resource<transient>{%4},
wo %arg1[%c0_0 for %c1048576] : !stream.resource<external>{%c1048576}
}
stream.cmd.fill %c128_i32, %arg1[%c524288 for %c524288] : i32 -> !stream.resource<external>{%c1048576}
}
} => !stream.timepoint
%7 = stream.timepoint.await %6 => %5 : !stream.resource<external>{%c1048576}
// First half: region at offset 0; second half: region filled with 128.
%8 = stream.resource.subview %7[%c0] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288}
%9 = stream.resource.subview %7[%c524288] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288}
%10 = stream.tensor.export %8 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
%11 = stream.tensor.export %9 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
check.expect_eq(%10, %11) : tensor<512x256xi32>
return
}
}
// -----// IR Dump After CSE (cse) ('builtin.module' operation) //----- //
#map0 = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1)>
// Module state after CSE.
// Difference from the preceding dump: the duplicate constants %c0_0 and %c0_i32_1
// no longer appear in @_split_reduction_pass2; their uses now reference %c0 and
// %c0_i32. The executable is textually unchanged.
module {
// Public entry point: only forwards to the private implementation below.
func.func @split_reduction_pass2() attributes {iree.abi.stub} {
call @_split_reduction_pass2() : () -> ()
return
}
stream.executable private @_split_reduction_pass2_dispatch_0 {
stream.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2
stream.return %x, %y, %z : index, index, index
}
builtin.module {
// Dispatch body: the two i32 operands are cast to index and used as the offsets
// of the binding subspans; the input is summed along d2 and stored.
func.func @_split_reduction_pass2_dispatch_0_generic_512x256x128(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: i32, %arg3: i32) {
%0 = arith.index_cast %arg2 : i32 to index
%1 = arith.index_cast %arg3 : i32 to index
%c0 = arith.constant 0 : index
%2 = arith.addi %c0, %0 : index
%3 = stream.binding.subspan %arg0[%2] : !stream.binding -> !flow.dispatch.tensor<readonly:512x256x128xi32>
%4 = arith.addi %c0, %1 : index
%5 = stream.binding.subspan %arg1[%4] : !stream.binding -> !flow.dispatch.tensor<writeonly:512x256xi32>
%6 = flow.dispatch.tensor.load %3, offsets = [0, 0, 0], sizes = [512, 256, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:512x256x128xi32> -> tensor<512x256x128xi32>
%7 = linalg.init_tensor [512, 256] : tensor<512x256xi32>
%8 = linalg.generic {indexing_maps = [#map0, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%6 : tensor<512x256x128xi32>) outs(%7 : tensor<512x256xi32>) {
^bb0(%arg4: i32, %arg5: i32):
%9 = arith.addi %arg4, %arg5 : i32
linalg.yield %9 : i32
} -> tensor<512x256xi32>
flow.dispatch.tensor.store %8, %5, offsets = [0, 0], sizes = [512, 256], strides = [1, 1] : tensor<512x256xi32> -> !flow.dispatch.tensor<writeonly:512x256xi32>
return
}
}
}
// Host-side function: fills the input with 1, runs the dispatch, and compares the
// region at offset 0 of the external resource with the region filled with 128.
func.func private @_split_reduction_pass2() {
%c1048576 = arith.constant 1048576 : index
%c0 = arith.constant 0 : index
%c524288 = arith.constant 524288 : index
%c67108864 = arith.constant 67108864 : index
%c1_i32 = arith.constant 1 : i32
%c128_i32 = arith.constant 128 : i32
%c512 = arith.constant 512 : index
%c256 = arith.constant 256 : index
%c1 = arith.constant 1 : index
%0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c67108864}
%1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c67108864}) {
stream.cmd.fill %c1_i32, %arg0[%c0 for %c67108864] : i32 -> !stream.resource<transient>{%c67108864}
} => !stream.timepoint
%2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c67108864}
%3 = util.do_not_optimize(%2) : !stream.resource<transient>
%4 = stream.resource.size %3 : !stream.resource<transient>
%5 = stream.resource.alloc uninitialized : !stream.resource<external>{%c1048576}
// Single remaining i32 zero; it is passed for both dispatch operands below.
%c0_i32 = arith.constant 0 : i32
%6 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%4}, %5 as %arg1: !stream.resource<external>{%c1048576}) {
stream.cmd.concurrent {
stream.cmd.dispatch @_split_reduction_pass2_dispatch_0::@_split_reduction_pass2_dispatch_0_generic_512x256x128[%c512, %c256, %c1](%c0_i32, %c0_i32 : i32, i32) {
ro %arg0[%c0 for %4] : !stream.resource<transient>{%4},
wo %arg1[%c0 for %c1048576] : !stream.resource<external>{%c1048576}
}
stream.cmd.fill %c128_i32, %arg1[%c524288 for %c524288] : i32 -> !stream.resource<external>{%c1048576}
}
} => !stream.timepoint
%7 = stream.timepoint.await %6 => %5 : !stream.resource<external>{%c1048576}
%8 = stream.resource.subview %7[%c0] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288}
%9 = stream.resource.subview %7[%c524288] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288}
%10 = stream.tensor.export %8 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
%11 = stream.tensor.export %9 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
check.expect_eq(%10, %11) : tensor<512x256xi32>
return
}
}
// -----// IR Dump After FoldUniformOperands (iree-stream-fold-uniform-operands) ('builtin.module' operation) //----- //
#map0 = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1)>
// Module state after iree-stream-fold-uniform-operands.
// Difference from the preceding dump: the dispatch function no longer takes the two
// i32 arguments (a local %c0_i32 constant is used instead), and the
// stream.cmd.dispatch op no longer carries an operand list.
module {
// Public entry point: only forwards to the private implementation below.
func.func @split_reduction_pass2() attributes {iree.abi.stub} {
call @_split_reduction_pass2() : () -> ()
return
}
stream.executable private @_split_reduction_pass2_dispatch_0 {
stream.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2
stream.return %x, %y, %z : index, index, index
}
builtin.module {
// Dispatch body: only the two bindings remain as arguments.
func.func @_split_reduction_pass2_dispatch_0_generic_512x256x128(%arg0: !stream.binding, %arg1: !stream.binding) {
// The former operands are now this in-function constant; the cast/add chain
// that consumed them is still present in this dump.
%c0_i32 = arith.constant 0 : i32
%0 = arith.index_cast %c0_i32 : i32 to index
%1 = arith.index_cast %c0_i32 : i32 to index
%c0 = arith.constant 0 : index
%2 = arith.addi %c0, %0 : index
%3 = stream.binding.subspan %arg0[%2] : !stream.binding -> !flow.dispatch.tensor<readonly:512x256x128xi32>
%4 = arith.addi %c0, %1 : index
%5 = stream.binding.subspan %arg1[%4] : !stream.binding -> !flow.dispatch.tensor<writeonly:512x256xi32>
%6 = flow.dispatch.tensor.load %3, offsets = [0, 0, 0], sizes = [512, 256, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:512x256x128xi32> -> tensor<512x256x128xi32>
%7 = linalg.init_tensor [512, 256] : tensor<512x256xi32>
// Block arguments are renumbered (%arg2/%arg3) because the i32 function
// arguments are gone; the computation is the same sum along d2.
%8 = linalg.generic {indexing_maps = [#map0, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%6 : tensor<512x256x128xi32>) outs(%7 : tensor<512x256xi32>) {
^bb0(%arg2: i32, %arg3: i32):
%9 = arith.addi %arg2, %arg3 : i32
linalg.yield %9 : i32
} -> tensor<512x256xi32>
flow.dispatch.tensor.store %8, %5, offsets = [0, 0], sizes = [512, 256], strides = [1, 1] : tensor<512x256xi32> -> !flow.dispatch.tensor<writeonly:512x256xi32>
return
}
}
}
// Host-side function: fills the input with 1, runs the dispatch, and compares the
// region at offset 0 of the external resource with the region filled with 128.
func.func private @_split_reduction_pass2() {
%c1048576 = arith.constant 1048576 : index
%c0 = arith.constant 0 : index
%c524288 = arith.constant 524288 : index
%c67108864 = arith.constant 67108864 : index
%c1_i32 = arith.constant 1 : i32
%c128_i32 = arith.constant 128 : i32
%c512 = arith.constant 512 : index
%c256 = arith.constant 256 : index
%c1 = arith.constant 1 : index
%0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c67108864}
%1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c67108864}) {
stream.cmd.fill %c1_i32, %arg0[%c0 for %c67108864] : i32 -> !stream.resource<transient>{%c67108864}
} => !stream.timepoint
%2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c67108864}
%3 = util.do_not_optimize(%2) : !stream.resource<transient>
%4 = stream.resource.size %3 : !stream.resource<transient>
%5 = stream.resource.alloc uninitialized : !stream.resource<external>{%c1048576}
// This constant has no remaining use in this function in this dump.
%c0_i32 = arith.constant 0 : i32
%6 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%4}, %5 as %arg1: !stream.resource<external>{%c1048576}) {
stream.cmd.concurrent {
stream.cmd.dispatch @_split_reduction_pass2_dispatch_0::@_split_reduction_pass2_dispatch_0_generic_512x256x128[%c512, %c256, %c1] {
ro %arg0[%c0 for %4] : !stream.resource<transient>{%4},
wo %arg1[%c0 for %c1048576] : !stream.resource<external>{%c1048576}
}
stream.cmd.fill %c128_i32, %arg1[%c524288 for %c524288] : i32 -> !stream.resource<external>{%c1048576}
}
} => !stream.timepoint
%7 = stream.timepoint.await %6 => %5 : !stream.resource<external>{%c1048576}
%8 = stream.resource.subview %7[%c0] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288}
%9 = stream.resource.subview %7[%c524288] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288}
%10 = stream.tensor.export %8 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
%11 = stream.tensor.export %9 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
check.expect_eq(%10, %11) : tensor<512x256xi32>
return
}
}
// -----// IR Dump After AnnotateDispatchArguments (iree-stream-annotate-dispatch-arguments) ('builtin.module' operation) //----- //
#map0 = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1)>
// Module state after iree-stream-annotate-dispatch-arguments.
// Difference from the preceding dump: both !stream.binding arguments of the
// dispatch function now carry {stream.alignment = 64 : index}. Everything else is
// textually unchanged.
module {
// Public entry point: only forwards to the private implementation below.
func.func @split_reduction_pass2() attributes {iree.abi.stub} {
call @_split_reduction_pass2() : () -> ()
return
}
stream.executable private @_split_reduction_pass2_dispatch_0 {
stream.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2
stream.return %x, %y, %z : index, index, index
}
builtin.module {
// Dispatch body: the alignment annotation is attached to each binding argument.
func.func @_split_reduction_pass2_dispatch_0_generic_512x256x128(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}) {
%c0_i32 = arith.constant 0 : i32
%0 = arith.index_cast %c0_i32 : i32 to index
%1 = arith.index_cast %c0_i32 : i32 to index
%c0 = arith.constant 0 : index
%2 = arith.addi %c0, %0 : index
%3 = stream.binding.subspan %arg0[%2] : !stream.binding -> !flow.dispatch.tensor<readonly:512x256x128xi32>
%4 = arith.addi %c0, %1 : index
%5 = stream.binding.subspan %arg1[%4] : !stream.binding -> !flow.dispatch.tensor<writeonly:512x256xi32>
%6 = flow.dispatch.tensor.load %3, offsets = [0, 0, 0], sizes = [512, 256, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:512x256x128xi32> -> tensor<512x256x128xi32>
%7 = linalg.init_tensor [512, 256] : tensor<512x256xi32>
// Sums the input along d2 (the "reduction" iterator) into a 512x256 result.
%8 = linalg.generic {indexing_maps = [#map0, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%6 : tensor<512x256x128xi32>) outs(%7 : tensor<512x256xi32>) {
^bb0(%arg2: i32, %arg3: i32):
%9 = arith.addi %arg2, %arg3 : i32
linalg.yield %9 : i32
} -> tensor<512x256xi32>
flow.dispatch.tensor.store %8, %5, offsets = [0, 0], sizes = [512, 256], strides = [1, 1] : tensor<512x256xi32> -> !flow.dispatch.tensor<writeonly:512x256xi32>
return
}
}
}
// Host-side function: fills the input with 1, runs the dispatch, and compares the
// region at offset 0 of the external resource with the region filled with 128.
func.func private @_split_reduction_pass2() {
%c1048576 = arith.constant 1048576 : index
%c0 = arith.constant 0 : index
%c524288 = arith.constant 524288 : index
%c67108864 = arith.constant 67108864 : index
%c1_i32 = arith.constant 1 : i32
%c128_i32 = arith.constant 128 : i32
%c512 = arith.constant 512 : index
%c256 = arith.constant 256 : index
%c1 = arith.constant 1 : index
%0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c67108864}
%1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c67108864}) {
stream.cmd.fill %c1_i32, %arg0[%c0 for %c67108864] : i32 -> !stream.resource<transient>{%c67108864}
} => !stream.timepoint
%2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c67108864}
%3 = util.do_not_optimize(%2) : !stream.resource<transient>
%4 = stream.resource.size %3 : !stream.resource<transient>
%5 = stream.resource.alloc uninitialized : !stream.resource<external>{%c1048576}
// This constant has no use in this function in this dump.
%c0_i32 = arith.constant 0 : i32
%6 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%4}, %5 as %arg1: !stream.resource<external>{%c1048576}) {
stream.cmd.concurrent {
stream.cmd.dispatch @_split_reduction_pass2_dispatch_0::@_split_reduction_pass2_dispatch_0_generic_512x256x128[%c512, %c256, %c1] {
ro %arg0[%c0 for %4] : !stream.resource<transient>{%4},
wo %arg1[%c0 for %c1048576] : !stream.resource<external>{%c1048576}
}
stream.cmd.fill %c128_i32, %arg1[%c524288 for %c524288] : i32 -> !stream.resource<external>{%c1048576}
}
} => !stream.timepoint
%7 = stream.timepoint.await %6 => %5 : !stream.resource<external>{%c1048576}
%8 = stream.resource.subview %7[%c0] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288}
%9 = stream.resource.subview %7[%c524288] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288}
%10 = stream.tensor.export %8 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
%11 = stream.tensor.export %9 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
check.expect_eq(%10, %11) : tensor<512x256xi32>
return
}
}
// -----// IR Dump After Canonicalizer (canonicalize) ('func.func' operation: @_split_reduction_pass2) //----- //
#map0 = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1)>
// Module state after the canonicalizer ran on func.func @_split_reduction_pass2
// (per the dump header).
// Difference from the preceding dump: the `%c0_i32` constant is gone from the host
// function. The executable, including its index_cast/addi chain, is unchanged.
module {
// Public entry point: only forwards to the private implementation below.
func.func @split_reduction_pass2() attributes {iree.abi.stub} {
call @_split_reduction_pass2() : () -> ()
return
}
stream.executable private @_split_reduction_pass2_dispatch_0 {
stream.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2
stream.return %x, %y, %z : index, index, index
}
builtin.module {
// Dispatch body: subspan offsets are still computed from a constant i32 zero.
func.func @_split_reduction_pass2_dispatch_0_generic_512x256x128(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}) {
%c0_i32 = arith.constant 0 : i32
%0 = arith.index_cast %c0_i32 : i32 to index
%1 = arith.index_cast %c0_i32 : i32 to index
%c0 = arith.constant 0 : index
%2 = arith.addi %c0, %0 : index
%3 = stream.binding.subspan %arg0[%2] : !stream.binding -> !flow.dispatch.tensor<readonly:512x256x128xi32>
%4 = arith.addi %c0, %1 : index
%5 = stream.binding.subspan %arg1[%4] : !stream.binding -> !flow.dispatch.tensor<writeonly:512x256xi32>
%6 = flow.dispatch.tensor.load %3, offsets = [0, 0, 0], sizes = [512, 256, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:512x256x128xi32> -> tensor<512x256x128xi32>
%7 = linalg.init_tensor [512, 256] : tensor<512x256xi32>
// Sums the input along d2 (the "reduction" iterator) into a 512x256 result.
%8 = linalg.generic {indexing_maps = [#map0, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%6 : tensor<512x256x128xi32>) outs(%7 : tensor<512x256xi32>) {
^bb0(%arg2: i32, %arg3: i32):
%9 = arith.addi %arg2, %arg3 : i32
linalg.yield %9 : i32
} -> tensor<512x256xi32>
flow.dispatch.tensor.store %8, %5, offsets = [0, 0], sizes = [512, 256], strides = [1, 1] : tensor<512x256xi32> -> !flow.dispatch.tensor<writeonly:512x256xi32>
return
}
}
}
// Host-side function: fills the input with 1, runs the dispatch, and compares the
// region at offset 0 of the external resource with the region filled with 128.
func.func private @_split_reduction_pass2() {
%c1048576 = arith.constant 1048576 : index
%c0 = arith.constant 0 : index
%c524288 = arith.constant 524288 : index
%c67108864 = arith.constant 67108864 : index
%c1_i32 = arith.constant 1 : i32
%c128_i32 = arith.constant 128 : i32
%c512 = arith.constant 512 : index
%c256 = arith.constant 256 : index
%c1 = arith.constant 1 : index
%0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c67108864}
%1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c67108864}) {
stream.cmd.fill %c1_i32, %arg0[%c0 for %c67108864] : i32 -> !stream.resource<transient>{%c67108864}
} => !stream.timepoint
%2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c67108864}
%3 = util.do_not_optimize(%2) : !stream.resource<transient>
%4 = stream.resource.size %3 : !stream.resource<transient>
%5 = stream.resource.alloc uninitialized : !stream.resource<external>{%c1048576}
%6 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%4}, %5 as %arg1: !stream.resource<external>{%c1048576}) {
stream.cmd.concurrent {
stream.cmd.dispatch @_split_reduction_pass2_dispatch_0::@_split_reduction_pass2_dispatch_0_generic_512x256x128[%c512, %c256, %c1] {
ro %arg0[%c0 for %4] : !stream.resource<transient>{%4},
wo %arg1[%c0 for %c1048576] : !stream.resource<external>{%c1048576}
}
stream.cmd.fill %c128_i32, %arg1[%c524288 for %c524288] : i32 -> !stream.resource<external>{%c1048576}
}
} => !stream.timepoint
%7 = stream.timepoint.await %6 => %5 : !stream.resource<external>{%c1048576}
%8 = stream.resource.subview %7[%c0] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288}
%9 = stream.resource.subview %7[%c524288] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288}
%10 = stream.tensor.export %8 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
%11 = stream.tensor.export %9 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
check.expect_eq(%10, %11) : tensor<512x256xi32>
return
}
}
// -----// IR Dump After ApplyPatterns (iree-util-apply-patterns) ('builtin.module' operation) //----- //
#map0 = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1)>
// Module state after iree-util-apply-patterns.
// Difference from the preceding dump: inside the dispatch function the constant
// i32 zero and its index_cast/addi chain are gone; both binding subspans now use
// %c0 directly, and the following SSA values are renumbered accordingly.
module {
// Public entry point: only forwards to the private implementation below.
func.func @split_reduction_pass2() attributes {iree.abi.stub} {
call @_split_reduction_pass2() : () -> ()
return
}
stream.executable private @_split_reduction_pass2_dispatch_0 {
stream.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2
stream.return %x, %y, %z : index, index, index
}
builtin.module {
// Dispatch body: both bindings are accessed at offset %c0.
func.func @_split_reduction_pass2_dispatch_0_generic_512x256x128(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}) {
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:512x256x128xi32>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:512x256xi32>
%2 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [512, 256, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:512x256x128xi32> -> tensor<512x256x128xi32>
%3 = linalg.init_tensor [512, 256] : tensor<512x256xi32>
// Sums the input along d2 (the "reduction" iterator) into a 512x256 result.
%4 = linalg.generic {indexing_maps = [#map0, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%2 : tensor<512x256x128xi32>) outs(%3 : tensor<512x256xi32>) {
^bb0(%arg2: i32, %arg3: i32):
%5 = arith.addi %arg2, %arg3 : i32
linalg.yield %5 : i32
} -> tensor<512x256xi32>
flow.dispatch.tensor.store %4, %1, offsets = [0, 0], sizes = [512, 256], strides = [1, 1] : tensor<512x256xi32> -> !flow.dispatch.tensor<writeonly:512x256xi32>
return
}
}
}
// Host-side function: fills the input with 1, runs the dispatch, and compares the
// region at offset 0 of the external resource with the region filled with 128.
func.func private @_split_reduction_pass2() {
%c1048576 = arith.constant 1048576 : index
%c0 = arith.constant 0 : index
%c524288 = arith.constant 524288 : index
%c67108864 = arith.constant 67108864 : index
%c1_i32 = arith.constant 1 : i32
%c128_i32 = arith.constant 128 : i32
%c512 = arith.constant 512 : index
%c256 = arith.constant 256 : index
%c1 = arith.constant 1 : index
%0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c67108864}
%1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c67108864}) {
stream.cmd.fill %c1_i32, %arg0[%c0 for %c67108864] : i32 -> !stream.resource<transient>{%c67108864}
} => !stream.timepoint
%2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c67108864}
%3 = util.do_not_optimize(%2) : !stream.resource<transient>
%4 = stream.resource.size %3 : !stream.resource<transient>
%5 = stream.resource.alloc uninitialized : !stream.resource<external>{%c1048576}
%6 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%4}, %5 as %arg1: !stream.resource<external>{%c1048576}) {
stream.cmd.concurrent {
stream.cmd.dispatch @_split_reduction_pass2_dispatch_0::@_split_reduction_pass2_dispatch_0_generic_512x256x128[%c512, %c256, %c1] {
ro %arg0[%c0 for %4] : !stream.resource<transient>{%4},
wo %arg1[%c0 for %c1048576] : !stream.resource<external>{%c1048576}
}
stream.cmd.fill %c128_i32, %arg1[%c524288 for %c524288] : i32 -> !stream.resource<external>{%c1048576}
}
} => !stream.timepoint
%7 = stream.timepoint.await %6 => %5 : !stream.resource<external>{%c1048576}
%8 = stream.resource.subview %7[%c0] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288}
%9 = stream.resource.subview %7[%c524288] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288}
%10 = stream.tensor.export %8 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
%11 = stream.tensor.export %9 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
check.expect_eq(%10, %11) : tensor<512x256xi32>
return
}
}
// -----// IR Dump After mlir::iree_compiler::IREE::HAL::AssignTargetDevicesPass (iree-hal-assign-target-devices) ('builtin.module' operation) //----- //
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
#map0 = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1)>
// Module state after iree-hal-assign-target-devices.
// Difference from the preceding dump: the module now carries the attribute
// hal.device.targets = [#device_target_llvm_cpu] (alias defined just above the
// module). The functions and the executable are textually unchanged.
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
// Public entry point: only forwards to the private implementation below.
func.func @split_reduction_pass2() attributes {iree.abi.stub} {
call @_split_reduction_pass2() : () -> ()
return
}
stream.executable private @_split_reduction_pass2_dispatch_0 {
stream.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2
stream.return %x, %y, %z : index, index, index
}
builtin.module {
// Dispatch body: both bindings are accessed at offset %c0.
func.func @_split_reduction_pass2_dispatch_0_generic_512x256x128(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}) {
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:512x256x128xi32>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:512x256xi32>
%2 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [512, 256, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:512x256x128xi32> -> tensor<512x256x128xi32>
%3 = linalg.init_tensor [512, 256] : tensor<512x256xi32>
// Sums the input along d2 (the "reduction" iterator) into a 512x256 result.
%4 = linalg.generic {indexing_maps = [#map0, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%2 : tensor<512x256x128xi32>) outs(%3 : tensor<512x256xi32>) {
^bb0(%arg2: i32, %arg3: i32):
%5 = arith.addi %arg2, %arg3 : i32
linalg.yield %5 : i32
} -> tensor<512x256xi32>
flow.dispatch.tensor.store %4, %1, offsets = [0, 0], sizes = [512, 256], strides = [1, 1] : tensor<512x256xi32> -> !flow.dispatch.tensor<writeonly:512x256xi32>
return
}
}
}
// Host-side function: fills the input with 1, runs the dispatch, and compares the
// region at offset 0 of the external resource with the region filled with 128.
func.func private @_split_reduction_pass2() {
%c1048576 = arith.constant 1048576 : index
%c0 = arith.constant 0 : index
%c524288 = arith.constant 524288 : index
%c67108864 = arith.constant 67108864 : index
%c1_i32 = arith.constant 1 : i32
%c128_i32 = arith.constant 128 : i32
%c512 = arith.constant 512 : index
%c256 = arith.constant 256 : index
%c1 = arith.constant 1 : index
%0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c67108864}
%1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c67108864}) {
stream.cmd.fill %c1_i32, %arg0[%c0 for %c67108864] : i32 -> !stream.resource<transient>{%c67108864}
} => !stream.timepoint
%2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c67108864}
%3 = util.do_not_optimize(%2) : !stream.resource<transient>
%4 = stream.resource.size %3 : !stream.resource<transient>
%5 = stream.resource.alloc uninitialized : !stream.resource<external>{%c1048576}
%6 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%4}, %5 as %arg1: !stream.resource<external>{%c1048576}) {
stream.cmd.concurrent {
stream.cmd.dispatch @_split_reduction_pass2_dispatch_0::@_split_reduction_pass2_dispatch_0_generic_512x256x128[%c512, %c256, %c1] {
ro %arg0[%c0 for %4] : !stream.resource<transient>{%4},
wo %arg1[%c0 for %c1048576] : !stream.resource<external>{%c1048576}
}
stream.cmd.fill %c128_i32, %arg1[%c524288 for %c524288] : i32 -> !stream.resource<external>{%c1048576}
}
} => !stream.timepoint
%7 = stream.timepoint.await %6 => %5 : !stream.resource<external>{%c1048576}
%8 = stream.resource.subview %7[%c0] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288}
%9 = stream.resource.subview %7[%c524288] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288}
%10 = stream.tensor.export %8 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
%11 = stream.tensor.export %9 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
check.expect_eq(%10, %11) : tensor<512x256xi32>
return
}
}
// -----// IR Dump After mlir::iree_compiler::IREE::HAL::(anonymous namespace)::MaterializeInterfacesPass (iree-hal-materialize-interfaces) ('builtin.module' operation) //----- //
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#map0 = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1)>
#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>
// Module state after iree-hal-materialize-interfaces.
// Differences from the preceding dump:
//  - stream.executable is replaced by hal.executable containing one
//    hal.executable.variant (@embedded_elf_x86_64) for the target alias above;
//  - the export gains ordinal(0) and layout(#pipeline_layout), and its region takes
//    a !hal.device argument ahead of the three workload indices;
//  - the dispatch function takes no arguments; bindings are obtained through
//    hal.interface.binding.subspan set(0) binding(0|1);
//  - the host-side stream.cmd.dispatch gains a hal.interface.bindings attribute.
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
// Public entry point: only forwards to the private implementation below.
func.func @split_reduction_pass2() attributes {iree.abi.stub} {
call @_split_reduction_pass2() : () -> ()
return
}
hal.executable private @_split_reduction_pass2_dispatch_0 {
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
// Export region: workgroup counts are still derived from the workload values
// (%arg1..%arg3); %arg0 (the device) is not used in this region.
hal.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 ordinal(0) layout(#pipeline_layout) {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index):
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg2, %arg3
hal.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @_split_reduction_pass2_dispatch_0_generic_512x256x128() {
%c0 = arith.constant 0 : index
// binding(0) is the read-only 512x256x128 input, binding(1) the write-only
// 512x256 output; both carry the alignment(64) seen on the former arguments.
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:512x256x128xi32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:512x256xi32>
%2 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [512, 256, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:512x256x128xi32> -> tensor<512x256x128xi32>
// NOTE(review): %3 comes from linalg.init_tensor and nothing in this function
// fills it before it is used as the reduction accumulator below - confirm that
// this is intended by the test input.
%3 = linalg.init_tensor [512, 256] : tensor<512x256xi32>
// Sums the input along d2 (the "reduction" iterator) into a 512x256 result.
%4 = linalg.generic {indexing_maps = [#map0, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%2 : tensor<512x256x128xi32>) outs(%3 : tensor<512x256xi32>) {
^bb0(%arg0: i32, %arg1: i32):
%5 = arith.addi %arg0, %arg1 : i32
linalg.yield %5 : i32
} -> tensor<512x256xi32>
flow.dispatch.tensor.store %4, %1, offsets = [0, 0], sizes = [512, 256], strides = [1, 1] : tensor<512x256xi32> -> !flow.dispatch.tensor<writeonly:512x256xi32>
return
}
}
}
}
// Host-side function: fills the input with 1, runs the dispatch, and compares the
// region at offset 0 of the external resource with the region filled with 128.
func.func private @_split_reduction_pass2() {
%c1048576 = arith.constant 1048576 : index
%c0 = arith.constant 0 : index
%c524288 = arith.constant 524288 : index
%c67108864 = arith.constant 67108864 : index
%c1_i32 = arith.constant 1 : i32
%c128_i32 = arith.constant 128 : i32
%c512 = arith.constant 512 : index
%c256 = arith.constant 256 : index
%c1 = arith.constant 1 : index
%0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c67108864}
%1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c67108864}) {
stream.cmd.fill %c1_i32, %arg0[%c0 for %c67108864] : i32 -> !stream.resource<transient>{%c67108864}
} => !stream.timepoint
%2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c67108864}
%3 = util.do_not_optimize(%2) : !stream.resource<transient>
%4 = stream.resource.size %3 : !stream.resource<transient>
%5 = stream.resource.alloc uninitialized : !stream.resource<external>{%c1048576}
%6 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%4}, %5 as %arg1: !stream.resource<external>{%c1048576}) {
stream.cmd.concurrent {
// NOTE(review): the write-only binding spans [0 for 1048576], which overlaps the
// [524288 for 524288] range written by the concurrent fill below; the dispatch
// body stores only a 512x256xi32 tensor at offset 0. Whether the wide binding
// range is intentional is not visible here - confirm.
stream.cmd.dispatch @_split_reduction_pass2_dispatch_0::@_split_reduction_pass2_dispatch_0_generic_512x256x128[%c512, %c256, %c1] {
ro %arg0[%c0 for %4] : !stream.resource<transient>{%4},
wo %arg1[%c0 for %c1048576] : !stream.resource<external>{%c1048576}
} attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>]}
stream.cmd.fill %c128_i32, %arg1[%c524288 for %c524288] : i32 -> !stream.resource<external>{%c1048576}
}
} => !stream.timepoint
%7 = stream.timepoint.await %6 => %5 : !stream.resource<external>{%c1048576}
%8 = stream.resource.subview %7[%c0] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288}
%9 = stream.resource.subview %7[%c524288] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288}
%10 = stream.tensor.export %8 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
%11 = stream.tensor.export %9 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
check.expect_eq(%10, %11) : tensor<512x256xi32>
return
}
}
// -----// IR Dump After TileAndDistributeToWorkgroups (iree-codegen-tile-and-distribute-to-workgroups) ('hal.executable.variant' operation: @embedded_elf_x86_64) //----- //
#config = #iree_codegen.lowering_config<tile_sizes = [[32, 32, 0], [1, 4, 0], [0, 0, 4]]>
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#map0 = affine_map<()[s0] -> (s0 * 32)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map2 = affine_map<(d0, d1, d2) -> (d0, d1)>
#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
// Public entry point (tagged iree.abi.stub): takes no arguments, returns
// nothing, and only forwards to the private @_split_reduction_pass2.
func.func @split_reduction_pass2() attributes {iree.abi.stub} {
call @_split_reduction_pass2() : () -> ()
return
}
// Executable holding the single dispatch entry point. In this snapshot the
// reduction body is wrapped in two scf.for loops that walk 32x32 output tiles.
hal.executable private @_split_reduction_pass2_dispatch_0 {
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
hal.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 ordinal(0) layout(#pipeline_layout) attributes {translation_info = #translation} {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index):
// Workgroup count region: returns (8, 16, 1). 8 = 256/32 and 16 = 512/32,
// i.e. one workgroup per 32x32 tile of the 512x256 output.
%c1 = arith.constant 1 : index
%c16 = arith.constant 16 : index
%c8 = arith.constant 8 : index
hal.return %c8, %c16, %c1 : index, index, index
}
builtin.module {
func.func @_split_reduction_pass2_dispatch_0_generic_512x256x128() {
%c32 = arith.constant 32 : index
%c512 = arith.constant 512 : index
%c256 = arith.constant 256 : index
%c0 = arith.constant 0 : index
// Binding 0 is the read-only 512x256x128xi32 input; binding 1 is the
// write-only 512x256xi32 output.
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:512x256x128xi32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:512x256xi32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
// Rows (dim 0, extent 512) are distributed over workgroup y: start at
// id_y * 32 and advance by count_y * 32.
%2 = affine.apply #map0()[%workgroup_id_y]
%3 = affine.apply #map0()[%workgroup_count_y]
scf.for %arg0 = %2 to %c512 step %3 {
// Columns (dim 1, extent 256) are distributed the same way over workgroup x.
%4 = affine.apply #map0()[%workgroup_id_x]
%5 = affine.apply #map0()[%workgroup_count_x]
scf.for %arg1 = %4 to %c256 step %5 {
// Load the 32x32x128 input slice for this tile; the sizes are passed as SSA
// values, so the result type is dynamic and is cast back to static below.
%6 = flow.dispatch.tensor.load %0, offsets = [%arg0, %arg1, 0], sizes = [%c32, %c32, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:512x256x128xi32> -> tensor<?x?x128xi32>
// NOTE(review): the accumulator passed as outs below is this init_tensor and
// no fill of it is visible in this function, while the body adds the input
// element (%arg2) to the current accumulator element (%arg3). Confirm the
// source program intended an unfilled tensor as the reduction init.
%7 = linalg.init_tensor [32, 32] : tensor<32x32xi32>
%8 = tensor.cast %6 : tensor<?x?x128xi32> to tensor<32x32x128xi32>
// Sum over the last dimension (iterator type "reduction") into a 32x32 tile.
%9 = linalg.generic {indexing_maps = [#map1, #map2], iterator_types = ["parallel", "parallel", "reduction"]} ins(%8 : tensor<32x32x128xi32>) outs(%7 : tensor<32x32xi32>) attrs = {lowering_config = #config} {
^bb0(%arg2: i32, %arg3: i32):
%11 = arith.addi %arg2, %arg3 : i32
linalg.yield %11 : i32
} -> tensor<32x32xi32>
%10 = tensor.cast %9 : tensor<32x32xi32> to tensor<?x?xi32>
// Write the reduced tile to the output binding at [%arg0, %arg1].
flow.dispatch.tensor.store %10, %1, offsets = [%arg0, %arg1], sizes = [%c32, %c32], strides = [1, 1] : tensor<?x?xi32> -> !flow.dispatch.tensor<writeonly:512x256xi32>
}
}
return
}
}
}
}
// Host-side driver for the test: builds the input, runs the dispatch, and
// compares the dispatch output against a constant reference.
func.func private @_split_reduction_pass2() {
%c1048576 = arith.constant 1048576 : index
%c0 = arith.constant 0 : index
%c524288 = arith.constant 524288 : index
%c67108864 = arith.constant 67108864 : index
%c1_i32 = arith.constant 1 : i32
%c128_i32 = arith.constant 128 : i32
%c512 = arith.constant 512 : index
%c256 = arith.constant 256 : index
%c1 = arith.constant 1 : index
// Input buffer: 67108864 bytes = 512*256*128 i32 elements, all filled with 1.
%0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c67108864}
%1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c67108864}) {
stream.cmd.fill %c1_i32, %arg0[%c0 for %c67108864] : i32 -> !stream.resource<transient>{%c67108864}
} => !stream.timepoint
%2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c67108864}
// After do_not_optimize the size is re-queried (%4) rather than reusing
// %c67108864.
%3 = util.do_not_optimize(%2) : !stream.resource<transient>
%4 = stream.resource.size %3 : !stream.resource<transient>
// One 1048576-byte external buffer used as two 524288-byte halves
// (524288 = 512*256 i32 elements): [0, 524288) and [524288, 1048576).
%5 = stream.resource.alloc uninitialized : !stream.resource<external>{%c1048576}
%6 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%4}, %5 as %arg1: !stream.resource<external>{%c1048576}) {
stream.cmd.concurrent {
// Dispatch with workload [512, 256, 1]; the input is bound read-only and the
// external buffer write-only.
// NOTE(review): the wo binding range is [0 for 1048576], which also covers
// the region the concurrent fill below writes. The dispatch's output type is
// 512x256xi32, so presumably only the first half is touched - confirm.
stream.cmd.dispatch @_split_reduction_pass2_dispatch_0::@_split_reduction_pass2_dispatch_0_generic_512x256x128[%c512, %c256, %c1] {
ro %arg0[%c0 for %4] : !stream.resource<transient>{%4},
wo %arg1[%c0 for %c1048576] : !stream.resource<external>{%c1048576}
} attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>]}
// Reference values: fill the second half with i32 128.
stream.cmd.fill %c128_i32, %arg1[%c524288 for %c524288] : i32 -> !stream.resource<external>{%c1048576}
}
} => !stream.timepoint
%7 = stream.timepoint.await %6 => %5 : !stream.resource<external>{%c1048576}
// %8 views the first half (offset 0), %9 the second half (the 128 fill).
%8 = stream.resource.subview %7[%c0] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288}
%9 = stream.resource.subview %7[%c524288] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288}
%10 = stream.tensor.export %8 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
%11 = stream.tensor.export %9 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
// Both halves, viewed as 512x256xi32 tensors, must be equal.
check.expect_eq(%10, %11) : tensor<512x256xi32>
return
}
}
// -----// IR Dump After ConvertToDestinationPassingStyle (iree-codegen-convert-to-destination-passing-style) ('func.func' operation: @_split_reduction_pass2_dispatch_0_generic_512x256x128) //----- //
#config = #iree_codegen.lowering_config<tile_sizes = [[32, 32, 0], [1, 4, 0], [0, 0, 4]]>
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#map0 = affine_map<()[s0] -> (s0 * 32)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map2 = affine_map<(d0, d1, d2) -> (d0, d1)>
#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
// Public entry point (tagged iree.abi.stub): takes no arguments, returns
// nothing, and only forwards to the private @_split_reduction_pass2.
func.func @split_reduction_pass2() attributes {iree.abi.stub} {
call @_split_reduction_pass2() : () -> ()
return
}
// Executable holding the single dispatch entry point. In this snapshot the
// reduction's outs operand is a tile loaded from the output binding instead
// of a fresh linalg.init_tensor.
hal.executable private @_split_reduction_pass2_dispatch_0 {
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
hal.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 ordinal(0) layout(#pipeline_layout) attributes {translation_info = #translation} {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index):
// Workgroup count region: returns (8, 16, 1). 8 = 256/32 and 16 = 512/32,
// i.e. one workgroup per 32x32 tile of the 512x256 output.
%c1 = arith.constant 1 : index
%c16 = arith.constant 16 : index
%c8 = arith.constant 8 : index
hal.return %c8, %c16, %c1 : index, index, index
}
builtin.module {
func.func @_split_reduction_pass2_dispatch_0_generic_512x256x128() {
%c32 = arith.constant 32 : index
%c512 = arith.constant 512 : index
%c256 = arith.constant 256 : index
%c0 = arith.constant 0 : index
// Binding 0 is the read-only 512x256x128xi32 input; binding 1 is the
// write-only 512x256xi32 output.
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:512x256x128xi32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:512x256xi32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
// Rows (dim 0, extent 512) are distributed over workgroup y: start at
// id_y * 32 and advance by count_y * 32.
%2 = affine.apply #map0()[%workgroup_id_y]
%3 = affine.apply #map0()[%workgroup_count_y]
scf.for %arg0 = %2 to %c512 step %3 {
// Columns (dim 1, extent 256) are distributed the same way over workgroup x.
%4 = affine.apply #map0()[%workgroup_id_x]
%5 = affine.apply #map0()[%workgroup_count_x]
scf.for %arg1 = %4 to %c256 step %5 {
// %c32_0 duplicates %c32 defined at function scope; it is only used by the
// load on the next line.
%c32_0 = arith.constant 32 : index
// NOTE(review): this loads the 32x32 accumulator tile from %1, whose type is
// !flow.dispatch.tensor<writeonly:...>, and the generic below adds into it
// (%arg3 is the outs element). Nothing in this function initializes that
// region first - confirm the buffer contents are defined before the dispatch.
%6 = flow.dispatch.tensor.load %1, offsets = [%arg0, %arg1], sizes = [%c32_0, %c32_0], strides = [1, 1] : !flow.dispatch.tensor<writeonly:512x256xi32> -> tensor<?x?xi32>
%7 = tensor.cast %6 : tensor<?x?xi32> to tensor<32x32xi32>
// Load the 32x32x128 input slice for this tile.
%8 = flow.dispatch.tensor.load %0, offsets = [%arg0, %arg1, 0], sizes = [%c32, %c32, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:512x256x128xi32> -> tensor<?x?x128xi32>
// %9 has no remaining uses in this function (outs below is %7).
%9 = linalg.init_tensor [32, 32] : tensor<32x32xi32>
%10 = tensor.cast %8 : tensor<?x?x128xi32> to tensor<32x32x128xi32>
// Sum over the last dimension (iterator type "reduction") into the 32x32 tile.
%11 = linalg.generic {indexing_maps = [#map1, #map2], iterator_types = ["parallel", "parallel", "reduction"]} ins(%10 : tensor<32x32x128xi32>) outs(%7 : tensor<32x32xi32>) attrs = {lowering_config = #config} {
^bb0(%arg2: i32, %arg3: i32):
%13 = arith.addi %arg2, %arg3 : i32
linalg.yield %13 : i32
} -> tensor<32x32xi32>
%12 = tensor.cast %11 : tensor<32x32xi32> to tensor<?x?xi32>
// Write the reduced tile back to the same [%arg0, %arg1] window it was loaded from.
flow.dispatch.tensor.store %12, %1, offsets = [%arg0, %arg1], sizes = [%c32, %c32], strides = [1, 1] : tensor<?x?xi32> -> !flow.dispatch.tensor<writeonly:512x256xi32>
}
}
return
}
}
}
}
// Host-side driver for the test: builds the input, runs the dispatch, and
// compares the dispatch output against a constant reference.
func.func private @_split_reduction_pass2() {
%c1048576 = arith.constant 1048576 : index
%c0 = arith.constant 0 : index
%c524288 = arith.constant 524288 : index
%c67108864 = arith.constant 67108864 : index
%c1_i32 = arith.constant 1 : i32
%c128_i32 = arith.constant 128 : i32
%c512 = arith.constant 512 : index
%c256 = arith.constant 256 : index
%c1 = arith.constant 1 : index
// Input buffer: 67108864 bytes = 512*256*128 i32 elements, all filled with 1.
%0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c67108864}
%1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c67108864}) {
stream.cmd.fill %c1_i32, %arg0[%c0 for %c67108864] : i32 -> !stream.resource<transient>{%c67108864}
} => !stream.timepoint
%2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c67108864}
// After do_not_optimize the size is re-queried (%4) rather than reusing
// %c67108864.
%3 = util.do_not_optimize(%2) : !stream.resource<transient>
%4 = stream.resource.size %3 : !stream.resource<transient>
// One 1048576-byte external buffer used as two 524288-byte halves
// (524288 = 512*256 i32 elements): [0, 524288) and [524288, 1048576).
%5 = stream.resource.alloc uninitialized : !stream.resource<external>{%c1048576}
%6 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%4}, %5 as %arg1: !stream.resource<external>{%c1048576}) {
stream.cmd.concurrent {
// Dispatch with workload [512, 256, 1]; the input is bound read-only and the
// external buffer write-only.
// NOTE(review): the wo binding range is [0 for 1048576], which also covers
// the region the concurrent fill below writes. The dispatch's output type is
// 512x256xi32, so presumably only the first half is touched - confirm.
stream.cmd.dispatch @_split_reduction_pass2_dispatch_0::@_split_reduction_pass2_dispatch_0_generic_512x256x128[%c512, %c256, %c1] {
ro %arg0[%c0 for %4] : !stream.resource<transient>{%4},
wo %arg1[%c0 for %c1048576] : !stream.resource<external>{%c1048576}
} attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>]}
// Reference values: fill the second half with i32 128.
stream.cmd.fill %c128_i32, %arg1[%c524288 for %c524288] : i32 -> !stream.resource<external>{%c1048576}
}
} => !stream.timepoint
%7 = stream.timepoint.await %6 => %5 : !stream.resource<external>{%c1048576}
// %8 views the first half (offset 0), %9 the second half (the 128 fill).
%8 = stream.resource.subview %7[%c0] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288}
%9 = stream.resource.subview %7[%c524288] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288}
%10 = stream.tensor.export %8 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
%11 = stream.tensor.export %9 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
// Both halves, viewed as 512x256xi32 tensors, must be equal.
check.expect_eq(%10, %11) : tensor<512x256xi32>
return
}
}
// -----// IR Dump After FoldAffineMinInDistributedLoops (iree-codegen-fold-affinemin-in-distributed-loops) ('func.func' operation: @_split_reduction_pass2_dispatch_0_generic_512x256x128) //----- //
#config = #iree_codegen.lowering_config<tile_sizes = [[32, 32, 0], [1, 4, 0], [0, 0, 4]]>
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#map0 = affine_map<()[s0] -> (s0 * 32)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map2 = affine_map<(d0, d1, d2) -> (d0, d1)>
#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
// Public entry point (tagged iree.abi.stub): takes no arguments, returns
// nothing, and only forwards to the private @_split_reduction_pass2.
func.func @split_reduction_pass2() attributes {iree.abi.stub} {
call @_split_reduction_pass2() : () -> ()
return
}
// Executable holding the single dispatch entry point. In this snapshot the
// loop body contains only the two tile loads, the reduction, and the store;
// all 32-sized operands use the function-level %c32.
hal.executable private @_split_reduction_pass2_dispatch_0 {
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
hal.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 ordinal(0) layout(#pipeline_layout) attributes {translation_info = #translation} {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index):
// Workgroup count region: returns (8, 16, 1). 8 = 256/32 and 16 = 512/32,
// i.e. one workgroup per 32x32 tile of the 512x256 output.
%c1 = arith.constant 1 : index
%c16 = arith.constant 16 : index
%c8 = arith.constant 8 : index
hal.return %c8, %c16, %c1 : index, index, index
}
builtin.module {
func.func @_split_reduction_pass2_dispatch_0_generic_512x256x128() {
%c32 = arith.constant 32 : index
%c512 = arith.constant 512 : index
%c256 = arith.constant 256 : index
%c0 = arith.constant 0 : index
// Binding 0 is the read-only 512x256x128xi32 input; binding 1 is the
// write-only 512x256xi32 output.
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:512x256x128xi32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:512x256xi32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
// Rows (dim 0, extent 512) are distributed over workgroup y: start at
// id_y * 32 and advance by count_y * 32.
%2 = affine.apply #map0()[%workgroup_id_y]
%3 = affine.apply #map0()[%workgroup_count_y]
scf.for %arg0 = %2 to %c512 step %3 {
// Columns (dim 1, extent 256) are distributed the same way over workgroup x.
%4 = affine.apply #map0()[%workgroup_id_x]
%5 = affine.apply #map0()[%workgroup_count_x]
scf.for %arg1 = %4 to %c256 step %5 {
// NOTE(review): the accumulator tile is loaded from %1, whose type is
// !flow.dispatch.tensor<writeonly:...>, and the generic below adds into it
// (%arg3 is the outs element). Nothing in this function initializes that
// region first - confirm the buffer contents are defined before the dispatch.
%6 = flow.dispatch.tensor.load %1, offsets = [%arg0, %arg1], sizes = [%c32, %c32], strides = [1, 1] : !flow.dispatch.tensor<writeonly:512x256xi32> -> tensor<?x?xi32>
%7 = tensor.cast %6 : tensor<?x?xi32> to tensor<32x32xi32>
// Load the 32x32x128 input slice for this tile and cast it to a static shape.
%8 = flow.dispatch.tensor.load %0, offsets = [%arg0, %arg1, 0], sizes = [%c32, %c32, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:512x256x128xi32> -> tensor<?x?x128xi32>
%9 = tensor.cast %8 : tensor<?x?x128xi32> to tensor<32x32x128xi32>
// Sum over the last dimension (iterator type "reduction") into the 32x32 tile.
%10 = linalg.generic {indexing_maps = [#map1, #map2], iterator_types = ["parallel", "parallel", "reduction"]} ins(%9 : tensor<32x32x128xi32>) outs(%7 : tensor<32x32xi32>) attrs = {lowering_config = #config} {
^bb0(%arg2: i32, %arg3: i32):
%12 = arith.addi %arg2, %arg3 : i32
linalg.yield %12 : i32
} -> tensor<32x32xi32>
%11 = tensor.cast %10 : tensor<32x32xi32> to tensor<?x?xi32>
// Write the reduced tile back to the same [%arg0, %arg1] window it was loaded from.
flow.dispatch.tensor.store %11, %1, offsets = [%arg0, %arg1], sizes = [%c32, %c32], strides = [1, 1] : tensor<?x?xi32> -> !flow.dispatch.tensor<writeonly:512x256xi32>
}
}
return
}
}
}
}
// Host-side driver for the test: builds the input, runs the dispatch, and
// compares the dispatch output against a constant reference.
func.func private @_split_reduction_pass2() {
%c1048576 = arith.constant 1048576 : index
%c0 = arith.constant 0 : index
%c524288 = arith.constant 524288 : index
%c67108864 = arith.constant 67108864 : index
%c1_i32 = arith.constant 1 : i32
%c128_i32 = arith.constant 128 : i32
%c512 = arith.constant 512 : index
%c256 = arith.constant 256 : index
%c1 = arith.constant 1 : index
// Input buffer: 67108864 bytes = 512*256*128 i32 elements, all filled with 1.
%0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c67108864}
%1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c67108864}) {
stream.cmd.fill %c1_i32, %arg0[%c0 for %c67108864] : i32 -> !stream.resource<transient>{%c67108864}
} => !stream.timepoint
%2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c67108864}
// After do_not_optimize the size is re-queried (%4) rather than reusing
// %c67108864.
%3 = util.do_not_optimize(%2) : !stream.resource<transient>
%4 = stream.resource.size %3 : !stream.resource<transient>
// One 1048576-byte external buffer used as two 524288-byte halves
// (524288 = 512*256 i32 elements): [0, 524288) and [524288, 1048576).
%5 = stream.resource.alloc uninitialized : !stream.resource<external>{%c1048576}
%6 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%4}, %5 as %arg1: !stream.resource<external>{%c1048576}) {
stream.cmd.concurrent {
// Dispatch with workload [512, 256, 1]; the input is bound read-only and the
// external buffer write-only.
// NOTE(review): the wo binding range is [0 for 1048576], which also covers
// the region the concurrent fill below writes. The dispatch's output type is
// 512x256xi32, so presumably only the first half is touched - confirm.
stream.cmd.dispatch @_split_reduction_pass2_dispatch_0::@_split_reduction_pass2_dispatch_0_generic_512x256x128[%c512, %c256, %c1] {
ro %arg0[%c0 for %4] : !stream.resource<transient>{%4},
wo %arg1[%c0 for %c1048576] : !stream.resource<external>{%c1048576}
} attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>]}
// Reference values: fill the second half with i32 128.
stream.cmd.fill %c128_i32, %arg1[%c524288 for %c524288] : i32 -> !stream.resource<external>{%c1048576}
}
} => !stream.timepoint
%7 = stream.timepoint.await %6 => %5 : !stream.resource<external>{%c1048576}
// %8 views the first half (offset 0), %9 the second half (the 128 fill).
%8 = stream.resource.subview %7[%c0] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288}
%9 = stream.resource.subview %7[%c524288] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288}
%10 = stream.tensor.export %8 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
%11 = stream.tensor.export %9 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
// Both halves, viewed as 512x256xi32 tensors, must be equal.
check.expect_eq(%10, %11) : tensor<512x256xi32>
return
}
}
// -----// IR Dump After Canonicalizer (canonicalize) ('builtin.module' operation) //----- //
#config = #iree_codegen.lowering_config<tile_sizes = [[32, 32, 0], [1, 4, 0], [0, 0, 4]]>
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#map0 = affine_map<()[s0] -> (s0 * 32)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map2 = affine_map<(d0, d1, d2) -> (d0, d1)>
#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
// Public entry point (tagged iree.abi.stub): takes no arguments, returns
// nothing, and only forwards to the private @_split_reduction_pass2.
func.func @split_reduction_pass2() attributes {iree.abi.stub} {
call @_split_reduction_pass2() : () -> ()
return
}
// Executable holding the single dispatch entry point. In this snapshot the
// tile loads and store use static sizes ([32, 32] / [32, 32, 128]) and
// statically shaped tensors, with no tensor.cast ops in the loop body.
hal.executable private @_split_reduction_pass2_dispatch_0 {
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
hal.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 ordinal(0) layout(#pipeline_layout) attributes {translation_info = #translation} {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index):
// Workgroup count region: returns (8, 16, 1). 8 = 256/32 and 16 = 512/32,
// i.e. one workgroup per 32x32 tile of the 512x256 output.
%c1 = arith.constant 1 : index
%c16 = arith.constant 16 : index
%c8 = arith.constant 8 : index
hal.return %c8, %c16, %c1 : index, index, index
}
builtin.module {
func.func @_split_reduction_pass2_dispatch_0_generic_512x256x128() {
%c512 = arith.constant 512 : index
%c256 = arith.constant 256 : index
%c0 = arith.constant 0 : index
// Binding 0 is the read-only 512x256x128xi32 input; binding 1 is the
// write-only 512x256xi32 output.
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:512x256x128xi32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:512x256xi32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
// Rows (dim 0, extent 512) are distributed over workgroup y: start at
// id_y * 32 and advance by count_y * 32.
%2 = affine.apply #map0()[%workgroup_id_y]
%3 = affine.apply #map0()[%workgroup_count_y]
scf.for %arg0 = %2 to %c512 step %3 {
// Columns (dim 1, extent 256) are distributed the same way over workgroup x.
%4 = affine.apply #map0()[%workgroup_id_x]
%5 = affine.apply #map0()[%workgroup_count_x]
scf.for %arg1 = %4 to %c256 step %5 {
// NOTE(review): the accumulator tile is loaded from %1, whose type is
// !flow.dispatch.tensor<writeonly:...>, and the generic below adds into it
// (%arg3 is the outs element). Nothing in this function initializes that
// region first - confirm the buffer contents are defined before the dispatch.
%6 = flow.dispatch.tensor.load %1, offsets = [%arg0, %arg1], sizes = [32, 32], strides = [1, 1] : !flow.dispatch.tensor<writeonly:512x256xi32> -> tensor<32x32xi32>
// Load the 32x32x128 input slice for this tile.
%7 = flow.dispatch.tensor.load %0, offsets = [%arg0, %arg1, 0], sizes = [32, 32, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:512x256x128xi32> -> tensor<32x32x128xi32>
// Sum over the last dimension (iterator type "reduction") into the 32x32 tile.
%8 = linalg.generic {indexing_maps = [#map1, #map2], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7 : tensor<32x32x128xi32>) outs(%6 : tensor<32x32xi32>) attrs = {lowering_config = #config} {
^bb0(%arg2: i32, %arg3: i32):
%9 = arith.addi %arg2, %arg3 : i32
linalg.yield %9 : i32
} -> tensor<32x32xi32>
// Write the reduced tile back to the same [%arg0, %arg1] window it was loaded from.
flow.dispatch.tensor.store %8, %1, offsets = [%arg0, %arg1], sizes = [32, 32], strides = [1, 1] : tensor<32x32xi32> -> !flow.dispatch.tensor<writeonly:512x256xi32>
}
}
return
}
}
}
}
// Host-side driver for the test: builds the input, runs the dispatch, and
// compares the dispatch output against a constant reference.
func.func private @_split_reduction_pass2() {
%c1048576 = arith.constant 1048576 : index
%c0 = arith.constant 0 : index
%c524288 = arith.constant 524288 : index
%c67108864 = arith.constant 67108864 : index
%c1_i32 = arith.constant 1 : i32
%c128_i32 = arith.constant 128 : i32
%c512 = arith.constant 512 : index
%c256 = arith.constant 256 : index
%c1 = arith.constant 1 : index
// Input buffer: 67108864 bytes = 512*256*128 i32 elements, all filled with 1.
%0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c67108864}
%1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c67108864}) {
stream.cmd.fill %c1_i32, %arg0[%c0 for %c67108864] : i32 -> !stream.resource<transient>{%c67108864}
} => !stream.timepoint
%2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c67108864}
// After do_not_optimize the size is re-queried (%4) rather than reusing
// %c67108864.
%3 = util.do_not_optimize(%2) : !stream.resource<transient>
%4 = stream.resource.size %3 : !stream.resource<transient>
// One 1048576-byte external buffer used as two 524288-byte halves
// (524288 = 512*256 i32 elements): [0, 524288) and [524288, 1048576).
%5 = stream.resource.alloc uninitialized : !stream.resource<external>{%c1048576}
%6 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%4}, %5 as %arg1: !stream.resource<external>{%c1048576}) {
stream.cmd.concurrent {
// Dispatch with workload [512, 256, 1]; the input is bound read-only and the
// external buffer write-only.
// NOTE(review): the wo binding range is [0 for 1048576], which also covers
// the region the concurrent fill below writes. The dispatch's output type is
// 512x256xi32, so presumably only the first half is touched - confirm.
stream.cmd.dispatch @_split_reduction_pass2_dispatch_0::@_split_reduction_pass2_dispatch_0_generic_512x256x128[%c512, %c256, %c1] {
ro %arg0[%c0 for %4] : !stream.resource<transient>{%4},
wo %arg1[%c0 for %c1048576] : !stream.resource<external>{%c1048576}
} attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>]}
// Reference values: fill the second half with i32 128.
stream.cmd.fill %c128_i32, %arg1[%c524288 for %c524288] : i32 -> !stream.resource<external>{%c1048576}
}
} => !stream.timepoint
%7 = stream.timepoint.await %6 => %5 : !stream.resource<external>{%c1048576}
// %8 views the first half (offset 0), %9 the second half (the 128 fill).
%8 = stream.resource.subview %7[%c0] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288}
%9 = stream.resource.subview %7[%c524288] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288}
%10 = stream.tensor.export %8 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
%11 = stream.tensor.export %9 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
// Both halves, viewed as 512x256xi32 tensors, must be equal.
check.expect_eq(%10, %11) : tensor<512x256xi32>
return
}
}
// -----// IR Dump After LinalgStrategyTileAndFusePass (iree-linalg-strategy-tile-and-fuse-pass) ('func.func' operation: @_split_reduction_pass2_dispatch_0_generic_512x256x128) //----- //
#config = #iree_codegen.lowering_config<tile_sizes = [[32, 32, 0], [1, 4, 0], [0, 0, 4]]>
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#map0 = affine_map<()[s0] -> (s0 * 32)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map2 = affine_map<(d0, d1, d2) -> (d0, d1)>
#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
// Public entry point (tagged iree.abi.stub): takes no arguments, returns
// nothing, and only forwards to the private @_split_reduction_pass2.
func.func @split_reduction_pass2() attributes {iree.abi.stub} {
call @_split_reduction_pass2() : () -> ()
return
}
// Executable holding the single dispatch entry point. In this snapshot each
// 32x32 workgroup tile is further tiled by two inner scf.for loops into 1x4
// pieces, with the tile threaded through the loops as an iter_arg.
hal.executable private @_split_reduction_pass2_dispatch_0 {
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
hal.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 ordinal(0) layout(#pipeline_layout) attributes {translation_info = #translation} {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index):
// Workgroup count region: returns (8, 16, 1). 8 = 256/32 and 16 = 512/32,
// i.e. one workgroup per 32x32 tile of the 512x256 output.
%c1 = arith.constant 1 : index
%c16 = arith.constant 16 : index
%c8 = arith.constant 8 : index
hal.return %c8, %c16, %c1 : index, index, index
}
builtin.module {
func.func @_split_reduction_pass2_dispatch_0_generic_512x256x128() {
%c4 = arith.constant 4 : index
%c1 = arith.constant 1 : index
%c32 = arith.constant 32 : index
%c512 = arith.constant 512 : index
%c256 = arith.constant 256 : index
%c0 = arith.constant 0 : index
// Binding 0 is the read-only 512x256x128xi32 input; binding 1 is the
// write-only 512x256xi32 output.
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:512x256x128xi32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:512x256xi32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
// Rows (dim 0, extent 512) are distributed over workgroup y: start at
// id_y * 32 and advance by count_y * 32.
%2 = affine.apply #map0()[%workgroup_id_y]
%3 = affine.apply #map0()[%workgroup_count_y]
scf.for %arg0 = %2 to %c512 step %3 {
// Columns (dim 1, extent 256) are distributed the same way over workgroup x.
%4 = affine.apply #map0()[%workgroup_id_x]
%5 = affine.apply #map0()[%workgroup_count_x]
scf.for %arg1 = %4 to %c256 step %5 {
// NOTE(review): the accumulator tile is loaded from %1, whose type is
// !flow.dispatch.tensor<writeonly:...>, and the generic below adds into
// slices of it (%arg7 is the outs element). Nothing in this function
// initializes that region first - confirm the buffer contents are defined
// before the dispatch.
%6 = flow.dispatch.tensor.load %1, offsets = [%arg0, %arg1], sizes = [32, 32], strides = [1, 1] : !flow.dispatch.tensor<writeonly:512x256xi32> -> tensor<32x32xi32>
// Load the 32x32x128 input slice for this tile.
%7 = flow.dispatch.tensor.load %0, offsets = [%arg0, %arg1, 0], sizes = [32, 32, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:512x256x128xi32> -> tensor<32x32x128xi32>
// Inner tiling: %arg2 walks the 32 rows one at a time, %arg4 walks the 32
// columns four at a time; the tile is carried as %arg3 / %arg5.
%8 = scf.for %arg2 = %c0 to %c32 step %c1 iter_args(%arg3 = %6) -> (tensor<32x32xi32>) {
%9 = scf.for %arg4 = %c0 to %c32 step %c4 iter_args(%arg5 = %arg3) -> (tensor<32x32xi32>) {
// Take the 1x4x128 input slice and the matching 1x4 accumulator slice.
%10 = tensor.extract_slice %7[%arg2, %arg4, 0] [1, 4, 128] [1, 1, 1] : tensor<32x32x128xi32> to tensor<1x4x128xi32>
%11 = tensor.extract_slice %arg5[%arg2, %arg4] [1, 4] [1, 1] : tensor<32x32xi32> to tensor<1x4xi32>
// Sum over the last dimension (size 128) into the 1x4 slice. The op carries
// an __internal_linalg_transform__ = "1" attribute in this snapshot.
%12 = linalg.generic {indexing_maps = [#map1, #map2], iterator_types = ["parallel", "parallel", "reduction"]} ins(%10 : tensor<1x4x128xi32>) outs(%11 : tensor<1x4xi32>) attrs = {__internal_linalg_transform__ = "1", lowering_config = #config} {
^bb0(%arg6: i32, %arg7: i32):
%14 = arith.addi %arg6, %arg7 : i32
linalg.yield %14 : i32
} -> tensor<1x4xi32>
// Put the reduced 1x4 slice back into the carried 32x32 tile.
%13 = tensor.insert_slice %12 into %arg5[%arg2, %arg4] [1, 4] [1, 1] : tensor<1x4xi32> into tensor<32x32xi32>
scf.yield %13 : tensor<32x32xi32>
}
scf.yield %9 : tensor<32x32xi32>
}
// Write the fully reduced 32x32 tile to the output binding at [%arg0, %arg1].
flow.dispatch.tensor.store %8, %1, offsets = [%arg0, %arg1], sizes = [32, 32], strides = [1, 1] : tensor<32x32xi32> -> !flow.dispatch.tensor<writeonly:512x256xi32>
}
}
return
}
}
}
}
// Host-side driver for the test: builds the input, runs the dispatch, and
// compares the dispatch output against a constant reference.
func.func private @_split_reduction_pass2() {
%c1048576 = arith.constant 1048576 : index
%c0 = arith.constant 0 : index
%c524288 = arith.constant 524288 : index
%c67108864 = arith.constant 67108864 : index
%c1_i32 = arith.constant 1 : i32
%c128_i32 = arith.constant 128 : i32
%c512 = arith.constant 512 : index
%c256 = arith.constant 256 : index
%c1 = arith.constant 1 : index
// Input buffer: 67108864 bytes = 512*256*128 i32 elements, all filled with 1.
%0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c67108864}
%1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c67108864}) {
stream.cmd.fill %c1_i32, %arg0[%c0 for %c67108864] : i32 -> !stream.resource<transient>{%c67108864}
} => !stream.timepoint
%2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c67108864}
// After do_not_optimize the size is re-queried (%4) rather than reusing
// %c67108864.
%3 = util.do_not_optimize(%2) : !stream.resource<transient>
%4 = stream.resource.size %3 : !stream.resource<transient>
// One 1048576-byte external buffer used as two 524288-byte halves
// (524288 = 512*256 i32 elements): [0, 524288) and [524288, 1048576).
%5 = stream.resource.alloc uninitialized : !stream.resource<external>{%c1048576}
%6 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%4}, %5 as %arg1: !stream.resource<external>{%c1048576}) {
stream.cmd.concurrent {
// Dispatch with workload [512, 256, 1]; the input is bound read-only and the
// external buffer write-only.
// NOTE(review): the wo binding range is [0 for 1048576], which also covers
// the region the concurrent fill below writes. The dispatch's output type is
// 512x256xi32, so presumably only the first half is touched - confirm.
stream.cmd.dispatch @_split_reduction_pass2_dispatch_0::@_split_reduction_pass2_dispatch_0_generic_512x256x128[%c512, %c256, %c1] {
ro %arg0[%c0 for %4] : !stream.resource<transient>{%4},
wo %arg1[%c0 for %c1048576] : !stream.resource<external>{%c1048576}
} attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>]}
// Reference values: fill the second half with i32 128.
stream.cmd.fill %c128_i32, %arg1[%c524288 for %c524288] : i32 -> !stream.resource<external>{%c1048576}
}
} => !stream.timepoint
%7 = stream.timepoint.await %6 => %5 : !stream.resource<external>{%c1048576}
// %8 views the first half (offset 0), %9 the second half (the 128 fill).
%8 = stream.resource.subview %7[%c0] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288}
%9 = stream.resource.subview %7[%c524288] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288}
%10 = stream.tensor.export %8 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
%11 = stream.tensor.export %9 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
// Both halves, viewed as 512x256xi32 tensors, must be equal.
check.expect_eq(%10, %11) : tensor<512x256xi32>
return
}
}
// -----// IR Dump After LinalgStrategyRemoveMarkersPass (iree-linalg-strategy-remove-markers-pass) ('func.func' operation: @_split_reduction_pass2_dispatch_0_generic_512x256x128) //----- //
#config = #iree_codegen.lowering_config<tile_sizes = [[32, 32, 0], [1, 4, 0], [0, 0, 4]]>
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#map0 = affine_map<()[s0] -> (s0 * 32)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map2 = affine_map<(d0, d1, d2) -> (d0, d1)>
#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
// Public entry point (tagged iree.abi.stub): takes no arguments, forwards to
// the private @_split_reduction_pass2 function, and returns nothing.
func.func @split_reduction_pass2() attributes {iree.abi.stub} {
call @_split_reduction_pass2() : () -> ()
return
}
// Executable holding the single dispatch entry point, with one variant that
// uses the embedded-elf-x86_64 target attribute alias.
hal.executable private @_split_reduction_pass2_dispatch_0 {
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
// Export region: returns the constants (8, 16, 1). 8 = 256/32 and 16 = 512/32,
// i.e. the number of 32-wide tiles the distributed loops below step over;
// presumably these are the x/y/z workgroup counts -- TODO confirm.
hal.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 ordinal(0) layout(#pipeline_layout) attributes {translation_info = #translation} {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index):
%c1 = arith.constant 1 : index
%c16 = arith.constant 16 : index
%c8 = arith.constant 8 : index
hal.return %c8, %c16, %c1 : index, index, index
}
builtin.module {
// Kernel body: sums a 512x256x128xi32 input along its last dimension into a
// 512x256xi32 result, one 32x32 output tile at a time.
func.func @_split_reduction_pass2_dispatch_0_generic_512x256x128() {
%c4 = arith.constant 4 : index
%c1 = arith.constant 1 : index
%c32 = arith.constant 32 : index
%c512 = arith.constant 512 : index
%c256 = arith.constant 256 : index
%c0 = arith.constant 0 : index
// Binding 0 is viewed as the read-only input, binding 1 as the write-only result.
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:512x256x128xi32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:512x256xi32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
// #map0 multiplies by 32: ids become starting offsets, counts become loop steps.
%2 = affine.apply #map0()[%workgroup_id_y]
%3 = affine.apply #map0()[%workgroup_count_y]
%4 = affine.apply #map0()[%workgroup_id_x]
%5 = affine.apply #map0()[%workgroup_count_x]
// Dimension 0 (extent 512) is stepped by workgroup y, dimension 1 (extent 256)
// by workgroup x.
scf.for %arg0 = %2 to %c512 step %3 {
scf.for %arg1 = %4 to %c256 step %5 {
// NOTE(review): the initial accumulator tile is loaded from binding 1, which is
// typed writeonly, and no op in this function initializes it before the
// additions below -- confirm the result buffer contents are defined here.
%6 = flow.dispatch.tensor.load %1, offsets = [%arg0, %arg1], sizes = [32, 32], strides = [1, 1] : !flow.dispatch.tensor<writeonly:512x256xi32> -> tensor<32x32xi32>
%7 = flow.dispatch.tensor.load %0, offsets = [%arg0, %arg1, 0], sizes = [32, 32, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:512x256x128xi32> -> tensor<32x32x128xi32>
// Walk the 32x32 tile in 1x4 pieces, threading the partially updated tile
// through the loops' iter_args.
%8 = scf.for %arg2 = %c0 to %c32 step %c1 iter_args(%arg3 = %6) -> (tensor<32x32xi32>) {
%9 = scf.for %arg4 = %c0 to %c32 step %c4 iter_args(%arg5 = %arg3) -> (tensor<32x32xi32>) {
%10 = tensor.extract_slice %7[%arg2, %arg4, 0] [1, 4, 128] [1, 1, 1] : tensor<32x32x128xi32> to tensor<1x4x128xi32>
%11 = tensor.extract_slice %arg5[%arg2, %arg4] [1, 4] [1, 1] : tensor<32x32xi32> to tensor<1x4xi32>
// d2 is the reduction iterator: each of the 1x4 outputs accumulates (addi) the
// 128 input elements along d2 on top of its incoming value from %11.
%12 = linalg.generic {indexing_maps = [#map1, #map2], iterator_types = ["parallel", "parallel", "reduction"]} ins(%10 : tensor<1x4x128xi32>) outs(%11 : tensor<1x4xi32>) attrs = {lowering_config = #config} {
^bb0(%arg6: i32, %arg7: i32):
%14 = arith.addi %arg6, %arg7 : i32
linalg.yield %14 : i32
} -> tensor<1x4xi32>
%13 = tensor.insert_slice %12 into %arg5[%arg2, %arg4] [1, 4] [1, 1] : tensor<1x4xi32> into tensor<32x32xi32>
scf.yield %13 : tensor<32x32xi32>
}
scf.yield %9 : tensor<32x32xi32>
}
// Write the finished 32x32 tile back to binding 1.
flow.dispatch.tensor.store %8, %1, offsets = [%arg0, %arg1], sizes = [32, 32], strides = [1, 1] : tensor<32x32xi32> -> !flow.dispatch.tensor<writeonly:512x256xi32>
}
}
return
}
}
}
}
// Host-side test body: fills an input resource with i32 1, runs the dispatch,
// and compares the dispatch result against a region filled with i32 128.
func.func private @_split_reduction_pass2() {
%c1048576 = arith.constant 1048576 : index
%c0 = arith.constant 0 : index
%c524288 = arith.constant 524288 : index
%c67108864 = arith.constant 67108864 : index
%c1_i32 = arith.constant 1 : i32
%c128_i32 = arith.constant 128 : i32
%c512 = arith.constant 512 : index
%c256 = arith.constant 256 : index
%c1 = arith.constant 1 : index
// 67108864 = 512*256*128*4, so sizes appear to be byte counts of i32 data --
// TODO confirm. The whole transient resource is filled with the value 1.
%0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c67108864}
%1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c67108864}) {
stream.cmd.fill %c1_i32, %arg0[%c0 for %c67108864] : i32 -> !stream.resource<transient>{%c67108864}
} => !stream.timepoint
%2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c67108864}
// The input passes through util.do_not_optimize (presumably to keep it opaque
// to folding), so its size is re-queried dynamically as %4.
%3 = util.do_not_optimize(%2) : !stream.resource<transient>
%4 = stream.resource.size %3 : !stream.resource<transient>
// One external resource of 1048576 = 2 * 524288: the subviews below read the
// first half as the dispatch result and the second half as the expected value.
%5 = stream.resource.alloc uninitialized : !stream.resource<external>{%c1048576}
%6 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%4}, %5 as %arg1: !stream.resource<external>{%c1048576}) {
// NOTE(review): the dispatch's wo binding range is [0 for 1048576], which
// overlaps the [524288 for 524288] range written by the fill in this same
// concurrent region -- confirm the overlap is intended/harmless.
stream.cmd.concurrent {
stream.cmd.dispatch @_split_reduction_pass2_dispatch_0::@_split_reduction_pass2_dispatch_0_generic_512x256x128[%c512, %c256, %c1] {
ro %arg0[%c0 for %4] : !stream.resource<transient>{%4},
wo %arg1[%c0 for %c1048576] : !stream.resource<external>{%c1048576}
} attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>]}
stream.cmd.fill %c128_i32, %arg1[%c524288 for %c524288] : i32 -> !stream.resource<external>{%c1048576}
}
} => !stream.timepoint
%7 = stream.timepoint.await %6 => %5 : !stream.resource<external>{%c1048576}
// Split the resource at offset 524288 and export both halves as 512x256xi32.
%8 = stream.resource.subview %7[%c0] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288}
%9 = stream.resource.subview %7[%c524288] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288}
%10 = stream.tensor.export %8 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
%11 = stream.tensor.export %9 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
check.expect_eq(%10, %11) : tensor<512x256xi32>
return
}
}
// -----// IR Dump After LinalgFuse (linalg-fuse) ('func.func' operation: @_split_reduction_pass2_dispatch_0_generic_512x256x128) //----- //
#config = #iree_codegen.lowering_config<tile_sizes = [[32, 32, 0], [1, 4, 0], [0, 0, 4]]>
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#map0 = affine_map<()[s0] -> (s0 * 32)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map2 = affine_map<(d0, d1, d2) -> (d0, d1)>
#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
// Exported stub (iree.abi.stub attribute) with no operands or results; its
// only action is calling the private @_split_reduction_pass2.
func.func @split_reduction_pass2() attributes {iree.abi.stub} {
call @_split_reduction_pass2() : () -> ()
return
}
// Device executable with a single embedded-elf-x86_64 variant and one exported
// entry point.
hal.executable private @_split_reduction_pass2_dispatch_0 {
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
// The export region yields the constant triple (8, 16, 1); 8 = 256/32 and
// 16 = 512/32 match the 32-wide tiling used in the kernel. Presumably this is
// the workgroup count per dimension -- TODO confirm.
hal.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 ordinal(0) layout(#pipeline_layout) attributes {translation_info = #translation} {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index):
%c1 = arith.constant 1 : index
%c16 = arith.constant 16 : index
%c8 = arith.constant 8 : index
hal.return %c8, %c16, %c1 : index, index, index
}
builtin.module {
// Reduces the last dimension of a 512x256x128xi32 tensor by addition,
// producing 512x256xi32, processed in 32x32 tiles of the output.
func.func @_split_reduction_pass2_dispatch_0_generic_512x256x128() {
%c4 = arith.constant 4 : index
%c1 = arith.constant 1 : index
%c32 = arith.constant 32 : index
%c512 = arith.constant 512 : index
%c256 = arith.constant 256 : index
%c0 = arith.constant 0 : index
// %0: read-only input view (binding 0); %1: write-only output view (binding 1).
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:512x256x128xi32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:512x256xi32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
// Lower bounds are id * 32 and steps are count * 32 (via #map0).
%2 = affine.apply #map0()[%workgroup_id_y]
%3 = affine.apply #map0()[%workgroup_count_y]
%4 = affine.apply #map0()[%workgroup_id_x]
%5 = affine.apply #map0()[%workgroup_count_x]
// y workgroups cover the 512-extent dimension, x workgroups the 256-extent one.
scf.for %arg0 = %2 to %c512 step %3 {
scf.for %arg1 = %4 to %c256 step %5 {
// NOTE(review): %6 seeds the accumulation but is read from the writeonly-typed
// binding, and this function never fills it first -- verify that the buffer
// holds defined values when the kernel runs.
%6 = flow.dispatch.tensor.load %1, offsets = [%arg0, %arg1], sizes = [32, 32], strides = [1, 1] : !flow.dispatch.tensor<writeonly:512x256xi32> -> tensor<32x32xi32>
%7 = flow.dispatch.tensor.load %0, offsets = [%arg0, %arg1, 0], sizes = [32, 32, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:512x256x128xi32> -> tensor<32x32x128xi32>
// Rows are visited one at a time and columns four at a time; the output tile
// is carried as a loop iter_arg and updated slice by slice.
%8 = scf.for %arg2 = %c0 to %c32 step %c1 iter_args(%arg3 = %6) -> (tensor<32x32xi32>) {
%9 = scf.for %arg4 = %c0 to %c32 step %c4 iter_args(%arg5 = %arg3) -> (tensor<32x32xi32>) {
%10 = tensor.extract_slice %7[%arg2, %arg4, 0] [1, 4, 128] [1, 1, 1] : tensor<32x32x128xi32> to tensor<1x4x128xi32>
%11 = tensor.extract_slice %arg5[%arg2, %arg4] [1, 4] [1, 1] : tensor<32x32xi32> to tensor<1x4xi32>
// Add-reduction over d2 (extent 128) of the 1x4x128 slice into the 1x4 slice;
// the op still carries the #config lowering_config attribute.
%12 = linalg.generic {indexing_maps = [#map1, #map2], iterator_types = ["parallel", "parallel", "reduction"]} ins(%10 : tensor<1x4x128xi32>) outs(%11 : tensor<1x4xi32>) attrs = {lowering_config = #config} {
^bb0(%arg6: i32, %arg7: i32):
%14 = arith.addi %arg6, %arg7 : i32
linalg.yield %14 : i32
} -> tensor<1x4xi32>
%13 = tensor.insert_slice %12 into %arg5[%arg2, %arg4] [1, 4] [1, 1] : tensor<1x4xi32> into tensor<32x32xi32>
scf.yield %13 : tensor<32x32xi32>
}
scf.yield %9 : tensor<32x32xi32>
}
// Store the completed tile to the output binding at the same offsets.
flow.dispatch.tensor.store %8, %1, offsets = [%arg0, %arg1], sizes = [32, 32], strides = [1, 1] : tensor<32x32xi32> -> !flow.dispatch.tensor<writeonly:512x256xi32>
}
}
return
}
}
}
}
// Private test driver: builds an all-ones input, dispatches the reduction
// kernel, and checks its output against a constant-128 region.
func.func private @_split_reduction_pass2() {
%c1048576 = arith.constant 1048576 : index
%c0 = arith.constant 0 : index
%c524288 = arith.constant 524288 : index
%c67108864 = arith.constant 67108864 : index
%c1_i32 = arith.constant 1 : i32
%c128_i32 = arith.constant 128 : i32
%c512 = arith.constant 512 : index
%c256 = arith.constant 256 : index
%c1 = arith.constant 1 : index
// Input resource: 67108864 (= 512*256*128*4, presumably bytes -- TODO confirm),
// filled entirely with i32 1.
%0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c67108864}
%1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c67108864}) {
stream.cmd.fill %c1_i32, %arg0[%c0 for %c67108864] : i32 -> !stream.resource<transient>{%c67108864}
} => !stream.timepoint
%2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c67108864}
// After util.do_not_optimize the resource size is obtained with
// stream.resource.size instead of the constant.
%3 = util.do_not_optimize(%2) : !stream.resource<transient>
%4 = stream.resource.size %3 : !stream.resource<transient>
// Output resource of 1048576 = 2 * 524288, later split into two equal halves.
%5 = stream.resource.alloc uninitialized : !stream.resource<external>{%c1048576}
%6 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%4}, %5 as %arg1: !stream.resource<external>{%c1048576}) {
// NOTE(review): both commands in this concurrent region name %arg1; the
// dispatch declares wo access to [0 for 1048576] while the fill writes
// [524288 for 524288] -- confirm these are meant to overlap.
stream.cmd.concurrent {
stream.cmd.dispatch @_split_reduction_pass2_dispatch_0::@_split_reduction_pass2_dispatch_0_generic_512x256x128[%c512, %c256, %c1] {
ro %arg0[%c0 for %4] : !stream.resource<transient>{%4},
wo %arg1[%c0 for %c1048576] : !stream.resource<external>{%c1048576}
} attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>]}
stream.cmd.fill %c128_i32, %arg1[%c524288 for %c524288] : i32 -> !stream.resource<external>{%c1048576}
}
} => !stream.timepoint
%7 = stream.timepoint.await %6 => %5 : !stream.resource<external>{%c1048576}
// %8 is the half starting at 0, %9 the half starting at 524288 (the one the
// fill wrote 128 into); both are exported as 512x256xi32 and compared.
%8 = stream.resource.subview %7[%c0] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288}
%9 = stream.resource.subview %7[%c524288] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288}
%10 = stream.tensor.export %8 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
%11 = stream.tensor.export %9 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
check.expect_eq(%10, %11) : tensor<512x256xi32>
return
}
}
// -----// IR Dump After LinalgSplitReduction (linalg-split-reduction) ('func.func' operation: @_split_reduction_pass2_dispatch_0_generic_512x256x128) //----- //
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#map0 = affine_map<()[s0] -> (s0 * 32)>
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
#map2 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>
#map3 = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map4 = affine_map<(d0, d1, d2) -> (d0, d1)>
#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
// Argument-less public function marked iree.abi.stub; it calls the private
// @_split_reduction_pass2 and returns.
func.func @split_reduction_pass2() attributes {iree.abi.stub} {
call @_split_reduction_pass2() : () -> ()
return
}
// Executable containing the reduction kernel; single embedded-elf-x86_64 variant.
hal.executable private @_split_reduction_pass2_dispatch_0 {
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
// Export region returning (8, 16, 1) -- 256/32 and 512/32 tiles; presumably
// the workgroup counts -- TODO confirm.
hal.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 ordinal(0) layout(#pipeline_layout) attributes {translation_info = #translation} {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index):
%c1 = arith.constant 1 : index
%c16 = arith.constant 16 : index
%c8 = arith.constant 8 : index
hal.return %c8, %c16, %c1 : index, index, index
}
builtin.module {
// Kernel body in which the 128-element reduction of each output element is
// split in two stages: 32 element-wise additions of 4-wide slices into a
// zero-filled 1x1x4 partial sum, then a reduction of those 4 partial sums.
func.func @_split_reduction_pass2_dispatch_0_generic_512x256x128() {
%c0_i32 = arith.constant 0 : i32
%c4 = arith.constant 4 : index
%c1 = arith.constant 1 : index
%c32 = arith.constant 32 : index
%c512 = arith.constant 512 : index
%c256 = arith.constant 256 : index
%c0 = arith.constant 0 : index
// Binding 0: read-only 512x256x128xi32 input; binding 1: write-only
// 512x256xi32 output.
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:512x256x128xi32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:512x256xi32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
// #map0 scales by 32, giving per-workgroup start offsets and strides.
%2 = affine.apply #map0()[%workgroup_id_y]
%3 = affine.apply #map0()[%workgroup_count_y]
%4 = affine.apply #map0()[%workgroup_id_x]
%5 = affine.apply #map0()[%workgroup_count_x]
scf.for %arg0 = %2 to %c512 step %3 {
scf.for %arg1 = %4 to %c256 step %5 {
// NOTE(review): the output tile is loaded from the writeonly-typed binding and
// its elements end up as the outs operand of the final reduction below; only
// the 1x1x4 partial sum is zero-filled here -- confirm the loaded values are
// defined.
%6 = flow.dispatch.tensor.load %1, offsets = [%arg0, %arg1], sizes = [32, 32], strides = [1, 1] : !flow.dispatch.tensor<writeonly:512x256xi32> -> tensor<32x32xi32>
%7 = flow.dispatch.tensor.load %0, offsets = [%arg0, %arg1, 0], sizes = [32, 32, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:512x256x128xi32> -> tensor<32x32x128xi32>
// Tile the 32x32 output into 1x4 pieces.
%8 = scf.for %arg2 = %c0 to %c32 step %c1 iter_args(%arg3 = %6) -> (tensor<32x32xi32>) {
%9 = scf.for %arg4 = %c0 to %c32 step %c4 iter_args(%arg5 = %arg3) -> (tensor<32x32xi32>) {
%10 = tensor.extract_slice %7[%arg2, %arg4, 0] [1, 4, 128] [1, 1, 1] : tensor<32x32x128xi32> to tensor<1x4x128xi32>
%11 = tensor.extract_slice %arg5[%arg2, %arg4] [1, 4] [1, 1] : tensor<32x32xi32> to tensor<1x4xi32>
// Further tile the 1x4 piece down to single output elements: the %arg6 loop
// runs once (0 to 1), the %arg8 loop four times (0 to 4).
%12 = scf.for %arg6 = %c0 to %c1 step %c1 iter_args(%arg7 = %11) -> (tensor<1x4xi32>) {
%14 = scf.for %arg8 = %c0 to %c4 step %c1 iter_args(%arg9 = %arg7) -> (tensor<1x4xi32>) {
%15 = tensor.extract_slice %10[%arg6, %arg8, 0] [1, 1, 128] [1, 1, 1] : tensor<1x4x128xi32> to tensor<1x1x128xi32>
%16 = tensor.extract_slice %arg9[%arg6, %arg8] [1, 1] [1, 1] : tensor<1x4xi32> to tensor<1x1xi32>
// Reshape the 128 reduction elements as 32x4 and create a zero-filled 1x1x4
// partial-sum tensor.
%17 = tensor.expand_shape %15 [[0], [1], [2, 3]] : tensor<1x1x128xi32> into tensor<1x1x32x4xi32>
%18 = linalg.init_tensor [1, 1, 4] : tensor<1x1x4xi32>
%19 = linalg.fill ins(%c0_i32 : i32) outs(%18 : tensor<1x1x4xi32>) -> tensor<1x1x4xi32>
// Stage 1: for each of the 32 rows, add the 1x1x1x4 slice into the partial sum
// (the reduction iterator d2 has extent 1 in each slice).
%20 = scf.for %arg10 = %c0 to %c32 step %c1 iter_args(%arg11 = %19) -> (tensor<1x1x4xi32>) {
%23 = tensor.extract_slice %17[0, 0, %arg10, 0] [1, 1, 1, 4] [1, 1, 1, 1] : tensor<1x1x32x4xi32> to tensor<1x1x1x4xi32>
%24 = linalg.generic {indexing_maps = [#map1, #map2], iterator_types = ["parallel", "parallel", "reduction", "parallel"]} ins(%23 : tensor<1x1x1x4xi32>) outs(%arg11 : tensor<1x1x4xi32>) {
^bb0(%arg12: i32, %arg13: i32):
%25 = arith.addi %arg12, %arg13 : i32
linalg.yield %25 : i32
} -> tensor<1x1x4xi32>
scf.yield %24 : tensor<1x1x4xi32>
}
// Stage 2: add the 4 partial sums into the 1x1 output element %16.
%21 = linalg.generic {indexing_maps = [#map3, #map4], iterator_types = ["parallel", "parallel", "reduction"]} ins(%20 : tensor<1x1x4xi32>) outs(%16 : tensor<1x1xi32>) {
^bb0(%arg10: i32, %arg11: i32):
%23 = arith.addi %arg10, %arg11 : i32
linalg.yield %23 : i32
} -> tensor<1x1xi32>
%22 = tensor.insert_slice %21 into %arg9[%arg6, %arg8] [1, 1] [1, 1] : tensor<1x1xi32> into tensor<1x4xi32>
scf.yield %22 : tensor<1x4xi32>
}
scf.yield %14 : tensor<1x4xi32>
}
%13 = tensor.insert_slice %12 into %arg5[%arg2, %arg4] [1, 4] [1, 1] : tensor<1x4xi32> into tensor<32x32xi32>
scf.yield %13 : tensor<32x32xi32>
}
scf.yield %9 : tensor<32x32xi32>
}
// Write the finished 32x32 tile back to binding 1.
flow.dispatch.tensor.store %8, %1, offsets = [%arg0, %arg1], sizes = [32, 32], strides = [1, 1] : tensor<32x32xi32> -> !flow.dispatch.tensor<writeonly:512x256xi32>
}
}
return
}
}
}
}
// Host-side driver: prepares an input filled with i32 1, launches the kernel
// with workload [512, 256, 1], and asserts equality with a region of i32 128.
func.func private @_split_reduction_pass2() {
%c1048576 = arith.constant 1048576 : index
%c0 = arith.constant 0 : index
%c524288 = arith.constant 524288 : index
%c67108864 = arith.constant 67108864 : index
%c1_i32 = arith.constant 1 : i32
%c128_i32 = arith.constant 128 : i32
%c512 = arith.constant 512 : index
%c256 = arith.constant 256 : index
%c1 = arith.constant 1 : index
// Transient input of size 67108864 (equals 512*256*128*4; presumably a byte
// count -- TODO confirm), filled with 1 over its full range.
%0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c67108864}
%1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c67108864}) {
stream.cmd.fill %c1_i32, %arg0[%c0 for %c67108864] : i32 -> !stream.resource<transient>{%c67108864}
} => !stream.timepoint
%2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c67108864}
// %3 is the input routed through util.do_not_optimize; %4 is its queried size.
%3 = util.do_not_optimize(%2) : !stream.resource<transient>
%4 = stream.resource.size %3 : !stream.resource<transient>
// External resource sized for two 524288 regions (result and expected value).
%5 = stream.resource.alloc uninitialized : !stream.resource<external>{%c1048576}
%6 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%4}, %5 as %arg1: !stream.resource<external>{%c1048576}) {
// NOTE(review): the dispatch is given wo access to the entire %arg1 range
// ([0 for 1048576]) although the fill in the same concurrent region writes the
// upper half -- confirm the declared ranges are intended to overlap.
stream.cmd.concurrent {
stream.cmd.dispatch @_split_reduction_pass2_dispatch_0::@_split_reduction_pass2_dispatch_0_generic_512x256x128[%c512, %c256, %c1] {
ro %arg0[%c0 for %4] : !stream.resource<transient>{%4},
wo %arg1[%c0 for %c1048576] : !stream.resource<external>{%c1048576}
} attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>]}
stream.cmd.fill %c128_i32, %arg1[%c524288 for %c524288] : i32 -> !stream.resource<external>{%c1048576}
}
} => !stream.timepoint
%7 = stream.timepoint.await %6 => %5 : !stream.resource<external>{%c1048576}
// Lower half (offset 0) vs. upper half (offset 524288), each exported as
// tensor<512x256xi32> and compared with check.expect_eq.
%8 = stream.resource.subview %7[%c0] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288}
%9 = stream.resource.subview %7[%c524288] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288}
%10 = stream.tensor.export %8 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
%11 = stream.tensor.export %9 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
check.expect_eq(%10, %11) : tensor<512x256xi32>
return
}
}
// -----// IR Dump After LinalgStrategyVectorizePass (iree-linalg-strategy-vectorize-pass) ('func.func' operation: @_split_reduction_pass2_dispatch_0_generic_512x256x128) //----- //
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#map = affine_map<()[s0] -> (s0 * 32)>
#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
// Public iree.abi.stub function: no inputs, no outputs, a single call to the
// private @_split_reduction_pass2.
func.func @split_reduction_pass2() attributes {iree.abi.stub} {
call @_split_reduction_pass2() : () -> ()
return
}
// Executable for the reduction kernel (one embedded-elf-x86_64 variant).
hal.executable private @_split_reduction_pass2_dispatch_0 {
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
// Export region returning the constants (8, 16, 1); presumably the workgroup
// counts for the 256/32 by 512/32 tiling -- TODO confirm.
hal.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 ordinal(0) layout(#pipeline_layout) attributes {translation_info = #translation} {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index):
%c1 = arith.constant 1 : index
%c16 = arith.constant 16 : index
%c8 = arith.constant 8 : index
hal.return %c8, %c16, %c1 : index, index, index
}
builtin.module {
// Kernel body expressed with vector ops: the two-stage reduction now uses
// vector.transfer_read/transfer_write and vector.multi_reduction <add>.
func.func @_split_reduction_pass2_dispatch_0_generic_512x256x128() {
// All-zero 1x1x4 vector used to initialize the partial-sum tensor.
%cst = arith.constant dense<0> : vector<1x1x4xi32>
%c0_i32 = arith.constant 0 : i32
%c4 = arith.constant 4 : index
%c1 = arith.constant 1 : index
%c32 = arith.constant 32 : index
%c512 = arith.constant 512 : index
%c256 = arith.constant 256 : index
%c0 = arith.constant 0 : index
// Binding 0: read-only 512x256x128xi32 input; binding 1: write-only
// 512x256xi32 output.
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:512x256x128xi32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:512x256xi32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
// #map scales by 32, giving per-workgroup start offsets and strides.
%2 = affine.apply #map()[%workgroup_id_y]
%3 = affine.apply #map()[%workgroup_count_y]
%4 = affine.apply #map()[%workgroup_id_x]
%5 = affine.apply #map()[%workgroup_count_x]
scf.for %arg0 = %2 to %c512 step %3 {
scf.for %arg1 = %4 to %c256 step %5 {
// NOTE(review): the output tile is loaded from the writeonly-typed binding and
// its elements are later read (%21) as the accumulator of the final reduction;
// nothing in this function initializes them -- confirm they are defined.
%6 = flow.dispatch.tensor.load %1, offsets = [%arg0, %arg1], sizes = [32, 32], strides = [1, 1] : !flow.dispatch.tensor<writeonly:512x256xi32> -> tensor<32x32xi32>
%7 = flow.dispatch.tensor.load %0, offsets = [%arg0, %arg1, 0], sizes = [32, 32, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:512x256x128xi32> -> tensor<32x32x128xi32>
// Tile the 32x32 output into 1x4 pieces, then into single elements (%arg6 loop
// runs once, %arg8 loop four times).
%8 = scf.for %arg2 = %c0 to %c32 step %c1 iter_args(%arg3 = %6) -> (tensor<32x32xi32>) {
%9 = scf.for %arg4 = %c0 to %c32 step %c4 iter_args(%arg5 = %arg3) -> (tensor<32x32xi32>) {
%10 = tensor.extract_slice %7[%arg2, %arg4, 0] [1, 4, 128] [1, 1, 1] : tensor<32x32x128xi32> to tensor<1x4x128xi32>
%11 = tensor.extract_slice %arg5[%arg2, %arg4] [1, 4] [1, 1] : tensor<32x32xi32> to tensor<1x4xi32>
%12 = scf.for %arg6 = %c0 to %c1 step %c1 iter_args(%arg7 = %11) -> (tensor<1x4xi32>) {
%14 = scf.for %arg8 = %c0 to %c4 step %c1 iter_args(%arg9 = %arg7) -> (tensor<1x4xi32>) {
%15 = tensor.extract_slice %10[%arg6, %arg8, 0] [1, 1, 128] [1, 1, 1] : tensor<1x4x128xi32> to tensor<1x1x128xi32>
// View the 128 reduction elements as 32x4 and zero the 1x1x4 partial sum by
// writing %cst into a fresh init_tensor.
%16 = tensor.expand_shape %15 [[0], [1], [2, 3]] : tensor<1x1x128xi32> into tensor<1x1x32x4xi32>
%17 = linalg.init_tensor [1, 1, 4] : tensor<1x1x4xi32>
%18 = vector.transfer_write %cst, %17[%c0, %c0, %c0] {in_bounds = [true, true, true]} : vector<1x1x4xi32>, tensor<1x1x4xi32>
// Stage 1: for each of the 32 rows, read a 1x1x1x4 vector and the current
// partial sum, add-reduce over dimension 2 with the partial sum as
// accumulator, and write the result back.
%19 = scf.for %arg10 = %c0 to %c32 step %c1 iter_args(%arg11 = %18) -> (tensor<1x1x4xi32>) {
%24 = vector.transfer_read %16[%c0, %c0, %arg10, %c0], %c0_i32 {in_bounds = [true, true, true, true]} : tensor<1x1x32x4xi32>, vector<1x1x1x4xi32>
%25 = vector.transfer_read %arg11[%c0, %c0, %c0], %c0_i32 {in_bounds = [true, true, true]} : tensor<1x1x4xi32>, vector<1x1x4xi32>
%26 = vector.multi_reduction <add>, %24, %25 [2] : vector<1x1x1x4xi32> to vector<1x1x4xi32>
%27 = vector.transfer_write %26, %arg11[%c0, %c0, %c0] {in_bounds = [true, true, true]} : vector<1x1x4xi32>, tensor<1x1x4xi32>
scf.yield %27 : tensor<1x1x4xi32>
}
// Stage 2: add-reduce the 4 partial sums over dimension 2, accumulating onto
// the existing output element read from %arg9, and write it back in place.
%20 = vector.transfer_read %19[%c0, %c0, %c0], %c0_i32 {in_bounds = [true, true, true]} : tensor<1x1x4xi32>, vector<1x1x4xi32>
%21 = vector.transfer_read %arg9[%arg6, %arg8], %c0_i32 {in_bounds = [true, true]} : tensor<1x4xi32>, vector<1x1xi32>
%22 = vector.multi_reduction <add>, %20, %21 [2] : vector<1x1x4xi32> to vector<1x1xi32>
%23 = vector.transfer_write %22, %arg9[%arg6, %arg8] {in_bounds = [true, true]} : vector<1x1xi32>, tensor<1x4xi32>
scf.yield %23 : tensor<1x4xi32>
}
scf.yield %14 : tensor<1x4xi32>
}
%13 = tensor.insert_slice %12 into %arg5[%arg2, %arg4] [1, 4] [1, 1] : tensor<1x4xi32> into tensor<32x32xi32>
scf.yield %13 : tensor<32x32xi32>
}
scf.yield %9 : tensor<32x32xi32>
}
// Write the finished 32x32 tile back to binding 1.
flow.dispatch.tensor.store %8, %1, offsets = [%arg0, %arg1], sizes = [32, 32], strides = [1, 1] : tensor<32x32xi32> -> !flow.dispatch.tensor<writeonly:512x256xi32>
}
}
return
}
}
}
}
// Test driver on the host side: input of all ones -> dispatch -> compare the
// dispatch output with a region filled with 128.
func.func private @_split_reduction_pass2() {
%c1048576 = arith.constant 1048576 : index
%c0 = arith.constant 0 : index
%c524288 = arith.constant 524288 : index
%c67108864 = arith.constant 67108864 : index
%c1_i32 = arith.constant 1 : i32
%c128_i32 = arith.constant 128 : i32
%c512 = arith.constant 512 : index
%c256 = arith.constant 256 : index
%c1 = arith.constant 1 : index
// Allocate the input (size 67108864 = 512*256*128*4; presumably bytes --
// TODO confirm) and fill every i32 with 1.
%0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c67108864}
%1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c67108864}) {
stream.cmd.fill %c1_i32, %arg0[%c0 for %c67108864] : i32 -> !stream.resource<transient>{%c67108864}
} => !stream.timepoint
%2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c67108864}
// The awaited input goes through util.do_not_optimize; its size is then read
// back with stream.resource.size.
%3 = util.do_not_optimize(%2) : !stream.resource<transient>
%4 = stream.resource.size %3 : !stream.resource<transient>
// Single external allocation of 1048576 holding two 524288 regions.
%5 = stream.resource.alloc uninitialized : !stream.resource<external>{%c1048576}
%6 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%4}, %5 as %arg1: !stream.resource<external>{%c1048576}) {
// NOTE(review): within this concurrent region the dispatch's wo range
// [0 for 1048576] includes the [524288 for 524288] range the fill writes --
// confirm this overlap is expected.
stream.cmd.concurrent {
stream.cmd.dispatch @_split_reduction_pass2_dispatch_0::@_split_reduction_pass2_dispatch_0_generic_512x256x128[%c512, %c256, %c1] {
ro %arg0[%c0 for %4] : !stream.resource<transient>{%4},
wo %arg1[%c0 for %c1048576] : !stream.resource<external>{%c1048576}
} attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>]}
stream.cmd.fill %c128_i32, %arg1[%c524288 for %c524288] : i32 -> !stream.resource<external>{%c1048576}
}
} => !stream.timepoint
%7 = stream.timepoint.await %6 => %5 : !stream.resource<external>{%c1048576}
// Take the two 524288 subviews (offsets 0 and 524288), export each as
// tensor<512x256xi32>, and require them to be equal.
%8 = stream.resource.subview %7[%c0] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288}
%9 = stream.resource.subview %7[%c524288] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288}
%10 = stream.tensor.export %8 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
%11 = stream.tensor.export %9 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
check.expect_eq(%10, %11) : tensor<512x256xi32>
return
}
}
// -----// IR Dump After LinalgStrategyEnablePass (iree-linalg-strategy-enable-pass) ('func.func' operation: @_split_reduction_pass2_dispatch_0_generic_512x256x128) //----- //
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#map = affine_map<()[s0] -> (s0 * 32)>
#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
// Public entry point tagged with `iree.abi.stub`. It takes no arguments, returns nothing,
// and only forwards to the private function @_split_reduction_pass2.
func.func @split_reduction_pass2() attributes {iree.abi.stub} {
call @_split_reduction_pass2() : () -> ()
return
}
// Executable holding the single dispatch of this test: a sum over the innermost dimension of a
// 512x256x128xi32 input into a 512x256xi32 output. One variant targets embedded-elf-x86_64.
hal.executable private @_split_reduction_pass2_dispatch_0 {
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
// Export region: ignores its index arguments and returns the constant triple (8, 16, 1).
// With the 32-wide tiles used in the body this matches 8*32 = 256 and 16*32 = 512.
hal.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 ordinal(0) layout(#pipeline_layout) attributes {translation_info = #translation} {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index):
%c1 = arith.constant 1 : index
%c16 = arith.constant 16 : index
%c8 = arith.constant 8 : index
hal.return %c8, %c16, %c1 : index, index, index
}
builtin.module {
// Kernel body, still in tensor form with the innermost reduction already vectorized.
func.func @_split_reduction_pass2_dispatch_0_generic_512x256x128() {
// Zero vector that seeds the 4-lane partial sums of the innermost loop.
%cst = arith.constant dense<0> : vector<1x1x4xi32>
// Padding value operand of the vector.transfer_read ops below.
%c0_i32 = arith.constant 0 : i32
%c4 = arith.constant 4 : index
%c1 = arith.constant 1 : index
%c32 = arith.constant 32 : index
%c512 = arith.constant 512 : index
%c256 = arith.constant 256 : index
%c0 = arith.constant 0 : index
// Binding 0: read-only 512x256x128xi32 input. Binding 1: 512x256xi32 output, typed writeonly.
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:512x256x128xi32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:512x256xi32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
// #map multiplies by 32: ids become the first tile offset, counts become the loop step.
%2 = affine.apply #map()[%workgroup_id_y]
%3 = affine.apply #map()[%workgroup_count_y]
%4 = affine.apply #map()[%workgroup_id_x]
%5 = affine.apply #map()[%workgroup_count_x]
// Distributed tile loops: %arg0 walks the 512 dimension (y), %arg1 the 256 dimension (x).
scf.for %arg0 = %2 to %c512 step %3 {
scf.for %arg1 = %4 to %c256 step %5 {
// The 32x32 accumulator tile is loaded from the binding typed `writeonly`; its elements are
// read again below (%17) and added into the result.
// NOTE(review): the host function in this module allocates this buffer `uninitialized` and no
// fill of the dispatch's output range is visible here -- confirm the accumulator is meant to
// start from existing buffer contents rather than zero.
%6 = flow.dispatch.tensor.load %1, offsets = [%arg0, %arg1], sizes = [32, 32], strides = [1, 1] : !flow.dispatch.tensor<writeonly:512x256xi32> -> tensor<32x32xi32>
// Matching 32x32x128 input tile (full extent of the reduced dimension).
%7 = flow.dispatch.tensor.load %0, offsets = [%arg0, %arg1, 0], sizes = [32, 32, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:512x256x128xi32> -> tensor<32x32x128xi32>
// %arg2: one row of the tile at a time; %arg4: four columns at a time.
%8 = scf.for %arg2 = %c0 to %c32 step %c1 iter_args(%arg3 = %6) -> (tensor<32x32xi32>) {
%9 = scf.for %arg4 = %c0 to %c32 step %c4 iter_args(%arg5 = %arg3) -> (tensor<32x32xi32>) {
%10 = tensor.extract_slice %7[%arg2, %arg4, 0] [1, 4, 128] [1, 1, 1] : tensor<32x32x128xi32> to tensor<1x4x128xi32>
%11 = tensor.extract_slice %arg5[%arg2, %arg4] [1, 4] [1, 1] : tensor<32x32xi32> to tensor<1x4xi32>
// %arg6: one output element of the 1x4 slice per iteration.
%12 = scf.for %arg6 = %c0 to %c4 step %c1 iter_args(%arg7 = %11) -> (tensor<1x4xi32>) {
%14 = tensor.extract_slice %10[0, %arg6, 0] [1, 1, 128] [1, 1, 1] : tensor<1x4x128xi32> to tensor<1x1x128xi32>
// The 128-long reduction dimension is reshaped to 32x4 so it can be consumed 4 lanes at a time.
%15 = tensor.expand_shape %14 [[0], [1], [2, 3]] : tensor<1x1x128xi32> into tensor<1x1x32x4xi32>
// First reduction stage: add each of the 32 four-element rows into a 4-lane running sum (%arg9).
%16 = scf.for %arg8 = %c0 to %c32 step %c1 iter_args(%arg9 = %cst) -> (vector<1x1x4xi32>) {
%20 = vector.transfer_read %15[%c0, %c0, %arg8, %c0], %c0_i32 {in_bounds = [true, true, true, true]} : tensor<1x1x32x4xi32>, vector<1x1x1x4xi32>
%21 = vector.multi_reduction <add>, %20, %arg9 [2] : vector<1x1x1x4xi32> to vector<1x1x4xi32>
scf.yield %21 : vector<1x1x4xi32>
}
// Second stage: read the current output element, add the 4 lanes of %16 to it, and write it back.
%17 = vector.transfer_read %arg7[%c0, %arg6], %c0_i32 {in_bounds = [true, true]} : tensor<1x4xi32>, vector<1x1xi32>
%18 = vector.multi_reduction <add>, %16, %17 [2] : vector<1x1x4xi32> to vector<1x1xi32>
%19 = vector.transfer_write %18, %arg7[%c0, %arg6] {in_bounds = [true, true]} : vector<1x1xi32>, tensor<1x4xi32>
scf.yield %19 : tensor<1x4xi32>
}
// Put the updated 1x4 slice back into the 32x32 accumulator tile.
%13 = tensor.insert_slice %12 into %arg5[%arg2, %arg4] [1, 4] [1, 1] : tensor<1x4xi32> into tensor<32x32xi32>
scf.yield %13 : tensor<32x32xi32>
}
scf.yield %9 : tensor<32x32xi32>
}
// Store the finished tile to the output binding at the same offsets it was loaded from.
flow.dispatch.tensor.store %8, %1, offsets = [%arg0, %arg1], sizes = [32, 32], strides = [1, 1] : tensor<32x32xi32> -> !flow.dispatch.tensor<writeonly:512x256xi32>
}
}
return
}
}
}
}
// Host-side test body: builds an all-ones input, runs the reduction dispatch, and compares the
// first half of the result buffer against a second half filled with the constant 128.
func.func private @_split_reduction_pass2() {
%c1048576 = arith.constant 1048576 : index
%c0 = arith.constant 0 : index
%c524288 = arith.constant 524288 : index
// 67108864 = 512 * 256 * 128 * 4, i.e. the input tensor at 4 bytes per i32 (sizes presumably in bytes).
%c67108864 = arith.constant 67108864 : index
%c1_i32 = arith.constant 1 : i32
%c128_i32 = arith.constant 128 : i32
%c512 = arith.constant 512 : index
%c256 = arith.constant 256 : index
%c1 = arith.constant 1 : index
// Input buffer: allocated uninitialized, then filled entirely with i32 1.
%0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c67108864}
%1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c67108864}) {
stream.cmd.fill %c1_i32, %arg0[%c0 for %c67108864] : i32 -> !stream.resource<transient>{%c67108864}
} => !stream.timepoint
%2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c67108864}
// The input goes through util.do_not_optimize, after which its size is re-queried as %4.
%3 = util.do_not_optimize(%2) : !stream.resource<transient>
%4 = stream.resource.size %3 : !stream.resource<transient>
// Single 1048576-sized output allocation shared by the dispatch result and the expected values.
%5 = stream.resource.alloc uninitialized : !stream.resource<external>{%c1048576}
%6 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%4}, %5 as %arg1: !stream.resource<external>{%c1048576}) {
// The dispatch (workload 512, 256, 1) and the fill are grouped in one concurrent region.
// The dispatch binds the whole of %arg1 for writing; the fill writes 128 to [524288, 1048576).
// NOTE(review): nothing visible here initializes [0, 524288) before the dispatch runs.
stream.cmd.concurrent {
stream.cmd.dispatch @_split_reduction_pass2_dispatch_0::@_split_reduction_pass2_dispatch_0_generic_512x256x128[%c512, %c256, %c1] {
ro %arg0[%c0 for %4] : !stream.resource<transient>{%4},
wo %arg1[%c0 for %c1048576] : !stream.resource<external>{%c1048576}
} attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>]}
stream.cmd.fill %c128_i32, %arg1[%c524288 for %c524288] : i32 -> !stream.resource<external>{%c1048576}
}
} => !stream.timepoint
%7 = stream.timepoint.await %6 => %5 : !stream.resource<external>{%c1048576}
// %8 views the range starting at 0, %9 the range starting at 524288 (the part filled with 128).
%8 = stream.resource.subview %7[%c0] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288}
%9 = stream.resource.subview %7[%c524288] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288}
// Both halves are exported as 512x256xi32 tensors and checked for equality.
%10 = stream.tensor.export %8 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
%11 = stream.tensor.export %9 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
check.expect_eq(%10, %11) : tensor<512x256xi32>
return
}
}
// -----// IR Dump After LinalgSingleTilingExpert (linalg-single-tiling-expert-driver) ('func.func' operation: @_split_reduction_pass2_dispatch_0_generic_512x256x128) //----- //
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#map = affine_map<()[s0] -> (s0 * 32)>
#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
// Exported wrapper (attribute `iree.abi.stub`): no operands, no results; its only work is the
// call into the private @_split_reduction_pass2.
func.func @split_reduction_pass2() attributes {iree.abi.stub} {
call @_split_reduction_pass2() : () -> ()
return
}
// Dispatch executable for the 512x256x128 -> 512x256 i32 sum-reduction, with a single
// embedded-elf-x86_64 variant.
hal.executable private @_split_reduction_pass2_dispatch_0 {
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
// The export's region yields the fixed values (8, 16, 1); its block arguments are unused.
hal.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 ordinal(0) layout(#pipeline_layout) attributes {translation_info = #translation} {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index):
%c1 = arith.constant 1 : index
%c16 = arith.constant 16 : index
%c8 = arith.constant 8 : index
hal.return %c8, %c16, %c1 : index, index, index
}
builtin.module {
// Tiled and vectorized kernel operating on tensors (not yet bufferized).
func.func @_split_reduction_pass2_dispatch_0_generic_512x256x128() {
// Initial value (all zeros) of the 4-lane partial-sum vector.
%cst = arith.constant dense<0> : vector<1x1x4xi32>
// Passed as the padding operand to every vector.transfer_read below.
%c0_i32 = arith.constant 0 : i32
%c4 = arith.constant 4 : index
%c1 = arith.constant 1 : index
%c32 = arith.constant 32 : index
%c512 = arith.constant 512 : index
%c256 = arith.constant 256 : index
%c0 = arith.constant 0 : index
// %0 is the read-only input binding, %1 the output binding (typed writeonly).
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:512x256x128xi32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:512x256xi32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
// Scale workgroup ids/counts by 32 (#map) to get per-workgroup loop lower bounds and steps.
%2 = affine.apply #map()[%workgroup_id_y]
%3 = affine.apply #map()[%workgroup_count_y]
%4 = affine.apply #map()[%workgroup_id_x]
%5 = affine.apply #map()[%workgroup_count_x]
// Outer loops visit 32x32 output tiles: y over the 512 extent, x over the 256 extent.
scf.for %arg0 = %2 to %c512 step %3 {
scf.for %arg1 = %4 to %c256 step %5 {
// Accumulator tile comes from the output binding itself, even though that binding is typed
// `writeonly`; the loaded values feed the additions further down.
// NOTE(review): the host side allocates this buffer `uninitialized` and shows no fill for the
// dispatch's output range -- verify that reading it here is intended.
%6 = flow.dispatch.tensor.load %1, offsets = [%arg0, %arg1], sizes = [32, 32], strides = [1, 1] : !flow.dispatch.tensor<writeonly:512x256xi32> -> tensor<32x32xi32>
// Input tile covering the same 32x32 positions and all 128 reduction elements.
%7 = flow.dispatch.tensor.load %0, offsets = [%arg0, %arg1, 0], sizes = [32, 32, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:512x256x128xi32> -> tensor<32x32x128xi32>
// Second tiling level: rows one by one (%arg2), columns in groups of four (%arg4).
%8 = scf.for %arg2 = %c0 to %c32 step %c1 iter_args(%arg3 = %6) -> (tensor<32x32xi32>) {
%9 = scf.for %arg4 = %c0 to %c32 step %c4 iter_args(%arg5 = %arg3) -> (tensor<32x32xi32>) {
%10 = tensor.extract_slice %7[%arg2, %arg4, 0] [1, 4, 128] [1, 1, 1] : tensor<32x32x128xi32> to tensor<1x4x128xi32>
%11 = tensor.extract_slice %arg5[%arg2, %arg4] [1, 4] [1, 1] : tensor<32x32xi32> to tensor<1x4xi32>
// Handles the four output elements of the slice individually.
%12 = scf.for %arg6 = %c0 to %c4 step %c1 iter_args(%arg7 = %11) -> (tensor<1x4xi32>) {
%14 = tensor.extract_slice %10[0, %arg6, 0] [1, 1, 128] [1, 1, 1] : tensor<1x4x128xi32> to tensor<1x1x128xi32>
// View the 128 reduction elements as 32 rows of 4.
%15 = tensor.expand_shape %14 [[0], [1], [2, 3]] : tensor<1x1x128xi32> into tensor<1x1x32x4xi32>
// Accumulate the 32 rows lane-wise into a vector<1x1x4xi32> carried in %arg9.
%16 = scf.for %arg8 = %c0 to %c32 step %c1 iter_args(%arg9 = %cst) -> (vector<1x1x4xi32>) {
%20 = vector.transfer_read %15[%c0, %c0, %arg8, %c0], %c0_i32 {in_bounds = [true, true, true, true]} : tensor<1x1x32x4xi32>, vector<1x1x1x4xi32>
%21 = vector.multi_reduction <add>, %20, %arg9 [2] : vector<1x1x1x4xi32> to vector<1x1x4xi32>
scf.yield %21 : vector<1x1x4xi32>
}
// Collapse the 4 lanes into one value, using the existing output element (%17) as the accumulator.
%17 = vector.transfer_read %arg7[%c0, %arg6], %c0_i32 {in_bounds = [true, true]} : tensor<1x4xi32>, vector<1x1xi32>
%18 = vector.multi_reduction <add>, %16, %17 [2] : vector<1x1x4xi32> to vector<1x1xi32>
%19 = vector.transfer_write %18, %arg7[%c0, %arg6] {in_bounds = [true, true]} : vector<1x1xi32>, tensor<1x4xi32>
scf.yield %19 : tensor<1x4xi32>
}
// Re-insert the updated slice at the position it was extracted from.
%13 = tensor.insert_slice %12 into %arg5[%arg2, %arg4] [1, 4] [1, 1] : tensor<1x4xi32> into tensor<32x32xi32>
scf.yield %13 : tensor<32x32xi32>
}
scf.yield %9 : tensor<32x32xi32>
}
// Write the completed 32x32 tile back to the output binding.
flow.dispatch.tensor.store %8, %1, offsets = [%arg0, %arg1], sizes = [32, 32], strides = [1, 1] : tensor<32x32xi32> -> !flow.dispatch.tensor<writeonly:512x256xi32>
}
}
return
}
}
}
}
// Host program of the test: fill the input with ones, dispatch the reduction, then check that
// the two halves of the output allocation are equal (the second half is filled with 128).
func.func private @_split_reduction_pass2() {
%c1048576 = arith.constant 1048576 : index
%c0 = arith.constant 0 : index
%c524288 = arith.constant 524288 : index
// Equals 512 * 256 * 128 elements times 4 (presumably the byte size of the i32 input).
%c67108864 = arith.constant 67108864 : index
%c1_i32 = arith.constant 1 : i32
%c128_i32 = arith.constant 128 : i32
%c512 = arith.constant 512 : index
%c256 = arith.constant 256 : index
%c1 = arith.constant 1 : index
// Allocate the input without initialization and overwrite all of it with the value 1.
%0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c67108864}
%1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c67108864}) {
stream.cmd.fill %c1_i32, %arg0[%c0 for %c67108864] : i32 -> !stream.resource<transient>{%c67108864}
} => !stream.timepoint
%2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c67108864}
// After util.do_not_optimize the size is obtained via stream.resource.size instead of the constant.
%3 = util.do_not_optimize(%2) : !stream.resource<transient>
%4 = stream.resource.size %3 : !stream.resource<transient>
// One allocation of size 1048576 holds both the computed result and the reference data.
%5 = stream.resource.alloc uninitialized : !stream.resource<external>{%c1048576}
%6 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%4}, %5 as %arg1: !stream.resource<external>{%c1048576}) {
// Concurrent region with two commands: the dispatch, bound to all of %arg1 as write-only, and a
// fill of 128 over the range starting at 524288 with length 524288.
// NOTE(review): the range starting at 0 is never filled in this function before the dispatch.
stream.cmd.concurrent {
stream.cmd.dispatch @_split_reduction_pass2_dispatch_0::@_split_reduction_pass2_dispatch_0_generic_512x256x128[%c512, %c256, %c1] {
ro %arg0[%c0 for %4] : !stream.resource<transient>{%4},
wo %arg1[%c0 for %c1048576] : !stream.resource<external>{%c1048576}
} attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>]}
stream.cmd.fill %c128_i32, %arg1[%c524288 for %c524288] : i32 -> !stream.resource<external>{%c1048576}
}
} => !stream.timepoint
%7 = stream.timepoint.await %6 => %5 : !stream.resource<external>{%c1048576}
// Split the allocation at offset 524288 into two equally sized subviews.
%8 = stream.resource.subview %7[%c0] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288}
%9 = stream.resource.subview %7[%c524288] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288}
// Export each subview as tensor<512x256xi32> and assert they match.
%10 = stream.tensor.export %8 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
%11 = stream.tensor.export %9 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
check.expect_eq(%10, %11) : tensor<512x256xi32>
return
}
}
// -----// IR Dump After IREEComprehensiveBufferize (iree-codegen-iree-comprehensive-bufferize) ('builtin.module' operation) //----- //
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#map0 = affine_map<()[s0] -> (s0 * 32)>
#map1 = affine_map<(d0, d1) -> (d0, d1)>
#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
// ABI stub (`iree.abi.stub`) exposed publicly; it calls the private implementation
// @_split_reduction_pass2 and returns.
func.func @split_reduction_pass2() attributes {iree.abi.stub} {
call @_split_reduction_pass2() : () -> ()
return
}
// Dispatch executable right after bufferization: the kernel now works on memrefs, with
// bufferization.to_tensor ops and tensor-typed subspans still present alongside them.
hal.executable private @_split_reduction_pass2_dispatch_0 {
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
// Export region returning the constant triple (8, 16, 1).
hal.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 ordinal(0) layout(#pipeline_layout) attributes {translation_info = #translation} {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index):
%c1 = arith.constant 1 : index
%c16 = arith.constant 16 : index
%c8 = arith.constant 8 : index
hal.return %c8, %c16, %c1 : index, index, index
}
builtin.module {
func.func @_split_reduction_pass2_dispatch_0_generic_512x256x128() {
// Zero seed for the 4-lane partial sums; %c0_i32 is the transfer_read padding value.
%cst = arith.constant dense<0> : vector<1x1x4xi32>
%c0_i32 = arith.constant 0 : i32
%c4 = arith.constant 4 : index
%c1 = arith.constant 1 : index
%c32 = arith.constant 32 : index
%c512 = arith.constant 512 : index
%c256 = arith.constant 256 : index
%c0 = arith.constant 0 : index
// Each binding appears twice: as a memref (%0, %2), which the loops below use, and as the
// original !flow.dispatch.tensor value (%1, %3), which has no uses in this function.
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : memref<512x256x128xi32>
memref.assume_alignment %0, 64 : memref<512x256x128xi32>
%1 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:512x256x128xi32>
%2 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : memref<512x256xi32>
memref.assume_alignment %2, 64 : memref<512x256xi32>
%3 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:512x256xi32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
// #map0 multiplies by 32, turning workgroup ids/counts into tile offsets/steps.
%4 = affine.apply #map0()[%workgroup_id_y]
%5 = affine.apply #map0()[%workgroup_count_y]
%6 = affine.apply #map0()[%workgroup_id_x]
%7 = affine.apply #map0()[%workgroup_count_x]
scf.for %arg0 = %4 to %c512 step %5 {
scf.for %arg1 = %6 to %c256 step %7 {
// 32x32 output tile as a strided view into binding 1, and the matching 32x32x128 input view.
// The bufferization.to_tensor results in this function (%9, %11, ...) have no visible uses.
// NOTE(review): elements of the output view are read (%33) before being written; the host
// function shows no initialization of that range -- confirm this is intended.
%8 = memref.subview %2[%arg0, %arg1] [32, 32] [1, 1] : memref<512x256xi32> to memref<32x32xi32, strided<[256, 1], offset: ?>>
%9 = bufferization.to_tensor %8 : memref<32x32xi32, strided<[256, 1], offset: ?>>
%10 = memref.subview %0[%arg0, %arg1, 0] [32, 32, 128] [1, 1, 1] : memref<512x256x128xi32> to memref<32x32x128xi32, strided<[32768, 128, 1], offset: ?>>
%11 = bufferization.to_tensor %10 : memref<32x32x128xi32, strided<[32768, 128, 1], offset: ?>>
// The loop-carried values are now memrefs; every level yields its own iter_arg unchanged.
%12 = scf.for %arg2 = %c0 to %c32 step %c1 iter_args(%arg3 = %8) -> (memref<32x32xi32, strided<[256, 1], offset: ?>>) {
%15 = bufferization.to_tensor %arg3 : memref<32x32xi32, strided<[256, 1], offset: ?>>
%16 = scf.for %arg4 = %c0 to %c32 step %c4 iter_args(%arg5 = %arg3) -> (memref<32x32xi32, strided<[256, 1], offset: ?>>) {
%18 = bufferization.to_tensor %arg5 : memref<32x32xi32, strided<[256, 1], offset: ?>>
%19 = memref.subview %10[%arg2, %arg4, 0] [1, 4, 128] [1, 1, 1] : memref<32x32x128xi32, strided<[32768, 128, 1], offset: ?>> to memref<1x4x128xi32, strided<[32768, 128, 1], offset: ?>>
%20 = bufferization.to_tensor %19 : memref<1x4x128xi32, strided<[32768, 128, 1], offset: ?>>
%21 = memref.subview %arg5[%arg2, %arg4] [1, 4] [1, 1] : memref<32x32xi32, strided<[256, 1], offset: ?>> to memref<1x4xi32, strided<[256, 1], offset: ?>>
%22 = bufferization.to_tensor %21 : memref<1x4xi32, strided<[256, 1], offset: ?>>
%23 = scf.for %arg6 = %c0 to %c4 step %c1 iter_args(%arg7 = %21) -> (memref<1x4xi32, strided<[256, 1], offset: ?>>) {
%27 = bufferization.to_tensor %arg7 : memref<1x4xi32, strided<[256, 1], offset: ?>>
%28 = memref.subview %19[0, %arg6, 0] [1, 1, 128] [1, 1, 1] : memref<1x4x128xi32, strided<[32768, 128, 1], offset: ?>> to memref<1x1x128xi32, strided<[32768, 128, 1], offset: ?>>
%29 = bufferization.to_tensor %28 : memref<1x1x128xi32, strided<[32768, 128, 1], offset: ?>>
// Reinterpret the 128 reduction elements as 32 rows of 4 (strides 4 and 1).
%30 = memref.expand_shape %28 [[0], [1], [2, 3]] : memref<1x1x128xi32, strided<[32768, 128, 1], offset: ?>> into memref<1x1x32x4xi32, strided<[32768, 128, 4, 1], offset: ?>>
%31 = bufferization.to_tensor %30 : memref<1x1x32x4xi32, strided<[32768, 128, 4, 1], offset: ?>>
// Lane-wise accumulation of the 32 rows into a vector<1x1x4xi32>.
%32 = scf.for %arg8 = %c0 to %c32 step %c1 iter_args(%arg9 = %cst) -> (vector<1x1x4xi32>) {
%36 = vector.transfer_read %30[%c0, %c0, %arg8, %c0], %c0_i32 {in_bounds = [true, true, true, true]} : memref<1x1x32x4xi32, strided<[32768, 128, 4, 1], offset: ?>>, vector<1x1x1x4xi32>
%37 = vector.multi_reduction <add>, %36, %arg9 [2] : vector<1x1x1x4xi32> to vector<1x1x4xi32>
scf.yield %37 : vector<1x1x4xi32>
}
// Read the current output element, add the 4 lanes to it, and write the sum to the same location.
%33 = vector.transfer_read %arg7[%c0, %arg6], %c0_i32 {in_bounds = [true, true]} : memref<1x4xi32, strided<[256, 1], offset: ?>>, vector<1x1xi32>
%34 = vector.multi_reduction <add>, %32, %33 [2] : vector<1x1x4xi32> to vector<1x1xi32>
vector.transfer_write %34, %arg7[%c0, %arg6] {in_bounds = [true, true]} : vector<1x1xi32>, memref<1x4xi32, strided<[256, 1], offset: ?>>
%35 = bufferization.to_tensor %arg7 : memref<1x4xi32, strided<[256, 1], offset: ?>>
scf.yield %arg7 : memref<1x4xi32, strided<[256, 1], offset: ?>>
}
%24 = bufferization.to_tensor %23 : memref<1x4xi32, strided<[256, 1], offset: ?>>
// Element-wise copy standing in for the former tensor.insert_slice. %25 is built with the same
// operands as %21, and %23 is %21 passed through the loop, so source and destination describe
// the same subview.
%25 = memref.subview %arg5[%arg2, %arg4] [1, 4] [1, 1] : memref<32x32xi32, strided<[256, 1], offset: ?>> to memref<1x4xi32, strided<[256, 1], offset: ?>>
linalg.generic {indexing_maps = [#map1, #map1], iterator_types = ["parallel", "parallel"]} ins(%23 : memref<1x4xi32, strided<[256, 1], offset: ?>>) outs(%25 : memref<1x4xi32, strided<[256, 1], offset: ?>>) {
^bb0(%arg6: i32, %arg7: i32):
linalg.yield %arg6 : i32
}
%26 = bufferization.to_tensor %arg5 : memref<32x32xi32, strided<[256, 1], offset: ?>>
scf.yield %arg5 : memref<32x32xi32, strided<[256, 1], offset: ?>>
}
%17 = bufferization.to_tensor %16 : memref<32x32xi32, strided<[256, 1], offset: ?>>
scf.yield %16 : memref<32x32xi32, strided<[256, 1], offset: ?>>
}
%13 = bufferization.to_tensor %12 : memref<32x32xi32, strided<[256, 1], offset: ?>>
// Element-wise copy standing in for the former flow.dispatch.tensor.store. %14 uses the same
// subview operands as %8, which is the value %12 carries out of the loops.
%14 = memref.subview %2[%arg0, %arg1] [32, 32] [1, 1] : memref<512x256xi32> to memref<32x32xi32, strided<[256, 1], offset: ?>>
linalg.generic {indexing_maps = [#map1, #map1], iterator_types = ["parallel", "parallel"]} ins(%12 : memref<32x32xi32, strided<[256, 1], offset: ?>>) outs(%14 : memref<32x32xi32, strided<[256, 1], offset: ?>>) {
^bb0(%arg2: i32, %arg3: i32):
linalg.yield %arg2 : i32
}
}
}
return
}
}
}
}
// Host-side driver: prepares an input of ones, launches the reduction dispatch together with a
// fill of the reference value 128, and compares the two halves of the output allocation.
func.func private @_split_reduction_pass2() {
%c1048576 = arith.constant 1048576 : index
%c0 = arith.constant 0 : index
%c524288 = arith.constant 524288 : index
// 67108864 == 512 * 256 * 128 * 4 (input element count times 4; unit presumably bytes).
%c67108864 = arith.constant 67108864 : index
%c1_i32 = arith.constant 1 : i32
%c128_i32 = arith.constant 128 : i32
%c512 = arith.constant 512 : index
%c256 = arith.constant 256 : index
%c1 = arith.constant 1 : index
// Input resource, filled completely with i32 1 in its own execute region.
%0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c67108864}
%1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c67108864}) {
stream.cmd.fill %c1_i32, %arg0[%c0 for %c67108864] : i32 -> !stream.resource<transient>{%c67108864}
} => !stream.timepoint
%2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c67108864}
// The filled input is wrapped in util.do_not_optimize; its size is then read back as %4.
%3 = util.do_not_optimize(%2) : !stream.resource<transient>
%4 = stream.resource.size %3 : !stream.resource<transient>
// Output resource of size 1048576, allocated uninitialized.
%5 = stream.resource.alloc uninitialized : !stream.resource<external>{%c1048576}
%6 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%4}, %5 as %arg1: !stream.resource<external>{%c1048576}) {
// Dispatch (workload 512, 256, 1; %arg1 bound write-only over its full size) and a fill of 128
// over [524288, 1048576) are issued inside one stream.cmd.concurrent region.
// NOTE(review): no command here writes [0, 524288) ahead of the dispatch.
stream.cmd.concurrent {
stream.cmd.dispatch @_split_reduction_pass2_dispatch_0::@_split_reduction_pass2_dispatch_0_generic_512x256x128[%c512, %c256, %c1] {
ro %arg0[%c0 for %4] : !stream.resource<transient>{%4},
wo %arg1[%c0 for %c1048576] : !stream.resource<external>{%c1048576}
} attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>]}
stream.cmd.fill %c128_i32, %arg1[%c524288 for %c524288] : i32 -> !stream.resource<external>{%c1048576}
}
} => !stream.timepoint
%7 = stream.timepoint.await %6 => %5 : !stream.resource<external>{%c1048576}
// %8: subview at offset 0; %9: subview at offset 524288 (the filled range). Each has size 524288.
%8 = stream.resource.subview %7[%c0] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288}
%9 = stream.resource.subview %7[%c524288] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288}
// Export both as tensor<512x256xi32> and require equality.
%10 = stream.tensor.export %8 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
%11 = stream.tensor.export %9 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
check.expect_eq(%10, %11) : tensor<512x256xi32>
return
}
}
// -----// IR Dump After ResolveShapedTypeResultDims (resolve-shaped-type-result-dims) ('builtin.module' operation) //----- //
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#map0 = affine_map<()[s0] -> (s0 * 32)>
#map1 = affine_map<(d0, d1) -> (d0, d1)>
#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
// Public function with the `iree.abi.stub` attribute. Its body is a single call to the private
// @_split_reduction_pass2 followed by a return.
func.func @split_reduction_pass2() attributes {iree.abi.stub} {
call @_split_reduction_pass2() : () -> ()
return
}
// Dispatch executable in memref form. Compared with the tensor form, all accesses go through
// memref.subview views of the two bindings and no bufferization.to_tensor ops are present.
hal.executable private @_split_reduction_pass2_dispatch_0 {
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
// Export region returning the constants (8, 16, 1); block arguments are not used.
hal.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 ordinal(0) layout(#pipeline_layout) attributes {translation_info = #translation} {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index):
%c1 = arith.constant 1 : index
%c16 = arith.constant 16 : index
%c8 = arith.constant 8 : index
hal.return %c8, %c16, %c1 : index, index, index
}
builtin.module {
func.func @_split_reduction_pass2_dispatch_0_generic_512x256x128() {
// All-zero start value for the 4-lane partial sums.
%cst = arith.constant dense<0> : vector<1x1x4xi32>
// Padding operand for the vector.transfer_read ops.
%c0_i32 = arith.constant 0 : i32
%c4 = arith.constant 4 : index
%c1 = arith.constant 1 : index
%c32 = arith.constant 32 : index
%c512 = arith.constant 512 : index
%c256 = arith.constant 256 : index
%c0 = arith.constant 0 : index
// Memref subspans %0 (input) and %2 (output) are the ones used below; the tensor-typed
// subspans %1 and %3 of the same bindings have no uses in this function.
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : memref<512x256x128xi32>
memref.assume_alignment %0, 64 : memref<512x256x128xi32>
%1 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:512x256x128xi32>
%2 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : memref<512x256xi32>
memref.assume_alignment %2, 64 : memref<512x256xi32>
%3 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:512x256xi32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
// Multiply ids and counts by 32 (#map0) to obtain tile-loop lower bounds and steps.
%4 = affine.apply #map0()[%workgroup_id_y]
%5 = affine.apply #map0()[%workgroup_count_y]
%6 = affine.apply #map0()[%workgroup_id_x]
%7 = affine.apply #map0()[%workgroup_count_x]
// Workgroup-distributed loops over 32x32 output tiles.
scf.for %arg0 = %4 to %c512 step %5 {
scf.for %arg1 = %6 to %c256 step %7 {
// %8: 32x32 view of the output buffer; %9: 32x32x128 view of the input buffer.
// NOTE(review): %8's elements are read (%20) before they are written, and the host function
// shows no initialization of that part of the buffer -- confirm this is intended.
%8 = memref.subview %2[%arg0, %arg1] [32, 32] [1, 1] : memref<512x256xi32> to memref<32x32xi32, strided<[256, 1], offset: ?>>
%9 = memref.subview %0[%arg0, %arg1, 0] [32, 32, 128] [1, 1, 1] : memref<512x256x128xi32> to memref<32x32x128xi32, strided<[32768, 128, 1], offset: ?>>
// Rows one at a time (%arg2), columns four at a time (%arg4); each loop yields its memref
// iter_arg unchanged, so the carried values all refer to the view %8.
%10 = scf.for %arg2 = %c0 to %c32 step %c1 iter_args(%arg3 = %8) -> (memref<32x32xi32, strided<[256, 1], offset: ?>>) {
%12 = scf.for %arg4 = %c0 to %c32 step %c4 iter_args(%arg5 = %arg3) -> (memref<32x32xi32, strided<[256, 1], offset: ?>>) {
%13 = memref.subview %9[%arg2, %arg4, 0] [1, 4, 128] [1, 1, 1] : memref<32x32x128xi32, strided<[32768, 128, 1], offset: ?>> to memref<1x4x128xi32, strided<[32768, 128, 1], offset: ?>>
%14 = memref.subview %arg5[%arg2, %arg4] [1, 4] [1, 1] : memref<32x32xi32, strided<[256, 1], offset: ?>> to memref<1x4xi32, strided<[256, 1], offset: ?>>
// One output element per iteration of %arg6.
%15 = scf.for %arg6 = %c0 to %c4 step %c1 iter_args(%arg7 = %14) -> (memref<1x4xi32, strided<[256, 1], offset: ?>>) {
%17 = memref.subview %13[0, %arg6, 0] [1, 1, 128] [1, 1, 1] : memref<1x4x128xi32, strided<[32768, 128, 1], offset: ?>> to memref<1x1x128xi32, strided<[32768, 128, 1], offset: ?>>
// 128 contiguous reduction elements viewed as 32 rows of 4.
%18 = memref.expand_shape %17 [[0], [1], [2, 3]] : memref<1x1x128xi32, strided<[32768, 128, 1], offset: ?>> into memref<1x1x32x4xi32, strided<[32768, 128, 4, 1], offset: ?>>
// Sum the 32 rows lane-wise into a vector<1x1x4xi32> carried by %arg9.
%19 = scf.for %arg8 = %c0 to %c32 step %c1 iter_args(%arg9 = %cst) -> (vector<1x1x4xi32>) {
%22 = vector.transfer_read %18[%c0, %c0, %arg8, %c0], %c0_i32 {in_bounds = [true, true, true, true]} : memref<1x1x32x4xi32, strided<[32768, 128, 4, 1], offset: ?>>, vector<1x1x1x4xi32>
%23 = vector.multi_reduction <add>, %22, %arg9 [2] : vector<1x1x1x4xi32> to vector<1x1x4xi32>
scf.yield %23 : vector<1x1x4xi32>
}
// Add the 4 lanes onto the value currently stored at the output element and store the result.
%20 = vector.transfer_read %arg7[%c0, %arg6], %c0_i32 {in_bounds = [true, true]} : memref<1x4xi32, strided<[256, 1], offset: ?>>, vector<1x1xi32>
%21 = vector.multi_reduction <add>, %19, %20 [2] : vector<1x1x4xi32> to vector<1x1xi32>
vector.transfer_write %21, %arg7[%c0, %arg6] {in_bounds = [true, true]} : vector<1x1xi32>, memref<1x4xi32, strided<[256, 1], offset: ?>>
scf.yield %arg7 : memref<1x4xi32, strided<[256, 1], offset: ?>>
}
// Element-wise copy from %15 to %16. %16 is created with the same operands as %14, and %15 is
// %14 passed through the loop, so both sides describe the same subview.
%16 = memref.subview %arg5[%arg2, %arg4] [1, 4] [1, 1] : memref<32x32xi32, strided<[256, 1], offset: ?>> to memref<1x4xi32, strided<[256, 1], offset: ?>>
linalg.generic {indexing_maps = [#map1, #map1], iterator_types = ["parallel", "parallel"]} ins(%15 : memref<1x4xi32, strided<[256, 1], offset: ?>>) outs(%16 : memref<1x4xi32, strided<[256, 1], offset: ?>>) {
^bb0(%arg6: i32, %arg7: i32):
linalg.yield %arg6 : i32
}
scf.yield %arg5 : memref<32x32xi32, strided<[256, 1], offset: ?>>
}
scf.yield %12 : memref<32x32xi32, strided<[256, 1], offset: ?>>
}
// Element-wise copy of the tile result into %11, a subview built with the same operands as %8.
%11 = memref.subview %2[%arg0, %arg1] [32, 32] [1, 1] : memref<512x256xi32> to memref<32x32xi32, strided<[256, 1], offset: ?>>
linalg.generic {indexing_maps = [#map1, #map1], iterator_types = ["parallel", "parallel"]} ins(%10 : memref<32x32xi32, strided<[256, 1], offset: ?>>) outs(%11 : memref<32x32xi32, strided<[256, 1], offset: ?>>) {
^bb0(%arg2: i32, %arg3: i32):
linalg.yield %arg2 : i32
}
}
}
return
}
}
}
}
// Host-side sequence for the test: input of ones -> reduction dispatch -> equality check of the
// dispatch's half of the output allocation against the half filled with 128.
func.func private @_split_reduction_pass2() {
%c1048576 = arith.constant 1048576 : index
%c0 = arith.constant 0 : index
%c524288 = arith.constant 524288 : index
// Numerically 512 * 256 * 128 * 4; presumably the input size in bytes for i32 elements.
%c67108864 = arith.constant 67108864 : index
%c1_i32 = arith.constant 1 : i32
%c128_i32 = arith.constant 128 : i32
%c512 = arith.constant 512 : index
%c256 = arith.constant 256 : index
%c1 = arith.constant 1 : index
// Allocate the input (uninitialized) and fill its whole range with the i32 value 1.
%0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c67108864}
%1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c67108864}) {
stream.cmd.fill %c1_i32, %arg0[%c0 for %c67108864] : i32 -> !stream.resource<transient>{%c67108864}
} => !stream.timepoint
%2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c67108864}
// Pass the input through util.do_not_optimize and query the resulting resource's size.
%3 = util.do_not_optimize(%2) : !stream.resource<transient>
%4 = stream.resource.size %3 : !stream.resource<transient>
// Allocate the combined result/reference buffer (size 1048576), also uninitialized.
%5 = stream.resource.alloc uninitialized : !stream.resource<external>{%c1048576}
%6 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%4}, %5 as %arg1: !stream.resource<external>{%c1048576}) {
// Two commands in a concurrent region: the dispatch with workload (512, 256, 1), which binds the
// full %arg1 range as `wo`, and a fill writing 128 into the range at offset 524288, length 524288.
// NOTE(review): the range at offset 0 receives no fill in this function before the dispatch.
stream.cmd.concurrent {
stream.cmd.dispatch @_split_reduction_pass2_dispatch_0::@_split_reduction_pass2_dispatch_0_generic_512x256x128[%c512, %c256, %c1] {
ro %arg0[%c0 for %4] : !stream.resource<transient>{%4},
wo %arg1[%c0 for %c1048576] : !stream.resource<external>{%c1048576}
} attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>]}
stream.cmd.fill %c128_i32, %arg1[%c524288 for %c524288] : i32 -> !stream.resource<external>{%c1048576}
}
} => !stream.timepoint
%7 = stream.timepoint.await %6 => %5 : !stream.resource<external>{%c1048576}
// Two subviews of size 524288: %8 starts at 0, %9 starts at 524288.
%8 = stream.resource.subview %7[%c0] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288}
%9 = stream.resource.subview %7[%c524288] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288}
// Export both as 512x256xi32 tensors; check.expect_eq asserts they are equal.
%10 = stream.tensor.export %8 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
%11 = stream.tensor.export %9 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
check.expect_eq(%10, %11) : tensor<512x256xi32>
return
}
}
// -----// IR Dump After Canonicalizer (canonicalize) ('func.func' operation: @_split_reduction_pass2_dispatch_0_generic_512x256x128) //----- //
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#map0 = affine_map<()[s0] -> (s0 * 32)>
#map1 = affine_map<(d0, d1) -> (d0, d1)>
#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
// Public entry point (tagged iree.abi.stub): takes no arguments, calls the
// private @_split_reduction_pass2 implementation and returns nothing.
func.func @split_reduction_pass2() attributes {iree.abi.stub} {
call @_split_reduction_pass2() : () -> ()
return
}
// Executable holding the single reduction dispatch, with one variant that
// targets #executable_target_embedded_elf_x86_64_.
hal.executable private @_split_reduction_pass2_dispatch_0 {
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
// Export region: its block arguments are unused and it returns the constants
// (8, 16, 1).
// NOTE(review): presumably these are the x/y/z workgroup counts, with
// 8 = 256/32 and 16 = 512/32 matching the 32x32 tiles used below - confirm.
hal.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 ordinal(0) layout(#pipeline_layout) attributes {translation_info = #translation} {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index):
%c1 = arith.constant 1 : index
%c16 = arith.constant 16 : index
%c8 = arith.constant 8 : index
hal.return %c8, %c16, %c1 : index, index, index
}
builtin.module {
// Kernel: for every element of the 512x256 output, sums the 128 i32 values along
// the innermost dimension of the 512x256x128 input and adds that sum to the value
// already stored in the output element.
func.func @_split_reduction_pass2_dispatch_0_generic_512x256x128() {
// All-zero vector used as the initial accumulator of the innermost loop.
%cst = arith.constant dense<0> : vector<1x1x4xi32>
%c0_i32 = arith.constant 0 : i32
%c4 = arith.constant 4 : index
%c1 = arith.constant 1 : index
%c32 = arith.constant 32 : index
%c512 = arith.constant 512 : index
%c256 = arith.constant 256 : index
%c0 = arith.constant 0 : index
// %0 / %2: memref views of binding 0 (input) and binding 1 (output), each
// followed by a 64-byte alignment assumption.
// %1 / %3: !flow.dispatch.tensor views of the same two bindings; neither value is
// referenced anywhere else in this function.
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : memref<512x256x128xi32>
memref.assume_alignment %0, 64 : memref<512x256x128xi32>
%1 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:512x256x128xi32>
%2 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : memref<512x256xi32>
memref.assume_alignment %2, 64 : memref<512x256xi32>
%3 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:512x256xi32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
// #map0 multiplies its symbol by 32: %4/%6 are the starting row/column of this
// workgroup (id * 32) and %5/%7 are the loop strides (count * 32).
%4 = affine.apply #map0()[%workgroup_id_y]
%5 = affine.apply #map0()[%workgroup_count_y]
%6 = affine.apply #map0()[%workgroup_id_x]
%7 = affine.apply #map0()[%workgroup_count_x]
// Outer loops: rows are strided by the y workgroup id/count, columns by x.
scf.for %arg0 = %4 to %c512 step %5 {
scf.for %arg1 = %6 to %c256 step %7 {
// %8: 32x32 output tile; %9: the matching 32x32x128 input tile.
%8 = memref.subview %2[%arg0, %arg1] [32, 32] [1, 1] : memref<512x256xi32> to memref<32x32xi32, strided<[256, 1], offset: ?>>
%9 = memref.subview %0[%arg0, %arg1, 0] [32, 32, 128] [1, 1, 1] : memref<512x256x128xi32> to memref<32x32x128xi32, strided<[32768, 128, 1], offset: ?>>
// Walk the tile one row (%arg2) and four columns (%arg3) at a time.
scf.for %arg2 = %c0 to %c32 step %c1 {
scf.for %arg3 = %c0 to %c32 step %c4 {
%11 = memref.subview %9[%arg2, %arg3, 0] [1, 4, 128] [1, 1, 1] : memref<32x32x128xi32, strided<[32768, 128, 1], offset: ?>> to memref<1x4x128xi32, strided<[32768, 128, 1], offset: ?>>
%12 = memref.subview %8[%arg2, %arg3] [1, 4] [1, 1] : memref<32x32xi32, strided<[256, 1], offset: ?>> to memref<1x4xi32, strided<[256, 1], offset: ?>>
// One output element per iteration of %arg4.
scf.for %arg4 = %c0 to %c4 step %c1 {
// View the 128 reduction values of this element as 32 groups of 4.
%14 = memref.subview %11[0, %arg4, 0] [1, 1, 128] [1, 1, 1] : memref<1x4x128xi32, strided<[32768, 128, 1], offset: ?>> to memref<1x1x128xi32, strided<[32768, 128, 1], offset: ?>>
%15 = memref.expand_shape %14 [[0], [1], [2, 3]] : memref<1x1x128xi32, strided<[32768, 128, 1], offset: ?>> into memref<1x1x32x4xi32, strided<[32768, 128, 4, 1], offset: ?>>
// First stage: add the 32 groups lane-wise into a 4-lane partial sum,
// starting from the zero vector %cst.
%16 = scf.for %arg5 = %c0 to %c32 step %c1 iter_args(%arg6 = %cst) -> (vector<1x1x4xi32>) {
%19 = vector.transfer_read %15[%c0, %c0, %arg5, %c0], %c0_i32 {in_bounds = [true, true, true, true]} : memref<1x1x32x4xi32, strided<[32768, 128, 4, 1], offset: ?>>, vector<1x1x1x4xi32>
%20 = vector.multi_reduction <add>, %19, %arg6 [2] : vector<1x1x1x4xi32> to vector<1x1x4xi32>
scf.yield %20 : vector<1x1x4xi32>
}
// Second stage: read the current output element, add the 4 partial-sum lanes
// to it, and write the result back to the same location.
// NOTE(review): the output is read before this kernel writes it, and the host
// function in this module allocates that buffer `uninitialized` without filling
// its first half - confirm the output is guaranteed to start at zero.
%17 = vector.transfer_read %12[%c0, %arg4], %c0_i32 {in_bounds = [true, true]} : memref<1x4xi32, strided<[256, 1], offset: ?>>, vector<1x1xi32>
%18 = vector.multi_reduction <add>, %16, %17 [2] : vector<1x1x4xi32> to vector<1x1xi32>
vector.transfer_write %18, %12[%c0, %arg4] {in_bounds = [true, true]} : vector<1x1xi32>, memref<1x4xi32, strided<[256, 1], offset: ?>>
}
// %13 is built from the same source, offsets and sizes as %12, so this generic
// copies the 1x4 slice onto itself.
%13 = memref.subview %8[%arg2, %arg3] [1, 4] [1, 1] : memref<32x32xi32, strided<[256, 1], offset: ?>> to memref<1x4xi32, strided<[256, 1], offset: ?>>
linalg.generic {indexing_maps = [#map1, #map1], iterator_types = ["parallel", "parallel"]} ins(%12 : memref<1x4xi32, strided<[256, 1], offset: ?>>) outs(%13 : memref<1x4xi32, strided<[256, 1], offset: ?>>) {
^bb0(%arg4: i32, %arg5: i32):
linalg.yield %arg4 : i32
}
}
}
// %10 is built from the same source, offsets and sizes as %8, so this generic
// copies the 32x32 tile onto itself.
%10 = memref.subview %2[%arg0, %arg1] [32, 32] [1, 1] : memref<512x256xi32> to memref<32x32xi32, strided<[256, 1], offset: ?>>
linalg.generic {indexing_maps = [#map1, #map1], iterator_types = ["parallel", "parallel"]} ins(%8 : memref<32x32xi32, strided<[256, 1], offset: ?>>) outs(%10 : memref<32x32xi32, strided<[256, 1], offset: ?>>) {
^bb0(%arg2: i32, %arg3: i32):
linalg.yield %arg2 : i32
}
}
}
return
}
}
}
}
// Host-side body of the test: fills the input resource with ones, runs the
// reduction dispatch, and compares its output against a reference region that
// is filled with 128.
func.func private @_split_reduction_pass2() {
// Byte sizes: 1048576 is the combined output allocation, 524288 = 512*256*4 is one
// 512x256xi32 tensor, and 67108864 = 512*256*128*4 matches the 512x256x128xi32
// input binding of the dispatch.
%c1048576 = arith.constant 1048576 : index
%c0 = arith.constant 0 : index
%c524288 = arith.constant 524288 : index
%c67108864 = arith.constant 67108864 : index
%c1_i32 = arith.constant 1 : i32
%c128_i32 = arith.constant 128 : i32
%c512 = arith.constant 512 : index
%c256 = arith.constant 256 : index
%c1 = arith.constant 1 : index
// Input: transient resource, every i32 set to 1.
%0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c67108864}
%1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c67108864}) {
stream.cmd.fill %c1_i32, %arg0[%c0 for %c67108864] : i32 -> !stream.resource<transient>{%c67108864}
} => !stream.timepoint
%2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c67108864}
// The result of util.do_not_optimize carries no size, so it is re-queried below.
%3 = util.do_not_optimize(%2) : !stream.resource<transient>
%4 = stream.resource.size %3 : !stream.resource<transient>
// Output: one external resource holding two 524288-byte regions. The dispatch is
// bound to it write-only; the fill writes 128 into bytes [524288, 1048576).
// NOTE(review): %5 is allocated `uninitialized` and its first half is never
// filled here, while the dispatch kernel in this module reads each output element
// before writing it - confirm the first half is meant to start zeroed.
// NOTE(review): the write-only binding spans the whole 1048576 bytes, which also
// covers the range written by the concurrent fill; the kernel's output memref is
// only 512x256xi32, so presumably the two writers do not overlap - confirm.
%5 = stream.resource.alloc uninitialized : !stream.resource<external>{%c1048576}
%6 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%4}, %5 as %arg1: !stream.resource<external>{%c1048576}) {
stream.cmd.concurrent {
stream.cmd.dispatch @_split_reduction_pass2_dispatch_0::@_split_reduction_pass2_dispatch_0_generic_512x256x128[%c512, %c256, %c1] {
ro %arg0[%c0 for %4] : !stream.resource<transient>{%4},
wo %arg1[%c0 for %c1048576] : !stream.resource<external>{%c1048576}
} attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>]}
stream.cmd.fill %c128_i32, %arg1[%c524288 for %c524288] : i32 -> !stream.resource<external>{%c1048576}
}
} => !stream.timepoint
%7 = stream.timepoint.await %6 => %5 : !stream.resource<external>{%c1048576}
// %8: first half (offset 0); %9: second half (offset 524288, filled with 128).
%8 = stream.resource.subview %7[%c0] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288}
%9 = stream.resource.subview %7[%c524288] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288}
// Export both halves as 512x256xi32 tensors and require them to be equal.
%10 = stream.tensor.export %8 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
%11 = stream.tensor.export %9 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
check.expect_eq(%10, %11) : tensor<512x256xi32>
return
}
}
// -----// IR Dump After CSE (cse) ('func.func' operation: @_split_reduction_pass2_dispatch_0_generic_512x256x128) //----- //
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#map0 = affine_map<()[s0] -> (s0 * 32)>
#map1 = affine_map<(d0, d1) -> (d0, d1)>
#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
// Public entry point (tagged iree.abi.stub): takes no arguments, calls the
// private @_split_reduction_pass2 implementation and returns nothing.
func.func @split_reduction_pass2() attributes {iree.abi.stub} {
call @_split_reduction_pass2() : () -> ()
return
}
// Executable holding the single reduction dispatch, with one variant that
// targets #executable_target_embedded_elf_x86_64_.
hal.executable private @_split_reduction_pass2_dispatch_0 {
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
// Export region: its block arguments are unused and it returns the constants
// (8, 16, 1).
// NOTE(review): presumably these are the x/y/z workgroup counts, with
// 8 = 256/32 and 16 = 512/32 matching the 32x32 tiles used below - confirm.
hal.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 ordinal(0) layout(#pipeline_layout) attributes {translation_info = #translation} {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index):
%c1 = arith.constant 1 : index
%c16 = arith.constant 16 : index
%c8 = arith.constant 8 : index
hal.return %c8, %c16, %c1 : index, index, index
}
builtin.module {
// Kernel: for every element of the 512x256 output, sums the 128 i32 values along
// the innermost dimension of the 512x256x128 input and adds that sum to the value
// already stored in the output element.
func.func @_split_reduction_pass2_dispatch_0_generic_512x256x128() {
// All-zero vector used as the initial accumulator of the innermost loop.
%cst = arith.constant dense<0> : vector<1x1x4xi32>
%c0_i32 = arith.constant 0 : i32
%c4 = arith.constant 4 : index
%c1 = arith.constant 1 : index
%c32 = arith.constant 32 : index
%c512 = arith.constant 512 : index
%c256 = arith.constant 256 : index
%c0 = arith.constant 0 : index
// %0 / %2: memref views of binding 0 (input) and binding 1 (output), each
// followed by a 64-byte alignment assumption.
// %1 / %3: !flow.dispatch.tensor views of the same two bindings; neither value is
// referenced anywhere else in this function.
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : memref<512x256x128xi32>
memref.assume_alignment %0, 64 : memref<512x256x128xi32>
%1 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:512x256x128xi32>
%2 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : memref<512x256xi32>
memref.assume_alignment %2, 64 : memref<512x256xi32>
%3 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:512x256xi32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
// #map0 multiplies its symbol by 32: %4/%6 are the starting row/column of this
// workgroup (id * 32) and %5/%7 are the loop strides (count * 32).
%4 = affine.apply #map0()[%workgroup_id_y]
%5 = affine.apply #map0()[%workgroup_count_y]
%6 = affine.apply #map0()[%workgroup_id_x]
%7 = affine.apply #map0()[%workgroup_count_x]
// Outer loops: rows are strided by the y workgroup id/count, columns by x.
scf.for %arg0 = %4 to %c512 step %5 {
scf.for %arg1 = %6 to %c256 step %7 {
// %8: 32x32 output tile; %9: the matching 32x32x128 input tile.
%8 = memref.subview %2[%arg0, %arg1] [32, 32] [1, 1] : memref<512x256xi32> to memref<32x32xi32, strided<[256, 1], offset: ?>>
%9 = memref.subview %0[%arg0, %arg1, 0] [32, 32, 128] [1, 1, 1] : memref<512x256x128xi32> to memref<32x32x128xi32, strided<[32768, 128, 1], offset: ?>>
// Walk the tile one row (%arg2) and four columns (%arg3) at a time.
scf.for %arg2 = %c0 to %c32 step %c1 {
scf.for %arg3 = %c0 to %c32 step %c4 {
%10 = memref.subview %9[%arg2, %arg3, 0] [1, 4, 128] [1, 1, 1] : memref<32x32x128xi32, strided<[32768, 128, 1], offset: ?>> to memref<1x4x128xi32, strided<[32768, 128, 1], offset: ?>>
%11 = memref.subview %8[%arg2, %arg3] [1, 4] [1, 1] : memref<32x32xi32, strided<[256, 1], offset: ?>> to memref<1x4xi32, strided<[256, 1], offset: ?>>
// One output element per iteration of %arg4.
scf.for %arg4 = %c0 to %c4 step %c1 {
// View the 128 reduction values of this element as 32 groups of 4.
%12 = memref.subview %10[0, %arg4, 0] [1, 1, 128] [1, 1, 1] : memref<1x4x128xi32, strided<[32768, 128, 1], offset: ?>> to memref<1x1x128xi32, strided<[32768, 128, 1], offset: ?>>
%13 = memref.expand_shape %12 [[0], [1], [2, 3]] : memref<1x1x128xi32, strided<[32768, 128, 1], offset: ?>> into memref<1x1x32x4xi32, strided<[32768, 128, 4, 1], offset: ?>>
// First stage: add the 32 groups lane-wise into a 4-lane partial sum,
// starting from the zero vector %cst.
%14 = scf.for %arg5 = %c0 to %c32 step %c1 iter_args(%arg6 = %cst) -> (vector<1x1x4xi32>) {
%17 = vector.transfer_read %13[%c0, %c0, %arg5, %c0], %c0_i32 {in_bounds = [true, true, true, true]} : memref<1x1x32x4xi32, strided<[32768, 128, 4, 1], offset: ?>>, vector<1x1x1x4xi32>
%18 = vector.multi_reduction <add>, %17, %arg6 [2] : vector<1x1x1x4xi32> to vector<1x1x4xi32>
scf.yield %18 : vector<1x1x4xi32>
}
// Second stage: read the current output element, add the 4 partial-sum lanes
// to it, and write the result back to the same location.
// NOTE(review): the output is read before this kernel writes it, and the host
// function in this module allocates that buffer `uninitialized` without filling
// its first half - confirm the output is guaranteed to start at zero.
%15 = vector.transfer_read %11[%c0, %arg4], %c0_i32 {in_bounds = [true, true]} : memref<1x4xi32, strided<[256, 1], offset: ?>>, vector<1x1xi32>
%16 = vector.multi_reduction <add>, %14, %15 [2] : vector<1x1x4xi32> to vector<1x1xi32>
vector.transfer_write %16, %11[%c0, %arg4] {in_bounds = [true, true]} : vector<1x1xi32>, memref<1x4xi32, strided<[256, 1], offset: ?>>
}
// Input and output are the same value (%11): the 1x4 slice is copied onto itself.
linalg.generic {indexing_maps = [#map1, #map1], iterator_types = ["parallel", "parallel"]} ins(%11 : memref<1x4xi32, strided<[256, 1], offset: ?>>) outs(%11 : memref<1x4xi32, strided<[256, 1], offset: ?>>) {
^bb0(%arg4: i32, %arg5: i32):
linalg.yield %arg4 : i32
}
}
}
// Input and output are the same value (%8): the 32x32 tile is copied onto itself.
linalg.generic {indexing_maps = [#map1, #map1], iterator_types = ["parallel", "parallel"]} ins(%8 : memref<32x32xi32, strided<[256, 1], offset: ?>>) outs(%8 : memref<32x32xi32, strided<[256, 1], offset: ?>>) {
^bb0(%arg2: i32, %arg3: i32):
linalg.yield %arg2 : i32
}
}
}
return
}
}
}
}
// Host-side body of the test: fills the input resource with ones, runs the
// reduction dispatch, and compares its output against a reference region that
// is filled with 128.
func.func private @_split_reduction_pass2() {
// Byte sizes: 1048576 is the combined output allocation, 524288 = 512*256*4 is one
// 512x256xi32 tensor, and 67108864 = 512*256*128*4 matches the 512x256x128xi32
// input binding of the dispatch.
%c1048576 = arith.constant 1048576 : index
%c0 = arith.constant 0 : index
%c524288 = arith.constant 524288 : index
%c67108864 = arith.constant 67108864 : index
%c1_i32 = arith.constant 1 : i32
%c128_i32 = arith.constant 128 : i32
%c512 = arith.constant 512 : index
%c256 = arith.constant 256 : index
%c1 = arith.constant 1 : index
// Input: transient resource, every i32 set to 1.
%0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c67108864}
%1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c67108864}) {
stream.cmd.fill %c1_i32, %arg0[%c0 for %c67108864] : i32 -> !stream.resource<transient>{%c67108864}
} => !stream.timepoint
%2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c67108864}
// The result of util.do_not_optimize carries no size, so it is re-queried below.
%3 = util.do_not_optimize(%2) : !stream.resource<transient>
%4 = stream.resource.size %3 : !stream.resource<transient>
// Output: one external resource holding two 524288-byte regions. The dispatch is
// bound to it write-only; the fill writes 128 into bytes [524288, 1048576).
// NOTE(review): %5 is allocated `uninitialized` and its first half is never
// filled here, while the dispatch kernel in this module reads each output element
// before writing it - confirm the first half is meant to start zeroed.
// NOTE(review): the write-only binding spans the whole 1048576 bytes, which also
// covers the range written by the concurrent fill; the kernel's output memref is
// only 512x256xi32, so presumably the two writers do not overlap - confirm.
%5 = stream.resource.alloc uninitialized : !stream.resource<external>{%c1048576}
%6 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%4}, %5 as %arg1: !stream.resource<external>{%c1048576}) {
stream.cmd.concurrent {
stream.cmd.dispatch @_split_reduction_pass2_dispatch_0::@_split_reduction_pass2_dispatch_0_generic_512x256x128[%c512, %c256, %c1] {
ro %arg0[%c0 for %4] : !stream.resource<transient>{%4},
wo %arg1[%c0 for %c1048576] : !stream.resource<external>{%c1048576}
} attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>]}
stream.cmd.fill %c128_i32, %arg1[%c524288 for %c524288] : i32 -> !stream.resource<external>{%c1048576}
}
} => !stream.timepoint
%7 = stream.timepoint.await %6 => %5 : !stream.resource<external>{%c1048576}
// %8: first half (offset 0); %9: second half (offset 524288, filled with 128).
%8 = stream.resource.subview %7[%c0] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288}
%9 = stream.resource.subview %7[%c524288] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288}
// Export both halves as 512x256xi32 tensors and require them to be equal.
%10 = stream.tensor.export %8 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
%11 = stream.tensor.export %9 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
check.expect_eq(%10, %11) : tensor<512x256xi32>
return
}
}
// -----// IR Dump After Canonicalizer (canonicalize) ('func.func' operation: @_split_reduction_pass2_dispatch_0_generic_512x256x128) //----- //
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#map = affine_map<()[s0] -> (s0 * 32)>
#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
// Public entry point (tagged iree.abi.stub): takes no arguments, calls the
// private @_split_reduction_pass2 implementation and returns nothing.
func.func @split_reduction_pass2() attributes {iree.abi.stub} {
call @_split_reduction_pass2() : () -> ()
return
}
// Executable holding the single reduction dispatch, with one variant that
// targets #executable_target_embedded_elf_x86_64_.
hal.executable private @_split_reduction_pass2_dispatch_0 {
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
// Export region: its block arguments are unused and it returns the constants
// (8, 16, 1).
// NOTE(review): presumably these are the x/y/z workgroup counts, with
// 8 = 256/32 and 16 = 512/32 matching the 32x32 tiles used below - confirm.
hal.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 ordinal(0) layout(#pipeline_layout) attributes {translation_info = #translation} {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index):
%c1 = arith.constant 1 : index
%c16 = arith.constant 16 : index
%c8 = arith.constant 8 : index
hal.return %c8, %c16, %c1 : index, index, index
}
builtin.module {
// Kernel: for every element of the 512x256 output, sums the 128 i32 values along
// the innermost dimension of the 512x256x128 input and adds that sum to the value
// already stored in the output element.
func.func @_split_reduction_pass2_dispatch_0_generic_512x256x128() {
// All-zero vector used as the initial accumulator of the innermost loop.
%cst = arith.constant dense<0> : vector<1x1x4xi32>
%c0_i32 = arith.constant 0 : i32
%c4 = arith.constant 4 : index
%c1 = arith.constant 1 : index
%c32 = arith.constant 32 : index
%c512 = arith.constant 512 : index
%c256 = arith.constant 256 : index
%c0 = arith.constant 0 : index
// %0 / %2: memref views of binding 0 (input) and binding 1 (output), each
// followed by a 64-byte alignment assumption.
// %1 / %3: !flow.dispatch.tensor views of the same two bindings; neither value is
// referenced anywhere else in this function.
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : memref<512x256x128xi32>
memref.assume_alignment %0, 64 : memref<512x256x128xi32>
%1 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:512x256x128xi32>
%2 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : memref<512x256xi32>
memref.assume_alignment %2, 64 : memref<512x256xi32>
%3 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:512x256xi32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
// #map multiplies its symbol by 32: %4/%6 are the starting row/column of this
// workgroup (id * 32) and %5/%7 are the loop strides (count * 32).
%4 = affine.apply #map()[%workgroup_id_y]
%5 = affine.apply #map()[%workgroup_count_y]
%6 = affine.apply #map()[%workgroup_id_x]
%7 = affine.apply #map()[%workgroup_count_x]
// Outer loops: rows are strided by the y workgroup id/count, columns by x.
scf.for %arg0 = %4 to %c512 step %5 {
scf.for %arg1 = %6 to %c256 step %7 {
// %8: 32x32 output tile; %9: the matching 32x32x128 input tile.
%8 = memref.subview %2[%arg0, %arg1] [32, 32] [1, 1] : memref<512x256xi32> to memref<32x32xi32, strided<[256, 1], offset: ?>>
%9 = memref.subview %0[%arg0, %arg1, 0] [32, 32, 128] [1, 1, 1] : memref<512x256x128xi32> to memref<32x32x128xi32, strided<[32768, 128, 1], offset: ?>>
// Walk the tile one row (%arg2) and four columns (%arg3) at a time.
scf.for %arg2 = %c0 to %c32 step %c1 {
scf.for %arg3 = %c0 to %c32 step %c4 {
%10 = memref.subview %9[%arg2, %arg3, 0] [1, 4, 128] [1, 1, 1] : memref<32x32x128xi32, strided<[32768, 128, 1], offset: ?>> to memref<1x4x128xi32, strided<[32768, 128, 1], offset: ?>>
%11 = memref.subview %8[%arg2, %arg3] [1, 4] [1, 1] : memref<32x32xi32, strided<[256, 1], offset: ?>> to memref<1x4xi32, strided<[256, 1], offset: ?>>
// One output element per iteration of %arg4.
scf.for %arg4 = %c0 to %c4 step %c1 {
// View the 128 reduction values of this element as 32 groups of 4.
%12 = memref.subview %10[0, %arg4, 0] [1, 1, 128] [1, 1, 1] : memref<1x4x128xi32, strided<[32768, 128, 1], offset: ?>> to memref<1x1x128xi32, strided<[32768, 128, 1], offset: ?>>
%13 = memref.expand_shape %12 [[0], [1], [2, 3]] : memref<1x1x128xi32, strided<[32768, 128, 1], offset: ?>> into memref<1x1x32x4xi32, strided<[32768, 128, 4, 1], offset: ?>>
// First stage: add the 32 groups lane-wise into a 4-lane partial sum,
// starting from the zero vector %cst.
%14 = scf.for %arg5 = %c0 to %c32 step %c1 iter_args(%arg6 = %cst) -> (vector<1x1x4xi32>) {
%17 = vector.transfer_read %13[%c0, %c0, %arg5, %c0], %c0_i32 {in_bounds = [true, true, true, true]} : memref<1x1x32x4xi32, strided<[32768, 128, 4, 1], offset: ?>>, vector<1x1x1x4xi32>
%18 = vector.multi_reduction <add>, %17, %arg6 [2] : vector<1x1x1x4xi32> to vector<1x1x4xi32>
scf.yield %18 : vector<1x1x4xi32>
}
// Second stage: read the current output element, add the 4 partial-sum lanes
// to it, and write the result back to the same location.
// NOTE(review): the output is read before this kernel writes it, and the host
// function in this module allocates that buffer `uninitialized` without filling
// its first half - confirm the output is guaranteed to start at zero.
%15 = vector.transfer_read %11[%c0, %arg4], %c0_i32 {in_bounds = [true, true]} : memref<1x4xi32, strided<[256, 1], offset: ?>>, vector<1x1xi32>
%16 = vector.multi_reduction <add>, %14, %15 [2] : vector<1x1x4xi32> to vector<1x1xi32>
vector.transfer_write %16, %11[%c0, %arg4] {in_bounds = [true, true]} : vector<1x1xi32>, memref<1x4xi32, strided<[256, 1], offset: ?>>
}
}
}
}
}
return
}
}
}
}
// Host-side body of the test: fills the input resource with ones, runs the
// reduction dispatch, and compares its output against a reference region that
// is filled with 128.
func.func private @_split_reduction_pass2() {
// Byte sizes: 1048576 is the combined output allocation, 524288 = 512*256*4 is one
// 512x256xi32 tensor, and 67108864 = 512*256*128*4 matches the 512x256x128xi32
// input binding of the dispatch.
%c1048576 = arith.constant 1048576 : index
%c0 = arith.constant 0 : index
%c524288 = arith.constant 524288 : index
%c67108864 = arith.constant 67108864 : index
%c1_i32 = arith.constant 1 : i32
%c128_i32 = arith.constant 128 : i32
%c512 = arith.constant 512 : index
%c256 = arith.constant 256 : index
%c1 = arith.constant 1 : index
// Input: transient resource, every i32 set to 1.
%0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c67108864}
%1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c67108864}) {
stream.cmd.fill %c1_i32, %arg0[%c0 for %c67108864] : i32 -> !stream.resource<transient>{%c67108864}
} => !stream.timepoint
%2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c67108864}
// The result of util.do_not_optimize carries no size, so it is re-queried below.
%3 = util.do_not_optimize(%2) : !stream.resource<transient>
%4 = stream.resource.size %3 : !stream.resource<transient>
// Output: one external resource holding two 524288-byte regions. The dispatch is
// bound to it write-only; the fill writes 128 into bytes [524288, 1048576).
// NOTE(review): %5 is allocated `uninitialized` and its first half is never
// filled here, while the dispatch kernel in this module reads each output element
// before writing it - confirm the first half is meant to start zeroed.
// NOTE(review): the write-only binding spans the whole 1048576 bytes, which also
// covers the range written by the concurrent fill; the kernel's output memref is
// only 512x256xi32, so presumably the two writers do not overlap - confirm.
%5 = stream.resource.alloc uninitialized : !stream.resource<external>{%c1048576}
%6 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%4}, %5 as %arg1: !stream.resource<external>{%c1048576}) {
stream.cmd.concurrent {
stream.cmd.dispatch @_split_reduction_pass2_dispatch_0::@_split_reduction_pass2_dispatch_0_generic_512x256x128[%c512, %c256, %c1] {
ro %arg0[%c0 for %4] : !stream.resource<transient>{%4},
wo %arg1[%c0 for %c1048576] : !stream.resource<external>{%c1048576}
} attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>]}
stream.cmd.fill %c128_i32, %arg1[%c524288 for %c524288] : i32 -> !stream.resource<external>{%c1048576}
}
} => !stream.timepoint
%7 = stream.timepoint.await %6 => %5 : !stream.resource<external>{%c1048576}
// %8: first half (offset 0); %9: second half (offset 524288, filled with 128).
%8 = stream.resource.subview %7[%c0] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288}
%9 = stream.resource.subview %7[%c524288] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288}
// Export both halves as 512x256xi32 tensors and require them to be equal.
%10 = stream.tensor.export %8 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
%11 = stream.tensor.export %9 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
check.expect_eq(%10, %11) : tensor<512x256xi32>
return
}
}
// -----// IR Dump After CleanupBufferAllocView (iree-codegen-cleanup-buffer-alloc-view) ('func.func' operation: @_split_reduction_pass2_dispatch_0_generic_512x256x128) //----- //
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#map = affine_map<()[s0] -> (s0 * 32)>
#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
// Public entry point (tagged iree.abi.stub): takes no arguments, calls the
// private @_split_reduction_pass2 implementation and returns nothing.
func.func @split_reduction_pass2() attributes {iree.abi.stub} {
call @_split_reduction_pass2() : () -> ()
return
}
// Executable holding the single reduction dispatch, with one variant that
// targets #executable_target_embedded_elf_x86_64_.
hal.executable private @_split_reduction_pass2_dispatch_0 {
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
// Export region: its block arguments are unused and it returns the constants
// (8, 16, 1).
// NOTE(review): presumably these are the x/y/z workgroup counts, with
// 8 = 256/32 and 16 = 512/32 matching the 32x32 tiles used below - confirm.
hal.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 ordinal(0) layout(#pipeline_layout) attributes {translation_info = #translation} {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index):
%c1 = arith.constant 1 : index
%c16 = arith.constant 16 : index
%c8 = arith.constant 8 : index
hal.return %c8, %c16, %c1 : index, index, index
}
builtin.module {
// Kernel: for every element of the 512x256 output, sums the 128 i32 values along
// the innermost dimension of the 512x256x128 input and adds that sum to the value
// already stored in the output element.
func.func @_split_reduction_pass2_dispatch_0_generic_512x256x128() {
// All-zero vector used as the initial accumulator of the innermost loop.
%cst = arith.constant dense<0> : vector<1x1x4xi32>
%c0_i32 = arith.constant 0 : i32
%c4 = arith.constant 4 : index
%c1 = arith.constant 1 : index
%c32 = arith.constant 32 : index
%c512 = arith.constant 512 : index
%c256 = arith.constant 256 : index
%c0 = arith.constant 0 : index
// %0 / %1: memref views of binding 0 (input) and binding 1 (output), each
// followed by a 64-byte alignment assumption.
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : memref<512x256x128xi32>
memref.assume_alignment %0, 64 : memref<512x256x128xi32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : memref<512x256xi32>
memref.assume_alignment %1, 64 : memref<512x256xi32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
// #map multiplies its symbol by 32: %2/%4 are the starting row/column of this
// workgroup (id * 32) and %3/%5 are the loop strides (count * 32).
%2 = affine.apply #map()[%workgroup_id_y]
%3 = affine.apply #map()[%workgroup_count_y]
%4 = affine.apply #map()[%workgroup_id_x]
%5 = affine.apply #map()[%workgroup_count_x]
// Outer loops: rows are strided by the y workgroup id/count, columns by x.
scf.for %arg0 = %2 to %c512 step %3 {
scf.for %arg1 = %4 to %c256 step %5 {
// %6: 32x32 output tile; %7: the matching 32x32x128 input tile.
%6 = memref.subview %1[%arg0, %arg1] [32, 32] [1, 1] : memref<512x256xi32> to memref<32x32xi32, strided<[256, 1], offset: ?>>
%7 = memref.subview %0[%arg0, %arg1, 0] [32, 32, 128] [1, 1, 1] : memref<512x256x128xi32> to memref<32x32x128xi32, strided<[32768, 128, 1], offset: ?>>
// Walk the tile one row (%arg2) and four columns (%arg3) at a time.
scf.for %arg2 = %c0 to %c32 step %c1 {
scf.for %arg3 = %c0 to %c32 step %c4 {
%8 = memref.subview %7[%arg2, %arg3, 0] [1, 4, 128] [1, 1, 1] : memref<32x32x128xi32, strided<[32768, 128, 1], offset: ?>> to memref<1x4x128xi32, strided<[32768, 128, 1], offset: ?>>
%9 = memref.subview %6[%arg2, %arg3] [1, 4] [1, 1] : memref<32x32xi32, strided<[256, 1], offset: ?>> to memref<1x4xi32, strided<[256, 1], offset: ?>>
// One output element per iteration of %arg4.
scf.for %arg4 = %c0 to %c4 step %c1 {
// View the 128 reduction values of this element as 32 groups of 4.
%10 = memref.subview %8[0, %arg4, 0] [1, 1, 128] [1, 1, 1] : memref<1x4x128xi32, strided<[32768, 128, 1], offset: ?>> to memref<1x1x128xi32, strided<[32768, 128, 1], offset: ?>>
%11 = memref.expand_shape %10 [[0], [1], [2, 3]] : memref<1x1x128xi32, strided<[32768, 128, 1], offset: ?>> into memref<1x1x32x4xi32, strided<[32768, 128, 4, 1], offset: ?>>
// First stage: add the 32 groups lane-wise into a 4-lane partial sum,
// starting from the zero vector %cst.
%12 = scf.for %arg5 = %c0 to %c32 step %c1 iter_args(%arg6 = %cst) -> (vector<1x1x4xi32>) {
%15 = vector.transfer_read %11[%c0, %c0, %arg5, %c0], %c0_i32 {in_bounds = [true, true, true, true]} : memref<1x1x32x4xi32, strided<[32768, 128, 4, 1], offset: ?>>, vector<1x1x1x4xi32>
%16 = vector.multi_reduction <add>, %15, %arg6 [2] : vector<1x1x1x4xi32> to vector<1x1x4xi32>
scf.yield %16 : vector<1x1x4xi32>
}
// Second stage: read the current output element, add the 4 partial-sum lanes
// to it, and write the result back to the same location.
// NOTE(review): the output is read before this kernel writes it, and the host
// function in this module allocates that buffer `uninitialized` without filling
// its first half - confirm the output is guaranteed to start at zero.
%13 = vector.transfer_read %9[%c0, %arg4], %c0_i32 {in_bounds = [true, true]} : memref<1x4xi32, strided<[256, 1], offset: ?>>, vector<1x1xi32>
%14 = vector.multi_reduction <add>, %12, %13 [2] : vector<1x1x4xi32> to vector<1x1xi32>
vector.transfer_write %14, %9[%c0, %arg4] {in_bounds = [true, true]} : vector<1x1xi32>, memref<1x4xi32, strided<[256, 1], offset: ?>>
}
}
}
}
}
return
}
}
}
}
// Host-side body of the test: fills the input resource with ones, runs the
// reduction dispatch, and compares its output against a reference region that
// is filled with 128.
func.func private @_split_reduction_pass2() {
// Byte sizes: 1048576 is the combined output allocation, 524288 = 512*256*4 is one
// 512x256xi32 tensor, and 67108864 = 512*256*128*4 matches the 512x256x128xi32
// input binding of the dispatch.
%c1048576 = arith.constant 1048576 : index
%c0 = arith.constant 0 : index
%c524288 = arith.constant 524288 : index
%c67108864 = arith.constant 67108864 : index
%c1_i32 = arith.constant 1 : i32
%c128_i32 = arith.constant 128 : i32
%c512 = arith.constant 512 : index
%c256 = arith.constant 256 : index
%c1 = arith.constant 1 : index
// Input: transient resource, every i32 set to 1.
%0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c67108864}
%1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c67108864}) {
stream.cmd.fill %c1_i32, %arg0[%c0 for %c67108864] : i32 -> !stream.resource<transient>{%c67108864}
} => !stream.timepoint
%2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c67108864}
// The result of util.do_not_optimize carries no size, so it is re-queried below.
%3 = util.do_not_optimize(%2) : !stream.resource<transient>
%4 = stream.resource.size %3 : !stream.resource<transient>
// Output: one external resource holding two 524288-byte regions. The dispatch is
// bound to it write-only; the fill writes 128 into bytes [524288, 1048576).
// NOTE(review): %5 is allocated `uninitialized` and its first half is never
// filled here, while the dispatch kernel in this module reads each output element
// before writing it - confirm the first half is meant to start zeroed.
// NOTE(review): the write-only binding spans the whole 1048576 bytes, which also
// covers the range written by the concurrent fill; the kernel's output memref is
// only 512x256xi32, so presumably the two writers do not overlap - confirm.
%5 = stream.resource.alloc uninitialized : !stream.resource<external>{%c1048576}
%6 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%4}, %5 as %arg1: !stream.resource<external>{%c1048576}) {
stream.cmd.concurrent {
stream.cmd.dispatch @_split_reduction_pass2_dispatch_0::@_split_reduction_pass2_dispatch_0_generic_512x256x128[%c512, %c256, %c1] {
ro %arg0[%c0 for %4] : !stream.resource<transient>{%4},
wo %arg1[%c0 for %c1048576] : !stream.resource<external>{%c1048576}
} attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>]}
stream.cmd.fill %c128_i32, %arg1[%c524288 for %c524288] : i32 -> !stream.resource<external>{%c1048576}
}
} => !stream.timepoint
%7 = stream.timepoint.await %6 => %5 : !stream.resource<external>{%c1048576}
// %8: first half (offset 0); %9: second half (offset 524288, filled with 128).
%8 = stream.resource.subview %7[%c0] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288}
%9 = stream.resource.subview %7[%c524288] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288}
// Export both halves as 512x256xi32 tensors and require them to be equal.
%10 = stream.tensor.export %8 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
%11 = stream.tensor.export %9 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
check.expect_eq(%10, %11) : tensor<512x256xi32>
return
}
}
// -----// IR Dump After RemoveSingleIterationLoop (iree-codegen-remove-single-iteration-loop) ('func.func' operation: @_split_reduction_pass2_dispatch_0_generic_512x256x128) //----- //
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#map = affine_map<()[s0] -> (s0 * 32)>
#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
// Public entry point carrying the `iree.abi.stub` attribute. It takes no
// arguments, returns nothing, and only forwards to the private
// @_split_reduction_pass2 function.
func.func @split_reduction_pass2() attributes {iree.abi.stub} {
call @_split_reduction_pass2() : () -> ()
return
}
// Executable with one exported entry point that add-reduces the innermost
// dimension of a 512x256x128 i32 buffer (binding 0) into a 512x256 i32 buffer
// (binding 1). State shown is after RemoveSingleIterationLoop: the reduction is
// still expressed with vector.multi_reduction ops.
hal.executable private @_split_reduction_pass2_dispatch_0 {
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
// The export region yields the constants (8, 16, 1).
// NOTE(review): presumably the x/y/z workgroup counts; 8 = 256/32 and
// 16 = 512/32 match the 32x32 tile taken per workgroup below — confirm.
hal.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 ordinal(0) layout(#pipeline_layout) attributes {translation_info = #translation} {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index):
%c1 = arith.constant 1 : index
%c16 = arith.constant 16 : index
%c8 = arith.constant 8 : index
hal.return %c8, %c16, %c1 : index, index, index
}
builtin.module {
func.func @_split_reduction_pass2_dispatch_0_generic_512x256x128() {
// %cst is the all-zero 4-lane value that seeds the innermost accumulation loop.
%cst = arith.constant dense<0> : vector<1x1x4xi32>
// %c0_i32 is only used as the padding operand of the vector.transfer_read ops.
%c0_i32 = arith.constant 0 : i32
%c4 = arith.constant 4 : index
%c1 = arith.constant 1 : index
%c32 = arith.constant 32 : index
%c0 = arith.constant 0 : index
// Binding 0: the 512x256x128 input. Binding 1: the 512x256 output.
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : memref<512x256x128xi32>
memref.assume_alignment %0, 64 : memref<512x256x128xi32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : memref<512x256xi32>
memref.assume_alignment %1, 64 : memref<512x256xi32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
// #map multiplies by 32: %2 = workgroup_id_y * 32 (row offset),
// %3 = workgroup_id_x * 32 (column offset).
%2 = affine.apply #map()[%workgroup_id_y]
%3 = affine.apply #map()[%workgroup_id_x]
// %4: this workgroup's 32x32 output tile; %5: the matching 32x32x128 input tile.
%4 = memref.subview %1[%2, %3] [32, 32] [1, 1] : memref<512x256xi32> to memref<32x32xi32, strided<[256, 1], offset: ?>>
%5 = memref.subview %0[%2, %3, 0] [32, 32, 128] [1, 1, 1] : memref<512x256x128xi32> to memref<32x32x128xi32, strided<[32768, 128, 1], offset: ?>>
// %arg0 walks the 32 tile rows one at a time; %arg1 walks the 32 tile columns
// in steps of 4.
scf.for %arg0 = %c0 to %c32 step %c1 {
scf.for %arg1 = %c0 to %c32 step %c4 {
%6 = memref.subview %5[%arg0, %arg1, 0] [1, 4, 128] [1, 1, 1] : memref<32x32x128xi32, strided<[32768, 128, 1], offset: ?>> to memref<1x4x128xi32, strided<[32768, 128, 1], offset: ?>>
%7 = memref.subview %4[%arg0, %arg1] [1, 4] [1, 1] : memref<32x32xi32, strided<[256, 1], offset: ?>> to memref<1x4xi32, strided<[256, 1], offset: ?>>
// %arg2 selects one of the 4 columns in the current 1x4 slice.
scf.for %arg2 = %c0 to %c4 step %c1 {
%8 = memref.subview %6[0, %arg2, 0] [1, 1, 128] [1, 1, 1] : memref<1x4x128xi32, strided<[32768, 128, 1], offset: ?>> to memref<1x1x128xi32, strided<[32768, 128, 1], offset: ?>>
// View the 128 reduction elements as 32 groups of 4.
%9 = memref.expand_shape %8 [[0], [1], [2, 3]] : memref<1x1x128xi32, strided<[32768, 128, 1], offset: ?>> into memref<1x1x32x4xi32, strided<[32768, 128, 4, 1], offset: ?>>
// Stage 1: accumulate the 32 groups lane-wise into a 4-lane partial sum,
// starting from zero (%cst).
%10 = scf.for %arg3 = %c0 to %c32 step %c1 iter_args(%arg4 = %cst) -> (vector<1x1x4xi32>) {
%13 = vector.transfer_read %9[%c0, %c0, %arg3, %c0], %c0_i32 {in_bounds = [true, true, true, true]} : memref<1x1x32x4xi32, strided<[32768, 128, 4, 1], offset: ?>>, vector<1x1x1x4xi32>
%14 = vector.multi_reduction <add>, %13, %arg4 [2] : vector<1x1x1x4xi32> to vector<1x1x4xi32>
scf.yield %14 : vector<1x1x4xi32>
}
// Stage 2: add the 4 lanes together, using the value currently stored in the
// output element (%11) as the accumulator, then store the result back.
// NOTE(review): the output is read before it is ever written here, and the
// host function in this module allocates that resource `uninitialized` and
// binds it `wo`; the stored result therefore appears to depend on the
// buffer's prior contents — confirm this is intended.
%11 = vector.transfer_read %7[%c0, %arg2], %c0_i32 {in_bounds = [true, true]} : memref<1x4xi32, strided<[256, 1], offset: ?>>, vector<1x1xi32>
%12 = vector.multi_reduction <add>, %10, %11 [2] : vector<1x1x4xi32> to vector<1x1xi32>
vector.transfer_write %12, %7[%c0, %arg2] {in_bounds = [true, true]} : vector<1x1xi32>, memref<1x4xi32, strided<[256, 1], offset: ?>>
}
}
}
return
}
}
}
}
// Host-side driver: fills an input resource with i32 1, dispatches the
// reduction kernel, and checks its output against a resource region filled
// with i32 128.
func.func private @_split_reduction_pass2() {
// Sizes: 1048576 = 2 * 524288, 524288 = 512*256*4, 67108864 = 512*256*128*4.
// NOTE(review): presumably byte sizes of i32 tensors of those shapes — the
// 524288-sized subviews below are exported as tensor<512x256xi32>.
%c1048576 = arith.constant 1048576 : index
%c0 = arith.constant 0 : index
%c524288 = arith.constant 524288 : index
%c67108864 = arith.constant 67108864 : index
%c1_i32 = arith.constant 1 : i32
%c128_i32 = arith.constant 128 : i32
%c512 = arith.constant 512 : index
%c256 = arith.constant 256 : index
%c1 = arith.constant 1 : index
// Input: transient resource filled entirely with the i32 value 1.
%0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c67108864}
%1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c67108864}) {
stream.cmd.fill %c1_i32, %arg0[%c0 for %c67108864] : i32 -> !stream.resource<transient>{%c67108864}
} => !stream.timepoint
%2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c67108864}
// The input is passed through util.do_not_optimize and its size is re-queried
// (%4) instead of reusing the constant.
%3 = util.do_not_optimize(%2) : !stream.resource<transient>
%4 = stream.resource.size %3 : !stream.resource<transient>
// One external resource holds both the kernel result (offset 0) and the
// expected value (offset 524288).
%5 = stream.resource.alloc uninitialized : !stream.resource<external>{%c1048576}
%6 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%4}, %5 as %arg1: !stream.resource<external>{%c1048576}) {
// The dispatch and the fill are grouped in one stream.cmd.concurrent region.
// NOTE(review): the dispatch declares `wo` over [0, 1048576), which overlaps
// the fill range [524288, 1048576); the kernel's binding(1) memref is
// 512x256xi32 at offset 0, so actual accesses look disjoint — confirm.
stream.cmd.concurrent {
stream.cmd.dispatch @_split_reduction_pass2_dispatch_0::@_split_reduction_pass2_dispatch_0_generic_512x256x128[%c512, %c256, %c1] {
ro %arg0[%c0 for %4] : !stream.resource<transient>{%4},
wo %arg1[%c0 for %c1048576] : !stream.resource<external>{%c1048576}
} attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>]}
stream.cmd.fill %c128_i32, %arg1[%c524288 for %c524288] : i32 -> !stream.resource<external>{%c1048576}
}
} => !stream.timepoint
%7 = stream.timepoint.await %6 => %5 : !stream.resource<external>{%c1048576}
// %8 / %10: region written by the dispatch; %9 / %11: region filled with 128.
%8 = stream.resource.subview %7[%c0] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288}
%9 = stream.resource.subview %7[%c524288] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288}
%10 = stream.tensor.export %8 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
%11 = stream.tensor.export %9 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
check.expect_eq(%10, %11) : tensor<512x256xi32>
return
}
}
// -----// IR Dump After LinalgStrategyLowerVectorsPass (iree-linalg-strategy-lower-vectors-pass) ('func.func' operation: @_split_reduction_pass2_dispatch_0_generic_512x256x128) //----- //
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#map = affine_map<()[s0] -> (s0 * 32)>
#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
// Public entry point carrying the `iree.abi.stub` attribute. It takes no
// arguments, returns nothing, and only forwards to the private
// @_split_reduction_pass2 function.
func.func @split_reduction_pass2() attributes {iree.abi.stub} {
call @_split_reduction_pass2() : () -> ()
return
}
// Executable with one exported entry point that add-reduces the innermost
// dimension of a 512x256x128 i32 buffer (binding 0) into a 512x256 i32 buffer
// (binding 1). State shown is after LinalgStrategyLowerVectorsPass: the
// reductions are expressed with vector.transpose / vector.extract /
// vector.reduction / vector.insertelement ops.
hal.executable private @_split_reduction_pass2_dispatch_0 {
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
// The export region yields the constants (8, 16, 1).
// NOTE(review): presumably the x/y/z workgroup counts; 8 = 256/32 and
// 16 = 512/32 match the 32x32 tile taken per workgroup below — confirm.
hal.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 ordinal(0) layout(#pipeline_layout) attributes {translation_info = #translation} {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index):
%c1 = arith.constant 1 : index
%c16 = arith.constant 16 : index
%c8 = arith.constant 8 : index
hal.return %c8, %c16, %c1 : index, index, index
}
builtin.module {
func.func @_split_reduction_pass2_dispatch_0_generic_512x256x128() {
// %cst / %cst_0: zero vectors used as the base for the insertelement chains.
// %cst_1: zero seed for the innermost accumulation loop.
%cst = arith.constant dense<0> : vector<4xi32>
%c2 = arith.constant 2 : index
%c3 = arith.constant 3 : index
%cst_0 = arith.constant dense<0> : vector<1xi32>
%cst_1 = arith.constant dense<0> : vector<1x1x4xi32>
// %c0_i32 is only used as the padding operand of the vector.transfer_read ops.
%c0_i32 = arith.constant 0 : i32
%c4 = arith.constant 4 : index
%c1 = arith.constant 1 : index
%c32 = arith.constant 32 : index
%c0 = arith.constant 0 : index
// Binding 0: the 512x256x128 input. Binding 1: the 512x256 output.
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : memref<512x256x128xi32>
memref.assume_alignment %0, 64 : memref<512x256x128xi32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : memref<512x256xi32>
memref.assume_alignment %1, 64 : memref<512x256xi32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
// #map multiplies by 32: %2 = workgroup_id_y * 32 (row offset),
// %3 = workgroup_id_x * 32 (column offset).
%2 = affine.apply #map()[%workgroup_id_y]
%3 = affine.apply #map()[%workgroup_id_x]
// %4: this workgroup's 32x32 output tile; %5: the matching 32x32x128 input tile.
%4 = memref.subview %1[%2, %3] [32, 32] [1, 1] : memref<512x256xi32> to memref<32x32xi32, strided<[256, 1], offset: ?>>
%5 = memref.subview %0[%2, %3, 0] [32, 32, 128] [1, 1, 1] : memref<512x256x128xi32> to memref<32x32x128xi32, strided<[32768, 128, 1], offset: ?>>
// %arg0 walks the 32 tile rows one at a time; %arg1 walks the 32 tile columns
// in steps of 4.
scf.for %arg0 = %c0 to %c32 step %c1 {
scf.for %arg1 = %c0 to %c32 step %c4 {
%6 = memref.subview %5[%arg0, %arg1, 0] [1, 4, 128] [1, 1, 1] : memref<32x32x128xi32, strided<[32768, 128, 1], offset: ?>> to memref<1x4x128xi32, strided<[32768, 128, 1], offset: ?>>
%7 = memref.subview %4[%arg0, %arg1] [1, 4] [1, 1] : memref<32x32xi32, strided<[256, 1], offset: ?>> to memref<1x4xi32, strided<[256, 1], offset: ?>>
// %arg2 selects one of the 4 columns in the current 1x4 slice.
scf.for %arg2 = %c0 to %c4 step %c1 {
%8 = memref.subview %6[0, %arg2, 0] [1, 1, 128] [1, 1, 1] : memref<1x4x128xi32, strided<[32768, 128, 1], offset: ?>> to memref<1x1x128xi32, strided<[32768, 128, 1], offset: ?>>
// View the 128 reduction elements as 32 groups of 4.
%9 = memref.expand_shape %8 [[0], [1], [2, 3]] : memref<1x1x128xi32, strided<[32768, 128, 1], offset: ?>> into memref<1x1x32x4xi32, strided<[32768, 128, 4, 1], offset: ?>>
// Stage 1: accumulate the 32 groups lane-wise into a 4-lane partial sum,
// starting from zero (%cst_1).
%10 = scf.for %arg3 = %c0 to %c32 step %c1 iter_args(%arg4 = %cst_1) -> (vector<1x1x4xi32>) {
%17 = vector.transfer_read %9[%c0, %c0, %arg3, %c0], %c0_i32 {in_bounds = [true, true, true, true]} : memref<1x1x32x4xi32, strided<[32768, 128, 4, 1], offset: ?>>, vector<1x1x1x4xi32>
// Move the unit reduction dimension innermost so each lane becomes a
// vector<1xi32> that can be reduced on its own.
%18 = vector.transpose %17, [0, 1, 3, 2] : vector<1x1x1x4xi32> to vector<1x1x4x1xi32>
// Lane 0: one-element reduction with the running accumulator lane.
%19 = vector.extract %18[0, 0, 0] : vector<1x1x4x1xi32>
%20 = vector.extract %arg4[0, 0, 0] : vector<1x1x4xi32>
%21 = vector.reduction <add>, %19, %20 : vector<1xi32> into i32
%22 = vector.insertelement %21, %cst[%c0 : index] : vector<4xi32>
// Lane 1.
%23 = vector.extract %18[0, 0, 1] : vector<1x1x4x1xi32>
%24 = vector.extract %arg4[0, 0, 1] : vector<1x1x4xi32>
%25 = vector.reduction <add>, %23, %24 : vector<1xi32> into i32
%26 = vector.insertelement %25, %22[%c1 : index] : vector<4xi32>
// Lane 2.
%27 = vector.extract %18[0, 0, 2] : vector<1x1x4x1xi32>
%28 = vector.extract %arg4[0, 0, 2] : vector<1x1x4xi32>
%29 = vector.reduction <add>, %27, %28 : vector<1xi32> into i32
%30 = vector.insertelement %29, %26[%c2 : index] : vector<4xi32>
// Lane 3.
%31 = vector.extract %18[0, 0, 3] : vector<1x1x4x1xi32>
%32 = vector.extract %arg4[0, 0, 3] : vector<1x1x4xi32>
%33 = vector.reduction <add>, %31, %32 : vector<1xi32> into i32
%34 = vector.insertelement %33, %30[%c3 : index] : vector<4xi32>
%35 = vector.shape_cast %34 : vector<4xi32> to vector<1x1x4xi32>
scf.yield %35 : vector<1x1x4xi32>
}
// Stage 2: add the 4 lanes together, using the value currently stored in the
// output element (%13) as the accumulator, then store the result back.
// NOTE(review): the output is read before it is ever written here, and the
// host function in this module allocates that resource `uninitialized` and
// binds it `wo`; the stored result therefore appears to depend on the
// buffer's prior contents — confirm this is intended.
%11 = vector.transfer_read %7[%c0, %arg2], %c0_i32 {in_bounds = [true, true]} : memref<1x4xi32, strided<[256, 1], offset: ?>>, vector<1x1xi32>
%12 = vector.extract %10[0, 0] : vector<1x1x4xi32>
%13 = vector.extract %11[0, 0] : vector<1x1xi32>
%14 = vector.reduction <add>, %12, %13 : vector<4xi32> into i32
%15 = vector.insertelement %14, %cst_0[%c0 : index] : vector<1xi32>
%16 = vector.shape_cast %15 : vector<1xi32> to vector<1x1xi32>
vector.transfer_write %16, %7[%c0, %arg2] {in_bounds = [true, true]} : vector<1x1xi32>, memref<1x4xi32, strided<[256, 1], offset: ?>>
}
}
}
return
}
}
}
}
// Host-side driver: fills an input resource with i32 1, dispatches the
// reduction kernel, and checks its output against a resource region filled
// with i32 128.
func.func private @_split_reduction_pass2() {
// Sizes: 1048576 = 2 * 524288, 524288 = 512*256*4, 67108864 = 512*256*128*4.
// NOTE(review): presumably byte sizes of i32 tensors of those shapes — the
// 524288-sized subviews below are exported as tensor<512x256xi32>.
%c1048576 = arith.constant 1048576 : index
%c0 = arith.constant 0 : index
%c524288 = arith.constant 524288 : index
%c67108864 = arith.constant 67108864 : index
%c1_i32 = arith.constant 1 : i32
%c128_i32 = arith.constant 128 : i32
%c512 = arith.constant 512 : index
%c256 = arith.constant 256 : index
%c1 = arith.constant 1 : index
// Input: transient resource filled entirely with the i32 value 1.
%0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c67108864}
%1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c67108864}) {
stream.cmd.fill %c1_i32, %arg0[%c0 for %c67108864] : i32 -> !stream.resource<transient>{%c67108864}
} => !stream.timepoint
%2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c67108864}
// The input is passed through util.do_not_optimize and its size is re-queried
// (%4) instead of reusing the constant.
%3 = util.do_not_optimize(%2) : !stream.resource<transient>
%4 = stream.resource.size %3 : !stream.resource<transient>
// One external resource holds both the kernel result (offset 0) and the
// expected value (offset 524288).
%5 = stream.resource.alloc uninitialized : !stream.resource<external>{%c1048576}
%6 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%4}, %5 as %arg1: !stream.resource<external>{%c1048576}) {
// The dispatch and the fill are grouped in one stream.cmd.concurrent region.
// NOTE(review): the dispatch declares `wo` over [0, 1048576), which overlaps
// the fill range [524288, 1048576); the kernel's binding(1) memref is
// 512x256xi32 at offset 0, so actual accesses look disjoint — confirm.
stream.cmd.concurrent {
stream.cmd.dispatch @_split_reduction_pass2_dispatch_0::@_split_reduction_pass2_dispatch_0_generic_512x256x128[%c512, %c256, %c1] {
ro %arg0[%c0 for %4] : !stream.resource<transient>{%4},
wo %arg1[%c0 for %c1048576] : !stream.resource<external>{%c1048576}
} attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>]}
stream.cmd.fill %c128_i32, %arg1[%c524288 for %c524288] : i32 -> !stream.resource<external>{%c1048576}
}
} => !stream.timepoint
%7 = stream.timepoint.await %6 => %5 : !stream.resource<external>{%c1048576}
// %8 / %10: region written by the dispatch; %9 / %11: region filled with 128.
%8 = stream.resource.subview %7[%c0] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288}
%9 = stream.resource.subview %7[%c524288] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288}
%10 = stream.tensor.export %8 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
%11 = stream.tensor.export %9 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
check.expect_eq(%10, %11) : tensor<512x256xi32>
return
}
}
// -----// IR Dump After LinalgVectorLowering (linalg-vector-lowering) ('func.func' operation: @_split_reduction_pass2_dispatch_0_generic_512x256x128) //----- //
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#map = affine_map<()[s0] -> (s0 * 32)>
#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
// Public entry point carrying the `iree.abi.stub` attribute. It takes no
// arguments, returns nothing, and only forwards to the private
// @_split_reduction_pass2 function.
func.func @split_reduction_pass2() attributes {iree.abi.stub} {
call @_split_reduction_pass2() : () -> ()
return
}
// Executable with one exported entry point that add-reduces the innermost
// dimension of a 512x256x128 i32 buffer (binding 0) into a 512x256 i32 buffer
// (binding 1). State shown is after LinalgVectorLowering: the reductions are
// expressed with vector.transpose / vector.extract / vector.reduction /
// vector.insertelement ops.
hal.executable private @_split_reduction_pass2_dispatch_0 {
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
// The export region yields the constants (8, 16, 1).
// NOTE(review): presumably the x/y/z workgroup counts; 8 = 256/32 and
// 16 = 512/32 match the 32x32 tile taken per workgroup below — confirm.
hal.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 ordinal(0) layout(#pipeline_layout) attributes {translation_info = #translation} {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index):
%c1 = arith.constant 1 : index
%c16 = arith.constant 16 : index
%c8 = arith.constant 8 : index
hal.return %c8, %c16, %c1 : index, index, index
}
builtin.module {
func.func @_split_reduction_pass2_dispatch_0_generic_512x256x128() {
// %cst / %cst_0: zero vectors used as the base for the insertelement chains.
// %cst_1: zero seed for the innermost accumulation loop.
%cst = arith.constant dense<0> : vector<4xi32>
%c2 = arith.constant 2 : index
%c3 = arith.constant 3 : index
%cst_0 = arith.constant dense<0> : vector<1xi32>
%cst_1 = arith.constant dense<0> : vector<1x1x4xi32>
// %c0_i32 is only used as the padding operand of the vector.transfer_read ops.
%c0_i32 = arith.constant 0 : i32
%c4 = arith.constant 4 : index
%c1 = arith.constant 1 : index
%c32 = arith.constant 32 : index
%c0 = arith.constant 0 : index
// Binding 0: the 512x256x128 input. Binding 1: the 512x256 output.
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : memref<512x256x128xi32>
memref.assume_alignment %0, 64 : memref<512x256x128xi32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : memref<512x256xi32>
memref.assume_alignment %1, 64 : memref<512x256xi32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
// #map multiplies by 32: %2 = workgroup_id_y * 32 (row offset),
// %3 = workgroup_id_x * 32 (column offset).
%2 = affine.apply #map()[%workgroup_id_y]
%3 = affine.apply #map()[%workgroup_id_x]
// %4: this workgroup's 32x32 output tile; %5: the matching 32x32x128 input tile.
%4 = memref.subview %1[%2, %3] [32, 32] [1, 1] : memref<512x256xi32> to memref<32x32xi32, strided<[256, 1], offset: ?>>
%5 = memref.subview %0[%2, %3, 0] [32, 32, 128] [1, 1, 1] : memref<512x256x128xi32> to memref<32x32x128xi32, strided<[32768, 128, 1], offset: ?>>
// %arg0 walks the 32 tile rows one at a time; %arg1 walks the 32 tile columns
// in steps of 4.
scf.for %arg0 = %c0 to %c32 step %c1 {
scf.for %arg1 = %c0 to %c32 step %c4 {
%6 = memref.subview %5[%arg0, %arg1, 0] [1, 4, 128] [1, 1, 1] : memref<32x32x128xi32, strided<[32768, 128, 1], offset: ?>> to memref<1x4x128xi32, strided<[32768, 128, 1], offset: ?>>
%7 = memref.subview %4[%arg0, %arg1] [1, 4] [1, 1] : memref<32x32xi32, strided<[256, 1], offset: ?>> to memref<1x4xi32, strided<[256, 1], offset: ?>>
// %arg2 selects one of the 4 columns in the current 1x4 slice.
scf.for %arg2 = %c0 to %c4 step %c1 {
%8 = memref.subview %6[0, %arg2, 0] [1, 1, 128] [1, 1, 1] : memref<1x4x128xi32, strided<[32768, 128, 1], offset: ?>> to memref<1x1x128xi32, strided<[32768, 128, 1], offset: ?>>
// View the 128 reduction elements as 32 groups of 4.
%9 = memref.expand_shape %8 [[0], [1], [2, 3]] : memref<1x1x128xi32, strided<[32768, 128, 1], offset: ?>> into memref<1x1x32x4xi32, strided<[32768, 128, 4, 1], offset: ?>>
// Stage 1: accumulate the 32 groups lane-wise into a 4-lane partial sum,
// starting from zero (%cst_1).
%10 = scf.for %arg3 = %c0 to %c32 step %c1 iter_args(%arg4 = %cst_1) -> (vector<1x1x4xi32>) {
%17 = vector.transfer_read %9[%c0, %c0, %arg3, %c0], %c0_i32 {in_bounds = [true, true, true, true]} : memref<1x1x32x4xi32, strided<[32768, 128, 4, 1], offset: ?>>, vector<1x1x1x4xi32>
// Move the unit reduction dimension innermost so each lane becomes a
// vector<1xi32> that can be reduced on its own.
%18 = vector.transpose %17, [0, 1, 3, 2] : vector<1x1x1x4xi32> to vector<1x1x4x1xi32>
// Lane 0: one-element reduction with the running accumulator lane.
%19 = vector.extract %18[0, 0, 0] : vector<1x1x4x1xi32>
%20 = vector.extract %arg4[0, 0, 0] : vector<1x1x4xi32>
%21 = vector.reduction <add>, %19, %20 : vector<1xi32> into i32
%22 = vector.insertelement %21, %cst[%c0 : index] : vector<4xi32>
// Lane 1.
%23 = vector.extract %18[0, 0, 1] : vector<1x1x4x1xi32>
%24 = vector.extract %arg4[0, 0, 1] : vector<1x1x4xi32>
%25 = vector.reduction <add>, %23, %24 : vector<1xi32> into i32
%26 = vector.insertelement %25, %22[%c1 : index] : vector<4xi32>
// Lane 2.
%27 = vector.extract %18[0, 0, 2] : vector<1x1x4x1xi32>
%28 = vector.extract %arg4[0, 0, 2] : vector<1x1x4xi32>
%29 = vector.reduction <add>, %27, %28 : vector<1xi32> into i32
%30 = vector.insertelement %29, %26[%c2 : index] : vector<4xi32>
// Lane 3.
%31 = vector.extract %18[0, 0, 3] : vector<1x1x4x1xi32>
%32 = vector.extract %arg4[0, 0, 3] : vector<1x1x4xi32>
%33 = vector.reduction <add>, %31, %32 : vector<1xi32> into i32
%34 = vector.insertelement %33, %30[%c3 : index] : vector<4xi32>
%35 = vector.shape_cast %34 : vector<4xi32> to vector<1x1x4xi32>
scf.yield %35 : vector<1x1x4xi32>
}
// Stage 2: add the 4 lanes together, using the value currently stored in the
// output element (%13) as the accumulator, then store the result back.
// NOTE(review): the output is read before it is ever written here, and the
// host function in this module allocates that resource `uninitialized` and
// binds it `wo`; the stored result therefore appears to depend on the
// buffer's prior contents — confirm this is intended.
%11 = vector.transfer_read %7[%c0, %arg2], %c0_i32 {in_bounds = [true, true]} : memref<1x4xi32, strided<[256, 1], offset: ?>>, vector<1x1xi32>
%12 = vector.extract %10[0, 0] : vector<1x1x4xi32>
%13 = vector.extract %11[0, 0] : vector<1x1xi32>
%14 = vector.reduction <add>, %12, %13 : vector<4xi32> into i32
%15 = vector.insertelement %14, %cst_0[%c0 : index] : vector<1xi32>
%16 = vector.shape_cast %15 : vector<1xi32> to vector<1x1xi32>
vector.transfer_write %16, %7[%c0, %arg2] {in_bounds = [true, true]} : vector<1x1xi32>, memref<1x4xi32, strided<[256, 1], offset: ?>>
}
}
}
return
}
}
}
}
// Host-side driver: fills an input resource with i32 1, dispatches the
// reduction kernel, and checks its output against a resource region filled
// with i32 128.
func.func private @_split_reduction_pass2() {
// Sizes: 1048576 = 2 * 524288, 524288 = 512*256*4, 67108864 = 512*256*128*4.
// NOTE(review): presumably byte sizes of i32 tensors of those shapes — the
// 524288-sized subviews below are exported as tensor<512x256xi32>.
%c1048576 = arith.constant 1048576 : index
%c0 = arith.constant 0 : index
%c524288 = arith.constant 524288 : index
%c67108864 = arith.constant 67108864 : index
%c1_i32 = arith.constant 1 : i32
%c128_i32 = arith.constant 128 : i32
%c512 = arith.constant 512 : index
%c256 = arith.constant 256 : index
%c1 = arith.constant 1 : index
// Input: transient resource filled entirely with the i32 value 1.
%0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c67108864}
%1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c67108864}) {
stream.cmd.fill %c1_i32, %arg0[%c0 for %c67108864] : i32 -> !stream.resource<transient>{%c67108864}
} => !stream.timepoint
%2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c67108864}
// The input is passed through util.do_not_optimize and its size is re-queried
// (%4) instead of reusing the constant.
%3 = util.do_not_optimize(%2) : !stream.resource<transient>
%4 = stream.resource.size %3 : !stream.resource<transient>
// One external resource holds both the kernel result (offset 0) and the
// expected value (offset 524288).
%5 = stream.resource.alloc uninitialized : !stream.resource<external>{%c1048576}
%6 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%4}, %5 as %arg1: !stream.resource<external>{%c1048576}) {
// The dispatch and the fill are grouped in one stream.cmd.concurrent region.
// NOTE(review): the dispatch declares `wo` over [0, 1048576), which overlaps
// the fill range [524288, 1048576); the kernel's binding(1) memref is
// 512x256xi32 at offset 0, so actual accesses look disjoint — confirm.
stream.cmd.concurrent {
stream.cmd.dispatch @_split_reduction_pass2_dispatch_0::@_split_reduction_pass2_dispatch_0_generic_512x256x128[%c512, %c256, %c1] {
ro %arg0[%c0 for %4] : !stream.resource<transient>{%4},
wo %arg1[%c0 for %c1048576] : !stream.resource<external>{%c1048576}
} attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>]}
stream.cmd.fill %c128_i32, %arg1[%c524288 for %c524288] : i32 -> !stream.resource<external>{%c1048576}
}
} => !stream.timepoint
%7 = stream.timepoint.await %6 => %5 : !stream.resource<external>{%c1048576}
// %8 / %10: region written by the dispatch; %9 / %11: region filled with 128.
%8 = stream.resource.subview %7[%c0] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288}
%9 = stream.resource.subview %7[%c524288] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288}
%10 = stream.tensor.export %8 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
%11 = stream.tensor.export %9 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
check.expect_eq(%10, %11) : tensor<512x256xi32>
return
}
}
// -----// IR Dump After Canonicalizer (canonicalize) ('func.func' operation: @_split_reduction_pass2_dispatch_0_generic_512x256x128) //----- //
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#map = affine_map<()[s0] -> (s0 * 32)>
#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
// Public entry point carrying the `iree.abi.stub` attribute. It takes no
// arguments, returns nothing, and only forwards to the private
// @_split_reduction_pass2 function.
func.func @split_reduction_pass2() attributes {iree.abi.stub} {
call @_split_reduction_pass2() : () -> ()
return
}
// Executable with one exported entry point that add-reduces the innermost
// dimension of a 512x256x128 i32 buffer (binding 0) into a 512x256 i32 buffer
// (binding 1). State shown is after Canonicalizer: the inner loop body uses
// scalar vector.extract + arith.addi per lane; the final 4-lane
// vector.reduction is still present.
hal.executable private @_split_reduction_pass2_dispatch_0 {
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
// The export region yields the constants (8, 16, 1).
// NOTE(review): presumably the x/y/z workgroup counts; 8 = 256/32 and
// 16 = 512/32 match the 32x32 tile taken per workgroup below — confirm.
hal.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 ordinal(0) layout(#pipeline_layout) attributes {translation_info = #translation} {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index):
%c1 = arith.constant 1 : index
%c16 = arith.constant 16 : index
%c8 = arith.constant 8 : index
hal.return %c8, %c16, %c1 : index, index, index
}
builtin.module {
func.func @_split_reduction_pass2_dispatch_0_generic_512x256x128() {
// %cst / %cst_0: zero vectors used as the base for the insertelement chains.
// %cst_1: zero seed for the innermost accumulation loop.
%cst = arith.constant dense<0> : vector<4xi32>
%c2 = arith.constant 2 : index
%c3 = arith.constant 3 : index
%cst_0 = arith.constant dense<0> : vector<1xi32>
%cst_1 = arith.constant dense<0> : vector<1x1x4xi32>
// %c0_i32 is only used as the padding operand of the vector.transfer_read ops.
%c0_i32 = arith.constant 0 : i32
%c4 = arith.constant 4 : index
%c1 = arith.constant 1 : index
%c32 = arith.constant 32 : index
%c0 = arith.constant 0 : index
// Binding 0: the 512x256x128 input. Binding 1: the 512x256 output.
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : memref<512x256x128xi32>
memref.assume_alignment %0, 64 : memref<512x256x128xi32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : memref<512x256xi32>
memref.assume_alignment %1, 64 : memref<512x256xi32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
// #map multiplies by 32: %2 = workgroup_id_y * 32 (row offset),
// %3 = workgroup_id_x * 32 (column offset).
%2 = affine.apply #map()[%workgroup_id_y]
%3 = affine.apply #map()[%workgroup_id_x]
// %4: this workgroup's 32x32 output tile; %5: the matching 32x32x128 input tile.
%4 = memref.subview %1[%2, %3] [32, 32] [1, 1] : memref<512x256xi32> to memref<32x32xi32, strided<[256, 1], offset: ?>>
%5 = memref.subview %0[%2, %3, 0] [32, 32, 128] [1, 1, 1] : memref<512x256x128xi32> to memref<32x32x128xi32, strided<[32768, 128, 1], offset: ?>>
// %arg0 walks the 32 tile rows one at a time; %arg1 walks the 32 tile columns
// in steps of 4.
scf.for %arg0 = %c0 to %c32 step %c1 {
scf.for %arg1 = %c0 to %c32 step %c4 {
%6 = memref.subview %5[%arg0, %arg1, 0] [1, 4, 128] [1, 1, 1] : memref<32x32x128xi32, strided<[32768, 128, 1], offset: ?>> to memref<1x4x128xi32, strided<[32768, 128, 1], offset: ?>>
%7 = memref.subview %4[%arg0, %arg1] [1, 4] [1, 1] : memref<32x32xi32, strided<[256, 1], offset: ?>> to memref<1x4xi32, strided<[256, 1], offset: ?>>
// %arg2 selects one of the 4 columns in the current 1x4 slice.
scf.for %arg2 = %c0 to %c4 step %c1 {
%8 = memref.subview %6[0, %arg2, 0] [1, 1, 128] [1, 1, 1] : memref<1x4x128xi32, strided<[32768, 128, 1], offset: ?>> to memref<1x1x128xi32, strided<[32768, 128, 1], offset: ?>>
// View the 128 reduction elements as 32 groups of 4.
%9 = memref.expand_shape %8 [[0], [1], [2, 3]] : memref<1x1x128xi32, strided<[32768, 128, 1], offset: ?>> into memref<1x1x32x4xi32, strided<[32768, 128, 4, 1], offset: ?>>
// Stage 1: accumulate the 32 groups lane-wise into a 4-lane partial sum,
// starting from zero (%cst_1).
%10 = scf.for %arg3 = %c0 to %c32 step %c1 iter_args(%arg4 = %cst_1) -> (vector<1x1x4xi32>) {
%17 = vector.transfer_read %9[%c0, %c0, %arg3, %c0], %c0_i32 {in_bounds = [true, true, true, true]} : memref<1x1x32x4xi32, strided<[32768, 128, 4, 1], offset: ?>>, vector<1x1x1x4xi32>
// Lane 0: scalar add of the loaded element and the accumulator lane.
%18 = vector.extract %arg4[0, 0, 0] : vector<1x1x4xi32>
%19 = vector.extract %17[0, 0, 0, 0] : vector<1x1x1x4xi32>
%20 = arith.addi %19, %18 : i32
%21 = vector.insertelement %20, %cst[%c0 : index] : vector<4xi32>
// Lane 1.
%22 = vector.extract %arg4[0, 0, 1] : vector<1x1x4xi32>
%23 = vector.extract %17[0, 0, 0, 1] : vector<1x1x1x4xi32>
%24 = arith.addi %23, %22 : i32
%25 = vector.insertelement %24, %21[%c1 : index] : vector<4xi32>
// Lane 2.
%26 = vector.extract %arg4[0, 0, 2] : vector<1x1x4xi32>
%27 = vector.extract %17[0, 0, 0, 2] : vector<1x1x1x4xi32>
%28 = arith.addi %27, %26 : i32
%29 = vector.insertelement %28, %25[%c2 : index] : vector<4xi32>
// Lane 3.
%30 = vector.extract %arg4[0, 0, 3] : vector<1x1x4xi32>
%31 = vector.extract %17[0, 0, 0, 3] : vector<1x1x1x4xi32>
%32 = arith.addi %31, %30 : i32
%33 = vector.insertelement %32, %29[%c3 : index] : vector<4xi32>
%34 = vector.shape_cast %33 : vector<4xi32> to vector<1x1x4xi32>
scf.yield %34 : vector<1x1x4xi32>
}
// Stage 2: add the 4 lanes together, using the value currently stored in the
// output element (%13) as the accumulator, then store the result back.
// NOTE(review): the output is read before it is ever written here, and the
// host function in this module allocates that resource `uninitialized` and
// binds it `wo`; the stored result therefore appears to depend on the
// buffer's prior contents — confirm this is intended.
%11 = vector.transfer_read %7[%c0, %arg2], %c0_i32 {in_bounds = [true, true]} : memref<1x4xi32, strided<[256, 1], offset: ?>>, vector<1x1xi32>
%12 = vector.extract %10[0, 0] : vector<1x1x4xi32>
%13 = vector.extract %11[0, 0] : vector<1x1xi32>
%14 = vector.reduction <add>, %12, %13 : vector<4xi32> into i32
%15 = vector.insertelement %14, %cst_0[%c0 : index] : vector<1xi32>
%16 = vector.shape_cast %15 : vector<1xi32> to vector<1x1xi32>
vector.transfer_write %16, %7[%c0, %arg2] {in_bounds = [true, true]} : vector<1x1xi32>, memref<1x4xi32, strided<[256, 1], offset: ?>>
}
}
}
return
}
}
}
}
// Host-side driver: fills an input resource with i32 1, dispatches the
// reduction kernel, and checks its output against a resource region filled
// with i32 128.
func.func private @_split_reduction_pass2() {
// Sizes: 1048576 = 2 * 524288, 524288 = 512*256*4, 67108864 = 512*256*128*4.
// NOTE(review): presumably byte sizes of i32 tensors of those shapes — the
// 524288-sized subviews below are exported as tensor<512x256xi32>.
%c1048576 = arith.constant 1048576 : index
%c0 = arith.constant 0 : index
%c524288 = arith.constant 524288 : index
%c67108864 = arith.constant 67108864 : index
%c1_i32 = arith.constant 1 : i32
%c128_i32 = arith.constant 128 : i32
%c512 = arith.constant 512 : index
%c256 = arith.constant 256 : index
%c1 = arith.constant 1 : index
// Input: transient resource filled entirely with the i32 value 1.
%0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c67108864}
%1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c67108864}) {
stream.cmd.fill %c1_i32, %arg0[%c0 for %c67108864] : i32 -> !stream.resource<transient>{%c67108864}
} => !stream.timepoint
%2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c67108864}
// The input is passed through util.do_not_optimize and its size is re-queried
// (%4) instead of reusing the constant.
%3 = util.do_not_optimize(%2) : !stream.resource<transient>
%4 = stream.resource.size %3 : !stream.resource<transient>
// One external resource holds both the kernel result (offset 0) and the
// expected value (offset 524288).
%5 = stream.resource.alloc uninitialized : !stream.resource<external>{%c1048576}
%6 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%4}, %5 as %arg1: !stream.resource<external>{%c1048576}) {
// The dispatch and the fill are grouped in one stream.cmd.concurrent region.
// NOTE(review): the dispatch declares `wo` over [0, 1048576), which overlaps
// the fill range [524288, 1048576); the kernel's binding(1) memref is
// 512x256xi32 at offset 0, so actual accesses look disjoint — confirm.
stream.cmd.concurrent {
stream.cmd.dispatch @_split_reduction_pass2_dispatch_0::@_split_reduction_pass2_dispatch_0_generic_512x256x128[%c512, %c256, %c1] {
ro %arg0[%c0 for %4] : !stream.resource<transient>{%4},
wo %arg1[%c0 for %c1048576] : !stream.resource<external>{%c1048576}
} attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>]}
stream.cmd.fill %c128_i32, %arg1[%c524288 for %c524288] : i32 -> !stream.resource<external>{%c1048576}
}
} => !stream.timepoint
%7 = stream.timepoint.await %6 => %5 : !stream.resource<external>{%c1048576}
// %8 / %10: region written by the dispatch; %9 / %11: region filled with 128.
%8 = stream.resource.subview %7[%c0] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288}
%9 = stream.resource.subview %7[%c524288] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288}
%10 = stream.tensor.export %8 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
%11 = stream.tensor.export %9 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
check.expect_eq(%10, %11) : tensor<512x256xi32>
return
}
}
// -----// IR Dump After LinalgStrategyLowerVectorsPass (iree-linalg-strategy-lower-vectors-pass) ('func.func' operation: @_split_reduction_pass2_dispatch_0_generic_512x256x128) //----- //
// Attribute aliases used by the module below.
// Device target "llvm-cpu" carrying one "embedded-elf-x86_64" executable target (native_vector_size = 16).
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
// The same executable target under its own alias; referenced by the hal.executable.variant.
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
// Scales a symbol by 32; applied to the workgroup ids to obtain tile offsets.
#map = affine_map<()[s0] -> (s0 * 32)>
// No push constants; set 0 has two storage buffers: binding 0 (ReadOnly) and binding 1 (no qualifier).
#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>
// Translation info attached to the export op.
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
// Public entry point (marked iree.abi.stub): only forwards to the private @_split_reduction_pass2.
func.func @split_reduction_pass2() attributes {iree.abi.stub} {
call @_split_reduction_pass2() : () -> ()
return
}
// Executable with a single variant for the embedded-elf-x86_64 target and a single exported kernel.
hal.executable private @_split_reduction_pass2_dispatch_0 {
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
// Export region: its block arguments are unused and it returns the constant triple (8, 16, 1).
// NOTE(review): presumably the workgroup count (x, y, z); 8 = 256/32 and 16 = 512/32 would match the 32x32 tiles below - confirm.
hal.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 ordinal(0) layout(#pipeline_layout) attributes {translation_info = #translation} {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index):
%c1 = arith.constant 1 : index
%c16 = arith.constant 16 : index
%c8 = arith.constant 8 : index
hal.return %c8, %c16, %c1 : index, index, index
}
builtin.module {
// Kernel: for every element of one 32x32 tile of the 512x256 buffer (binding 1), sums the 128 i32 values
// along the last axis of the matching tile of the 512x256x128 buffer (binding 0), adds that sum to the
// value currently stored in the binding-1 element, and stores the result back.
func.func @_split_reduction_pass2_dispatch_0_generic_512x256x128() {
// Zero vectors: %cst is the base for the per-lane inserts, %cst_0 the base for the final 1-element
// insert, %cst_1 the initial value of the inner loop's accumulator.
%cst = arith.constant dense<0> : vector<4xi32>
%c2 = arith.constant 2 : index
%c3 = arith.constant 3 : index
%cst_0 = arith.constant dense<0> : vector<1xi32>
%cst_1 = arith.constant dense<0> : vector<1x1x4xi32>
%c4 = arith.constant 4 : index
%c1 = arith.constant 1 : index
%c32 = arith.constant 32 : index
%c0 = arith.constant 0 : index
// %0: binding 0 viewed as memref<512x256x128xi32>; %1: binding 1 viewed as memref<512x256xi32>.
// Both start at offset 0 and carry an alignment of 64.
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : memref<512x256x128xi32>
memref.assume_alignment %0, 64 : memref<512x256x128xi32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : memref<512x256xi32>
memref.assume_alignment %1, 64 : memref<512x256xi32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
// Tile origin: %2 = workgroup_id_y * 32 indexes dim 0, %3 = workgroup_id_x * 32 indexes dim 1.
%2 = affine.apply #map()[%workgroup_id_y]
%3 = affine.apply #map()[%workgroup_id_x]
// %4: 32x32 tile of binding 1; %5: the corresponding 32x32x128 tile of binding 0.
%4 = memref.subview %1[%2, %3] [32, 32] [1, 1] : memref<512x256xi32> to memref<32x32xi32, strided<[256, 1], offset: ?>>
%5 = memref.subview %0[%2, %3, 0] [32, 32, 128] [1, 1, 1] : memref<512x256x128xi32> to memref<32x32x128xi32, strided<[32768, 128, 1], offset: ?>>
// %arg0 walks the 32 rows of the tile one at a time.
scf.for %arg0 = %c0 to %c32 step %c1 {
// %arg1 walks the 32 columns in groups of 4.
scf.for %arg1 = %c0 to %c32 step %c4 {
%6 = memref.subview %5[%arg0, %arg1, 0] [1, 4, 128] [1, 1, 1] : memref<32x32x128xi32, strided<[32768, 128, 1], offset: ?>> to memref<1x4x128xi32, strided<[32768, 128, 1], offset: ?>>
%7 = memref.subview %4[%arg0, %arg1] [1, 4] [1, 1] : memref<32x32xi32, strided<[256, 1], offset: ?>> to memref<1x4xi32, strided<[256, 1], offset: ?>>
// %arg2 selects one column of the current group of 4.
scf.for %arg2 = %c0 to %c4 step %c1 {
%8 = memref.subview %6[0, %arg2, 0] [1, 1, 128] [1, 1, 1] : memref<1x4x128xi32, strided<[32768, 128, 1], offset: ?>> to memref<1x1x128xi32, strided<[32768, 128, 1], offset: ?>>
// View the 128 reduction elements as 32 groups of 4 so each iteration can load one vector<4xi32>.
%9 = memref.expand_shape %8 [[0], [1], [2, 3]] : memref<1x1x128xi32, strided<[32768, 128, 1], offset: ?>> into memref<1x1x32x4xi32, strided<[32768, 128, 4, 1], offset: ?>>
// 32 iterations; the carried value %arg4 holds four partial sums and starts at zero (%cst_1).
%10 = scf.for %arg3 = %c0 to %c32 step %c1 iter_args(%arg4 = %cst_1) -> (vector<1x1x4xi32>) {
%16 = vector.load %9[%c0, %c0, %arg3, %c0] : memref<1x1x32x4xi32, strided<[32768, 128, 4, 1], offset: ?>>, vector<4xi32>
// Element-wise add of %16 and %arg4, written out lane by lane: extract both scalars, addi,
// insert into the result vector (the chain starts from the zero vector %cst).
%17 = vector.extract %arg4[0, 0, 0] : vector<1x1x4xi32>
%18 = vector.extract %16[0] : vector<4xi32>
%19 = arith.addi %18, %17 : i32
%20 = vector.insertelement %19, %cst[%c0 : index] : vector<4xi32>
%21 = vector.extract %arg4[0, 0, 1] : vector<1x1x4xi32>
%22 = vector.extract %16[1] : vector<4xi32>
%23 = arith.addi %22, %21 : i32
%24 = vector.insertelement %23, %20[%c1 : index] : vector<4xi32>
%25 = vector.extract %arg4[0, 0, 2] : vector<1x1x4xi32>
%26 = vector.extract %16[2] : vector<4xi32>
%27 = arith.addi %26, %25 : i32
%28 = vector.insertelement %27, %24[%c2 : index] : vector<4xi32>
%29 = vector.extract %arg4[0, 0, 3] : vector<1x1x4xi32>
%30 = vector.extract %16[3] : vector<4xi32>
%31 = arith.addi %30, %29 : i32
%32 = vector.insertelement %31, %28[%c3 : index] : vector<4xi32>
// Cast back to the vector<1x1x4xi32> type carried by iter_args.
%33 = vector.shape_cast %32 : vector<4xi32> to vector<1x1x4xi32>
scf.yield %33 : vector<1x1x4xi32>
}
// %11 is the value currently stored in the output element; it is passed to vector.reduction as the
// extra accumulator operand, so %13 = (sum of the 4 partial sums) + %11.
// NOTE(review): in @_split_reduction_pass2 below, the resource bound to binding 1 is allocated
// `uninitialized`, bound "wo", and only its bytes [524288, 1048576) are filled; nothing in this module
// writes the element before this load - confirm whether a zero-fill of the output was lost upstream.
%11 = memref.load %7[%c0, %arg2] : memref<1x4xi32, strided<[256, 1], offset: ?>>
%12 = vector.extract %10[0, 0] : vector<1x1x4xi32>
%13 = vector.reduction <add>, %12, %11 : vector<4xi32> into i32
// %13 is inserted into a 1-element vector and lane 0 is extracted again, so %15 carries the same value as %13.
%14 = vector.insertelement %13, %cst_0[%c0 : index] : vector<1xi32>
%15 = vector.extract %14[0] : vector<1xi32>
memref.store %15, %7[%c0, %arg2] : memref<1x4xi32, strided<[256, 1], offset: ?>>
}
}
}
return
}
}
}
}
// Test driver: builds the input, runs the kernel, writes the expected value next to the result and compares.
func.func private @_split_reduction_pass2() {
// Byte sizes: 67108864 = 512*256*128*4, 524288 = 512*256*4, 1048576 = 2 * 524288.
%c1048576 = arith.constant 1048576 : index
%c0 = arith.constant 0 : index
%c524288 = arith.constant 524288 : index
%c67108864 = arith.constant 67108864 : index
%c1_i32 = arith.constant 1 : i32
%c128_i32 = arith.constant 128 : i32
%c512 = arith.constant 512 : index
%c256 = arith.constant 256 : index
%c1 = arith.constant 1 : index
// Input: a transient resource filled with the i32 value 1 over its whole range.
%0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c67108864}
%1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c67108864}) {
stream.cmd.fill %c1_i32, %arg0[%c0 for %c67108864] : i32 -> !stream.resource<transient>{%c67108864}
} => !stream.timepoint
%2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c67108864}
// The filled resource is passed through util.do_not_optimize; its size is then queried (%4)
// instead of reusing the constant.
%3 = util.do_not_optimize(%2) : !stream.resource<transient>
%4 = stream.resource.size %3 : !stream.resource<transient>
// Output: one external resource of 1048576 bytes, allocated uninitialized.
%5 = stream.resource.alloc uninitialized : !stream.resource<external>{%c1048576}
%6 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%4}, %5 as %arg1: !stream.resource<external>{%c1048576}) {
// The dispatch and the fill are both issued inside one stream.cmd.concurrent region.
stream.cmd.concurrent {
// Workload [512, 256, 1]; the input is bound "ro" and the full range of the output resource is bound "wo".
stream.cmd.dispatch @_split_reduction_pass2_dispatch_0::@_split_reduction_pass2_dispatch_0_generic_512x256x128[%c512, %c256, %c1] {
ro %arg0[%c0 for %4] : !stream.resource<transient>{%4},
wo %arg1[%c0 for %c1048576] : !stream.resource<external>{%c1048576}
} attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>]}
// Expected value: i32 128 written to bytes [524288, 1048576) of the output resource.
stream.cmd.fill %c128_i32, %arg1[%c524288 for %c524288] : i32 -> !stream.resource<external>{%c1048576}
}
} => !stream.timepoint
%7 = stream.timepoint.await %6 => %5 : !stream.resource<external>{%c1048576}
// Split the resource into two 524288-byte subviews: %8 at offset 0 (where the kernel stores) and
// %9 at offset 524288 (where the fill wrote 128).
%8 = stream.resource.subview %7[%c0] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288}
%9 = stream.resource.subview %7[%c524288] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288}
// Export both halves as tensor<512x256xi32> and check that they are equal.
%10 = stream.tensor.export %8 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
%11 = stream.tensor.export %9 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
check.expect_eq(%10, %11) : tensor<512x256xi32>
return
}
}
// -----// IR Dump After LinalgVectorLowering (linalg-vector-lowering) ('func.func' operation: @_split_reduction_pass2_dispatch_0_generic_512x256x128) //----- //
// Attribute aliases used by the module below.
// Device target "llvm-cpu" carrying one "embedded-elf-x86_64" executable target (native_vector_size = 16).
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
// The same executable target under its own alias; referenced by the hal.executable.variant.
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
// Scales a symbol by 32; applied to the workgroup ids to obtain tile offsets.
#map = affine_map<()[s0] -> (s0 * 32)>
// No push constants; set 0 has two storage buffers: binding 0 (ReadOnly) and binding 1 (no qualifier).
#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>
// Translation info attached to the export op.
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
// Public entry point (marked iree.abi.stub): only forwards to the private @_split_reduction_pass2.
func.func @split_reduction_pass2() attributes {iree.abi.stub} {
call @_split_reduction_pass2() : () -> ()
return
}
// Executable with a single variant for the embedded-elf-x86_64 target and a single exported kernel.
hal.executable private @_split_reduction_pass2_dispatch_0 {
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
// Export region: its block arguments are unused and it returns the constant triple (8, 16, 1).
// NOTE(review): presumably the workgroup count (x, y, z); 8 = 256/32 and 16 = 512/32 would match the 32x32 tiles below - confirm.
hal.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 ordinal(0) layout(#pipeline_layout) attributes {translation_info = #translation} {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index):
%c1 = arith.constant 1 : index
%c16 = arith.constant 16 : index
%c8 = arith.constant 8 : index
hal.return %c8, %c16, %c1 : index, index, index
}
builtin.module {
// Kernel: for every element of one 32x32 tile of the 512x256 buffer (binding 1), sums the 128 i32 values
// along the last axis of the matching tile of the 512x256x128 buffer (binding 0), adds that sum to the
// value currently stored in the binding-1 element, and stores the result back.
func.func @_split_reduction_pass2_dispatch_0_generic_512x256x128() {
// Zero vectors: %cst is the base for the per-lane inserts, %cst_0 the base for the final 1-element
// insert, %cst_1 the initial value of the inner loop's accumulator.
%cst = arith.constant dense<0> : vector<4xi32>
%c2 = arith.constant 2 : index
%c3 = arith.constant 3 : index
%cst_0 = arith.constant dense<0> : vector<1xi32>
%cst_1 = arith.constant dense<0> : vector<1x1x4xi32>
%c4 = arith.constant 4 : index
%c1 = arith.constant 1 : index
%c32 = arith.constant 32 : index
%c0 = arith.constant 0 : index
// %0: binding 0 viewed as memref<512x256x128xi32>; %1: binding 1 viewed as memref<512x256xi32>.
// Both start at offset 0 and carry an alignment of 64.
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : memref<512x256x128xi32>
memref.assume_alignment %0, 64 : memref<512x256x128xi32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : memref<512x256xi32>
memref.assume_alignment %1, 64 : memref<512x256xi32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
// Tile origin: %2 = workgroup_id_y * 32 indexes dim 0, %3 = workgroup_id_x * 32 indexes dim 1.
%2 = affine.apply #map()[%workgroup_id_y]
%3 = affine.apply #map()[%workgroup_id_x]
// %4: 32x32 tile of binding 1; %5: the corresponding 32x32x128 tile of binding 0.
%4 = memref.subview %1[%2, %3] [32, 32] [1, 1] : memref<512x256xi32> to memref<32x32xi32, strided<[256, 1], offset: ?>>
%5 = memref.subview %0[%2, %3, 0] [32, 32, 128] [1, 1, 1] : memref<512x256x128xi32> to memref<32x32x128xi32, strided<[32768, 128, 1], offset: ?>>
// %arg0 walks the 32 rows of the tile one at a time.
scf.for %arg0 = %c0 to %c32 step %c1 {
// %arg1 walks the 32 columns in groups of 4.
scf.for %arg1 = %c0 to %c32 step %c4 {
%6 = memref.subview %5[%arg0, %arg1, 0] [1, 4, 128] [1, 1, 1] : memref<32x32x128xi32, strided<[32768, 128, 1], offset: ?>> to memref<1x4x128xi32, strided<[32768, 128, 1], offset: ?>>
%7 = memref.subview %4[%arg0, %arg1] [1, 4] [1, 1] : memref<32x32xi32, strided<[256, 1], offset: ?>> to memref<1x4xi32, strided<[256, 1], offset: ?>>
// %arg2 selects one column of the current group of 4.
scf.for %arg2 = %c0 to %c4 step %c1 {
%8 = memref.subview %6[0, %arg2, 0] [1, 1, 128] [1, 1, 1] : memref<1x4x128xi32, strided<[32768, 128, 1], offset: ?>> to memref<1x1x128xi32, strided<[32768, 128, 1], offset: ?>>
// View the 128 reduction elements as 32 groups of 4 so each iteration can load one vector<4xi32>.
%9 = memref.expand_shape %8 [[0], [1], [2, 3]] : memref<1x1x128xi32, strided<[32768, 128, 1], offset: ?>> into memref<1x1x32x4xi32, strided<[32768, 128, 4, 1], offset: ?>>
// 32 iterations; the carried value %arg4 holds four partial sums and starts at zero (%cst_1).
%10 = scf.for %arg3 = %c0 to %c32 step %c1 iter_args(%arg4 = %cst_1) -> (vector<1x1x4xi32>) {
%16 = vector.load %9[%c0, %c0, %arg3, %c0] : memref<1x1x32x4xi32, strided<[32768, 128, 4, 1], offset: ?>>, vector<4xi32>
// Element-wise add of %16 and %arg4, written out lane by lane: extract both scalars, addi,
// insert into the result vector (the chain starts from the zero vector %cst).
%17 = vector.extract %arg4[0, 0, 0] : vector<1x1x4xi32>
%18 = vector.extract %16[0] : vector<4xi32>
%19 = arith.addi %18, %17 : i32
%20 = vector.insertelement %19, %cst[%c0 : index] : vector<4xi32>
%21 = vector.extract %arg4[0, 0, 1] : vector<1x1x4xi32>
%22 = vector.extract %16[1] : vector<4xi32>
%23 = arith.addi %22, %21 : i32
%24 = vector.insertelement %23, %20[%c1 : index] : vector<4xi32>
%25 = vector.extract %arg4[0, 0, 2] : vector<1x1x4xi32>
%26 = vector.extract %16[2] : vector<4xi32>
%27 = arith.addi %26, %25 : i32
%28 = vector.insertelement %27, %24[%c2 : index] : vector<4xi32>
%29 = vector.extract %arg4[0, 0, 3] : vector<1x1x4xi32>
%30 = vector.extract %16[3] : vector<4xi32>
%31 = arith.addi %30, %29 : i32
%32 = vector.insertelement %31, %28[%c3 : index] : vector<4xi32>
// Cast back to the vector<1x1x4xi32> type carried by iter_args.
%33 = vector.shape_cast %32 : vector<4xi32> to vector<1x1x4xi32>
scf.yield %33 : vector<1x1x4xi32>
}
// %11 is the value currently stored in the output element; it is passed to vector.reduction as the
// extra accumulator operand, so %13 = (sum of the 4 partial sums) + %11.
// NOTE(review): in @_split_reduction_pass2 below, the resource bound to binding 1 is allocated
// `uninitialized`, bound "wo", and only its bytes [524288, 1048576) are filled; nothing in this module
// writes the element before this load - confirm whether a zero-fill of the output was lost upstream.
%11 = memref.load %7[%c0, %arg2] : memref<1x4xi32, strided<[256, 1], offset: ?>>
%12 = vector.extract %10[0, 0] : vector<1x1x4xi32>
%13 = vector.reduction <add>, %12, %11 : vector<4xi32> into i32
// %13 is inserted into a 1-element vector and lane 0 is extracted again, so %15 carries the same value as %13.
%14 = vector.insertelement %13, %cst_0[%c0 : index] : vector<1xi32>
%15 = vector.extract %14[0] : vector<1xi32>
memref.store %15, %7[%c0, %arg2] : memref<1x4xi32, strided<[256, 1], offset: ?>>
}
}
}
return
}
}
}
}
// Test driver: builds the input, runs the kernel, writes the expected value next to the result and compares.
func.func private @_split_reduction_pass2() {
// Byte sizes: 67108864 = 512*256*128*4, 524288 = 512*256*4, 1048576 = 2 * 524288.
%c1048576 = arith.constant 1048576 : index
%c0 = arith.constant 0 : index
%c524288 = arith.constant 524288 : index
%c67108864 = arith.constant 67108864 : index
%c1_i32 = arith.constant 1 : i32
%c128_i32 = arith.constant 128 : i32
%c512 = arith.constant 512 : index
%c256 = arith.constant 256 : index
%c1 = arith.constant 1 : index
// Input: a transient resource filled with the i32 value 1 over its whole range.
%0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c67108864}
%1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c67108864}) {
stream.cmd.fill %c1_i32, %arg0[%c0 for %c67108864] : i32 -> !stream.resource<transient>{%c67108864}
} => !stream.timepoint
%2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c67108864}
// The filled resource is passed through util.do_not_optimize; its size is then queried (%4)
// instead of reusing the constant.
%3 = util.do_not_optimize(%2) : !stream.resource<transient>
%4 = stream.resource.size %3 : !stream.resource<transient>
// Output: one external resource of 1048576 bytes, allocated uninitialized.
%5 = stream.resource.alloc uninitialized : !stream.resource<external>{%c1048576}
%6 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%4}, %5 as %arg1: !stream.resource<external>{%c1048576}) {
// The dispatch and the fill are both issued inside one stream.cmd.concurrent region.
stream.cmd.concurrent {
// Workload [512, 256, 1]; the input is bound "ro" and the full range of the output resource is bound "wo".
stream.cmd.dispatch @_split_reduction_pass2_dispatch_0::@_split_reduction_pass2_dispatch_0_generic_512x256x128[%c512, %c256, %c1] {
ro %arg0[%c0 for %4] : !stream.resource<transient>{%4},
wo %arg1[%c0 for %c1048576] : !stream.resource<external>{%c1048576}
} attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>]}
// Expected value: i32 128 written to bytes [524288, 1048576) of the output resource.
stream.cmd.fill %c128_i32, %arg1[%c524288 for %c524288] : i32 -> !stream.resource<external>{%c1048576}
}
} => !stream.timepoint
%7 = stream.timepoint.await %6 => %5 : !stream.resource<external>{%c1048576}
// Split the resource into two 524288-byte subviews: %8 at offset 0 (where the kernel stores) and
// %9 at offset 524288 (where the fill wrote 128).
%8 = stream.resource.subview %7[%c0] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288}
%9 = stream.resource.subview %7[%c524288] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288}
// Export both halves as tensor<512x256xi32> and check that they are equal.
%10 = stream.tensor.export %8 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
%11 = stream.tensor.export %9 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
check.expect_eq(%10, %11) : tensor<512x256xi32>
return
}
}
// -----// IR Dump After LinalgStrategyLowerVectorsPass (iree-linalg-strategy-lower-vectors-pass) ('func.func' operation: @_split_reduction_pass2_dispatch_0_generic_512x256x128) //----- //
// Attribute aliases used by the module below.
// Device target "llvm-cpu" carrying one "embedded-elf-x86_64" executable target (native_vector_size = 16).
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
// The same executable target under its own alias; referenced by the hal.executable.variant.
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
// Scales a symbol by 32; applied to the workgroup ids to obtain tile offsets.
#map = affine_map<()[s0] -> (s0 * 32)>
// No push constants; set 0 has two storage buffers: binding 0 (ReadOnly) and binding 1 (no qualifier).
#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>
// Translation info attached to the export op.
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
// Public entry point (marked iree.abi.stub): only forwards to the private @_split_reduction_pass2.
func.func @split_reduction_pass2() attributes {iree.abi.stub} {
call @_split_reduction_pass2() : () -> ()
return
}
// Executable with a single variant for the embedded-elf-x86_64 target and a single exported kernel.
hal.executable private @_split_reduction_pass2_dispatch_0 {
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
// Export region: its block arguments are unused and it returns the constant triple (8, 16, 1).
// NOTE(review): presumably the workgroup count (x, y, z); 8 = 256/32 and 16 = 512/32 would match the 32x32 tiles below - confirm.
hal.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 ordinal(0) layout(#pipeline_layout) attributes {translation_info = #translation} {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index):
%c1 = arith.constant 1 : index
%c16 = arith.constant 16 : index
%c8 = arith.constant 8 : index
hal.return %c8, %c16, %c1 : index, index, index
}
builtin.module {
// Kernel: for every element of one 32x32 tile of the 512x256 buffer (binding 1), sums the 128 i32 values
// along the last axis of the matching tile of the 512x256x128 buffer (binding 0), adds that sum to the
// value currently stored in the binding-1 element, and stores the result back.
func.func @_split_reduction_pass2_dispatch_0_generic_512x256x128() {
// Zero vectors: %cst is the base for the per-lane inserts, %cst_0 the base for the final 1-element
// insert, %cst_1 both the initial accumulator of the inner loop and the base of the re-shaping insert chain.
%cst = arith.constant dense<0> : vector<4xi32>
%c2 = arith.constant 2 : index
%c3 = arith.constant 3 : index
%cst_0 = arith.constant dense<0> : vector<1xi32>
%cst_1 = arith.constant dense<0> : vector<1x1x4xi32>
%c4 = arith.constant 4 : index
%c1 = arith.constant 1 : index
%c32 = arith.constant 32 : index
%c0 = arith.constant 0 : index
// %0: binding 0 viewed as memref<512x256x128xi32>; %1: binding 1 viewed as memref<512x256xi32>.
// Both start at offset 0 and carry an alignment of 64.
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : memref<512x256x128xi32>
memref.assume_alignment %0, 64 : memref<512x256x128xi32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : memref<512x256xi32>
memref.assume_alignment %1, 64 : memref<512x256xi32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
// Tile origin: %2 = workgroup_id_y * 32 indexes dim 0, %3 = workgroup_id_x * 32 indexes dim 1.
%2 = affine.apply #map()[%workgroup_id_y]
%3 = affine.apply #map()[%workgroup_id_x]
// %4: 32x32 tile of binding 1; %5: the corresponding 32x32x128 tile of binding 0.
%4 = memref.subview %1[%2, %3] [32, 32] [1, 1] : memref<512x256xi32> to memref<32x32xi32, strided<[256, 1], offset: ?>>
%5 = memref.subview %0[%2, %3, 0] [32, 32, 128] [1, 1, 1] : memref<512x256x128xi32> to memref<32x32x128xi32, strided<[32768, 128, 1], offset: ?>>
// %arg0 walks the 32 rows of the tile one at a time.
scf.for %arg0 = %c0 to %c32 step %c1 {
// %arg1 walks the 32 columns in groups of 4.
scf.for %arg1 = %c0 to %c32 step %c4 {
%6 = memref.subview %5[%arg0, %arg1, 0] [1, 4, 128] [1, 1, 1] : memref<32x32x128xi32, strided<[32768, 128, 1], offset: ?>> to memref<1x4x128xi32, strided<[32768, 128, 1], offset: ?>>
%7 = memref.subview %4[%arg0, %arg1] [1, 4] [1, 1] : memref<32x32xi32, strided<[256, 1], offset: ?>> to memref<1x4xi32, strided<[256, 1], offset: ?>>
// %arg2 selects one column of the current group of 4.
scf.for %arg2 = %c0 to %c4 step %c1 {
%8 = memref.subview %6[0, %arg2, 0] [1, 1, 128] [1, 1, 1] : memref<1x4x128xi32, strided<[32768, 128, 1], offset: ?>> to memref<1x1x128xi32, strided<[32768, 128, 1], offset: ?>>
// View the 128 reduction elements as 32 groups of 4 so each iteration can load one vector<4xi32>.
%9 = memref.expand_shape %8 [[0], [1], [2, 3]] : memref<1x1x128xi32, strided<[32768, 128, 1], offset: ?>> into memref<1x1x32x4xi32, strided<[32768, 128, 4, 1], offset: ?>>
// 32 iterations; the carried value %arg4 holds four partial sums and starts at zero (%cst_1).
%10 = scf.for %arg3 = %c0 to %c32 step %c1 iter_args(%arg4 = %cst_1) -> (vector<1x1x4xi32>) {
%16 = vector.load %9[%c0, %c0, %arg3, %c0] : memref<1x1x32x4xi32, strided<[32768, 128, 4, 1], offset: ?>>, vector<4xi32>
// Element-wise add of %16 and %arg4, written out lane by lane: extract both scalars, addi,
// insert into the result vector (the chain starts from the zero vector %cst).
%17 = vector.extract %arg4[0, 0, 0] : vector<1x1x4xi32>
%18 = vector.extract %16[0] : vector<4xi32>
%19 = arith.addi %18, %17 : i32
%20 = vector.insertelement %19, %cst[%c0 : index] : vector<4xi32>
%21 = vector.extract %arg4[0, 0, 1] : vector<1x1x4xi32>
%22 = vector.extract %16[1] : vector<4xi32>
%23 = arith.addi %22, %21 : i32
%24 = vector.insertelement %23, %20[%c1 : index] : vector<4xi32>
%25 = vector.extract %arg4[0, 0, 2] : vector<1x1x4xi32>
%26 = vector.extract %16[2] : vector<4xi32>
%27 = arith.addi %26, %25 : i32
%28 = vector.insertelement %27, %24[%c2 : index] : vector<4xi32>
%29 = vector.extract %arg4[0, 0, 3] : vector<1x1x4xi32>
%30 = vector.extract %16[3] : vector<4xi32>
%31 = arith.addi %30, %29 : i32
%32 = vector.insertelement %31, %28[%c3 : index] : vector<4xi32>
// Rebuild %32 as a vector<1x1x4xi32> lane by lane (extract from %32, insert into a chain that
// starts at the zero vector %cst_1) so the yielded value has the type carried by iter_args.
%33 = vector.extract %32[0] : vector<4xi32>
%34 = vector.insert %33, %cst_1 [0, 0, 0] : i32 into vector<1x1x4xi32>
%35 = vector.extract %32[1] : vector<4xi32>
%36 = vector.insert %35, %34 [0, 0, 1] : i32 into vector<1x1x4xi32>
%37 = vector.extract %32[2] : vector<4xi32>
%38 = vector.insert %37, %36 [0, 0, 2] : i32 into vector<1x1x4xi32>
%39 = vector.extract %32[3] : vector<4xi32>
%40 = vector.insert %39, %38 [0, 0, 3] : i32 into vector<1x1x4xi32>
scf.yield %40 : vector<1x1x4xi32>
}
// %11 is the value currently stored in the output element; it is passed to vector.reduction as the
// extra accumulator operand, so %13 = (sum of the 4 partial sums) + %11.
// NOTE(review): in @_split_reduction_pass2 below, the resource bound to binding 1 is allocated
// `uninitialized`, bound "wo", and only its bytes [524288, 1048576) are filled; nothing in this module
// writes the element before this load - confirm whether a zero-fill of the output was lost upstream.
%11 = memref.load %7[%c0, %arg2] : memref<1x4xi32, strided<[256, 1], offset: ?>>
%12 = vector.extract %10[0, 0] : vector<1x1x4xi32>
%13 = vector.reduction <add>, %12, %11 : vector<4xi32> into i32
// %13 is inserted into a 1-element vector and lane 0 is extracted again, so %15 carries the same value as %13.
%14 = vector.insertelement %13, %cst_0[%c0 : index] : vector<1xi32>
%15 = vector.extract %14[0] : vector<1xi32>
memref.store %15, %7[%c0, %arg2] : memref<1x4xi32, strided<[256, 1], offset: ?>>
}
}
}
return
}
}
}
}
// Test driver: builds the input, runs the kernel, writes the expected value next to the result and compares.
func.func private @_split_reduction_pass2() {
// Byte sizes: 67108864 = 512*256*128*4, 524288 = 512*256*4, 1048576 = 2 * 524288.
%c1048576 = arith.constant 1048576 : index
%c0 = arith.constant 0 : index
%c524288 = arith.constant 524288 : index
%c67108864 = arith.constant 67108864 : index
%c1_i32 = arith.constant 1 : i32
%c128_i32 = arith.constant 128 : i32
%c512 = arith.constant 512 : index
%c256 = arith.constant 256 : index
%c1 = arith.constant 1 : index
// Input: a transient resource filled with the i32 value 1 over its whole range.
%0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c67108864}
%1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c67108864}) {
stream.cmd.fill %c1_i32, %arg0[%c0 for %c67108864] : i32 -> !stream.resource<transient>{%c67108864}
} => !stream.timepoint
%2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c67108864}
// The filled resource is passed through util.do_not_optimize; its size is then queried (%4)
// instead of reusing the constant.
%3 = util.do_not_optimize(%2) : !stream.resource<transient>
%4 = stream.resource.size %3 : !stream.resource<transient>
// Output: one external resource of 1048576 bytes, allocated uninitialized.
%5 = stream.resource.alloc uninitialized : !stream.resource<external>{%c1048576}
%6 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%4}, %5 as %arg1: !stream.resource<external>{%c1048576}) {
// The dispatch and the fill are both issued inside one stream.cmd.concurrent region.
stream.cmd.concurrent {
// Workload [512, 256, 1]; the input is bound "ro" and the full range of the output resource is bound "wo".
stream.cmd.dispatch @_split_reduction_pass2_dispatch_0::@_split_reduction_pass2_dispatch_0_generic_512x256x128[%c512, %c256, %c1] {
ro %arg0[%c0 for %4] : !stream.resource<transient>{%4},
wo %arg1[%c0 for %c1048576] : !stream.resource<external>{%c1048576}
} attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>]}
// Expected value: i32 128 written to bytes [524288, 1048576) of the output resource.
stream.cmd.fill %c128_i32, %arg1[%c524288 for %c524288] : i32 -> !stream.resource<external>{%c1048576}
}
} => !stream.timepoint
%7 = stream.timepoint.await %6 => %5 : !stream.resource<external>{%c1048576}
// Split the resource into two 524288-byte subviews: %8 at offset 0 (where the kernel stores) and
// %9 at offset 524288 (where the fill wrote 128).
%8 = stream.resource.subview %7[%c0] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288}
%9 = stream.resource.subview %7[%c524288] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288}
// Export both halves as tensor<512x256xi32> and check that they are equal.
%10 = stream.tensor.export %8 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
%11 = stream.tensor.export %9 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
check.expect_eq(%10, %11) : tensor<512x256xi32>
return
}
}
// -----// IR Dump After LinalgVectorLowering (linalg-vector-lowering) ('func.func' operation: @_split_reduction_pass2_dispatch_0_generic_512x256x128) //----- //
// Attribute aliases used by the module below.
// Device target "llvm-cpu" carrying one "embedded-elf-x86_64" executable target (native_vector_size = 16).
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
// The same executable target under its own alias; referenced by the hal.executable.variant.
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
// Scales a symbol by 32; applied to the workgroup ids to obtain tile offsets.
#map = affine_map<()[s0] -> (s0 * 32)>
// No push constants; set 0 has two storage buffers: binding 0 (ReadOnly) and binding 1 (no qualifier).
#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>
// Translation info attached to the export op.
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
// Public entry point (marked iree.abi.stub): only forwards to the private @_split_reduction_pass2.
func.func @split_reduction_pass2() attributes {iree.abi.stub} {
call @_split_reduction_pass2() : () -> ()
return
}
// Executable with a single variant for the embedded-elf-x86_64 target and a single exported kernel.
hal.executable private @_split_reduction_pass2_dispatch_0 {
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
// Export region: its block arguments are unused and it returns the constant triple (8, 16, 1).
// NOTE(review): presumably the workgroup count (x, y, z); 8 = 256/32 and 16 = 512/32 would match the 32x32 tiles below - confirm.
hal.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 ordinal(0) layout(#pipeline_layout) attributes {translation_info = #translation} {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index):
%c1 = arith.constant 1 : index
%c16 = arith.constant 16 : index
%c8 = arith.constant 8 : index
hal.return %c8, %c16, %c1 : index, index, index
}
builtin.module {
// Kernel: for every element of one 32x32 tile of the 512x256 buffer (binding 1), sums the 128 i32 values
// along the last axis of the matching tile of the 512x256x128 buffer (binding 0), adds that sum to the
// value currently stored in the binding-1 element, and stores the result back.
func.func @_split_reduction_pass2_dispatch_0_generic_512x256x128() {
// Zero vectors: %cst is the base for the per-lane inserts, %cst_0 the base for the final 1-element
// insert, %cst_1 both the initial accumulator of the inner loop and the base of the re-shaping insert chain.
%cst = arith.constant dense<0> : vector<4xi32>
%c2 = arith.constant 2 : index
%c3 = arith.constant 3 : index
%cst_0 = arith.constant dense<0> : vector<1xi32>
%cst_1 = arith.constant dense<0> : vector<1x1x4xi32>
%c4 = arith.constant 4 : index
%c1 = arith.constant 1 : index
%c32 = arith.constant 32 : index
%c0 = arith.constant 0 : index
// %0: binding 0 viewed as memref<512x256x128xi32>; %1: binding 1 viewed as memref<512x256xi32>.
// Both start at offset 0 and carry an alignment of 64.
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : memref<512x256x128xi32>
memref.assume_alignment %0, 64 : memref<512x256x128xi32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : memref<512x256xi32>
memref.assume_alignment %1, 64 : memref<512x256xi32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
// Tile origin: %2 = workgroup_id_y * 32 indexes dim 0, %3 = workgroup_id_x * 32 indexes dim 1.
%2 = affine.apply #map()[%workgroup_id_y]
%3 = affine.apply #map()[%workgroup_id_x]
// %4: 32x32 tile of binding 1; %5: the corresponding 32x32x128 tile of binding 0.
%4 = memref.subview %1[%2, %3] [32, 32] [1, 1] : memref<512x256xi32> to memref<32x32xi32, strided<[256, 1], offset: ?>>
%5 = memref.subview %0[%2, %3, 0] [32, 32, 128] [1, 1, 1] : memref<512x256x128xi32> to memref<32x32x128xi32, strided<[32768, 128, 1], offset: ?>>
// %arg0 walks the 32 rows of the tile one at a time.
scf.for %arg0 = %c0 to %c32 step %c1 {
// %arg1 walks the 32 columns in groups of 4.
scf.for %arg1 = %c0 to %c32 step %c4 {
%6 = memref.subview %5[%arg0, %arg1, 0] [1, 4, 128] [1, 1, 1] : memref<32x32x128xi32, strided<[32768, 128, 1], offset: ?>> to memref<1x4x128xi32, strided<[32768, 128, 1], offset: ?>>
%7 = memref.subview %4[%arg0, %arg1] [1, 4] [1, 1] : memref<32x32xi32, strided<[256, 1], offset: ?>> to memref<1x4xi32, strided<[256, 1], offset: ?>>
// %arg2 selects one column of the current group of 4.
scf.for %arg2 = %c0 to %c4 step %c1 {
%8 = memref.subview %6[0, %arg2, 0] [1, 1, 128] [1, 1, 1] : memref<1x4x128xi32, strided<[32768, 128, 1], offset: ?>> to memref<1x1x128xi32, strided<[32768, 128, 1], offset: ?>>
// View the 128 reduction elements as 32 groups of 4 so each iteration can load one vector<4xi32>.
%9 = memref.expand_shape %8 [[0], [1], [2, 3]] : memref<1x1x128xi32, strided<[32768, 128, 1], offset: ?>> into memref<1x1x32x4xi32, strided<[32768, 128, 4, 1], offset: ?>>
// 32 iterations; the carried value %arg4 holds four partial sums and starts at zero (%cst_1).
%10 = scf.for %arg3 = %c0 to %c32 step %c1 iter_args(%arg4 = %cst_1) -> (vector<1x1x4xi32>) {
%16 = vector.load %9[%c0, %c0, %arg3, %c0] : memref<1x1x32x4xi32, strided<[32768, 128, 4, 1], offset: ?>>, vector<4xi32>
// Element-wise add of %16 and %arg4, written out lane by lane: extract both scalars, addi,
// insert into the result vector (the chain starts from the zero vector %cst).
%17 = vector.extract %arg4[0, 0, 0] : vector<1x1x4xi32>
%18 = vector.extract %16[0] : vector<4xi32>
%19 = arith.addi %18, %17 : i32
%20 = vector.insertelement %19, %cst[%c0 : index] : vector<4xi32>
%21 = vector.extract %arg4[0, 0, 1] : vector<1x1x4xi32>
%22 = vector.extract %16[1] : vector<4xi32>
%23 = arith.addi %22, %21 : i32
%24 = vector.insertelement %23, %20[%c1 : index] : vector<4xi32>
%25 = vector.extract %arg4[0, 0, 2] : vector<1x1x4xi32>
%26 = vector.extract %16[2] : vector<4xi32>
%27 = arith.addi %26, %25 : i32
%28 = vector.insertelement %27, %24[%c2 : index] : vector<4xi32>
%29 = vector.extract %arg4[0, 0, 3] : vector<1x1x4xi32>
%30 = vector.extract %16[3] : vector<4xi32>
%31 = arith.addi %30, %29 : i32
%32 = vector.insertelement %31, %28[%c3 : index] : vector<4xi32>
// Rebuild %32 as a vector<1x1x4xi32> lane by lane (extract from %32, insert into a chain that
// starts at the zero vector %cst_1) so the yielded value has the type carried by iter_args.
%33 = vector.extract %32[0] : vector<4xi32>
%34 = vector.insert %33, %cst_1 [0, 0, 0] : i32 into vector<1x1x4xi32>
%35 = vector.extract %32[1] : vector<4xi32>
%36 = vector.insert %35, %34 [0, 0, 1] : i32 into vector<1x1x4xi32>
%37 = vector.extract %32[2] : vector<4xi32>
%38 = vector.insert %37, %36 [0, 0, 2] : i32 into vector<1x1x4xi32>
%39 = vector.extract %32[3] : vector<4xi32>
%40 = vector.insert %39, %38 [0, 0, 3] : i32 into vector<1x1x4xi32>
scf.yield %40 : vector<1x1x4xi32>
}
// %11 is the value currently stored in the output element; it is passed to vector.reduction as the
// extra accumulator operand, so %13 = (sum of the 4 partial sums) + %11.
// NOTE(review): in @_split_reduction_pass2 below, the resource bound to binding 1 is allocated
// `uninitialized`, bound "wo", and only its bytes [524288, 1048576) are filled; nothing in this module
// writes the element before this load - confirm whether a zero-fill of the output was lost upstream.
%11 = memref.load %7[%c0, %arg2] : memref<1x4xi32, strided<[256, 1], offset: ?>>
%12 = vector.extract %10[0, 0] : vector<1x1x4xi32>
%13 = vector.reduction <add>, %12, %11 : vector<4xi32> into i32
// %13 is inserted into a 1-element vector and lane 0 is extracted again, so %15 carries the same value as %13.
%14 = vector.insertelement %13, %cst_0[%c0 : index] : vector<1xi32>
%15 = vector.extract %14[0] : vector<1xi32>
memref.store %15, %7[%c0, %arg2] : memref<1x4xi32, strided<[256, 1], offset: ?>>
}
}
}
return
}
}
}
}
// Test driver: builds the input, runs the kernel, writes the expected value next to the result and compares.
func.func private @_split_reduction_pass2() {
// Byte sizes: 67108864 = 512*256*128*4, 524288 = 512*256*4, 1048576 = 2 * 524288.
%c1048576 = arith.constant 1048576 : index
%c0 = arith.constant 0 : index
%c524288 = arith.constant 524288 : index
%c67108864 = arith.constant 67108864 : index
%c1_i32 = arith.constant 1 : i32
%c128_i32 = arith.constant 128 : i32
%c512 = arith.constant 512 : index
%c256 = arith.constant 256 : index
%c1 = arith.constant 1 : index
// Input: a transient resource filled with the i32 value 1 over its whole range.
%0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c67108864}
%1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c67108864}) {
stream.cmd.fill %c1_i32, %arg0[%c0 for %c67108864] : i32 -> !stream.resource<transient>{%c67108864}
} => !stream.timepoint
%2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c67108864}
// The filled resource is passed through util.do_not_optimize; its size is then queried (%4)
// instead of reusing the constant.
%3 = util.do_not_optimize(%2) : !stream.resource<transient>
%4 = stream.resource.size %3 : !stream.resource<transient>
// Output: one external resource of 1048576 bytes, allocated uninitialized.
%5 = stream.resource.alloc uninitialized : !stream.resource<external>{%c1048576}
%6 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%4}, %5 as %arg1: !stream.resource<external>{%c1048576}) {
// The dispatch and the fill are both issued inside one stream.cmd.concurrent region.
stream.cmd.concurrent {
// Workload [512, 256, 1]; the input is bound "ro" and the full range of the output resource is bound "wo".
stream.cmd.dispatch @_split_reduction_pass2_dispatch_0::@_split_reduction_pass2_dispatch_0_generic_512x256x128[%c512, %c256, %c1] {
ro %arg0[%c0 for %4] : !stream.resource<transient>{%4},
wo %arg1[%c0 for %c1048576] : !stream.resource<external>{%c1048576}
} attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>]}
// Expected value: i32 128 written to bytes [524288, 1048576) of the output resource.
stream.cmd.fill %c128_i32, %arg1[%c524288 for %c524288] : i32 -> !stream.resource<external>{%c1048576}
}
} => !stream.timepoint
%7 = stream.timepoint.await %6 => %5 : !stream.resource<external>{%c1048576}
// Split the resource into two 524288-byte subviews: %8 at offset 0 (where the kernel stores) and
// %9 at offset 524288 (where the fill wrote 128).
%8 = stream.resource.subview %7[%c0] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288}
%9 = stream.resource.subview %7[%c524288] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288}
// Export both halves as tensor<512x256xi32> and check that they are equal.
%10 = stream.tensor.export %8 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
%11 = stream.tensor.export %9 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
check.expect_eq(%10, %11) : tensor<512x256xi32>
return
}
}
// -----// IR Dump After LLVMCPULowerExecutableTarget (iree-llvmcpu-lower-executable-target) ('hal.executable.variant' operation: @embedded_elf_x86_64) //----- //
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#map = affine_map<()[s0] -> (s0 * 32)>
#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
// Public entry point, tagged with the `iree.abi.stub` attribute. It takes no
// arguments, returns nothing, and only forwards to the private
// @_split_reduction_pass2 function.
func.func @split_reduction_pass2() attributes {iree.abi.stub} {
call @_split_reduction_pass2() : () -> ()
return
}
// Executable holding the single dispatch of this test. It has one variant,
// @embedded_elf_x86_64, and one exported function that sums a 512x256x128 i32
// input along its last dimension into a 512x256 i32 output.
hal.executable private @_split_reduction_pass2_dispatch_0 {
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
// Export ordinal 0. The region returns (8, 16, 1). With the 32x32 tiles used by
// the function below, that matches 256/32 = 8 tiles along workgroup id 0 and
// 512/32 = 16 tiles along workgroup id 1.
// NOTE(review): presumably these are the x/y/z workgroup counts -- confirm.
hal.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 ordinal(0) layout(#pipeline_layout) attributes {translation_info = #translation} {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index):
%c1 = arith.constant 1 : index
%c16 = arith.constant 16 : index
%c8 = arith.constant 8 : index
hal.return %c8, %c16, %c1 : index, index, index
}
builtin.module {
// Per-workgroup kernel: each workgroup handles one 32x32 tile of the output,
// still expressed with structured scf.for loops at this stage.
func.func @_split_reduction_pass2_dispatch_0_generic_512x256x128() {
%cst = arith.constant dense<0> : vector<4xi32>
%c2 = arith.constant 2 : index
%c3 = arith.constant 3 : index
%cst_0 = arith.constant dense<0> : vector<1xi32>
%cst_1 = arith.constant dense<0> : vector<1x1x4xi32>
%c4 = arith.constant 4 : index
%c1 = arith.constant 1 : index
%c32 = arith.constant 32 : index
%c0 = arith.constant 0 : index
// binding(0) is viewed as the 512x256x128 input, binding(1) as the 512x256
// output; both are asserted to be 64-byte aligned.
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : memref<512x256x128xi32>
memref.assume_alignment %0, 64 : memref<512x256x128xi32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : memref<512x256xi32>
memref.assume_alignment %1, 64 : memref<512x256xi32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
// Tile origin: #map multiplies the workgroup id by 32. id[1] selects the
// offset in dimension 0 (%2) and id[0] the offset in dimension 1 (%3).
%2 = affine.apply #map()[%workgroup_id_y]
%3 = affine.apply #map()[%workgroup_id_x]
// %4 is the 32x32 output tile; %5 is the matching 32x32x128 input tile.
%4 = memref.subview %1[%2, %3] [32, 32] [1, 1] : memref<512x256xi32> to memref<32x32xi32, strided<[256, 1], offset: ?>>
%5 = memref.subview %0[%2, %3, 0] [32, 32, 128] [1, 1, 1] : memref<512x256x128xi32> to memref<32x32x128xi32, strided<[32768, 128, 1], offset: ?>>
// %arg0 walks the 32 rows of the tile one at a time.
scf.for %arg0 = %c0 to %c32 step %c1 {
// %arg1 walks the 32 columns of the tile in groups of 4.
scf.for %arg1 = %c0 to %c32 step %c4 {
%6 = memref.subview %5[%arg0, %arg1, 0] [1, 4, 128] [1, 1, 1] : memref<32x32x128xi32, strided<[32768, 128, 1], offset: ?>> to memref<1x4x128xi32, strided<[32768, 128, 1], offset: ?>>
%7 = memref.subview %4[%arg0, %arg1] [1, 4] [1, 1] : memref<32x32xi32, strided<[256, 1], offset: ?>> to memref<1x4xi32, strided<[256, 1], offset: ?>>
// %arg2 selects one of the 4 columns in the group, i.e. one output element.
scf.for %arg2 = %c0 to %c4 step %c1 {
// The 128 values to reduce for this output element, reshaped as 32 chunks of 4.
%8 = memref.subview %6[0, %arg2, 0] [1, 1, 128] [1, 1, 1] : memref<1x4x128xi32, strided<[32768, 128, 1], offset: ?>> to memref<1x1x128xi32, strided<[32768, 128, 1], offset: ?>>
%9 = memref.expand_shape %8 [[0], [1], [2, 3]] : memref<1x1x128xi32, strided<[32768, 128, 1], offset: ?>> into memref<1x1x32x4xi32, strided<[32768, 128, 4, 1], offset: ?>>
// Reduction loop over the 32 chunks. %arg4 carries a 4-lane partial sum that
// starts from the all-zero constant %cst_1.
%10 = scf.for %arg3 = %c0 to %c32 step %c1 iter_args(%arg4 = %cst_1) -> (vector<1x1x4xi32>) {
// Load chunk %arg3 as a vector<4xi32>.
%16 = vector.load %9[%c0, %c0, %arg3, %c0] : memref<1x1x32x4xi32, strided<[32768, 128, 4, 1], offset: ?>>, vector<4xi32>
// Lane-by-lane add of the loaded chunk and the carried partial sum, unrolled
// into scalar extract / addi / insertelement; the result is collected in %32.
%17 = vector.extract %arg4[0, 0, 0] : vector<1x1x4xi32>
%18 = vector.extract %16[0] : vector<4xi32>
%19 = arith.addi %18, %17 : i32
%20 = vector.insertelement %19, %cst[%c0 : index] : vector<4xi32>
%21 = vector.extract %arg4[0, 0, 1] : vector<1x1x4xi32>
%22 = vector.extract %16[1] : vector<4xi32>
%23 = arith.addi %22, %21 : i32
%24 = vector.insertelement %23, %20[%c1 : index] : vector<4xi32>
%25 = vector.extract %arg4[0, 0, 2] : vector<1x1x4xi32>
%26 = vector.extract %16[2] : vector<4xi32>
%27 = arith.addi %26, %25 : i32
%28 = vector.insertelement %27, %24[%c2 : index] : vector<4xi32>
%29 = vector.extract %arg4[0, 0, 3] : vector<1x1x4xi32>
%30 = vector.extract %16[3] : vector<4xi32>
%31 = arith.addi %30, %29 : i32
%32 = vector.insertelement %31, %28[%c3 : index] : vector<4xi32>
// Repack the four lanes of %32 into the vector<1x1x4xi32> type carried by the loop.
%33 = vector.extract %32[0] : vector<4xi32>
%34 = vector.insert %33, %cst_1 [0, 0, 0] : i32 into vector<1x1x4xi32>
%35 = vector.extract %32[1] : vector<4xi32>
%36 = vector.insert %35, %34 [0, 0, 1] : i32 into vector<1x1x4xi32>
%37 = vector.extract %32[2] : vector<4xi32>
%38 = vector.insert %37, %36 [0, 0, 2] : i32 into vector<1x1x4xi32>
%39 = vector.extract %32[3] : vector<4xi32>
%40 = vector.insert %39, %38 [0, 0, 3] : i32 into vector<1x1x4xi32>
scf.yield %40 : vector<1x1x4xi32>
}
// Epilogue: add-reduce the four partial sums together with %11, the value
// currently stored in the output element, then write the total back.
// NOTE(review): because %11 is loaded from binding(1) before anything is stored
// there, the result depends on the buffer's contents on entry -- confirm the
// caller initializes the output.
%11 = memref.load %7[%c0, %arg2] : memref<1x4xi32, strided<[256, 1], offset: ?>>
%12 = vector.extract %10[0, 0] : vector<1x1x4xi32>
%13 = vector.reduction <add>, %12, %11 : vector<4xi32> into i32
%14 = vector.insertelement %13, %cst_0[%c0 : index] : vector<1xi32>
%15 = vector.extract %14[0] : vector<1xi32>
memref.store %15, %7[%c0, %arg2] : memref<1x4xi32, strided<[256, 1], offset: ?>>
}
}
}
return
}
}
}
}
// Host-side test body: builds the input, runs the reduction dispatch, and
// compares its result with a reference buffer filled with 128.
func.func private @_split_reduction_pass2() {
%c1048576 = arith.constant 1048576 : index
%c0 = arith.constant 0 : index
%c524288 = arith.constant 524288 : index
%c67108864 = arith.constant 67108864 : index
%c1_i32 = arith.constant 1 : i32
%c128_i32 = arith.constant 128 : i32
%c512 = arith.constant 512 : index
%c256 = arith.constant 256 : index
%c1 = arith.constant 1 : index
// Input: 67108864 bytes (512*256*128 i32 elements), every element set to 1.
%0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c67108864}
%1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c67108864}) {
stream.cmd.fill %c1_i32, %arg0[%c0 for %c67108864] : i32 -> !stream.resource<transient>{%c67108864}
} => !stream.timepoint
%2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c67108864}
// The input passes through util.do_not_optimize; its size is then queried
// from the result (%4) instead of reusing the constant %c67108864.
%3 = util.do_not_optimize(%2) : !stream.resource<transient>
%4 = stream.resource.size %3 : !stream.resource<transient>
// One 1048576-byte allocation holds two 524288-byte (512*256 i32) halves:
// [0, 524288) is later read back as the dispatch result and
// [524288, 1048576) as the reference.
%5 = stream.resource.alloc uninitialized : !stream.resource<external>{%c1048576}
%6 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%4}, %5 as %arg1: !stream.resource<external>{%c1048576}) {
stream.cmd.concurrent {
// Reduction dispatch with workload (512, 256, 1): input bound read-only,
// output bound write-only.
// NOTE(review): %arg1 is `wo` and %5 is allocated `uninitialized`, yet the
// dispatched function in this module loads from binding(1) before storing to
// it -- confirm the output is initialized somewhere. The `wo` range also
// covers all 1048576 bytes, overlapping the concurrent fill just below.
stream.cmd.dispatch @_split_reduction_pass2_dispatch_0::@_split_reduction_pass2_dispatch_0_generic_512x256x128[%c512, %c256, %c1] {
ro %arg0[%c0 for %4] : !stream.resource<transient>{%4},
wo %arg1[%c0 for %c1048576] : !stream.resource<external>{%c1048576}
} attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>]}
// Reference half: every i32 in [524288, 1048576) is set to 128, the sum of
// 128 ones.
stream.cmd.fill %c128_i32, %arg1[%c524288 for %c524288] : i32 -> !stream.resource<external>{%c1048576}
}
} => !stream.timepoint
%7 = stream.timepoint.await %6 => %5 : !stream.resource<external>{%c1048576}
// Split the allocation into its two halves and export each as tensor<512x256xi32>.
%8 = stream.resource.subview %7[%c0] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288}
%9 = stream.resource.subview %7[%c524288] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288}
%10 = stream.tensor.export %8 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
%11 = stream.tensor.export %9 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
// Compare the dispatch result (%10) with the reference (%11).
check.expect_eq(%10, %11) : tensor<512x256xi32>
return
}
}
// -----// IR Dump After SCFToControlFlow (convert-scf-to-cf) ('func.func' operation: @_split_reduction_pass2_dispatch_0_generic_512x256x128) //----- //
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#map = affine_map<()[s0] -> (s0 * 32)>
#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
// Public entry point, tagged with the `iree.abi.stub` attribute. It takes no
// arguments, returns nothing, and only forwards to the private
// @_split_reduction_pass2 function.
func.func @split_reduction_pass2() attributes {iree.abi.stub} {
call @_split_reduction_pass2() : () -> ()
return
}
// Executable holding the single dispatch of this test. It has one variant,
// @embedded_elf_x86_64, and one exported function that sums a 512x256x128 i32
// input along its last dimension into a 512x256 i32 output. In this dump the
// function body is an explicit control-flow graph of cf.br / cf.cond_br blocks.
hal.executable private @_split_reduction_pass2_dispatch_0 {
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
// Export ordinal 0. The region returns (8, 16, 1). With the 32x32 tiles used by
// the function below, that matches 256/32 = 8 tiles along workgroup id 0 and
// 512/32 = 16 tiles along workgroup id 1.
// NOTE(review): presumably these are the x/y/z workgroup counts -- confirm.
hal.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 ordinal(0) layout(#pipeline_layout) attributes {translation_info = #translation} {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index):
%c1 = arith.constant 1 : index
%c16 = arith.constant 16 : index
%c8 = arith.constant 8 : index
hal.return %c8, %c16, %c1 : index, index, index
}
builtin.module {
// Per-workgroup kernel: each workgroup handles one 32x32 tile of the output.
func.func @_split_reduction_pass2_dispatch_0_generic_512x256x128() {
%cst = arith.constant dense<0> : vector<4xi32>
%c2 = arith.constant 2 : index
%c3 = arith.constant 3 : index
%cst_0 = arith.constant dense<0> : vector<1xi32>
%cst_1 = arith.constant dense<0> : vector<1x1x4xi32>
%c4 = arith.constant 4 : index
%c1 = arith.constant 1 : index
%c32 = arith.constant 32 : index
%c0 = arith.constant 0 : index
// binding(0) is viewed as the 512x256x128 input, binding(1) as the 512x256
// output; both are asserted to be 64-byte aligned.
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : memref<512x256x128xi32>
memref.assume_alignment %0, 64 : memref<512x256x128xi32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : memref<512x256xi32>
memref.assume_alignment %1, 64 : memref<512x256xi32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
// Tile origin: #map multiplies the workgroup id by 32. id[1] selects the
// offset in dimension 0 (%2) and id[0] the offset in dimension 1 (%3).
%2 = affine.apply #map()[%workgroup_id_y]
%3 = affine.apply #map()[%workgroup_id_x]
// %4 is the 32x32 output tile; %5 is the matching 32x32x128 input tile.
%4 = memref.subview %1[%2, %3] [32, 32] [1, 1] : memref<512x256xi32> to memref<32x32xi32, strided<[256, 1], offset: ?>>
%5 = memref.subview %0[%2, %3, 0] [32, 32, 128] [1, 1, 1] : memref<512x256x128xi32> to memref<32x32x128xi32, strided<[32768, 128, 1], offset: ?>>
cf.br ^bb1(%c0 : index)
// ^bb1: header of the row loop; %6 runs from 0 while %6 < 32 (step 1, see ^bb11).
^bb1(%6: index): // 2 preds: ^bb0, ^bb11
%7 = arith.cmpi slt, %6, %c32 : index
cf.cond_br %7, ^bb2, ^bb12
// ^bb2: only forwards the initial column index 0 to ^bb3.
^bb2: // pred: ^bb1
cf.br ^bb3(%c0 : index)
// ^bb3: header of the column-group loop; %8 runs from 0 while %8 < 32 (step 4, see ^bb10).
^bb3(%8: index): // 2 preds: ^bb2, ^bb10
%9 = arith.cmpi slt, %8, %c32 : index
cf.cond_br %9, ^bb4, ^bb11
// ^bb4: take the 1x4x128 input slice and the 1x4 output slice for (%6, %8).
^bb4: // pred: ^bb3
%10 = memref.subview %5[%6, %8, 0] [1, 4, 128] [1, 1, 1] : memref<32x32x128xi32, strided<[32768, 128, 1], offset: ?>> to memref<1x4x128xi32, strided<[32768, 128, 1], offset: ?>>
%11 = memref.subview %4[%6, %8] [1, 4] [1, 1] : memref<32x32xi32, strided<[256, 1], offset: ?>> to memref<1x4xi32, strided<[256, 1], offset: ?>>
cf.br ^bb5(%c0 : index)
// ^bb5: header of the loop over the 4 columns of the group; %12 runs from 0 while %12 < 4.
^bb5(%12: index): // 2 preds: ^bb4, ^bb9
%13 = arith.cmpi slt, %12, %c4 : index
cf.cond_br %13, ^bb6, ^bb10
// ^bb6: view the 128 values for output element %12 as 32 chunks of 4 and enter
// the reduction loop with chunk index 0 and the all-zero partial sum %cst_1.
^bb6: // pred: ^bb5
%14 = memref.subview %10[0, %12, 0] [1, 1, 128] [1, 1, 1] : memref<1x4x128xi32, strided<[32768, 128, 1], offset: ?>> to memref<1x1x128xi32, strided<[32768, 128, 1], offset: ?>>
%15 = memref.expand_shape %14 [[0], [1], [2, 3]] : memref<1x1x128xi32, strided<[32768, 128, 1], offset: ?>> into memref<1x1x32x4xi32, strided<[32768, 128, 4, 1], offset: ?>>
cf.br ^bb7(%c0, %cst_1 : index, vector<1x1x4xi32>)
// ^bb7: header of the reduction loop; %16 is the chunk index (while %16 < 32)
// and %17 the carried 4-lane partial sum.
^bb7(%16: index, %17: vector<1x1x4xi32>): // 2 preds: ^bb6, ^bb8
%18 = arith.cmpi slt, %16, %c32 : index
cf.cond_br %18, ^bb8, ^bb9
// ^bb8: reduction loop body. Loads chunk %16 and adds it lane by lane to %17
// (unrolled scalar extract / addi / insertelement, result in %35), repacks the
// lanes into vector<1x1x4xi32> (%43), increments %16 and branches back to ^bb7.
^bb8: // pred: ^bb7
%19 = vector.load %15[%c0, %c0, %16, %c0] : memref<1x1x32x4xi32, strided<[32768, 128, 4, 1], offset: ?>>, vector<4xi32>
%20 = vector.extract %17[0, 0, 0] : vector<1x1x4xi32>
%21 = vector.extract %19[0] : vector<4xi32>
%22 = arith.addi %21, %20 : i32
%23 = vector.insertelement %22, %cst[%c0 : index] : vector<4xi32>
%24 = vector.extract %17[0, 0, 1] : vector<1x1x4xi32>
%25 = vector.extract %19[1] : vector<4xi32>
%26 = arith.addi %25, %24 : i32
%27 = vector.insertelement %26, %23[%c1 : index] : vector<4xi32>
%28 = vector.extract %17[0, 0, 2] : vector<1x1x4xi32>
%29 = vector.extract %19[2] : vector<4xi32>
%30 = arith.addi %29, %28 : i32
%31 = vector.insertelement %30, %27[%c2 : index] : vector<4xi32>
%32 = vector.extract %17[0, 0, 3] : vector<1x1x4xi32>
%33 = vector.extract %19[3] : vector<4xi32>
%34 = arith.addi %33, %32 : i32
%35 = vector.insertelement %34, %31[%c3 : index] : vector<4xi32>
%36 = vector.extract %35[0] : vector<4xi32>
%37 = vector.insert %36, %cst_1 [0, 0, 0] : i32 into vector<1x1x4xi32>
%38 = vector.extract %35[1] : vector<4xi32>
%39 = vector.insert %38, %37 [0, 0, 1] : i32 into vector<1x1x4xi32>
%40 = vector.extract %35[2] : vector<4xi32>
%41 = vector.insert %40, %39 [0, 0, 2] : i32 into vector<1x1x4xi32>
%42 = vector.extract %35[3] : vector<4xi32>
%43 = vector.insert %42, %41 [0, 0, 3] : i32 into vector<1x1x4xi32>
%44 = arith.addi %16, %c1 : index
cf.br ^bb7(%44, %43 : index, vector<1x1x4xi32>)
// ^bb9: reduction epilogue. Add-reduces the four partial sums together with
// %45, the value currently stored in the output element, writes the total
// back, then advances %12.
// NOTE(review): because %45 is loaded from binding(1) before anything is stored
// there, the result depends on the buffer's contents on entry -- confirm the
// caller initializes the output.
^bb9: // pred: ^bb7
%45 = memref.load %11[%c0, %12] : memref<1x4xi32, strided<[256, 1], offset: ?>>
%46 = vector.extract %17[0, 0] : vector<1x1x4xi32>
%47 = vector.reduction <add>, %46, %45 : vector<4xi32> into i32
%48 = vector.insertelement %47, %cst_0[%c0 : index] : vector<1xi32>
%49 = vector.extract %48[0] : vector<1xi32>
memref.store %49, %11[%c0, %12] : memref<1x4xi32, strided<[256, 1], offset: ?>>
%50 = arith.addi %12, %c1 : index
cf.br ^bb5(%50 : index)
// ^bb10: latch of the column-group loop (%8 += 4).
^bb10: // pred: ^bb5
%51 = arith.addi %8, %c4 : index
cf.br ^bb3(%51 : index)
// ^bb11: latch of the row loop (%6 += 1).
^bb11: // pred: ^bb3
%52 = arith.addi %6, %c1 : index
cf.br ^bb1(%52 : index)
// ^bb12: exit once all 32 rows are done.
^bb12: // pred: ^bb1
return
}
}
}
}
// Host-side test body: builds the input, runs the reduction dispatch, and
// compares its result with a reference buffer filled with 128.
func.func private @_split_reduction_pass2() {
%c1048576 = arith.constant 1048576 : index
%c0 = arith.constant 0 : index
%c524288 = arith.constant 524288 : index
%c67108864 = arith.constant 67108864 : index
%c1_i32 = arith.constant 1 : i32
%c128_i32 = arith.constant 128 : i32
%c512 = arith.constant 512 : index
%c256 = arith.constant 256 : index
%c1 = arith.constant 1 : index
// Input: 67108864 bytes (512*256*128 i32 elements), every element set to 1.
%0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c67108864}
%1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c67108864}) {
stream.cmd.fill %c1_i32, %arg0[%c0 for %c67108864] : i32 -> !stream.resource<transient>{%c67108864}
} => !stream.timepoint
%2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c67108864}
// The input passes through util.do_not_optimize; its size is then queried
// from the result (%4) instead of reusing the constant %c67108864.
%3 = util.do_not_optimize(%2) : !stream.resource<transient>
%4 = stream.resource.size %3 : !stream.resource<transient>
// One 1048576-byte allocation holds two 524288-byte (512*256 i32) halves:
// [0, 524288) is later read back as the dispatch result and
// [524288, 1048576) as the reference.
%5 = stream.resource.alloc uninitialized : !stream.resource<external>{%c1048576}
%6 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%4}, %5 as %arg1: !stream.resource<external>{%c1048576}) {
stream.cmd.concurrent {
// Reduction dispatch with workload (512, 256, 1): input bound read-only,
// output bound write-only.
// NOTE(review): %arg1 is `wo` and %5 is allocated `uninitialized`, yet the
// dispatched function in this module loads from binding(1) before storing to
// it -- confirm the output is initialized somewhere. The `wo` range also
// covers all 1048576 bytes, overlapping the concurrent fill just below.
stream.cmd.dispatch @_split_reduction_pass2_dispatch_0::@_split_reduction_pass2_dispatch_0_generic_512x256x128[%c512, %c256, %c1] {
ro %arg0[%c0 for %4] : !stream.resource<transient>{%4},
wo %arg1[%c0 for %c1048576] : !stream.resource<external>{%c1048576}
} attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>]}
// Reference half: every i32 in [524288, 1048576) is set to 128, the sum of
// 128 ones.
stream.cmd.fill %c128_i32, %arg1[%c524288 for %c524288] : i32 -> !stream.resource<external>{%c1048576}
}
} => !stream.timepoint
%7 = stream.timepoint.await %6 => %5 : !stream.resource<external>{%c1048576}
// Split the allocation into its two halves and export each as tensor<512x256xi32>.
%8 = stream.resource.subview %7[%c0] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288}
%9 = stream.resource.subview %7[%c524288] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288}
%10 = stream.tensor.export %8 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
%11 = stream.tensor.export %9 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
// Compare the dispatch result (%10) with the reference (%11).
check.expect_eq(%10, %11) : tensor<512x256xi32>
return
}
}
// -----// IR Dump After Canonicalizer (canonicalize) ('func.func' operation: @_split_reduction_pass2_dispatch_0_generic_512x256x128) //----- //
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#map = affine_map<()[s0] -> (s0 * 32)>
#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
// Public entry point, tagged with the `iree.abi.stub` attribute. It takes no
// arguments, returns nothing, and only forwards to the private
// @_split_reduction_pass2 function.
func.func @split_reduction_pass2() attributes {iree.abi.stub} {
call @_split_reduction_pass2() : () -> ()
return
}
// Executable holding the single dispatch of this test. It has one variant,
// @embedded_elf_x86_64, and one exported function that sums a 512x256x128 i32
// input along its last dimension into a 512x256 i32 output. The function body
// is a control-flow graph with blocks ^bb1..^bb11; the row-loop header branches
// straight into the column-group header, passing the initial index 0.
hal.executable private @_split_reduction_pass2_dispatch_0 {
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
// Export ordinal 0. The region returns (8, 16, 1). With the 32x32 tiles used by
// the function below, that matches 256/32 = 8 tiles along workgroup id 0 and
// 512/32 = 16 tiles along workgroup id 1.
// NOTE(review): presumably these are the x/y/z workgroup counts -- confirm.
hal.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 ordinal(0) layout(#pipeline_layout) attributes {translation_info = #translation} {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index):
%c1 = arith.constant 1 : index
%c16 = arith.constant 16 : index
%c8 = arith.constant 8 : index
hal.return %c8, %c16, %c1 : index, index, index
}
builtin.module {
// Per-workgroup kernel: each workgroup handles one 32x32 tile of the output.
func.func @_split_reduction_pass2_dispatch_0_generic_512x256x128() {
%cst = arith.constant dense<0> : vector<4xi32>
%c2 = arith.constant 2 : index
%c3 = arith.constant 3 : index
%cst_0 = arith.constant dense<0> : vector<1xi32>
%cst_1 = arith.constant dense<0> : vector<1x1x4xi32>
%c4 = arith.constant 4 : index
%c1 = arith.constant 1 : index
%c32 = arith.constant 32 : index
%c0 = arith.constant 0 : index
// binding(0) is viewed as the 512x256x128 input, binding(1) as the 512x256
// output; both are asserted to be 64-byte aligned.
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : memref<512x256x128xi32>
memref.assume_alignment %0, 64 : memref<512x256x128xi32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : memref<512x256xi32>
memref.assume_alignment %1, 64 : memref<512x256xi32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
// Tile origin: #map multiplies the workgroup id by 32. id[1] selects the
// offset in dimension 0 (%2) and id[0] the offset in dimension 1 (%3).
%2 = affine.apply #map()[%workgroup_id_y]
%3 = affine.apply #map()[%workgroup_id_x]
// %4 is the 32x32 output tile; %5 is the matching 32x32x128 input tile.
%4 = memref.subview %1[%2, %3] [32, 32] [1, 1] : memref<512x256xi32> to memref<32x32xi32, strided<[256, 1], offset: ?>>
%5 = memref.subview %0[%2, %3, 0] [32, 32, 128] [1, 1, 1] : memref<512x256x128xi32> to memref<32x32x128xi32, strided<[32768, 128, 1], offset: ?>>
cf.br ^bb1(%c0 : index)
// ^bb1: header of the row loop; %6 runs from 0 while %6 < 32 (step 1, see ^bb10).
// On entry to a row it passes column index 0 directly to ^bb2.
^bb1(%6: index): // 2 preds: ^bb0, ^bb10
%7 = arith.cmpi slt, %6, %c32 : index
cf.cond_br %7, ^bb2(%c0 : index), ^bb11
// ^bb2: header of the column-group loop; %8 runs while %8 < 32 (step 4, see ^bb9).
^bb2(%8: index): // 2 preds: ^bb1, ^bb9
%9 = arith.cmpi slt, %8, %c32 : index
cf.cond_br %9, ^bb3, ^bb10
// ^bb3: take the 1x4x128 input slice and the 1x4 output slice for (%6, %8).
^bb3: // pred: ^bb2
%10 = memref.subview %5[%6, %8, 0] [1, 4, 128] [1, 1, 1] : memref<32x32x128xi32, strided<[32768, 128, 1], offset: ?>> to memref<1x4x128xi32, strided<[32768, 128, 1], offset: ?>>
%11 = memref.subview %4[%6, %8] [1, 4] [1, 1] : memref<32x32xi32, strided<[256, 1], offset: ?>> to memref<1x4xi32, strided<[256, 1], offset: ?>>
cf.br ^bb4(%c0 : index)
// ^bb4: header of the loop over the 4 columns of the group; %12 runs from 0 while %12 < 4.
^bb4(%12: index): // 2 preds: ^bb3, ^bb8
%13 = arith.cmpi slt, %12, %c4 : index
cf.cond_br %13, ^bb5, ^bb9
// ^bb5: view the 128 values for output element %12 as 32 chunks of 4 and enter
// the reduction loop with chunk index 0 and the all-zero partial sum %cst_1.
^bb5: // pred: ^bb4
%14 = memref.subview %10[0, %12, 0] [1, 1, 128] [1, 1, 1] : memref<1x4x128xi32, strided<[32768, 128, 1], offset: ?>> to memref<1x1x128xi32, strided<[32768, 128, 1], offset: ?>>
%15 = memref.expand_shape %14 [[0], [1], [2, 3]] : memref<1x1x128xi32, strided<[32768, 128, 1], offset: ?>> into memref<1x1x32x4xi32, strided<[32768, 128, 4, 1], offset: ?>>
cf.br ^bb6(%c0, %cst_1 : index, vector<1x1x4xi32>)
// ^bb6: header of the reduction loop; %16 is the chunk index (while %16 < 32)
// and %17 the carried 4-lane partial sum.
^bb6(%16: index, %17: vector<1x1x4xi32>): // 2 preds: ^bb5, ^bb7
%18 = arith.cmpi slt, %16, %c32 : index
cf.cond_br %18, ^bb7, ^bb8
// ^bb7: reduction loop body. Loads chunk %16 and adds it lane by lane to %17
// (unrolled scalar extract / addi / insertelement, result in %35), repacks the
// lanes into vector<1x1x4xi32> (%43), increments %16 and branches back to ^bb6.
^bb7: // pred: ^bb6
%19 = vector.load %15[%c0, %c0, %16, %c0] : memref<1x1x32x4xi32, strided<[32768, 128, 4, 1], offset: ?>>, vector<4xi32>
%20 = vector.extract %17[0, 0, 0] : vector<1x1x4xi32>
%21 = vector.extract %19[0] : vector<4xi32>
%22 = arith.addi %21, %20 : i32
%23 = vector.insertelement %22, %cst[%c0 : index] : vector<4xi32>
%24 = vector.extract %17[0, 0, 1] : vector<1x1x4xi32>
%25 = vector.extract %19[1] : vector<4xi32>
%26 = arith.addi %25, %24 : i32
%27 = vector.insertelement %26, %23[%c1 : index] : vector<4xi32>
%28 = vector.extract %17[0, 0, 2] : vector<1x1x4xi32>
%29 = vector.extract %19[2] : vector<4xi32>
%30 = arith.addi %29, %28 : i32
%31 = vector.insertelement %30, %27[%c2 : index] : vector<4xi32>
%32 = vector.extract %17[0, 0, 3] : vector<1x1x4xi32>
%33 = vector.extract %19[3] : vector<4xi32>
%34 = arith.addi %33, %32 : i32
%35 = vector.insertelement %34, %31[%c3 : index] : vector<4xi32>
%36 = vector.extract %35[0] : vector<4xi32>
%37 = vector.insert %36, %cst_1 [0, 0, 0] : i32 into vector<1x1x4xi32>
%38 = vector.extract %35[1] : vector<4xi32>
%39 = vector.insert %38, %37 [0, 0, 1] : i32 into vector<1x1x4xi32>
%40 = vector.extract %35[2] : vector<4xi32>
%41 = vector.insert %40, %39 [0, 0, 2] : i32 into vector<1x1x4xi32>
%42 = vector.extract %35[3] : vector<4xi32>
%43 = vector.insert %42, %41 [0, 0, 3] : i32 into vector<1x1x4xi32>
%44 = arith.addi %16, %c1 : index
cf.br ^bb6(%44, %43 : index, vector<1x1x4xi32>)
// ^bb8: reduction epilogue. Add-reduces the four partial sums together with
// %45, the value currently stored in the output element, writes the total
// back, then advances %12.
// NOTE(review): because %45 is loaded from binding(1) before anything is stored
// there, the result depends on the buffer's contents on entry -- confirm the
// caller initializes the output.
^bb8: // pred: ^bb6
%45 = memref.load %11[%c0, %12] : memref<1x4xi32, strided<[256, 1], offset: ?>>
%46 = vector.extract %17[0, 0] : vector<1x1x4xi32>
%47 = vector.reduction <add>, %46, %45 : vector<4xi32> into i32
%48 = vector.insertelement %47, %cst_0[%c0 : index] : vector<1xi32>
%49 = vector.extract %48[0] : vector<1xi32>
memref.store %49, %11[%c0, %12] : memref<1x4xi32, strided<[256, 1], offset: ?>>
%50 = arith.addi %12, %c1 : index
cf.br ^bb4(%50 : index)
// ^bb9: latch of the column-group loop (%8 += 4).
^bb9: // pred: ^bb4
%51 = arith.addi %8, %c4 : index
cf.br ^bb2(%51 : index)
// ^bb10: latch of the row loop (%6 += 1).
^bb10: // pred: ^bb2
%52 = arith.addi %6, %c1 : index
cf.br ^bb1(%52 : index)
// ^bb11: exit once all 32 rows are done.
^bb11: // pred: ^bb1
return
}
}
}
}
// Host-side test body: builds the input, runs the reduction dispatch, and
// compares its result with a reference buffer filled with 128.
func.func private @_split_reduction_pass2() {
%c1048576 = arith.constant 1048576 : index
%c0 = arith.constant 0 : index
%c524288 = arith.constant 524288 : index
%c67108864 = arith.constant 67108864 : index
%c1_i32 = arith.constant 1 : i32
%c128_i32 = arith.constant 128 : i32
%c512 = arith.constant 512 : index
%c256 = arith.constant 256 : index
%c1 = arith.constant 1 : index
// Input: 67108864 bytes (512*256*128 i32 elements), every element set to 1.
%0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c67108864}
%1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c67108864}) {
stream.cmd.fill %c1_i32, %arg0[%c0 for %c67108864] : i32 -> !stream.resource<transient>{%c67108864}
} => !stream.timepoint
%2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c67108864}
// The input passes through util.do_not_optimize; its size is then queried
// from the result (%4) instead of reusing the constant %c67108864.
%3 = util.do_not_optimize(%2) : !stream.resource<transient>
%4 = stream.resource.size %3 : !stream.resource<transient>
// One 1048576-byte allocation holds two 524288-byte (512*256 i32) halves:
// [0, 524288) is later read back as the dispatch result and
// [524288, 1048576) as the reference.
%5 = stream.resource.alloc uninitialized : !stream.resource<external>{%c1048576}
%6 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%4}, %5 as %arg1: !stream.resource<external>{%c1048576}) {
stream.cmd.concurrent {
// Reduction dispatch with workload (512, 256, 1): input bound read-only,
// output bound write-only.
// NOTE(review): %arg1 is `wo` and %5 is allocated `uninitialized`, yet the
// dispatched function in this module loads from binding(1) before storing to
// it -- confirm the output is initialized somewhere. The `wo` range also
// covers all 1048576 bytes, overlapping the concurrent fill just below.
stream.cmd.dispatch @_split_reduction_pass2_dispatch_0::@_split_reduction_pass2_dispatch_0_generic_512x256x128[%c512, %c256, %c1] {
ro %arg0[%c0 for %4] : !stream.resource<transient>{%4},
wo %arg1[%c0 for %c1048576] : !stream.resource<external>{%c1048576}
} attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>]}
// Reference half: every i32 in [524288, 1048576) is set to 128, the sum of
// 128 ones.
stream.cmd.fill %c128_i32, %arg1[%c524288 for %c524288] : i32 -> !stream.resource<external>{%c1048576}
}
} => !stream.timepoint
%7 = stream.timepoint.await %6 => %5 : !stream.resource<external>{%c1048576}
// Split the allocation into its two halves and export each as tensor<512x256xi32>.
%8 = stream.resource.subview %7[%c0] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288}
%9 = stream.resource.subview %7[%c524288] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288}
%10 = stream.tensor.export %8 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
%11 = stream.tensor.export %9 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
// Compare the dispatch result (%10) with the reference (%11).
check.expect_eq(%10, %11) : tensor<512x256xi32>
return
}
}
// -----// IR Dump After ConvertToLLVM (iree-convert-to-llvm) ('builtin.module' operation) //----- //
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
// Public entry point, tagged with the `iree.abi.stub` attribute. It takes no
// arguments and returns nothing; its only work is to call the private
// implementation @_split_reduction_pass2 defined later in this module.
func.func @split_reduction_pass2() attributes {iree.abi.stub} {
call @_split_reduction_pass2() : () -> ()
return
}
hal.executable private @_split_reduction_pass2_dispatch_0 {
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
// Export record (ordinal 0) for the dispatch kernel, using #pipeline_layout and
// the #translation (CPUDoubleTilingExpert) attribute declared above.
// The attached region receives the device plus three index operands and
// returns the constants (8, 16, 1); the operands themselves are unused here.
// NOTE(review): presumably these are the workgroup counts along x/y/z. 8 and 16
// are consistent with 256/32 and 512/32, i.e. the 32x32 tiles the kernel body
// iterates over -- confirm against the HAL dialect documentation.
hal.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 ordinal(0) layout(#pipeline_layout) attributes {translation_info = #translation} {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index):
%c1 = arith.constant 1 : index
%c16 = arith.constant 16 : index
%c8 = arith.constant 8 : index
hal.return %c8, %c16, %c1 : index, index, index
}
builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} {
// Kernel body of dispatch 0 after conversion to the LLVM dialect.
//
// Arguments: pointers to the executable environment struct (%arg0, unused in
// this body), the dispatch state struct (%arg1) and the workgroup state struct
// (%arg2). Always returns the i32 constant 0 (see ^bb11).
//
// Shape of the computation, as written in the ops below:
//   * two buffer pointers are read from field 10 of the dispatch state and
//     wrapped in {ptr, ptr, offset, sizes[], strides[]} descriptor structs
//     (this layout matches MLIR's ranked-memref descriptor convention);
//   * fields 0 and 1 of the workgroup state, each multiplied by 32, select a
//     32x32 tile (32x32x128 on the 3-D buffer);
//   * loop nest: ^bb1 (%126: 0..32 step 1) / ^bb2 (%128: 0..32 step 4) /
//     ^bb4 (%184: 0..4 step 1) / ^bb6 (%241: 0..32 step 1);
//   * the innermost loop adds 4 x i32 vectors into an accumulator; ^bb8 then
//     horizontally adds the accumulator into one i32 element of the 2-D buffer.
// All descriptor offsets/strides are in i32 elements (they feed getelementptr
// on ptr<i32>).
llvm.func internal @_split_reduction_pass2_dispatch_0_generic_512x256x128(%arg0: !llvm.ptr<struct<"iree_hal_executable_environment_v0_t", (ptr<i32>, ptr<func<i32 (ptr<func<i32 (ptr<i8>)>>, ptr<i8>)>>, ptr<ptr<func<i32 (ptr<i8>)>>>, struct<"iree_hal_processor_v0_t", (array<8 x i64>)>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>> {llvm.align = 16 : i64, llvm.noalias}) -> i32 {
// Constants shared by the whole function: %8 = 0 (every loop's lower bound),
// %7 = 32 and %5 = 4 (loop bounds / step), %6 = 1 (step), %1/%2 = lane indices
// 2 and 3, %0/%3/%4 = zero-filled vector values used as insertion bases and as
// the initial accumulator.
%0 = llvm.mlir.constant(dense<0> : vector<4xi32>) : vector<4xi32>
%1 = llvm.mlir.constant(2 : index) : i64
%2 = llvm.mlir.constant(3 : index) : i64
%3 = llvm.mlir.constant(dense<0> : vector<1xi32>) : vector<1xi32>
%4 = llvm.mlir.constant(dense<0> : vector<1x1x4xi32>) : !llvm.array<1 x array<1 x vector<4xi32>>>
%5 = llvm.mlir.constant(4 : index) : i64
%6 = llvm.mlir.constant(1 : index) : i64
%7 = llvm.mlir.constant(32 : index) : i64
%8 = llvm.mlir.constant(0 : index) : i64
// First buffer: load the dispatch state, take field 10 (a ptr<ptr<i8>>), load
// its first entry and view it as ptr<i32>.
// NOTE(review): presumably field 10 is the binding pointer table and entry 0 is
// the read-only input binding -- confirm against the HAL executable ABI.
%9 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>>
%10 = llvm.extractvalue %9[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>
%11 = llvm.mlir.constant(0 : i64) : i64
%12 = llvm.load %10 : !llvm.ptr<ptr<i8>>
%13 = llvm.bitcast %12 : !llvm.ptr<i8> to !llvm.ptr<i32>
// 3-D descriptor %30 over the first buffer: both pointer slots = %13,
// offset 0, sizes [512, 256, 128], strides [32768, 128, 1].
%14 = llvm.mlir.undef : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%15 = llvm.insertvalue %13, %14[0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%16 = llvm.insertvalue %13, %15[1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%17 = llvm.mlir.constant(0 : index) : i64
%18 = llvm.insertvalue %17, %16[2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%19 = llvm.mlir.constant(512 : index) : i64
%20 = llvm.insertvalue %19, %18[3, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%21 = llvm.mlir.constant(32768 : index) : i64
%22 = llvm.insertvalue %21, %20[4, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%23 = llvm.mlir.constant(256 : index) : i64
%24 = llvm.insertvalue %23, %22[3, 1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%25 = llvm.mlir.constant(128 : index) : i64
%26 = llvm.insertvalue %25, %24[4, 1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%27 = llvm.mlir.constant(128 : index) : i64
%28 = llvm.insertvalue %27, %26[3, 2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%29 = llvm.mlir.constant(1 : index) : i64
%30 = llvm.insertvalue %29, %28[4, 2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
// Alignment hint: assume (pointer & 63) == 0, i.e. the base pointer is
// 64-byte aligned.
%31 = llvm.extractvalue %30[1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%32 = llvm.mlir.constant(0 : index) : i64
%33 = llvm.mlir.constant(63 : index) : i64
%34 = llvm.ptrtoint %31 : !llvm.ptr<i32> to i64
%35 = llvm.and %34, %33 : i64
%36 = llvm.icmp "eq" %35, %32 : i64
"llvm.intr.assume"(%36) : (i1) -> ()
// Second buffer: same field 10 of the dispatch state, but entry [1] of the
// pointer array (getelementptr index 1). This is the buffer stored to in ^bb8.
%37 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>>
%38 = llvm.extractvalue %37[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>
%39 = llvm.mlir.constant(1 : i64) : i64
%40 = llvm.getelementptr %38[1] : (!llvm.ptr<ptr<i8>>) -> !llvm.ptr<ptr<i8>>
%41 = llvm.load %40 : !llvm.ptr<ptr<i8>>
%42 = llvm.bitcast %41 : !llvm.ptr<i8> to !llvm.ptr<i32>
// 2-D descriptor %55 over the second buffer: offset 0, sizes [512, 256],
// strides [256, 1].
%43 = llvm.mlir.undef : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%44 = llvm.insertvalue %42, %43[0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%45 = llvm.insertvalue %42, %44[1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%46 = llvm.mlir.constant(0 : index) : i64
%47 = llvm.insertvalue %46, %45[2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%48 = llvm.mlir.constant(512 : index) : i64
%49 = llvm.insertvalue %48, %47[3, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%50 = llvm.mlir.constant(256 : index) : i64
%51 = llvm.insertvalue %50, %49[4, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%52 = llvm.mlir.constant(256 : index) : i64
%53 = llvm.insertvalue %52, %51[3, 1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%54 = llvm.mlir.constant(1 : index) : i64
%55 = llvm.insertvalue %54, %53[4, 1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
// Same 64-byte alignment assumption for the second buffer.
%56 = llvm.extractvalue %55[1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%57 = llvm.mlir.constant(0 : index) : i64
%58 = llvm.mlir.constant(63 : index) : i64
%59 = llvm.ptrtoint %56 : !llvm.ptr<i32> to i64
%60 = llvm.and %59, %58 : i64
%61 = llvm.icmp "eq" %60, %57 : i64
"llvm.intr.assume"(%61) : (i1) -> ()
// Fields 0 and 1 of the workgroup state, zero-extended to i64.
// NOTE(review): presumably workgroup id x and y -- confirm against the ABI.
%62 = llvm.load %arg2 : !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>>
%63 = llvm.extractvalue %62[0] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>
%64 = llvm.zext %63 : i32 to i64
%65 = llvm.load %arg2 : !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>>
%66 = llvm.extractvalue %65[1] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>
%67 = llvm.zext %66 : i32 to i64
// Tile origin: %69 = field1 * 32 (indexes dimension 0), %71 = field0 * 32
// (indexes dimension 1).
%68 = llvm.mlir.constant(32 : index) : i64
%69 = llvm.mul %67, %68 : i64
%70 = llvm.mlir.constant(32 : index) : i64
%71 = llvm.mul %64, %70 : i64
// 2-D tile descriptor %94 over the second buffer:
// offset = %69 * 256 + %71 * 1, sizes [32, 32], strides [256, 1].
%72 = llvm.mlir.undef : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%73 = llvm.extractvalue %55[0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%74 = llvm.bitcast %73 : !llvm.ptr<i32> to !llvm.ptr<i32>
%75 = llvm.insertvalue %74, %72[0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%76 = llvm.extractvalue %55[1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%77 = llvm.bitcast %76 : !llvm.ptr<i32> to !llvm.ptr<i32>
%78 = llvm.insertvalue %77, %75[1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%79 = llvm.extractvalue %55[4, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%80 = llvm.extractvalue %55[4, 1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%81 = llvm.extractvalue %55[2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%82 = llvm.mul %69, %79 : i64
%83 = llvm.add %81, %82 : i64
%84 = llvm.mul %71, %80 : i64
%85 = llvm.add %83, %84 : i64
%86 = llvm.insertvalue %85, %78[2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%87 = llvm.mlir.constant(32 : i64) : i64
%88 = llvm.mlir.constant(1 : i64) : i64
%89 = llvm.insertvalue %87, %86[3, 1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%90 = llvm.insertvalue %88, %89[4, 1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%91 = llvm.mlir.constant(32 : i64) : i64
%92 = llvm.mlir.constant(256 : i64) : i64
%93 = llvm.insertvalue %91, %90[3, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%94 = llvm.insertvalue %92, %93[4, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
// 3-D tile descriptor %125 over the first buffer:
// offset = %69 * 32768 + %71 * 128 + 0 * 1, sizes [32, 32, 128],
// strides [32768, 128, 1].
%95 = llvm.mlir.undef : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%96 = llvm.extractvalue %30[0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%97 = llvm.bitcast %96 : !llvm.ptr<i32> to !llvm.ptr<i32>
%98 = llvm.insertvalue %97, %95[0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%99 = llvm.extractvalue %30[1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%100 = llvm.bitcast %99 : !llvm.ptr<i32> to !llvm.ptr<i32>
%101 = llvm.insertvalue %100, %98[1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%102 = llvm.extractvalue %30[4, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%103 = llvm.extractvalue %30[4, 1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%104 = llvm.extractvalue %30[4, 2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%105 = llvm.extractvalue %30[2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%106 = llvm.mul %69, %102 : i64
%107 = llvm.add %105, %106 : i64
%108 = llvm.mul %71, %103 : i64
%109 = llvm.add %107, %108 : i64
%110 = llvm.mlir.constant(0 : i64) : i64
%111 = llvm.mul %110, %104 : i64
%112 = llvm.add %109, %111 : i64
%113 = llvm.insertvalue %112, %101[2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%114 = llvm.mlir.constant(128 : i64) : i64
%115 = llvm.mlir.constant(1 : i64) : i64
%116 = llvm.insertvalue %114, %113[3, 2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%117 = llvm.insertvalue %115, %116[4, 2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%118 = llvm.mlir.constant(32 : i64) : i64
%119 = llvm.mlir.constant(128 : i64) : i64
%120 = llvm.insertvalue %118, %117[3, 1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%121 = llvm.insertvalue %119, %120[4, 1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%122 = llvm.mlir.constant(32 : i64) : i64
%123 = llvm.mlir.constant(32768 : i64) : i64
%124 = llvm.insertvalue %122, %121[3, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%125 = llvm.insertvalue %123, %124[4, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
llvm.br ^bb1(%8 : i64)
// Outer loop header: %126 runs from 0 while %126 < 32; incremented by 1 in ^bb10.
^bb1(%126: i64): // 2 preds: ^bb0, ^bb10
%127 = llvm.icmp "slt" %126, %7 : i64
llvm.cond_br %127, ^bb2(%8 : i64), ^bb11
// Second loop header: %128 runs from 0 while %128 < 32; incremented by 4 in ^bb9.
^bb2(%128: i64): // 2 preds: ^bb1, ^bb9
%129 = llvm.icmp "slt" %128, %7 : i64
llvm.cond_br %129, ^bb3, ^bb10
^bb3: // pred: ^bb2
// Sub-tile of the first buffer at (%126, %128): descriptor %160 with
// offset = tile offset + %126 * 32768 + %128 * 128 + 0, sizes [1, 4, 128],
// strides [32768, 128, 1].
%130 = llvm.mlir.undef : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%131 = llvm.extractvalue %125[0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%132 = llvm.bitcast %131 : !llvm.ptr<i32> to !llvm.ptr<i32>
%133 = llvm.insertvalue %132, %130[0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%134 = llvm.extractvalue %125[1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%135 = llvm.bitcast %134 : !llvm.ptr<i32> to !llvm.ptr<i32>
%136 = llvm.insertvalue %135, %133[1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%137 = llvm.extractvalue %125[4, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%138 = llvm.extractvalue %125[4, 1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%139 = llvm.extractvalue %125[4, 2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%140 = llvm.extractvalue %125[2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%141 = llvm.mul %126, %137 : i64
%142 = llvm.add %140, %141 : i64
%143 = llvm.mul %128, %138 : i64
%144 = llvm.add %142, %143 : i64
%145 = llvm.mlir.constant(0 : i64) : i64
%146 = llvm.mul %145, %139 : i64
%147 = llvm.add %144, %146 : i64
%148 = llvm.insertvalue %147, %136[2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%149 = llvm.mlir.constant(128 : i64) : i64
%150 = llvm.mlir.constant(1 : i64) : i64
%151 = llvm.insertvalue %149, %148[3, 2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%152 = llvm.insertvalue %150, %151[4, 2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%153 = llvm.mlir.constant(4 : i64) : i64
%154 = llvm.mlir.constant(128 : i64) : i64
%155 = llvm.insertvalue %153, %152[3, 1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%156 = llvm.insertvalue %154, %155[4, 1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%157 = llvm.mlir.constant(1 : i64) : i64
%158 = llvm.mlir.constant(32768 : i64) : i64
%159 = llvm.insertvalue %157, %156[3, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%160 = llvm.insertvalue %158, %159[4, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
// Matching sub-tile of the second buffer: descriptor %183 with
// offset = tile offset + %126 * 256 + %128 * 1, sizes [1, 4], strides [256, 1].
%161 = llvm.mlir.undef : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%162 = llvm.extractvalue %94[0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%163 = llvm.bitcast %162 : !llvm.ptr<i32> to !llvm.ptr<i32>
%164 = llvm.insertvalue %163, %161[0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%165 = llvm.extractvalue %94[1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%166 = llvm.bitcast %165 : !llvm.ptr<i32> to !llvm.ptr<i32>
%167 = llvm.insertvalue %166, %164[1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%168 = llvm.extractvalue %94[4, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%169 = llvm.extractvalue %94[4, 1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%170 = llvm.extractvalue %94[2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%171 = llvm.mul %126, %168 : i64
%172 = llvm.add %170, %171 : i64
%173 = llvm.mul %128, %169 : i64
%174 = llvm.add %172, %173 : i64
%175 = llvm.insertvalue %174, %167[2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%176 = llvm.mlir.constant(4 : i64) : i64
%177 = llvm.mlir.constant(1 : i64) : i64
%178 = llvm.insertvalue %176, %175[3, 1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%179 = llvm.insertvalue %177, %178[4, 1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%180 = llvm.mlir.constant(1 : i64) : i64
%181 = llvm.mlir.constant(256 : i64) : i64
%182 = llvm.insertvalue %180, %179[3, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%183 = llvm.insertvalue %181, %182[4, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
llvm.br ^bb4(%8 : i64)
// Third loop header: %184 runs from 0 while %184 < 4; incremented by 1 in ^bb8.
^bb4(%184: i64): // 2 preds: ^bb3, ^bb8
%185 = llvm.icmp "slt" %184, %5 : i64
llvm.cond_br %185, ^bb5, ^bb9
^bb5: // pred: ^bb4
// Single row of the first buffer for this %184: descriptor %217 with
// offset = sub-tile offset + 0 * 32768 + %184 * 128 + 0, sizes [1, 1, 128],
// strides [32768, 128, 1].
%186 = llvm.mlir.undef : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%187 = llvm.extractvalue %160[0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%188 = llvm.bitcast %187 : !llvm.ptr<i32> to !llvm.ptr<i32>
%189 = llvm.insertvalue %188, %186[0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%190 = llvm.extractvalue %160[1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%191 = llvm.bitcast %190 : !llvm.ptr<i32> to !llvm.ptr<i32>
%192 = llvm.insertvalue %191, %189[1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%193 = llvm.extractvalue %160[4, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%194 = llvm.extractvalue %160[4, 1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%195 = llvm.extractvalue %160[4, 2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%196 = llvm.extractvalue %160[2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%197 = llvm.mlir.constant(0 : i64) : i64
%198 = llvm.mul %197, %193 : i64
%199 = llvm.add %196, %198 : i64
%200 = llvm.mul %184, %194 : i64
%201 = llvm.add %199, %200 : i64
%202 = llvm.mlir.constant(0 : i64) : i64
%203 = llvm.mul %202, %195 : i64
%204 = llvm.add %201, %203 : i64
%205 = llvm.insertvalue %204, %192[2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%206 = llvm.mlir.constant(128 : i64) : i64
%207 = llvm.mlir.constant(1 : i64) : i64
%208 = llvm.insertvalue %206, %205[3, 2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%209 = llvm.insertvalue %207, %208[4, 2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%210 = llvm.mlir.constant(1 : i64) : i64
%211 = llvm.mlir.constant(128 : i64) : i64
%212 = llvm.insertvalue %210, %209[3, 1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%213 = llvm.insertvalue %211, %212[4, 1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%214 = llvm.mlir.constant(1 : i64) : i64
%215 = llvm.mlir.constant(32768 : i64) : i64
%216 = llvm.insertvalue %214, %213[3, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%217 = llvm.insertvalue %215, %216[4, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
// Re-describe that row as 4-D (descriptor %240): same pointers and offset,
// sizes [1, 1, 32, 4], strides [32768, 128, 4, 1] -- the 128-element row is
// viewed as 32 groups of 4 contiguous elements.
%218 = llvm.mlir.undef : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<4 x i64>, array<4 x i64>)>
%219 = llvm.extractvalue %217[0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%220 = llvm.insertvalue %219, %218[0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<4 x i64>, array<4 x i64>)>
%221 = llvm.extractvalue %217[1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%222 = llvm.insertvalue %221, %220[1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<4 x i64>, array<4 x i64>)>
%223 = llvm.extractvalue %217[2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%224 = llvm.insertvalue %223, %222[2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<4 x i64>, array<4 x i64>)>
%225 = llvm.mlir.constant(1 : index) : i64
%226 = llvm.mlir.constant(1 : index) : i64
%227 = llvm.mlir.constant(32 : index) : i64
%228 = llvm.mlir.constant(4 : index) : i64
%229 = llvm.insertvalue %225, %224[3, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<4 x i64>, array<4 x i64>)>
%230 = llvm.insertvalue %226, %229[3, 1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<4 x i64>, array<4 x i64>)>
%231 = llvm.insertvalue %227, %230[3, 2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<4 x i64>, array<4 x i64>)>
%232 = llvm.insertvalue %228, %231[3, 3] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<4 x i64>, array<4 x i64>)>
%233 = llvm.mlir.constant(32768 : index) : i64
%234 = llvm.insertvalue %233, %232[4, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<4 x i64>, array<4 x i64>)>
%235 = llvm.mlir.constant(128 : index) : i64
%236 = llvm.insertvalue %235, %234[4, 1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<4 x i64>, array<4 x i64>)>
%237 = llvm.mlir.constant(4 : index) : i64
%238 = llvm.insertvalue %237, %236[4, 2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<4 x i64>, array<4 x i64>)>
%239 = llvm.mlir.constant(1 : index) : i64
%240 = llvm.insertvalue %239, %238[4, 3] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<4 x i64>, array<4 x i64>)>
// Enter the innermost loop with index 0 and the all-zero accumulator %4.
llvm.br ^bb6(%8, %4 : i64, !llvm.array<1 x array<1 x vector<4xi32>>>)
// Innermost loop header: %241 runs from 0 while %241 < 32 (step 1, see the end
// of ^bb7); %242 is the loop-carried 4-lane accumulator.
^bb6(%241: i64, %242: !llvm.array<1 x array<1 x vector<4xi32>>>): // 2 preds: ^bb5, ^bb7
%243 = llvm.icmp "slt" %241, %7 : i64
llvm.cond_br %243, ^bb7, ^bb8
^bb7: // pred: ^bb6
// Address of group %241: descriptor offset + 0 * 32768 + 0 * 128 + %241 * 4 + 0,
// then load 4 consecutive i32 as one vector (alignment 4).
%244 = llvm.extractvalue %240[1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<4 x i64>, array<4 x i64>)>
%245 = llvm.extractvalue %240[2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<4 x i64>, array<4 x i64>)>
%246 = llvm.mlir.constant(32768 : index) : i64
%247 = llvm.mul %8, %246 : i64
%248 = llvm.add %245, %247 : i64
%249 = llvm.mlir.constant(128 : index) : i64
%250 = llvm.mul %8, %249 : i64
%251 = llvm.add %248, %250 : i64
%252 = llvm.mlir.constant(4 : index) : i64
%253 = llvm.mul %241, %252 : i64
%254 = llvm.add %251, %253 : i64
%255 = llvm.add %254, %8 : i64
%256 = llvm.getelementptr %244[%255] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32>
%257 = llvm.bitcast %256 : !llvm.ptr<i32> to !llvm.ptr<vector<4xi32>>
%258 = llvm.load %257 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>>
// Lane-by-lane add: for each lane k in 0..3, loaded[k] + accumulator[k] is
// inserted into a fresh vector; %286 holds the four sums.
%259 = llvm.extractvalue %242[0, 0] : !llvm.array<1 x array<1 x vector<4xi32>>>
%260 = llvm.mlir.constant(0 : i64) : i64
%261 = llvm.extractelement %259[%260 : i64] : vector<4xi32>
%262 = llvm.mlir.constant(0 : i64) : i64
%263 = llvm.extractelement %258[%262 : i64] : vector<4xi32>
%264 = llvm.add %263, %261 : i32
%265 = llvm.insertelement %264, %0[%8 : i64] : vector<4xi32>
%266 = llvm.extractvalue %242[0, 0] : !llvm.array<1 x array<1 x vector<4xi32>>>
%267 = llvm.mlir.constant(1 : i64) : i64
%268 = llvm.extractelement %266[%267 : i64] : vector<4xi32>
%269 = llvm.mlir.constant(1 : i64) : i64
%270 = llvm.extractelement %258[%269 : i64] : vector<4xi32>
%271 = llvm.add %270, %268 : i32
%272 = llvm.insertelement %271, %265[%6 : i64] : vector<4xi32>
%273 = llvm.extractvalue %242[0, 0] : !llvm.array<1 x array<1 x vector<4xi32>>>
%274 = llvm.mlir.constant(2 : i64) : i64
%275 = llvm.extractelement %273[%274 : i64] : vector<4xi32>
%276 = llvm.mlir.constant(2 : i64) : i64
%277 = llvm.extractelement %258[%276 : i64] : vector<4xi32>
%278 = llvm.add %277, %275 : i32
%279 = llvm.insertelement %278, %272[%1 : i64] : vector<4xi32>
%280 = llvm.extractvalue %242[0, 0] : !llvm.array<1 x array<1 x vector<4xi32>>>
%281 = llvm.mlir.constant(3 : i64) : i64
%282 = llvm.extractelement %280[%281 : i64] : vector<4xi32>
%283 = llvm.mlir.constant(3 : i64) : i64
%284 = llvm.extractelement %258[%283 : i64] : vector<4xi32>
%285 = llvm.add %284, %282 : i32
%286 = llvm.insertelement %285, %279[%2 : i64] : vector<4xi32>
// Repack the four sums from %286 into the array-of-vector form (%310) that the
// loop carries, starting from the zero constant %4 and inserting lane by lane.
%287 = llvm.mlir.constant(0 : i64) : i64
%288 = llvm.extractelement %286[%287 : i64] : vector<4xi32>
%289 = llvm.extractvalue %4[0, 0] : !llvm.array<1 x array<1 x vector<4xi32>>>
%290 = llvm.mlir.constant(0 : i64) : i64
%291 = llvm.insertelement %288, %289[%290 : i64] : vector<4xi32>
%292 = llvm.insertvalue %291, %4[0, 0] : !llvm.array<1 x array<1 x vector<4xi32>>>
%293 = llvm.mlir.constant(1 : i64) : i64
%294 = llvm.extractelement %286[%293 : i64] : vector<4xi32>
%295 = llvm.extractvalue %292[0, 0] : !llvm.array<1 x array<1 x vector<4xi32>>>
%296 = llvm.mlir.constant(1 : i64) : i64
%297 = llvm.insertelement %294, %295[%296 : i64] : vector<4xi32>
%298 = llvm.insertvalue %297, %292[0, 0] : !llvm.array<1 x array<1 x vector<4xi32>>>
%299 = llvm.mlir.constant(2 : i64) : i64
%300 = llvm.extractelement %286[%299 : i64] : vector<4xi32>
%301 = llvm.extractvalue %298[0, 0] : !llvm.array<1 x array<1 x vector<4xi32>>>
%302 = llvm.mlir.constant(2 : i64) : i64
%303 = llvm.insertelement %300, %301[%302 : i64] : vector<4xi32>
%304 = llvm.insertvalue %303, %298[0, 0] : !llvm.array<1 x array<1 x vector<4xi32>>>
%305 = llvm.mlir.constant(3 : i64) : i64
%306 = llvm.extractelement %286[%305 : i64] : vector<4xi32>
%307 = llvm.extractvalue %304[0, 0] : !llvm.array<1 x array<1 x vector<4xi32>>>
%308 = llvm.mlir.constant(3 : i64) : i64
%309 = llvm.insertelement %306, %307[%308 : i64] : vector<4xi32>
%310 = llvm.insertvalue %309, %304[0, 0] : !llvm.array<1 x array<1 x vector<4xi32>>>
// Next group: %241 + 1, carrying the updated accumulator.
%311 = llvm.add %241, %6 : i64
llvm.br ^bb6(%311, %310 : i64, !llvm.array<1 x array<1 x vector<4xi32>>>)
^bb8: // pred: ^bb6
// Innermost loop finished. Read the current value of the second buffer at
// element offset (sub-tile offset + 0 * 256 + %184), add the horizontal sum of
// the accumulator %242 to it, and store the result back to the same offset.
// NOTE(review): the kernel reads this element (%319) before ever writing it, so
// the result depends on the buffer's prior contents. At the call site in
// @_split_reduction_pass2 this buffer is allocated `uninitialized` and bound as
// `wo` -- confirm it is meant to be zero-filled before the dispatch.
%312 = llvm.extractvalue %183[1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%313 = llvm.extractvalue %183[2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%314 = llvm.mlir.constant(256 : index) : i64
%315 = llvm.mul %8, %314 : i64
%316 = llvm.add %313, %315 : i64
%317 = llvm.add %316, %184 : i64
%318 = llvm.getelementptr %312[%317] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32>
%319 = llvm.load %318 : !llvm.ptr<i32>
%320 = llvm.extractvalue %242[0, 0] : !llvm.array<1 x array<1 x vector<4xi32>>>
%321 = "llvm.intr.vector.reduce.add"(%320) : (vector<4xi32>) -> i32
%322 = llvm.add %319, %321 : i32
// The sum takes a round trip through a vector<1xi32> (insert at lane 0, then
// extract lane 0) before being stored; %325 equals %322.
%323 = llvm.insertelement %322, %3[%8 : i64] : vector<1xi32>
%324 = llvm.mlir.constant(0 : i64) : i64
%325 = llvm.extractelement %323[%324 : i64] : vector<1xi32>
%326 = llvm.extractvalue %183[1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%327 = llvm.extractvalue %183[2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%328 = llvm.mlir.constant(256 : index) : i64
%329 = llvm.mul %8, %328 : i64
%330 = llvm.add %327, %329 : i64
%331 = llvm.add %330, %184 : i64
%332 = llvm.getelementptr %326[%331] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32>
llvm.store %325, %332 : !llvm.ptr<i32>
// Latch of the third loop: %184 + 1.
%333 = llvm.add %184, %6 : i64
llvm.br ^bb4(%333 : i64)
^bb9: // pred: ^bb4
// Latch of the second loop: %128 + 4.
%334 = llvm.add %128, %5 : i64
llvm.br ^bb2(%334 : i64)
^bb10: // pred: ^bb2
// Latch of the outer loop: %126 + 1.
%335 = llvm.add %126, %6 : i64
llvm.br ^bb1(%335 : i64)
^bb11: // pred: ^bb1
// All iterations done: return i32 0.
%336 = llvm.mlir.constant(0 : i32) : i32
llvm.return %336 : i32
}
}
}
}
// Private implementation behind @split_reduction_pass2 (stream-dialect form).
// Steps, as written below:
//   1. allocate a 67108864-byte transient resource and fill it with i32 1;
//   2. allocate a 1048576-byte external resource;
//   3. concurrently (a) dispatch the kernel with workload [512, 256, 1], reading
//      the transient resource and writing the external one, and (b) fill bytes
//      [524288, 1048576) of the external resource with i32 128;
//   4. export the two 524288-byte halves as tensor<512x256xi32> and compare
//      them with check.expect_eq.
// Size arithmetic: 67108864 = 512*256*128*4 and 524288 = 512*256*4, i.e. i32
// tensors of 512x256x128 and 512x256 elements respectively.
func.func private @_split_reduction_pass2() {
%c1048576 = arith.constant 1048576 : index
%c0 = arith.constant 0 : index
%c524288 = arith.constant 524288 : index
%c67108864 = arith.constant 67108864 : index
%c1_i32 = arith.constant 1 : i32
%c128_i32 = arith.constant 128 : i32
%c512 = arith.constant 512 : index
%c256 = arith.constant 256 : index
%c1 = arith.constant 1 : index
// Input buffer: allocated uninitialized, then entirely filled with the i32
// value 1 by the first command buffer.
%0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c67108864}
%1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c67108864}) {
stream.cmd.fill %c1_i32, %arg0[%c0 for %c67108864] : i32 -> !stream.resource<transient>{%c67108864}
} => !stream.timepoint
%2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c67108864}
// util.do_not_optimize hides the filled resource from the optimizer; its size
// is therefore re-queried dynamically (%4) instead of reusing %c67108864.
%3 = util.do_not_optimize(%2) : !stream.resource<transient>
%4 = stream.resource.size %3 : !stream.resource<transient>
// Output buffer: holds two 524288-byte halves -- the dispatch result range and
// the expected-value range.
// NOTE(review): it is allocated `uninitialized` and nothing in this function
// fills bytes [0, 524288) before the dispatch runs. If the kernel accumulates
// into its output rather than overwriting it, the compared result depends on
// stale memory -- confirm against the kernel body.
%5 = stream.resource.alloc uninitialized : !stream.resource<external>{%c1048576}
%6 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%4}, %5 as %arg1: !stream.resource<external>{%c1048576}) {
// The dispatch and the fill are issued inside one stream.cmd.concurrent region.
// The dispatch binds the whole external resource ([0 for 1048576]) as `wo`,
// which overlaps the range the fill writes.
stream.cmd.concurrent {
stream.cmd.dispatch @_split_reduction_pass2_dispatch_0::@_split_reduction_pass2_dispatch_0_generic_512x256x128[%c512, %c256, %c1] {
ro %arg0[%c0 for %4] : !stream.resource<transient>{%4},
wo %arg1[%c0 for %c1048576] : !stream.resource<external>{%c1048576}
} attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>]}
// Expected values: 128 in every i32 of the upper half.
stream.cmd.fill %c128_i32, %arg1[%c524288 for %c524288] : i32 -> !stream.resource<external>{%c1048576}
}
} => !stream.timepoint
%7 = stream.timepoint.await %6 => %5 : !stream.resource<external>{%c1048576}
// Split the buffer: %8 = lower half (offset 0), %9 = upper half (offset 524288).
%8 = stream.resource.subview %7[%c0] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288}
%9 = stream.resource.subview %7[%c524288] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288}
%10 = stream.tensor.export %8 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
%11 = stream.tensor.export %9 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
// Compare the lower half (%10) against the 128-filled upper half (%11).
check.expect_eq(%10, %11) : tensor<512x256xi32>
return
}
}
// -----// IR Dump After ReconcileUnrealizedCasts (reconcile-unrealized-casts) ('builtin.module' operation) //----- //
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
// Public entry point (tagged iree.abi.stub). Takes no arguments, returns nothing,
// and only forwards to the private @_split_reduction_pass2 function.
func.func @split_reduction_pass2() attributes {iree.abi.stub} {
call @_split_reduction_pass2() : () -> ()
return
}
hal.executable private @_split_reduction_pass2_dispatch_0 {
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
// Export entry (ordinal 0) for the dispatch kernel. The attached region ignores all
// four of its block arguments and always yields the constant triple (8, 16, 1).
// NOTE(review): presumably this triple is the workgroup count per dimension
// (8 * 32 = 256 and 16 * 32 = 512 match the 32-wide tiles in the kernel) - confirm.
hal.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 ordinal(0) layout(#pipeline_layout) attributes {translation_info = #translation} {
^bb0(%device: !hal.device, %in0: index, %in1: index, %in2: index):
%count0 = arith.constant 8 : index
%count1 = arith.constant 16 : index
%count2 = arith.constant 1 : index
hal.return %count0, %count1, %count2 : index, index, index
}
builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} {
// Dispatch kernel, fully lowered to the LLVM dialect.
//
// Inputs:
//   %arg0 - environment struct pointer; not referenced in the body.
//   %arg1 - dispatch-state struct pointer; field 10 (a ptr<ptr<i8>>) is read twice
//           to fetch entry 0 (source, %13) and entry 1 (destination, %41).
//   %arg2 - workgroup-state struct pointer; fields 0 and 1 are read and each
//           multiplied by 32 to select a 32x32 tile.
// NOTE(review): field 10 is presumably the binding pointer table and fields 0/1 the
// workgroup ids - confirm against the IREE HAL executable ABI.
//
// Work done: the source is addressed as i32 with strides [32768, 128, 1] and the
// destination as i32 with strides [256, 1]. For each of the 32x32 elements of the
// tile, 32 vector<4xi32> loads (128 consecutive i32 values) are summed lane-wise,
// the four lanes are reduced, and the result is ADDED to the value already present
// in the destination element (load at L-value, add, store back).
// Returns: the i32 constant 0.
llvm.func internal @_split_reduction_pass2_dispatch_0_generic_512x256x128(%arg0: !llvm.ptr<struct<"iree_hal_executable_environment_v0_t", (ptr<i32>, ptr<func<i32 (ptr<func<i32 (ptr<i8>)>>, ptr<i8>)>>, ptr<ptr<func<i32 (ptr<i8>)>>>, struct<"iree_hal_processor_v0_t", (array<8 x i64>)>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>> {llvm.align = 16 : i64, llvm.noalias}) -> i32 {
// Constants reused throughout: %8 = 0 (loop start), %6 = 1 and %5 = 4 (loop steps;
// %5 is also the bound of the ^bb4 loop), %7 = 32 (bound of the other loops),
// %4 = all-zero accumulator, %0 / %3 = zero vectors used as insertion bases.
%0 = llvm.mlir.constant(dense<0> : vector<4xi32>) : vector<4xi32>
%1 = llvm.mlir.constant(2 : index) : i64
%2 = llvm.mlir.constant(3 : index) : i64
%3 = llvm.mlir.constant(dense<0> : vector<1xi32>) : vector<1xi32>
%4 = llvm.mlir.constant(dense<0> : vector<1x1x4xi32>) : !llvm.array<1 x array<1 x vector<4xi32>>>
%5 = llvm.mlir.constant(4 : index) : i64
%6 = llvm.mlir.constant(1 : index) : i64
%7 = llvm.mlir.constant(32 : index) : i64
%8 = llvm.mlir.constant(0 : index) : i64
// Source base pointer %13: entry 0 of the pointer array in dispatch-state field 10,
// reinterpreted as i32*.
%9 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>>
%10 = llvm.extractvalue %9[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>
%11 = llvm.mlir.constant(0 : i64) : i64
%12 = llvm.load %10 : !llvm.ptr<ptr<i8>>
%13 = llvm.bitcast %12 : !llvm.ptr<i8> to !llvm.ptr<i32>
// Rank-3 descriptor struct for the source: offset 0, sizes [512, 256, 128],
// strides [32768, 128, 1]. The finished struct (%30) is not used again; the stride
// constants %21, %25 and %29 are reused for the tile offset further down.
%14 = llvm.mlir.undef : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%15 = llvm.insertvalue %13, %14[0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%16 = llvm.insertvalue %13, %15[1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%17 = llvm.mlir.constant(0 : index) : i64
%18 = llvm.insertvalue %17, %16[2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%19 = llvm.mlir.constant(512 : index) : i64
%20 = llvm.insertvalue %19, %18[3, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%21 = llvm.mlir.constant(32768 : index) : i64
%22 = llvm.insertvalue %21, %20[4, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%23 = llvm.mlir.constant(256 : index) : i64
%24 = llvm.insertvalue %23, %22[3, 1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%25 = llvm.mlir.constant(128 : index) : i64
%26 = llvm.insertvalue %25, %24[4, 1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%27 = llvm.mlir.constant(128 : index) : i64
%28 = llvm.insertvalue %27, %26[3, 2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%29 = llvm.mlir.constant(1 : index) : i64
%30 = llvm.insertvalue %29, %28[4, 2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
// Alignment hint: assume (source pointer & 63) == 0, i.e. 64-byte alignment.
%31 = llvm.mlir.constant(0 : index) : i64
%32 = llvm.mlir.constant(63 : index) : i64
%33 = llvm.ptrtoint %13 : !llvm.ptr<i32> to i64
%34 = llvm.and %33, %32 : i64
%35 = llvm.icmp "eq" %34, %31 : i64
"llvm.intr.assume"(%35) : (i1) -> ()
// Destination base pointer %41: entry 1 of the same pointer array, as i32*.
%36 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>>
%37 = llvm.extractvalue %36[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>
%38 = llvm.mlir.constant(1 : i64) : i64
%39 = llvm.getelementptr %37[1] : (!llvm.ptr<ptr<i8>>) -> !llvm.ptr<ptr<i8>>
%40 = llvm.load %39 : !llvm.ptr<ptr<i8>>
%41 = llvm.bitcast %40 : !llvm.ptr<i8> to !llvm.ptr<i32>
// Rank-2 descriptor struct for the destination: offset 0, sizes [512, 256],
// strides [256, 1]. The finished struct (%54) is not used again; %45, %49 and %53
// are reused for the tile offset further down.
%42 = llvm.mlir.undef : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%43 = llvm.insertvalue %41, %42[0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%44 = llvm.insertvalue %41, %43[1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%45 = llvm.mlir.constant(0 : index) : i64
%46 = llvm.insertvalue %45, %44[2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%47 = llvm.mlir.constant(512 : index) : i64
%48 = llvm.insertvalue %47, %46[3, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%49 = llvm.mlir.constant(256 : index) : i64
%50 = llvm.insertvalue %49, %48[4, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%51 = llvm.mlir.constant(256 : index) : i64
%52 = llvm.insertvalue %51, %50[3, 1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%53 = llvm.mlir.constant(1 : index) : i64
%54 = llvm.insertvalue %53, %52[4, 1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
// Alignment hint: assume (destination pointer & 63) == 0.
%55 = llvm.mlir.constant(0 : index) : i64
%56 = llvm.mlir.constant(63 : index) : i64
%57 = llvm.ptrtoint %41 : !llvm.ptr<i32> to i64
%58 = llvm.and %57, %56 : i64
%59 = llvm.icmp "eq" %58, %55 : i64
"llvm.intr.assume"(%59) : (i1) -> ()
// Tile origin: %67 = 32 * (workgroup-state field 1) is applied with the outer
// strides (256 / 32768); %69 = 32 * (workgroup-state field 0) is applied with the
// next strides (1 / 128).
%60 = llvm.load %arg2 : !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>>
%61 = llvm.extractvalue %60[0] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>
%62 = llvm.zext %61 : i32 to i64
%63 = llvm.load %arg2 : !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>>
%64 = llvm.extractvalue %63[1] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>
%65 = llvm.zext %64 : i32 to i64
%66 = llvm.mlir.constant(32 : index) : i64
%67 = llvm.mul %65, %66 : i64
%68 = llvm.mlir.constant(32 : index) : i64
%69 = llvm.mul %62, %68 : i64
// Destination tile: element offset %76 = %67 * 256 + %69, sizes [32, 32],
// strides [256, 1]. Only %76 (and constants %79, %83) are used later.
%70 = llvm.mlir.undef : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%71 = llvm.insertvalue %41, %70[0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%72 = llvm.insertvalue %41, %71[1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%73 = llvm.mul %67, %49 : i64
%74 = llvm.add %45, %73 : i64
%75 = llvm.mul %69, %53 : i64
%76 = llvm.add %74, %75 : i64
%77 = llvm.insertvalue %76, %72[2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%78 = llvm.mlir.constant(32 : i64) : i64
%79 = llvm.mlir.constant(1 : i64) : i64
%80 = llvm.insertvalue %78, %77[3, 1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%81 = llvm.insertvalue %79, %80[4, 1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%82 = llvm.mlir.constant(32 : i64) : i64
%83 = llvm.mlir.constant(256 : i64) : i64
%84 = llvm.insertvalue %82, %81[3, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%85 = llvm.insertvalue %83, %84[4, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
// Source tile: element offset %95 = %67 * 32768 + %69 * 128 (+ 0 * 1),
// sizes [32, 32, 128], strides [32768, 128, 1]. Only %95 (and constants %98, %102,
// %106) are used later.
%86 = llvm.mlir.undef : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%87 = llvm.insertvalue %13, %86[0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%88 = llvm.insertvalue %13, %87[1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%89 = llvm.mul %67, %21 : i64
%90 = llvm.add %17, %89 : i64
%91 = llvm.mul %69, %25 : i64
%92 = llvm.add %90, %91 : i64
%93 = llvm.mlir.constant(0 : i64) : i64
%94 = llvm.mul %93, %29 : i64
%95 = llvm.add %92, %94 : i64
%96 = llvm.insertvalue %95, %88[2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%97 = llvm.mlir.constant(128 : i64) : i64
%98 = llvm.mlir.constant(1 : i64) : i64
%99 = llvm.insertvalue %97, %96[3, 2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%100 = llvm.insertvalue %98, %99[4, 2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%101 = llvm.mlir.constant(32 : i64) : i64
%102 = llvm.mlir.constant(128 : i64) : i64
%103 = llvm.insertvalue %101, %100[3, 1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%104 = llvm.insertvalue %102, %103[4, 1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%105 = llvm.mlir.constant(32 : i64) : i64
%106 = llvm.mlir.constant(32768 : i64) : i64
%107 = llvm.insertvalue %105, %104[3, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%108 = llvm.insertvalue %106, %107[4, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
// Loop A header: %109 runs 0..31 in steps of 1 (incremented in ^bb10).
llvm.br ^bb1(%8 : i64)
^bb1(%109: i64): // 2 preds: ^bb0, ^bb10
%110 = llvm.icmp "slt" %109, %7 : i64
llvm.cond_br %110, ^bb2(%8 : i64), ^bb11
// Loop B header: %111 runs 0, 4, ..., 28 (incremented by %5 = 4 in ^bb9).
^bb2(%111: i64): // 2 preds: ^bb1, ^bb9
%112 = llvm.icmp "slt" %111, %7 : i64
llvm.cond_br %112, ^bb3, ^bb10
// Loop B body prologue: offsets for the current (%109, %111) position.
//   source:      %122 = %95 + %109 * 32768 + %111 * 128
//   destination: %142 = %76 + %109 * 256 + %111
// The descriptor structs built around them (%135, %151) are not used again.
^bb3: // pred: ^bb2
%113 = llvm.mlir.undef : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%114 = llvm.insertvalue %13, %113[0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%115 = llvm.insertvalue %13, %114[1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%116 = llvm.mul %109, %106 : i64
%117 = llvm.add %95, %116 : i64
%118 = llvm.mul %111, %102 : i64
%119 = llvm.add %117, %118 : i64
%120 = llvm.mlir.constant(0 : i64) : i64
%121 = llvm.mul %120, %98 : i64
%122 = llvm.add %119, %121 : i64
%123 = llvm.insertvalue %122, %115[2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%124 = llvm.mlir.constant(128 : i64) : i64
%125 = llvm.mlir.constant(1 : i64) : i64
%126 = llvm.insertvalue %124, %123[3, 2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%127 = llvm.insertvalue %125, %126[4, 2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%128 = llvm.mlir.constant(4 : i64) : i64
%129 = llvm.mlir.constant(128 : i64) : i64
%130 = llvm.insertvalue %128, %127[3, 1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%131 = llvm.insertvalue %129, %130[4, 1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%132 = llvm.mlir.constant(1 : i64) : i64
%133 = llvm.mlir.constant(32768 : i64) : i64
%134 = llvm.insertvalue %132, %131[3, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%135 = llvm.insertvalue %133, %134[4, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%136 = llvm.mlir.undef : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%137 = llvm.insertvalue %41, %136[0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%138 = llvm.insertvalue %41, %137[1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%139 = llvm.mul %109, %83 : i64
%140 = llvm.add %76, %139 : i64
%141 = llvm.mul %111, %79 : i64
%142 = llvm.add %140, %141 : i64
%143 = llvm.insertvalue %142, %138[2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%144 = llvm.mlir.constant(4 : i64) : i64
%145 = llvm.mlir.constant(1 : i64) : i64
%146 = llvm.insertvalue %144, %143[3, 1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%147 = llvm.insertvalue %145, %146[4, 1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%148 = llvm.mlir.constant(1 : i64) : i64
%149 = llvm.mlir.constant(256 : i64) : i64
%150 = llvm.insertvalue %148, %147[3, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%151 = llvm.insertvalue %149, %150[4, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
llvm.br ^bb4(%8 : i64)
// Loop C header: %152 runs 0..3 in steps of 1 (incremented in ^bb8).
^bb4(%152: i64): // 2 preds: ^bb3, ^bb8
%153 = llvm.icmp "slt" %152, %5 : i64
llvm.cond_br %153, ^bb5, ^bb9
// Loop C body prologue: source offset for one output element,
// %164 = %122 + %152 * 128. A rank-4 descriptor with sizes [1, 1, 32, 4] and
// strides [32768, 128, 4, 1] is assembled (%197) but not used again.
// The reduction loop is then entered with a zeroed accumulator (%4).
^bb5: // pred: ^bb4
%154 = llvm.mlir.undef : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%155 = llvm.insertvalue %13, %154[0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%156 = llvm.insertvalue %13, %155[1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%157 = llvm.mlir.constant(0 : i64) : i64
%158 = llvm.mul %157, %133 : i64
%159 = llvm.add %122, %158 : i64
%160 = llvm.mul %152, %129 : i64
%161 = llvm.add %159, %160 : i64
%162 = llvm.mlir.constant(0 : i64) : i64
%163 = llvm.mul %162, %125 : i64
%164 = llvm.add %161, %163 : i64
%165 = llvm.insertvalue %164, %156[2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%166 = llvm.mlir.constant(128 : i64) : i64
%167 = llvm.mlir.constant(1 : i64) : i64
%168 = llvm.insertvalue %166, %165[3, 2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%169 = llvm.insertvalue %167, %168[4, 2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%170 = llvm.mlir.constant(1 : i64) : i64
%171 = llvm.mlir.constant(128 : i64) : i64
%172 = llvm.insertvalue %170, %169[3, 1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%173 = llvm.insertvalue %171, %172[4, 1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%174 = llvm.mlir.constant(1 : i64) : i64
%175 = llvm.mlir.constant(32768 : i64) : i64
%176 = llvm.insertvalue %174, %173[3, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%177 = llvm.insertvalue %175, %176[4, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%178 = llvm.mlir.undef : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<4 x i64>, array<4 x i64>)>
%179 = llvm.insertvalue %13, %178[0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<4 x i64>, array<4 x i64>)>
%180 = llvm.insertvalue %13, %179[1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<4 x i64>, array<4 x i64>)>
%181 = llvm.insertvalue %164, %180[2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<4 x i64>, array<4 x i64>)>
%182 = llvm.mlir.constant(1 : index) : i64
%183 = llvm.mlir.constant(1 : index) : i64
%184 = llvm.mlir.constant(32 : index) : i64
%185 = llvm.mlir.constant(4 : index) : i64
%186 = llvm.insertvalue %182, %181[3, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<4 x i64>, array<4 x i64>)>
%187 = llvm.insertvalue %183, %186[3, 1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<4 x i64>, array<4 x i64>)>
%188 = llvm.insertvalue %184, %187[3, 2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<4 x i64>, array<4 x i64>)>
%189 = llvm.insertvalue %185, %188[3, 3] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<4 x i64>, array<4 x i64>)>
%190 = llvm.mlir.constant(32768 : index) : i64
%191 = llvm.insertvalue %190, %189[4, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<4 x i64>, array<4 x i64>)>
%192 = llvm.mlir.constant(128 : index) : i64
%193 = llvm.insertvalue %192, %191[4, 1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<4 x i64>, array<4 x i64>)>
%194 = llvm.mlir.constant(4 : index) : i64
%195 = llvm.insertvalue %194, %193[4, 2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<4 x i64>, array<4 x i64>)>
%196 = llvm.mlir.constant(1 : index) : i64
%197 = llvm.insertvalue %196, %195[4, 3] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<4 x i64>, array<4 x i64>)>
llvm.br ^bb6(%8, %4 : i64, !llvm.array<1 x array<1 x vector<4xi32>>>)
// Loop D header (reduction): %198 runs 0..31 in steps of 1; %199 carries the
// 4-lane partial sums between iterations.
^bb6(%198: i64, %199: !llvm.array<1 x array<1 x vector<4xi32>>>): // 2 preds: ^bb5, ^bb7
%200 = llvm.icmp "slt" %198, %7 : i64
llvm.cond_br %200, ^bb7, ^bb8
// Loop D body: load 4 consecutive i32 values at source index %164 + %198 * 4
// (the other address terms multiply by %8 = 0), add them lane by lane to the
// carried accumulator using scalar extract/add/insert, then repack the resulting
// vector (%241) into the array-of-vector type (%262) for the next iteration.
^bb7: // pred: ^bb6
%201 = llvm.mlir.constant(32768 : index) : i64
%202 = llvm.mul %8, %201 : i64
%203 = llvm.add %164, %202 : i64
%204 = llvm.mlir.constant(128 : index) : i64
%205 = llvm.mul %8, %204 : i64
%206 = llvm.add %203, %205 : i64
%207 = llvm.mlir.constant(4 : index) : i64
%208 = llvm.mul %198, %207 : i64
%209 = llvm.add %206, %208 : i64
%210 = llvm.add %209, %8 : i64
%211 = llvm.getelementptr %13[%210] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32>
%212 = llvm.bitcast %211 : !llvm.ptr<i32> to !llvm.ptr<vector<4xi32>>
%213 = llvm.load %212 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>>
// Lane 0: loaded + accumulated.
%214 = llvm.extractvalue %199[0, 0] : !llvm.array<1 x array<1 x vector<4xi32>>>
%215 = llvm.mlir.constant(0 : i64) : i64
%216 = llvm.extractelement %214[%215 : i64] : vector<4xi32>
%217 = llvm.mlir.constant(0 : i64) : i64
%218 = llvm.extractelement %213[%217 : i64] : vector<4xi32>
%219 = llvm.add %218, %216 : i32
%220 = llvm.insertelement %219, %0[%8 : i64] : vector<4xi32>
// Lane 1.
%221 = llvm.extractvalue %199[0, 0] : !llvm.array<1 x array<1 x vector<4xi32>>>
%222 = llvm.mlir.constant(1 : i64) : i64
%223 = llvm.extractelement %221[%222 : i64] : vector<4xi32>
%224 = llvm.mlir.constant(1 : i64) : i64
%225 = llvm.extractelement %213[%224 : i64] : vector<4xi32>
%226 = llvm.add %225, %223 : i32
%227 = llvm.insertelement %226, %220[%6 : i64] : vector<4xi32>
// Lane 2.
%228 = llvm.extractvalue %199[0, 0] : !llvm.array<1 x array<1 x vector<4xi32>>>
%229 = llvm.mlir.constant(2 : i64) : i64
%230 = llvm.extractelement %228[%229 : i64] : vector<4xi32>
%231 = llvm.mlir.constant(2 : i64) : i64
%232 = llvm.extractelement %213[%231 : i64] : vector<4xi32>
%233 = llvm.add %232, %230 : i32
%234 = llvm.insertelement %233, %227[%1 : i64] : vector<4xi32>
// Lane 3; %241 now holds all four updated partial sums.
%235 = llvm.extractvalue %199[0, 0] : !llvm.array<1 x array<1 x vector<4xi32>>>
%236 = llvm.mlir.constant(3 : i64) : i64
%237 = llvm.extractelement %235[%236 : i64] : vector<4xi32>
%238 = llvm.mlir.constant(3 : i64) : i64
%239 = llvm.extractelement %213[%238 : i64] : vector<4xi32>
%240 = llvm.add %239, %237 : i32
%241 = llvm.insertelement %240, %234[%2 : i64] : vector<4xi32>
// Copy %241 element by element into a fresh array value starting from the zero
// constant %4; after the fourth insert, %262 contains exactly the lanes of %241.
%242 = llvm.mlir.constant(0 : i64) : i64
%243 = llvm.extractelement %241[%242 : i64] : vector<4xi32>
%244 = llvm.extractvalue %4[0, 0] : !llvm.array<1 x array<1 x vector<4xi32>>>
%245 = llvm.mlir.constant(0 : i64) : i64
%246 = llvm.insertelement %243, %244[%245 : i64] : vector<4xi32>
%247 = llvm.insertvalue %246, %4[0, 0] : !llvm.array<1 x array<1 x vector<4xi32>>>
%248 = llvm.mlir.constant(1 : i64) : i64
%249 = llvm.extractelement %241[%248 : i64] : vector<4xi32>
%250 = llvm.mlir.constant(1 : i64) : i64
%251 = llvm.insertelement %249, %246[%250 : i64] : vector<4xi32>
%252 = llvm.insertvalue %251, %247[0, 0] : !llvm.array<1 x array<1 x vector<4xi32>>>
%253 = llvm.mlir.constant(2 : i64) : i64
%254 = llvm.extractelement %241[%253 : i64] : vector<4xi32>
%255 = llvm.mlir.constant(2 : i64) : i64
%256 = llvm.insertelement %254, %251[%255 : i64] : vector<4xi32>
%257 = llvm.insertvalue %256, %252[0, 0] : !llvm.array<1 x array<1 x vector<4xi32>>>
%258 = llvm.mlir.constant(3 : i64) : i64
%259 = llvm.extractelement %241[%258 : i64] : vector<4xi32>
%260 = llvm.mlir.constant(3 : i64) : i64
%261 = llvm.insertelement %259, %256[%260 : i64] : vector<4xi32>
%262 = llvm.insertvalue %261, %257[0, 0] : !llvm.array<1 x array<1 x vector<4xi32>>>
%263 = llvm.add %198, %6 : i64
llvm.br ^bb6(%263, %262 : i64, !llvm.array<1 x array<1 x vector<4xi32>>>)
// Loop D exit: read the current destination value at index %142 + %152, add the
// horizontal sum of the accumulator lanes, and store the result to the same index.
// Because the old value is read first, the result depends on what the destination
// held before this kernel ran.
^bb8: // pred: ^bb6
%264 = llvm.mlir.constant(256 : index) : i64
%265 = llvm.mul %8, %264 : i64
%266 = llvm.add %142, %265 : i64
%267 = llvm.add %266, %152 : i64
%268 = llvm.getelementptr %41[%267] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32>
%269 = llvm.load %268 : !llvm.ptr<i32>
%270 = llvm.extractvalue %199[0, 0] : !llvm.array<1 x array<1 x vector<4xi32>>>
%271 = "llvm.intr.vector.reduce.add"(%270) : (vector<4xi32>) -> i32
%272 = llvm.add %269, %271 : i32
%273 = llvm.insertelement %272, %3[%8 : i64] : vector<1xi32>
%274 = llvm.mlir.constant(0 : i64) : i64
%275 = llvm.extractelement %273[%274 : i64] : vector<1xi32>
%276 = llvm.mlir.constant(256 : index) : i64
%277 = llvm.mul %8, %276 : i64
%278 = llvm.add %142, %277 : i64
%279 = llvm.add %278, %152 : i64
%280 = llvm.getelementptr %41[%279] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32>
llvm.store %275, %280 : !llvm.ptr<i32>
%281 = llvm.add %152, %6 : i64
llvm.br ^bb4(%281 : i64)
// Loop B latch: advance %111 by 4.
^bb9: // pred: ^bb4
%282 = llvm.add %111, %5 : i64
llvm.br ^bb2(%282 : i64)
// Loop A latch: advance %109 by 1.
^bb10: // pred: ^bb2
%283 = llvm.add %109, %6 : i64
llvm.br ^bb1(%283 : i64)
// All iterations done: return status 0.
^bb11: // pred: ^bb1
%284 = llvm.mlir.constant(0 : i32) : i32
llvm.return %284 : i32
}
}
}
}
// Host-side body of the test.
//   1. Allocates a 67108864-byte transient resource and fills it with the i32 value 1.
//   2. Hides that resource behind util.do_not_optimize and re-queries its size.
//   3. Allocates a 1048576-byte external resource and, inside one stream.cmd.execute,
//      runs the reduction dispatch concurrently with a fill of the upper 524288 bytes
//      with the i32 value 128.
//   4. Exports the lower and upper halves as 512x256xi32 tensors and compares them
//      with check.expect_eq.
// NOTE(review): the external resource is allocated `uninitialized` and nothing in
// this function writes its lower half before the dispatch runs - confirm that the
// dispatch does not depend on the previous contents of that range.
func.func private @_split_reduction_pass2() {
%zero = arith.constant 0 : index
%unit = arith.constant 1 : index
%extent_256 = arith.constant 256 : index
%extent_512 = arith.constant 512 : index
%half_bytes = arith.constant 524288 : index
%result_bytes = arith.constant 1048576 : index
%input_bytes = arith.constant 67108864 : index
%pattern_one = arith.constant 1 : i32
%pattern_128 = arith.constant 128 : i32
%input_alloc = stream.resource.alloc uninitialized : !stream.resource<transient>{%input_bytes}
%input_filled = stream.cmd.execute with(%input_alloc as %fill_target: !stream.resource<transient>{%input_bytes}) {
stream.cmd.fill %pattern_one, %fill_target[%zero for %input_bytes] : i32 -> !stream.resource<transient>{%input_bytes}
} => !stream.timepoint
%input_ready = stream.timepoint.await %input_filled => %input_alloc : !stream.resource<transient>{%input_bytes}
%input_opaque = util.do_not_optimize(%input_ready) : !stream.resource<transient>
%input_size = stream.resource.size %input_opaque : !stream.resource<transient>
%result_alloc = stream.resource.alloc uninitialized : !stream.resource<external>{%result_bytes}
%result_written = stream.cmd.execute with(%input_opaque as %src: !stream.resource<transient>{%input_size}, %result_alloc as %dst: !stream.resource<external>{%result_bytes}) {
stream.cmd.concurrent {
stream.cmd.dispatch @_split_reduction_pass2_dispatch_0::@_split_reduction_pass2_dispatch_0_generic_512x256x128[%extent_512, %extent_256, %unit] {
ro %src[%zero for %input_size] : !stream.resource<transient>{%input_size},
wo %dst[%zero for %result_bytes] : !stream.resource<external>{%result_bytes}
} attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>]}
stream.cmd.fill %pattern_128, %dst[%half_bytes for %half_bytes] : i32 -> !stream.resource<external>{%result_bytes}
}
} => !stream.timepoint
%result_ready = stream.timepoint.await %result_written => %result_alloc : !stream.resource<external>{%result_bytes}
%lower_half = stream.resource.subview %result_ready[%zero] : !stream.resource<external>{%result_bytes} -> !stream.resource<external>{%half_bytes}
%upper_half = stream.resource.subview %result_ready[%half_bytes] : !stream.resource<external>{%result_bytes} -> !stream.resource<external>{%half_bytes}
%computed = stream.tensor.export %lower_half : tensor<512x256xi32> in !stream.resource<external>{%half_bytes} -> tensor<512x256xi32>
%reference = stream.tensor.export %upper_half : tensor<512x256xi32> in !stream.resource<external>{%half_bytes} -> tensor<512x256xi32>
check.expect_eq(%computed, %reference) : tensor<512x256xi32>
return
}
}
// -----// IR Dump After LLVMCPUSynchronizeSymbolVisibility (iree-llvmcpu-synchronize-symbol-visibility) ('builtin.module' operation) //----- //
// Attribute aliases shared by the module below.
// #device_target_llvm_cpu: an "llvm-cpu" device target carrying one executable
// target ("embedded-elf-x86_64", native_vector_size = 16) plus the legacy_sync flag.
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
// #executable_target_embedded_elf_x86_64_: the same executable target on its own;
// the hal.executable.variant below refers to it.
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
// #pipeline_layout: no push constants; a single set 0 with two storage-buffer
// bindings, of which binding 0 is marked ReadOnly and binding 1 is not.
#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>
// #translation: codegen translation info naming the CPUDoubleTilingExpert pipeline.
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
// Public entry point tagged with `iree.abi.stub`. It takes no arguments, returns
// nothing, and only forwards to the private @_split_reduction_pass2 function.
func.func @split_reduction_pass2() attributes {iree.abi.stub} {
call @_split_reduction_pass2() : () -> ()
return
}
hal.executable private @_split_reduction_pass2_dispatch_0 {
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
// Export record for entry point ordinal 0, using #pipeline_layout and #translation.
// The region receives a device and three index arguments, ignores all of them, and
// always returns the constant triple (8, 16, 1).
// NOTE(review): presumably this triple is the workgroup count along x, y, z -- confirm against the HAL dialect reference.
hal.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 ordinal(0) layout(#pipeline_layout) attributes {translation_info = #translation} {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index):
%c1 = arith.constant 1 : index
%c16 = arith.constant 16 : index
%c8 = arith.constant 8 : index
hal.return %c8, %c16, %c1 : index, index, index
}
builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} {
// Dispatch body in LLVM dialect, as printed before the canonicalize/CSE dumps that
// follow (constants and address arithmetic are still duplicated here).
//
// Arguments: %arg0 is an environment struct pointer (never read in this body),
// %arg1 is the dispatch-state struct pointer, %arg2 is the workgroup-state struct
// pointer. Always returns i32 0.
//
// Visible effect: with src = entry 0 and dst = entry 1 of the pointer array stored in
// field 10 of the dispatch state, and (r0, c0) = 32 * (workgroup-state field 1, field 0),
// every element of a 32x32 tile is updated as
//   dst[(r0 + i) * 256 + (c0 + j)] += sum_{k = 0..127} src[(r0 + i) * 32768 + (c0 + j) * 128 + k]
// NOTE(review): the destination is read and accumulated into (see ^bb8), not overwritten,
// so the result depends on what the destination held before the dispatch ran.
llvm.func internal @_split_reduction_pass2_dispatch_0_generic_512x256x128(%arg0: !llvm.ptr<struct<"iree_hal_executable_environment_v0_t", (ptr<i32>, ptr<func<i32 (ptr<func<i32 (ptr<i8>)>>, ptr<i8>)>>, ptr<ptr<func<i32 (ptr<i8>)>>>, struct<"iree_hal_processor_v0_t", (array<8 x i64>)>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>> {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {sym_visibility = "private"} {
// Constants used by the loops below: %5 = 4, %6 = 1, %7 = 32, %8 = 0. %4 is the all-zero seed of the vector accumulator.
%0 = llvm.mlir.constant(dense<0> : vector<4xi32>) : vector<4xi32>
%1 = llvm.mlir.constant(2 : index) : i64
%2 = llvm.mlir.constant(3 : index) : i64
%3 = llvm.mlir.constant(dense<0> : vector<1xi32>) : vector<1xi32>
%4 = llvm.mlir.constant(dense<0> : vector<1x1x4xi32>) : !llvm.array<1 x array<1 x vector<4xi32>>>
%5 = llvm.mlir.constant(4 : index) : i64
%6 = llvm.mlir.constant(1 : index) : i64
%7 = llvm.mlir.constant(32 : index) : i64
%8 = llvm.mlir.constant(0 : index) : i64
// Source buffer: load the dispatch state, take struct field 10 (a ptr<ptr<i8>>), load its first entry and view it as i32* (%13).
%9 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>>
%10 = llvm.extractvalue %9[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>
%11 = llvm.mlir.constant(0 : i64) : i64
%12 = llvm.load %10 : !llvm.ptr<ptr<i8>>
%13 = llvm.bitcast %12 : !llvm.ptr<i8> to !llvm.ptr<i32>
// Descriptor-shaped struct (two pointers, an i64, two i64 arrays) for %13 with values [512, 256, 128] and [32768, 128, 1].
// The finished struct (%30) is not read by any later operation in this function; only the scalar constants are reused.
%14 = llvm.mlir.undef : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%15 = llvm.insertvalue %13, %14[0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%16 = llvm.insertvalue %13, %15[1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%17 = llvm.mlir.constant(0 : index) : i64
%18 = llvm.insertvalue %17, %16[2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%19 = llvm.mlir.constant(512 : index) : i64
%20 = llvm.insertvalue %19, %18[3, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%21 = llvm.mlir.constant(32768 : index) : i64
%22 = llvm.insertvalue %21, %20[4, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%23 = llvm.mlir.constant(256 : index) : i64
%24 = llvm.insertvalue %23, %22[3, 1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%25 = llvm.mlir.constant(128 : index) : i64
%26 = llvm.insertvalue %25, %24[4, 1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%27 = llvm.mlir.constant(128 : index) : i64
%28 = llvm.insertvalue %27, %26[3, 2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%29 = llvm.mlir.constant(1 : index) : i64
%30 = llvm.insertvalue %29, %28[4, 2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
// Assume the %13 address has its low six bits clear (address & 63 == 0).
%31 = llvm.mlir.constant(0 : index) : i64
%32 = llvm.mlir.constant(63 : index) : i64
%33 = llvm.ptrtoint %13 : !llvm.ptr<i32> to i64
%34 = llvm.and %33, %32 : i64
%35 = llvm.icmp "eq" %34, %31 : i64
"llvm.intr.assume"(%35) : (i1) -> ()
// Destination buffer: same dispatch-state load, but entry [1] of the pointer array, viewed as i32* (%41).
%36 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>>
%37 = llvm.extractvalue %36[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>
%38 = llvm.mlir.constant(1 : i64) : i64
%39 = llvm.getelementptr %37[1] : (!llvm.ptr<ptr<i8>>) -> !llvm.ptr<ptr<i8>>
%40 = llvm.load %39 : !llvm.ptr<ptr<i8>>
%41 = llvm.bitcast %40 : !llvm.ptr<i8> to !llvm.ptr<i32>
// Descriptor-shaped struct for %41 with values [512, 256] and [256, 1]; the finished struct (%54) is not read later either.
%42 = llvm.mlir.undef : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%43 = llvm.insertvalue %41, %42[0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%44 = llvm.insertvalue %41, %43[1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%45 = llvm.mlir.constant(0 : index) : i64
%46 = llvm.insertvalue %45, %44[2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%47 = llvm.mlir.constant(512 : index) : i64
%48 = llvm.insertvalue %47, %46[3, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%49 = llvm.mlir.constant(256 : index) : i64
%50 = llvm.insertvalue %49, %48[4, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%51 = llvm.mlir.constant(256 : index) : i64
%52 = llvm.insertvalue %51, %50[3, 1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%53 = llvm.mlir.constant(1 : index) : i64
%54 = llvm.insertvalue %53, %52[4, 1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
// Same low-six-bits-clear assumption for the %41 address.
%55 = llvm.mlir.constant(0 : index) : i64
%56 = llvm.mlir.constant(63 : index) : i64
%57 = llvm.ptrtoint %41 : !llvm.ptr<i32> to i64
%58 = llvm.and %57, %56 : i64
%59 = llvm.icmp "eq" %58, %55 : i64
"llvm.intr.assume"(%59) : (i1) -> ()
// Workgroup state: field 0 -> %62 and field 1 -> %65, both zero-extended to i64.
// %67 = %65 * 32 is this workgroup's row base and %69 = %62 * 32 its column base (see the strides they are multiplied by below).
%60 = llvm.load %arg2 : !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>>
%61 = llvm.extractvalue %60[0] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>
%62 = llvm.zext %61 : i32 to i64
%63 = llvm.load %arg2 : !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>>
%64 = llvm.extractvalue %63[1] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>
%65 = llvm.zext %64 : i32 to i64
%66 = llvm.mlir.constant(32 : index) : i64
%67 = llvm.mul %65, %66 : i64
%68 = llvm.mlir.constant(32 : index) : i64
%69 = llvm.mul %62, %68 : i64
// Destination tile offset in i32 elements: %76 = %67 * 256 + %69. The 32x32 descriptor built around it (%85) is not read later.
%70 = llvm.mlir.undef : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%71 = llvm.insertvalue %41, %70[0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%72 = llvm.insertvalue %41, %71[1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%73 = llvm.mul %67, %49 : i64
%74 = llvm.add %45, %73 : i64
%75 = llvm.mul %69, %53 : i64
%76 = llvm.add %74, %75 : i64
%77 = llvm.insertvalue %76, %72[2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%78 = llvm.mlir.constant(32 : i64) : i64
%79 = llvm.mlir.constant(1 : i64) : i64
%80 = llvm.insertvalue %78, %77[3, 1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%81 = llvm.insertvalue %79, %80[4, 1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%82 = llvm.mlir.constant(32 : i64) : i64
%83 = llvm.mlir.constant(256 : i64) : i64
%84 = llvm.insertvalue %82, %81[3, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%85 = llvm.insertvalue %83, %84[4, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
// Source tile offset in i32 elements: %95 = %67 * 32768 + %69 * 128 (+ 0). The 32x32x128 descriptor (%108) is not read later.
%86 = llvm.mlir.undef : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%87 = llvm.insertvalue %13, %86[0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%88 = llvm.insertvalue %13, %87[1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%89 = llvm.mul %67, %21 : i64
%90 = llvm.add %17, %89 : i64
%91 = llvm.mul %69, %25 : i64
%92 = llvm.add %90, %91 : i64
%93 = llvm.mlir.constant(0 : i64) : i64
%94 = llvm.mul %93, %29 : i64
%95 = llvm.add %92, %94 : i64
%96 = llvm.insertvalue %95, %88[2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%97 = llvm.mlir.constant(128 : i64) : i64
%98 = llvm.mlir.constant(1 : i64) : i64
%99 = llvm.insertvalue %97, %96[3, 2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%100 = llvm.insertvalue %98, %99[4, 2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%101 = llvm.mlir.constant(32 : i64) : i64
%102 = llvm.mlir.constant(128 : i64) : i64
%103 = llvm.insertvalue %101, %100[3, 1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%104 = llvm.insertvalue %102, %103[4, 1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%105 = llvm.mlir.constant(32 : i64) : i64
%106 = llvm.mlir.constant(32768 : i64) : i64
%107 = llvm.insertvalue %105, %104[3, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%108 = llvm.insertvalue %106, %107[4, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
// Loop 1 (^bb1 header, ^bb10 latch): %109 = 0, 1, ..., 31 -- row inside the tile.
llvm.br ^bb1(%8 : i64)
^bb1(%109: i64): // 2 preds: ^bb0, ^bb10
%110 = llvm.icmp "slt" %109, %7 : i64
llvm.cond_br %110, ^bb2(%8 : i64), ^bb11
// Loop 2 (^bb2 header, ^bb9 latch): %111 = 0, 4, ..., 28 -- column inside the tile, four columns per iteration.
^bb2(%111: i64): // 2 preds: ^bb1, ^bb9
%112 = llvm.icmp "slt" %111, %7 : i64
llvm.cond_br %112, ^bb3, ^bb10
^bb3: // pred: ^bb2
// Source offset for this row and 4-column group: %122 = %95 + %109 * 32768 + %111 * 128 (+ 0).
// The descriptor built around it (%135) is not read later.
%113 = llvm.mlir.undef : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%114 = llvm.insertvalue %13, %113[0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%115 = llvm.insertvalue %13, %114[1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%116 = llvm.mul %109, %106 : i64
%117 = llvm.add %95, %116 : i64
%118 = llvm.mul %111, %102 : i64
%119 = llvm.add %117, %118 : i64
%120 = llvm.mlir.constant(0 : i64) : i64
%121 = llvm.mul %120, %98 : i64
%122 = llvm.add %119, %121 : i64
%123 = llvm.insertvalue %122, %115[2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%124 = llvm.mlir.constant(128 : i64) : i64
%125 = llvm.mlir.constant(1 : i64) : i64
%126 = llvm.insertvalue %124, %123[3, 2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%127 = llvm.insertvalue %125, %126[4, 2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%128 = llvm.mlir.constant(4 : i64) : i64
%129 = llvm.mlir.constant(128 : i64) : i64
%130 = llvm.insertvalue %128, %127[3, 1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%131 = llvm.insertvalue %129, %130[4, 1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%132 = llvm.mlir.constant(1 : i64) : i64
%133 = llvm.mlir.constant(32768 : i64) : i64
%134 = llvm.insertvalue %132, %131[3, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%135 = llvm.insertvalue %133, %134[4, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
// Destination offset for this row and 4-column group: %142 = %76 + %109 * 256 + %111. Descriptor %151 is not read later.
%136 = llvm.mlir.undef : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%137 = llvm.insertvalue %41, %136[0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%138 = llvm.insertvalue %41, %137[1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%139 = llvm.mul %109, %83 : i64
%140 = llvm.add %76, %139 : i64
%141 = llvm.mul %111, %79 : i64
%142 = llvm.add %140, %141 : i64
%143 = llvm.insertvalue %142, %138[2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%144 = llvm.mlir.constant(4 : i64) : i64
%145 = llvm.mlir.constant(1 : i64) : i64
%146 = llvm.insertvalue %144, %143[3, 1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%147 = llvm.insertvalue %145, %146[4, 1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%148 = llvm.mlir.constant(1 : i64) : i64
%149 = llvm.mlir.constant(256 : i64) : i64
%150 = llvm.insertvalue %148, %147[3, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%151 = llvm.insertvalue %149, %150[4, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
// Loop 3 (^bb4 header, ^bb8 latch): %152 = 0..3 -- one column of the current 4-column group.
llvm.br ^bb4(%8 : i64)
^bb4(%152: i64): // 2 preds: ^bb3, ^bb8
%153 = llvm.icmp "slt" %152, %5 : i64
llvm.cond_br %153, ^bb5, ^bb9
^bb5: // pred: ^bb4
// Source offset of the 128-element run feeding this output element: %164 = %122 + %152 * 128 (the terms multiplied by constant 0 vanish).
// Neither descriptor built in this block (%177, %197) is read later.
%154 = llvm.mlir.undef : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%155 = llvm.insertvalue %13, %154[0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%156 = llvm.insertvalue %13, %155[1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%157 = llvm.mlir.constant(0 : i64) : i64
%158 = llvm.mul %157, %133 : i64
%159 = llvm.add %122, %158 : i64
%160 = llvm.mul %152, %129 : i64
%161 = llvm.add %159, %160 : i64
%162 = llvm.mlir.constant(0 : i64) : i64
%163 = llvm.mul %162, %125 : i64
%164 = llvm.add %161, %163 : i64
%165 = llvm.insertvalue %164, %156[2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%166 = llvm.mlir.constant(128 : i64) : i64
%167 = llvm.mlir.constant(1 : i64) : i64
%168 = llvm.insertvalue %166, %165[3, 2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%169 = llvm.insertvalue %167, %168[4, 2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%170 = llvm.mlir.constant(1 : i64) : i64
%171 = llvm.mlir.constant(128 : i64) : i64
%172 = llvm.insertvalue %170, %169[3, 1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%173 = llvm.insertvalue %171, %172[4, 1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%174 = llvm.mlir.constant(1 : i64) : i64
%175 = llvm.mlir.constant(32768 : i64) : i64
%176 = llvm.insertvalue %174, %173[3, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%177 = llvm.insertvalue %175, %176[4, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
// Rank-4 descriptor with values [1, 1, 32, 4] and [32768, 128, 4, 1] at the same offset %164.
%178 = llvm.mlir.undef : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<4 x i64>, array<4 x i64>)>
%179 = llvm.insertvalue %13, %178[0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<4 x i64>, array<4 x i64>)>
%180 = llvm.insertvalue %13, %179[1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<4 x i64>, array<4 x i64>)>
%181 = llvm.insertvalue %164, %180[2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<4 x i64>, array<4 x i64>)>
%182 = llvm.mlir.constant(1 : index) : i64
%183 = llvm.mlir.constant(1 : index) : i64
%184 = llvm.mlir.constant(32 : index) : i64
%185 = llvm.mlir.constant(4 : index) : i64
%186 = llvm.insertvalue %182, %181[3, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<4 x i64>, array<4 x i64>)>
%187 = llvm.insertvalue %183, %186[3, 1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<4 x i64>, array<4 x i64>)>
%188 = llvm.insertvalue %184, %187[3, 2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<4 x i64>, array<4 x i64>)>
%189 = llvm.insertvalue %185, %188[3, 3] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<4 x i64>, array<4 x i64>)>
%190 = llvm.mlir.constant(32768 : index) : i64
%191 = llvm.insertvalue %190, %189[4, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<4 x i64>, array<4 x i64>)>
%192 = llvm.mlir.constant(128 : index) : i64
%193 = llvm.insertvalue %192, %191[4, 1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<4 x i64>, array<4 x i64>)>
%194 = llvm.mlir.constant(4 : index) : i64
%195 = llvm.insertvalue %194, %193[4, 2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<4 x i64>, array<4 x i64>)>
%196 = llvm.mlir.constant(1 : index) : i64
%197 = llvm.insertvalue %196, %195[4, 3] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<4 x i64>, array<4 x i64>)>
// Loop 4 (^bb6 header, ^bb7 body): %198 = 0..31, carrying a vector<4xi32> accumulator (%199) seeded with zeros (%4).
llvm.br ^bb6(%8, %4 : i64, !llvm.array<1 x array<1 x vector<4xi32>>>)
^bb6(%198: i64, %199: !llvm.array<1 x array<1 x vector<4xi32>>>): // 2 preds: ^bb5, ^bb7
%200 = llvm.icmp "slt" %198, %7 : i64
llvm.cond_br %200, ^bb7, ^bb8
^bb7: // pred: ^bb6
// Load four consecutive i32 values from %13 at element offset %164 + %198 * 4 (every term multiplied by or equal to %8 = 0 drops out).
%201 = llvm.mlir.constant(32768 : index) : i64
%202 = llvm.mul %8, %201 : i64
%203 = llvm.add %164, %202 : i64
%204 = llvm.mlir.constant(128 : index) : i64
%205 = llvm.mul %8, %204 : i64
%206 = llvm.add %203, %205 : i64
%207 = llvm.mlir.constant(4 : index) : i64
%208 = llvm.mul %198, %207 : i64
%209 = llvm.add %206, %208 : i64
%210 = llvm.add %209, %8 : i64
%211 = llvm.getelementptr %13[%210] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32>
%212 = llvm.bitcast %211 : !llvm.ptr<i32> to !llvm.ptr<vector<4xi32>>
%213 = llvm.load %212 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>>
// Lane-by-lane add of the loaded vector (%213) and the accumulator (%199); the four sums are collected in %241.
%214 = llvm.extractvalue %199[0, 0] : !llvm.array<1 x array<1 x vector<4xi32>>>
%215 = llvm.mlir.constant(0 : i64) : i64
%216 = llvm.extractelement %214[%215 : i64] : vector<4xi32>
%217 = llvm.mlir.constant(0 : i64) : i64
%218 = llvm.extractelement %213[%217 : i64] : vector<4xi32>
%219 = llvm.add %218, %216 : i32
%220 = llvm.insertelement %219, %0[%8 : i64] : vector<4xi32>
%221 = llvm.extractvalue %199[0, 0] : !llvm.array<1 x array<1 x vector<4xi32>>>
%222 = llvm.mlir.constant(1 : i64) : i64
%223 = llvm.extractelement %221[%222 : i64] : vector<4xi32>
%224 = llvm.mlir.constant(1 : i64) : i64
%225 = llvm.extractelement %213[%224 : i64] : vector<4xi32>
%226 = llvm.add %225, %223 : i32
%227 = llvm.insertelement %226, %220[%6 : i64] : vector<4xi32>
%228 = llvm.extractvalue %199[0, 0] : !llvm.array<1 x array<1 x vector<4xi32>>>
%229 = llvm.mlir.constant(2 : i64) : i64
%230 = llvm.extractelement %228[%229 : i64] : vector<4xi32>
%231 = llvm.mlir.constant(2 : i64) : i64
%232 = llvm.extractelement %213[%231 : i64] : vector<4xi32>
%233 = llvm.add %232, %230 : i32
%234 = llvm.insertelement %233, %227[%1 : i64] : vector<4xi32>
%235 = llvm.extractvalue %199[0, 0] : !llvm.array<1 x array<1 x vector<4xi32>>>
%236 = llvm.mlir.constant(3 : i64) : i64
%237 = llvm.extractelement %235[%236 : i64] : vector<4xi32>
%238 = llvm.mlir.constant(3 : i64) : i64
%239 = llvm.extractelement %213[%238 : i64] : vector<4xi32>
%240 = llvm.add %239, %237 : i32
%241 = llvm.insertelement %240, %234[%2 : i64] : vector<4xi32>
// Copy the four lanes of %241 back into the array-of-vector form carried by the loop; %262 is the next accumulator.
%242 = llvm.mlir.constant(0 : i64) : i64
%243 = llvm.extractelement %241[%242 : i64] : vector<4xi32>
%244 = llvm.extractvalue %4[0, 0] : !llvm.array<1 x array<1 x vector<4xi32>>>
%245 = llvm.mlir.constant(0 : i64) : i64
%246 = llvm.insertelement %243, %244[%245 : i64] : vector<4xi32>
%247 = llvm.insertvalue %246, %4[0, 0] : !llvm.array<1 x array<1 x vector<4xi32>>>
%248 = llvm.mlir.constant(1 : i64) : i64
%249 = llvm.extractelement %241[%248 : i64] : vector<4xi32>
%250 = llvm.mlir.constant(1 : i64) : i64
%251 = llvm.insertelement %249, %246[%250 : i64] : vector<4xi32>
%252 = llvm.insertvalue %251, %247[0, 0] : !llvm.array<1 x array<1 x vector<4xi32>>>
%253 = llvm.mlir.constant(2 : i64) : i64
%254 = llvm.extractelement %241[%253 : i64] : vector<4xi32>
%255 = llvm.mlir.constant(2 : i64) : i64
%256 = llvm.insertelement %254, %251[%255 : i64] : vector<4xi32>
%257 = llvm.insertvalue %256, %252[0, 0] : !llvm.array<1 x array<1 x vector<4xi32>>>
%258 = llvm.mlir.constant(3 : i64) : i64
%259 = llvm.extractelement %241[%258 : i64] : vector<4xi32>
%260 = llvm.mlir.constant(3 : i64) : i64
%261 = llvm.insertelement %259, %256[%260 : i64] : vector<4xi32>
%262 = llvm.insertvalue %261, %257[0, 0] : !llvm.array<1 x array<1 x vector<4xi32>>>
%263 = llvm.add %198, %6 : i64
llvm.br ^bb6(%263, %262 : i64, !llvm.array<1 x array<1 x vector<4xi32>>>)
^bb8: // pred: ^bb6
// Destination element address: %41[%142 + %152] (the %8 * 256 term is zero).
%264 = llvm.mlir.constant(256 : index) : i64
%265 = llvm.mul %8, %264 : i64
%266 = llvm.add %142, %265 : i64
%267 = llvm.add %266, %152 : i64
%268 = llvm.getelementptr %41[%267] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32>
// NOTE(review): the current destination value is loaded and used as the starting value of the sum.
%269 = llvm.load %268 : !llvm.ptr<i32>
// Horizontal add of the four accumulator lanes, then add the value already in the destination.
%270 = llvm.extractvalue %199[0, 0] : !llvm.array<1 x array<1 x vector<4xi32>>>
%271 = "llvm.intr.vector.reduce.add"(%270) : (vector<4xi32>) -> i32
%272 = llvm.add %269, %271 : i32
// Round-trip through a vector<1xi32>; %275 carries the same value as %272.
%273 = llvm.insertelement %272, %3[%8 : i64] : vector<1xi32>
%274 = llvm.mlir.constant(0 : i64) : i64
%275 = llvm.extractelement %273[%274 : i64] : vector<1xi32>
// Store to the same destination element (address recomputed as %280).
%276 = llvm.mlir.constant(256 : index) : i64
%277 = llvm.mul %8, %276 : i64
%278 = llvm.add %142, %277 : i64
%279 = llvm.add %278, %152 : i64
%280 = llvm.getelementptr %41[%279] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32>
llvm.store %275, %280 : !llvm.ptr<i32>
%281 = llvm.add %152, %6 : i64
llvm.br ^bb4(%281 : i64)
^bb9: // pred: ^bb4
// Latch of loop 2: advance the column by 4.
%282 = llvm.add %111, %5 : i64
llvm.br ^bb2(%282 : i64)
^bb10: // pred: ^bb2
// Latch of loop 1: advance the row by 1.
%283 = llvm.add %109, %6 : i64
llvm.br ^bb1(%283 : i64)
^bb11: // pred: ^bb1
// All rows done: return status 0.
%284 = llvm.mlir.constant(0 : i32) : i32
llvm.return %284 : i32
}
}
}
}
// Private test body at the stream-dialect level.
// Steps visible below: fill a transient resource with i32 1, run the dispatch with that
// resource as `ro` input and an external resource as `wo` output, fill the upper half of
// the external resource with i32 128, then compare the lower and upper halves as
// tensor<512x256xi32> with check.expect_eq.
// Size arithmetic: 67108864 = 512 * 256 * 128 * 4, 524288 = 512 * 256 * 4, 1048576 = 2 * 524288
// (presumably byte sizes of the i32 tensors -- confirm).
// NOTE(review): %5 is allocated `uninitialized` and nothing in this function writes the range
// [0, 524288) before the dispatch, while the dispatched llvm.func in this module loads each
// destination element and adds to it. The expected value 128 therefore appears to rely on the
// allocation starting out as zeros -- confirm against the original test input.
// NOTE(review): the `wo` binding spans [0, 1048576), which overlaps the [524288, 1048576) range
// written by the fill in the same stream.cmd.concurrent region -- confirm this is intended.
func.func private @_split_reduction_pass2() {
%c1048576 = arith.constant 1048576 : index
%c0 = arith.constant 0 : index
%c524288 = arith.constant 524288 : index
%c67108864 = arith.constant 67108864 : index
%c1_i32 = arith.constant 1 : i32
%c128_i32 = arith.constant 128 : i32
%c512 = arith.constant 512 : index
%c256 = arith.constant 256 : index
%c1 = arith.constant 1 : index
// Input: transient resource of size 67108864, filled entirely with the i32 value 1.
%0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c67108864}
%1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c67108864}) {
stream.cmd.fill %c1_i32, %arg0[%c0 for %c67108864] : i32 -> !stream.resource<transient>{%c67108864}
} => !stream.timepoint
%2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c67108864}
// The input goes through util.do_not_optimize, so its size is re-queried (%4) rather than reusing the constant.
%3 = util.do_not_optimize(%2) : !stream.resource<transient>
%4 = stream.resource.size %3 : !stream.resource<transient>
// Output: one external resource of size 1048576 shared by the dispatch result and the reference fill.
%5 = stream.resource.alloc uninitialized : !stream.resource<external>{%c1048576}
%6 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%4}, %5 as %arg1: !stream.resource<external>{%c1048576}) {
stream.cmd.concurrent {
// Dispatch with workload [512, 256, 1]; bindings are (set 0, binding 0) = %arg0 and (set 0, binding 1) = %arg1.
stream.cmd.dispatch @_split_reduction_pass2_dispatch_0::@_split_reduction_pass2_dispatch_0_generic_512x256x128[%c512, %c256, %c1] {
ro %arg0[%c0 for %4] : !stream.resource<transient>{%4},
wo %arg1[%c0 for %c1048576] : !stream.resource<external>{%c1048576}
} attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>]}
// Reference values: i32 128 written to the upper half [524288, 1048576).
stream.cmd.fill %c128_i32, %arg1[%c524288 for %c524288] : i32 -> !stream.resource<external>{%c1048576}
}
} => !stream.timepoint
%7 = stream.timepoint.await %6 => %5 : !stream.resource<external>{%c1048576}
// Split the resource into its two halves and export each as tensor<512x256xi32>.
%8 = stream.resource.subview %7[%c0] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288}
%9 = stream.resource.subview %7[%c524288] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288}
%10 = stream.tensor.export %8 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
%11 = stream.tensor.export %9 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
// %10 is the half starting at offset 0, %11 the half filled with 128.
check.expect_eq(%10, %11) : tensor<512x256xi32>
return
}
}
// -----// IR Dump After Canonicalizer (canonicalize) ('builtin.module' operation) //----- //
// Attribute aliases referenced by the module that follows.
// #device_target_llvm_cpu: the "llvm-cpu" device target; it lists one executable target and carries the `legacy_sync` flag.
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
// #executable_target_embedded_elf_x86_64_: the "embedded-elf-x86_64" executable target, with the same dictionary as the entry embedded in the device target above.
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
// #pipeline_layout: no push constants; one set (0) holding two storage_buffer bindings, of which only binding 0 is marked ReadOnly.
#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>
// #translation: translation info naming CPUDoubleTilingExpert.
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
// Public entry point tagged with `iree.abi.stub`. It takes no arguments, returns
// nothing, and only forwards to the private @_split_reduction_pass2 function.
func.func @split_reduction_pass2() attributes {iree.abi.stub} {
call @_split_reduction_pass2() : () -> ()
return
}
hal.executable private @_split_reduction_pass2_dispatch_0 {
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
// Export record for entry point ordinal 0, using #pipeline_layout and #translation.
// The region receives a device and three index arguments, ignores all of them, and
// always returns the constant triple (8, 16, 1).
// NOTE(review): presumably this triple is the workgroup count along x, y, z -- confirm against the HAL dialect reference.
hal.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 ordinal(0) layout(#pipeline_layout) attributes {translation_info = #translation} {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index):
%c1 = arith.constant 1 : index
%c16 = arith.constant 16 : index
%c8 = arith.constant 8 : index
hal.return %c8, %c16, %c1 : index, index, index
}
builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} {
// Dispatch body in LLVM dialect as printed after the canonicalizer: constants are hoisted
// to the top and no descriptor structs are built, only scalar offset arithmetic remains.
//
// Arguments: %arg0 is an environment struct pointer (never read in this body),
// %arg1 is the dispatch-state struct pointer, %arg2 is the workgroup-state struct
// pointer. Always returns i32 0.
//
// Visible effect: with src = entry 0 and dst = entry 1 of the pointer array stored in
// field 10 of the dispatch state, and (r0, c0) = 32 * (workgroup-state field 1, field 0),
// every element of a 32x32 tile is updated as
//   dst[(r0 + i) * 256 + (c0 + j)] += sum_{k = 0..127} src[(r0 + i) * 32768 + (c0 + j) * 128 + k]
// NOTE(review): the destination is read and accumulated into (see ^bb8), not overwritten,
// so the result depends on what the destination held before the dispatch ran.
llvm.func internal @_split_reduction_pass2_dispatch_0_generic_512x256x128(%arg0: !llvm.ptr<struct<"iree_hal_executable_environment_v0_t", (ptr<i32>, ptr<func<i32 (ptr<func<i32 (ptr<i8>)>>, ptr<i8>)>>, ptr<ptr<func<i32 (ptr<i8>)>>>, struct<"iree_hal_processor_v0_t", (array<8 x i64>)>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>> {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {sym_visibility = "private"} {
// Hoisted constants. Several values appear twice because they were printed from different
// attribute types (`i64` vs `index`), e.g. %4/%8 = 128, %5/%9 = 256, %3/%10 = 32768.
// Loop constants: %17 = 4, %18 = 1, %19 = 32, %20 = 0. %16 is the all-zero accumulator seed.
%0 = llvm.mlir.constant(0 : i32) : i32
%1 = llvm.mlir.constant(3 : i64) : i64
%2 = llvm.mlir.constant(2 : i64) : i64
%3 = llvm.mlir.constant(32768 : i64) : i64
%4 = llvm.mlir.constant(128 : i64) : i64
%5 = llvm.mlir.constant(256 : i64) : i64
%6 = llvm.mlir.constant(1 : i64) : i64
%7 = llvm.mlir.constant(63 : index) : i64
%8 = llvm.mlir.constant(128 : index) : i64
%9 = llvm.mlir.constant(256 : index) : i64
%10 = llvm.mlir.constant(32768 : index) : i64
%11 = llvm.mlir.constant(0 : i64) : i64
%12 = llvm.mlir.constant(dense<0> : vector<4xi32>) : vector<4xi32>
%13 = llvm.mlir.constant(2 : index) : i64
%14 = llvm.mlir.constant(3 : index) : i64
%15 = llvm.mlir.constant(dense<0> : vector<1xi32>) : vector<1xi32>
%16 = llvm.mlir.constant(dense<0> : vector<1x1x4xi32>) : !llvm.array<1 x array<1 x vector<4xi32>>>
%17 = llvm.mlir.constant(4 : index) : i64
%18 = llvm.mlir.constant(1 : index) : i64
%19 = llvm.mlir.constant(32 : index) : i64
%20 = llvm.mlir.constant(0 : index) : i64
// Source buffer: first entry of the pointer array in dispatch-state field 10, viewed as i32* (%24);
// its address is assumed to have the low six bits clear (address & 63 == 0).
%21 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>>
%22 = llvm.extractvalue %21[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>
%23 = llvm.load %22 : !llvm.ptr<ptr<i8>>
%24 = llvm.bitcast %23 : !llvm.ptr<i8> to !llvm.ptr<i32>
%25 = llvm.ptrtoint %24 : !llvm.ptr<i32> to i64
%26 = llvm.and %25, %7 : i64
%27 = llvm.icmp "eq" %26, %20 : i64
"llvm.intr.assume"(%27) : (i1) -> ()
// Destination buffer: entry [1] of the same pointer array, viewed as i32* (%32), with the same address assumption.
%28 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>>
%29 = llvm.extractvalue %28[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>
%30 = llvm.getelementptr %29[1] : (!llvm.ptr<ptr<i8>>) -> !llvm.ptr<ptr<i8>>
%31 = llvm.load %30 : !llvm.ptr<ptr<i8>>
%32 = llvm.bitcast %31 : !llvm.ptr<i8> to !llvm.ptr<i32>
%33 = llvm.ptrtoint %32 : !llvm.ptr<i32> to i64
%34 = llvm.and %33, %7 : i64
%35 = llvm.icmp "eq" %34, %20 : i64
"llvm.intr.assume"(%35) : (i1) -> ()
// Workgroup state: field 0 -> %38 and field 1 -> %41, both zero-extended to i64.
// %42 = %41 * 32 is the row base and %43 = %38 * 32 the column base of this workgroup's tile.
%36 = llvm.load %arg2 : !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>>
%37 = llvm.extractvalue %36[0] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>
%38 = llvm.zext %37 : i32 to i64
%39 = llvm.load %arg2 : !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>>
%40 = llvm.extractvalue %39[1] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>
%41 = llvm.zext %40 : i32 to i64
%42 = llvm.mul %41, %19 : i64
%43 = llvm.mul %38, %19 : i64
// Destination tile offset in i32 elements: %47 = %42 * 256 + %43.
%44 = llvm.mul %42, %9 : i64
%45 = llvm.add %44, %20 : i64
%46 = llvm.mul %43, %18 : i64
%47 = llvm.add %45, %46 : i64
// Source tile offset in i32 elements: %53 = %42 * 32768 + %43 * 128 (+ 0).
%48 = llvm.mul %42, %10 : i64
%49 = llvm.add %48, %20 : i64
%50 = llvm.mul %43, %8 : i64
%51 = llvm.add %49, %50 : i64
%52 = llvm.mul %11, %18 : i64
%53 = llvm.add %51, %52 : i64
// Loop 1 (^bb1 header, ^bb10 latch): %54 = 0, 1, ..., 31 -- row inside the tile.
llvm.br ^bb1(%20 : i64)
^bb1(%54: i64): // 2 preds: ^bb0, ^bb10
%55 = llvm.icmp "slt" %54, %19 : i64
llvm.cond_br %55, ^bb2(%20 : i64), ^bb11
// Loop 2 (^bb2 header, ^bb9 latch): %56 = 0, 4, ..., 28 -- column inside the tile, four columns per iteration.
^bb2(%56: i64): // 2 preds: ^bb1, ^bb9
%57 = llvm.icmp "slt" %56, %19 : i64
llvm.cond_br %57, ^bb3, ^bb10
^bb3: // pred: ^bb2
// Source offset for this row and 4-column group: %63 = %53 + %54 * 32768 + %56 * 128 (+ 0).
%58 = llvm.mul %54, %3 : i64
%59 = llvm.add %53, %58 : i64
%60 = llvm.mul %56, %4 : i64
%61 = llvm.add %59, %60 : i64
%62 = llvm.mul %11, %6 : i64
%63 = llvm.add %61, %62 : i64
// Destination offset for this row and 4-column group: %67 = %47 + %54 * 256 + %56.
%64 = llvm.mul %54, %5 : i64
%65 = llvm.add %47, %64 : i64
%66 = llvm.mul %56, %6 : i64
%67 = llvm.add %65, %66 : i64
// Loop 3 (^bb4 header, ^bb8 latch): %68 = 0..3 -- one column of the current 4-column group.
llvm.br ^bb4(%20 : i64)
^bb4(%68: i64): // 2 preds: ^bb3, ^bb8
%69 = llvm.icmp "slt" %68, %17 : i64
llvm.cond_br %69, ^bb5, ^bb9
^bb5: // pred: ^bb4
// Source offset of the 128-element run feeding this output element: %75 = %63 + %68 * 128 (zero terms drop out).
%70 = llvm.mul %11, %3 : i64
%71 = llvm.add %63, %70 : i64
%72 = llvm.mul %68, %4 : i64
%73 = llvm.add %71, %72 : i64
%74 = llvm.mul %11, %6 : i64
%75 = llvm.add %73, %74 : i64
// Loop 4 (^bb6 header, ^bb7 body): %76 = 0..31, carrying a vector<4xi32> accumulator (%77) seeded with zeros (%16).
llvm.br ^bb6(%20, %16 : i64, !llvm.array<1 x array<1 x vector<4xi32>>>)
^bb6(%76: i64, %77: !llvm.array<1 x array<1 x vector<4xi32>>>): // 2 preds: ^bb5, ^bb7
%78 = llvm.icmp "slt" %76, %19 : i64
llvm.cond_br %78, ^bb7, ^bb8
^bb7: // pred: ^bb6
// Load four consecutive i32 values from %24 at element offset %75 + %76 * 4 (every term involving %20 = 0 drops out).
%79 = llvm.mul %20, %10 : i64
%80 = llvm.add %75, %79 : i64
%81 = llvm.mul %20, %8 : i64
%82 = llvm.add %80, %81 : i64
%83 = llvm.mul %76, %17 : i64
%84 = llvm.add %82, %83 : i64
%85 = llvm.add %84, %20 : i64
%86 = llvm.getelementptr %24[%85] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32>
%87 = llvm.bitcast %86 : !llvm.ptr<i32> to !llvm.ptr<vector<4xi32>>
%88 = llvm.load %87 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>>
// Lane-by-lane add of the loaded vector (%88) and the accumulator (%77); the four sums are collected in %108.
%89 = llvm.extractvalue %77[0, 0] : !llvm.array<1 x array<1 x vector<4xi32>>>
%90 = llvm.extractelement %89[%11 : i64] : vector<4xi32>
%91 = llvm.extractelement %88[%11 : i64] : vector<4xi32>
%92 = llvm.add %91, %90 : i32
%93 = llvm.insertelement %92, %12[%20 : i64] : vector<4xi32>
%94 = llvm.extractvalue %77[0, 0] : !llvm.array<1 x array<1 x vector<4xi32>>>
%95 = llvm.extractelement %94[%6 : i64] : vector<4xi32>
%96 = llvm.extractelement %88[%6 : i64] : vector<4xi32>
%97 = llvm.add %96, %95 : i32
%98 = llvm.insertelement %97, %93[%18 : i64] : vector<4xi32>
%99 = llvm.extractvalue %77[0, 0] : !llvm.array<1 x array<1 x vector<4xi32>>>
%100 = llvm.extractelement %99[%2 : i64] : vector<4xi32>
%101 = llvm.extractelement %88[%2 : i64] : vector<4xi32>
%102 = llvm.add %101, %100 : i32
%103 = llvm.insertelement %102, %98[%13 : i64] : vector<4xi32>
%104 = llvm.extractvalue %77[0, 0] : !llvm.array<1 x array<1 x vector<4xi32>>>
%105 = llvm.extractelement %104[%1 : i64] : vector<4xi32>
%106 = llvm.extractelement %88[%1 : i64] : vector<4xi32>
%107 = llvm.add %106, %105 : i32
%108 = llvm.insertelement %107, %103[%14 : i64] : vector<4xi32>
// Copy the four lanes of %108 back into the array-of-vector form carried by the loop; %121 is the next accumulator.
%109 = llvm.extractelement %108[%11 : i64] : vector<4xi32>
%110 = llvm.extractvalue %16[0, 0] : !llvm.array<1 x array<1 x vector<4xi32>>>
%111 = llvm.insertelement %109, %110[%11 : i64] : vector<4xi32>
%112 = llvm.insertvalue %111, %16[0, 0] : !llvm.array<1 x array<1 x vector<4xi32>>>
%113 = llvm.extractelement %108[%6 : i64] : vector<4xi32>
%114 = llvm.insertelement %113, %111[%6 : i64] : vector<4xi32>
%115 = llvm.insertvalue %114, %112[0, 0] : !llvm.array<1 x array<1 x vector<4xi32>>>
%116 = llvm.extractelement %108[%2 : i64] : vector<4xi32>
%117 = llvm.insertelement %116, %114[%2 : i64] : vector<4xi32>
%118 = llvm.insertvalue %117, %115[0, 0] : !llvm.array<1 x array<1 x vector<4xi32>>>
%119 = llvm.extractelement %108[%1 : i64] : vector<4xi32>
%120 = llvm.insertelement %119, %117[%1 : i64] : vector<4xi32>
%121 = llvm.insertvalue %120, %118[0, 0] : !llvm.array<1 x array<1 x vector<4xi32>>>
%122 = llvm.add %76, %18 : i64
llvm.br ^bb6(%122, %121 : i64, !llvm.array<1 x array<1 x vector<4xi32>>>)
^bb8: // pred: ^bb6
// Destination element address: %32[%67 + %68] (the %20 * 256 term is zero).
%123 = llvm.mul %20, %9 : i64
%124 = llvm.add %67, %123 : i64
%125 = llvm.add %124, %68 : i64
%126 = llvm.getelementptr %32[%125] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32>
// NOTE(review): the current destination value is loaded and used as the starting value of the sum.
%127 = llvm.load %126 : !llvm.ptr<i32>
// Horizontal add of the four accumulator lanes, then add the value already in the destination.
%128 = llvm.extractvalue %77[0, 0] : !llvm.array<1 x array<1 x vector<4xi32>>>
%129 = "llvm.intr.vector.reduce.add"(%128) : (vector<4xi32>) -> i32
%130 = llvm.add %127, %129 : i32
// Round-trip through a vector<1xi32>; %132 carries the same value as %130.
%131 = llvm.insertelement %130, %15[%20 : i64] : vector<1xi32>
%132 = llvm.extractelement %131[%11 : i64] : vector<1xi32>
// Store to the same destination element (address recomputed as %136).
%133 = llvm.mul %20, %9 : i64
%134 = llvm.add %67, %133 : i64
%135 = llvm.add %134, %68 : i64
%136 = llvm.getelementptr %32[%135] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32>
llvm.store %132, %136 : !llvm.ptr<i32>
%137 = llvm.add %68, %18 : i64
llvm.br ^bb4(%137 : i64)
^bb9: // pred: ^bb4
// Latch of loop 2: advance the column by 4.
%138 = llvm.add %56, %17 : i64
llvm.br ^bb2(%138 : i64)
^bb10: // pred: ^bb2
// Latch of loop 1: advance the row by 1.
%139 = llvm.add %54, %18 : i64
llvm.br ^bb1(%139 : i64)
^bb11: // pred: ^bb1
// All rows done: return status 0.
llvm.return %0 : i32
}
}
}
}
// Private test body at the stream-dialect level.
// Steps visible below: fill a transient resource with i32 1, run the dispatch with that
// resource as `ro` input and an external resource as `wo` output, fill the upper half of
// the external resource with i32 128, then compare the lower and upper halves as
// tensor<512x256xi32> with check.expect_eq.
// Size arithmetic: 67108864 = 512 * 256 * 128 * 4, 524288 = 512 * 256 * 4, 1048576 = 2 * 524288
// (presumably byte sizes of the i32 tensors -- confirm).
// NOTE(review): %5 is allocated `uninitialized` and nothing in this function writes the range
// [0, 524288) before the dispatch, while the dispatched llvm.func in this module loads each
// destination element and adds to it. The expected value 128 therefore appears to rely on the
// allocation starting out as zeros -- confirm against the original test input.
// NOTE(review): the `wo` binding spans [0, 1048576), which overlaps the [524288, 1048576) range
// written by the fill in the same stream.cmd.concurrent region -- confirm this is intended.
func.func private @_split_reduction_pass2() {
%c1048576 = arith.constant 1048576 : index
%c0 = arith.constant 0 : index
%c524288 = arith.constant 524288 : index
%c67108864 = arith.constant 67108864 : index
%c1_i32 = arith.constant 1 : i32
%c128_i32 = arith.constant 128 : i32
%c512 = arith.constant 512 : index
%c256 = arith.constant 256 : index
%c1 = arith.constant 1 : index
// Input: transient resource of size 67108864, filled entirely with the i32 value 1.
%0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c67108864}
%1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c67108864}) {
stream.cmd.fill %c1_i32, %arg0[%c0 for %c67108864] : i32 -> !stream.resource<transient>{%c67108864}
} => !stream.timepoint
%2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c67108864}
// The input goes through util.do_not_optimize, so its size is re-queried (%4) rather than reusing the constant.
%3 = util.do_not_optimize(%2) : !stream.resource<transient>
%4 = stream.resource.size %3 : !stream.resource<transient>
// Output: one external resource of size 1048576 shared by the dispatch result and the reference fill.
%5 = stream.resource.alloc uninitialized : !stream.resource<external>{%c1048576}
%6 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%4}, %5 as %arg1: !stream.resource<external>{%c1048576}) {
stream.cmd.concurrent {
// Dispatch with workload [512, 256, 1]; bindings are (set 0, binding 0) = %arg0 and (set 0, binding 1) = %arg1.
stream.cmd.dispatch @_split_reduction_pass2_dispatch_0::@_split_reduction_pass2_dispatch_0_generic_512x256x128[%c512, %c256, %c1] {
ro %arg0[%c0 for %4] : !stream.resource<transient>{%4},
wo %arg1[%c0 for %c1048576] : !stream.resource<external>{%c1048576}
} attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>]}
// Reference values: i32 128 written to the upper half [524288, 1048576).
stream.cmd.fill %c128_i32, %arg1[%c524288 for %c524288] : i32 -> !stream.resource<external>{%c1048576}
}
} => !stream.timepoint
%7 = stream.timepoint.await %6 => %5 : !stream.resource<external>{%c1048576}
// Split the resource into its two halves and export each as tensor<512x256xi32>.
%8 = stream.resource.subview %7[%c0] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288}
%9 = stream.resource.subview %7[%c524288] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288}
%10 = stream.tensor.export %8 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
%11 = stream.tensor.export %9 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
// %10 is the half starting at offset 0, %11 the half filled with 128.
check.expect_eq(%10, %11) : tensor<512x256xi32>
return
}
}
// -----// IR Dump After CSE (cse) ('builtin.module' operation) //----- //
// Attribute aliases referenced by the module that follows.
// #device_target_llvm_cpu: an "llvm-cpu" device target carrying one executable target
// ("embedded-elf-x86_64") and the legacy_sync flag.
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
// #executable_target_embedded_elf_x86_64_: the same executable target spelled out on its own;
// it is the `target` of the hal.executable.variant below.
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
// #pipeline_layout: no push constants; set 0 has binding 0 (storage_buffer, ReadOnly)
// and binding 1 (storage_buffer, no ReadOnly flag).
#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>
// #translation: translation info naming the CPUDoubleTilingExpert pipeline.
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
// Public entry point tagged iree.abi.stub: takes no arguments, returns nothing, and only
// forwards to the private @_split_reduction_pass2.
func.func @split_reduction_pass2() attributes {iree.abi.stub} {
call @_split_reduction_pass2() : () -> ()
return
}
// Executable @_split_reduction_pass2_dispatch_0: a single embedded-elf-x86_64 variant holding
// one exported entry point and the LLVM-dialect function of the same name.
hal.executable private @_split_reduction_pass2_dispatch_0 {
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
// Export region: ignores its arguments and returns the constant triple (8, 16, 1).
// NOTE(review): presumably the workgroup count; 8 * 32 = 256 and 16 * 32 = 512 would match
// the 32-element tiles used in the function body - confirm.
hal.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 ordinal(0) layout(#pipeline_layout) attributes {translation_info = #translation} {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index):
%c1 = arith.constant 1 : index
%c16 = arith.constant 16 : index
%c8 = arith.constant 8 : index
hal.return %c8, %c16, %c1 : index, index, index
}
builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} {
// Kernel body. Takes pointers to the environment, dispatch-state and workgroup-state structs
// and always returns i32 0. %arg0 (environment) is not read anywhere in the body.
llvm.func internal @_split_reduction_pass2_dispatch_0_generic_512x256x128(%arg0: !llvm.ptr<struct<"iree_hal_executable_environment_v0_t", (ptr<i32>, ptr<func<i32 (ptr<func<i32 (ptr<i8>)>>, ptr<i8>)>>, ptr<ptr<func<i32 (ptr<i8>)>>>, struct<"iree_hal_processor_v0_t", (array<8 x i64>)>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>> {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {sym_visibility = "private"} {
// Constants. Several values appear twice, once with an i64 attribute and once with an
// index attribute (e.g. %4 / %8 = 128, %5 / %9 = 256, %3 / %10 = 32768).
%0 = llvm.mlir.constant(0 : i32) : i32
%1 = llvm.mlir.constant(3 : i64) : i64
%2 = llvm.mlir.constant(2 : i64) : i64
%3 = llvm.mlir.constant(32768 : i64) : i64
%4 = llvm.mlir.constant(128 : i64) : i64
%5 = llvm.mlir.constant(256 : i64) : i64
%6 = llvm.mlir.constant(1 : i64) : i64
%7 = llvm.mlir.constant(63 : index) : i64
%8 = llvm.mlir.constant(128 : index) : i64
%9 = llvm.mlir.constant(256 : index) : i64
%10 = llvm.mlir.constant(32768 : index) : i64
%11 = llvm.mlir.constant(0 : i64) : i64
%12 = llvm.mlir.constant(dense<0> : vector<4xi32>) : vector<4xi32>
%13 = llvm.mlir.constant(2 : index) : i64
%14 = llvm.mlir.constant(3 : index) : i64
%15 = llvm.mlir.constant(dense<0> : vector<1xi32>) : vector<1xi32>
%16 = llvm.mlir.constant(dense<0> : vector<1x1x4xi32>) : !llvm.array<1 x array<1 x vector<4xi32>>>
%17 = llvm.mlir.constant(4 : index) : i64
%18 = llvm.mlir.constant(1 : index) : i64
%19 = llvm.mlir.constant(32 : index) : i64
%20 = llvm.mlir.constant(0 : index) : i64
// Field 10 of *%arg1 is a ptr<ptr<i8>> table. Entry 0 is cast to an i32 pointer (%24) and
// assumed to have its low 6 bits clear, i.e. to be 64-byte aligned.
// NOTE(review): presumably the binding pointer table - confirm against the struct definition.
%21 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>>
%22 = llvm.extractvalue %21[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>
%23 = llvm.load %22 : !llvm.ptr<ptr<i8>>
%24 = llvm.bitcast %23 : !llvm.ptr<i8> to !llvm.ptr<i32>
%25 = llvm.ptrtoint %24 : !llvm.ptr<i32> to i64
%26 = llvm.and %25, %7 : i64
%27 = llvm.icmp "eq" %26, %20 : i64
"llvm.intr.assume"(%27) : (i1) -> ()
// Same sequence for entry 1 of that table, giving the i32 pointer %32 (the struct is
// loaded from %arg1 a second time).
%28 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>>
%29 = llvm.extractvalue %28[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>
%30 = llvm.getelementptr %29[1] : (!llvm.ptr<ptr<i8>>) -> !llvm.ptr<ptr<i8>>
%31 = llvm.load %30 : !llvm.ptr<ptr<i8>>
%32 = llvm.bitcast %31 : !llvm.ptr<i8> to !llvm.ptr<i32>
%33 = llvm.ptrtoint %32 : !llvm.ptr<i32> to i64
%34 = llvm.and %33, %7 : i64
%35 = llvm.icmp "eq" %34, %20 : i64
"llvm.intr.assume"(%35) : (i1) -> ()
// Fields 0 and 1 of *%arg2, zero-extended to i64.
// NOTE(review): presumably workgroup ids x and y - confirm against the struct definition.
%36 = llvm.load %arg2 : !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>>
%37 = llvm.extractvalue %36[0] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>
%38 = llvm.zext %37 : i32 to i64
%39 = llvm.extractvalue %36[1] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>
%40 = llvm.zext %39 : i32 to i64
// Both values are scaled by 32. %46 = %41 * 256 + %42 is the base element offset later used
// with %32; %52 = %41 * 32768 + %42 * 128 is the base element offset later used with %24.
%41 = llvm.mul %40, %19 : i64
%42 = llvm.mul %38, %19 : i64
%43 = llvm.mul %41, %9 : i64
%44 = llvm.add %43, %20 : i64
%45 = llvm.mul %42, %18 : i64
%46 = llvm.add %44, %45 : i64
%47 = llvm.mul %41, %10 : i64
%48 = llvm.add %47, %20 : i64
%49 = llvm.mul %42, %8 : i64
%50 = llvm.add %48, %49 : i64
%51 = llvm.mul %11, %18 : i64
%52 = llvm.add %50, %51 : i64
llvm.br ^bb1(%20 : i64)
// Loop header: %53 runs over [0, 32) in steps of 1 (incremented in ^bb10).
^bb1(%53: i64): // 2 preds: ^bb0, ^bb10
%54 = llvm.icmp "slt" %53, %19 : i64
llvm.cond_br %54, ^bb2(%20 : i64), ^bb11
// Loop header: %55 runs over [0, 32) in steps of 4 (incremented in ^bb9).
^bb2(%55: i64): // 2 preds: ^bb1, ^bb9
%56 = llvm.icmp "slt" %55, %19 : i64
llvm.cond_br %56, ^bb3, ^bb10
// Offsets for this (%53, %55): %62 = %52 + %53 * 32768 + %55 * 128 (used with %24) and
// %66 = %46 + %53 * 256 + %55 (used with %32).
^bb3: // pred: ^bb2
%57 = llvm.mul %53, %3 : i64
%58 = llvm.add %52, %57 : i64
%59 = llvm.mul %55, %4 : i64
%60 = llvm.add %58, %59 : i64
%61 = llvm.mul %11, %6 : i64
%62 = llvm.add %60, %61 : i64
%63 = llvm.mul %53, %5 : i64
%64 = llvm.add %46, %63 : i64
%65 = llvm.mul %55, %6 : i64
%66 = llvm.add %64, %65 : i64
llvm.br ^bb4(%20 : i64)
// Loop header: %67 runs over [0, 4) in steps of 1 (incremented in ^bb8).
^bb4(%67: i64): // 2 preds: ^bb3, ^bb8
%68 = llvm.icmp "slt" %67, %17 : i64
llvm.cond_br %68, ^bb5, ^bb9
// %73 = %62 + %67 * 128. The innermost loop is entered with the all-zero accumulator %16.
^bb5: // pred: ^bb4
%69 = llvm.mul %11, %3 : i64
%70 = llvm.add %62, %69 : i64
%71 = llvm.mul %67, %4 : i64
%72 = llvm.add %70, %71 : i64
%73 = llvm.add %72, %61 : i64
llvm.br ^bb6(%20, %16 : i64, !llvm.array<1 x array<1 x vector<4xi32>>>)
// Loop header: %74 runs over [0, 32) in steps of 1, carrying the accumulator %75.
^bb6(%74: i64, %75: !llvm.array<1 x array<1 x vector<4xi32>>>): // 2 preds: ^bb5, ^bb7
%76 = llvm.icmp "slt" %74, %19 : i64
llvm.cond_br %76, ^bb7, ^bb8
// Loads four consecutive i32 from %24 at element offset %73 + %74 * 4 (declared alignment 4)
// and adds them to the accumulator one lane at a time via extractelement / insertelement.
^bb7: // pred: ^bb6
%77 = llvm.mul %20, %10 : i64
%78 = llvm.add %73, %77 : i64
%79 = llvm.mul %20, %8 : i64
%80 = llvm.add %78, %79 : i64
%81 = llvm.mul %74, %17 : i64
%82 = llvm.add %80, %81 : i64
%83 = llvm.add %82, %20 : i64
%84 = llvm.getelementptr %24[%83] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32>
%85 = llvm.bitcast %84 : !llvm.ptr<i32> to !llvm.ptr<vector<4xi32>>
%86 = llvm.load %85 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>>
%87 = llvm.extractvalue %75[0, 0] : !llvm.array<1 x array<1 x vector<4xi32>>>
%88 = llvm.extractelement %87[%11 : i64] : vector<4xi32>
%89 = llvm.extractelement %86[%11 : i64] : vector<4xi32>
%90 = llvm.add %89, %88 : i32
%91 = llvm.insertelement %90, %12[%20 : i64] : vector<4xi32>
%92 = llvm.extractelement %87[%6 : i64] : vector<4xi32>
%93 = llvm.extractelement %86[%6 : i64] : vector<4xi32>
%94 = llvm.add %93, %92 : i32
%95 = llvm.insertelement %94, %91[%18 : i64] : vector<4xi32>
%96 = llvm.extractelement %87[%2 : i64] : vector<4xi32>
%97 = llvm.extractelement %86[%2 : i64] : vector<4xi32>
%98 = llvm.add %97, %96 : i32
%99 = llvm.insertelement %98, %95[%13 : i64] : vector<4xi32>
%100 = llvm.extractelement %87[%1 : i64] : vector<4xi32>
%101 = llvm.extractelement %86[%1 : i64] : vector<4xi32>
%102 = llvm.add %101, %100 : i32
%103 = llvm.insertelement %102, %99[%14 : i64] : vector<4xi32>
// Repack the four lane sums (%103) into the array-of-vector value carried back to ^bb6.
%104 = llvm.extractelement %103[%11 : i64] : vector<4xi32>
%105 = llvm.extractvalue %16[0, 0] : !llvm.array<1 x array<1 x vector<4xi32>>>
%106 = llvm.insertelement %104, %105[%11 : i64] : vector<4xi32>
%107 = llvm.insertvalue %106, %16[0, 0] : !llvm.array<1 x array<1 x vector<4xi32>>>
%108 = llvm.extractelement %103[%6 : i64] : vector<4xi32>
%109 = llvm.insertelement %108, %106[%6 : i64] : vector<4xi32>
%110 = llvm.insertvalue %109, %107[0, 0] : !llvm.array<1 x array<1 x vector<4xi32>>>
%111 = llvm.extractelement %103[%2 : i64] : vector<4xi32>
%112 = llvm.insertelement %111, %109[%2 : i64] : vector<4xi32>
%113 = llvm.insertvalue %112, %110[0, 0] : !llvm.array<1 x array<1 x vector<4xi32>>>
%114 = llvm.extractelement %103[%1 : i64] : vector<4xi32>
%115 = llvm.insertelement %114, %112[%1 : i64] : vector<4xi32>
%116 = llvm.insertvalue %115, %113[0, 0] : !llvm.array<1 x array<1 x vector<4xi32>>>
%117 = llvm.add %74, %18 : i64
llvm.br ^bb6(%117, %116 : i64, !llvm.array<1 x array<1 x vector<4xi32>>>)
// Read-modify-write: loads the i32 at %32[%66 + %67], adds the sum of the four accumulator
// lanes, and stores the result back to the same address.
// NOTE(review): the loaded value is whatever the second buffer already contains; nothing in
// this function initializes it - confirm the caller fills it before dispatch.
^bb8: // pred: ^bb6
%118 = llvm.mul %20, %9 : i64
%119 = llvm.add %66, %118 : i64
%120 = llvm.add %119, %67 : i64
%121 = llvm.getelementptr %32[%120] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32>
%122 = llvm.load %121 : !llvm.ptr<i32>
%123 = llvm.extractvalue %75[0, 0] : !llvm.array<1 x array<1 x vector<4xi32>>>
%124 = "llvm.intr.vector.reduce.add"(%123) : (vector<4xi32>) -> i32
%125 = llvm.add %122, %124 : i32
%126 = llvm.insertelement %125, %15[%20 : i64] : vector<1xi32>
%127 = llvm.extractelement %126[%11 : i64] : vector<1xi32>
llvm.store %127, %121 : !llvm.ptr<i32>
%128 = llvm.add %67, %18 : i64
llvm.br ^bb4(%128 : i64)
// Latch for the %55 loop: advance by 4.
^bb9: // pred: ^bb4
%129 = llvm.add %55, %17 : i64
llvm.br ^bb2(%129 : i64)
// Latch for the %53 loop: advance by 1.
^bb10: // pred: ^bb2
%130 = llvm.add %53, %18 : i64
llvm.br ^bb1(%130 : i64)
// Exit: return the i32 constant 0.
^bb11: // pred: ^bb1
llvm.return %0 : i32
}
}
}
}
// Test body: fills an input resource with i32 1, dispatches the reduction kernel into the
// first half of an output resource, fills the second half with i32 128, and compares the
// two halves as tensor<512x256xi32>.
func.func private @_split_reduction_pass2() {
// Byte sizes: 67108864 = 512 * 256 * 128 * 4, 524288 = 512 * 256 * 4, 1048576 = 2 * 524288.
%c1048576 = arith.constant 1048576 : index
%c0 = arith.constant 0 : index
%c524288 = arith.constant 524288 : index
%c67108864 = arith.constant 67108864 : index
%c1_i32 = arith.constant 1 : i32
%c128_i32 = arith.constant 128 : i32
%c512 = arith.constant 512 : index
%c256 = arith.constant 256 : index
%c1 = arith.constant 1 : index
// Input: a 67108864-byte transient resource, entirely filled with the i32 value 1.
%0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c67108864}
%1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c67108864}) {
stream.cmd.fill %c1_i32, %arg0[%c0 for %c67108864] : i32 -> !stream.resource<transient>{%c67108864}
} => !stream.timepoint
%2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c67108864}
// The input passes through util.do_not_optimize, so its size is re-queried (%4) rather than
// reusing the constant.
%3 = util.do_not_optimize(%2) : !stream.resource<transient>
%4 = stream.resource.size %3 : !stream.resource<transient>
// Output: one 1048576-byte external resource allocated `uninitialized`; it holds both the
// computed result (first half) and the expected value (second half).
%5 = stream.resource.alloc uninitialized : !stream.resource<external>{%c1048576}
// Inside one concurrent region: the kernel is dispatched with workload [512, 256, 1],
// binding the input read-only and the whole output resource write-only, while a fill writes
// i32 128 into bytes [524288, 1048576) of the output.
// NOTE(review): no command here writes bytes [0, 524288) of %5 before the dispatch, and the
// binding is `wo`, yet the LLVM body of the dispatched entry point in this module loads from
// its second binding before storing (^bb8). Confirm whether accumulating into an
// uninitialized, write-only range is intended or whether a zero fill was lost upstream.
%6 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%4}, %5 as %arg1: !stream.resource<external>{%c1048576}) {
stream.cmd.concurrent {
stream.cmd.dispatch @_split_reduction_pass2_dispatch_0::@_split_reduction_pass2_dispatch_0_generic_512x256x128[%c512, %c256, %c1] {
ro %arg0[%c0 for %4] : !stream.resource<transient>{%4},
wo %arg1[%c0 for %c1048576] : !stream.resource<external>{%c1048576}
} attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>]}
stream.cmd.fill %c128_i32, %arg1[%c524288 for %c524288] : i32 -> !stream.resource<external>{%c1048576}
}
} => !stream.timepoint
%7 = stream.timepoint.await %6 => %5 : !stream.resource<external>{%c1048576}
// Split the output into two 524288-byte views (offset 0 and offset 524288), export each as
// tensor<512x256xi32>, and require them to be equal.
%8 = stream.resource.subview %7[%c0] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288}
%9 = stream.resource.subview %7[%c524288] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288}
%10 = stream.tensor.export %8 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
%11 = stream.tensor.export %9 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
check.expect_eq(%10, %11) : tensor<512x256xi32>
return
}
}
// -----// IR Dump After mlir::iree_compiler::IREE::HAL::TranslateTargetExecutableVariantsPass (iree-hal-translate-target-executable-variants) ('hal.executable.variant' operation: @embedded_elf_x86_64) //----- //
// Attribute aliases referenced by the module that follows.
// #device_target_llvm_cpu: an "llvm-cpu" device target carrying one executable target
// ("embedded-elf-x86_64") and the legacy_sync flag.
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
// #executable_target_embedded_elf_x86_64_: the same executable target spelled out on its own;
// it is the `target` of the hal.executable.variant below.
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
// #pipeline_layout: no push constants; set 0 has binding 0 (storage_buffer, ReadOnly)
// and binding 1 (storage_buffer, no ReadOnly flag).
#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>
// #translation: translation info naming the CPUDoubleTilingExpert pipeline.
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
// Public entry point tagged iree.abi.stub: takes no arguments, returns nothing, and only
// forwards to the private @_split_reduction_pass2.
func.func @split_reduction_pass2() attributes {iree.abi.stub} {
call @_split_reduction_pass2() : () -> ()
return
}
// Executable @_split_reduction_pass2_dispatch_0: a single embedded-elf-x86_64 variant holding
// one exported entry point and the LLVM-dialect function of the same name.
hal.executable private @_split_reduction_pass2_dispatch_0 {
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
// Export region: ignores its arguments and returns the constant triple (8, 16, 1).
// NOTE(review): presumably the workgroup count; 8 * 32 = 256 and 16 * 32 = 512 would match
// the 32-element tiles used in the function body - confirm.
hal.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 ordinal(0) layout(#pipeline_layout) attributes {translation_info = #translation} {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index):
%c1 = arith.constant 1 : index
%c16 = arith.constant 16 : index
%c8 = arith.constant 8 : index
hal.return %c8, %c16, %c1 : index, index, index
}
builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} {
// Kernel body. Takes pointers to the environment, dispatch-state and workgroup-state structs
// and always returns i32 0. %arg0 (environment) is not read anywhere in the body.
llvm.func internal @_split_reduction_pass2_dispatch_0_generic_512x256x128(%arg0: !llvm.ptr<struct<"iree_hal_executable_environment_v0_t", (ptr<i32>, ptr<func<i32 (ptr<func<i32 (ptr<i8>)>>, ptr<i8>)>>, ptr<ptr<func<i32 (ptr<i8>)>>>, struct<"iree_hal_processor_v0_t", (array<8 x i64>)>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>> {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {sym_visibility = "private"} {
// Constants. Several values appear twice, once with an i64 attribute and once with an
// index attribute (e.g. %4 / %8 = 128, %5 / %9 = 256, %3 / %10 = 32768).
%0 = llvm.mlir.constant(0 : i32) : i32
%1 = llvm.mlir.constant(3 : i64) : i64
%2 = llvm.mlir.constant(2 : i64) : i64
%3 = llvm.mlir.constant(32768 : i64) : i64
%4 = llvm.mlir.constant(128 : i64) : i64
%5 = llvm.mlir.constant(256 : i64) : i64
%6 = llvm.mlir.constant(1 : i64) : i64
%7 = llvm.mlir.constant(63 : index) : i64
%8 = llvm.mlir.constant(128 : index) : i64
%9 = llvm.mlir.constant(256 : index) : i64
%10 = llvm.mlir.constant(32768 : index) : i64
%11 = llvm.mlir.constant(0 : i64) : i64
%12 = llvm.mlir.constant(dense<0> : vector<4xi32>) : vector<4xi32>
%13 = llvm.mlir.constant(2 : index) : i64
%14 = llvm.mlir.constant(3 : index) : i64
%15 = llvm.mlir.constant(dense<0> : vector<1xi32>) : vector<1xi32>
%16 = llvm.mlir.constant(dense<0> : vector<1x1x4xi32>) : !llvm.array<1 x array<1 x vector<4xi32>>>
%17 = llvm.mlir.constant(4 : index) : i64
%18 = llvm.mlir.constant(1 : index) : i64
%19 = llvm.mlir.constant(32 : index) : i64
%20 = llvm.mlir.constant(0 : index) : i64
// Field 10 of *%arg1 is a ptr<ptr<i8>> table. Entry 0 is cast to an i32 pointer (%24) and
// assumed to have its low 6 bits clear, i.e. to be 64-byte aligned.
// NOTE(review): presumably the binding pointer table - confirm against the struct definition.
%21 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>>
%22 = llvm.extractvalue %21[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>
%23 = llvm.load %22 : !llvm.ptr<ptr<i8>>
%24 = llvm.bitcast %23 : !llvm.ptr<i8> to !llvm.ptr<i32>
%25 = llvm.ptrtoint %24 : !llvm.ptr<i32> to i64
%26 = llvm.and %25, %7 : i64
%27 = llvm.icmp "eq" %26, %20 : i64
"llvm.intr.assume"(%27) : (i1) -> ()
// Same sequence for entry 1 of that table, giving the i32 pointer %32 (the struct is
// loaded from %arg1 a second time).
%28 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>>
%29 = llvm.extractvalue %28[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>
%30 = llvm.getelementptr %29[1] : (!llvm.ptr<ptr<i8>>) -> !llvm.ptr<ptr<i8>>
%31 = llvm.load %30 : !llvm.ptr<ptr<i8>>
%32 = llvm.bitcast %31 : !llvm.ptr<i8> to !llvm.ptr<i32>
%33 = llvm.ptrtoint %32 : !llvm.ptr<i32> to i64
%34 = llvm.and %33, %7 : i64
%35 = llvm.icmp "eq" %34, %20 : i64
"llvm.intr.assume"(%35) : (i1) -> ()
// Fields 0 and 1 of *%arg2, zero-extended to i64.
// NOTE(review): presumably workgroup ids x and y - confirm against the struct definition.
%36 = llvm.load %arg2 : !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>>
%37 = llvm.extractvalue %36[0] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>
%38 = llvm.zext %37 : i32 to i64
%39 = llvm.extractvalue %36[1] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>
%40 = llvm.zext %39 : i32 to i64
// Both values are scaled by 32. %46 = %41 * 256 + %42 is the base element offset later used
// with %32; %52 = %41 * 32768 + %42 * 128 is the base element offset later used with %24.
%41 = llvm.mul %40, %19 : i64
%42 = llvm.mul %38, %19 : i64
%43 = llvm.mul %41, %9 : i64
%44 = llvm.add %43, %20 : i64
%45 = llvm.mul %42, %18 : i64
%46 = llvm.add %44, %45 : i64
%47 = llvm.mul %41, %10 : i64
%48 = llvm.add %47, %20 : i64
%49 = llvm.mul %42, %8 : i64
%50 = llvm.add %48, %49 : i64
%51 = llvm.mul %11, %18 : i64
%52 = llvm.add %50, %51 : i64
llvm.br ^bb1(%20 : i64)
// Loop header: %53 runs over [0, 32) in steps of 1 (incremented in ^bb10).
^bb1(%53: i64): // 2 preds: ^bb0, ^bb10
%54 = llvm.icmp "slt" %53, %19 : i64
llvm.cond_br %54, ^bb2(%20 : i64), ^bb11
// Loop header: %55 runs over [0, 32) in steps of 4 (incremented in ^bb9).
^bb2(%55: i64): // 2 preds: ^bb1, ^bb9
%56 = llvm.icmp "slt" %55, %19 : i64
llvm.cond_br %56, ^bb3, ^bb10
// Offsets for this (%53, %55): %62 = %52 + %53 * 32768 + %55 * 128 (used with %24) and
// %66 = %46 + %53 * 256 + %55 (used with %32).
^bb3: // pred: ^bb2
%57 = llvm.mul %53, %3 : i64
%58 = llvm.add %52, %57 : i64
%59 = llvm.mul %55, %4 : i64
%60 = llvm.add %58, %59 : i64
%61 = llvm.mul %11, %6 : i64
%62 = llvm.add %60, %61 : i64
%63 = llvm.mul %53, %5 : i64
%64 = llvm.add %46, %63 : i64
%65 = llvm.mul %55, %6 : i64
%66 = llvm.add %64, %65 : i64
llvm.br ^bb4(%20 : i64)
// Loop header: %67 runs over [0, 4) in steps of 1 (incremented in ^bb8).
^bb4(%67: i64): // 2 preds: ^bb3, ^bb8
%68 = llvm.icmp "slt" %67, %17 : i64
llvm.cond_br %68, ^bb5, ^bb9
// %73 = %62 + %67 * 128. The innermost loop is entered with the all-zero accumulator %16.
^bb5: // pred: ^bb4
%69 = llvm.mul %11, %3 : i64
%70 = llvm.add %62, %69 : i64
%71 = llvm.mul %67, %4 : i64
%72 = llvm.add %70, %71 : i64
%73 = llvm.add %72, %61 : i64
llvm.br ^bb6(%20, %16 : i64, !llvm.array<1 x array<1 x vector<4xi32>>>)
// Loop header: %74 runs over [0, 32) in steps of 1, carrying the accumulator %75.
^bb6(%74: i64, %75: !llvm.array<1 x array<1 x vector<4xi32>>>): // 2 preds: ^bb5, ^bb7
%76 = llvm.icmp "slt" %74, %19 : i64
llvm.cond_br %76, ^bb7, ^bb8
// Loads four consecutive i32 from %24 at element offset %73 + %74 * 4 (declared alignment 4)
// and adds them to the accumulator one lane at a time via extractelement / insertelement.
^bb7: // pred: ^bb6
%77 = llvm.mul %20, %10 : i64
%78 = llvm.add %73, %77 : i64
%79 = llvm.mul %20, %8 : i64
%80 = llvm.add %78, %79 : i64
%81 = llvm.mul %74, %17 : i64
%82 = llvm.add %80, %81 : i64
%83 = llvm.add %82, %20 : i64
%84 = llvm.getelementptr %24[%83] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32>
%85 = llvm.bitcast %84 : !llvm.ptr<i32> to !llvm.ptr<vector<4xi32>>
%86 = llvm.load %85 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>>
%87 = llvm.extractvalue %75[0, 0] : !llvm.array<1 x array<1 x vector<4xi32>>>
%88 = llvm.extractelement %87[%11 : i64] : vector<4xi32>
%89 = llvm.extractelement %86[%11 : i64] : vector<4xi32>
%90 = llvm.add %89, %88 : i32
%91 = llvm.insertelement %90, %12[%20 : i64] : vector<4xi32>
%92 = llvm.extractelement %87[%6 : i64] : vector<4xi32>
%93 = llvm.extractelement %86[%6 : i64] : vector<4xi32>
%94 = llvm.add %93, %92 : i32
%95 = llvm.insertelement %94, %91[%18 : i64] : vector<4xi32>
%96 = llvm.extractelement %87[%2 : i64] : vector<4xi32>
%97 = llvm.extractelement %86[%2 : i64] : vector<4xi32>
%98 = llvm.add %97, %96 : i32
%99 = llvm.insertelement %98, %95[%13 : i64] : vector<4xi32>
%100 = llvm.extractelement %87[%1 : i64] : vector<4xi32>
%101 = llvm.extractelement %86[%1 : i64] : vector<4xi32>
%102 = llvm.add %101, %100 : i32
%103 = llvm.insertelement %102, %99[%14 : i64] : vector<4xi32>
// Repack the four lane sums (%103) into the array-of-vector value carried back to ^bb6.
%104 = llvm.extractelement %103[%11 : i64] : vector<4xi32>
%105 = llvm.extractvalue %16[0, 0] : !llvm.array<1 x array<1 x vector<4xi32>>>
%106 = llvm.insertelement %104, %105[%11 : i64] : vector<4xi32>
%107 = llvm.insertvalue %106, %16[0, 0] : !llvm.array<1 x array<1 x vector<4xi32>>>
%108 = llvm.extractelement %103[%6 : i64] : vector<4xi32>
%109 = llvm.insertelement %108, %106[%6 : i64] : vector<4xi32>
%110 = llvm.insertvalue %109, %107[0, 0] : !llvm.array<1 x array<1 x vector<4xi32>>>
%111 = llvm.extractelement %103[%2 : i64] : vector<4xi32>
%112 = llvm.insertelement %111, %109[%2 : i64] : vector<4xi32>
%113 = llvm.insertvalue %112, %110[0, 0] : !llvm.array<1 x array<1 x vector<4xi32>>>
%114 = llvm.extractelement %103[%1 : i64] : vector<4xi32>
%115 = llvm.insertelement %114, %112[%1 : i64] : vector<4xi32>
%116 = llvm.insertvalue %115, %113[0, 0] : !llvm.array<1 x array<1 x vector<4xi32>>>
%117 = llvm.add %74, %18 : i64
llvm.br ^bb6(%117, %116 : i64, !llvm.array<1 x array<1 x vector<4xi32>>>)
// Read-modify-write: loads the i32 at %32[%66 + %67], adds the sum of the four accumulator
// lanes, and stores the result back to the same address.
// NOTE(review): the loaded value is whatever the second buffer already contains; nothing in
// this function initializes it - confirm the caller fills it before dispatch.
^bb8: // pred: ^bb6
%118 = llvm.mul %20, %9 : i64
%119 = llvm.add %66, %118 : i64
%120 = llvm.add %119, %67 : i64
%121 = llvm.getelementptr %32[%120] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32>
%122 = llvm.load %121 : !llvm.ptr<i32>
%123 = llvm.extractvalue %75[0, 0] : !llvm.array<1 x array<1 x vector<4xi32>>>
%124 = "llvm.intr.vector.reduce.add"(%123) : (vector<4xi32>) -> i32
%125 = llvm.add %122, %124 : i32
%126 = llvm.insertelement %125, %15[%20 : i64] : vector<1xi32>
%127 = llvm.extractelement %126[%11 : i64] : vector<1xi32>
llvm.store %127, %121 : !llvm.ptr<i32>
%128 = llvm.add %67, %18 : i64
llvm.br ^bb4(%128 : i64)
// Latch for the %55 loop: advance by 4.
^bb9: // pred: ^bb4
%129 = llvm.add %55, %17 : i64
llvm.br ^bb2(%129 : i64)
// Latch for the %53 loop: advance by 1.
^bb10: // pred: ^bb2
%130 = llvm.add %53, %18 : i64
llvm.br ^bb1(%130 : i64)
// Exit: return the i32 constant 0.
^bb11: // pred: ^bb1
llvm.return %0 : i32
}
}
}
}
// Test body: fills an input resource with i32 1, dispatches the reduction kernel into the
// first half of an output resource, fills the second half with i32 128, and compares the
// two halves as tensor<512x256xi32>.
func.func private @_split_reduction_pass2() {
// Byte sizes: 67108864 = 512 * 256 * 128 * 4, 524288 = 512 * 256 * 4, 1048576 = 2 * 524288.
%c1048576 = arith.constant 1048576 : index
%c0 = arith.constant 0 : index
%c524288 = arith.constant 524288 : index
%c67108864 = arith.constant 67108864 : index
%c1_i32 = arith.constant 1 : i32
%c128_i32 = arith.constant 128 : i32
%c512 = arith.constant 512 : index
%c256 = arith.constant 256 : index
%c1 = arith.constant 1 : index
// Input: a 67108864-byte transient resource, entirely filled with the i32 value 1.
%0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c67108864}
%1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c67108864}) {
stream.cmd.fill %c1_i32, %arg0[%c0 for %c67108864] : i32 -> !stream.resource<transient>{%c67108864}
} => !stream.timepoint
%2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c67108864}
// The input passes through util.do_not_optimize, so its size is re-queried (%4) rather than
// reusing the constant.
%3 = util.do_not_optimize(%2) : !stream.resource<transient>
%4 = stream.resource.size %3 : !stream.resource<transient>
// Output: one 1048576-byte external resource allocated `uninitialized`; it holds both the
// computed result (first half) and the expected value (second half).
%5 = stream.resource.alloc uninitialized : !stream.resource<external>{%c1048576}
// Inside one concurrent region: the kernel is dispatched with workload [512, 256, 1],
// binding the input read-only and the whole output resource write-only, while a fill writes
// i32 128 into bytes [524288, 1048576) of the output.
// NOTE(review): no command here writes bytes [0, 524288) of %5 before the dispatch, and the
// binding is `wo`, yet the LLVM body of the dispatched entry point in this module loads from
// its second binding before storing (^bb8). Confirm whether accumulating into an
// uninitialized, write-only range is intended or whether a zero fill was lost upstream.
%6 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%4}, %5 as %arg1: !stream.resource<external>{%c1048576}) {
stream.cmd.concurrent {
stream.cmd.dispatch @_split_reduction_pass2_dispatch_0::@_split_reduction_pass2_dispatch_0_generic_512x256x128[%c512, %c256, %c1] {
ro %arg0[%c0 for %4] : !stream.resource<transient>{%4},
wo %arg1[%c0 for %c1048576] : !stream.resource<external>{%c1048576}
} attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>]}
stream.cmd.fill %c128_i32, %arg1[%c524288 for %c524288] : i32 -> !stream.resource<external>{%c1048576}
}
} => !stream.timepoint
%7 = stream.timepoint.await %6 => %5 : !stream.resource<external>{%c1048576}
// Split the output into two 524288-byte views (offset 0 and offset 524288), export each as
// tensor<512x256xi32>, and require them to be equal.
%8 = stream.resource.subview %7[%c0] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288}
%9 = stream.resource.subview %7[%c524288] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288}
%10 = stream.tensor.export %8 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
%11 = stream.tensor.export %9 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
check.expect_eq(%10, %11) : tensor<512x256xi32>
return
}
}
// -----// IR Dump After mlir::iree_compiler::IREE::HAL::TranslateExecutablesPass (iree-hal-translate-executables) ('hal.executable' operation: @_split_reduction_pass2_dispatch_0) //----- //
// Attribute aliases referenced by the module that follows.
// #device_target_llvm_cpu: an "llvm-cpu" device target carrying one executable target
// ("embedded-elf-x86_64") and the legacy_sync flag.
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
// #executable_target_embedded_elf_x86_64_: the same executable target spelled out on its own;
// it is the `target` of the hal.executable.variant below.
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
// #pipeline_layout: no push constants; set 0 has binding 0 (storage_buffer, ReadOnly)
// and binding 1 (storage_buffer, no ReadOnly flag).
#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>
// #translation: translation info naming the CPUDoubleTilingExpert pipeline.
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
// Public entry point tagged iree.abi.stub: takes no arguments, returns nothing, and only
// forwards to the private @_split_reduction_pass2.
func.func @split_reduction_pass2() attributes {iree.abi.stub} {
call @_split_reduction_pass2() : () -> ()
return
}
hal.executable private @_split_reduction_pass2_dispatch_0 {
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
// Export region for entry point ordinal 0: ignores its arguments and returns the constant
// triple (8, 16, 1).
// NOTE(review): presumably the workgroup count for the dispatch - confirm.
hal.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 ordinal(0) layout(#pipeline_layout) attributes {translation_info = #translation} {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index):
%c1 = arith.constant 1 : index
%c16 = arith.constant 16 : index
%c8 = arith.constant 8 : index
hal.return %c8, %c16, %c1 : index, index, index
}
builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} {
llvm.func internal @_split_reduction_pass2_dispatch_0_generic_512x256x128(%arg0: !llvm.ptr<struct<"iree_hal_executable_environment_v0_t", (ptr<i32>, ptr<func<i32 (ptr<func<i32 (ptr<i8>)>>, ptr<i8>)>>, ptr<ptr<func<i32 (ptr<i8>)>>>, struct<"iree_hal_processor_v0_t", (array<8 x i64>)>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>> {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {sym_visibility = "private"} {
%0 = llvm.mlir.constant(0 : i32) : i32
%1 = llvm.mlir.constant(3 : i64) : i64
%2 = llvm.mlir.constant(2 : i64) : i64
%3 = llvm.mlir.constant(32768 : i64) : i64
%4 = llvm.mlir.constant(128 : i64) : i64
%5 = llvm.mlir.constant(256 : i64) : i64
%6 = llvm.mlir.constant(1 : i64) : i64
%7 = llvm.mlir.constant(63 : index) : i64
%8 = llvm.mlir.constant(128 : index) : i64
%9 = llvm.mlir.constant(256 : index) : i64
%10 = llvm.mlir.constant(32768 : index) : i64
%11 = llvm.mlir.constant(0 : i64) : i64
%12 = llvm.mlir.constant(dense<0> : vector<4xi32>) : vector<4xi32>
%13 = llvm.mlir.constant(2 : index) : i64
%14 = llvm.mlir.constant(3 : index) : i64
%15 = llvm.mlir.constant(dense<0> : vector<1xi32>) : vector<1xi32>
%16 = llvm.mlir.constant(dense<0> : vector<1x1x4xi32>) : !llvm.array<1 x array<1 x vector<4xi32>>>
%17 = llvm.mlir.constant(4 : index) : i64
%18 = llvm.mlir.constant(1 : index) : i64
%19 = llvm.mlir.constant(32 : index) : i64
%20 = llvm.mlir.constant(0 : index) : i64
%21 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>>
%22 = llvm.extractvalue %21[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>
%23 = llvm.load %22 : !llvm.ptr<ptr<i8>>
%24 = llvm.bitcast %23 : !llvm.ptr<i8> to !llvm.ptr<i32>
%25 = llvm.ptrtoint %24 : !llvm.ptr<i32> to i64
%26 = llvm.and %25, %7 : i64
%27 = llvm.icmp "eq" %26, %20 : i64
"llvm.intr.assume"(%27) : (i1) -> ()
%28 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>>
%29 = llvm.extractvalue %28[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>
%30 = llvm.getelementptr %29[1] : (!llvm.ptr<ptr<i8>>) -> !llvm.ptr<ptr<i8>>
%31 = llvm.load %30 : !llvm.ptr<ptr<i8>>
%32 = llvm.bitcast %31 : !llvm.ptr<i8> to !llvm.ptr<i32>
%33 = llvm.ptrtoint %32 : !llvm.ptr<i32> to i64
%34 = llvm.and %33, %7 : i64
%35 = llvm.icmp "eq" %34, %20 : i64
"llvm.intr.assume"(%35) : (i1) -> ()
%36 = llvm.load %arg2 : !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>>
%37 = llvm.extractvalue %36[0] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>
%38 = llvm.zext %37 : i32 to i64
%39 = llvm.extractvalue %36[1] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>
%40 = llvm.zext %39 : i32 to i64
%41 = llvm.mul %40, %19 : i64
%42 = llvm.mul %38, %19 : i64
%43 = llvm.mul %41, %9 : i64
%44 = llvm.add %43, %20 : i64
%45 = llvm.mul %42, %18 : i64
%46 = llvm.add %44, %45 : i64
%47 = llvm.mul %41, %10 : i64
%48 = llvm.add %47, %20 : i64
%49 = llvm.mul %42, %8 : i64
%50 = llvm.add %48, %49 : i64
%51 = llvm.mul %11, %18 : i64
%52 = llvm.add %50, %51 : i64
llvm.br ^bb1(%20 : i64)
^bb1(%53: i64): // 2 preds: ^bb0, ^bb10
%54 = llvm.icmp "slt" %53, %19 : i64
llvm.cond_br %54, ^bb2(%20 : i64), ^bb11
^bb2(%55: i64): // 2 preds: ^bb1, ^bb9
%56 = llvm.icmp "slt" %55, %19 : i64
llvm.cond_br %56, ^bb3, ^bb10
^bb3: // pred: ^bb2
%57 = llvm.mul %53, %3 : i64
%58 = llvm.add %52, %57 : i64
%59 = llvm.mul %55, %4 : i64
%60 = llvm.add %58, %59 : i64
%61 = llvm.mul %11, %6 : i64
%62 = llvm.add %60, %61 : i64
%63 = llvm.mul %53, %5 : i64
%64 = llvm.add %46, %63 : i64
%65 = llvm.mul %55, %6 : i64
%66 = llvm.add %64, %65 : i64
llvm.br ^bb4(%20 : i64)
^bb4(%67: i64): // 2 preds: ^bb3, ^bb8
%68 = llvm.icmp "slt" %67, %17 : i64
llvm.cond_br %68, ^bb5, ^bb9
^bb5: // pred: ^bb4
%69 = llvm.mul %11, %3 : i64
%70 = llvm.add %62, %69 : i64
%71 = llvm.mul %67, %4 : i64
%72 = llvm.add %70, %71 : i64
%73 = llvm.add %72, %61 : i64
llvm.br ^bb6(%20, %16 : i64, !llvm.array<1 x array<1 x vector<4xi32>>>)
^bb6(%74: i64, %75: !llvm.array<1 x array<1 x vector<4xi32>>>): // 2 preds: ^bb5, ^bb7
%76 = llvm.icmp "slt" %74, %19 : i64
llvm.cond_br %76, ^bb7, ^bb8
^bb7: // pred: ^bb6
%77 = llvm.mul %20, %10 : i64
%78 = llvm.add %73, %77 : i64
%79 = llvm.mul %20, %8 : i64
%80 = llvm.add %78, %79 : i64
%81 = llvm.mul %74, %17 : i64
%82 = llvm.add %80, %81 : i64
%83 = llvm.add %82, %20 : i64
%84 = llvm.getelementptr %24[%83] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32>
%85 = llvm.bitcast %84 : !llvm.ptr<i32> to !llvm.ptr<vector<4xi32>>
%86 = llvm.load %85 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>>
%87 = llvm.extractvalue %75[0, 0] : !llvm.array<1 x array<1 x vector<4xi32>>>
%88 = llvm.extractelement %87[%11 : i64] : vector<4xi32>
%89 = llvm.extractelement %86[%11 : i64] : vector<4xi32>
%90 = llvm.add %89, %88 : i32
%91 = llvm.insertelement %90, %12[%20 : i64] : vector<4xi32>
%92 = llvm.extractelement %87[%6 : i64] : vector<4xi32>
%93 = llvm.extractelement %86[%6 : i64] : vector<4xi32>
%94 = llvm.add %93, %92 : i32
%95 = llvm.insertelement %94, %91[%18 : i64] : vector<4xi32>
%96 = llvm.extractelement %87[%2 : i64] : vector<4xi32>
%97 = llvm.extractelement %86[%2 : i64] : vector<4xi32>
%98 = llvm.add %97, %96 : i32
%99 = llvm.insertelement %98, %95[%13 : i64] : vector<4xi32>
%100 = llvm.extractelement %87[%1 : i64] : vector<4xi32>
%101 = llvm.extractelement %86[%1 : i64] : vector<4xi32>
%102 = llvm.add %101, %100 : i32
%103 = llvm.insertelement %102, %99[%14 : i64] : vector<4xi32>
%104 = llvm.extractelement %103[%11 : i64] : vector<4xi32>
%105 = llvm.extractvalue %16[0, 0] : !llvm.array<1 x array<1 x vector<4xi32>>>
%106 = llvm.insertelement %104, %105[%11 : i64] : vector<4xi32>
%107 = llvm.insertvalue %106, %16[0, 0] : !llvm.array<1 x array<1 x vector<4xi32>>>
%108 = llvm.extractelement %103[%6 : i64] : vector<4xi32>
%109 = llvm.insertelement %108, %106[%6 : i64] : vector<4xi32>
%110 = llvm.insertvalue %109, %107[0, 0] : !llvm.array<1 x array<1 x vector<4xi32>>>
%111 = llvm.extractelement %103[%2 : i64] : vector<4xi32>
%112 = llvm.insertelement %111, %109[%2 : i64] : vector<4xi32>
%113 = llvm.insertvalue %112, %110[0, 0] : !llvm.array<1 x array<1 x vector<4xi32>>>
%114 = llvm.extractelement %103[%1 : i64] : vector<4xi32>
%115 = llvm.insertelement %114, %112[%1 : i64] : vector<4xi32>
%116 = llvm.insertvalue %115, %113[0, 0] : !llvm.array<1 x array<1 x vector<4xi32>>>
%117 = llvm.add %74, %18 : i64
llvm.br ^bb6(%117, %116 : i64, !llvm.array<1 x array<1 x vector<4xi32>>>)
^bb8: // pred: ^bb6
%118 = llvm.mul %20, %9 : i64
%119 = llvm.add %66, %118 : i64
%120 = llvm.add %119, %67 : i64
%121 = llvm.getelementptr %32[%120] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32>
%122 = llvm.load %121 : !llvm.ptr<i32>
%123 = llvm.extractvalue %75[0, 0] : !llvm.array<1 x array<1 x vector<4xi32>>>
%124 = "llvm.intr.vector.reduce.add"(%123) : (vector<4xi32>) -> i32
%125 = llvm.add %122, %124 : i32
%126 = llvm.insertelement %125, %15[%20 : i64] : vector<1xi32>
%127 = llvm.extractelement %126[%11 : i64] : vector<1xi32>
llvm.store %127, %121 : !llvm.ptr<i32>
%128 = llvm.add %67, %18 : i64
llvm.br ^bb4(%128 : i64)
^bb9: // pred: ^bb4
%129 = llvm.add %55, %17 : i64
llvm.br ^bb2(%129 : i64)
^bb10: // pred: ^bb2
%130 = llvm.add %53, %18 : i64
llvm.br ^bb1(%130 : i64)
^bb11: // pred: ^bb1
llvm.return %0 : i32
}
}
}
}
// Test body in stream-dialect form: fills an input resource with i32 1, runs the
// reduction dispatch into the first half of an output resource, fills the second
// half with i32 128, and checks that the two halves compare equal.
func.func private @_split_reduction_pass2() {
// Size constants. Note 67108864 = 512*256*128*4 and 524288 = 512*256*4
// (presumably byte sizes of the i32 input / output tensors - TODO confirm).
%c1048576 = arith.constant 1048576 : index
%c0 = arith.constant 0 : index
%c524288 = arith.constant 524288 : index
%c67108864 = arith.constant 67108864 : index
%c1_i32 = arith.constant 1 : i32
%c128_i32 = arith.constant 128 : i32
%c512 = arith.constant 512 : index
%c256 = arith.constant 256 : index
%c1 = arith.constant 1 : index
// Input: allocate an uninitialized transient resource and fill all of it with the i32 value 1.
%0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c67108864}
%1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c67108864}) {
stream.cmd.fill %c1_i32, %arg0[%c0 for %c67108864] : i32 -> !stream.resource<transient>{%c67108864}
} => !stream.timepoint
// Wait for the fill, then pass the resource through util.do_not_optimize and re-query its size,
// so the dispatch below uses the dynamic size %4 rather than the constant.
%2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c67108864}
%3 = util.do_not_optimize(%2) : !stream.resource<transient>
%4 = stream.resource.size %3 : !stream.resource<transient>
// Output: one external resource of size 1048576 (= 2 * 524288) holding both compared halves.
%5 = stream.resource.alloc uninitialized : !stream.resource<external>{%c1048576}
%6 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%4}, %5 as %arg1: !stream.resource<external>{%c1048576}) {
// The dispatch (workload 512 x 256 x 1) and the fill of the second half are issued concurrently.
// NOTE(review): the dispatch's write-only binding spans [0 for 1048576], which includes the
// range [524288 for 524288] written by the fill - confirm the kernel only touches the first half.
stream.cmd.concurrent {
stream.cmd.dispatch @_split_reduction_pass2_dispatch_0::@_split_reduction_pass2_dispatch_0_generic_512x256x128[%c512, %c256, %c1] {
ro %arg0[%c0 for %4] : !stream.resource<transient>{%4},
wo %arg1[%c0 for %c1048576] : !stream.resource<external>{%c1048576}
} attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>]}
stream.cmd.fill %c128_i32, %arg1[%c524288 for %c524288] : i32 -> !stream.resource<external>{%c1048576}
}
} => !stream.timepoint
%7 = stream.timepoint.await %6 => %5 : !stream.resource<external>{%c1048576}
// Split the output into its two 524288-sized halves: %8 at offset 0, %9 at offset 524288.
%8 = stream.resource.subview %7[%c0] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288}
%9 = stream.resource.subview %7[%c524288] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288}
// Export both halves as tensor<512x256xi32> and compare them.
%10 = stream.tensor.export %8 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
%11 = stream.tensor.export %9 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
check.expect_eq(%10, %11) : tensor<512x256xi32>
return
}
}
// -----// IR Dump After mlir::iree_compiler::IREE::HAL::(anonymous namespace)::ConvertToHALPass (iree-hal-conversion) ('builtin.module' operation) //----- //
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
// Public entry point tagged with the iree.abi.stub attribute; it takes no arguments,
// forwards to the private @_split_reduction_pass2 and returns nothing.
func.func @split_reduction_pass2() attributes {iree.abi.stub} {
call @_split_reduction_pass2() : () -> ()
return
}
hal.executable private @_split_reduction_pass2_dispatch_0 {
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
// Export record (ordinal 0) for the dispatch function, using #pipeline_layout and #translation.
// The region ignores its device and three index arguments and returns the constants (8, 16, 1);
// the host code in this module dispatches with workgroups([8, 16, 1]), matching these values.
hal.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 ordinal(0) layout(#pipeline_layout) attributes {translation_info = #translation} {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index):
%c1 = arith.constant 1 : index
%c16 = arith.constant 16 : index
%c8 = arith.constant 8 : index
hal.return %c8, %c16, %c1 : index, index, index
}
builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} {
// Dispatch body lowered to the LLVM dialect. Takes pointers to the executable environment (%arg0,
// unused here), the dispatch state (%arg1) and the workgroup state (%arg2); always returns i32 0.
// Structure: a 4-deep loop nest (%53: 0..32 step 1, %55: 0..32 step 4, %67: 0..4 step 1,
// %74: 0..32 step 1). The innermost loop accumulates vector<4xi32> loads from the first buffer
// (%24); after it, the accumulator is summed horizontally and added into one i32 of the second
// buffer (%32).
llvm.func internal @_split_reduction_pass2_dispatch_0_generic_512x256x128(%arg0: !llvm.ptr<struct<"iree_hal_executable_environment_v0_t", (ptr<i32>, ptr<func<i32 (ptr<func<i32 (ptr<i8>)>>, ptr<i8>)>>, ptr<ptr<func<i32 (ptr<i8>)>>>, struct<"iree_hal_processor_v0_t", (array<8 x i64>)>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>> {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {sym_visibility = "private"} {
// Constants: strides (32768, 256, 128), loop bounds/steps (32, 4, 1), lane indices (0..3),
// the 63 alignment mask, and zero-initialized vector/array values.
%0 = llvm.mlir.constant(0 : i32) : i32
%1 = llvm.mlir.constant(3 : i64) : i64
%2 = llvm.mlir.constant(2 : i64) : i64
%3 = llvm.mlir.constant(32768 : i64) : i64
%4 = llvm.mlir.constant(128 : i64) : i64
%5 = llvm.mlir.constant(256 : i64) : i64
%6 = llvm.mlir.constant(1 : i64) : i64
%7 = llvm.mlir.constant(63 : index) : i64
%8 = llvm.mlir.constant(128 : index) : i64
%9 = llvm.mlir.constant(256 : index) : i64
%10 = llvm.mlir.constant(32768 : index) : i64
%11 = llvm.mlir.constant(0 : i64) : i64
%12 = llvm.mlir.constant(dense<0> : vector<4xi32>) : vector<4xi32>
%13 = llvm.mlir.constant(2 : index) : i64
%14 = llvm.mlir.constant(3 : index) : i64
%15 = llvm.mlir.constant(dense<0> : vector<1xi32>) : vector<1xi32>
%16 = llvm.mlir.constant(dense<0> : vector<1x1x4xi32>) : !llvm.array<1 x array<1 x vector<4xi32>>>
%17 = llvm.mlir.constant(4 : index) : i64
%18 = llvm.mlir.constant(1 : index) : i64
%19 = llvm.mlir.constant(32 : index) : i64
%20 = llvm.mlir.constant(0 : index) : i64
// First buffer: load the dispatch state, take field 10 (a ptr<ptr<i8>> table), load its first
// entry and view it as i32* (%24). Presumably this is binding 0, the read-only input - TODO confirm.
%21 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>>
%22 = llvm.extractvalue %21[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>
%23 = llvm.load %22 : !llvm.ptr<ptr<i8>>
%24 = llvm.bitcast %23 : !llvm.ptr<i8> to !llvm.ptr<i32>
// Assume (ptr & 63) == 0, i.e. the low 6 address bits of %24 are zero.
%25 = llvm.ptrtoint %24 : !llvm.ptr<i32> to i64
%26 = llvm.and %25, %7 : i64
%27 = llvm.icmp "eq" %26, %20 : i64
"llvm.intr.assume"(%27) : (i1) -> ()
// Second buffer: same sequence through entry [1] of the field-10 table, viewed as i32* (%32).
// Presumably binding 1, the output - TODO confirm.
%28 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>>
%29 = llvm.extractvalue %28[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>
%30 = llvm.getelementptr %29[1] : (!llvm.ptr<ptr<i8>>) -> !llvm.ptr<ptr<i8>>
%31 = llvm.load %30 : !llvm.ptr<ptr<i8>>
%32 = llvm.bitcast %31 : !llvm.ptr<i8> to !llvm.ptr<i32>
// Same (ptr & 63) == 0 assumption for %32.
%33 = llvm.ptrtoint %32 : !llvm.ptr<i32> to i64
%34 = llvm.and %33, %7 : i64
%35 = llvm.icmp "eq" %34, %20 : i64
"llvm.intr.assume"(%35) : (i1) -> ()
// Workgroup state fields 0 and 1, zero-extended to i64 (%38, %40).
// Presumably the workgroup id along x and y - TODO confirm against the struct definition.
%36 = llvm.load %arg2 : !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>>
%37 = llvm.extractvalue %36[0] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>
%38 = llvm.zext %37 : i32 to i64
%39 = llvm.extractvalue %36[1] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>
%40 = llvm.zext %39 : i32 to i64
// Tile origins: %41 = field1 * 32, %42 = field0 * 32.
%41 = llvm.mul %40, %19 : i64
%42 = llvm.mul %38, %19 : i64
// %46 = %41 * 256 + %42: base element offset later used to index %32.
%43 = llvm.mul %41, %9 : i64
%44 = llvm.add %43, %20 : i64
%45 = llvm.mul %42, %18 : i64
%46 = llvm.add %44, %45 : i64
// %52 = %41 * 32768 + %42 * 128 (+ 0): base element offset later used to index %24.
%47 = llvm.mul %41, %10 : i64
%48 = llvm.add %47, %20 : i64
%49 = llvm.mul %42, %8 : i64
%50 = llvm.add %48, %49 : i64
%51 = llvm.mul %11, %18 : i64
%52 = llvm.add %50, %51 : i64
llvm.br ^bb1(%20 : i64)
// Loop 1 header: %53 runs from 0 while %53 < 32; incremented by 1 in ^bb10.
^bb1(%53: i64): // 2 preds: ^bb0, ^bb10
%54 = llvm.icmp "slt" %53, %19 : i64
llvm.cond_br %54, ^bb2(%20 : i64), ^bb11
// Loop 2 header: %55 runs from 0 while %55 < 32; incremented by 4 in ^bb9.
^bb2(%55: i64): // 2 preds: ^bb1, ^bb9
%56 = llvm.icmp "slt" %55, %19 : i64
llvm.cond_br %56, ^bb3, ^bb10
// Per-(%53, %55) offsets: %62 = %52 + %53 * 32768 + %55 * 128 (for %24),
// %66 = %46 + %53 * 256 + %55 (for %32). %61 is 0 * 1.
^bb3: // pred: ^bb2
%57 = llvm.mul %53, %3 : i64
%58 = llvm.add %52, %57 : i64
%59 = llvm.mul %55, %4 : i64
%60 = llvm.add %58, %59 : i64
%61 = llvm.mul %11, %6 : i64
%62 = llvm.add %60, %61 : i64
%63 = llvm.mul %53, %5 : i64
%64 = llvm.add %46, %63 : i64
%65 = llvm.mul %55, %6 : i64
%66 = llvm.add %64, %65 : i64
llvm.br ^bb4(%20 : i64)
// Loop 3 header: %67 runs from 0 while %67 < 4; incremented by 1 in ^bb8.
^bb4(%67: i64): // 2 preds: ^bb3, ^bb8
%68 = llvm.icmp "slt" %67, %17 : i64
llvm.cond_br %68, ^bb5, ^bb9
// %73 = %62 + %67 * 128 (the other added terms are zero); then enter the innermost loop
// with counter 0 and the all-zero accumulator %16.
^bb5: // pred: ^bb4
%69 = llvm.mul %11, %3 : i64
%70 = llvm.add %62, %69 : i64
%71 = llvm.mul %67, %4 : i64
%72 = llvm.add %70, %71 : i64
%73 = llvm.add %72, %61 : i64
llvm.br ^bb6(%20, %16 : i64, !llvm.array<1 x array<1 x vector<4xi32>>>)
// Loop 4 header: %74 runs from 0 while %74 < 32 (step 1); %75 is the loop-carried accumulator.
^bb6(%74: i64, %75: !llvm.array<1 x array<1 x vector<4xi32>>>): // 2 preds: ^bb5, ^bb7
%76 = llvm.icmp "slt" %74, %19 : i64
llvm.cond_br %76, ^bb7, ^bb8
// Innermost body: element index %83 = %73 + %74 * 4 (the %20-based terms are zero).
^bb7: // pred: ^bb6
%77 = llvm.mul %20, %10 : i64
%78 = llvm.add %73, %77 : i64
%79 = llvm.mul %20, %8 : i64
%80 = llvm.add %78, %79 : i64
%81 = llvm.mul %74, %17 : i64
%82 = llvm.add %80, %81 : i64
%83 = llvm.add %82, %20 : i64
// Load 4 consecutive i32 values from %24 as a vector (declared alignment 4).
%84 = llvm.getelementptr %24[%83] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32>
%85 = llvm.bitcast %84 : !llvm.ptr<i32> to !llvm.ptr<vector<4xi32>>
%86 = llvm.load %85 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>>
// Lane-by-lane add of the loaded vector (%86) and the accumulator vector (%87); lanes 0..3
// are extracted, added as scalars, and inserted into a fresh vector, ending in %103.
%87 = llvm.extractvalue %75[0, 0] : !llvm.array<1 x array<1 x vector<4xi32>>>
%88 = llvm.extractelement %87[%11 : i64] : vector<4xi32>
%89 = llvm.extractelement %86[%11 : i64] : vector<4xi32>
%90 = llvm.add %89, %88 : i32
%91 = llvm.insertelement %90, %12[%20 : i64] : vector<4xi32>
%92 = llvm.extractelement %87[%6 : i64] : vector<4xi32>
%93 = llvm.extractelement %86[%6 : i64] : vector<4xi32>
%94 = llvm.add %93, %92 : i32
%95 = llvm.insertelement %94, %91[%18 : i64] : vector<4xi32>
%96 = llvm.extractelement %87[%2 : i64] : vector<4xi32>
%97 = llvm.extractelement %86[%2 : i64] : vector<4xi32>
%98 = llvm.add %97, %96 : i32
%99 = llvm.insertelement %98, %95[%13 : i64] : vector<4xi32>
%100 = llvm.extractelement %87[%1 : i64] : vector<4xi32>
%101 = llvm.extractelement %86[%1 : i64] : vector<4xi32>
%102 = llvm.add %101, %100 : i32
%103 = llvm.insertelement %102, %99[%14 : i64] : vector<4xi32>
// Copy the four lanes of %103 one at a time into a 1x1x4 array value; %116 holds the
// complete new accumulator that is passed back to ^bb6.
%104 = llvm.extractelement %103[%11 : i64] : vector<4xi32>
%105 = llvm.extractvalue %16[0, 0] : !llvm.array<1 x array<1 x vector<4xi32>>>
%106 = llvm.insertelement %104, %105[%11 : i64] : vector<4xi32>
%107 = llvm.insertvalue %106, %16[0, 0] : !llvm.array<1 x array<1 x vector<4xi32>>>
%108 = llvm.extractelement %103[%6 : i64] : vector<4xi32>
%109 = llvm.insertelement %108, %106[%6 : i64] : vector<4xi32>
%110 = llvm.insertvalue %109, %107[0, 0] : !llvm.array<1 x array<1 x vector<4xi32>>>
%111 = llvm.extractelement %103[%2 : i64] : vector<4xi32>
%112 = llvm.insertelement %111, %109[%2 : i64] : vector<4xi32>
%113 = llvm.insertvalue %112, %110[0, 0] : !llvm.array<1 x array<1 x vector<4xi32>>>
%114 = llvm.extractelement %103[%1 : i64] : vector<4xi32>
%115 = llvm.insertelement %114, %112[%1 : i64] : vector<4xi32>
%116 = llvm.insertvalue %115, %113[0, 0] : !llvm.array<1 x array<1 x vector<4xi32>>>
%117 = llvm.add %74, %18 : i64
llvm.br ^bb6(%117, %116 : i64, !llvm.array<1 x array<1 x vector<4xi32>>>)
// Innermost loop exit: element index %120 = %66 + %67 into %32. The existing i32 is loaded,
// the horizontal sum of the accumulator is added to it, and the result is stored back.
// NOTE(review): this reads the output element before writing it - confirm it is initialized.
^bb8: // pred: ^bb6
%118 = llvm.mul %20, %9 : i64
%119 = llvm.add %66, %118 : i64
%120 = llvm.add %119, %67 : i64
%121 = llvm.getelementptr %32[%120] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32>
%122 = llvm.load %121 : !llvm.ptr<i32>
%123 = llvm.extractvalue %75[0, 0] : !llvm.array<1 x array<1 x vector<4xi32>>>
%124 = "llvm.intr.vector.reduce.add"(%123) : (vector<4xi32>) -> i32
%125 = llvm.add %122, %124 : i32
// The sum is round-tripped through a vector<1xi32> (insert at 0, extract at 0) before the store.
%126 = llvm.insertelement %125, %15[%20 : i64] : vector<1xi32>
%127 = llvm.extractelement %126[%11 : i64] : vector<1xi32>
llvm.store %127, %121 : !llvm.ptr<i32>
%128 = llvm.add %67, %18 : i64
llvm.br ^bb4(%128 : i64)
// Loop 2 latch: %55 += 4.
^bb9: // pred: ^bb4
%129 = llvm.add %55, %17 : i64
llvm.br ^bb2(%129 : i64)
// Loop 1 latch: %53 += 1.
^bb10: // pred: ^bb2
%130 = llvm.add %53, %18 : i64
llvm.br ^bb1(%130 : i64)
// Exit: return the i32 constant 0.
^bb11: // pred: ^bb1
llvm.return %0 : i32
}
}
}
}
// Test body after conversion to the HAL dialect: fills an input buffer with i32 1 via one command
// buffer, then in a second command buffer records the reduction dispatch and a fill of the second
// half of the output buffer with i32 128, and finally compares the two output halves as
// 512x256 buffer views. Each command buffer is submitted and waited on before continuing.
func.func private @_split_reduction_pass2() {
// Size constants: 67108864 = 512*256*128*4, 524288 = 512*256*4, 1048576 = 2 * 524288.
%c1048576 = arith.constant 1048576 : index
%c0 = arith.constant 0 : index
%c524288 = arith.constant 524288 : index
%c67108864 = arith.constant 67108864 : index
%c1_i32 = arith.constant 1 : i32
%c128_i32 = arith.constant 128 : i32
%c512 = arith.constant 512 : index
%c256 = arith.constant 256 : index
%c1 = arith.constant 1 : index
// Input buffer (%buffer): allocated with DeviceVisible|DeviceLocal memory type, size 67108864.
%device = hal.ex.shared_device : !hal.device
%allocator = hal.device.allocator<%device : !hal.device> : !hal.allocator
%buffer = hal.allocator.allocate<%allocator : !hal.allocator> type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c67108864}
// First command buffer (Transfer only): fill the whole input buffer with the pattern 1, add an
// execution barrier, and finalize the recording.
%device_0 = hal.ex.shared_device : !hal.device
%cmd = hal.command_buffer.create device(%device_0 : !hal.device) mode("OneShot|AllowInlineExecution") categories(Transfer) : !hal.command_buffer
hal.command_buffer.fill_buffer<%cmd : !hal.command_buffer> target(%buffer : !hal.buffer)[%c0, %c67108864] pattern(%c1_i32 : i32)
hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None")
hal.command_buffer.finalize<%cmd : !hal.command_buffer>
// Submit with a null wait fence (%0) and a fresh signal fence, then wait on the signal fence
// with timeout -1 and check the returned status.
%0 = util.null : !hal.fence
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%c-1_i64 = arith.constant -1 : i64
hal.device.queue.execute<%device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) commands([%cmd])
%c-1_i32 = arith.constant -1 : i32
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.status.check_ok %status, "failed to wait on timepoint"
// Pass the input buffer through util.do_not_optimize and query its length dynamically (%len).
%1 = util.do_not_optimize(%buffer) : !hal.buffer
%len = hal.buffer.length<%1 : !hal.buffer> : index
// Output buffer (%buffer_3): size 1048576, additionally HostVisible and mappable.
%device_1 = hal.ex.shared_device : !hal.device
%allocator_2 = hal.device.allocator<%device_1 : !hal.device> : !hal.allocator
%buffer_3 = hal.allocator.allocate<%allocator_2 : !hal.allocator> type("HostVisible|DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage|MappingScoped|MappingAccessRandom|Mapping") : !hal.buffer{%c1048576}
// Second command buffer (Transfer|Dispatch).
%device_4 = hal.ex.shared_device : !hal.device
%cmd_5 = hal.command_buffer.create device(%device_4 : !hal.device) mode("OneShot|AllowInlineExecution") categories("Transfer|Dispatch") : !hal.command_buffer
%2 = hal.command_buffer.device<%cmd_5 : !hal.command_buffer> : !hal.device
// For devices matching the "embedded-elf-x86_64" executable format: bind the input buffer
// [0, %len] at binding 0 and the output buffer [0, 1048576] at binding 1 of set 0, then record
// the dispatch with workgroup counts [8, 16, 1].
hal.device.switch<%2 : !hal.device>
#hal.device.match.executable.format<"embedded-elf-x86_64"> {
%pipeline_layout = hal.pipeline_layout.lookup device(%2 : !hal.device) layout(#pipeline_layout) : !hal.pipeline_layout
%c0_22 = arith.constant 0 : index
%c1_23 = arith.constant 1 : index
%c0_24 = arith.constant 0 : index
hal.command_buffer.push_descriptor_set<%cmd_5 : !hal.command_buffer> layout(%pipeline_layout : !hal.pipeline_layout)[%c0_24] bindings([
%c0_22 = (%1 : !hal.buffer)[%c0, %len],
%c1_23 = (%buffer_3 : !hal.buffer)[%c0, %c1048576]
])
%c1_25 = arith.constant 1 : index
%c16 = arith.constant 16 : index
%c8 = arith.constant 8 : index
hal.command_buffer.dispatch.symbol<%cmd_5 : !hal.command_buffer> target(@_split_reduction_pass2_dispatch_0::@embedded_elf_x86_64::@_split_reduction_pass2_dispatch_0_generic_512x256x128) workgroups([%c8, %c16, %c1_25])
hal.return
}
// Fill the second half of the output buffer (offset 524288, length 524288) with the pattern 128.
// This is recorded right after the dispatch; the only barrier in this command buffer follows it.
hal.command_buffer.fill_buffer<%cmd_5 : !hal.command_buffer> target(%buffer_3 : !hal.buffer)[%c524288, %c524288] pattern(%c128_i32 : i32)
hal.command_buffer.execution_barrier<%cmd_5 : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None")
hal.command_buffer.finalize<%cmd_5 : !hal.command_buffer>
// Submit the second command buffer (null wait fence, new signal fence), wait, and check status.
%3 = util.null : !hal.fence
%fence_6 = hal.fence.create device(%device_4 : !hal.device) flags("None") : !hal.fence
%c-1_i64_7 = arith.constant -1 : i64
hal.device.queue.execute<%device_4 : !hal.device> affinity(%c-1_i64_7) wait(%3) signal(%fence_6) commands([%cmd_5])
%c-1_i32_8 = arith.constant -1 : i32
%status_9 = hal.fence.await until([%fence_6]) timeout_millis(%c-1_i32_8) : i32
util.status.check_ok %status_9, "failed to wait on timepoint"
// Split the output buffer into its first and second halves.
%buffer_10 = hal.buffer.subspan<%buffer_3 : !hal.buffer>[%c0, %c524288] : !hal.buffer
%buffer_11 = hal.buffer.subspan<%buffer_3 : !hal.buffer>[%c524288, %c524288] : !hal.buffer
// Wrap each half in a buffer view of shape [512, 256] with type code 268435488 (0x10000020)
// and encoding code 1 (presumably i32 / dense row-major - TODO confirm against the HAL enums).
%c512_12 = arith.constant 512 : index
%c256_13 = arith.constant 256 : index
%c0_14 = arith.constant 0 : index
%c268435488_i32 = arith.constant 268435488 : i32
%c1_i32_15 = arith.constant 1 : i32
%view = hal.buffer_view.create buffer(%buffer_10 : !hal.buffer)[%c0_14, %c524288] shape([%c512_12, %c256_13]) type(%c268435488_i32) encoding(%c1_i32_15) : !hal.buffer_view
%c512_16 = arith.constant 512 : index
%c256_17 = arith.constant 256 : index
%c0_18 = arith.constant 0 : index
%c268435488_i32_19 = arith.constant 268435488 : i32
%c1_i32_20 = arith.constant 1 : i32
%view_21 = hal.buffer_view.create buffer(%buffer_11 : !hal.buffer)[%c0_18, %c524288] shape([%c512_16, %c256_17]) type(%c268435488_i32_19) encoding(%c1_i32_20) : !hal.buffer_view
// The test passes when the dispatch result (first half) equals the 128-filled second half.
check.expect_eq(%view, %view_21) : !hal.buffer_view
return
}
}
// -----// IR Dump After mlir::iree_compiler::IREE::HAL::FixupLegacySyncPass (iree-hal-fixup-legacy-sync) ('builtin.module' operation) //----- //
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
// Public entry point tagged with the iree.abi.stub attribute; it takes no arguments,
// forwards to the private @_split_reduction_pass2 and returns nothing.
func.func @split_reduction_pass2() attributes {iree.abi.stub} {
call @_split_reduction_pass2() : () -> ()
return
}
hal.executable private @_split_reduction_pass2_dispatch_0 {
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
// Export record (ordinal 0) for the dispatch function, using #pipeline_layout and #translation.
// The region ignores its device and three index arguments and returns the constants (8, 16, 1);
// the host code in this module dispatches with workgroups([8, 16, 1]), matching these values.
hal.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 ordinal(0) layout(#pipeline_layout) attributes {translation_info = #translation} {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index):
%c1 = arith.constant 1 : index
%c16 = arith.constant 16 : index
%c8 = arith.constant 8 : index
hal.return %c8, %c16, %c1 : index, index, index
}
builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} {
// Dispatch body lowered to the LLVM dialect. Takes pointers to the executable environment (%arg0,
// unused here), the dispatch state (%arg1) and the workgroup state (%arg2); always returns i32 0.
// Structure: a 4-deep loop nest (%53: 0..32 step 1, %55: 0..32 step 4, %67: 0..4 step 1,
// %74: 0..32 step 1). The innermost loop accumulates vector<4xi32> loads from the first buffer
// (%24); after it, the accumulator is summed horizontally and added into one i32 of the second
// buffer (%32).
llvm.func internal @_split_reduction_pass2_dispatch_0_generic_512x256x128(%arg0: !llvm.ptr<struct<"iree_hal_executable_environment_v0_t", (ptr<i32>, ptr<func<i32 (ptr<func<i32 (ptr<i8>)>>, ptr<i8>)>>, ptr<ptr<func<i32 (ptr<i8>)>>>, struct<"iree_hal_processor_v0_t", (array<8 x i64>)>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>> {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {sym_visibility = "private"} {
// Constants: strides (32768, 256, 128), loop bounds/steps (32, 4, 1), lane indices (0..3),
// the 63 alignment mask, and zero-initialized vector/array values.
%0 = llvm.mlir.constant(0 : i32) : i32
%1 = llvm.mlir.constant(3 : i64) : i64
%2 = llvm.mlir.constant(2 : i64) : i64
%3 = llvm.mlir.constant(32768 : i64) : i64
%4 = llvm.mlir.constant(128 : i64) : i64
%5 = llvm.mlir.constant(256 : i64) : i64
%6 = llvm.mlir.constant(1 : i64) : i64
%7 = llvm.mlir.constant(63 : index) : i64
%8 = llvm.mlir.constant(128 : index) : i64
%9 = llvm.mlir.constant(256 : index) : i64
%10 = llvm.mlir.constant(32768 : index) : i64
%11 = llvm.mlir.constant(0 : i64) : i64
%12 = llvm.mlir.constant(dense<0> : vector<4xi32>) : vector<4xi32>
%13 = llvm.mlir.constant(2 : index) : i64
%14 = llvm.mlir.constant(3 : index) : i64
%15 = llvm.mlir.constant(dense<0> : vector<1xi32>) : vector<1xi32>
%16 = llvm.mlir.constant(dense<0> : vector<1x1x4xi32>) : !llvm.array<1 x array<1 x vector<4xi32>>>
%17 = llvm.mlir.constant(4 : index) : i64
%18 = llvm.mlir.constant(1 : index) : i64
%19 = llvm.mlir.constant(32 : index) : i64
%20 = llvm.mlir.constant(0 : index) : i64
// First buffer: load the dispatch state, take field 10 (a ptr<ptr<i8>> table), load its first
// entry and view it as i32* (%24). Presumably this is binding 0, the read-only input - TODO confirm.
%21 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>>
%22 = llvm.extractvalue %21[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>
%23 = llvm.load %22 : !llvm.ptr<ptr<i8>>
%24 = llvm.bitcast %23 : !llvm.ptr<i8> to !llvm.ptr<i32>
// Assume (ptr & 63) == 0, i.e. the low 6 address bits of %24 are zero.
%25 = llvm.ptrtoint %24 : !llvm.ptr<i32> to i64
%26 = llvm.and %25, %7 : i64
%27 = llvm.icmp "eq" %26, %20 : i64
"llvm.intr.assume"(%27) : (i1) -> ()
// Second buffer: same sequence through entry [1] of the field-10 table, viewed as i32* (%32).
// Presumably binding 1, the output - TODO confirm.
%28 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>>
%29 = llvm.extractvalue %28[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>
%30 = llvm.getelementptr %29[1] : (!llvm.ptr<ptr<i8>>) -> !llvm.ptr<ptr<i8>>
%31 = llvm.load %30 : !llvm.ptr<ptr<i8>>
%32 = llvm.bitcast %31 : !llvm.ptr<i8> to !llvm.ptr<i32>
// Same (ptr & 63) == 0 assumption for %32.
%33 = llvm.ptrtoint %32 : !llvm.ptr<i32> to i64
%34 = llvm.and %33, %7 : i64
%35 = llvm.icmp "eq" %34, %20 : i64
"llvm.intr.assume"(%35) : (i1) -> ()
// Workgroup state fields 0 and 1, zero-extended to i64 (%38, %40).
// Presumably the workgroup id along x and y - TODO confirm against the struct definition.
%36 = llvm.load %arg2 : !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>>
%37 = llvm.extractvalue %36[0] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>
%38 = llvm.zext %37 : i32 to i64
%39 = llvm.extractvalue %36[1] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>
%40 = llvm.zext %39 : i32 to i64
// Tile origins: %41 = field1 * 32, %42 = field0 * 32.
%41 = llvm.mul %40, %19 : i64
%42 = llvm.mul %38, %19 : i64
// %46 = %41 * 256 + %42: base element offset later used to index %32.
%43 = llvm.mul %41, %9 : i64
%44 = llvm.add %43, %20 : i64
%45 = llvm.mul %42, %18 : i64
%46 = llvm.add %44, %45 : i64
// %52 = %41 * 32768 + %42 * 128 (+ 0): base element offset later used to index %24.
%47 = llvm.mul %41, %10 : i64
%48 = llvm.add %47, %20 : i64
%49 = llvm.mul %42, %8 : i64
%50 = llvm.add %48, %49 : i64
%51 = llvm.mul %11, %18 : i64
%52 = llvm.add %50, %51 : i64
llvm.br ^bb1(%20 : i64)
// Loop 1 header: %53 runs from 0 while %53 < 32; incremented by 1 in ^bb10.
^bb1(%53: i64): // 2 preds: ^bb0, ^bb10
%54 = llvm.icmp "slt" %53, %19 : i64
llvm.cond_br %54, ^bb2(%20 : i64), ^bb11
// Loop 2 header: %55 runs from 0 while %55 < 32; incremented by 4 in ^bb9.
^bb2(%55: i64): // 2 preds: ^bb1, ^bb9
%56 = llvm.icmp "slt" %55, %19 : i64
llvm.cond_br %56, ^bb3, ^bb10
// Per-(%53, %55) offsets: %62 = %52 + %53 * 32768 + %55 * 128 (for %24),
// %66 = %46 + %53 * 256 + %55 (for %32). %61 is 0 * 1.
^bb3: // pred: ^bb2
%57 = llvm.mul %53, %3 : i64
%58 = llvm.add %52, %57 : i64
%59 = llvm.mul %55, %4 : i64
%60 = llvm.add %58, %59 : i64
%61 = llvm.mul %11, %6 : i64
%62 = llvm.add %60, %61 : i64
%63 = llvm.mul %53, %5 : i64
%64 = llvm.add %46, %63 : i64
%65 = llvm.mul %55, %6 : i64
%66 = llvm.add %64, %65 : i64
llvm.br ^bb4(%20 : i64)
// Loop 3 header: %67 runs from 0 while %67 < 4; incremented by 1 in ^bb8.
^bb4(%67: i64): // 2 preds: ^bb3, ^bb8
%68 = llvm.icmp "slt" %67, %17 : i64
llvm.cond_br %68, ^bb5, ^bb9
// %73 = %62 + %67 * 128 (the other added terms are zero); then enter the innermost loop
// with counter 0 and the all-zero accumulator %16.
^bb5: // pred: ^bb4
%69 = llvm.mul %11, %3 : i64
%70 = llvm.add %62, %69 : i64
%71 = llvm.mul %67, %4 : i64
%72 = llvm.add %70, %71 : i64
%73 = llvm.add %72, %61 : i64
llvm.br ^bb6(%20, %16 : i64, !llvm.array<1 x array<1 x vector<4xi32>>>)
// Loop 4 header: %74 runs from 0 while %74 < 32 (step 1); %75 is the loop-carried accumulator.
^bb6(%74: i64, %75: !llvm.array<1 x array<1 x vector<4xi32>>>): // 2 preds: ^bb5, ^bb7
%76 = llvm.icmp "slt" %74, %19 : i64
llvm.cond_br %76, ^bb7, ^bb8
// Innermost body: element index %83 = %73 + %74 * 4 (the %20-based terms are zero).
^bb7: // pred: ^bb6
%77 = llvm.mul %20, %10 : i64
%78 = llvm.add %73, %77 : i64
%79 = llvm.mul %20, %8 : i64
%80 = llvm.add %78, %79 : i64
%81 = llvm.mul %74, %17 : i64
%82 = llvm.add %80, %81 : i64
%83 = llvm.add %82, %20 : i64
// Load 4 consecutive i32 values from %24 as a vector (declared alignment 4).
%84 = llvm.getelementptr %24[%83] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32>
%85 = llvm.bitcast %84 : !llvm.ptr<i32> to !llvm.ptr<vector<4xi32>>
%86 = llvm.load %85 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>>
// Lane-by-lane add of the loaded vector (%86) and the accumulator vector (%87); lanes 0..3
// are extracted, added as scalars, and inserted into a fresh vector, ending in %103.
%87 = llvm.extractvalue %75[0, 0] : !llvm.array<1 x array<1 x vector<4xi32>>>
%88 = llvm.extractelement %87[%11 : i64] : vector<4xi32>
%89 = llvm.extractelement %86[%11 : i64] : vector<4xi32>
%90 = llvm.add %89, %88 : i32
%91 = llvm.insertelement %90, %12[%20 : i64] : vector<4xi32>
%92 = llvm.extractelement %87[%6 : i64] : vector<4xi32>
%93 = llvm.extractelement %86[%6 : i64] : vector<4xi32>
%94 = llvm.add %93, %92 : i32
%95 = llvm.insertelement %94, %91[%18 : i64] : vector<4xi32>
%96 = llvm.extractelement %87[%2 : i64] : vector<4xi32>
%97 = llvm.extractelement %86[%2 : i64] : vector<4xi32>
%98 = llvm.add %97, %96 : i32
%99 = llvm.insertelement %98, %95[%13 : i64] : vector<4xi32>
%100 = llvm.extractelement %87[%1 : i64] : vector<4xi32>
%101 = llvm.extractelement %86[%1 : i64] : vector<4xi32>
%102 = llvm.add %101, %100 : i32
%103 = llvm.insertelement %102, %99[%14 : i64] : vector<4xi32>
// Copy the four lanes of %103 one at a time into a 1x1x4 array value; %116 holds the
// complete new accumulator that is passed back to ^bb6.
%104 = llvm.extractelement %103[%11 : i64] : vector<4xi32>
%105 = llvm.extractvalue %16[0, 0] : !llvm.array<1 x array<1 x vector<4xi32>>>
%106 = llvm.insertelement %104, %105[%11 : i64] : vector<4xi32>
%107 = llvm.insertvalue %106, %16[0, 0] : !llvm.array<1 x array<1 x vector<4xi32>>>
%108 = llvm.extractelement %103[%6 : i64] : vector<4xi32>
%109 = llvm.insertelement %108, %106[%6 : i64] : vector<4xi32>
%110 = llvm.insertvalue %109, %107[0, 0] : !llvm.array<1 x array<1 x vector<4xi32>>>
%111 = llvm.extractelement %103[%2 : i64] : vector<4xi32>
%112 = llvm.insertelement %111, %109[%2 : i64] : vector<4xi32>
%113 = llvm.insertvalue %112, %110[0, 0] : !llvm.array<1 x array<1 x vector<4xi32>>>
%114 = llvm.extractelement %103[%1 : i64] : vector<4xi32>
%115 = llvm.insertelement %114, %112[%1 : i64] : vector<4xi32>
%116 = llvm.insertvalue %115, %113[0, 0] : !llvm.array<1 x array<1 x vector<4xi32>>>
%117 = llvm.add %74, %18 : i64
llvm.br ^bb6(%117, %116 : i64, !llvm.array<1 x array<1 x vector<4xi32>>>)
// Innermost loop exit: element index %120 = %66 + %67 into %32. The existing i32 is loaded,
// the horizontal sum of the accumulator is added to it, and the result is stored back.
// NOTE(review): this reads the output element before writing it - confirm it is initialized.
^bb8: // pred: ^bb6
%118 = llvm.mul %20, %9 : i64
%119 = llvm.add %66, %118 : i64
%120 = llvm.add %119, %67 : i64
%121 = llvm.getelementptr %32[%120] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32>
%122 = llvm.load %121 : !llvm.ptr<i32>
%123 = llvm.extractvalue %75[0, 0] : !llvm.array<1 x array<1 x vector<4xi32>>>
%124 = "llvm.intr.vector.reduce.add"(%123) : (vector<4xi32>) -> i32
%125 = llvm.add %122, %124 : i32
// The sum is round-tripped through a vector<1xi32> (insert at 0, extract at 0) before the store.
%126 = llvm.insertelement %125, %15[%20 : i64] : vector<1xi32>
%127 = llvm.extractelement %126[%11 : i64] : vector<1xi32>
llvm.store %127, %121 : !llvm.ptr<i32>
%128 = llvm.add %67, %18 : i64
llvm.br ^bb4(%128 : i64)
// Loop 2 latch: %55 += 4.
^bb9: // pred: ^bb4
%129 = llvm.add %55, %17 : i64
llvm.br ^bb2(%129 : i64)
// Loop 1 latch: %53 += 1.
^bb10: // pred: ^bb2
%130 = llvm.add %53, %18 : i64
llvm.br ^bb1(%130 : i64)
// Exit: return the i32 constant 0.
^bb11: // pred: ^bb1
llvm.return %0 : i32
}
}
}
}
func.func private @_split_reduction_pass2() {
%c1048576 = arith.constant 1048576 : index
%c0 = arith.constant 0 : index
%c524288 = arith.constant 524288 : index
%c67108864 = arith.constant 67108864 : index
%c1_i32 = arith.constant 1 : i32
%c128_i32 = arith.constant 128 : i32
%c512 = arith.constant 512 : index
%c256 = arith.constant 256 : index
%c1 = arith.constant 1 : index
%device = hal.ex.shared_device : !hal.device
%allocator = hal.device.allocator<%device : !hal.device> : !hal.allocator
%buffer = hal.allocator.allocate<%allocator : !hal.allocator> type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c67108864}
%device_0 = hal.ex.shared_device : !hal.device
%cmd = hal.command_buffer.create device(%device_0 : !hal.device) mode("OneShot|AllowInlineExecution") categories(Transfer) : !hal.command_buffer
hal.command_buffer.fill_buffer<%cmd : !hal.command_buffer> target(%buffer : !hal.buffer)[%c0, %c67108864] pattern(%c1_i32 : i32)
hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None")
hal.command_buffer.finalize<%cmd : !hal.command_buffer>
%0 = util.null : !hal.fence
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%c-1_i64 = arith.constant -1 : i64
%c-1_i32 = arith.constant -1 : i32
%status = hal.fence.await until([%0]) timeout_millis(%c-1_i32) : i32
hal.device.queue.execute<%device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) commands([%cmd])
%c-1_i32_1 = arith.constant -1 : i32
%status_2 = hal.fence.await until([%fence]) timeout_millis(%c-1_i32_1) : i32
util.status.check_ok %status_2, "failed to wait on timepoint"
%1 = util.do_not_optimize(%buffer) : !hal.buffer
%len = hal.buffer.length<%1 : !hal.buffer> : index
%device_3 = hal.ex.shared_device : !hal.device
%allocator_4 = hal.device.allocator<%device_3 : !hal.device> : !hal.allocator
%buffer_5 = hal.allocator.allocate<%allocator_4 : !hal.allocator> type("HostVisible|DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage|MappingScoped|MappingAccessRandom|Mapping") : !hal.buffer{%c1048576}
%device_6 = hal.ex.shared_device : !hal.device
%cmd_7 = hal.command_buffer.create device(%device_6 : !hal.device) mode("OneShot|AllowInlineExecution") categories("Transfer|Dispatch") : !hal.command_buffer
%2 = hal.command_buffer.device<%cmd_7 : !hal.command_buffer> : !hal.device
hal.device.switch<%2 : !hal.device>
#hal.device.match.executable.format<"embedded-elf-x86_64"> {
%pipeline_layout = hal.pipeline_layout.lookup device(%2 : !hal.device) layout(#pipeline_layout) : !hal.pipeline_layout
%c0_26 = arith.constant 0 : index
%c1_27 = arith.constant 1 : index
%c0_28 = arith.constant 0 : index
hal.command_buffer.push_descriptor_set<%cmd_7 : !hal.command_buffer> layout(%pipeline_layout : !hal.pipeline_layout)[%c0_28] bindings([
%c0_26 = (%1 : !hal.buffer)[%c0, %len],
%c1_27 = (%buffer_5 : !hal.buffer)[%c0, %c1048576]
])
%c1_29 = arith.constant 1 : index
%c16 = arith.constant 16 : index
%c8 = arith.constant 8 : index
hal.command_buffer.dispatch.symbol<%cmd_7 : !hal.command_buffer> target(@_split_reduction_pass2_dispatch_0::@embedded_elf_x86_64::@_split_reduction_pass2_dispatch_0_generic_512x256x128) workgroups([%c8, %c16, %c1_29])
hal.return
}
hal.command_buffer.fill_buffer<%cmd_7 : !hal.command_buffer> target(%buffer_5 : !hal.buffer)[%c524288, %c524288] pattern(%c128_i32 : i32)
hal.command_buffer.execution_barrier<%cmd_7 : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None")
hal.command_buffer.finalize<%cmd_7 : !hal.command_buffer>
%3 = util.null : !hal.fence
%fence_8 = hal.fence.create device(%device_6 : !hal.device) flags("None") : !hal.fence
%c-1_i64_9 = arith.constant -1 : i64
%c-1_i32_10 = arith.constant -1 : i32
%status_11 = hal.fence.await until([%3]) timeout_millis(%c-1_i32_10) : i32
hal.device.queue.execute<%device_6 : !hal.device> affinity(%c-1_i64_9) wait(%3) signal(%fence_8) commands([%cmd_7])
%c-1_i32_12 = arith.constant -1 : i32
%status_13 = hal.fence.await until([%fence_8]) timeout_millis(%c-1_i32_12) : i32
util.status.check_ok %status_13, "failed to wait on timepoint"
%buffer_14 = hal.buffer.subspan<%buffer_5 : !hal.buffer>[%c0, %c524288] : !hal.buffer
%buffer_15 = hal.buffer.subspan<%buffer_5 : !hal.buffer>[%c524288, %c524288] : !hal.buffer
%c512_16 = arith.constant 512 : index
%c256_17 = arith.constant 256 : index
%c0_18 = arith.constant 0 : index
%c268435488_i32 = arith.constant 268435488 : i32
%c1_i32_19 = arith.constant 1 : i32
%view = hal.buffer_view.create buffer(%buffer_14 : !hal.buffer)[%c0_18, %c524288] shape([%c512_16, %c256_17]) type(%c268435488_i32) encoding(%c1_i32_19) : !hal.buffer_view
%c512_20 = arith.constant 512 : index
%c256_21 = arith.constant 256 : index
%c0_22 = arith.constant 0 : index
%c268435488_i32_23 = arith.constant 268435488 : i32
%c1_i32_24 = arith.constant 1 : i32
%view_25 = hal.buffer_view.create buffer(%buffer_15 : !hal.buffer)[%c0_22, %c524288] shape([%c512_20, %c256_21]) type(%c268435488_i32_23) encoding(%c1_i32_24) : !hal.buffer_view
check.expect_eq(%view, %view_25) : !hal.buffer_view
return
}
}
// -----// IR Dump After Canonicalizer (canonicalize) ('builtin.module' operation) //----- //
// Attribute aliases referenced by the module below:
//   #device_target_llvm_cpu                  - the "llvm-cpu" device target and its executable targets.
//   #executable_target_embedded_elf_x86_64_  - the "embedded-elf-x86_64" executable target.
//   #pipeline_layout                         - no push constants; set 0 has binding 0 (storage buffer,
//                                              ReadOnly) and binding 1 (storage buffer).
//   #translation                             - codegen translation info (CPUDoubleTilingExpert).
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
// Public entry point marked iree.abi.stub: takes no arguments, returns nothing,
// and only forwards to the private @_split_reduction_pass2 function.
func.func @split_reduction_pass2() attributes {iree.abi.stub} {
call @_split_reduction_pass2() : () -> ()
return
}
// Executable holding the single dispatch entry point used by the host function,
// with one variant for the embedded-elf-x86_64 executable target.
hal.executable private @_split_reduction_pass2_dispatch_0 {
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
// Export at ordinal 0. Its region does not use its block arguments and returns
// the constant triple (8, 16, 1) - the same values the host-side dispatch passes
// in workgroups([...]).
hal.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 ordinal(0) layout(#pipeline_layout) attributes {translation_info = #translation} {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index):
%c1 = arith.constant 1 : index
%c16 = arith.constant 16 : index
%c8 = arith.constant 8 : index
hal.return %c8, %c16, %c1 : index, index, index
}
builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} {
// Dispatch body in the LLVM dialect. %arg0/%arg1/%arg2 point at the environment,
// dispatch-state and workgroup-state structs; only %arg1 and %arg2 are read.
// Always returns i32 0.
llvm.func internal @_split_reduction_pass2_dispatch_0_generic_512x256x128(%arg0: !llvm.ptr<struct<"iree_hal_executable_environment_v0_t", (ptr<i32>, ptr<func<i32 (ptr<func<i32 (ptr<i8>)>>, ptr<i8>)>>, ptr<ptr<func<i32 (ptr<i8>)>>>, struct<"iree_hal_processor_v0_t", (array<8 x i64>)>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>> {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {sym_visibility = "private"} {
// Constants. Several values exist twice (e.g. %6 and %18 are both 1, %11 and %20
// are both 0) because they were materialized from different source attribute types.
%0 = llvm.mlir.constant(0 : i32) : i32
%1 = llvm.mlir.constant(3 : i64) : i64
%2 = llvm.mlir.constant(2 : i64) : i64
%3 = llvm.mlir.constant(32768 : i64) : i64
%4 = llvm.mlir.constant(128 : i64) : i64
%5 = llvm.mlir.constant(256 : i64) : i64
%6 = llvm.mlir.constant(1 : i64) : i64
%7 = llvm.mlir.constant(63 : index) : i64
%8 = llvm.mlir.constant(128 : index) : i64
%9 = llvm.mlir.constant(256 : index) : i64
%10 = llvm.mlir.constant(32768 : index) : i64
%11 = llvm.mlir.constant(0 : i64) : i64
%12 = llvm.mlir.constant(dense<0> : vector<4xi32>) : vector<4xi32>
%13 = llvm.mlir.constant(2 : index) : i64
%14 = llvm.mlir.constant(3 : index) : i64
%15 = llvm.mlir.constant(dense<0> : vector<1xi32>) : vector<1xi32>
%16 = llvm.mlir.constant(dense<0> : vector<1x1x4xi32>) : !llvm.array<1 x array<1 x vector<4xi32>>>
%17 = llvm.mlir.constant(4 : index) : i64
%18 = llvm.mlir.constant(1 : index) : i64
%19 = llvm.mlir.constant(32 : index) : i64
%20 = llvm.mlir.constant(0 : index) : i64
// Field 10 of the dispatch state is a table of i8 pointers. Entry 0 is viewed as
// i32* (%24) and assumed to have its low six address bits clear (mask 63).
// Presumably entry 0 is the buffer bound at binding 0 - confirm against the HAL ABI.
%21 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>>
%22 = llvm.extractvalue %21[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>
%23 = llvm.load %22 : !llvm.ptr<ptr<i8>>
%24 = llvm.bitcast %23 : !llvm.ptr<i8> to !llvm.ptr<i32>
%25 = llvm.ptrtoint %24 : !llvm.ptr<i32> to i64
%26 = llvm.and %25, %7 : i64
%27 = llvm.icmp "eq" %26, %20 : i64
"llvm.intr.assume"(%27) : (i1) -> ()
// Same for entry 1 of the table (%32), with the same alignment assumption.
%28 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>>
%29 = llvm.extractvalue %28[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>
%30 = llvm.getelementptr %29[1] : (!llvm.ptr<ptr<i8>>) -> !llvm.ptr<ptr<i8>>
%31 = llvm.load %30 : !llvm.ptr<ptr<i8>>
%32 = llvm.bitcast %31 : !llvm.ptr<i8> to !llvm.ptr<i32>
%33 = llvm.ptrtoint %32 : !llvm.ptr<i32> to i64
%34 = llvm.and %33, %7 : i64
%35 = llvm.icmp "eq" %34, %20 : i64
"llvm.intr.assume"(%35) : (i1) -> ()
// Fields 0 and 1 of the workgroup state, zero-extended to i64 (%38, %40).
// Presumably the workgroup id along x and y - confirm against the HAL ABI.
%36 = llvm.load %arg2 : !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>>
%37 = llvm.extractvalue %36[0] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>
%38 = llvm.zext %37 : i32 to i64
%39 = llvm.extractvalue %36[1] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>
%40 = llvm.zext %39 : i32 to i64
// Per-workgroup base element offsets, with %41 = field1 * 32 and %42 = field0 * 32:
//   %46 = %41 * 256   + %42         (used to index %32, the entry-1 pointer)
//   %52 = %41 * 32768 + %42 * 128   (used to index %24, the entry-0 pointer)
%41 = llvm.mul %40, %19 : i64
%42 = llvm.mul %38, %19 : i64
%43 = llvm.mul %41, %9 : i64
%44 = llvm.add %43, %20 : i64
%45 = llvm.mul %42, %18 : i64
%46 = llvm.add %44, %45 : i64
%47 = llvm.mul %41, %10 : i64
%48 = llvm.add %47, %20 : i64
%49 = llvm.mul %42, %8 : i64
%50 = llvm.add %48, %49 : i64
%51 = llvm.mul %11, %18 : i64
%52 = llvm.add %50, %51 : i64
llvm.br ^bb1(%20 : i64)
// Loop 1: %53 counts 0..31 in steps of 1 (incremented in ^bb10).
^bb1(%53: i64): // 2 preds: ^bb0, ^bb10
%54 = llvm.icmp "slt" %53, %19 : i64
llvm.cond_br %54, ^bb2(%20 : i64), ^bb11
// Loop 2: %55 counts 0..31 in steps of 4 (incremented in ^bb9).
^bb2(%55: i64): // 2 preds: ^bb1, ^bb9
%56 = llvm.icmp "slt" %55, %19 : i64
llvm.cond_br %56, ^bb3, ^bb10
// Offsets for the current (%53, %55) pair:
//   %62 = %52 + %53 * 32768 + %55 * 128   (entry-0 side)
//   %66 = %46 + %53 * 256   + %55         (entry-1 side)
^bb3: // pred: ^bb2
%57 = llvm.mul %53, %3 : i64
%58 = llvm.add %52, %57 : i64
%59 = llvm.mul %55, %4 : i64
%60 = llvm.add %58, %59 : i64
%61 = llvm.mul %11, %6 : i64
%62 = llvm.add %60, %61 : i64
%63 = llvm.mul %53, %5 : i64
%64 = llvm.add %46, %63 : i64
%65 = llvm.mul %55, %6 : i64
%66 = llvm.add %64, %65 : i64
llvm.br ^bb4(%20 : i64)
// Loop 3: %67 counts 0..3 in steps of 1 (incremented in ^bb8).
^bb4(%67: i64): // 2 preds: ^bb3, ^bb8
%68 = llvm.icmp "slt" %67, %17 : i64
llvm.cond_br %68, ^bb5, ^bb9
// %73 = %62 + %67 * 128. Enter the innermost loop with counter 0 and the all-zero
// accumulator constant %16.
^bb5: // pred: ^bb4
%69 = llvm.mul %11, %3 : i64
%70 = llvm.add %62, %69 : i64
%71 = llvm.mul %67, %4 : i64
%72 = llvm.add %70, %71 : i64
%73 = llvm.add %72, %61 : i64
llvm.br ^bb6(%20, %16 : i64, !llvm.array<1 x array<1 x vector<4xi32>>>)
// Loop 4: %74 counts 0..31 in steps of 1; %75 carries the 4-lane i32 accumulator.
^bb6(%74: i64, %75: !llvm.array<1 x array<1 x vector<4xi32>>>): // 2 preds: ^bb5, ^bb7
%76 = llvm.icmp "slt" %74, %19 : i64
llvm.cond_br %76, ^bb7, ^bb8
// Loop 4 body: load four i32 values from %24 at element offset %73 + %74 * 4 and
// add them lane by lane into the accumulator (scalar extract / add / insert per
// lane), then repack the result into the nested array type carried by ^bb6.
^bb7: // pred: ^bb6
%77 = llvm.mul %20, %10 : i64
%78 = llvm.add %73, %77 : i64
%79 = llvm.mul %20, %8 : i64
%80 = llvm.add %78, %79 : i64
%81 = llvm.mul %74, %17 : i64
%82 = llvm.add %80, %81 : i64
%83 = llvm.add %82, %20 : i64
%84 = llvm.getelementptr %24[%83] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32>
%85 = llvm.bitcast %84 : !llvm.ptr<i32> to !llvm.ptr<vector<4xi32>>
%86 = llvm.load %85 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>>
%87 = llvm.extractvalue %75[0, 0] : !llvm.array<1 x array<1 x vector<4xi32>>>
%88 = llvm.extractelement %87[%11 : i64] : vector<4xi32>
%89 = llvm.extractelement %86[%11 : i64] : vector<4xi32>
%90 = llvm.add %89, %88 : i32
%91 = llvm.insertelement %90, %12[%20 : i64] : vector<4xi32>
%92 = llvm.extractelement %87[%6 : i64] : vector<4xi32>
%93 = llvm.extractelement %86[%6 : i64] : vector<4xi32>
%94 = llvm.add %93, %92 : i32
%95 = llvm.insertelement %94, %91[%18 : i64] : vector<4xi32>
%96 = llvm.extractelement %87[%2 : i64] : vector<4xi32>
%97 = llvm.extractelement %86[%2 : i64] : vector<4xi32>
%98 = llvm.add %97, %96 : i32
%99 = llvm.insertelement %98, %95[%13 : i64] : vector<4xi32>
%100 = llvm.extractelement %87[%1 : i64] : vector<4xi32>
%101 = llvm.extractelement %86[%1 : i64] : vector<4xi32>
%102 = llvm.add %101, %100 : i32
%103 = llvm.insertelement %102, %99[%14 : i64] : vector<4xi32>
%104 = llvm.extractelement %103[%11 : i64] : vector<4xi32>
%105 = llvm.extractvalue %16[0, 0] : !llvm.array<1 x array<1 x vector<4xi32>>>
%106 = llvm.insertelement %104, %105[%11 : i64] : vector<4xi32>
%107 = llvm.insertvalue %106, %16[0, 0] : !llvm.array<1 x array<1 x vector<4xi32>>>
%108 = llvm.extractelement %103[%6 : i64] : vector<4xi32>
%109 = llvm.insertelement %108, %106[%6 : i64] : vector<4xi32>
%110 = llvm.insertvalue %109, %107[0, 0] : !llvm.array<1 x array<1 x vector<4xi32>>>
%111 = llvm.extractelement %103[%2 : i64] : vector<4xi32>
%112 = llvm.insertelement %111, %109[%2 : i64] : vector<4xi32>
%113 = llvm.insertvalue %112, %110[0, 0] : !llvm.array<1 x array<1 x vector<4xi32>>>
%114 = llvm.extractelement %103[%1 : i64] : vector<4xi32>
%115 = llvm.insertelement %114, %112[%1 : i64] : vector<4xi32>
%116 = llvm.insertvalue %115, %113[0, 0] : !llvm.array<1 x array<1 x vector<4xi32>>>
%117 = llvm.add %74, %18 : i64
llvm.br ^bb6(%117, %116 : i64, !llvm.array<1 x array<1 x vector<4xi32>>>)
// Loop 4 exit: sum the four accumulator lanes, add the i32 currently stored at
// %32[%66 + %67], and store the total back to that same element.
// NOTE(review): the stored value is read before it is written, and nothing in this
// function initializes it. The host function in this module does not fill the first
// half of the buffer it binds at binding 1 before the dispatch - confirm whether
// accumulating into uninitialized contents is intended.
^bb8: // pred: ^bb6
%118 = llvm.mul %20, %9 : i64
%119 = llvm.add %66, %118 : i64
%120 = llvm.add %119, %67 : i64
%121 = llvm.getelementptr %32[%120] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32>
%122 = llvm.load %121 : !llvm.ptr<i32>
%123 = llvm.extractvalue %75[0, 0] : !llvm.array<1 x array<1 x vector<4xi32>>>
%124 = "llvm.intr.vector.reduce.add"(%123) : (vector<4xi32>) -> i32
%125 = llvm.add %122, %124 : i32
%126 = llvm.insertelement %125, %15[%20 : i64] : vector<1xi32>
%127 = llvm.extractelement %126[%11 : i64] : vector<1xi32>
llvm.store %127, %121 : !llvm.ptr<i32>
%128 = llvm.add %67, %18 : i64
llvm.br ^bb4(%128 : i64)
// Loop 2 latch: advance %55 by 4.
^bb9: // pred: ^bb4
%129 = llvm.add %55, %17 : i64
llvm.br ^bb2(%129 : i64)
// Loop 1 latch: advance %53 by 1.
^bb10: // pred: ^bb2
%130 = llvm.add %53, %18 : i64
llvm.br ^bb1(%130 : i64)
// All loops finished: return 0.
^bb11: // pred: ^bb1
llvm.return %0 : i32
}
}
}
}
// Host-side driver for the test: fills an input buffer with the i32 pattern 1,
// records and submits a dispatch of @_split_reduction_pass2_dispatch_0, fills the
// second half of a result buffer with the i32 pattern 128, and checks that the two
// halves of the result buffer compare equal.
func.func private @_split_reduction_pass2() {
%c268435488_i32 = arith.constant 268435488 : i32
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c-1_i32 = arith.constant -1 : i32
%c-1_i64 = arith.constant -1 : i64
%c1048576 = arith.constant 1048576 : index
%c0 = arith.constant 0 : index
%c524288 = arith.constant 524288 : index
%c67108864 = arith.constant 67108864 : index
%c1_i32 = arith.constant 1 : i32
%c128_i32 = arith.constant 128 : i32
%c512 = arith.constant 512 : index
%c256 = arith.constant 256 : index
%c1 = arith.constant 1 : index
// Input buffer of size 67108864 (= 512 * 256 * 128 * 4).
%device = hal.ex.shared_device : !hal.device
%allocator = hal.device.allocator<%device : !hal.device> : !hal.allocator
%buffer = hal.allocator.allocate<%allocator : !hal.allocator> type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c67108864}
// First command buffer (transfer only): fill the whole input buffer with 1.
%device_0 = hal.ex.shared_device : !hal.device
%cmd = hal.command_buffer.create device(%device_0 : !hal.device) mode("OneShot|AllowInlineExecution") categories(Transfer) : !hal.command_buffer
hal.command_buffer.fill_buffer<%cmd : !hal.command_buffer> target(%buffer : !hal.buffer)[%c0, %c67108864] pattern(%c1_i32 : i32)
hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None")
hal.command_buffer.finalize<%cmd : !hal.command_buffer>
// Submit with a null wait fence, then await the signal fence with timeout -1
// (presumably "no timeout" - confirm) and check the returned status.
%0 = util.null : !hal.fence
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
hal.device.queue.execute<%device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) commands([%cmd])
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.status.check_ok %status, "failed to wait on timepoint"
// The input buffer passes through util.do_not_optimize; its length is then queried
// from the result rather than reusing the constant.
%1 = util.do_not_optimize(%buffer) : !hal.buffer
%len = hal.buffer.length<%1 : !hal.buffer> : index
// Result buffer of size 1048576 (= 2 * 524288, with 524288 = 512 * 256 * 4).
%device_1 = hal.ex.shared_device : !hal.device
%allocator_2 = hal.device.allocator<%device_1 : !hal.device> : !hal.allocator
%buffer_3 = hal.allocator.allocate<%allocator_2 : !hal.allocator> type("HostVisible|DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage|MappingScoped|MappingAccessRandom|Mapping") : !hal.buffer{%c1048576}
// Second command buffer (transfer + dispatch).
%device_4 = hal.ex.shared_device : !hal.device
%cmd_5 = hal.command_buffer.create device(%device_4 : !hal.device) mode("OneShot|AllowInlineExecution") categories("Transfer|Dispatch") : !hal.command_buffer
// Region for devices matching the "embedded-elf-x86_64" executable format:
// bind the input at binding 0 and the entire result buffer at binding 1, then
// dispatch with workgroups (8, 16, 1).
hal.device.switch<%device_4 : !hal.device>
#hal.device.match.executable.format<"embedded-elf-x86_64"> {
%pipeline_layout = hal.pipeline_layout.lookup device(%device_4 : !hal.device) layout(#pipeline_layout) : !hal.pipeline_layout
hal.command_buffer.push_descriptor_set<%cmd_5 : !hal.command_buffer> layout(%pipeline_layout : !hal.pipeline_layout)[%c0] bindings([
%c0 = (%1 : !hal.buffer)[%c0, %len],
%c1 = (%buffer_3 : !hal.buffer)[%c0, %c1048576]
])
hal.command_buffer.dispatch.symbol<%cmd_5 : !hal.command_buffer> target(@_split_reduction_pass2_dispatch_0::@embedded_elf_x86_64::@_split_reduction_pass2_dispatch_0_generic_512x256x128) workgroups([%c8, %c16, %c1])
hal.return
}
// Fill the second half of the result buffer (offset 524288, length 524288) with 128.
// This is the only fill recorded against the result buffer; the first half is not
// filled in this function.
hal.command_buffer.fill_buffer<%cmd_5 : !hal.command_buffer> target(%buffer_3 : !hal.buffer)[%c524288, %c524288] pattern(%c128_i32 : i32)
hal.command_buffer.execution_barrier<%cmd_5 : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None")
hal.command_buffer.finalize<%cmd_5 : !hal.command_buffer>
// Submit and wait, same pattern as the first submission.
%2 = util.null : !hal.fence
%fence_6 = hal.fence.create device(%device_4 : !hal.device) flags("None") : !hal.fence
hal.device.queue.execute<%device_4 : !hal.device> affinity(%c-1_i64) wait(%2) signal(%fence_6) commands([%cmd_5])
%status_7 = hal.fence.await until([%fence_6]) timeout_millis(%c-1_i32) : i32
util.status.check_ok %status_7, "failed to wait on timepoint"
// Two 512x256 views over the first and second halves of the result buffer,
// created with identical type and encoding operands, then compared for equality.
%view = hal.buffer_view.create buffer(%buffer_3 : !hal.buffer)[%c0, %c524288] shape([%c512, %c256]) type(%c268435488_i32) encoding(%c1_i32) : !hal.buffer_view
%view_8 = hal.buffer_view.create buffer(%buffer_3 : !hal.buffer)[%c524288, %c524288] shape([%c512, %c256]) type(%c268435488_i32) encoding(%c1_i32) : !hal.buffer_view
check.expect_eq(%view, %view_8) : !hal.buffer_view
return
}
}
// -----// IR Dump After CSE (cse) ('builtin.module' operation) //----- //
// Attribute aliases referenced by the module below:
//   #device_target_llvm_cpu                  - the "llvm-cpu" device target and its executable targets.
//   #executable_target_embedded_elf_x86_64_  - the "embedded-elf-x86_64" executable target.
//   #pipeline_layout                         - no push constants; set 0 has binding 0 (storage buffer,
//                                              ReadOnly) and binding 1 (storage buffer).
//   #translation                             - codegen translation info (CPUDoubleTilingExpert).
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
// Public entry point marked iree.abi.stub: takes no arguments, returns nothing,
// and only forwards to the private @_split_reduction_pass2 function.
func.func @split_reduction_pass2() attributes {iree.abi.stub} {
call @_split_reduction_pass2() : () -> ()
return
}
// Executable holding the single dispatch entry point used by the host function,
// with one variant for the embedded-elf-x86_64 executable target.
hal.executable private @_split_reduction_pass2_dispatch_0 {
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
// Export at ordinal 0. Its region does not use its block arguments and returns
// the constant triple (8, 16, 1) - the same values the host-side dispatch passes
// in workgroups([...]).
hal.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 ordinal(0) layout(#pipeline_layout) attributes {translation_info = #translation} {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index):
%c1 = arith.constant 1 : index
%c16 = arith.constant 16 : index
%c8 = arith.constant 8 : index
hal.return %c8, %c16, %c1 : index, index, index
}
builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} {
// Dispatch body in the LLVM dialect. %arg0/%arg1/%arg2 point at the environment,
// dispatch-state and workgroup-state structs; only %arg1 and %arg2 are read.
// Always returns i32 0.
llvm.func internal @_split_reduction_pass2_dispatch_0_generic_512x256x128(%arg0: !llvm.ptr<struct<"iree_hal_executable_environment_v0_t", (ptr<i32>, ptr<func<i32 (ptr<func<i32 (ptr<i8>)>>, ptr<i8>)>>, ptr<ptr<func<i32 (ptr<i8>)>>>, struct<"iree_hal_processor_v0_t", (array<8 x i64>)>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>> {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {sym_visibility = "private"} {
// Constants. Several values exist twice (e.g. %6 and %18 are both 1, %11 and %20
// are both 0) because they were materialized from different source attribute types.
%0 = llvm.mlir.constant(0 : i32) : i32
%1 = llvm.mlir.constant(3 : i64) : i64
%2 = llvm.mlir.constant(2 : i64) : i64
%3 = llvm.mlir.constant(32768 : i64) : i64
%4 = llvm.mlir.constant(128 : i64) : i64
%5 = llvm.mlir.constant(256 : i64) : i64
%6 = llvm.mlir.constant(1 : i64) : i64
%7 = llvm.mlir.constant(63 : index) : i64
%8 = llvm.mlir.constant(128 : index) : i64
%9 = llvm.mlir.constant(256 : index) : i64
%10 = llvm.mlir.constant(32768 : index) : i64
%11 = llvm.mlir.constant(0 : i64) : i64
%12 = llvm.mlir.constant(dense<0> : vector<4xi32>) : vector<4xi32>
%13 = llvm.mlir.constant(2 : index) : i64
%14 = llvm.mlir.constant(3 : index) : i64
%15 = llvm.mlir.constant(dense<0> : vector<1xi32>) : vector<1xi32>
%16 = llvm.mlir.constant(dense<0> : vector<1x1x4xi32>) : !llvm.array<1 x array<1 x vector<4xi32>>>
%17 = llvm.mlir.constant(4 : index) : i64
%18 = llvm.mlir.constant(1 : index) : i64
%19 = llvm.mlir.constant(32 : index) : i64
%20 = llvm.mlir.constant(0 : index) : i64
// Field 10 of the dispatch state is a table of i8 pointers. Entry 0 is viewed as
// i32* (%24) and assumed to have its low six address bits clear (mask 63).
// Presumably entry 0 is the buffer bound at binding 0 - confirm against the HAL ABI.
%21 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>>
%22 = llvm.extractvalue %21[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>
%23 = llvm.load %22 : !llvm.ptr<ptr<i8>>
%24 = llvm.bitcast %23 : !llvm.ptr<i8> to !llvm.ptr<i32>
%25 = llvm.ptrtoint %24 : !llvm.ptr<i32> to i64
%26 = llvm.and %25, %7 : i64
%27 = llvm.icmp "eq" %26, %20 : i64
"llvm.intr.assume"(%27) : (i1) -> ()
// Same for entry 1 of the table (%32), with the same alignment assumption.
%28 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>>
%29 = llvm.extractvalue %28[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>
%30 = llvm.getelementptr %29[1] : (!llvm.ptr<ptr<i8>>) -> !llvm.ptr<ptr<i8>>
%31 = llvm.load %30 : !llvm.ptr<ptr<i8>>
%32 = llvm.bitcast %31 : !llvm.ptr<i8> to !llvm.ptr<i32>
%33 = llvm.ptrtoint %32 : !llvm.ptr<i32> to i64
%34 = llvm.and %33, %7 : i64
%35 = llvm.icmp "eq" %34, %20 : i64
"llvm.intr.assume"(%35) : (i1) -> ()
// Fields 0 and 1 of the workgroup state, zero-extended to i64 (%38, %40).
// Presumably the workgroup id along x and y - confirm against the HAL ABI.
%36 = llvm.load %arg2 : !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>>
%37 = llvm.extractvalue %36[0] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>
%38 = llvm.zext %37 : i32 to i64
%39 = llvm.extractvalue %36[1] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>
%40 = llvm.zext %39 : i32 to i64
// Per-workgroup base element offsets, with %41 = field1 * 32 and %42 = field0 * 32:
//   %46 = %41 * 256   + %42         (used to index %32, the entry-1 pointer)
//   %52 = %41 * 32768 + %42 * 128   (used to index %24, the entry-0 pointer)
%41 = llvm.mul %40, %19 : i64
%42 = llvm.mul %38, %19 : i64
%43 = llvm.mul %41, %9 : i64
%44 = llvm.add %43, %20 : i64
%45 = llvm.mul %42, %18 : i64
%46 = llvm.add %44, %45 : i64
%47 = llvm.mul %41, %10 : i64
%48 = llvm.add %47, %20 : i64
%49 = llvm.mul %42, %8 : i64
%50 = llvm.add %48, %49 : i64
%51 = llvm.mul %11, %18 : i64
%52 = llvm.add %50, %51 : i64
llvm.br ^bb1(%20 : i64)
// Loop 1: %53 counts 0..31 in steps of 1 (incremented in ^bb10).
^bb1(%53: i64): // 2 preds: ^bb0, ^bb10
%54 = llvm.icmp "slt" %53, %19 : i64
llvm.cond_br %54, ^bb2(%20 : i64), ^bb11
// Loop 2: %55 counts 0..31 in steps of 4 (incremented in ^bb9).
^bb2(%55: i64): // 2 preds: ^bb1, ^bb9
%56 = llvm.icmp "slt" %55, %19 : i64
llvm.cond_br %56, ^bb3, ^bb10
// Offsets for the current (%53, %55) pair:
//   %62 = %52 + %53 * 32768 + %55 * 128   (entry-0 side)
//   %66 = %46 + %53 * 256   + %55         (entry-1 side)
^bb3: // pred: ^bb2
%57 = llvm.mul %53, %3 : i64
%58 = llvm.add %52, %57 : i64
%59 = llvm.mul %55, %4 : i64
%60 = llvm.add %58, %59 : i64
%61 = llvm.mul %11, %6 : i64
%62 = llvm.add %60, %61 : i64
%63 = llvm.mul %53, %5 : i64
%64 = llvm.add %46, %63 : i64
%65 = llvm.mul %55, %6 : i64
%66 = llvm.add %64, %65 : i64
llvm.br ^bb4(%20 : i64)
// Loop 3: %67 counts 0..3 in steps of 1 (incremented in ^bb8).
^bb4(%67: i64): // 2 preds: ^bb3, ^bb8
%68 = llvm.icmp "slt" %67, %17 : i64
llvm.cond_br %68, ^bb5, ^bb9
// %73 = %62 + %67 * 128. Enter the innermost loop with counter 0 and the all-zero
// accumulator constant %16.
^bb5: // pred: ^bb4
%69 = llvm.mul %11, %3 : i64
%70 = llvm.add %62, %69 : i64
%71 = llvm.mul %67, %4 : i64
%72 = llvm.add %70, %71 : i64
%73 = llvm.add %72, %61 : i64
llvm.br ^bb6(%20, %16 : i64, !llvm.array<1 x array<1 x vector<4xi32>>>)
// Loop 4: %74 counts 0..31 in steps of 1; %75 carries the 4-lane i32 accumulator.
^bb6(%74: i64, %75: !llvm.array<1 x array<1 x vector<4xi32>>>): // 2 preds: ^bb5, ^bb7
%76 = llvm.icmp "slt" %74, %19 : i64
llvm.cond_br %76, ^bb7, ^bb8
// Loop 4 body: load four i32 values from %24 at element offset %73 + %74 * 4 and
// add them lane by lane into the accumulator (scalar extract / add / insert per
// lane), then repack the result into the nested array type carried by ^bb6.
^bb7: // pred: ^bb6
%77 = llvm.mul %20, %10 : i64
%78 = llvm.add %73, %77 : i64
%79 = llvm.mul %20, %8 : i64
%80 = llvm.add %78, %79 : i64
%81 = llvm.mul %74, %17 : i64
%82 = llvm.add %80, %81 : i64
%83 = llvm.add %82, %20 : i64
%84 = llvm.getelementptr %24[%83] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32>
%85 = llvm.bitcast %84 : !llvm.ptr<i32> to !llvm.ptr<vector<4xi32>>
%86 = llvm.load %85 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>>
%87 = llvm.extractvalue %75[0, 0] : !llvm.array<1 x array<1 x vector<4xi32>>>
%88 = llvm.extractelement %87[%11 : i64] : vector<4xi32>
%89 = llvm.extractelement %86[%11 : i64] : vector<4xi32>
%90 = llvm.add %89, %88 : i32
%91 = llvm.insertelement %90, %12[%20 : i64] : vector<4xi32>
%92 = llvm.extractelement %87[%6 : i64] : vector<4xi32>
%93 = llvm.extractelement %86[%6 : i64] : vector<4xi32>
%94 = llvm.add %93, %92 : i32
%95 = llvm.insertelement %94, %91[%18 : i64] : vector<4xi32>
%96 = llvm.extractelement %87[%2 : i64] : vector<4xi32>
%97 = llvm.extractelement %86[%2 : i64] : vector<4xi32>
%98 = llvm.add %97, %96 : i32
%99 = llvm.insertelement %98, %95[%13 : i64] : vector<4xi32>
%100 = llvm.extractelement %87[%1 : i64] : vector<4xi32>
%101 = llvm.extractelement %86[%1 : i64] : vector<4xi32>
%102 = llvm.add %101, %100 : i32
%103 = llvm.insertelement %102, %99[%14 : i64] : vector<4xi32>
%104 = llvm.extractelement %103[%11 : i64] : vector<4xi32>
%105 = llvm.extractvalue %16[0, 0] : !llvm.array<1 x array<1 x vector<4xi32>>>
%106 = llvm.insertelement %104, %105[%11 : i64] : vector<4xi32>
%107 = llvm.insertvalue %106, %16[0, 0] : !llvm.array<1 x array<1 x vector<4xi32>>>
%108 = llvm.extractelement %103[%6 : i64] : vector<4xi32>
%109 = llvm.insertelement %108, %106[%6 : i64] : vector<4xi32>
%110 = llvm.insertvalue %109, %107[0, 0] : !llvm.array<1 x array<1 x vector<4xi32>>>
%111 = llvm.extractelement %103[%2 : i64] : vector<4xi32>
%112 = llvm.insertelement %111, %109[%2 : i64] : vector<4xi32>
%113 = llvm.insertvalue %112, %110[0, 0] : !llvm.array<1 x array<1 x vector<4xi32>>>
%114 = llvm.extractelement %103[%1 : i64] : vector<4xi32>
%115 = llvm.insertelement %114, %112[%1 : i64] : vector<4xi32>
%116 = llvm.insertvalue %115, %113[0, 0] : !llvm.array<1 x array<1 x vector<4xi32>>>
%117 = llvm.add %74, %18 : i64
llvm.br ^bb6(%117, %116 : i64, !llvm.array<1 x array<1 x vector<4xi32>>>)
// Loop 4 exit: sum the four accumulator lanes, add the i32 currently stored at
// %32[%66 + %67], and store the total back to that same element.
// NOTE(review): the stored value is read before it is written, and nothing in this
// function initializes it. The host function in this module does not fill the first
// half of the buffer it binds at binding 1 before the dispatch - confirm whether
// accumulating into uninitialized contents is intended.
^bb8: // pred: ^bb6
%118 = llvm.mul %20, %9 : i64
%119 = llvm.add %66, %118 : i64
%120 = llvm.add %119, %67 : i64
%121 = llvm.getelementptr %32[%120] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32>
%122 = llvm.load %121 : !llvm.ptr<i32>
%123 = llvm.extractvalue %75[0, 0] : !llvm.array<1 x array<1 x vector<4xi32>>>
%124 = "llvm.intr.vector.reduce.add"(%123) : (vector<4xi32>) -> i32
%125 = llvm.add %122, %124 : i32
%126 = llvm.insertelement %125, %15[%20 : i64] : vector<1xi32>
%127 = llvm.extractelement %126[%11 : i64] : vector<1xi32>
llvm.store %127, %121 : !llvm.ptr<i32>
%128 = llvm.add %67, %18 : i64
llvm.br ^bb4(%128 : i64)
// Loop 2 latch: advance %55 by 4.
^bb9: // pred: ^bb4
%129 = llvm.add %55, %17 : i64
llvm.br ^bb2(%129 : i64)
// Loop 1 latch: advance %53 by 1.
^bb10: // pred: ^bb2
%130 = llvm.add %53, %18 : i64
llvm.br ^bb1(%130 : i64)
// All loops finished: return 0.
^bb11: // pred: ^bb1
llvm.return %0 : i32
}
}
}
}
// Host-side driver for the test: fills an input buffer with the i32 pattern 1,
// records and submits a dispatch of @_split_reduction_pass2_dispatch_0, fills the
// second half of a result buffer with the i32 pattern 128, and checks that the two
// halves of the result buffer compare equal. In this dump a single %device,
// %allocator and null fence %0 are shared by both submissions.
func.func private @_split_reduction_pass2() {
%c268435488_i32 = arith.constant 268435488 : i32
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c-1_i32 = arith.constant -1 : i32
%c-1_i64 = arith.constant -1 : i64
%c1048576 = arith.constant 1048576 : index
%c0 = arith.constant 0 : index
%c524288 = arith.constant 524288 : index
%c67108864 = arith.constant 67108864 : index
%c1_i32 = arith.constant 1 : i32
%c128_i32 = arith.constant 128 : i32
%c512 = arith.constant 512 : index
%c256 = arith.constant 256 : index
%c1 = arith.constant 1 : index
// Input buffer of size 67108864 (= 512 * 256 * 128 * 4).
%device = hal.ex.shared_device : !hal.device
%allocator = hal.device.allocator<%device : !hal.device> : !hal.allocator
%buffer = hal.allocator.allocate<%allocator : !hal.allocator> type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c67108864}
// First command buffer (transfer only): fill the whole input buffer with 1.
%cmd = hal.command_buffer.create device(%device : !hal.device) mode("OneShot|AllowInlineExecution") categories(Transfer) : !hal.command_buffer
hal.command_buffer.fill_buffer<%cmd : !hal.command_buffer> target(%buffer : !hal.buffer)[%c0, %c67108864] pattern(%c1_i32 : i32)
hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None")
hal.command_buffer.finalize<%cmd : !hal.command_buffer>
// Submit with a null wait fence, then await the signal fence with timeout -1
// (presumably "no timeout" - confirm) and check the returned status.
%0 = util.null : !hal.fence
%fence = hal.fence.create device(%device : !hal.device) flags("None") : !hal.fence
hal.device.queue.execute<%device : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) commands([%cmd])
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.status.check_ok %status, "failed to wait on timepoint"
// The input buffer passes through util.do_not_optimize; its length is then queried
// from the result rather than reusing the constant.
%1 = util.do_not_optimize(%buffer) : !hal.buffer
%len = hal.buffer.length<%1 : !hal.buffer> : index
// Result buffer of size 1048576 (= 2 * 524288, with 524288 = 512 * 256 * 4).
%buffer_0 = hal.allocator.allocate<%allocator : !hal.allocator> type("HostVisible|DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage|MappingScoped|MappingAccessRandom|Mapping") : !hal.buffer{%c1048576}
// Second command buffer (transfer + dispatch).
%cmd_1 = hal.command_buffer.create device(%device : !hal.device) mode("OneShot|AllowInlineExecution") categories("Transfer|Dispatch") : !hal.command_buffer
// Region for devices matching the "embedded-elf-x86_64" executable format:
// bind the input at binding 0 and the entire result buffer at binding 1, then
// dispatch with workgroups (8, 16, 1).
hal.device.switch<%device : !hal.device>
#hal.device.match.executable.format<"embedded-elf-x86_64"> {
%pipeline_layout = hal.pipeline_layout.lookup device(%device : !hal.device) layout(#pipeline_layout) : !hal.pipeline_layout
hal.command_buffer.push_descriptor_set<%cmd_1 : !hal.command_buffer> layout(%pipeline_layout : !hal.pipeline_layout)[%c0] bindings([
%c0 = (%1 : !hal.buffer)[%c0, %len],
%c1 = (%buffer_0 : !hal.buffer)[%c0, %c1048576]
])
hal.command_buffer.dispatch.symbol<%cmd_1 : !hal.command_buffer> target(@_split_reduction_pass2_dispatch_0::@embedded_elf_x86_64::@_split_reduction_pass2_dispatch_0_generic_512x256x128) workgroups([%c8, %c16, %c1])
hal.return
}
// Fill the second half of the result buffer (offset 524288, length 524288) with 128.
// This is the only fill recorded against the result buffer; the first half is not
// filled in this function.
hal.command_buffer.fill_buffer<%cmd_1 : !hal.command_buffer> target(%buffer_0 : !hal.buffer)[%c524288, %c524288] pattern(%c128_i32 : i32)
hal.command_buffer.execution_barrier<%cmd_1 : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None")
hal.command_buffer.finalize<%cmd_1 : !hal.command_buffer>
// Submit and wait, reusing the null fence %0 as the wait fence.
%fence_2 = hal.fence.create device(%device : !hal.device) flags("None") : !hal.fence
hal.device.queue.execute<%device : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence_2) commands([%cmd_1])
%status_3 = hal.fence.await until([%fence_2]) timeout_millis(%c-1_i32) : i32
util.status.check_ok %status_3, "failed to wait on timepoint"
// Two 512x256 views over the first and second halves of the result buffer,
// created with identical type and encoding operands, then compared for equality.
%view = hal.buffer_view.create buffer(%buffer_0 : !hal.buffer)[%c0, %c524288] shape([%c512, %c256]) type(%c268435488_i32) encoding(%c1_i32) : !hal.buffer_view
%view_4 = hal.buffer_view.create buffer(%buffer_0 : !hal.buffer)[%c524288, %c524288] shape([%c512, %c256]) type(%c268435488_i32) encoding(%c1_i32) : !hal.buffer_view
check.expect_eq(%view, %view_4) : !hal.buffer_view
return
}
}
// -----// IR Dump After mlir::iree_compiler::IREE::HAL::ResolveExportOrdinalsPass (iree-hal-resolve-export-ordinals) ('builtin.module' operation) //----- //
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
// Public entry point, tagged with the `iree.abi.stub` attribute. It takes no
// arguments, returns nothing, and only forwards to the private function
// @_split_reduction_pass2.
func.func @split_reduction_pass2() attributes {iree.abi.stub} {
call @_split_reduction_pass2() : () -> ()
return
}
// Device executable holding the single dispatch of this program. It has one
// variant (@embedded_elf_x86_64) with one export (ordinal 0) whose body is the
// LLVM-dialect function defined in the nested builtin.module.
hal.executable private @_split_reduction_pass2_dispatch_0 {
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
// Export region: ignores its block arguments and returns the constant triple
// (8, 16, 1). The host function dispatches with the same triple as its
// workgroup count.
hal.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 ordinal(0) layout(#pipeline_layout) attributes {translation_info = #translation} {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index):
%c1 = arith.constant 1 : index
%c16 = arith.constant 16 : index
%c8 = arith.constant 8 : index
hal.return %c8, %c16, %c1 : index, index, index
}
builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} {
// Kernel body. Arguments are pointers to the environment, dispatch-state and
// workgroup-state structs; the function always returns the i32 constant 0.
// Each invocation walks a 32x32 tile of output elements. For every output
// element it sums 128 consecutive i32 input values (32 loads of vector<4xi32>)
// and adds that sum to the value already stored at the output element.
llvm.func internal @_split_reduction_pass2_dispatch_0_generic_512x256x128(%arg0: !llvm.ptr<struct<"iree_hal_executable_environment_v0_t", (ptr<i32>, ptr<func<i32 (ptr<func<i32 (ptr<i8>)>>, ptr<i8>)>>, ptr<ptr<func<i32 (ptr<i8>)>>>, struct<"iree_hal_processor_v0_t", (array<8 x i64>)>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>> {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {sym_visibility = "private"} {
// Constants. Several values appear twice because they were materialized from
// both `i64` and `index` attributes.
%0 = llvm.mlir.constant(0 : i32) : i32
%1 = llvm.mlir.constant(3 : i64) : i64
%2 = llvm.mlir.constant(2 : i64) : i64
%3 = llvm.mlir.constant(32768 : i64) : i64
%4 = llvm.mlir.constant(128 : i64) : i64
%5 = llvm.mlir.constant(256 : i64) : i64
%6 = llvm.mlir.constant(1 : i64) : i64
%7 = llvm.mlir.constant(63 : index) : i64
%8 = llvm.mlir.constant(128 : index) : i64
%9 = llvm.mlir.constant(256 : index) : i64
%10 = llvm.mlir.constant(32768 : index) : i64
%11 = llvm.mlir.constant(0 : i64) : i64
%12 = llvm.mlir.constant(dense<0> : vector<4xi32>) : vector<4xi32>
%13 = llvm.mlir.constant(2 : index) : i64
%14 = llvm.mlir.constant(3 : index) : i64
%15 = llvm.mlir.constant(dense<0> : vector<1xi32>) : vector<1xi32>
%16 = llvm.mlir.constant(dense<0> : vector<1x1x4xi32>) : !llvm.array<1 x array<1 x vector<4xi32>>>
%17 = llvm.mlir.constant(4 : index) : i64
%18 = llvm.mlir.constant(1 : index) : i64
%19 = llvm.mlir.constant(32 : index) : i64
%20 = llvm.mlir.constant(0 : index) : i64
// Input base pointer (%24): entry 0 of the pointer array stored in field 10 of
// the dispatch state (presumably the binding table - TODO confirm against the
// executable library ABI). The assume states its low 6 bits are zero, i.e. the
// address is 64-byte aligned.
%21 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>>
%22 = llvm.extractvalue %21[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>
%23 = llvm.load %22 : !llvm.ptr<ptr<i8>>
%24 = llvm.bitcast %23 : !llvm.ptr<i8> to !llvm.ptr<i32>
%25 = llvm.ptrtoint %24 : !llvm.ptr<i32> to i64
%26 = llvm.and %25, %7 : i64
%27 = llvm.icmp "eq" %26, %20 : i64
"llvm.intr.assume"(%27) : (i1) -> ()
// Output base pointer (%32): entry 1 of the same pointer array, with the same
// 64-byte alignment assumption.
%28 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>>
%29 = llvm.extractvalue %28[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>
%30 = llvm.getelementptr %29[1] : (!llvm.ptr<ptr<i8>>) -> !llvm.ptr<ptr<i8>>
%31 = llvm.load %30 : !llvm.ptr<ptr<i8>>
%32 = llvm.bitcast %31 : !llvm.ptr<i8> to !llvm.ptr<i32>
%33 = llvm.ptrtoint %32 : !llvm.ptr<i32> to i64
%34 = llvm.and %33, %7 : i64
%35 = llvm.icmp "eq" %34, %20 : i64
"llvm.intr.assume"(%35) : (i1) -> ()
// Fields 0 and 1 of the workgroup state, zero-extended to i64 (presumably the
// workgroup id along x and y - TODO confirm). Each is scaled by the tile
// size 32: %42 = field0 * 32 (column origin), %41 = field1 * 32 (row origin).
%36 = llvm.load %arg2 : !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>>
%37 = llvm.extractvalue %36[0] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>
%38 = llvm.zext %37 : i32 to i64
%39 = llvm.extractvalue %36[1] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>
%40 = llvm.zext %39 : i32 to i64
%41 = llvm.mul %40, %19 : i64
%42 = llvm.mul %38, %19 : i64
// %46: element offset of this tile in the output = row origin * 256 + column origin.
%43 = llvm.mul %41, %9 : i64
%44 = llvm.add %43, %20 : i64
%45 = llvm.mul %42, %18 : i64
%46 = llvm.add %44, %45 : i64
// %52: element offset of this tile in the input = row origin * 32768 + column origin * 128.
%47 = llvm.mul %41, %10 : i64
%48 = llvm.add %47, %20 : i64
%49 = llvm.mul %42, %8 : i64
%50 = llvm.add %48, %49 : i64
%51 = llvm.mul %11, %18 : i64
%52 = llvm.add %50, %51 : i64
llvm.br ^bb1(%20 : i64)
// Loop 1: %53 = row within the tile, 0..31 step 1.
^bb1(%53: i64): // 2 preds: ^bb0, ^bb10
%54 = llvm.icmp "slt" %53, %19 : i64
llvm.cond_br %54, ^bb2(%20 : i64), ^bb11
// Loop 2: %55 = column within the tile, 0..31 step 4.
^bb2(%55: i64): // 2 preds: ^bb1, ^bb9
%56 = llvm.icmp "slt" %55, %19 : i64
llvm.cond_br %56, ^bb3, ^bb10
^bb3: // pred: ^bb2
// %62: input offset for (row %53, column %55); %66: matching output offset.
%57 = llvm.mul %53, %3 : i64
%58 = llvm.add %52, %57 : i64
%59 = llvm.mul %55, %4 : i64
%60 = llvm.add %58, %59 : i64
%61 = llvm.mul %11, %6 : i64
%62 = llvm.add %60, %61 : i64
%63 = llvm.mul %53, %5 : i64
%64 = llvm.add %46, %63 : i64
%65 = llvm.mul %55, %6 : i64
%66 = llvm.add %64, %65 : i64
llvm.br ^bb4(%20 : i64)
// Loop 3: %67 = column within the group of 4, 0..3 step 1.
^bb4(%67: i64): // 2 preds: ^bb3, ^bb8
%68 = llvm.icmp "slt" %67, %17 : i64
llvm.cond_br %68, ^bb5, ^bb9
^bb5: // pred: ^bb4
// %73: input offset of the first reduction element for the current output element.
%69 = llvm.mul %11, %3 : i64
%70 = llvm.add %62, %69 : i64
%71 = llvm.mul %67, %4 : i64
%72 = llvm.add %70, %71 : i64
%73 = llvm.add %72, %61 : i64
// Enter the reduction loop with a zero accumulator (%16).
llvm.br ^bb6(%20, %16 : i64, !llvm.array<1 x array<1 x vector<4xi32>>>)
// Loop 4 (reduction): %74 = 0..31 step 1; %75 carries the 4-lane partial sums.
^bb6(%74: i64, %75: !llvm.array<1 x array<1 x vector<4xi32>>>): // 2 preds: ^bb5, ^bb7
%76 = llvm.icmp "slt" %74, %19 : i64
llvm.cond_br %76, ^bb7, ^bb8
^bb7: // pred: ^bb6
// Address of the 4 input elements at %73 + %74 * 4. The vector load declares
// only 4-byte alignment.
%77 = llvm.mul %20, %10 : i64
%78 = llvm.add %73, %77 : i64
%79 = llvm.mul %20, %8 : i64
%80 = llvm.add %78, %79 : i64
%81 = llvm.mul %74, %17 : i64
%82 = llvm.add %80, %81 : i64
%83 = llvm.add %82, %20 : i64
%84 = llvm.getelementptr %24[%83] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32>
%85 = llvm.bitcast %84 : !llvm.ptr<i32> to !llvm.ptr<vector<4xi32>>
%86 = llvm.load %85 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>>
// Lane-by-lane add of the loaded vector (%86) and the accumulator (%87). The
// vector add is emitted here in scalarized form: extract, add, insert for
// each of lanes 0..3, producing %103.
%87 = llvm.extractvalue %75[0, 0] : !llvm.array<1 x array<1 x vector<4xi32>>>
%88 = llvm.extractelement %87[%11 : i64] : vector<4xi32>
%89 = llvm.extractelement %86[%11 : i64] : vector<4xi32>
%90 = llvm.add %89, %88 : i32
%91 = llvm.insertelement %90, %12[%20 : i64] : vector<4xi32>
%92 = llvm.extractelement %87[%6 : i64] : vector<4xi32>
%93 = llvm.extractelement %86[%6 : i64] : vector<4xi32>
%94 = llvm.add %93, %92 : i32
%95 = llvm.insertelement %94, %91[%18 : i64] : vector<4xi32>
%96 = llvm.extractelement %87[%2 : i64] : vector<4xi32>
%97 = llvm.extractelement %86[%2 : i64] : vector<4xi32>
%98 = llvm.add %97, %96 : i32
%99 = llvm.insertelement %98, %95[%13 : i64] : vector<4xi32>
%100 = llvm.extractelement %87[%1 : i64] : vector<4xi32>
%101 = llvm.extractelement %86[%1 : i64] : vector<4xi32>
%102 = llvm.add %101, %100 : i32
%103 = llvm.insertelement %102, %99[%14 : i64] : vector<4xi32>
// Repack %103 lane by lane into the nested array type carried by the loop;
// %116 ends up holding all four lanes of %103.
%104 = llvm.extractelement %103[%11 : i64] : vector<4xi32>
%105 = llvm.extractvalue %16[0, 0] : !llvm.array<1 x array<1 x vector<4xi32>>>
%106 = llvm.insertelement %104, %105[%11 : i64] : vector<4xi32>
%107 = llvm.insertvalue %106, %16[0, 0] : !llvm.array<1 x array<1 x vector<4xi32>>>
%108 = llvm.extractelement %103[%6 : i64] : vector<4xi32>
%109 = llvm.insertelement %108, %106[%6 : i64] : vector<4xi32>
%110 = llvm.insertvalue %109, %107[0, 0] : !llvm.array<1 x array<1 x vector<4xi32>>>
%111 = llvm.extractelement %103[%2 : i64] : vector<4xi32>
%112 = llvm.insertelement %111, %109[%2 : i64] : vector<4xi32>
%113 = llvm.insertvalue %112, %110[0, 0] : !llvm.array<1 x array<1 x vector<4xi32>>>
%114 = llvm.extractelement %103[%1 : i64] : vector<4xi32>
%115 = llvm.insertelement %114, %112[%1 : i64] : vector<4xi32>
%116 = llvm.insertvalue %115, %113[0, 0] : !llvm.array<1 x array<1 x vector<4xi32>>>
%117 = llvm.add %74, %18 : i64
llvm.br ^bb6(%117, %116 : i64, !llvm.array<1 x array<1 x vector<4xi32>>>)
^bb8: // pred: ^bb6
// Reduction finished for one output element: horizontally add the 4 lanes and
// accumulate into output[%66 + %67]. This is a read-modify-write: the current
// output value (%122) is loaded and added to the lane sum before the store.
// NOTE(review): the stored result therefore depends on the prior contents of
// the output buffer - confirm that the caller initializes it.
%118 = llvm.mul %20, %9 : i64
%119 = llvm.add %66, %118 : i64
%120 = llvm.add %119, %67 : i64
%121 = llvm.getelementptr %32[%120] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32>
%122 = llvm.load %121 : !llvm.ptr<i32>
%123 = llvm.extractvalue %75[0, 0] : !llvm.array<1 x array<1 x vector<4xi32>>>
%124 = "llvm.intr.vector.reduce.add"(%123) : (vector<4xi32>) -> i32
%125 = llvm.add %122, %124 : i32
%126 = llvm.insertelement %125, %15[%20 : i64] : vector<1xi32>
%127 = llvm.extractelement %126[%11 : i64] : vector<1xi32>
llvm.store %127, %121 : !llvm.ptr<i32>
%128 = llvm.add %67, %18 : i64
llvm.br ^bb4(%128 : i64)
^bb9: // pred: ^bb4
// Advance loop 2 by 4 columns.
%129 = llvm.add %55, %17 : i64
llvm.br ^bb2(%129 : i64)
^bb10: // pred: ^bb2
// Advance loop 1 by one row.
%130 = llvm.add %53, %18 : i64
llvm.br ^bb1(%130 : i64)
^bb11: // pred: ^bb1
// All rows of the tile processed: return 0.
llvm.return %0 : i32
}
}
}
}
// Host-side program. It (1) allocates a 67108864-byte buffer and fills it with
// the i32 pattern 1, (2) allocates a 1048576-byte buffer %buffer_0, records a
// dispatch of export ordinal 0 of @_split_reduction_pass2_dispatch_0 over
// 8x16x1 workgroups plus a fill of the second half of %buffer_0 with the i32
// pattern 128, and (3) compares the first and second halves of %buffer_0 as
// 512x256 buffer views with check.expect_eq.
func.func private @_split_reduction_pass2() {
// 268435488 is passed as the buffer-view element type (presumably the HAL
// encoding of i32 - TODO confirm).
%c268435488_i32 = arith.constant 268435488 : i32
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c-1_i32 = arith.constant -1 : i32
%c-1_i64 = arith.constant -1 : i64
%c1048576 = arith.constant 1048576 : index
%c0 = arith.constant 0 : index
%c524288 = arith.constant 524288 : index
%c67108864 = arith.constant 67108864 : index
%c1_i32 = arith.constant 1 : i32
%c128_i32 = arith.constant 128 : i32
%c512 = arith.constant 512 : index
%c256 = arith.constant 256 : index
%c1 = arith.constant 1 : index
%device = hal.ex.shared_device : !hal.device
%allocator = hal.device.allocator<%device : !hal.device> : !hal.allocator
// Input buffer: filled entirely with the i32 pattern 1 by a transfer-only
// command buffer.
%buffer = hal.allocator.allocate<%allocator : !hal.allocator> type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c67108864}
%cmd = hal.command_buffer.create device(%device : !hal.device) mode("OneShot|AllowInlineExecution") categories(Transfer) : !hal.command_buffer
hal.command_buffer.fill_buffer<%cmd : !hal.command_buffer> target(%buffer : !hal.buffer)[%c0, %c67108864] pattern(%c1_i32 : i32)
hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None")
hal.command_buffer.finalize<%cmd : !hal.command_buffer>
// Submit with a null wait fence, then block on the signal fence. The -1
// timeout presumably means "wait forever" - TODO confirm.
%0 = util.null : !hal.fence
%fence = hal.fence.create device(%device : !hal.device) flags("None") : !hal.fence
hal.device.queue.execute<%device : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) commands([%cmd])
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.status.check_ok %status, "failed to wait on timepoint"
// The input buffer is routed through util.do_not_optimize and its length is
// queried at runtime instead of reusing the constant.
%1 = util.do_not_optimize(%buffer) : !hal.buffer
%len = hal.buffer.length<%1 : !hal.buffer> : index
// %buffer_0 holds two 524288-byte halves: bytes [0, 524288) receive the
// dispatch result, bytes [524288, 1048576) receive the expected value.
// NOTE(review): no fill or copy targeting bytes [0, 524288) of %buffer_0 is
// recorded in this function before the dispatch, and the dispatch binds the
// whole buffer as binding 1 - confirm whether the kernel needs that region
// initialized.
%buffer_0 = hal.allocator.allocate<%allocator : !hal.allocator> type("HostVisible|DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage|MappingScoped|MappingAccessRandom|Mapping") : !hal.buffer{%c1048576}
%cmd_1 = hal.command_buffer.create device(%device : !hal.device) mode("OneShot|AllowInlineExecution") categories("Transfer|Dispatch") : !hal.command_buffer
// The dispatch is recorded only when the device matches the
// "embedded-elf-x86_64" executable format. Binding 0 is the input buffer
// [0, %len); binding 1 is %buffer_0 [0, 1048576).
hal.device.switch<%device : !hal.device>
#hal.device.match.executable.format<"embedded-elf-x86_64"> {
%pipeline_layout = hal.pipeline_layout.lookup device(%device : !hal.device) layout(#pipeline_layout) : !hal.pipeline_layout
hal.command_buffer.push_descriptor_set<%cmd_1 : !hal.command_buffer> layout(%pipeline_layout : !hal.pipeline_layout)[%c0] bindings([
%c0 = (%1 : !hal.buffer)[%c0, %len],
%c1 = (%buffer_0 : !hal.buffer)[%c0, %c1048576]
])
%2 = hal.command_buffer.device<%cmd_1 : !hal.command_buffer> : !hal.device
%exe = hal.executable.lookup device(%2 : !hal.device) executable(@_split_reduction_pass2_dispatch_0) : !hal.executable
hal.command_buffer.dispatch<%cmd_1 : !hal.command_buffer> target(%exe : !hal.executable)[0] workgroups([%c8, %c16, %c1])
hal.return
}
// Expected value: fill the second half of %buffer_0 with the i32 pattern 128.
hal.command_buffer.fill_buffer<%cmd_1 : !hal.command_buffer> target(%buffer_0 : !hal.buffer)[%c524288, %c524288] pattern(%c128_i32 : i32)
hal.command_buffer.execution_barrier<%cmd_1 : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None")
hal.command_buffer.finalize<%cmd_1 : !hal.command_buffer>
%fence_2 = hal.fence.create device(%device : !hal.device) flags("None") : !hal.fence
hal.device.queue.execute<%device : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence_2) commands([%cmd_1])
%status_3 = hal.fence.await until([%fence_2]) timeout_millis(%c-1_i32) : i32
util.status.check_ok %status_3, "failed to wait on timepoint"
// %view covers the result half, %view_4 the expected half; both are 512x256
// views that the check op compares.
%view = hal.buffer_view.create buffer(%buffer_0 : !hal.buffer)[%c0, %c524288] shape([%c512, %c256]) type(%c268435488_i32) encoding(%c1_i32) : !hal.buffer_view
%view_4 = hal.buffer_view.create buffer(%buffer_0 : !hal.buffer)[%c524288, %c524288] shape([%c512, %c256]) type(%c268435488_i32) encoding(%c1_i32) : !hal.buffer_view
check.expect_eq(%view, %view_4) : !hal.buffer_view
return
}
}
// -----// IR Dump After mlir::iree_compiler::IREE::HAL::MaterializeResourceCachesPass (iree-hal-materialize-resource-caches) ('builtin.module' operation) //----- //
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
// Module-level global holding the descriptor set layout. Its initializer
// creates the layout on the shared device with two storage-buffer bindings
// (binding 0 marked ReadOnly, binding 1 unmarked) and stores it in the global.
util.global private @_descriptor_set_layout_0 : !hal.descriptor_set_layout
util.initializer {
%device = hal.ex.shared_device : !hal.device
%descriptor_set_layout = hal.descriptor_set_layout.create device(%device : !hal.device) flags("None") bindings([#hal.descriptor_set.binding<0, storage_buffer, ReadOnly>, #hal.descriptor_set.binding<1, storage_buffer>]) : !hal.descriptor_set_layout
util.global.store %descriptor_set_layout, @_descriptor_set_layout_0 : !hal.descriptor_set_layout
util.initializer.return
}
// Module-level global holding the pipeline layout. Its initializer loads
// @_descriptor_set_layout_0, creates a pipeline layout with 0 push constants
// and that single set layout on the shared device, and stores it in the global.
util.global private @_pipeline_layout_0 : !hal.pipeline_layout
util.initializer {
%_descriptor_set_layout_0 = util.global.load @_descriptor_set_layout_0 : !hal.descriptor_set_layout
%device = hal.ex.shared_device : !hal.device
%pipeline_layout = hal.pipeline_layout.create device(%device : !hal.device) push_constants(0) layouts([%_descriptor_set_layout_0]) : !hal.pipeline_layout
util.global.store %pipeline_layout, @_pipeline_layout_0 : !hal.pipeline_layout
util.initializer.return
}
// Module-level global holding the loaded executable. The initializer switches
// on the device: when the "embedded-elf-x86_64" executable format matches, it
// creates the executable from the @embedded_elf_x86_64 variant using
// @_pipeline_layout_0; the `#hal.match.always` fallback yields a null
// executable. The selected value is stored in the global.
util.global private @_executable__split_reduction_pass2_dispatch_0 : !hal.executable
util.initializer {
%device = hal.ex.shared_device : !hal.device
%0 = hal.device.switch<%device : !hal.device> -> !hal.executable
#hal.device.match.executable.format<"embedded-elf-x86_64"> {
%_pipeline_layout_0 = util.global.load @_pipeline_layout_0 : !hal.pipeline_layout
%exe = hal.executable.create device(%device : !hal.device) target(@_split_reduction_pass2_dispatch_0::@embedded_elf_x86_64) layouts([%_pipeline_layout_0]) : !hal.executable
hal.return %exe : !hal.executable
},
#hal.match.always {
%1 = util.null : !hal.executable
hal.return %1 : !hal.executable
}
util.global.store %0, @_executable__split_reduction_pass2_dispatch_0 : !hal.executable
util.initializer.return
}
// Public entry point, tagged with the `iree.abi.stub` attribute. It takes no
// arguments, returns nothing, and only forwards to the private function
// @_split_reduction_pass2.
func.func @split_reduction_pass2() attributes {iree.abi.stub} {
call @_split_reduction_pass2() : () -> ()
return
}
// Device executable holding the single dispatch of this program. It has one
// variant (@embedded_elf_x86_64) with one export (ordinal 0) whose body is the
// LLVM-dialect function defined in the nested builtin.module.
hal.executable private @_split_reduction_pass2_dispatch_0 {
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
// Export region: ignores its block arguments and returns the constant triple
// (8, 16, 1). The host function dispatches with the same triple as its
// workgroup count.
hal.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 ordinal(0) layout(#pipeline_layout) attributes {translation_info = #translation} {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index):
%c1 = arith.constant 1 : index
%c16 = arith.constant 16 : index
%c8 = arith.constant 8 : index
hal.return %c8, %c16, %c1 : index, index, index
}
builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} {
// Kernel body. Arguments are pointers to the environment, dispatch-state and
// workgroup-state structs; the function always returns the i32 constant 0.
// Each invocation walks a 32x32 tile of output elements. For every output
// element it sums 128 consecutive i32 input values (32 loads of vector<4xi32>)
// and adds that sum to the value already stored at the output element.
llvm.func internal @_split_reduction_pass2_dispatch_0_generic_512x256x128(%arg0: !llvm.ptr<struct<"iree_hal_executable_environment_v0_t", (ptr<i32>, ptr<func<i32 (ptr<func<i32 (ptr<i8>)>>, ptr<i8>)>>, ptr<ptr<func<i32 (ptr<i8>)>>>, struct<"iree_hal_processor_v0_t", (array<8 x i64>)>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>> {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {sym_visibility = "private"} {
// Constants. Several values appear twice because they were materialized from
// both `i64` and `index` attributes.
%0 = llvm.mlir.constant(0 : i32) : i32
%1 = llvm.mlir.constant(3 : i64) : i64
%2 = llvm.mlir.constant(2 : i64) : i64
%3 = llvm.mlir.constant(32768 : i64) : i64
%4 = llvm.mlir.constant(128 : i64) : i64
%5 = llvm.mlir.constant(256 : i64) : i64
%6 = llvm.mlir.constant(1 : i64) : i64
%7 = llvm.mlir.constant(63 : index) : i64
%8 = llvm.mlir.constant(128 : index) : i64
%9 = llvm.mlir.constant(256 : index) : i64
%10 = llvm.mlir.constant(32768 : index) : i64
%11 = llvm.mlir.constant(0 : i64) : i64
%12 = llvm.mlir.constant(dense<0> : vector<4xi32>) : vector<4xi32>
%13 = llvm.mlir.constant(2 : index) : i64
%14 = llvm.mlir.constant(3 : index) : i64
%15 = llvm.mlir.constant(dense<0> : vector<1xi32>) : vector<1xi32>
%16 = llvm.mlir.constant(dense<0> : vector<1x1x4xi32>) : !llvm.array<1 x array<1 x vector<4xi32>>>
%17 = llvm.mlir.constant(4 : index) : i64
%18 = llvm.mlir.constant(1 : index) : i64
%19 = llvm.mlir.constant(32 : index) : i64
%20 = llvm.mlir.constant(0 : index) : i64
// Input base pointer (%24): entry 0 of the pointer array stored in field 10 of
// the dispatch state (presumably the binding table - TODO confirm against the
// executable library ABI). The assume states its low 6 bits are zero, i.e. the
// address is 64-byte aligned.
%21 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>>
%22 = llvm.extractvalue %21[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>
%23 = llvm.load %22 : !llvm.ptr<ptr<i8>>
%24 = llvm.bitcast %23 : !llvm.ptr<i8> to !llvm.ptr<i32>
%25 = llvm.ptrtoint %24 : !llvm.ptr<i32> to i64
%26 = llvm.and %25, %7 : i64
%27 = llvm.icmp "eq" %26, %20 : i64
"llvm.intr.assume"(%27) : (i1) -> ()
// Output base pointer (%32): entry 1 of the same pointer array, with the same
// 64-byte alignment assumption.
%28 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>>
%29 = llvm.extractvalue %28[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>
%30 = llvm.getelementptr %29[1] : (!llvm.ptr<ptr<i8>>) -> !llvm.ptr<ptr<i8>>
%31 = llvm.load %30 : !llvm.ptr<ptr<i8>>
%32 = llvm.bitcast %31 : !llvm.ptr<i8> to !llvm.ptr<i32>
%33 = llvm.ptrtoint %32 : !llvm.ptr<i32> to i64
%34 = llvm.and %33, %7 : i64
%35 = llvm.icmp "eq" %34, %20 : i64
"llvm.intr.assume"(%35) : (i1) -> ()
// Fields 0 and 1 of the workgroup state, zero-extended to i64 (presumably the
// workgroup id along x and y - TODO confirm). Each is scaled by the tile
// size 32: %42 = field0 * 32 (column origin), %41 = field1 * 32 (row origin).
%36 = llvm.load %arg2 : !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>>
%37 = llvm.extractvalue %36[0] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>
%38 = llvm.zext %37 : i32 to i64
%39 = llvm.extractvalue %36[1] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>
%40 = llvm.zext %39 : i32 to i64
%41 = llvm.mul %40, %19 : i64
%42 = llvm.mul %38, %19 : i64
// %46: element offset of this tile in the output = row origin * 256 + column origin.
%43 = llvm.mul %41, %9 : i64
%44 = llvm.add %43, %20 : i64
%45 = llvm.mul %42, %18 : i64
%46 = llvm.add %44, %45 : i64
// %52: element offset of this tile in the input = row origin * 32768 + column origin * 128.
%47 = llvm.mul %41, %10 : i64
%48 = llvm.add %47, %20 : i64
%49 = llvm.mul %42, %8 : i64
%50 = llvm.add %48, %49 : i64
%51 = llvm.mul %11, %18 : i64
%52 = llvm.add %50, %51 : i64
llvm.br ^bb1(%20 : i64)
// Loop 1: %53 = row within the tile, 0..31 step 1.
^bb1(%53: i64): // 2 preds: ^bb0, ^bb10
%54 = llvm.icmp "slt" %53, %19 : i64
llvm.cond_br %54, ^bb2(%20 : i64), ^bb11
// Loop 2: %55 = column within the tile, 0..31 step 4.
^bb2(%55: i64): // 2 preds: ^bb1, ^bb9
%56 = llvm.icmp "slt" %55, %19 : i64
llvm.cond_br %56, ^bb3, ^bb10
^bb3: // pred: ^bb2
// %62: input offset for (row %53, column %55); %66: matching output offset.
%57 = llvm.mul %53, %3 : i64
%58 = llvm.add %52, %57 : i64
%59 = llvm.mul %55, %4 : i64
%60 = llvm.add %58, %59 : i64
%61 = llvm.mul %11, %6 : i64
%62 = llvm.add %60, %61 : i64
%63 = llvm.mul %53, %5 : i64
%64 = llvm.add %46, %63 : i64
%65 = llvm.mul %55, %6 : i64
%66 = llvm.add %64, %65 : i64
llvm.br ^bb4(%20 : i64)
// Loop 3: %67 = column within the group of 4, 0..3 step 1.
^bb4(%67: i64): // 2 preds: ^bb3, ^bb8
%68 = llvm.icmp "slt" %67, %17 : i64
llvm.cond_br %68, ^bb5, ^bb9
^bb5: // pred: ^bb4
// %73: input offset of the first reduction element for the current output element.
%69 = llvm.mul %11, %3 : i64
%70 = llvm.add %62, %69 : i64
%71 = llvm.mul %67, %4 : i64
%72 = llvm.add %70, %71 : i64
%73 = llvm.add %72, %61 : i64
// Enter the reduction loop with a zero accumulator (%16).
llvm.br ^bb6(%20, %16 : i64, !llvm.array<1 x array<1 x vector<4xi32>>>)
// Loop 4 (reduction): %74 = 0..31 step 1; %75 carries the 4-lane partial sums.
^bb6(%74: i64, %75: !llvm.array<1 x array<1 x vector<4xi32>>>): // 2 preds: ^bb5, ^bb7
%76 = llvm.icmp "slt" %74, %19 : i64
llvm.cond_br %76, ^bb7, ^bb8
^bb7: // pred: ^bb6
// Address of the 4 input elements at %73 + %74 * 4. The vector load declares
// only 4-byte alignment.
%77 = llvm.mul %20, %10 : i64
%78 = llvm.add %73, %77 : i64
%79 = llvm.mul %20, %8 : i64
%80 = llvm.add %78, %79 : i64
%81 = llvm.mul %74, %17 : i64
%82 = llvm.add %80, %81 : i64
%83 = llvm.add %82, %20 : i64
%84 = llvm.getelementptr %24[%83] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32>
%85 = llvm.bitcast %84 : !llvm.ptr<i32> to !llvm.ptr<vector<4xi32>>
%86 = llvm.load %85 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>>
// Lane-by-lane add of the loaded vector (%86) and the accumulator (%87). The
// vector add is emitted here in scalarized form: extract, add, insert for
// each of lanes 0..3, producing %103.
%87 = llvm.extractvalue %75[0, 0] : !llvm.array<1 x array<1 x vector<4xi32>>>
%88 = llvm.extractelement %87[%11 : i64] : vector<4xi32>
%89 = llvm.extractelement %86[%11 : i64] : vector<4xi32>
%90 = llvm.add %89, %88 : i32
%91 = llvm.insertelement %90, %12[%20 : i64] : vector<4xi32>
%92 = llvm.extractelement %87[%6 : i64] : vector<4xi32>
%93 = llvm.extractelement %86[%6 : i64] : vector<4xi32>
%94 = llvm.add %93, %92 : i32
%95 = llvm.insertelement %94, %91[%18 : i64] : vector<4xi32>
%96 = llvm.extractelement %87[%2 : i64] : vector<4xi32>
%97 = llvm.extractelement %86[%2 : i64] : vector<4xi32>
%98 = llvm.add %97, %96 : i32
%99 = llvm.insertelement %98, %95[%13 : i64] : vector<4xi32>
%100 = llvm.extractelement %87[%1 : i64] : vector<4xi32>
%101 = llvm.extractelement %86[%1 : i64] : vector<4xi32>
%102 = llvm.add %101, %100 : i32
%103 = llvm.insertelement %102, %99[%14 : i64] : vector<4xi32>
// Repack %103 lane by lane into the nested array type carried by the loop;
// %116 ends up holding all four lanes of %103.
%104 = llvm.extractelement %103[%11 : i64] : vector<4xi32>
%105 = llvm.extractvalue %16[0, 0] : !llvm.array<1 x array<1 x vector<4xi32>>>
%106 = llvm.insertelement %104, %105[%11 : i64] : vector<4xi32>
%107 = llvm.insertvalue %106, %16[0, 0] : !llvm.array<1 x array<1 x vector<4xi32>>>
%108 = llvm.extractelement %103[%6 : i64] : vector<4xi32>
%109 = llvm.insertelement %108, %106[%6 : i64] : vector<4xi32>
%110 = llvm.insertvalue %109, %107[0, 0] : !llvm.array<1 x array<1 x vector<4xi32>>>
%111 = llvm.extractelement %103[%2 : i64] : vector<4xi32>
%112 = llvm.insertelement %111, %109[%2 : i64] : vector<4xi32>
%113 = llvm.insertvalue %112, %110[0, 0] : !llvm.array<1 x array<1 x vector<4xi32>>>
%114 = llvm.extractelement %103[%1 : i64] : vector<4xi32>
%115 = llvm.insertelement %114, %112[%1 : i64] : vector<4xi32>
%116 = llvm.insertvalue %115, %113[0, 0] : !llvm.array<1 x array<1 x vector<4xi32>>>
%117 = llvm.add %74, %18 : i64
llvm.br ^bb6(%117, %116 : i64, !llvm.array<1 x array<1 x vector<4xi32>>>)
^bb8: // pred: ^bb6
// Reduction finished for one output element: horizontally add the 4 lanes and
// accumulate into output[%66 + %67]. This is a read-modify-write: the current
// output value (%122) is loaded and added to the lane sum before the store.
// NOTE(review): the stored result therefore depends on the prior contents of
// the output buffer - confirm that the caller initializes it.
%118 = llvm.mul %20, %9 : i64
%119 = llvm.add %66, %118 : i64
%120 = llvm.add %119, %67 : i64
%121 = llvm.getelementptr %32[%120] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32>
%122 = llvm.load %121 : !llvm.ptr<i32>
%123 = llvm.extractvalue %75[0, 0] : !llvm.array<1 x array<1 x vector<4xi32>>>
%124 = "llvm.intr.vector.reduce.add"(%123) : (vector<4xi32>) -> i32
%125 = llvm.add %122, %124 : i32
%126 = llvm.insertelement %125, %15[%20 : i64] : vector<1xi32>
%127 = llvm.extractelement %126[%11 : i64] : vector<1xi32>
llvm.store %127, %121 : !llvm.ptr<i32>
%128 = llvm.add %67, %18 : i64
llvm.br ^bb4(%128 : i64)
^bb9: // pred: ^bb4
// Advance loop 2 by 4 columns.
%129 = llvm.add %55, %17 : i64
llvm.br ^bb2(%129 : i64)
^bb10: // pred: ^bb2
// Advance loop 1 by one row.
%130 = llvm.add %53, %18 : i64
llvm.br ^bb1(%130 : i64)
^bb11: // pred: ^bb1
// All rows of the tile processed: return 0.
llvm.return %0 : i32
}
}
}
}
// Host-side program. It (1) allocates a 67108864-byte buffer and fills it with
// the i32 pattern 1, (2) allocates a 1048576-byte buffer %buffer_0, records a
// dispatch of export ordinal 0 of the executable loaded from the global
// @_executable__split_reduction_pass2_dispatch_0 over 8x16x1 workgroups plus a
// fill of the second half of %buffer_0 with the i32 pattern 128, and
// (3) compares the first and second halves of %buffer_0 as 512x256 buffer
// views with check.expect_eq.
func.func private @_split_reduction_pass2() {
// 268435488 is passed as the buffer-view element type (presumably the HAL
// encoding of i32 - TODO confirm).
%c268435488_i32 = arith.constant 268435488 : i32
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c-1_i32 = arith.constant -1 : i32
%c-1_i64 = arith.constant -1 : i64
%c1048576 = arith.constant 1048576 : index
%c0 = arith.constant 0 : index
%c524288 = arith.constant 524288 : index
%c67108864 = arith.constant 67108864 : index
%c1_i32 = arith.constant 1 : i32
%c128_i32 = arith.constant 128 : i32
%c512 = arith.constant 512 : index
%c256 = arith.constant 256 : index
%c1 = arith.constant 1 : index
%device = hal.ex.shared_device : !hal.device
%allocator = hal.device.allocator<%device : !hal.device> : !hal.allocator
// Input buffer: filled entirely with the i32 pattern 1 by a transfer-only
// command buffer.
%buffer = hal.allocator.allocate<%allocator : !hal.allocator> type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c67108864}
%cmd = hal.command_buffer.create device(%device : !hal.device) mode("OneShot|AllowInlineExecution") categories(Transfer) : !hal.command_buffer
hal.command_buffer.fill_buffer<%cmd : !hal.command_buffer> target(%buffer : !hal.buffer)[%c0, %c67108864] pattern(%c1_i32 : i32)
hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None")
hal.command_buffer.finalize<%cmd : !hal.command_buffer>
// Submit with a null wait fence, then block on the signal fence. The -1
// timeout presumably means "wait forever" - TODO confirm.
%0 = util.null : !hal.fence
%fence = hal.fence.create device(%device : !hal.device) flags("None") : !hal.fence
hal.device.queue.execute<%device : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) commands([%cmd])
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.status.check_ok %status, "failed to wait on timepoint"
// The input buffer is routed through util.do_not_optimize and its length is
// queried at runtime instead of reusing the constant.
%1 = util.do_not_optimize(%buffer) : !hal.buffer
%len = hal.buffer.length<%1 : !hal.buffer> : index
// %buffer_0 holds two 524288-byte halves: bytes [0, 524288) receive the
// dispatch result, bytes [524288, 1048576) receive the expected value.
// NOTE(review): no fill or copy targeting bytes [0, 524288) of %buffer_0 is
// recorded in this function before the dispatch, and the dispatch binds the
// whole buffer as binding 1 - confirm whether the kernel needs that region
// initialized.
%buffer_0 = hal.allocator.allocate<%allocator : !hal.allocator> type("HostVisible|DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage|MappingScoped|MappingAccessRandom|Mapping") : !hal.buffer{%c1048576}
%cmd_1 = hal.command_buffer.create device(%device : !hal.device) mode("OneShot|AllowInlineExecution") categories("Transfer|Dispatch") : !hal.command_buffer
// The dispatch is recorded only when the device matches the
// "embedded-elf-x86_64" executable format. The pipeline layout and the
// executable are read from module-level globals. Binding 0 is the input
// buffer [0, %len); binding 1 is %buffer_0 [0, 1048576).
hal.device.switch<%device : !hal.device>
#hal.device.match.executable.format<"embedded-elf-x86_64"> {
%_pipeline_layout_0 = util.global.load @_pipeline_layout_0 : !hal.pipeline_layout
hal.command_buffer.push_descriptor_set<%cmd_1 : !hal.command_buffer> layout(%_pipeline_layout_0 : !hal.pipeline_layout)[%c0] bindings([
%c0 = (%1 : !hal.buffer)[%c0, %len],
%c1 = (%buffer_0 : !hal.buffer)[%c0, %c1048576]
])
%2 = hal.command_buffer.device<%cmd_1 : !hal.command_buffer> : !hal.device
%_executable__split_reduction_pass2_dispatch_0 = util.global.load @_executable__split_reduction_pass2_dispatch_0 : !hal.executable
hal.command_buffer.dispatch<%cmd_1 : !hal.command_buffer> target(%_executable__split_reduction_pass2_dispatch_0 : !hal.executable)[0] workgroups([%c8, %c16, %c1])
hal.return
}
// Expected value: fill the second half of %buffer_0 with the i32 pattern 128.
hal.command_buffer.fill_buffer<%cmd_1 : !hal.command_buffer> target(%buffer_0 : !hal.buffer)[%c524288, %c524288] pattern(%c128_i32 : i32)
hal.command_buffer.execution_barrier<%cmd_1 : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None")
hal.command_buffer.finalize<%cmd_1 : !hal.command_buffer>
%fence_2 = hal.fence.create device(%device : !hal.device) flags("None") : !hal.fence
hal.device.queue.execute<%device : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence_2) commands([%cmd_1])
%status_3 = hal.fence.await until([%fence_2]) timeout_millis(%c-1_i32) : i32
util.status.check_ok %status_3, "failed to wait on timepoint"
// %view covers the result half, %view_4 the expected half; both are 512x256
// views that the check op compares.
%view = hal.buffer_view.create buffer(%buffer_0 : !hal.buffer)[%c0, %c524288] shape([%c512, %c256]) type(%c268435488_i32) encoding(%c1_i32) : !hal.buffer_view
%view_4 = hal.buffer_view.create buffer(%buffer_0 : !hal.buffer)[%c524288, %c524288] shape([%c512, %c256]) type(%c268435488_i32) encoding(%c1_i32) : !hal.buffer_view
check.expect_eq(%view, %view_4) : !hal.buffer_view
return
}
}
// -----// IR Dump After mlir::iree_compiler::IREE::HAL::InlineDeviceSwitchesPass (iree-hal-inline-device-switches) ('util.initializer' operation) //----- //
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
// Module-level cache for the descriptor set layout; populated once by the initializer below.
util.global private @_descriptor_set_layout_0 : !hal.descriptor_set_layout
util.initializer {
// Create a layout with two storage-buffer bindings (0 is ReadOnly, 1 has no access restriction listed)
// on the shared device and store it into the global.
%device = hal.ex.shared_device : !hal.device
%descriptor_set_layout = hal.descriptor_set_layout.create device(%device : !hal.device) flags("None") bindings([#hal.descriptor_set.binding<0, storage_buffer, ReadOnly>, #hal.descriptor_set.binding<1, storage_buffer>]) : !hal.descriptor_set_layout
util.global.store %descriptor_set_layout, @_descriptor_set_layout_0 : !hal.descriptor_set_layout
util.initializer.return
}
// Module-level cache for the pipeline layout; populated once by the initializer below.
util.global private @_pipeline_layout_0 : !hal.pipeline_layout
util.initializer {
// Reads @_descriptor_set_layout_0 and builds a pipeline layout from it:
// zero push constants and that single descriptor set layout.
%_descriptor_set_layout_0 = util.global.load @_descriptor_set_layout_0 : !hal.descriptor_set_layout
%device = hal.ex.shared_device : !hal.device
%pipeline_layout = hal.pipeline_layout.create device(%device : !hal.device) push_constants(0) layouts([%_descriptor_set_layout_0]) : !hal.pipeline_layout
util.global.store %pipeline_layout, @_pipeline_layout_0 : !hal.pipeline_layout
util.initializer.return
}
// Module-level cache for the loaded executable; populated once by the initializer below.
util.global private @_executable__split_reduction_pass2_dispatch_0 : !hal.executable
util.initializer {
%device = hal.ex.shared_device : !hal.device
// Ask the device whether it supports the "embedded-elf-x86_64" executable format (%value defaults to false).
// %ok is produced by the query but not used in this initializer.
%ok, %value = hal.device.query<%device : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false
cf.cond_br %value, ^bb1, ^bb2
// Format supported: create the executable from the embedded_elf_x86_64 variant using the cached pipeline layout.
^bb1: // pred: ^bb0
%_pipeline_layout_0 = util.global.load @_pipeline_layout_0 : !hal.pipeline_layout
%exe = hal.executable.create device(%device : !hal.device) target(@_split_reduction_pass2_dispatch_0::@embedded_elf_x86_64) layouts([%_pipeline_layout_0]) : !hal.executable
cf.br ^bb5(%exe : !hal.executable)
// Format not supported: the branch condition is the constant true, so control always goes to ^bb3.
^bb2: // pred: ^bb0
%true = arith.constant true
cf.cond_br %true, ^bb3, ^bb4
// Fallback: store a null executable instead of failing during initialization.
^bb3: // pred: ^bb2
%0 = util.null : !hal.executable
cf.br ^bb5(%0 : !hal.executable)
// Never taken as written (the condition in ^bb2 is constant true).
^bb4: // pred: ^bb2
util.unreachable "device not supported in the compiled configuration"
// Join point: %1 is either the created executable or null.
^bb5(%1: !hal.executable): // 2 preds: ^bb1, ^bb3
util.global.store %1, @_executable__split_reduction_pass2_dispatch_0 : !hal.executable
util.initializer.return
}
// Public entry point marked as an ABI stub; it only forwards to the private implementation
// @_split_reduction_pass2 and takes/returns nothing.
func.func @split_reduction_pass2() attributes {iree.abi.stub} {
call @_split_reduction_pass2() : () -> ()
return
}
// Executable holding the single dispatch of this test, already lowered to the LLVM dialect
// for the embedded-elf-x86_64 variant.
hal.executable private @_split_reduction_pass2_dispatch_0 {
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
// Export ordinal 0. The region computes the workgroup count and returns the constants (8, 16, 1);
// the block arguments are not used.
hal.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 ordinal(0) layout(#pipeline_layout) attributes {translation_info = #translation} {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index):
%c1 = arith.constant 1 : index
%c16 = arith.constant 16 : index
%c8 = arith.constant 8 : index
hal.return %c8, %c16, %c1 : index, index, index
}
builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} {
// Kernel body. Arguments are pointers to the executable environment (%arg0, unused here),
// the dispatch state (%arg1) and the workgroup state (%arg2). Always returns i32 0.
// For each output element of a 32x32 tile it sums 128 consecutive i32 values from the first
// buffer and adds the sum to the value already present in the second buffer.
llvm.func internal @_split_reduction_pass2_dispatch_0_generic_512x256x128(%arg0: !llvm.ptr<struct<"iree_hal_executable_environment_v0_t", (ptr<i32>, ptr<func<i32 (ptr<func<i32 (ptr<i8>)>>, ptr<i8>)>>, ptr<ptr<func<i32 (ptr<i8>)>>>, struct<"iree_hal_processor_v0_t", (array<8 x i64>)>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>> {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {sym_visibility = "private"} {
// Constant pool. Several values appear twice because i64-attributed and index-attributed
// constants are kept as separate SSA values.
%0 = llvm.mlir.constant(0 : i32) : i32
%1 = llvm.mlir.constant(3 : i64) : i64
%2 = llvm.mlir.constant(2 : i64) : i64
%3 = llvm.mlir.constant(32768 : i64) : i64
%4 = llvm.mlir.constant(128 : i64) : i64
%5 = llvm.mlir.constant(256 : i64) : i64
%6 = llvm.mlir.constant(1 : i64) : i64
%7 = llvm.mlir.constant(63 : index) : i64
%8 = llvm.mlir.constant(128 : index) : i64
%9 = llvm.mlir.constant(256 : index) : i64
%10 = llvm.mlir.constant(32768 : index) : i64
%11 = llvm.mlir.constant(0 : i64) : i64
%12 = llvm.mlir.constant(dense<0> : vector<4xi32>) : vector<4xi32>
%13 = llvm.mlir.constant(2 : index) : i64
%14 = llvm.mlir.constant(3 : index) : i64
%15 = llvm.mlir.constant(dense<0> : vector<1xi32>) : vector<1xi32>
%16 = llvm.mlir.constant(dense<0> : vector<1x1x4xi32>) : !llvm.array<1 x array<1 x vector<4xi32>>>
%17 = llvm.mlir.constant(4 : index) : i64
%18 = llvm.mlir.constant(1 : index) : i64
%19 = llvm.mlir.constant(32 : index) : i64
%20 = llvm.mlir.constant(0 : index) : i64
// First buffer pointer (%24): entry 0 of the ptr<ptr<i8>> table held in field 10 of the dispatch state,
// viewed as i32*. Presumably this is binding 0, the read-only input - confirm against the HAL ABI.
%21 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>>
%22 = llvm.extractvalue %21[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>
%23 = llvm.load %22 : !llvm.ptr<ptr<i8>>
%24 = llvm.bitcast %23 : !llvm.ptr<i8> to !llvm.ptr<i32>
// Assume the pointer is 64-byte aligned: (ptr & 63) == 0.
%25 = llvm.ptrtoint %24 : !llvm.ptr<i32> to i64
%26 = llvm.and %25, %7 : i64
%27 = llvm.icmp "eq" %26, %20 : i64
"llvm.intr.assume"(%27) : (i1) -> ()
// Second buffer pointer (%32): entry 1 of the same table, with the same alignment assumption.
// Presumably binding 1, the output - confirm against the HAL ABI.
%28 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>>
%29 = llvm.extractvalue %28[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>
%30 = llvm.getelementptr %29[1] : (!llvm.ptr<ptr<i8>>) -> !llvm.ptr<ptr<i8>>
%31 = llvm.load %30 : !llvm.ptr<ptr<i8>>
%32 = llvm.bitcast %31 : !llvm.ptr<i8> to !llvm.ptr<i32>
%33 = llvm.ptrtoint %32 : !llvm.ptr<i32> to i64
%34 = llvm.and %33, %7 : i64
%35 = llvm.icmp "eq" %34, %20 : i64
"llvm.intr.assume"(%35) : (i1) -> ()
// Fields 0 and 1 of the workgroup state, zero-extended to i64.
// Presumably the workgroup id along x and y - TODO confirm against the struct definition.
%36 = llvm.load %arg2 : !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>>
%37 = llvm.extractvalue %36[0] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>
%38 = llvm.zext %37 : i32 to i64
%39 = llvm.extractvalue %36[1] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>
%40 = llvm.zext %39 : i32 to i64
// Tile origin: %41 = field1 * 32 and %42 = field0 * 32.
%41 = llvm.mul %40, %19 : i64
%42 = llvm.mul %38, %19 : i64
// %46 = %41 * 256 + %42: element offset of the tile in the second buffer.
%43 = llvm.mul %41, %9 : i64
%44 = llvm.add %43, %20 : i64
%45 = llvm.mul %42, %18 : i64
%46 = llvm.add %44, %45 : i64
// %52 = %41 * 32768 + %42 * 128 (+ 0): element offset of the tile in the first buffer.
%47 = llvm.mul %41, %10 : i64
%48 = llvm.add %47, %20 : i64
%49 = llvm.mul %42, %8 : i64
%50 = llvm.add %48, %49 : i64
%51 = llvm.mul %11, %18 : i64
%52 = llvm.add %50, %51 : i64
llvm.br ^bb1(%20 : i64)
// Loop A: %53 runs 0..32 step 1.
^bb1(%53: i64): // 2 preds: ^bb0, ^bb10
%54 = llvm.icmp "slt" %53, %19 : i64
llvm.cond_br %54, ^bb2(%20 : i64), ^bb11
// Loop B: %55 runs 0..32 step 4.
^bb2(%55: i64): // 2 preds: ^bb1, ^bb9
%56 = llvm.icmp "slt" %55, %19 : i64
llvm.cond_br %56, ^bb3, ^bb10
// Offsets for the current (%53, %55) position:
// %62 = %52 + %53 * 32768 + %55 * 128 (first buffer), %66 = %46 + %53 * 256 + %55 (second buffer).
^bb3: // pred: ^bb2
%57 = llvm.mul %53, %3 : i64
%58 = llvm.add %52, %57 : i64
%59 = llvm.mul %55, %4 : i64
%60 = llvm.add %58, %59 : i64
%61 = llvm.mul %11, %6 : i64
%62 = llvm.add %60, %61 : i64
%63 = llvm.mul %53, %5 : i64
%64 = llvm.add %46, %63 : i64
%65 = llvm.mul %55, %6 : i64
%66 = llvm.add %64, %65 : i64
llvm.br ^bb4(%20 : i64)
// Loop C: %67 runs 0..4 step 1; each iteration finishes one element of the second buffer.
^bb4(%67: i64): // 2 preds: ^bb3, ^bb8
%68 = llvm.icmp "slt" %67, %17 : i64
llvm.cond_br %68, ^bb5, ^bb9
// %73 = %62 + %67 * 128: start of the 128-element run to be reduced.
^bb5: // pred: ^bb4
%69 = llvm.mul %11, %3 : i64
%70 = llvm.add %62, %69 : i64
%71 = llvm.mul %67, %4 : i64
%72 = llvm.add %70, %71 : i64
%73 = llvm.add %72, %61 : i64
llvm.br ^bb6(%20, %16 : i64, !llvm.array<1 x array<1 x vector<4xi32>>>)
// Loop D: %74 runs 0..32 step 1 and carries the accumulator %75 (a vector<4xi32> wrapped in a
// 1x1 array), which starts as all zeros (%16).
^bb6(%74: i64, %75: !llvm.array<1 x array<1 x vector<4xi32>>>): // 2 preds: ^bb5, ^bb7
%76 = llvm.icmp "slt" %74, %19 : i64
llvm.cond_br %76, ^bb7, ^bb8
// Load 4 consecutive i32 values at element offset %73 + %74 * 4 from the first buffer.
^bb7: // pred: ^bb6
%77 = llvm.mul %20, %10 : i64
%78 = llvm.add %73, %77 : i64
%79 = llvm.mul %20, %8 : i64
%80 = llvm.add %78, %79 : i64
%81 = llvm.mul %74, %17 : i64
%82 = llvm.add %80, %81 : i64
%83 = llvm.add %82, %20 : i64
%84 = llvm.getelementptr %24[%83] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32>
%85 = llvm.bitcast %84 : !llvm.ptr<i32> to !llvm.ptr<vector<4xi32>>
%86 = llvm.load %85 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>>
// Lane-by-lane add of the loaded vector (%86) and the accumulator vector (%87); result is %103.
%87 = llvm.extractvalue %75[0, 0] : !llvm.array<1 x array<1 x vector<4xi32>>>
%88 = llvm.extractelement %87[%11 : i64] : vector<4xi32>
%89 = llvm.extractelement %86[%11 : i64] : vector<4xi32>
%90 = llvm.add %89, %88 : i32
%91 = llvm.insertelement %90, %12[%20 : i64] : vector<4xi32>
%92 = llvm.extractelement %87[%6 : i64] : vector<4xi32>
%93 = llvm.extractelement %86[%6 : i64] : vector<4xi32>
%94 = llvm.add %93, %92 : i32
%95 = llvm.insertelement %94, %91[%18 : i64] : vector<4xi32>
%96 = llvm.extractelement %87[%2 : i64] : vector<4xi32>
%97 = llvm.extractelement %86[%2 : i64] : vector<4xi32>
%98 = llvm.add %97, %96 : i32
%99 = llvm.insertelement %98, %95[%13 : i64] : vector<4xi32>
%100 = llvm.extractelement %87[%1 : i64] : vector<4xi32>
%101 = llvm.extractelement %86[%1 : i64] : vector<4xi32>
%102 = llvm.add %101, %100 : i32
%103 = llvm.insertelement %102, %99[%14 : i64] : vector<4xi32>
// Copy the four lanes of %103 back into a fresh 1x1x4 aggregate (%116) for the next iteration.
%104 = llvm.extractelement %103[%11 : i64] : vector<4xi32>
%105 = llvm.extractvalue %16[0, 0] : !llvm.array<1 x array<1 x vector<4xi32>>>
%106 = llvm.insertelement %104, %105[%11 : i64] : vector<4xi32>
%107 = llvm.insertvalue %106, %16[0, 0] : !llvm.array<1 x array<1 x vector<4xi32>>>
%108 = llvm.extractelement %103[%6 : i64] : vector<4xi32>
%109 = llvm.insertelement %108, %106[%6 : i64] : vector<4xi32>
%110 = llvm.insertvalue %109, %107[0, 0] : !llvm.array<1 x array<1 x vector<4xi32>>>
%111 = llvm.extractelement %103[%2 : i64] : vector<4xi32>
%112 = llvm.insertelement %111, %109[%2 : i64] : vector<4xi32>
%113 = llvm.insertvalue %112, %110[0, 0] : !llvm.array<1 x array<1 x vector<4xi32>>>
%114 = llvm.extractelement %103[%1 : i64] : vector<4xi32>
%115 = llvm.insertelement %114, %112[%1 : i64] : vector<4xi32>
%116 = llvm.insertvalue %115, %113[0, 0] : !llvm.array<1 x array<1 x vector<4xi32>>>
%117 = llvm.add %74, %18 : i64
llvm.br ^bb6(%117, %116 : i64, !llvm.array<1 x array<1 x vector<4xi32>>>)
// After loop D: horizontally add the accumulator and add it to the value ALREADY STORED at element
// %66 + %67 of the second buffer (read-modify-write), then store the result back.
// NOTE(review): the kernel never zeroes this element itself, so the result depends on the
// buffer's prior contents.
^bb8: // pred: ^bb6
%118 = llvm.mul %20, %9 : i64
%119 = llvm.add %66, %118 : i64
%120 = llvm.add %119, %67 : i64
%121 = llvm.getelementptr %32[%120] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32>
%122 = llvm.load %121 : !llvm.ptr<i32>
%123 = llvm.extractvalue %75[0, 0] : !llvm.array<1 x array<1 x vector<4xi32>>>
%124 = "llvm.intr.vector.reduce.add"(%123) : (vector<4xi32>) -> i32
%125 = llvm.add %122, %124 : i32
%126 = llvm.insertelement %125, %15[%20 : i64] : vector<1xi32>
%127 = llvm.extractelement %126[%11 : i64] : vector<1xi32>
llvm.store %127, %121 : !llvm.ptr<i32>
%128 = llvm.add %67, %18 : i64
llvm.br ^bb4(%128 : i64)
// Loop B latch: advance %55 by 4.
^bb9: // pred: ^bb4
%129 = llvm.add %55, %17 : i64
llvm.br ^bb2(%129 : i64)
// Loop A latch: advance %53 by 1.
^bb10: // pred: ^bb2
%130 = llvm.add %53, %18 : i64
llvm.br ^bb1(%130 : i64)
// Done: return 0.
^bb11: // pred: ^bb1
llvm.return %0 : i32
}
}
}
}
// Test body: fills an input buffer with 1s, dispatches the reduction kernel, and compares the
// first half of the result buffer with its second half, which is filled with 128.
// In this dump the device switch is still a structured hal.device.switch op.
func.func private @_split_reduction_pass2() {
// 268435488 and 1 are passed as the element type and encoding of the buffer views below;
// their symbolic meanings are not visible here.
%c268435488_i32 = arith.constant 268435488 : i32
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c-1_i32 = arith.constant -1 : i32
%c-1_i64 = arith.constant -1 : i64
%c1048576 = arith.constant 1048576 : index
%c0 = arith.constant 0 : index
%c524288 = arith.constant 524288 : index
%c67108864 = arith.constant 67108864 : index
%c1_i32 = arith.constant 1 : i32
%c128_i32 = arith.constant 128 : i32
%c512 = arith.constant 512 : index
%c256 = arith.constant 256 : index
%c1 = arith.constant 1 : index
%device = hal.ex.shared_device : !hal.device
%allocator = hal.device.allocator<%device : !hal.device> : !hal.allocator
// Input: 67108864-byte device-local buffer, filled entirely with the i32 pattern 1.
%buffer = hal.allocator.allocate<%allocator : !hal.allocator> type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c67108864}
%cmd = hal.command_buffer.create device(%device : !hal.device) mode("OneShot|AllowInlineExecution") categories(Transfer) : !hal.command_buffer
hal.command_buffer.fill_buffer<%cmd : !hal.command_buffer> target(%buffer : !hal.buffer)[%c0, %c67108864] pattern(%c1_i32 : i32)
hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None")
hal.command_buffer.finalize<%cmd : !hal.command_buffer>
// Submit with a null wait fence, then block on the signal fence (timeout -1) and check the status.
%0 = util.null : !hal.fence
%fence = hal.fence.create device(%device : !hal.device) flags("None") : !hal.fence
hal.device.queue.execute<%device : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) commands([%cmd])
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.status.check_ok %status, "failed to wait on timepoint"
// Pass the input through do_not_optimize and query its length at runtime.
%1 = util.do_not_optimize(%buffer) : !hal.buffer
%len = hal.buffer.length<%1 : !hal.buffer> : index
// Result buffer: 1048576 bytes, host-visible and mappable. [0, 524288) is later viewed as the
// computed result and [524288, 1048576) as the expected values.
%buffer_0 = hal.allocator.allocate<%allocator : !hal.allocator> type("HostVisible|DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage|MappingScoped|MappingAccessRandom|Mapping") : !hal.buffer{%c1048576}
%cmd_1 = hal.command_buffer.create device(%device : !hal.device) mode("OneShot|AllowInlineExecution") categories("Transfer|Dispatch") : !hal.command_buffer
// Record the dispatch only for devices matching the embedded-elf-x86_64 executable format:
// binding 0 = input [0, %len), binding 1 = whole result buffer; 8 x 16 x 1 workgroups.
// NOTE(review): nothing recorded in this function writes buffer_0[0, 524288) before the dispatch,
// while the kernel in @_split_reduction_pass2_dispatch_0 loads, adds to and stores back binding 1.
// The comparison below therefore appears to depend on the allocation's initial contents - confirm
// whether the original linalg.generic was meant to use the zero-filled tensor as its output operand.
hal.device.switch<%device : !hal.device>
#hal.device.match.executable.format<"embedded-elf-x86_64"> {
%_pipeline_layout_0 = util.global.load @_pipeline_layout_0 : !hal.pipeline_layout
hal.command_buffer.push_descriptor_set<%cmd_1 : !hal.command_buffer> layout(%_pipeline_layout_0 : !hal.pipeline_layout)[%c0] bindings([
%c0 = (%1 : !hal.buffer)[%c0, %len],
%c1 = (%buffer_0 : !hal.buffer)[%c0, %c1048576]
])
// %2 is not used afterwards.
%2 = hal.command_buffer.device<%cmd_1 : !hal.command_buffer> : !hal.device
%_executable__split_reduction_pass2_dispatch_0 = util.global.load @_executable__split_reduction_pass2_dispatch_0 : !hal.executable
hal.command_buffer.dispatch<%cmd_1 : !hal.command_buffer> target(%_executable__split_reduction_pass2_dispatch_0 : !hal.executable)[0] workgroups([%c8, %c16, %c1])
hal.return
}
// Fill the second half of the result buffer with the i32 pattern 128 (the expected values).
hal.command_buffer.fill_buffer<%cmd_1 : !hal.command_buffer> target(%buffer_0 : !hal.buffer)[%c524288, %c524288] pattern(%c128_i32 : i32)
hal.command_buffer.execution_barrier<%cmd_1 : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None")
hal.command_buffer.finalize<%cmd_1 : !hal.command_buffer>
// Submit and wait, as for the first command buffer.
%fence_2 = hal.fence.create device(%device : !hal.device) flags("None") : !hal.fence
hal.device.queue.execute<%device : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence_2) commands([%cmd_1])
%status_3 = hal.fence.await until([%fence_2]) timeout_millis(%c-1_i32) : i32
util.status.check_ok %status_3, "failed to wait on timepoint"
// Two 512x256 views over the same buffer: %view = computed half, %view_4 = expected half.
%view = hal.buffer_view.create buffer(%buffer_0 : !hal.buffer)[%c0, %c524288] shape([%c512, %c256]) type(%c268435488_i32) encoding(%c1_i32) : !hal.buffer_view
%view_4 = hal.buffer_view.create buffer(%buffer_0 : !hal.buffer)[%c524288, %c524288] shape([%c512, %c256]) type(%c268435488_i32) encoding(%c1_i32) : !hal.buffer_view
check.expect_eq(%view, %view_4) : !hal.buffer_view
return
}
}
// -----// IR Dump After mlir::iree_compiler::IREE::HAL::InlineDeviceSwitchesPass (iree-hal-inline-device-switches) ('func.func' operation: @_split_reduction_pass2) //----- //
// Attribute aliases used by the module that follows.
// Device target "llvm-cpu" listing one executable target (embedded-elf-x86_64) and carrying the legacy_sync flag.
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
// The same embedded-elf-x86_64 executable target as a standalone alias.
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
// Pipeline layout: no push constants; set 0 has binding 0 (storage buffer, ReadOnly) and binding 1 (storage buffer).
#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>
// Codegen translation info naming the CPUDoubleTilingExpert pipeline.
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
// Module-level cache for the descriptor set layout; populated once by the initializer below.
util.global private @_descriptor_set_layout_0 : !hal.descriptor_set_layout
util.initializer {
// Create a layout with two storage-buffer bindings (0 is ReadOnly, 1 has no access restriction listed)
// on the shared device and store it into the global.
%device = hal.ex.shared_device : !hal.device
%descriptor_set_layout = hal.descriptor_set_layout.create device(%device : !hal.device) flags("None") bindings([#hal.descriptor_set.binding<0, storage_buffer, ReadOnly>, #hal.descriptor_set.binding<1, storage_buffer>]) : !hal.descriptor_set_layout
util.global.store %descriptor_set_layout, @_descriptor_set_layout_0 : !hal.descriptor_set_layout
util.initializer.return
}
// Module-level cache for the pipeline layout; populated once by the initializer below.
util.global private @_pipeline_layout_0 : !hal.pipeline_layout
util.initializer {
// Reads @_descriptor_set_layout_0 and builds a pipeline layout from it:
// zero push constants and that single descriptor set layout.
%_descriptor_set_layout_0 = util.global.load @_descriptor_set_layout_0 : !hal.descriptor_set_layout
%device = hal.ex.shared_device : !hal.device
%pipeline_layout = hal.pipeline_layout.create device(%device : !hal.device) push_constants(0) layouts([%_descriptor_set_layout_0]) : !hal.pipeline_layout
util.global.store %pipeline_layout, @_pipeline_layout_0 : !hal.pipeline_layout
util.initializer.return
}
// Module-level cache for the loaded executable; populated once by the initializer below.
util.global private @_executable__split_reduction_pass2_dispatch_0 : !hal.executable
util.initializer {
%device = hal.ex.shared_device : !hal.device
// Ask the device whether it supports the "embedded-elf-x86_64" executable format (%value defaults to false).
// %ok is produced by the query but not used in this initializer.
%ok, %value = hal.device.query<%device : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false
cf.cond_br %value, ^bb1, ^bb2
// Format supported: create the executable from the embedded_elf_x86_64 variant using the cached pipeline layout.
^bb1: // pred: ^bb0
%_pipeline_layout_0 = util.global.load @_pipeline_layout_0 : !hal.pipeline_layout
%exe = hal.executable.create device(%device : !hal.device) target(@_split_reduction_pass2_dispatch_0::@embedded_elf_x86_64) layouts([%_pipeline_layout_0]) : !hal.executable
cf.br ^bb5(%exe : !hal.executable)
// Format not supported: the branch condition is the constant true, so control always goes to ^bb3.
^bb2: // pred: ^bb0
%true = arith.constant true
cf.cond_br %true, ^bb3, ^bb4
// Fallback: store a null executable instead of failing during initialization.
^bb3: // pred: ^bb2
%0 = util.null : !hal.executable
cf.br ^bb5(%0 : !hal.executable)
// Never taken as written (the condition in ^bb2 is constant true).
^bb4: // pred: ^bb2
util.unreachable "device not supported in the compiled configuration"
// Join point: %1 is either the created executable or null.
^bb5(%1: !hal.executable): // 2 preds: ^bb1, ^bb3
util.global.store %1, @_executable__split_reduction_pass2_dispatch_0 : !hal.executable
util.initializer.return
}
// Public entry point marked as an ABI stub; it only forwards to the private implementation
// @_split_reduction_pass2 and takes/returns nothing.
func.func @split_reduction_pass2() attributes {iree.abi.stub} {
call @_split_reduction_pass2() : () -> ()
return
}
// Executable holding the single dispatch of this test, already lowered to the LLVM dialect
// for the embedded-elf-x86_64 variant.
hal.executable private @_split_reduction_pass2_dispatch_0 {
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
// Export ordinal 0. The region computes the workgroup count and returns the constants (8, 16, 1);
// the block arguments are not used.
hal.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 ordinal(0) layout(#pipeline_layout) attributes {translation_info = #translation} {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index):
%c1 = arith.constant 1 : index
%c16 = arith.constant 16 : index
%c8 = arith.constant 8 : index
hal.return %c8, %c16, %c1 : index, index, index
}
builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} {
// Kernel body. Arguments are pointers to the executable environment (%arg0, unused here),
// the dispatch state (%arg1) and the workgroup state (%arg2). Always returns i32 0.
// For each output element of a 32x32 tile it sums 128 consecutive i32 values from the first
// buffer and adds the sum to the value already present in the second buffer.
llvm.func internal @_split_reduction_pass2_dispatch_0_generic_512x256x128(%arg0: !llvm.ptr<struct<"iree_hal_executable_environment_v0_t", (ptr<i32>, ptr<func<i32 (ptr<func<i32 (ptr<i8>)>>, ptr<i8>)>>, ptr<ptr<func<i32 (ptr<i8>)>>>, struct<"iree_hal_processor_v0_t", (array<8 x i64>)>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>> {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {sym_visibility = "private"} {
// Constant pool. Several values appear twice because i64-attributed and index-attributed
// constants are kept as separate SSA values.
%0 = llvm.mlir.constant(0 : i32) : i32
%1 = llvm.mlir.constant(3 : i64) : i64
%2 = llvm.mlir.constant(2 : i64) : i64
%3 = llvm.mlir.constant(32768 : i64) : i64
%4 = llvm.mlir.constant(128 : i64) : i64
%5 = llvm.mlir.constant(256 : i64) : i64
%6 = llvm.mlir.constant(1 : i64) : i64
%7 = llvm.mlir.constant(63 : index) : i64
%8 = llvm.mlir.constant(128 : index) : i64
%9 = llvm.mlir.constant(256 : index) : i64
%10 = llvm.mlir.constant(32768 : index) : i64
%11 = llvm.mlir.constant(0 : i64) : i64
%12 = llvm.mlir.constant(dense<0> : vector<4xi32>) : vector<4xi32>
%13 = llvm.mlir.constant(2 : index) : i64
%14 = llvm.mlir.constant(3 : index) : i64
%15 = llvm.mlir.constant(dense<0> : vector<1xi32>) : vector<1xi32>
%16 = llvm.mlir.constant(dense<0> : vector<1x1x4xi32>) : !llvm.array<1 x array<1 x vector<4xi32>>>
%17 = llvm.mlir.constant(4 : index) : i64
%18 = llvm.mlir.constant(1 : index) : i64
%19 = llvm.mlir.constant(32 : index) : i64
%20 = llvm.mlir.constant(0 : index) : i64
// First buffer pointer (%24): entry 0 of the ptr<ptr<i8>> table held in field 10 of the dispatch state,
// viewed as i32*. Presumably this is binding 0, the read-only input - confirm against the HAL ABI.
%21 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>>
%22 = llvm.extractvalue %21[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>
%23 = llvm.load %22 : !llvm.ptr<ptr<i8>>
%24 = llvm.bitcast %23 : !llvm.ptr<i8> to !llvm.ptr<i32>
// Assume the pointer is 64-byte aligned: (ptr & 63) == 0.
%25 = llvm.ptrtoint %24 : !llvm.ptr<i32> to i64
%26 = llvm.and %25, %7 : i64
%27 = llvm.icmp "eq" %26, %20 : i64
"llvm.intr.assume"(%27) : (i1) -> ()
// Second buffer pointer (%32): entry 1 of the same table, with the same alignment assumption.
// Presumably binding 1, the output - confirm against the HAL ABI.
%28 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>>
%29 = llvm.extractvalue %28[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>
%30 = llvm.getelementptr %29[1] : (!llvm.ptr<ptr<i8>>) -> !llvm.ptr<ptr<i8>>
%31 = llvm.load %30 : !llvm.ptr<ptr<i8>>
%32 = llvm.bitcast %31 : !llvm.ptr<i8> to !llvm.ptr<i32>
%33 = llvm.ptrtoint %32 : !llvm.ptr<i32> to i64
%34 = llvm.and %33, %7 : i64
%35 = llvm.icmp "eq" %34, %20 : i64
"llvm.intr.assume"(%35) : (i1) -> ()
// Fields 0 and 1 of the workgroup state, zero-extended to i64.
// Presumably the workgroup id along x and y - TODO confirm against the struct definition.
%36 = llvm.load %arg2 : !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>>
%37 = llvm.extractvalue %36[0] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>
%38 = llvm.zext %37 : i32 to i64
%39 = llvm.extractvalue %36[1] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>
%40 = llvm.zext %39 : i32 to i64
// Tile origin: %41 = field1 * 32 and %42 = field0 * 32.
%41 = llvm.mul %40, %19 : i64
%42 = llvm.mul %38, %19 : i64
// %46 = %41 * 256 + %42: element offset of the tile in the second buffer.
%43 = llvm.mul %41, %9 : i64
%44 = llvm.add %43, %20 : i64
%45 = llvm.mul %42, %18 : i64
%46 = llvm.add %44, %45 : i64
// %52 = %41 * 32768 + %42 * 128 (+ 0): element offset of the tile in the first buffer.
%47 = llvm.mul %41, %10 : i64
%48 = llvm.add %47, %20 : i64
%49 = llvm.mul %42, %8 : i64
%50 = llvm.add %48, %49 : i64
%51 = llvm.mul %11, %18 : i64
%52 = llvm.add %50, %51 : i64
llvm.br ^bb1(%20 : i64)
// Loop A: %53 runs 0..32 step 1.
^bb1(%53: i64): // 2 preds: ^bb0, ^bb10
%54 = llvm.icmp "slt" %53, %19 : i64
llvm.cond_br %54, ^bb2(%20 : i64), ^bb11
// Loop B: %55 runs 0..32 step 4.
^bb2(%55: i64): // 2 preds: ^bb1, ^bb9
%56 = llvm.icmp "slt" %55, %19 : i64
llvm.cond_br %56, ^bb3, ^bb10
// Offsets for the current (%53, %55) position:
// %62 = %52 + %53 * 32768 + %55 * 128 (first buffer), %66 = %46 + %53 * 256 + %55 (second buffer).
^bb3: // pred: ^bb2
%57 = llvm.mul %53, %3 : i64
%58 = llvm.add %52, %57 : i64
%59 = llvm.mul %55, %4 : i64
%60 = llvm.add %58, %59 : i64
%61 = llvm.mul %11, %6 : i64
%62 = llvm.add %60, %61 : i64
%63 = llvm.mul %53, %5 : i64
%64 = llvm.add %46, %63 : i64
%65 = llvm.mul %55, %6 : i64
%66 = llvm.add %64, %65 : i64
llvm.br ^bb4(%20 : i64)
// Loop C: %67 runs 0..4 step 1; each iteration finishes one element of the second buffer.
^bb4(%67: i64): // 2 preds: ^bb3, ^bb8
%68 = llvm.icmp "slt" %67, %17 : i64
llvm.cond_br %68, ^bb5, ^bb9
// %73 = %62 + %67 * 128: start of the 128-element run to be reduced.
^bb5: // pred: ^bb4
%69 = llvm.mul %11, %3 : i64
%70 = llvm.add %62, %69 : i64
%71 = llvm.mul %67, %4 : i64
%72 = llvm.add %70, %71 : i64
%73 = llvm.add %72, %61 : i64
llvm.br ^bb6(%20, %16 : i64, !llvm.array<1 x array<1 x vector<4xi32>>>)
// Loop D: %74 runs 0..32 step 1 and carries the accumulator %75 (a vector<4xi32> wrapped in a
// 1x1 array), which starts as all zeros (%16).
^bb6(%74: i64, %75: !llvm.array<1 x array<1 x vector<4xi32>>>): // 2 preds: ^bb5, ^bb7
%76 = llvm.icmp "slt" %74, %19 : i64
llvm.cond_br %76, ^bb7, ^bb8
// Load 4 consecutive i32 values at element offset %73 + %74 * 4 from the first buffer.
^bb7: // pred: ^bb6
%77 = llvm.mul %20, %10 : i64
%78 = llvm.add %73, %77 : i64
%79 = llvm.mul %20, %8 : i64
%80 = llvm.add %78, %79 : i64
%81 = llvm.mul %74, %17 : i64
%82 = llvm.add %80, %81 : i64
%83 = llvm.add %82, %20 : i64
%84 = llvm.getelementptr %24[%83] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32>
%85 = llvm.bitcast %84 : !llvm.ptr<i32> to !llvm.ptr<vector<4xi32>>
%86 = llvm.load %85 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>>
// Lane-by-lane add of the loaded vector (%86) and the accumulator vector (%87); result is %103.
%87 = llvm.extractvalue %75[0, 0] : !llvm.array<1 x array<1 x vector<4xi32>>>
%88 = llvm.extractelement %87[%11 : i64] : vector<4xi32>
%89 = llvm.extractelement %86[%11 : i64] : vector<4xi32>
%90 = llvm.add %89, %88 : i32
%91 = llvm.insertelement %90, %12[%20 : i64] : vector<4xi32>
%92 = llvm.extractelement %87[%6 : i64] : vector<4xi32>
%93 = llvm.extractelement %86[%6 : i64] : vector<4xi32>
%94 = llvm.add %93, %92 : i32
%95 = llvm.insertelement %94, %91[%18 : i64] : vector<4xi32>
%96 = llvm.extractelement %87[%2 : i64] : vector<4xi32>
%97 = llvm.extractelement %86[%2 : i64] : vector<4xi32>
%98 = llvm.add %97, %96 : i32
%99 = llvm.insertelement %98, %95[%13 : i64] : vector<4xi32>
%100 = llvm.extractelement %87[%1 : i64] : vector<4xi32>
%101 = llvm.extractelement %86[%1 : i64] : vector<4xi32>
%102 = llvm.add %101, %100 : i32
%103 = llvm.insertelement %102, %99[%14 : i64] : vector<4xi32>
// Copy the four lanes of %103 back into a fresh 1x1x4 aggregate (%116) for the next iteration.
%104 = llvm.extractelement %103[%11 : i64] : vector<4xi32>
%105 = llvm.extractvalue %16[0, 0] : !llvm.array<1 x array<1 x vector<4xi32>>>
%106 = llvm.insertelement %104, %105[%11 : i64] : vector<4xi32>
%107 = llvm.insertvalue %106, %16[0, 0] : !llvm.array<1 x array<1 x vector<4xi32>>>
%108 = llvm.extractelement %103[%6 : i64] : vector<4xi32>
%109 = llvm.insertelement %108, %106[%6 : i64] : vector<4xi32>
%110 = llvm.insertvalue %109, %107[0, 0] : !llvm.array<1 x array<1 x vector<4xi32>>>
%111 = llvm.extractelement %103[%2 : i64] : vector<4xi32>
%112 = llvm.insertelement %111, %109[%2 : i64] : vector<4xi32>
%113 = llvm.insertvalue %112, %110[0, 0] : !llvm.array<1 x array<1 x vector<4xi32>>>
%114 = llvm.extractelement %103[%1 : i64] : vector<4xi32>
%115 = llvm.insertelement %114, %112[%1 : i64] : vector<4xi32>
%116 = llvm.insertvalue %115, %113[0, 0] : !llvm.array<1 x array<1 x vector<4xi32>>>
%117 = llvm.add %74, %18 : i64
llvm.br ^bb6(%117, %116 : i64, !llvm.array<1 x array<1 x vector<4xi32>>>)
// After loop D: horizontally add the accumulator and add it to the value ALREADY STORED at element
// %66 + %67 of the second buffer (read-modify-write), then store the result back.
// NOTE(review): the kernel never zeroes this element itself, so the result depends on the
// buffer's prior contents.
^bb8: // pred: ^bb6
%118 = llvm.mul %20, %9 : i64
%119 = llvm.add %66, %118 : i64
%120 = llvm.add %119, %67 : i64
%121 = llvm.getelementptr %32[%120] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32>
%122 = llvm.load %121 : !llvm.ptr<i32>
%123 = llvm.extractvalue %75[0, 0] : !llvm.array<1 x array<1 x vector<4xi32>>>
%124 = "llvm.intr.vector.reduce.add"(%123) : (vector<4xi32>) -> i32
%125 = llvm.add %122, %124 : i32
%126 = llvm.insertelement %125, %15[%20 : i64] : vector<1xi32>
%127 = llvm.extractelement %126[%11 : i64] : vector<1xi32>
llvm.store %127, %121 : !llvm.ptr<i32>
%128 = llvm.add %67, %18 : i64
llvm.br ^bb4(%128 : i64)
// Loop B latch: advance %55 by 4.
^bb9: // pred: ^bb4
%129 = llvm.add %55, %17 : i64
llvm.br ^bb2(%129 : i64)
// Loop A latch: advance %53 by 1.
^bb10: // pred: ^bb2
%130 = llvm.add %53, %18 : i64
llvm.br ^bb1(%130 : i64)
// Done: return 0.
^bb11: // pred: ^bb1
llvm.return %0 : i32
}
}
}
}
// Test body: fills an input buffer with 1s, dispatches the reduction kernel, and compares the
// first half of the result buffer with its second half, which is filled with 128.
// In this dump the device switch has been inlined into an explicit query plus conditional branch.
func.func private @_split_reduction_pass2() {
// 268435488 and 1 are passed as the element type and encoding of the buffer views below;
// their symbolic meanings are not visible here.
%c268435488_i32 = arith.constant 268435488 : i32
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c-1_i32 = arith.constant -1 : i32
%c-1_i64 = arith.constant -1 : i64
%c1048576 = arith.constant 1048576 : index
%c0 = arith.constant 0 : index
%c524288 = arith.constant 524288 : index
%c67108864 = arith.constant 67108864 : index
%c1_i32 = arith.constant 1 : i32
%c128_i32 = arith.constant 128 : i32
%c512 = arith.constant 512 : index
%c256 = arith.constant 256 : index
%c1 = arith.constant 1 : index
%device = hal.ex.shared_device : !hal.device
%allocator = hal.device.allocator<%device : !hal.device> : !hal.allocator
// Input: 67108864-byte device-local buffer, filled entirely with the i32 pattern 1.
%buffer = hal.allocator.allocate<%allocator : !hal.allocator> type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c67108864}
%cmd = hal.command_buffer.create device(%device : !hal.device) mode("OneShot|AllowInlineExecution") categories(Transfer) : !hal.command_buffer
hal.command_buffer.fill_buffer<%cmd : !hal.command_buffer> target(%buffer : !hal.buffer)[%c0, %c67108864] pattern(%c1_i32 : i32)
hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None")
hal.command_buffer.finalize<%cmd : !hal.command_buffer>
// Submit with a null wait fence, then block on the signal fence (timeout -1) and check the status.
%0 = util.null : !hal.fence
%fence = hal.fence.create device(%device : !hal.device) flags("None") : !hal.fence
hal.device.queue.execute<%device : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) commands([%cmd])
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.status.check_ok %status, "failed to wait on timepoint"
// Pass the input through do_not_optimize and query its length at runtime.
%1 = util.do_not_optimize(%buffer) : !hal.buffer
%len = hal.buffer.length<%1 : !hal.buffer> : index
// Result buffer: 1048576 bytes, host-visible and mappable. [0, 524288) is later viewed as the
// computed result and [524288, 1048576) as the expected values.
%buffer_0 = hal.allocator.allocate<%allocator : !hal.allocator> type("HostVisible|DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage|MappingScoped|MappingAccessRandom|Mapping") : !hal.buffer{%c1048576}
%cmd_1 = hal.command_buffer.create device(%device : !hal.device) mode("OneShot|AllowInlineExecution") categories("Transfer|Dispatch") : !hal.command_buffer
// Inlined device switch: branch on whether the device reports the embedded-elf-x86_64 format
// (%value defaults to false; %ok is unused).
%ok, %value = hal.device.query<%device : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false
cf.cond_br %value, ^bb1, ^bb2
// Supported: bind input [0, %len) as binding 0 and the whole result buffer as binding 1, then
// dispatch export 0 with 8 x 16 x 1 workgroups.
// NOTE(review): nothing recorded in this function writes buffer_0[0, 524288) before the dispatch,
// while the kernel in @_split_reduction_pass2_dispatch_0 loads, adds to and stores back binding 1.
// The comparison below therefore appears to depend on the allocation's initial contents - confirm
// whether the original linalg.generic was meant to use the zero-filled tensor as its output operand.
^bb1: // pred: ^bb0
%_pipeline_layout_0 = util.global.load @_pipeline_layout_0 : !hal.pipeline_layout
hal.command_buffer.push_descriptor_set<%cmd_1 : !hal.command_buffer> layout(%_pipeline_layout_0 : !hal.pipeline_layout)[%c0] bindings([
%c0 = (%1 : !hal.buffer)[%c0, %len],
%c1 = (%buffer_0 : !hal.buffer)[%c0, %c1048576]
])
// %2 is not used afterwards.
%2 = hal.command_buffer.device<%cmd_1 : !hal.command_buffer> : !hal.device
%_executable__split_reduction_pass2_dispatch_0 = util.global.load @_executable__split_reduction_pass2_dispatch_0 : !hal.executable
hal.command_buffer.dispatch<%cmd_1 : !hal.command_buffer> target(%_executable__split_reduction_pass2_dispatch_0 : !hal.executable)[0] workgroups([%c8, %c16, %c1])
cf.br ^bb3
// Unsupported device: no fallback path exists in this function.
^bb2: // pred: ^bb0
util.unreachable "device not supported in the compiled configuration"
// Continue recording: fill the second half of the result buffer with the i32 pattern 128
// (the expected values).
^bb3: // pred: ^bb1
hal.command_buffer.fill_buffer<%cmd_1 : !hal.command_buffer> target(%buffer_0 : !hal.buffer)[%c524288, %c524288] pattern(%c128_i32 : i32)
hal.command_buffer.execution_barrier<%cmd_1 : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None")
hal.command_buffer.finalize<%cmd_1 : !hal.command_buffer>
// Submit and wait, as for the first command buffer.
%fence_2 = hal.fence.create device(%device : !hal.device) flags("None") : !hal.fence
hal.device.queue.execute<%device : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence_2) commands([%cmd_1])
%status_3 = hal.fence.await until([%fence_2]) timeout_millis(%c-1_i32) : i32
util.status.check_ok %status_3, "failed to wait on timepoint"
// Two 512x256 views over the same buffer: %view = computed half, %view_4 = expected half.
%view = hal.buffer_view.create buffer(%buffer_0 : !hal.buffer)[%c0, %c524288] shape([%c512, %c256]) type(%c268435488_i32) encoding(%c1_i32) : !hal.buffer_view
%view_4 = hal.buffer_view.create buffer(%buffer_0 : !hal.buffer)[%c524288, %c524288] shape([%c512, %c256]) type(%c268435488_i32) encoding(%c1_i32) : !hal.buffer_view
check.expect_eq(%view, %view_4) : !hal.buffer_view
return
}
}
// -----// IR Dump After mlir::iree_compiler::IREE::HAL::MemoizeDeviceQueriesPass (iree-hal-memoize-device-queries) ('builtin.module' operation) //----- //
// Attribute aliases used by the module that follows.
// Device target "llvm-cpu" listing one executable target (embedded-elf-x86_64) and carrying the legacy_sync flag.
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
// The same embedded-elf-x86_64 executable target as a standalone alias.
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
// Pipeline layout: no push constants; set 0 has binding 0 (storage buffer, ReadOnly) and binding 1 (storage buffer).
#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>
// Codegen translation info naming the CPUDoubleTilingExpert pipeline.
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
// Memoized device query results: @_device_query_0 receives the queried value and
// @_device_query_0_ok receives the query's `ok` result (see the two stores below).
util.global private @_device_query_0 : i1
util.global private @_device_query_0_ok : i1
// Queries the shared device once for "hal.executable.format" :: "embedded-elf-x86_64"
// and stores both results into the globals declared above.
util.initializer {
%device = hal.ex.shared_device : !hal.device
// `= false` is the default attached to %value; presumably this is what %value holds when the query does not succeed — confirm.
%ok, %value = hal.device.query<%device : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false
util.global.store %ok, @_device_query_0_ok : i1
util.global.store %value, @_device_query_0 : i1
util.initializer.return
}
// Cached descriptor set layout, created once by the initializer below.
util.global private @_descriptor_set_layout_0 : !hal.descriptor_set_layout
// Creates a descriptor set layout on the shared device with two bindings
// (0: storage_buffer, ReadOnly; 1: storage_buffer) and stores it in the global.
util.initializer {
%device = hal.ex.shared_device : !hal.device
%descriptor_set_layout = hal.descriptor_set_layout.create device(%device : !hal.device) flags("None") bindings([#hal.descriptor_set.binding<0, storage_buffer, ReadOnly>, #hal.descriptor_set.binding<1, storage_buffer>]) : !hal.descriptor_set_layout
util.global.store %descriptor_set_layout, @_descriptor_set_layout_0 : !hal.descriptor_set_layout
util.initializer.return
}
// Cached pipeline layout, created once by the initializer below.
util.global private @_pipeline_layout_0 : !hal.pipeline_layout
// Builds the pipeline layout from the value stored in @_descriptor_set_layout_0
// with push_constants(0), then stores it in the global.
util.initializer {
%_descriptor_set_layout_0 = util.global.load @_descriptor_set_layout_0 : !hal.descriptor_set_layout
%device = hal.ex.shared_device : !hal.device
%pipeline_layout = hal.pipeline_layout.create device(%device : !hal.device) push_constants(0) layouts([%_descriptor_set_layout_0]) : !hal.pipeline_layout
util.global.store %pipeline_layout, @_pipeline_layout_0 : !hal.pipeline_layout
util.initializer.return
}
// Cached executable handle; the initializer below stores either a created executable or null.
util.global private @_executable__split_reduction_pass2_dispatch_0 : !hal.executable
// Creates the executable when the memoized format query (@_device_query_0) is true,
// otherwise stores a null executable.
util.initializer {
%device = hal.ex.shared_device : !hal.device
// NOTE(review): %_device_query_0_ok is loaded here but has no use in this region.
%_device_query_0_ok = util.global.load @_device_query_0_ok : i1
%_device_query_0 = util.global.load @_device_query_0 : i1
cf.cond_br %_device_query_0, ^bb1, ^bb2
// Query true: create the executable from the @embedded_elf_x86_64 variant using the cached pipeline layout.
^bb1: // pred: ^bb0
%_pipeline_layout_0 = util.global.load @_pipeline_layout_0 : !hal.pipeline_layout
%exe = hal.executable.create device(%device : !hal.device) target(@_split_reduction_pass2_dispatch_0::@embedded_elf_x86_64) layouts([%_pipeline_layout_0]) : !hal.executable
cf.br ^bb5(%exe : !hal.executable)
// Query false: the condition below is the constant `true`, so control always continues to ^bb3.
^bb2: // pred: ^bb0
%true = arith.constant true
cf.cond_br %true, ^bb3, ^bb4
// Fallback: forward a null executable to the join block.
^bb3: // pred: ^bb2
%0 = util.null : !hal.executable
cf.br ^bb5(%0 : !hal.executable)
// Only reachable through the false edge of `cf.cond_br %true`, i.e. never taken.
^bb4: // pred: ^bb2
util.unreachable "device not supported in the compiled configuration"
// Join: store whichever executable (created or null) arrived as %1.
^bb5(%1: !hal.executable): // 2 preds: ^bb1, ^bb3
util.global.store %1, @_executable__split_reduction_pass2_dispatch_0 : !hal.executable
util.initializer.return
}
// Public entry point carrying the `iree.abi.stub` attribute; it only forwards to the
// private @_split_reduction_pass2 function and returns.
func.func @split_reduction_pass2() attributes {iree.abi.stub} {
call @_split_reduction_pass2() : () -> ()
return
}
// Dispatch executable with a single variant (@embedded_elf_x86_64) exporting one entry point (ordinal 0).
hal.executable private @_split_reduction_pass2_dispatch_0 {
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
// Export region for ordinal 0: its block arguments are not used and it returns the constant index triple (8, 16, 1).
// The host function in this module passes the same triple as workgroups([%c8, %c16, %c1]);
// presumably it is the workgroup count in x, y, z — confirm.
hal.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 ordinal(0) layout(#pipeline_layout) attributes {translation_info = #translation} {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index):
%c1 = arith.constant 1 : index
%c16 = arith.constant 16 : index
%c8 = arith.constant 8 : index
hal.return %c8, %c16, %c1 : index, index, index
}
builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} {
// Kernel body: reads i32 data through entry 0 of the pointer table in %arg1 (field 10) and
// accumulates sums of 128 consecutive values into the i32 buffer behind entry 1.
//   %arg0: environment struct pointer; not referenced in the body.
//   %arg1: dispatch state pointer; only field 10 (ptr<ptr<i8>>) is read.
//   %arg2: workgroup state pointer; fields 0 and 1 are read as tile coordinates
//          (presumably workgroup id x / y — confirm).
// Always returns the i32 constant 0 (%0).
llvm.func internal @_split_reduction_pass2_dispatch_0_generic_512x256x128(%arg0: !llvm.ptr<struct<"iree_hal_executable_environment_v0_t", (ptr<i32>, ptr<func<i32 (ptr<func<i32 (ptr<i8>)>>, ptr<i8>)>>, ptr<ptr<func<i32 (ptr<i8>)>>>, struct<"iree_hal_processor_v0_t", (array<8 x i64>)>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>> {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {sym_visibility = "private"} {
// Constants. Some values exist twice (e.g. 128 as %4 and %8) because one copy carries an
// i64 attribute and the other an index attribute; both produce i64 results.
%0 = llvm.mlir.constant(0 : i32) : i32
%1 = llvm.mlir.constant(3 : i64) : i64
%2 = llvm.mlir.constant(2 : i64) : i64
%3 = llvm.mlir.constant(32768 : i64) : i64
%4 = llvm.mlir.constant(128 : i64) : i64
%5 = llvm.mlir.constant(256 : i64) : i64
%6 = llvm.mlir.constant(1 : i64) : i64
%7 = llvm.mlir.constant(63 : index) : i64
%8 = llvm.mlir.constant(128 : index) : i64
%9 = llvm.mlir.constant(256 : index) : i64
%10 = llvm.mlir.constant(32768 : index) : i64
%11 = llvm.mlir.constant(0 : i64) : i64
%12 = llvm.mlir.constant(dense<0> : vector<4xi32>) : vector<4xi32>
%13 = llvm.mlir.constant(2 : index) : i64
%14 = llvm.mlir.constant(3 : index) : i64
%15 = llvm.mlir.constant(dense<0> : vector<1xi32>) : vector<1xi32>
%16 = llvm.mlir.constant(dense<0> : vector<1x1x4xi32>) : !llvm.array<1 x array<1 x vector<4xi32>>>
%17 = llvm.mlir.constant(4 : index) : i64
%18 = llvm.mlir.constant(1 : index) : i64
%19 = llvm.mlir.constant(32 : index) : i64
%20 = llvm.mlir.constant(0 : index) : i64
// Input pointer: entry 0 of the pointer table, viewed as i32* (%24); assume (ptr & 63) == 0.
%21 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>>
%22 = llvm.extractvalue %21[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>
%23 = llvm.load %22 : !llvm.ptr<ptr<i8>>
%24 = llvm.bitcast %23 : !llvm.ptr<i8> to !llvm.ptr<i32>
%25 = llvm.ptrtoint %24 : !llvm.ptr<i32> to i64
%26 = llvm.and %25, %7 : i64
%27 = llvm.icmp "eq" %26, %20 : i64
"llvm.intr.assume"(%27) : (i1) -> ()
// Output pointer: entry 1 of the same table, viewed as i32* (%32); same alignment assumption.
%28 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>>
%29 = llvm.extractvalue %28[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>
%30 = llvm.getelementptr %29[1] : (!llvm.ptr<ptr<i8>>) -> !llvm.ptr<ptr<i8>>
%31 = llvm.load %30 : !llvm.ptr<ptr<i8>>
%32 = llvm.bitcast %31 : !llvm.ptr<i8> to !llvm.ptr<i32>
%33 = llvm.ptrtoint %32 : !llvm.ptr<i32> to i64
%34 = llvm.and %33, %7 : i64
%35 = llvm.icmp "eq" %34, %20 : i64
"llvm.intr.assume"(%35) : (i1) -> ()
// Tile origin: %38 / %40 are workgroup-state fields 0 / 1 zero-extended to i64;
// %42 = %38 * 32 is the starting column and %41 = %40 * 32 the starting row of this tile.
%36 = llvm.load %arg2 : !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>>
%37 = llvm.extractvalue %36[0] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>
%38 = llvm.zext %37 : i32 to i64
%39 = llvm.extractvalue %36[1] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>
%40 = llvm.zext %39 : i32 to i64
%41 = llvm.mul %40, %19 : i64
%42 = llvm.mul %38, %19 : i64
// %46 = output base element offset = %41 * 256 + %42.
%43 = llvm.mul %41, %9 : i64
%44 = llvm.add %43, %20 : i64
%45 = llvm.mul %42, %18 : i64
%46 = llvm.add %44, %45 : i64
// %52 = input base element offset = %41 * 32768 + %42 * 128 (the %51 term is 0 * 1).
%47 = llvm.mul %41, %10 : i64
%48 = llvm.add %47, %20 : i64
%49 = llvm.mul %42, %8 : i64
%50 = llvm.add %48, %49 : i64
%51 = llvm.mul %11, %18 : i64
%52 = llvm.add %50, %51 : i64
llvm.br ^bb1(%20 : i64)
// Row loop: %53 runs over [0, 32) in steps of 1 (incremented in ^bb10).
^bb1(%53: i64): // 2 preds: ^bb0, ^bb10
%54 = llvm.icmp "slt" %53, %19 : i64
llvm.cond_br %54, ^bb2(%20 : i64), ^bb11
// Column-group loop: %55 runs over [0, 32) in steps of 4 (incremented by %17 in ^bb9).
^bb2(%55: i64): // 2 preds: ^bb1, ^bb9
%56 = llvm.icmp "slt" %55, %19 : i64
llvm.cond_br %56, ^bb3, ^bb10
// %62 = input offset for (row %53, column %55); %66 = matching output offset.
^bb3: // pred: ^bb2
%57 = llvm.mul %53, %3 : i64
%58 = llvm.add %52, %57 : i64
%59 = llvm.mul %55, %4 : i64
%60 = llvm.add %58, %59 : i64
%61 = llvm.mul %11, %6 : i64
%62 = llvm.add %60, %61 : i64
%63 = llvm.mul %53, %5 : i64
%64 = llvm.add %46, %63 : i64
%65 = llvm.mul %55, %6 : i64
%66 = llvm.add %64, %65 : i64
llvm.br ^bb4(%20 : i64)
// Column loop inside the group: %67 runs over [0, 4) in steps of 1 (incremented in ^bb8).
^bb4(%67: i64): // 2 preds: ^bb3, ^bb8
%68 = llvm.icmp "slt" %67, %17 : i64
llvm.cond_br %68, ^bb5, ^bb9
// %73 = %62 + %67 * 128 (the %69 and %61 terms are zero); the accumulator starts as the all-zero constant %16.
^bb5: // pred: ^bb4
%69 = llvm.mul %11, %3 : i64
%70 = llvm.add %62, %69 : i64
%71 = llvm.mul %67, %4 : i64
%72 = llvm.add %70, %71 : i64
%73 = llvm.add %72, %61 : i64
llvm.br ^bb6(%20, %16 : i64, !llvm.array<1 x array<1 x vector<4xi32>>>)
// Reduction loop: %74 runs over [0, 32) in steps of 1 and each iteration consumes 4 consecutive
// i32 values at offset %74 * 4 (128 values in total); %75 carries the 4-lane accumulator.
^bb6(%74: i64, %75: !llvm.array<1 x array<1 x vector<4xi32>>>): // 2 preds: ^bb5, ^bb7
%76 = llvm.icmp "slt" %74, %19 : i64
llvm.cond_br %76, ^bb7, ^bb8
^bb7: // pred: ^bb6
// %77 and %79 multiply by the zero constant %20, so %83 = %73 + %74 * 4.
%77 = llvm.mul %20, %10 : i64
%78 = llvm.add %73, %77 : i64
%79 = llvm.mul %20, %8 : i64
%80 = llvm.add %78, %79 : i64
%81 = llvm.mul %74, %17 : i64
%82 = llvm.add %80, %81 : i64
%83 = llvm.add %82, %20 : i64
// Load 4 x i32 from the input at element offset %83 (declared alignment: 4 bytes).
%84 = llvm.getelementptr %24[%83] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32>
%85 = llvm.bitcast %84 : !llvm.ptr<i32> to !llvm.ptr<vector<4xi32>>
%86 = llvm.load %85 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>>
// Lane-by-lane add of the loaded vector (%86) and the current accumulator (%87); %103 holds all four sums.
%87 = llvm.extractvalue %75[0, 0] : !llvm.array<1 x array<1 x vector<4xi32>>>
%88 = llvm.extractelement %87[%11 : i64] : vector<4xi32>
%89 = llvm.extractelement %86[%11 : i64] : vector<4xi32>
%90 = llvm.add %89, %88 : i32
%91 = llvm.insertelement %90, %12[%20 : i64] : vector<4xi32>
%92 = llvm.extractelement %87[%6 : i64] : vector<4xi32>
%93 = llvm.extractelement %86[%6 : i64] : vector<4xi32>
%94 = llvm.add %93, %92 : i32
%95 = llvm.insertelement %94, %91[%18 : i64] : vector<4xi32>
%96 = llvm.extractelement %87[%2 : i64] : vector<4xi32>
%97 = llvm.extractelement %86[%2 : i64] : vector<4xi32>
%98 = llvm.add %97, %96 : i32
%99 = llvm.insertelement %98, %95[%13 : i64] : vector<4xi32>
%100 = llvm.extractelement %87[%1 : i64] : vector<4xi32>
%101 = llvm.extractelement %86[%1 : i64] : vector<4xi32>
%102 = llvm.add %101, %100 : i32
%103 = llvm.insertelement %102, %99[%14 : i64] : vector<4xi32>
// Repack %103 lane by lane into the array-of-vector shape carried by the loop; %116 ends up holding all four lanes of %103.
%104 = llvm.extractelement %103[%11 : i64] : vector<4xi32>
%105 = llvm.extractvalue %16[0, 0] : !llvm.array<1 x array<1 x vector<4xi32>>>
%106 = llvm.insertelement %104, %105[%11 : i64] : vector<4xi32>
%107 = llvm.insertvalue %106, %16[0, 0] : !llvm.array<1 x array<1 x vector<4xi32>>>
%108 = llvm.extractelement %103[%6 : i64] : vector<4xi32>
%109 = llvm.insertelement %108, %106[%6 : i64] : vector<4xi32>
%110 = llvm.insertvalue %109, %107[0, 0] : !llvm.array<1 x array<1 x vector<4xi32>>>
%111 = llvm.extractelement %103[%2 : i64] : vector<4xi32>
%112 = llvm.insertelement %111, %109[%2 : i64] : vector<4xi32>
%113 = llvm.insertvalue %112, %110[0, 0] : !llvm.array<1 x array<1 x vector<4xi32>>>
%114 = llvm.extractelement %103[%1 : i64] : vector<4xi32>
%115 = llvm.insertelement %114, %112[%1 : i64] : vector<4xi32>
%116 = llvm.insertvalue %115, %113[0, 0] : !llvm.array<1 x array<1 x vector<4xi32>>>
%117 = llvm.add %74, %18 : i64
llvm.br ^bb6(%117, %116 : i64, !llvm.array<1 x array<1 x vector<4xi32>>>)
^bb8: // pred: ^bb6
// %120 = %66 + %67 (the %118 term is 0 * 256): output element for this column.
%118 = llvm.mul %20, %9 : i64
%119 = llvm.add %66, %118 : i64
%120 = llvm.add %119, %67 : i64
%121 = llvm.getelementptr %32[%120] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32>
// Read-modify-write: the horizontal sum of the accumulator (%124) is added to the value
// already stored at the output element, so the result depends on the output's prior contents.
%122 = llvm.load %121 : !llvm.ptr<i32>
%123 = llvm.extractvalue %75[0, 0] : !llvm.array<1 x array<1 x vector<4xi32>>>
%124 = "llvm.intr.vector.reduce.add"(%123) : (vector<4xi32>) -> i32
%125 = llvm.add %122, %124 : i32
// The scalar is round-tripped through a vector<1xi32> before being stored.
%126 = llvm.insertelement %125, %15[%20 : i64] : vector<1xi32>
%127 = llvm.extractelement %126[%11 : i64] : vector<1xi32>
llvm.store %127, %121 : !llvm.ptr<i32>
%128 = llvm.add %67, %18 : i64
llvm.br ^bb4(%128 : i64)
// Advance the column group by 4.
^bb9: // pred: ^bb4
%129 = llvm.add %55, %17 : i64
llvm.br ^bb2(%129 : i64)
// Advance the row by 1.
^bb10: // pred: ^bb2
%130 = llvm.add %53, %18 : i64
llvm.br ^bb1(%130 : i64)
// All rows done: return 0.
^bb11: // pred: ^bb1
llvm.return %0 : i32
}
}
}
}
// Host-side test body: fills an input buffer with the i32 pattern 1, records a dispatch of the
// reduction kernel plus a fill of the expected value 128, waits for completion, and compares
// two views of the result buffer with check.expect_eq.
func.func private @_split_reduction_pass2() {
// Element-type code passed to hal.buffer_view.create; presumably encodes a 32-bit integer type — confirm.
%c268435488_i32 = arith.constant 268435488 : i32
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c-1_i32 = arith.constant -1 : i32
%c-1_i64 = arith.constant -1 : i64
// Byte sizes: 1048576 = 2 * 524288, 524288 = 512 * 256 * 4, 67108864 = 512 * 256 * 128 * 4.
%c1048576 = arith.constant 1048576 : index
%c0 = arith.constant 0 : index
%c524288 = arith.constant 524288 : index
%c67108864 = arith.constant 67108864 : index
%c1_i32 = arith.constant 1 : i32
%c128_i32 = arith.constant 128 : i32
%c512 = arith.constant 512 : index
%c256 = arith.constant 256 : index
%c1 = arith.constant 1 : index
%device = hal.ex.shared_device : !hal.device
%allocator = hal.device.allocator<%device : !hal.device> : !hal.allocator
// Input buffer (67108864 bytes), filled entirely with the i32 pattern 1 by the first command buffer.
%buffer = hal.allocator.allocate<%allocator : !hal.allocator> type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c67108864}
%cmd = hal.command_buffer.create device(%device : !hal.device) mode("OneShot|AllowInlineExecution") categories(Transfer) : !hal.command_buffer
hal.command_buffer.fill_buffer<%cmd : !hal.command_buffer> target(%buffer : !hal.buffer)[%c0, %c67108864] pattern(%c1_i32 : i32)
hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None")
hal.command_buffer.finalize<%cmd : !hal.command_buffer>
// Submit with a null wait fence and block (timeout -1) until %fence is signaled.
%0 = util.null : !hal.fence
%fence = hal.fence.create device(%device : !hal.device) flags("None") : !hal.fence
hal.device.queue.execute<%device : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) commands([%cmd])
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.status.check_ok %status, "failed to wait on timepoint"
// The input is passed through util.do_not_optimize and its length is re-queried rather than reusing %c67108864.
%1 = util.do_not_optimize(%buffer) : !hal.buffer
%len = hal.buffer.length<%1 : !hal.buffer> : index
// Result buffer (1048576 bytes): bytes [0, 524288) are viewed as the kernel output and
// bytes [524288, 1048576) as the expected values (see the views at the end).
// NOTE(review): no command visible in this function writes bytes [0, 524288) before the dispatch,
// while the kernel's ^bb8 block adds into the existing output value; the comparison below
// presumably relies on the allocation starting out zeroed — confirm.
%buffer_0 = hal.allocator.allocate<%allocator : !hal.allocator> type("HostVisible|DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage|MappingScoped|MappingAccessRandom|Mapping") : !hal.buffer{%c1048576}
%cmd_1 = hal.command_buffer.create device(%device : !hal.device) mode("OneShot|AllowInlineExecution") categories("Transfer|Dispatch") : !hal.command_buffer
// NOTE(review): %_device_query_0_ok is loaded but has no use in this function.
%_device_query_0_ok = util.global.load @_device_query_0_ok : i1
%_device_query_0 = util.global.load @_device_query_0 : i1
cf.cond_br %_device_query_0, ^bb1, ^bb2
// Query true: bind the input at binding %c0 (range [0, %len]) and the result buffer at
// binding %c1 (range [0, 1048576]), then dispatch export ordinal 0 with workgroups (8, 16, 1).
^bb1: // pred: ^bb0
%_pipeline_layout_0 = util.global.load @_pipeline_layout_0 : !hal.pipeline_layout
hal.command_buffer.push_descriptor_set<%cmd_1 : !hal.command_buffer> layout(%_pipeline_layout_0 : !hal.pipeline_layout)[%c0] bindings([
%c0 = (%1 : !hal.buffer)[%c0, %len],
%c1 = (%buffer_0 : !hal.buffer)[%c0, %c1048576]
])
// NOTE(review): %2 has no use in this function.
%2 = hal.command_buffer.device<%cmd_1 : !hal.command_buffer> : !hal.device
%_executable__split_reduction_pass2_dispatch_0 = util.global.load @_executable__split_reduction_pass2_dispatch_0 : !hal.executable
hal.command_buffer.dispatch<%cmd_1 : !hal.command_buffer> target(%_executable__split_reduction_pass2_dispatch_0 : !hal.executable)[0] workgroups([%c8, %c16, %c1])
cf.br ^bb3
// Query false: no dispatch is recorded.
^bb2: // pred: ^bb0
util.unreachable "device not supported in the compiled configuration"
// Fill the second half of the result buffer with the expected value 128, then finalize, submit and wait.
^bb3: // pred: ^bb1
hal.command_buffer.fill_buffer<%cmd_1 : !hal.command_buffer> target(%buffer_0 : !hal.buffer)[%c524288, %c524288] pattern(%c128_i32 : i32)
hal.command_buffer.execution_barrier<%cmd_1 : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None")
hal.command_buffer.finalize<%cmd_1 : !hal.command_buffer>
%fence_2 = hal.fence.create device(%device : !hal.device) flags("None") : !hal.fence
hal.device.queue.execute<%device : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence_2) commands([%cmd_1])
%status_3 = hal.fence.await until([%fence_2]) timeout_millis(%c-1_i32) : i32
util.status.check_ok %status_3, "failed to wait on timepoint"
// %view covers bytes [0, 524288) and %view_4 covers [524288, 1048576) of %buffer_0, both shaped 512 x 256.
%view = hal.buffer_view.create buffer(%buffer_0 : !hal.buffer)[%c0, %c524288] shape([%c512, %c256]) type(%c268435488_i32) encoding(%c1_i32) : !hal.buffer_view
%view_4 = hal.buffer_view.create buffer(%buffer_0 : !hal.buffer)[%c524288, %c524288] shape([%c512, %c256]) type(%c268435488_i32) encoding(%c1_i32) : !hal.buffer_view
check.expect_eq(%view, %view_4) : !hal.buffer_view
return
}
}
// -----// IR Dump After Canonicalizer (canonicalize) ('builtin.module' operation) //----- //
// Device target alias: "llvm-cpu" with a single executable target ("embedded-elf-x86_64"); the `legacy_sync` entry is also present.
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
// Executable target alias; it spells out the same target attribute that is nested inside #device_target_llvm_cpu.
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
// Pipeline layout alias: zero push constants and one descriptor set (0) with binding 0 = ReadOnly storage_buffer and binding 1 = storage_buffer.
#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>
// Translation info alias naming CPUDoubleTilingExpert.
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
// Memoized device query results: @_device_query_0 receives the queried value and
// @_device_query_0_ok receives the query's `ok` result (see the two stores below).
util.global private @_device_query_0 : i1
util.global private @_device_query_0_ok : i1
// Queries the shared device once for "hal.executable.format" :: "embedded-elf-x86_64"
// and stores both results into the globals declared above.
util.initializer {
%device = hal.ex.shared_device : !hal.device
// `= false` is the default attached to %value; presumably this is what %value holds when the query does not succeed — confirm.
%ok, %value = hal.device.query<%device : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false
util.global.store %ok, @_device_query_0_ok : i1
util.global.store %value, @_device_query_0 : i1
util.initializer.return
}
// Cached descriptor set layout, created once by the initializer below.
util.global private @_descriptor_set_layout_0 : !hal.descriptor_set_layout
// Creates a descriptor set layout on the shared device with two bindings
// (0: storage_buffer, ReadOnly; 1: storage_buffer) and stores it in the global.
util.initializer {
%device = hal.ex.shared_device : !hal.device
%descriptor_set_layout = hal.descriptor_set_layout.create device(%device : !hal.device) flags("None") bindings([#hal.descriptor_set.binding<0, storage_buffer, ReadOnly>, #hal.descriptor_set.binding<1, storage_buffer>]) : !hal.descriptor_set_layout
util.global.store %descriptor_set_layout, @_descriptor_set_layout_0 : !hal.descriptor_set_layout
util.initializer.return
}
// Cached pipeline layout, created once by the initializer below.
util.global private @_pipeline_layout_0 : !hal.pipeline_layout
// Builds the pipeline layout from the value stored in @_descriptor_set_layout_0
// with push_constants(0), then stores it in the global.
util.initializer {
%_descriptor_set_layout_0 = util.global.load @_descriptor_set_layout_0 : !hal.descriptor_set_layout
%device = hal.ex.shared_device : !hal.device
%pipeline_layout = hal.pipeline_layout.create device(%device : !hal.device) push_constants(0) layouts([%_descriptor_set_layout_0]) : !hal.pipeline_layout
util.global.store %pipeline_layout, @_pipeline_layout_0 : !hal.pipeline_layout
util.initializer.return
}
// Cached executable handle; the initializer below stores either a created executable or null.
util.global private @_executable__split_reduction_pass2_dispatch_0 : !hal.executable
// Creates the executable when the memoized format query (@_device_query_0) is true,
// otherwise stores a null executable.
util.initializer {
%device = hal.ex.shared_device : !hal.device
%_device_query_0 = util.global.load @_device_query_0 : i1
cf.cond_br %_device_query_0, ^bb1, ^bb2
// Query true: create the executable from the @embedded_elf_x86_64 variant using the cached pipeline layout.
^bb1: // pred: ^bb0
%_pipeline_layout_0 = util.global.load @_pipeline_layout_0 : !hal.pipeline_layout
%exe = hal.executable.create device(%device : !hal.device) target(@_split_reduction_pass2_dispatch_0::@embedded_elf_x86_64) layouts([%_pipeline_layout_0]) : !hal.executable
cf.br ^bb3(%exe : !hal.executable)
// Query false: forward a null executable to the join block.
^bb2: // pred: ^bb0
%0 = util.null : !hal.executable
cf.br ^bb3(%0 : !hal.executable)
// Join: store whichever executable (created or null) arrived as %1.
^bb3(%1: !hal.executable): // 2 preds: ^bb1, ^bb2
util.global.store %1, @_executable__split_reduction_pass2_dispatch_0 : !hal.executable
util.initializer.return
}
// Public entry point carrying the `iree.abi.stub` attribute; it only forwards to the
// private @_split_reduction_pass2 function and returns.
func.func @split_reduction_pass2() attributes {iree.abi.stub} {
call @_split_reduction_pass2() : () -> ()
return
}
// Dispatch executable with a single variant (@embedded_elf_x86_64) exporting one entry point (ordinal 0).
hal.executable private @_split_reduction_pass2_dispatch_0 {
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
// Export region for ordinal 0: its block arguments are not used and it returns the constant index triple (8, 16, 1).
// The host function in this module passes the same triple as workgroups([%c8, %c16, %c1]);
// presumably it is the workgroup count in x, y, z — confirm.
hal.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 ordinal(0) layout(#pipeline_layout) attributes {translation_info = #translation} {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index):
%c1 = arith.constant 1 : index
%c16 = arith.constant 16 : index
%c8 = arith.constant 8 : index
hal.return %c8, %c16, %c1 : index, index, index
}
builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} {
// Kernel body: reads i32 data through entry 0 of the pointer table in %arg1 (field 10) and
// accumulates sums of 128 consecutive values into the i32 buffer behind entry 1.
//   %arg0: environment struct pointer; not referenced in the body.
//   %arg1: dispatch state pointer; only field 10 (ptr<ptr<i8>>) is read.
//   %arg2: workgroup state pointer; fields 0 and 1 are read as tile coordinates
//          (presumably workgroup id x / y — confirm).
// Always returns the i32 constant 0 (%0).
llvm.func internal @_split_reduction_pass2_dispatch_0_generic_512x256x128(%arg0: !llvm.ptr<struct<"iree_hal_executable_environment_v0_t", (ptr<i32>, ptr<func<i32 (ptr<func<i32 (ptr<i8>)>>, ptr<i8>)>>, ptr<ptr<func<i32 (ptr<i8>)>>>, struct<"iree_hal_processor_v0_t", (array<8 x i64>)>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>> {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {sym_visibility = "private"} {
// Constants. Some values exist twice (e.g. 128 as %4 and %8) because one copy carries an
// i64 attribute and the other an index attribute; both produce i64 results.
%0 = llvm.mlir.constant(0 : i32) : i32
%1 = llvm.mlir.constant(3 : i64) : i64
%2 = llvm.mlir.constant(2 : i64) : i64
%3 = llvm.mlir.constant(32768 : i64) : i64
%4 = llvm.mlir.constant(128 : i64) : i64
%5 = llvm.mlir.constant(256 : i64) : i64
%6 = llvm.mlir.constant(1 : i64) : i64
%7 = llvm.mlir.constant(63 : index) : i64
%8 = llvm.mlir.constant(128 : index) : i64
%9 = llvm.mlir.constant(256 : index) : i64
%10 = llvm.mlir.constant(32768 : index) : i64
%11 = llvm.mlir.constant(0 : i64) : i64
%12 = llvm.mlir.constant(dense<0> : vector<4xi32>) : vector<4xi32>
%13 = llvm.mlir.constant(2 : index) : i64
%14 = llvm.mlir.constant(3 : index) : i64
%15 = llvm.mlir.constant(dense<0> : vector<1xi32>) : vector<1xi32>
%16 = llvm.mlir.constant(dense<0> : vector<1x1x4xi32>) : !llvm.array<1 x array<1 x vector<4xi32>>>
%17 = llvm.mlir.constant(4 : index) : i64
%18 = llvm.mlir.constant(1 : index) : i64
%19 = llvm.mlir.constant(32 : index) : i64
%20 = llvm.mlir.constant(0 : index) : i64
// Input pointer: entry 0 of the pointer table, viewed as i32* (%24); assume (ptr & 63) == 0.
%21 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>>
%22 = llvm.extractvalue %21[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>
%23 = llvm.load %22 : !llvm.ptr<ptr<i8>>
%24 = llvm.bitcast %23 : !llvm.ptr<i8> to !llvm.ptr<i32>
%25 = llvm.ptrtoint %24 : !llvm.ptr<i32> to i64
%26 = llvm.and %25, %7 : i64
%27 = llvm.icmp "eq" %26, %20 : i64
"llvm.intr.assume"(%27) : (i1) -> ()
// Output pointer: entry 1 of the same table, viewed as i32* (%32); same alignment assumption.
%28 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>>
%29 = llvm.extractvalue %28[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>
%30 = llvm.getelementptr %29[1] : (!llvm.ptr<ptr<i8>>) -> !llvm.ptr<ptr<i8>>
%31 = llvm.load %30 : !llvm.ptr<ptr<i8>>
%32 = llvm.bitcast %31 : !llvm.ptr<i8> to !llvm.ptr<i32>
%33 = llvm.ptrtoint %32 : !llvm.ptr<i32> to i64
%34 = llvm.and %33, %7 : i64
%35 = llvm.icmp "eq" %34, %20 : i64
"llvm.intr.assume"(%35) : (i1) -> ()
// Tile origin: %38 / %40 are workgroup-state fields 0 / 1 zero-extended to i64;
// %42 = %38 * 32 is the starting column and %41 = %40 * 32 the starting row of this tile.
%36 = llvm.load %arg2 : !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>>
%37 = llvm.extractvalue %36[0] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>
%38 = llvm.zext %37 : i32 to i64
%39 = llvm.extractvalue %36[1] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>
%40 = llvm.zext %39 : i32 to i64
%41 = llvm.mul %40, %19 : i64
%42 = llvm.mul %38, %19 : i64
// %46 = output base element offset = %41 * 256 + %42.
%43 = llvm.mul %41, %9 : i64
%44 = llvm.add %43, %20 : i64
%45 = llvm.mul %42, %18 : i64
%46 = llvm.add %44, %45 : i64
// %52 = input base element offset = %41 * 32768 + %42 * 128 (the %51 term is 0 * 1).
%47 = llvm.mul %41, %10 : i64
%48 = llvm.add %47, %20 : i64
%49 = llvm.mul %42, %8 : i64
%50 = llvm.add %48, %49 : i64
%51 = llvm.mul %11, %18 : i64
%52 = llvm.add %50, %51 : i64
llvm.br ^bb1(%20 : i64)
// Row loop: %53 runs over [0, 32) in steps of 1 (incremented in ^bb10).
^bb1(%53: i64): // 2 preds: ^bb0, ^bb10
%54 = llvm.icmp "slt" %53, %19 : i64
llvm.cond_br %54, ^bb2(%20 : i64), ^bb11
// Column-group loop: %55 runs over [0, 32) in steps of 4 (incremented by %17 in ^bb9).
^bb2(%55: i64): // 2 preds: ^bb1, ^bb9
%56 = llvm.icmp "slt" %55, %19 : i64
llvm.cond_br %56, ^bb3, ^bb10
// %62 = input offset for (row %53, column %55); %66 = matching output offset.
^bb3: // pred: ^bb2
%57 = llvm.mul %53, %3 : i64
%58 = llvm.add %52, %57 : i64
%59 = llvm.mul %55, %4 : i64
%60 = llvm.add %58, %59 : i64
%61 = llvm.mul %11, %6 : i64
%62 = llvm.add %60, %61 : i64
%63 = llvm.mul %53, %5 : i64
%64 = llvm.add %46, %63 : i64
%65 = llvm.mul %55, %6 : i64
%66 = llvm.add %64, %65 : i64
llvm.br ^bb4(%20 : i64)
// Column loop inside the group: %67 runs over [0, 4) in steps of 1 (incremented in ^bb8).
^bb4(%67: i64): // 2 preds: ^bb3, ^bb8
%68 = llvm.icmp "slt" %67, %17 : i64
llvm.cond_br %68, ^bb5, ^bb9
// %73 = %62 + %67 * 128 (the %69 and %61 terms are zero); the accumulator starts as the all-zero constant %16.
^bb5: // pred: ^bb4
%69 = llvm.mul %11, %3 : i64
%70 = llvm.add %62, %69 : i64
%71 = llvm.mul %67, %4 : i64
%72 = llvm.add %70, %71 : i64
%73 = llvm.add %72, %61 : i64
llvm.br ^bb6(%20, %16 : i64, !llvm.array<1 x array<1 x vector<4xi32>>>)
// Reduction loop: %74 runs over [0, 32) in steps of 1 and each iteration consumes 4 consecutive
// i32 values at offset %74 * 4 (128 values in total); %75 carries the 4-lane accumulator.
^bb6(%74: i64, %75: !llvm.array<1 x array<1 x vector<4xi32>>>): // 2 preds: ^bb5, ^bb7
%76 = llvm.icmp "slt" %74, %19 : i64
llvm.cond_br %76, ^bb7, ^bb8
^bb7: // pred: ^bb6
// %77 and %79 multiply by the zero constant %20, so %83 = %73 + %74 * 4.
%77 = llvm.mul %20, %10 : i64
%78 = llvm.add %73, %77 : i64
%79 = llvm.mul %20, %8 : i64
%80 = llvm.add %78, %79 : i64
%81 = llvm.mul %74, %17 : i64
%82 = llvm.add %80, %81 : i64
%83 = llvm.add %82, %20 : i64
// Load 4 x i32 from the input at element offset %83 (declared alignment: 4 bytes).
%84 = llvm.getelementptr %24[%83] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32>
%85 = llvm.bitcast %84 : !llvm.ptr<i32> to !llvm.ptr<vector<4xi32>>
%86 = llvm.load %85 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>>
// Lane-by-lane add of the loaded vector (%86) and the current accumulator (%87); %103 holds all four sums.
%87 = llvm.extractvalue %75[0, 0] : !llvm.array<1 x array<1 x vector<4xi32>>>
%88 = llvm.extractelement %87[%11 : i64] : vector<4xi32>
%89 = llvm.extractelement %86[%11 : i64] : vector<4xi32>
%90 = llvm.add %89, %88 : i32
%91 = llvm.insertelement %90, %12[%20 : i64] : vector<4xi32>
%92 = llvm.extractelement %87[%6 : i64] : vector<4xi32>
%93 = llvm.extractelement %86[%6 : i64] : vector<4xi32>
%94 = llvm.add %93, %92 : i32
%95 = llvm.insertelement %94, %91[%18 : i64] : vector<4xi32>
%96 = llvm.extractelement %87[%2 : i64] : vector<4xi32>
%97 = llvm.extractelement %86[%2 : i64] : vector<4xi32>
%98 = llvm.add %97, %96 : i32
%99 = llvm.insertelement %98, %95[%13 : i64] : vector<4xi32>
%100 = llvm.extractelement %87[%1 : i64] : vector<4xi32>
%101 = llvm.extractelement %86[%1 : i64] : vector<4xi32>
%102 = llvm.add %101, %100 : i32
%103 = llvm.insertelement %102, %99[%14 : i64] : vector<4xi32>
// Repack %103 lane by lane into the array-of-vector shape carried by the loop; %116 ends up holding all four lanes of %103.
%104 = llvm.extractelement %103[%11 : i64] : vector<4xi32>
%105 = llvm.extractvalue %16[0, 0] : !llvm.array<1 x array<1 x vector<4xi32>>>
%106 = llvm.insertelement %104, %105[%11 : i64] : vector<4xi32>
%107 = llvm.insertvalue %106, %16[0, 0] : !llvm.array<1 x array<1 x vector<4xi32>>>
%108 = llvm.extractelement %103[%6 : i64] : vector<4xi32>
%109 = llvm.insertelement %108, %106[%6 : i64] : vector<4xi32>
%110 = llvm.insertvalue %109, %107[0, 0] : !llvm.array<1 x array<1 x vector<4xi32>>>
%111 = llvm.extractelement %103[%2 : i64] : vector<4xi32>
%112 = llvm.insertelement %111, %109[%2 : i64] : vector<4xi32>
%113 = llvm.insertvalue %112, %110[0, 0] : !llvm.array<1 x array<1 x vector<4xi32>>>
%114 = llvm.extractelement %103[%1 : i64] : vector<4xi32>
%115 = llvm.insertelement %114, %112[%1 : i64] : vector<4xi32>
%116 = llvm.insertvalue %115, %113[0, 0] : !llvm.array<1 x array<1 x vector<4xi32>>>
%117 = llvm.add %74, %18 : i64
llvm.br ^bb6(%117, %116 : i64, !llvm.array<1 x array<1 x vector<4xi32>>>)
^bb8: // pred: ^bb6
// %120 = %66 + %67 (the %118 term is 0 * 256): output element for this column.
%118 = llvm.mul %20, %9 : i64
%119 = llvm.add %66, %118 : i64
%120 = llvm.add %119, %67 : i64
%121 = llvm.getelementptr %32[%120] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32>
// Read-modify-write: the horizontal sum of the accumulator (%124) is added to the value
// already stored at the output element, so the result depends on the output's prior contents.
%122 = llvm.load %121 : !llvm.ptr<i32>
%123 = llvm.extractvalue %75[0, 0] : !llvm.array<1 x array<1 x vector<4xi32>>>
%124 = "llvm.intr.vector.reduce.add"(%123) : (vector<4xi32>) -> i32
%125 = llvm.add %122, %124 : i32
// The scalar is round-tripped through a vector<1xi32> before being stored.
%126 = llvm.insertelement %125, %15[%20 : i64] : vector<1xi32>
%127 = llvm.extractelement %126[%11 : i64] : vector<1xi32>
llvm.store %127, %121 : !llvm.ptr<i32>
%128 = llvm.add %67, %18 : i64
llvm.br ^bb4(%128 : i64)
// Advance the column group by 4.
^bb9: // pred: ^bb4
%129 = llvm.add %55, %17 : i64
llvm.br ^bb2(%129 : i64)
// Advance the row by 1.
^bb10: // pred: ^bb2
%130 = llvm.add %53, %18 : i64
llvm.br ^bb1(%130 : i64)
// All rows done: return 0.
^bb11: // pred: ^bb1
llvm.return %0 : i32
}
}
}
}
// Host-side test body: fills an input buffer with the i32 pattern 1, records a dispatch of the
// reduction kernel plus a fill of the expected value 128, waits for completion, and compares
// two views of the result buffer with check.expect_eq.
func.func private @_split_reduction_pass2() {
// Element-type code passed to hal.buffer_view.create; presumably encodes a 32-bit integer type — confirm.
%c268435488_i32 = arith.constant 268435488 : i32
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c-1_i32 = arith.constant -1 : i32
%c-1_i64 = arith.constant -1 : i64
// Byte sizes: 1048576 = 2 * 524288, 524288 = 512 * 256 * 4, 67108864 = 512 * 256 * 128 * 4.
%c1048576 = arith.constant 1048576 : index
%c0 = arith.constant 0 : index
%c524288 = arith.constant 524288 : index
%c67108864 = arith.constant 67108864 : index
%c1_i32 = arith.constant 1 : i32
%c128_i32 = arith.constant 128 : i32
%c512 = arith.constant 512 : index
%c256 = arith.constant 256 : index
%c1 = arith.constant 1 : index
%device = hal.ex.shared_device : !hal.device
%allocator = hal.device.allocator<%device : !hal.device> : !hal.allocator
// Input buffer (67108864 bytes), filled entirely with the i32 pattern 1 by the first command buffer.
%buffer = hal.allocator.allocate<%allocator : !hal.allocator> type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c67108864}
%cmd = hal.command_buffer.create device(%device : !hal.device) mode("OneShot|AllowInlineExecution") categories(Transfer) : !hal.command_buffer
hal.command_buffer.fill_buffer<%cmd : !hal.command_buffer> target(%buffer : !hal.buffer)[%c0, %c67108864] pattern(%c1_i32 : i32)
hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None")
hal.command_buffer.finalize<%cmd : !hal.command_buffer>
// Submit with a null wait fence and block (timeout -1) until %fence is signaled.
%0 = util.null : !hal.fence
%fence = hal.fence.create device(%device : !hal.device) flags("None") : !hal.fence
hal.device.queue.execute<%device : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) commands([%cmd])
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.status.check_ok %status, "failed to wait on timepoint"
// The input is passed through util.do_not_optimize and its length is re-queried rather than reusing %c67108864.
%1 = util.do_not_optimize(%buffer) : !hal.buffer
%len = hal.buffer.length<%1 : !hal.buffer> : index
// Result buffer (1048576 bytes): bytes [0, 524288) are viewed as the kernel output and
// bytes [524288, 1048576) as the expected values (see the views at the end of ^bb1).
// NOTE(review): no command visible in this function writes bytes [0, 524288) before the dispatch,
// while the kernel's ^bb8 block adds into the existing output value; the comparison below
// presumably relies on the allocation starting out zeroed — confirm.
%buffer_0 = hal.allocator.allocate<%allocator : !hal.allocator> type("HostVisible|DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage|MappingScoped|MappingAccessRandom|Mapping") : !hal.buffer{%c1048576}
%cmd_1 = hal.command_buffer.create device(%device : !hal.device) mode("OneShot|AllowInlineExecution") categories("Transfer|Dispatch") : !hal.command_buffer
%_device_query_0 = util.global.load @_device_query_0 : i1
cf.cond_br %_device_query_0, ^bb1, ^bb2
// Query true: bind the input at binding %c0 (range [0, %len]) and the result buffer at
// binding %c1 (range [0, 1048576]), dispatch export ordinal 0 with workgroups (8, 16, 1),
// then record the expected-value fill, submit, wait, and compare.
^bb1: // pred: ^bb0
%_pipeline_layout_0 = util.global.load @_pipeline_layout_0 : !hal.pipeline_layout
hal.command_buffer.push_descriptor_set<%cmd_1 : !hal.command_buffer> layout(%_pipeline_layout_0 : !hal.pipeline_layout)[%c0] bindings([
%c0 = (%1 : !hal.buffer)[%c0, %len],
%c1 = (%buffer_0 : !hal.buffer)[%c0, %c1048576]
])
%_executable__split_reduction_pass2_dispatch_0 = util.global.load @_executable__split_reduction_pass2_dispatch_0 : !hal.executable
hal.command_buffer.dispatch<%cmd_1 : !hal.command_buffer> target(%_executable__split_reduction_pass2_dispatch_0 : !hal.executable)[0] workgroups([%c8, %c16, %c1])
// Fill the second half of the result buffer with the expected value 128.
hal.command_buffer.fill_buffer<%cmd_1 : !hal.command_buffer> target(%buffer_0 : !hal.buffer)[%c524288, %c524288] pattern(%c128_i32 : i32)
hal.command_buffer.execution_barrier<%cmd_1 : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None")
hal.command_buffer.finalize<%cmd_1 : !hal.command_buffer>
%fence_2 = hal.fence.create device(%device : !hal.device) flags("None") : !hal.fence
hal.device.queue.execute<%device : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence_2) commands([%cmd_1])
%status_3 = hal.fence.await until([%fence_2]) timeout_millis(%c-1_i32) : i32
util.status.check_ok %status_3, "failed to wait on timepoint"
// %view covers bytes [0, 524288) and %view_4 covers [524288, 1048576) of %buffer_0, both shaped 512 x 256.
%view = hal.buffer_view.create buffer(%buffer_0 : !hal.buffer)[%c0, %c524288] shape([%c512, %c256]) type(%c268435488_i32) encoding(%c1_i32) : !hal.buffer_view
%view_4 = hal.buffer_view.create buffer(%buffer_0 : !hal.buffer)[%c524288, %c524288] shape([%c512, %c256]) type(%c268435488_i32) encoding(%c1_i32) : !hal.buffer_view
check.expect_eq(%view, %view_4) : !hal.buffer_view
return
// Query false: nothing is dispatched or compared.
^bb2: // pred: ^bb0
util.unreachable "device not supported in the compiled configuration"
}
}
// -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) ('util.initializer' operation) //----- //
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
// Globals holding the two i1 results of the device query performed by the
// initializer below.
util.global private @_device_query_0 : i1
util.global private @_device_query_0_ok : i1
// Initializer: queries the shared device for the key
// "hal.executable.format" :: "embedded-elf-x86_64" and stores both results.
util.initializer {
%device = hal.ex.shared_device : !hal.device
// NOTE(review): the trailing `= false` is presumably the default used for
// %value when the query does not succeed — confirm against the HAL op docs.
%ok, %value = hal.device.query<%device : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false
util.global.store %value, @_device_query_0 : i1
util.global.store %ok, @_device_query_0_ok : i1
util.initializer.return
}
// Global holding the descriptor set layout created by the initializer below.
util.global private @_descriptor_set_layout_0 : !hal.descriptor_set_layout
// Initializer: creates a descriptor set layout on the shared device with two
// bindings (binding 0: storage_buffer, ReadOnly; binding 1: storage_buffer)
// and stores it in @_descriptor_set_layout_0.
util.initializer {
%device = hal.ex.shared_device : !hal.device
%descriptor_set_layout = hal.descriptor_set_layout.create device(%device : !hal.device) flags("None") bindings([#hal.descriptor_set.binding<0, storage_buffer, ReadOnly>, #hal.descriptor_set.binding<1, storage_buffer>]) : !hal.descriptor_set_layout
util.global.store %descriptor_set_layout, @_descriptor_set_layout_0 : !hal.descriptor_set_layout
util.initializer.return
}
// Global holding the pipeline layout created by the initializer below.
util.global private @_pipeline_layout_0 : !hal.pipeline_layout
// Initializer: builds a pipeline layout with push_constants(0) and the single
// descriptor set layout loaded from @_descriptor_set_layout_0, then stores it
// in @_pipeline_layout_0.
util.initializer {
%_descriptor_set_layout_0 = util.global.load @_descriptor_set_layout_0 : !hal.descriptor_set_layout
%device = hal.ex.shared_device : !hal.device
%pipeline_layout = hal.pipeline_layout.create device(%device : !hal.device) push_constants(0) layouts([%_descriptor_set_layout_0]) : !hal.pipeline_layout
util.global.store %pipeline_layout, @_pipeline_layout_0 : !hal.pipeline_layout
util.initializer.return
}
// Global holding the executable handle stored by the initializer below.
util.global private @_executable__split_reduction_pass2_dispatch_0 : !hal.executable
// Initializer: when @_device_query_0 is true, creates the executable for the
// @embedded_elf_x86_64 variant; otherwise stores a null executable.
util.initializer {
%device = hal.ex.shared_device : !hal.device
%_device_query_0 = util.global.load @_device_query_0 : i1
cf.cond_br %_device_query_0, ^bb1, ^bb2
// ^bb1: query result was true — create the executable with the pipeline layout.
^bb1: // pred: ^bb0
%_pipeline_layout_0 = util.global.load @_pipeline_layout_0 : !hal.pipeline_layout
%exe = hal.executable.create device(%device : !hal.device) target(@_split_reduction_pass2_dispatch_0::@embedded_elf_x86_64) layouts([%_pipeline_layout_0]) : !hal.executable
cf.br ^bb3(%exe : !hal.executable)
// ^bb2: query result was false — forward a null executable instead.
^bb2: // pred: ^bb0
%0 = util.null : !hal.executable
cf.br ^bb3(%0 : !hal.executable)
// ^bb3: merge point; %1 is whichever executable value the predecessor passed.
^bb3(%1: !hal.executable): // 2 preds: ^bb1, ^bb2
util.global.store %1, @_executable__split_reduction_pass2_dispatch_0 : !hal.executable
util.initializer.return
}
// Public entry point (tagged iree.abi.stub): takes no arguments, returns no
// results, and only forwards to the private @_split_reduction_pass2.
func.func @split_reduction_pass2() attributes {iree.abi.stub} {
call @_split_reduction_pass2() : () -> ()
return
}
// Executable with one variant (@embedded_elf_x86_64) and one exported entry
// point whose body is already expressed in the LLVM dialect.
hal.executable private @_split_reduction_pass2_dispatch_0 {
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
// Export region: returns the three index values (8, 16, 1). Its block
// arguments are not used by the region.
hal.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 ordinal(0) layout(#pipeline_layout) attributes {translation_info = #translation} {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index):
%c1 = arith.constant 1 : index
%c16 = arith.constant 16 : index
%c8 = arith.constant 8 : index
hal.return %c8, %c16, %c1 : index, index, index
}
builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} {
// Kernel body. Receives pointers to the environment (%arg0, unused here),
// dispatch-state (%arg1) and workgroup-state (%arg2) structs and always
// returns the i32 constant 0 (see ^bb11).
llvm.func internal @_split_reduction_pass2_dispatch_0_generic_512x256x128(%arg0: !llvm.ptr<struct<"iree_hal_executable_environment_v0_t", (ptr<i32>, ptr<func<i32 (ptr<func<i32 (ptr<i8>)>>, ptr<i8>)>>, ptr<ptr<func<i32 (ptr<i8>)>>>, struct<"iree_hal_processor_v0_t", (array<8 x i64>)>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>> {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {sym_visibility = "private"} {
// Constant pool. Several values appear twice because they were materialized
// once from an i64 attribute and once from an index attribute.
%0 = llvm.mlir.constant(0 : i32) : i32
%1 = llvm.mlir.constant(3 : i64) : i64
%2 = llvm.mlir.constant(2 : i64) : i64
%3 = llvm.mlir.constant(32768 : i64) : i64
%4 = llvm.mlir.constant(128 : i64) : i64
%5 = llvm.mlir.constant(256 : i64) : i64
%6 = llvm.mlir.constant(1 : i64) : i64
%7 = llvm.mlir.constant(63 : index) : i64
%8 = llvm.mlir.constant(128 : index) : i64
%9 = llvm.mlir.constant(256 : index) : i64
%10 = llvm.mlir.constant(32768 : index) : i64
%11 = llvm.mlir.constant(0 : i64) : i64
%12 = llvm.mlir.constant(dense<0> : vector<4xi32>) : vector<4xi32>
%13 = llvm.mlir.constant(2 : index) : i64
%14 = llvm.mlir.constant(3 : index) : i64
%15 = llvm.mlir.constant(dense<0> : vector<1xi32>) : vector<1xi32>
%16 = llvm.mlir.constant(dense<0> : vector<1x1x4xi32>) : !llvm.array<1 x array<1 x vector<4xi32>>>
%17 = llvm.mlir.constant(4 : index) : i64
%18 = llvm.mlir.constant(1 : index) : i64
%19 = llvm.mlir.constant(32 : index) : i64
%20 = llvm.mlir.constant(0 : index) : i64
// %24: entry 0 of the pointer array in field 10 of the dispatch state, viewed
// as an i32 pointer. It is only loaded from below (presumably binding 0, the
// ReadOnly storage buffer — TODO confirm). The assume states that
// (address & 63) == 0.
%21 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>>
%22 = llvm.extractvalue %21[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>
%23 = llvm.load %22 : !llvm.ptr<ptr<i8>>
%24 = llvm.bitcast %23 : !llvm.ptr<i8> to !llvm.ptr<i32>
%25 = llvm.ptrtoint %24 : !llvm.ptr<i32> to i64
%26 = llvm.and %25, %7 : i64
%27 = llvm.icmp "eq" %26, %20 : i64
"llvm.intr.assume"(%27) : (i1) -> ()
// %32: entry 1 of the same pointer array, viewed as an i32 pointer. It is both
// loaded from and stored to in ^bb8 (presumably binding 1 — TODO confirm).
// The same (address & 63) == 0 assumption is stated for it.
%28 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>>
%29 = llvm.extractvalue %28[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>
%30 = llvm.getelementptr %29[1] : (!llvm.ptr<ptr<i8>>) -> !llvm.ptr<ptr<i8>>
%31 = llvm.load %30 : !llvm.ptr<ptr<i8>>
%32 = llvm.bitcast %31 : !llvm.ptr<i8> to !llvm.ptr<i32>
%33 = llvm.ptrtoint %32 : !llvm.ptr<i32> to i64
%34 = llvm.and %33, %7 : i64
%35 = llvm.icmp "eq" %34, %20 : i64
"llvm.intr.assume"(%35) : (i1) -> ()
// Fields 0 and 1 of the workgroup state, zero-extended to i64 (presumably the
// workgroup ids along x and y — TODO confirm), each scaled by 32.
%36 = llvm.load %arg2 : !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>>
%37 = llvm.extractvalue %36[0] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>
%38 = llvm.zext %37 : i32 to i64
%39 = llvm.extractvalue %36[1] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>
%40 = llvm.zext %39 : i32 to i64
%41 = llvm.mul %40, %19 : i64
%42 = llvm.mul %38, %19 : i64
// %46 = %41 * 256 + %42: base element offset later used with %32.
%43 = llvm.mul %41, %9 : i64
%44 = llvm.add %43, %20 : i64
%45 = llvm.mul %42, %18 : i64
%46 = llvm.add %44, %45 : i64
// %52 = %41 * 32768 + %42 * 128 (+ 0): base element offset later used with %24.
%47 = llvm.mul %41, %10 : i64
%48 = llvm.add %47, %20 : i64
%49 = llvm.mul %42, %8 : i64
%50 = llvm.add %48, %49 : i64
%51 = llvm.mul %11, %18 : i64
%52 = llvm.add %50, %51 : i64
llvm.br ^bb1(%20 : i64)
// Loop 1 header: %53 runs from 0 while %53 < 32; incremented by 1 in ^bb10.
^bb1(%53: i64): // 2 preds: ^bb0, ^bb10
%54 = llvm.icmp "slt" %53, %19 : i64
llvm.cond_br %54, ^bb2(%20 : i64), ^bb11
// Loop 2 header: %55 runs from 0 while %55 < 32; incremented by 4 in ^bb9.
^bb2(%55: i64): // 2 preds: ^bb1, ^bb9
%56 = llvm.icmp "slt" %55, %19 : i64
llvm.cond_br %56, ^bb3, ^bb10
// Per-(%53, %55) offsets:
//   %62 = %52 + %53 * 32768 + %55 * 128 (+ 0)   (used with %24)
//   %66 = %46 + %53 * 256 + %55                 (used with %32)
^bb3: // pred: ^bb2
%57 = llvm.mul %53, %3 : i64
%58 = llvm.add %52, %57 : i64
%59 = llvm.mul %55, %4 : i64
%60 = llvm.add %58, %59 : i64
%61 = llvm.mul %11, %6 : i64
%62 = llvm.add %60, %61 : i64
%63 = llvm.mul %53, %5 : i64
%64 = llvm.add %46, %63 : i64
%65 = llvm.mul %55, %6 : i64
%66 = llvm.add %64, %65 : i64
llvm.br ^bb4(%20 : i64)
// Loop 3 header: %67 runs from 0 while %67 < 4; incremented by 1 in ^bb8.
^bb4(%67: i64): // 2 preds: ^bb3, ^bb8
%68 = llvm.icmp "slt" %67, %17 : i64
llvm.cond_br %68, ^bb5, ^bb9
// %73 = %62 + %67 * 128 (the other added terms are products with 0).
^bb5: // pred: ^bb4
%69 = llvm.mul %11, %3 : i64
%70 = llvm.add %62, %69 : i64
%71 = llvm.mul %67, %4 : i64
%72 = llvm.add %70, %71 : i64
%73 = llvm.add %72, %61 : i64
llvm.br ^bb6(%20, %16 : i64, !llvm.array<1 x array<1 x vector<4xi32>>>)
// Loop 4 header: %74 runs from 0 while %74 < 32 (step 1, see %117); %75 is
// the loop-carried accumulator, entering as the all-zero constant %16.
^bb6(%74: i64, %75: !llvm.array<1 x array<1 x vector<4xi32>>>): // 2 preds: ^bb5, ^bb7
%76 = llvm.icmp "slt" %74, %19 : i64
llvm.cond_br %76, ^bb7, ^bb8
// Loop 4 body: loads a vector<4xi32> from %24 at element offset %73 + %74 * 4
// and adds it lane by lane to the accumulator.
^bb7: // pred: ^bb6
%77 = llvm.mul %20, %10 : i64
%78 = llvm.add %73, %77 : i64
%79 = llvm.mul %20, %8 : i64
%80 = llvm.add %78, %79 : i64
%81 = llvm.mul %74, %17 : i64
%82 = llvm.add %80, %81 : i64
%83 = llvm.add %82, %20 : i64
%84 = llvm.getelementptr %24[%83] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32>
%85 = llvm.bitcast %84 : !llvm.ptr<i32> to !llvm.ptr<vector<4xi32>>
%86 = llvm.load %85 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>>
%87 = llvm.extractvalue %75[0, 0] : !llvm.array<1 x array<1 x vector<4xi32>>>
// Scalarized add: lanes 0..3 of %86 (loaded) plus %87 (accumulator) are summed
// one at a time and reassembled into %103.
%88 = llvm.extractelement %87[%11 : i64] : vector<4xi32>
%89 = llvm.extractelement %86[%11 : i64] : vector<4xi32>
%90 = llvm.add %89, %88 : i32
%91 = llvm.insertelement %90, %12[%20 : i64] : vector<4xi32>
%92 = llvm.extractelement %87[%6 : i64] : vector<4xi32>
%93 = llvm.extractelement %86[%6 : i64] : vector<4xi32>
%94 = llvm.add %93, %92 : i32
%95 = llvm.insertelement %94, %91[%18 : i64] : vector<4xi32>
%96 = llvm.extractelement %87[%2 : i64] : vector<4xi32>
%97 = llvm.extractelement %86[%2 : i64] : vector<4xi32>
%98 = llvm.add %97, %96 : i32
%99 = llvm.insertelement %98, %95[%13 : i64] : vector<4xi32>
%100 = llvm.extractelement %87[%1 : i64] : vector<4xi32>
%101 = llvm.extractelement %86[%1 : i64] : vector<4xi32>
%102 = llvm.add %101, %100 : i32
%103 = llvm.insertelement %102, %99[%14 : i64] : vector<4xi32>
// Lane-by-lane copy of %103 into a fresh array value; after the fourth insert
// %116 carries all four lanes of %103 and becomes the next accumulator.
%104 = llvm.extractelement %103[%11 : i64] : vector<4xi32>
%105 = llvm.extractvalue %16[0, 0] : !llvm.array<1 x array<1 x vector<4xi32>>>
%106 = llvm.insertelement %104, %105[%11 : i64] : vector<4xi32>
%107 = llvm.insertvalue %106, %16[0, 0] : !llvm.array<1 x array<1 x vector<4xi32>>>
%108 = llvm.extractelement %103[%6 : i64] : vector<4xi32>
%109 = llvm.insertelement %108, %106[%6 : i64] : vector<4xi32>
%110 = llvm.insertvalue %109, %107[0, 0] : !llvm.array<1 x array<1 x vector<4xi32>>>
%111 = llvm.extractelement %103[%2 : i64] : vector<4xi32>
%112 = llvm.insertelement %111, %109[%2 : i64] : vector<4xi32>
%113 = llvm.insertvalue %112, %110[0, 0] : !llvm.array<1 x array<1 x vector<4xi32>>>
%114 = llvm.extractelement %103[%1 : i64] : vector<4xi32>
%115 = llvm.insertelement %114, %112[%1 : i64] : vector<4xi32>
%116 = llvm.insertvalue %115, %113[0, 0] : !llvm.array<1 x array<1 x vector<4xi32>>>
%117 = llvm.add %74, %18 : i64
llvm.br ^bb6(%117, %116 : i64, !llvm.array<1 x array<1 x vector<4xi32>>>)
// Loop 4 exit: read the i32 at %32[%66 + %67], add the horizontal sum of the
// accumulator lanes, and store the result back to the same address. The
// existing memory value is an input to this sum.
^bb8: // pred: ^bb6
%118 = llvm.mul %20, %9 : i64
%119 = llvm.add %66, %118 : i64
%120 = llvm.add %119, %67 : i64
%121 = llvm.getelementptr %32[%120] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32>
%122 = llvm.load %121 : !llvm.ptr<i32>
%123 = llvm.extractvalue %75[0, 0] : !llvm.array<1 x array<1 x vector<4xi32>>>
%124 = "llvm.intr.vector.reduce.add"(%123) : (vector<4xi32>) -> i32
%125 = llvm.add %122, %124 : i32
// Round trip through vector<1xi32>: %127 is the same value as %125.
%126 = llvm.insertelement %125, %15[%20 : i64] : vector<1xi32>
%127 = llvm.extractelement %126[%11 : i64] : vector<1xi32>
llvm.store %127, %121 : !llvm.ptr<i32>
%128 = llvm.add %67, %18 : i64
llvm.br ^bb4(%128 : i64)
// Loop 2 latch: advance %55 by 4.
^bb9: // pred: ^bb4
%129 = llvm.add %55, %17 : i64
llvm.br ^bb2(%129 : i64)
// Loop 1 latch: advance %53 by 1.
^bb10: // pred: ^bb2
%130 = llvm.add %53, %18 : i64
llvm.br ^bb1(%130 : i64)
// Exit: return the i32 constant 0.
^bb11: // pred: ^bb1
llvm.return %0 : i32
}
}
}
}
// Private test body: fills an input buffer with the i32 pattern 1, runs the
// dispatch into %buffer_0, fills a second region of %buffer_0 with the i32
// pattern 128, and checks that two 512x256 views of %buffer_0 are equal.
func.func private @_split_reduction_pass2() {
%c268435488_i32 = arith.constant 268435488 : i32
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c-1_i32 = arith.constant -1 : i32
%c-1_i64 = arith.constant -1 : i64
%c1048576 = arith.constant 1048576 : index
%c0 = arith.constant 0 : index
%c524288 = arith.constant 524288 : index
%c67108864 = arith.constant 67108864 : index
%c1_i32 = arith.constant 1 : i32
%c128_i32 = arith.constant 128 : i32
%c512 = arith.constant 512 : index
%c256 = arith.constant 256 : index
%c1 = arith.constant 1 : index
%device = hal.ex.shared_device : !hal.device
%allocator = hal.device.allocator<%device : !hal.device> : !hal.allocator
// Input buffer of size %c67108864 (presumably bytes: 512*256*128*4 — TODO
// confirm), filled over its whole range with the pattern 1 by command
// buffer %cmd.
%buffer = hal.allocator.allocate<%allocator : !hal.allocator> type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c67108864}
%cmd = hal.command_buffer.create device(%device : !hal.device) mode("OneShot|AllowInlineExecution") categories(Transfer) : !hal.command_buffer
hal.command_buffer.fill_buffer<%cmd : !hal.command_buffer> target(%buffer : !hal.buffer)[%c0, %c67108864] pattern(%c1_i32 : i32)
hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None")
hal.command_buffer.finalize<%cmd : !hal.command_buffer>
// Submit %cmd with a null wait fence (%0), then block on %fence and check the
// returned status.
%0 = util.null : !hal.fence
%fence = hal.fence.create device(%device : !hal.device) flags("None") : !hal.fence
hal.device.queue.execute<%device : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) commands([%cmd])
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.status.check_ok %status, "failed to wait on timepoint"
%1 = util.do_not_optimize(%buffer) : !hal.buffer
%len = hal.buffer.length<%1 : !hal.buffer> : index
// %buffer_0 (size %c1048576) receives both the dispatch result and the
// expected-value fill.
%buffer_0 = hal.allocator.allocate<%allocator : !hal.allocator> type("HostVisible|DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage|MappingScoped|MappingAccessRandom|Mapping") : !hal.buffer{%c1048576}
%cmd_1 = hal.command_buffer.create device(%device : !hal.device) mode("OneShot|AllowInlineExecution") categories("Transfer|Dispatch") : !hal.command_buffer
// Only proceed when the executable-format query stored in @_device_query_0 is
// true; otherwise fall through to the unreachable in ^bb2.
%_device_query_0 = util.global.load @_device_query_0 : i1
cf.cond_br %_device_query_0, ^bb1, ^bb2
^bb1: // pred: ^bb0
%_pipeline_layout_0 = util.global.load @_pipeline_layout_0 : !hal.pipeline_layout
// Bind %1 (range [0, %len]) at binding %c0 and %buffer_0 (range
// [0, %c1048576]) at binding %c1 of descriptor set %c0.
hal.command_buffer.push_descriptor_set<%cmd_1 : !hal.command_buffer> layout(%_pipeline_layout_0 : !hal.pipeline_layout)[%c0] bindings([
%c0 = (%1 : !hal.buffer)[%c0, %len],
%c1 = (%buffer_0 : !hal.buffer)[%c0, %c1048576]
])
%_executable__split_reduction_pass2_dispatch_0 = util.global.load @_executable__split_reduction_pass2_dispatch_0 : !hal.executable
// NOTE(review): no fill of %buffer_0[0, 524288) is recorded in this function
// before the dispatch, while the kernel in this module loads from its second
// buffer before storing to it — confirm the initial contents are defined.
hal.command_buffer.dispatch<%cmd_1 : !hal.command_buffer> target(%_executable__split_reduction_pass2_dispatch_0 : !hal.executable)[0] workgroups([%c8, %c16, %c1])
// Fill the range [%c524288, +%c524288) of %buffer_0 with the pattern 128.
hal.command_buffer.fill_buffer<%cmd_1 : !hal.command_buffer> target(%buffer_0 : !hal.buffer)[%c524288, %c524288] pattern(%c128_i32 : i32)
hal.command_buffer.execution_barrier<%cmd_1 : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None")
hal.command_buffer.finalize<%cmd_1 : !hal.command_buffer>
%fence_2 = hal.fence.create device(%device : !hal.device) flags("None") : !hal.fence
hal.device.queue.execute<%device : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence_2) commands([%cmd_1])
%status_3 = hal.fence.await until([%fence_2]) timeout_millis(%c-1_i32) : i32
util.status.check_ok %status_3, "failed to wait on timepoint"
// %view covers %buffer_0 at offset 0 and %view_4 at offset %c524288, both with
// length %c524288 and shape [512, 256]; the check expects them to be equal.
%view = hal.buffer_view.create buffer(%buffer_0 : !hal.buffer)[%c0, %c524288] shape([%c512, %c256]) type(%c268435488_i32) encoding(%c1_i32) : !hal.buffer_view
%view_4 = hal.buffer_view.create buffer(%buffer_0 : !hal.buffer)[%c524288, %c524288] shape([%c512, %c256]) type(%c268435488_i32) encoding(%c1_i32) : !hal.buffer_view
check.expect_eq(%view, %view_4) : !hal.buffer_view
return
^bb2: // pred: ^bb0
util.unreachable "device not supported in the compiled configuration"
}
}
// -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) ('util.initializer' operation) //----- //
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
// Globals holding the two i1 results of the device query performed by the
// initializer below.
util.global private @_device_query_0 : i1
util.global private @_device_query_0_ok : i1
// Initializer: queries the shared device for the key
// "hal.executable.format" :: "embedded-elf-x86_64" and stores both results.
util.initializer {
%device = hal.ex.shared_device : !hal.device
// NOTE(review): the trailing `= false` is presumably the default used for
// %value when the query does not succeed — confirm against the HAL op docs.
%ok, %value = hal.device.query<%device : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false
util.global.store %value, @_device_query_0 : i1
util.global.store %ok, @_device_query_0_ok : i1
util.initializer.return
}
// Global holding the descriptor set layout created by the initializer below.
util.global private @_descriptor_set_layout_0 : !hal.descriptor_set_layout
// Initializer: creates a descriptor set layout on the shared device with two
// bindings (binding 0: storage_buffer, ReadOnly; binding 1: storage_buffer)
// and stores it in @_descriptor_set_layout_0.
util.initializer {
%device = hal.ex.shared_device : !hal.device
%descriptor_set_layout = hal.descriptor_set_layout.create device(%device : !hal.device) flags("None") bindings([#hal.descriptor_set.binding<0, storage_buffer, ReadOnly>, #hal.descriptor_set.binding<1, storage_buffer>]) : !hal.descriptor_set_layout
util.global.store %descriptor_set_layout, @_descriptor_set_layout_0 : !hal.descriptor_set_layout
util.initializer.return
}
// Global holding the pipeline layout created by the initializer below.
util.global private @_pipeline_layout_0 : !hal.pipeline_layout
// Initializer: builds a pipeline layout with push_constants(0) and the single
// descriptor set layout loaded from @_descriptor_set_layout_0, then stores it
// in @_pipeline_layout_0.
util.initializer {
%_descriptor_set_layout_0 = util.global.load @_descriptor_set_layout_0 : !hal.descriptor_set_layout
%device = hal.ex.shared_device : !hal.device
%pipeline_layout = hal.pipeline_layout.create device(%device : !hal.device) push_constants(0) layouts([%_descriptor_set_layout_0]) : !hal.pipeline_layout
util.global.store %pipeline_layout, @_pipeline_layout_0 : !hal.pipeline_layout
util.initializer.return
}
// Global holding the executable handle stored by the initializer below.
util.global private @_executable__split_reduction_pass2_dispatch_0 : !hal.executable
// Initializer: when @_device_query_0 is true, creates the executable for the
// @embedded_elf_x86_64 variant; otherwise stores a null executable.
util.initializer {
// The global load is the first op of the entry block here, ahead of the
// device lookup.
%_device_query_0 = util.global.load @_device_query_0 : i1
%device = hal.ex.shared_device : !hal.device
cf.cond_br %_device_query_0, ^bb1, ^bb2
// ^bb1: query result was true — create the executable with the pipeline layout.
^bb1: // pred: ^bb0
%_pipeline_layout_0 = util.global.load @_pipeline_layout_0 : !hal.pipeline_layout
%exe = hal.executable.create device(%device : !hal.device) target(@_split_reduction_pass2_dispatch_0::@embedded_elf_x86_64) layouts([%_pipeline_layout_0]) : !hal.executable
cf.br ^bb3(%exe : !hal.executable)
// ^bb2: query result was false — forward a null executable instead.
^bb2: // pred: ^bb0
%0 = util.null : !hal.executable
cf.br ^bb3(%0 : !hal.executable)
// ^bb3: merge point; %1 is whichever executable value the predecessor passed.
^bb3(%1: !hal.executable): // 2 preds: ^bb1, ^bb2
util.global.store %1, @_executable__split_reduction_pass2_dispatch_0 : !hal.executable
util.initializer.return
}
// Public entry point (tagged iree.abi.stub): takes no arguments, returns no
// results, and only forwards to the private @_split_reduction_pass2.
func.func @split_reduction_pass2() attributes {iree.abi.stub} {
call @_split_reduction_pass2() : () -> ()
return
}
// Executable with one variant (@embedded_elf_x86_64) and one exported entry
// point whose body is already expressed in the LLVM dialect.
hal.executable private @_split_reduction_pass2_dispatch_0 {
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
// Export region: returns the three index values (8, 16, 1). Its block
// arguments are not used by the region.
hal.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 ordinal(0) layout(#pipeline_layout) attributes {translation_info = #translation} {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index):
%c1 = arith.constant 1 : index
%c16 = arith.constant 16 : index
%c8 = arith.constant 8 : index
hal.return %c8, %c16, %c1 : index, index, index
}
builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} {
// Kernel body. Receives pointers to the environment (%arg0, unused here),
// dispatch-state (%arg1) and workgroup-state (%arg2) structs and always
// returns the i32 constant 0 (see ^bb11).
llvm.func internal @_split_reduction_pass2_dispatch_0_generic_512x256x128(%arg0: !llvm.ptr<struct<"iree_hal_executable_environment_v0_t", (ptr<i32>, ptr<func<i32 (ptr<func<i32 (ptr<i8>)>>, ptr<i8>)>>, ptr<ptr<func<i32 (ptr<i8>)>>>, struct<"iree_hal_processor_v0_t", (array<8 x i64>)>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>> {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {sym_visibility = "private"} {
// Constant pool. Several values appear twice because they were materialized
// once from an i64 attribute and once from an index attribute.
%0 = llvm.mlir.constant(0 : i32) : i32
%1 = llvm.mlir.constant(3 : i64) : i64
%2 = llvm.mlir.constant(2 : i64) : i64
%3 = llvm.mlir.constant(32768 : i64) : i64
%4 = llvm.mlir.constant(128 : i64) : i64
%5 = llvm.mlir.constant(256 : i64) : i64
%6 = llvm.mlir.constant(1 : i64) : i64
%7 = llvm.mlir.constant(63 : index) : i64
%8 = llvm.mlir.constant(128 : index) : i64
%9 = llvm.mlir.constant(256 : index) : i64
%10 = llvm.mlir.constant(32768 : index) : i64
%11 = llvm.mlir.constant(0 : i64) : i64
%12 = llvm.mlir.constant(dense<0> : vector<4xi32>) : vector<4xi32>
%13 = llvm.mlir.constant(2 : index) : i64
%14 = llvm.mlir.constant(3 : index) : i64
%15 = llvm.mlir.constant(dense<0> : vector<1xi32>) : vector<1xi32>
%16 = llvm.mlir.constant(dense<0> : vector<1x1x4xi32>) : !llvm.array<1 x array<1 x vector<4xi32>>>
%17 = llvm.mlir.constant(4 : index) : i64
%18 = llvm.mlir.constant(1 : index) : i64
%19 = llvm.mlir.constant(32 : index) : i64
%20 = llvm.mlir.constant(0 : index) : i64
// %24: entry 0 of the pointer array in field 10 of the dispatch state, viewed
// as an i32 pointer. It is only loaded from below (presumably binding 0, the
// ReadOnly storage buffer — TODO confirm). The assume states that
// (address & 63) == 0.
%21 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>>
%22 = llvm.extractvalue %21[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>
%23 = llvm.load %22 : !llvm.ptr<ptr<i8>>
%24 = llvm.bitcast %23 : !llvm.ptr<i8> to !llvm.ptr<i32>
%25 = llvm.ptrtoint %24 : !llvm.ptr<i32> to i64
%26 = llvm.and %25, %7 : i64
%27 = llvm.icmp "eq" %26, %20 : i64
"llvm.intr.assume"(%27) : (i1) -> ()
// %32: entry 1 of the same pointer array, viewed as an i32 pointer. It is both
// loaded from and stored to in ^bb8 (presumably binding 1 — TODO confirm).
// The same (address & 63) == 0 assumption is stated for it.
%28 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>>
%29 = llvm.extractvalue %28[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>
%30 = llvm.getelementptr %29[1] : (!llvm.ptr<ptr<i8>>) -> !llvm.ptr<ptr<i8>>
%31 = llvm.load %30 : !llvm.ptr<ptr<i8>>
%32 = llvm.bitcast %31 : !llvm.ptr<i8> to !llvm.ptr<i32>
%33 = llvm.ptrtoint %32 : !llvm.ptr<i32> to i64
%34 = llvm.and %33, %7 : i64
%35 = llvm.icmp "eq" %34, %20 : i64
"llvm.intr.assume"(%35) : (i1) -> ()
// Fields 0 and 1 of the workgroup state, zero-extended to i64 (presumably the
// workgroup ids along x and y — TODO confirm), each scaled by 32.
%36 = llvm.load %arg2 : !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>>
%37 = llvm.extractvalue %36[0] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>
%38 = llvm.zext %37 : i32 to i64
%39 = llvm.extractvalue %36[1] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>
%40 = llvm.zext %39 : i32 to i64
%41 = llvm.mul %40, %19 : i64
%42 = llvm.mul %38, %19 : i64
// %46 = %41 * 256 + %42: base element offset later used with %32.
%43 = llvm.mul %41, %9 : i64
%44 = llvm.add %43, %20 : i64
%45 = llvm.mul %42, %18 : i64
%46 = llvm.add %44, %45 : i64
// %52 = %41 * 32768 + %42 * 128 (+ 0): base element offset later used with %24.
%47 = llvm.mul %41, %10 : i64
%48 = llvm.add %47, %20 : i64
%49 = llvm.mul %42, %8 : i64
%50 = llvm.add %48, %49 : i64
%51 = llvm.mul %11, %18 : i64
%52 = llvm.add %50, %51 : i64
llvm.br ^bb1(%20 : i64)
// Loop 1 header: %53 runs from 0 while %53 < 32; incremented by 1 in ^bb10.
^bb1(%53: i64): // 2 preds: ^bb0, ^bb10
%54 = llvm.icmp "slt" %53, %19 : i64
llvm.cond_br %54, ^bb2(%20 : i64), ^bb11
// Loop 2 header: %55 runs from 0 while %55 < 32; incremented by 4 in ^bb9.
^bb2(%55: i64): // 2 preds: ^bb1, ^bb9
%56 = llvm.icmp "slt" %55, %19 : i64
llvm.cond_br %56, ^bb3, ^bb10
// Per-(%53, %55) offsets:
//   %62 = %52 + %53 * 32768 + %55 * 128 (+ 0)   (used with %24)
//   %66 = %46 + %53 * 256 + %55                 (used with %32)
^bb3: // pred: ^bb2
%57 = llvm.mul %53, %3 : i64
%58 = llvm.add %52, %57 : i64
%59 = llvm.mul %55, %4 : i64
%60 = llvm.add %58, %59 : i64
%61 = llvm.mul %11, %6 : i64
%62 = llvm.add %60, %61 : i64
%63 = llvm.mul %53, %5 : i64
%64 = llvm.add %46, %63 : i64
%65 = llvm.mul %55, %6 : i64
%66 = llvm.add %64, %65 : i64
llvm.br ^bb4(%20 : i64)
// Loop 3 header: %67 runs from 0 while %67 < 4; incremented by 1 in ^bb8.
^bb4(%67: i64): // 2 preds: ^bb3, ^bb8
%68 = llvm.icmp "slt" %67, %17 : i64
llvm.cond_br %68, ^bb5, ^bb9
// %73 = %62 + %67 * 128 (the other added terms are products with 0).
^bb5: // pred: ^bb4
%69 = llvm.mul %11, %3 : i64
%70 = llvm.add %62, %69 : i64
%71 = llvm.mul %67, %4 : i64
%72 = llvm.add %70, %71 : i64
%73 = llvm.add %72, %61 : i64
llvm.br ^bb6(%20, %16 : i64, !llvm.array<1 x array<1 x vector<4xi32>>>)
// Loop 4 header: %74 runs from 0 while %74 < 32 (step 1, see %117); %75 is
// the loop-carried accumulator, entering as the all-zero constant %16.
^bb6(%74: i64, %75: !llvm.array<1 x array<1 x vector<4xi32>>>): // 2 preds: ^bb5, ^bb7
%76 = llvm.icmp "slt" %74, %19 : i64
llvm.cond_br %76, ^bb7, ^bb8
// Loop 4 body: loads a vector<4xi32> from %24 at element offset %73 + %74 * 4
// and adds it lane by lane to the accumulator.
^bb7: // pred: ^bb6
%77 = llvm.mul %20, %10 : i64
%78 = llvm.add %73, %77 : i64
%79 = llvm.mul %20, %8 : i64
%80 = llvm.add %78, %79 : i64
%81 = llvm.mul %74, %17 : i64
%82 = llvm.add %80, %81 : i64
%83 = llvm.add %82, %20 : i64
%84 = llvm.getelementptr %24[%83] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32>
%85 = llvm.bitcast %84 : !llvm.ptr<i32> to !llvm.ptr<vector<4xi32>>
%86 = llvm.load %85 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>>
%87 = llvm.extractvalue %75[0, 0] : !llvm.array<1 x array<1 x vector<4xi32>>>
// Scalarized add: lanes 0..3 of %86 (loaded) plus %87 (accumulator) are summed
// one at a time and reassembled into %103.
%88 = llvm.extractelement %87[%11 : i64] : vector<4xi32>
%89 = llvm.extractelement %86[%11 : i64] : vector<4xi32>
%90 = llvm.add %89, %88 : i32
%91 = llvm.insertelement %90, %12[%20 : i64] : vector<4xi32>
%92 = llvm.extractelement %87[%6 : i64] : vector<4xi32>
%93 = llvm.extractelement %86[%6 : i64] : vector<4xi32>
%94 = llvm.add %93, %92 : i32
%95 = llvm.insertelement %94, %91[%18 : i64] : vector<4xi32>
%96 = llvm.extractelement %87[%2 : i64] : vector<4xi32>
%97 = llvm.extractelement %86[%2 : i64] : vector<4xi32>
%98 = llvm.add %97, %96 : i32
%99 = llvm.insertelement %98, %95[%13 : i64] : vector<4xi32>
%100 = llvm.extractelement %87[%1 : i64] : vector<4xi32>
%101 = llvm.extractelement %86[%1 : i64] : vector<4xi32>
%102 = llvm.add %101, %100 : i32
%103 = llvm.insertelement %102, %99[%14 : i64] : vector<4xi32>
// Lane-by-lane copy of %103 into a fresh array value; after the fourth insert
// %116 carries all four lanes of %103 and becomes the next accumulator.
%104 = llvm.extractelement %103[%11 : i64] : vector<4xi32>
%105 = llvm.extractvalue %16[0, 0] : !llvm.array<1 x array<1 x vector<4xi32>>>
%106 = llvm.insertelement %104, %105[%11 : i64] : vector<4xi32>
%107 = llvm.insertvalue %106, %16[0, 0] : !llvm.array<1 x array<1 x vector<4xi32>>>
%108 = llvm.extractelement %103[%6 : i64] : vector<4xi32>
%109 = llvm.insertelement %108, %106[%6 : i64] : vector<4xi32>
%110 = llvm.insertvalue %109, %107[0, 0] : !llvm.array<1 x array<1 x vector<4xi32>>>
%111 = llvm.extractelement %103[%2 : i64] : vector<4xi32>
%112 = llvm.insertelement %111, %109[%2 : i64] : vector<4xi32>
%113 = llvm.insertvalue %112, %110[0, 0] : !llvm.array<1 x array<1 x vector<4xi32>>>
%114 = llvm.extractelement %103[%1 : i64] : vector<4xi32>
%115 = llvm.insertelement %114, %112[%1 : i64] : vector<4xi32>
%116 = llvm.insertvalue %115, %113[0, 0] : !llvm.array<1 x array<1 x vector<4xi32>>>
%117 = llvm.add %74, %18 : i64
llvm.br ^bb6(%117, %116 : i64, !llvm.array<1 x array<1 x vector<4xi32>>>)
// Loop 4 exit: read the i32 at %32[%66 + %67], add the horizontal sum of the
// accumulator lanes, and store the result back to the same address. The
// existing memory value is an input to this sum.
^bb8: // pred: ^bb6
%118 = llvm.mul %20, %9 : i64
%119 = llvm.add %66, %118 : i64
%120 = llvm.add %119, %67 : i64
%121 = llvm.getelementptr %32[%120] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32>
%122 = llvm.load %121 : !llvm.ptr<i32>
%123 = llvm.extractvalue %75[0, 0] : !llvm.array<1 x array<1 x vector<4xi32>>>
%124 = "llvm.intr.vector.reduce.add"(%123) : (vector<4xi32>) -> i32
%125 = llvm.add %122, %124 : i32
// Round trip through vector<1xi32>: %127 is the same value as %125.
%126 = llvm.insertelement %125, %15[%20 : i64] : vector<1xi32>
%127 = llvm.extractelement %126[%11 : i64] : vector<1xi32>
llvm.store %127, %121 : !llvm.ptr<i32>
%128 = llvm.add %67, %18 : i64
llvm.br ^bb4(%128 : i64)
// Loop 2 latch: advance %55 by 4.
^bb9: // pred: ^bb4
%129 = llvm.add %55, %17 : i64
llvm.br ^bb2(%129 : i64)
// Loop 1 latch: advance %53 by 1.
^bb10: // pred: ^bb2
%130 = llvm.add %53, %18 : i64
llvm.br ^bb1(%130 : i64)
// Exit: return the i32 constant 0.
^bb11: // pred: ^bb1
llvm.return %0 : i32
}
}
}
}
// Private test body: fills an input buffer with the i32 pattern 1, runs the
// dispatch into %buffer_0, fills a second region of %buffer_0 with the i32
// pattern 128, and checks that two 512x256 views of %buffer_0 are equal.
func.func private @_split_reduction_pass2() {
%c268435488_i32 = arith.constant 268435488 : i32
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c-1_i32 = arith.constant -1 : i32
%c-1_i64 = arith.constant -1 : i64
%c1048576 = arith.constant 1048576 : index
%c0 = arith.constant 0 : index
%c524288 = arith.constant 524288 : index
%c67108864 = arith.constant 67108864 : index
%c1_i32 = arith.constant 1 : i32
%c128_i32 = arith.constant 128 : i32
%c512 = arith.constant 512 : index
%c256 = arith.constant 256 : index
%c1 = arith.constant 1 : index
%device = hal.ex.shared_device : !hal.device
%allocator = hal.device.allocator<%device : !hal.device> : !hal.allocator
// Input buffer of size %c67108864 (presumably bytes: 512*256*128*4 — TODO
// confirm), filled over its whole range with the pattern 1 by command
// buffer %cmd.
%buffer = hal.allocator.allocate<%allocator : !hal.allocator> type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c67108864}
%cmd = hal.command_buffer.create device(%device : !hal.device) mode("OneShot|AllowInlineExecution") categories(Transfer) : !hal.command_buffer
hal.command_buffer.fill_buffer<%cmd : !hal.command_buffer> target(%buffer : !hal.buffer)[%c0, %c67108864] pattern(%c1_i32 : i32)
hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None")
hal.command_buffer.finalize<%cmd : !hal.command_buffer>
// Submit %cmd with a null wait fence (%0), then block on %fence and check the
// returned status.
%0 = util.null : !hal.fence
%fence = hal.fence.create device(%device : !hal.device) flags("None") : !hal.fence
hal.device.queue.execute<%device : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) commands([%cmd])
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.status.check_ok %status, "failed to wait on timepoint"
%1 = util.do_not_optimize(%buffer) : !hal.buffer
%len = hal.buffer.length<%1 : !hal.buffer> : index
// %buffer_0 (size %c1048576) receives both the dispatch result and the
// expected-value fill.
%buffer_0 = hal.allocator.allocate<%allocator : !hal.allocator> type("HostVisible|DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage|MappingScoped|MappingAccessRandom|Mapping") : !hal.buffer{%c1048576}
%cmd_1 = hal.command_buffer.create device(%device : !hal.device) mode("OneShot|AllowInlineExecution") categories("Transfer|Dispatch") : !hal.command_buffer
// Only proceed when the executable-format query stored in @_device_query_0 is
// true; otherwise fall through to the unreachable in ^bb2.
%_device_query_0 = util.global.load @_device_query_0 : i1
cf.cond_br %_device_query_0, ^bb1, ^bb2
^bb1: // pred: ^bb0
%_pipeline_layout_0 = util.global.load @_pipeline_layout_0 : !hal.pipeline_layout
// Bind %1 (range [0, %len]) at binding %c0 and %buffer_0 (range
// [0, %c1048576]) at binding %c1 of descriptor set %c0.
hal.command_buffer.push_descriptor_set<%cmd_1 : !hal.command_buffer> layout(%_pipeline_layout_0 : !hal.pipeline_layout)[%c0] bindings([
%c0 = (%1 : !hal.buffer)[%c0, %len],
%c1 = (%buffer_0 : !hal.buffer)[%c0, %c1048576]
])
%_executable__split_reduction_pass2_dispatch_0 = util.global.load @_executable__split_reduction_pass2_dispatch_0 : !hal.executable
// NOTE(review): no fill of %buffer_0[0, 524288) is recorded in this function
// before the dispatch, while the kernel in this module loads from its second
// buffer before storing to it — confirm the initial contents are defined.
hal.command_buffer.dispatch<%cmd_1 : !hal.command_buffer> target(%_executable__split_reduction_pass2_dispatch_0 : !hal.executable)[0] workgroups([%c8, %c16, %c1])
// Fill the range [%c524288, +%c524288) of %buffer_0 with the pattern 128.
hal.command_buffer.fill_buffer<%cmd_1 : !hal.command_buffer> target(%buffer_0 : !hal.buffer)[%c524288, %c524288] pattern(%c128_i32 : i32)
hal.command_buffer.execution_barrier<%cmd_1 : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None")
hal.command_buffer.finalize<%cmd_1 : !hal.command_buffer>
%fence_2 = hal.fence.create device(%device : !hal.device) flags("None") : !hal.fence
hal.device.queue.execute<%device : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence_2) commands([%cmd_1])
%status_3 = hal.fence.await until([%fence_2]) timeout_millis(%c-1_i32) : i32
util.status.check_ok %status_3, "failed to wait on timepoint"
// %view covers %buffer_0 at offset 0 and %view_4 at offset %c524288, both with
// length %c524288 and shape [512, 256]; the check expects them to be equal.
%view = hal.buffer_view.create buffer(%buffer_0 : !hal.buffer)[%c0, %c524288] shape([%c512, %c256]) type(%c268435488_i32) encoding(%c1_i32) : !hal.buffer_view
%view_4 = hal.buffer_view.create buffer(%buffer_0 : !hal.buffer)[%c524288, %c524288] shape([%c512, %c256]) type(%c268435488_i32) encoding(%c1_i32) : !hal.buffer_view
check.expect_eq(%view, %view_4) : !hal.buffer_view
return
^bb2: // pred: ^bb0
util.unreachable "device not supported in the compiled configuration"
}
}
// -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) ('func.func' operation: @_split_reduction_pass2) //----- //
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
// Globals caching the result of a device capability query, plus the initializer that
// fills them in.
util.global private @_device_query_0 : i1
util.global private @_device_query_0_ok : i1
util.initializer {
%device = hal.ex.shared_device : !hal.device
// Query the shared device for key "hal.executable.format" :: "embedded-elf-x86_64";
// the queried value defaults to false (`= false`).
%ok, %value = hal.device.query<%device : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false
// Store both results: the queried value and the separate %ok flag.
util.global.store %value, @_device_query_0 : i1
util.global.store %ok, @_device_query_0_ok : i1
util.initializer.return
}
// Global holding the descriptor set layout, plus the initializer that creates it.
util.global private @_descriptor_set_layout_0 : !hal.descriptor_set_layout
util.initializer {
%device = hal.ex.shared_device : !hal.device
// Two storage_buffer bindings: binding 0 is marked ReadOnly, binding 1 carries no
// access qualifier.
%descriptor_set_layout = hal.descriptor_set_layout.create device(%device : !hal.device) flags("None") bindings([#hal.descriptor_set.binding<0, storage_buffer, ReadOnly>, #hal.descriptor_set.binding<1, storage_buffer>]) : !hal.descriptor_set_layout
util.global.store %descriptor_set_layout, @_descriptor_set_layout_0 : !hal.descriptor_set_layout
util.initializer.return
}
// Global holding the pipeline layout, plus the initializer that creates it.
util.global private @_pipeline_layout_0 : !hal.pipeline_layout
util.initializer {
// Reads the descriptor set layout global, so this initializer depends on
// @_descriptor_set_layout_0 having been stored already.
%_descriptor_set_layout_0 = util.global.load @_descriptor_set_layout_0 : !hal.descriptor_set_layout
%device = hal.ex.shared_device : !hal.device
// Pipeline layout with zero push constants and the single descriptor set layout.
%pipeline_layout = hal.pipeline_layout.create device(%device : !hal.device) push_constants(0) layouts([%_descriptor_set_layout_0]) : !hal.pipeline_layout
util.global.store %pipeline_layout, @_pipeline_layout_0 : !hal.pipeline_layout
util.initializer.return
}
// Global holding the loaded executable, plus the initializer that creates it.
// The executable is created only when @_device_query_0 is true; otherwise a null
// !hal.executable is stored.
util.global private @_executable__split_reduction_pass2_dispatch_0 : !hal.executable
util.initializer {
%_device_query_0 = util.global.load @_device_query_0 : i1
%device = hal.ex.shared_device : !hal.device
cf.cond_br %_device_query_0, ^bb1, ^bb2
^bb1: // pred: ^bb0
// Query succeeded: create the executable from the @embedded_elf_x86_64 variant using
// the pipeline layout global.
%_pipeline_layout_0 = util.global.load @_pipeline_layout_0 : !hal.pipeline_layout
%exe = hal.executable.create device(%device : !hal.device) target(@_split_reduction_pass2_dispatch_0::@embedded_elf_x86_64) layouts([%_pipeline_layout_0]) : !hal.executable
cf.br ^bb3(%exe : !hal.executable)
^bb2: // pred: ^bb0
// Query was false: fall back to a null executable.
%0 = util.null : !hal.executable
cf.br ^bb3(%0 : !hal.executable)
^bb3(%1: !hal.executable): // 2 preds: ^bb1, ^bb2
// Merge point: store whichever value arrived from the two predecessors.
util.global.store %1, @_executable__split_reduction_pass2_dispatch_0 : !hal.executable
util.initializer.return
}
// Public entry point (tagged iree.abi.stub): takes no arguments, returns nothing, and
// only forwards to the private @_split_reduction_pass2.
func.func @split_reduction_pass2() attributes {iree.abi.stub} {
call @_split_reduction_pass2() : () -> ()
return
}
// Executable containing the single dispatch of this test, with one variant for the
// embedded-elf-x86_64 target. It holds an export region and the lowered LLVM-dialect
// function of the same name.
hal.executable private @_split_reduction_pass2_dispatch_0 {
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
// Export ordinal 0. The region does not use its block arguments and returns the
// constant triple (8, 16, 1).
// NOTE(review): presumably this triple is the workgroup count; it matches the
// workgroups([%c8, %c16, %c1]) operand of the host-side dispatch — confirm.
hal.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 ordinal(0) layout(#pipeline_layout) attributes {translation_info = #translation} {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index):
%c1 = arith.constant 1 : index
%c16 = arith.constant 16 : index
%c8 = arith.constant 8 : index
hal.return %c8, %c16, %c1 : index, index, index
}
builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} {
// Lowered dispatch body. Arguments are pointers to the executable environment (%arg0,
// unused below), the dispatch state (%arg1) and the workgroup state (%arg2).
// The function reads two buffer pointers from the dispatch state, derives a tile
// origin from two i32 fields of the workgroup state, and for each element of a 32x32
// output tile sums 128 consecutive i32 values from the first buffer into the second
// buffer. It always returns 0.
llvm.func internal @_split_reduction_pass2_dispatch_0_generic_512x256x128(%arg0: !llvm.ptr<struct<"iree_hal_executable_environment_v0_t", (ptr<i32>, ptr<func<i32 (ptr<func<i32 (ptr<i8>)>>, ptr<i8>)>>, ptr<ptr<func<i32 (ptr<i8>)>>>, struct<"iree_hal_processor_v0_t", (array<8 x i64>)>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>> {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {sym_visibility = "private"} {
// Constants. Several values appear twice because they were materialized once from an
// i64 attribute and once from an index attribute (e.g. %4/%8 = 128, %5/%9 = 256).
%0 = llvm.mlir.constant(0 : i32) : i32
%1 = llvm.mlir.constant(3 : i64) : i64
%2 = llvm.mlir.constant(2 : i64) : i64
%3 = llvm.mlir.constant(32768 : i64) : i64
%4 = llvm.mlir.constant(128 : i64) : i64
%5 = llvm.mlir.constant(256 : i64) : i64
%6 = llvm.mlir.constant(1 : i64) : i64
%7 = llvm.mlir.constant(63 : index) : i64
%8 = llvm.mlir.constant(128 : index) : i64
%9 = llvm.mlir.constant(256 : index) : i64
%10 = llvm.mlir.constant(32768 : index) : i64
%11 = llvm.mlir.constant(0 : i64) : i64
%12 = llvm.mlir.constant(dense<0> : vector<4xi32>) : vector<4xi32>
%13 = llvm.mlir.constant(2 : index) : i64
%14 = llvm.mlir.constant(3 : index) : i64
%15 = llvm.mlir.constant(dense<0> : vector<1xi32>) : vector<1xi32>
%16 = llvm.mlir.constant(dense<0> : vector<1x1x4xi32>) : !llvm.array<1 x array<1 x vector<4xi32>>>
%17 = llvm.mlir.constant(4 : index) : i64
%18 = llvm.mlir.constant(1 : index) : i64
%19 = llvm.mlir.constant(32 : index) : i64
%20 = llvm.mlir.constant(0 : index) : i64
// First buffer pointer: element 0 of the ptr<ptr<i8>> stored in field 10 of the
// dispatch state, viewed as an i32 pointer (%24). This pointer is only loaded from
// below. The assume states that (pointer & 63) == 0, i.e. 64-byte alignment.
// NOTE(review): field 10 is presumably the binding pointer table — confirm against
// the iree_hal_executable_dispatch_state_v0_t definition.
%21 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>>
%22 = llvm.extractvalue %21[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>
%23 = llvm.load %22 : !llvm.ptr<ptr<i8>>
%24 = llvm.bitcast %23 : !llvm.ptr<i8> to !llvm.ptr<i32>
%25 = llvm.ptrtoint %24 : !llvm.ptr<i32> to i64
%26 = llvm.and %25, %7 : i64
%27 = llvm.icmp "eq" %26, %20 : i64
"llvm.intr.assume"(%27) : (i1) -> ()
// Second buffer pointer: element 1 of the same table, viewed as an i32 pointer (%32),
// with the same 64-byte alignment assumption. This pointer is both loaded from and
// stored to below.
%28 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>>
%29 = llvm.extractvalue %28[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>
%30 = llvm.getelementptr %29[1] : (!llvm.ptr<ptr<i8>>) -> !llvm.ptr<ptr<i8>>
%31 = llvm.load %30 : !llvm.ptr<ptr<i8>>
%32 = llvm.bitcast %31 : !llvm.ptr<i8> to !llvm.ptr<i32>
%33 = llvm.ptrtoint %32 : !llvm.ptr<i32> to i64
%34 = llvm.and %33, %7 : i64
%35 = llvm.icmp "eq" %34, %20 : i64
"llvm.intr.assume"(%35) : (i1) -> ()
// Fields 0 and 1 of the workgroup state, zero-extended to i64 and each scaled by 32.
// NOTE(review): presumably workgroup id x (field 0) and y (field 1) — confirm.
%36 = llvm.load %arg2 : !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>>
%37 = llvm.extractvalue %36[0] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>
%38 = llvm.zext %37 : i32 to i64
%39 = llvm.extractvalue %36[1] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>
%40 = llvm.zext %39 : i32 to i64
%41 = llvm.mul %40, %19 : i64
%42 = llvm.mul %38, %19 : i64
// %46: tile base offset into the second buffer = %41 * 256 + %42 * 1.
%43 = llvm.mul %41, %9 : i64
%44 = llvm.add %43, %20 : i64
%45 = llvm.mul %42, %18 : i64
%46 = llvm.add %44, %45 : i64
// %52: tile base offset into the first buffer = %41 * 32768 + %42 * 128 (+ 0).
%47 = llvm.mul %41, %10 : i64
%48 = llvm.add %47, %20 : i64
%49 = llvm.mul %42, %8 : i64
%50 = llvm.add %48, %49 : i64
%51 = llvm.mul %11, %18 : i64
%52 = llvm.add %50, %51 : i64
llvm.br ^bb1(%20 : i64)
// Loop 1: %53 runs from 0 while %53 < 32, incremented by 1 in ^bb10.
^bb1(%53: i64): // 2 preds: ^bb0, ^bb10
%54 = llvm.icmp "slt" %53, %19 : i64
llvm.cond_br %54, ^bb2(%20 : i64), ^bb11
// Loop 2: %55 runs from 0 while %55 < 32, incremented by 4 in ^bb9.
^bb2(%55: i64): // 2 preds: ^bb1, ^bb9
%56 = llvm.icmp "slt" %55, %19 : i64
llvm.cond_br %56, ^bb3, ^bb10
^bb3: // pred: ^bb2
// %62: first-buffer offset for (%53, %55) = %52 + %53 * 32768 + %55 * 128.
%57 = llvm.mul %53, %3 : i64
%58 = llvm.add %52, %57 : i64
%59 = llvm.mul %55, %4 : i64
%60 = llvm.add %58, %59 : i64
%61 = llvm.mul %11, %6 : i64
%62 = llvm.add %60, %61 : i64
// %66: second-buffer offset for (%53, %55) = %46 + %53 * 256 + %55.
%63 = llvm.mul %53, %5 : i64
%64 = llvm.add %46, %63 : i64
%65 = llvm.mul %55, %6 : i64
%66 = llvm.add %64, %65 : i64
llvm.br ^bb4(%20 : i64)
// Loop 3: %67 runs from 0 while %67 < 4, incremented by 1 in ^bb8. It walks the four
// elements covered by one step of loop 2.
^bb4(%67: i64): // 2 preds: ^bb3, ^bb8
%68 = llvm.icmp "slt" %67, %17 : i64
llvm.cond_br %68, ^bb5, ^bb9
^bb5: // pred: ^bb4
// %73: first-buffer offset of the 128-element run being reduced = %62 + %67 * 128.
%69 = llvm.mul %11, %3 : i64
%70 = llvm.add %62, %69 : i64
%71 = llvm.mul %67, %4 : i64
%72 = llvm.add %70, %71 : i64
%73 = llvm.add %72, %61 : i64
llvm.br ^bb6(%20, %16 : i64, !llvm.array<1 x array<1 x vector<4xi32>>>)
// Loop 4 (reduction): %74 runs from 0 while %74 < 32, step 1. %75 carries a
// vector<4xi32> accumulator (wrapped in a 1x1 array) that starts as all zeros (%16).
^bb6(%74: i64, %75: !llvm.array<1 x array<1 x vector<4xi32>>>): // 2 preds: ^bb5, ^bb7
%76 = llvm.icmp "slt" %74, %19 : i64
llvm.cond_br %76, ^bb7, ^bb8
^bb7: // pred: ^bb6
// Load four consecutive i32 values at first-buffer offset %73 + %74 * 4. The load is
// annotated with alignment 4.
%77 = llvm.mul %20, %10 : i64
%78 = llvm.add %73, %77 : i64
%79 = llvm.mul %20, %8 : i64
%80 = llvm.add %78, %79 : i64
%81 = llvm.mul %74, %17 : i64
%82 = llvm.add %80, %81 : i64
%83 = llvm.add %82, %20 : i64
%84 = llvm.getelementptr %24[%83] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32>
%85 = llvm.bitcast %84 : !llvm.ptr<i32> to !llvm.ptr<vector<4xi32>>
%86 = llvm.load %85 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>>
// Lane-by-lane add of the loaded vector (%86) and the accumulator (%87), written as
// scalar extract / add / insert for lanes 0..3; the result is %103.
%87 = llvm.extractvalue %75[0, 0] : !llvm.array<1 x array<1 x vector<4xi32>>>
%88 = llvm.extractelement %87[%11 : i64] : vector<4xi32>
%89 = llvm.extractelement %86[%11 : i64] : vector<4xi32>
%90 = llvm.add %89, %88 : i32
%91 = llvm.insertelement %90, %12[%20 : i64] : vector<4xi32>
%92 = llvm.extractelement %87[%6 : i64] : vector<4xi32>
%93 = llvm.extractelement %86[%6 : i64] : vector<4xi32>
%94 = llvm.add %93, %92 : i32
%95 = llvm.insertelement %94, %91[%18 : i64] : vector<4xi32>
%96 = llvm.extractelement %87[%2 : i64] : vector<4xi32>
%97 = llvm.extractelement %86[%2 : i64] : vector<4xi32>
%98 = llvm.add %97, %96 : i32
%99 = llvm.insertelement %98, %95[%13 : i64] : vector<4xi32>
%100 = llvm.extractelement %87[%1 : i64] : vector<4xi32>
%101 = llvm.extractelement %86[%1 : i64] : vector<4xi32>
%102 = llvm.add %101, %100 : i32
%103 = llvm.insertelement %102, %99[%14 : i64] : vector<4xi32>
// Copy the four lanes of %103 one at a time into a fresh 1x1x4 array value (%116),
// which becomes the accumulator for the next iteration.
%104 = llvm.extractelement %103[%11 : i64] : vector<4xi32>
%105 = llvm.extractvalue %16[0, 0] : !llvm.array<1 x array<1 x vector<4xi32>>>
%106 = llvm.insertelement %104, %105[%11 : i64] : vector<4xi32>
%107 = llvm.insertvalue %106, %16[0, 0] : !llvm.array<1 x array<1 x vector<4xi32>>>
%108 = llvm.extractelement %103[%6 : i64] : vector<4xi32>
%109 = llvm.insertelement %108, %106[%6 : i64] : vector<4xi32>
%110 = llvm.insertvalue %109, %107[0, 0] : !llvm.array<1 x array<1 x vector<4xi32>>>
%111 = llvm.extractelement %103[%2 : i64] : vector<4xi32>
%112 = llvm.insertelement %111, %109[%2 : i64] : vector<4xi32>
%113 = llvm.insertvalue %112, %110[0, 0] : !llvm.array<1 x array<1 x vector<4xi32>>>
%114 = llvm.extractelement %103[%1 : i64] : vector<4xi32>
%115 = llvm.insertelement %114, %112[%1 : i64] : vector<4xi32>
%116 = llvm.insertvalue %115, %113[0, 0] : !llvm.array<1 x array<1 x vector<4xi32>>>
%117 = llvm.add %74, %18 : i64
llvm.br ^bb6(%117, %116 : i64, !llvm.array<1 x array<1 x vector<4xi32>>>)
^bb8: // pred: ^bb6
// Reduction finished: horizontally add the four accumulator lanes, add that to the
// value currently stored at second-buffer offset %66 + %67, and store the sum back.
// The existing content of the output element is therefore part of the result.
%118 = llvm.mul %20, %9 : i64
%119 = llvm.add %66, %118 : i64
%120 = llvm.add %119, %67 : i64
%121 = llvm.getelementptr %32[%120] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32>
%122 = llvm.load %121 : !llvm.ptr<i32>
%123 = llvm.extractvalue %75[0, 0] : !llvm.array<1 x array<1 x vector<4xi32>>>
%124 = "llvm.intr.vector.reduce.add"(%123) : (vector<4xi32>) -> i32
%125 = llvm.add %122, %124 : i32
// The scalar goes through a vector<1xi32> insert/extract round trip before the store.
%126 = llvm.insertelement %125, %15[%20 : i64] : vector<1xi32>
%127 = llvm.extractelement %126[%11 : i64] : vector<1xi32>
llvm.store %127, %121 : !llvm.ptr<i32>
%128 = llvm.add %67, %18 : i64
llvm.br ^bb4(%128 : i64)
^bb9: // pred: ^bb4
// Latch of loop 2: advance %55 by 4.
%129 = llvm.add %55, %17 : i64
llvm.br ^bb2(%129 : i64)
^bb10: // pred: ^bb2
// Latch of loop 1: advance %53 by 1.
%130 = llvm.add %53, %18 : i64
llvm.br ^bb1(%130 : i64)
^bb11: // pred: ^bb1
// All loops done: return the constant 0.
llvm.return %0 : i32
}
}
}
}
// Host-side body of the test. It fills an input buffer with the i32 pattern 1, runs
// the dispatch into the first half of a second buffer, fills the second half of that
// buffer with the i32 pattern 128, and checks that the two halves compare equal.
// In this dump the three global loads sit at the top of the entry block, ahead of
// the constants.
func.func private @_split_reduction_pass2() {
%_device_query_0 = util.global.load @_device_query_0 : i1
%_pipeline_layout_0 = util.global.load @_pipeline_layout_0 : !hal.pipeline_layout
%_executable__split_reduction_pass2_dispatch_0 = util.global.load @_executable__split_reduction_pass2_dispatch_0 : !hal.executable
// NOTE(review): 268435488 is 0x10000020, presumably the HAL element-type code used
// for the i32 buffer views below — confirm.
%c268435488_i32 = arith.constant 268435488 : i32
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c-1_i32 = arith.constant -1 : i32
%c-1_i64 = arith.constant -1 : i64
// 1048576 = 2 * 524288, and 524288 = 512 * 256 * 4 bytes (one 512x256 i32 tensor).
%c1048576 = arith.constant 1048576 : index
%c0 = arith.constant 0 : index
%c524288 = arith.constant 524288 : index
// 67108864 = 512 * 256 * 128 * 4 bytes (one 512x256x128 i32 tensor).
%c67108864 = arith.constant 67108864 : index
%c1_i32 = arith.constant 1 : i32
%c128_i32 = arith.constant 128 : i32
%c512 = arith.constant 512 : index
%c256 = arith.constant 256 : index
%c1 = arith.constant 1 : index
%device = hal.ex.shared_device : !hal.device
%allocator = hal.device.allocator<%device : !hal.device> : !hal.allocator
// Input buffer: allocate 67108864 bytes and record a command buffer that fills the
// whole range with the 32-bit pattern 1.
%buffer = hal.allocator.allocate<%allocator : !hal.allocator> type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c67108864}
%cmd = hal.command_buffer.create device(%device : !hal.device) mode("OneShot|AllowInlineExecution") categories(Transfer) : !hal.command_buffer
hal.command_buffer.fill_buffer<%cmd : !hal.command_buffer> target(%buffer : !hal.buffer)[%c0, %c67108864] pattern(%c1_i32 : i32)
hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None")
hal.command_buffer.finalize<%cmd : !hal.command_buffer>
// Submit with a null wait fence and block on the signal fence; a timeout of -1 is
// passed to the await.
%0 = util.null : !hal.fence
%fence = hal.fence.create device(%device : !hal.device) flags("None") : !hal.fence
hal.device.queue.execute<%device : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) commands([%cmd])
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.status.check_ok %status, "failed to wait on timepoint"
// The filled buffer is passed through util.do_not_optimize and its length is then
// queried from the result rather than reusing the allocation-size constant.
%1 = util.do_not_optimize(%buffer) : !hal.buffer
%len = hal.buffer.length<%1 : !hal.buffer> : index
// Result buffer: 1048576 bytes, host-visible and mappable. Bytes [0, 524288) receive
// the dispatch output; bytes [524288, 1048576) receive the expected values.
// NOTE(review): no fill of [0, 524288) is recorded before the dispatch below, while
// the kernel in this module loads each output element and adds to it before storing.
// Confirm that the initial contents of this range are defined (e.g. zeroed).
%buffer_0 = hal.allocator.allocate<%allocator : !hal.allocator> type("HostVisible|DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage|MappingScoped|MappingAccessRandom|Mapping") : !hal.buffer{%c1048576}
%cmd_1 = hal.command_buffer.create device(%device : !hal.device) mode("OneShot|AllowInlineExecution") categories("Transfer|Dispatch") : !hal.command_buffer
// Only record and run the dispatch when the executable-format query was true.
cf.cond_br %_device_query_0, ^bb1, ^bb2
^bb1: // pred: ^bb0
// Binding 0 = the whole input buffer, binding 1 = the whole result buffer.
hal.command_buffer.push_descriptor_set<%cmd_1 : !hal.command_buffer> layout(%_pipeline_layout_0 : !hal.pipeline_layout)[%c0] bindings([
%c0 = (%1 : !hal.buffer)[%c0, %len],
%c1 = (%buffer_0 : !hal.buffer)[%c0, %c1048576]
])
// Dispatch export ordinal 0 with workgroups (8, 16, 1), then write the expected
// pattern 128 into the second half of the result buffer in the same command buffer.
hal.command_buffer.dispatch<%cmd_1 : !hal.command_buffer> target(%_executable__split_reduction_pass2_dispatch_0 : !hal.executable)[0] workgroups([%c8, %c16, %c1])
hal.command_buffer.fill_buffer<%cmd_1 : !hal.command_buffer> target(%buffer_0 : !hal.buffer)[%c524288, %c524288] pattern(%c128_i32 : i32)
hal.command_buffer.execution_barrier<%cmd_1 : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None")
hal.command_buffer.finalize<%cmd_1 : !hal.command_buffer>
%fence_2 = hal.fence.create device(%device : !hal.device) flags("None") : !hal.fence
hal.device.queue.execute<%device : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence_2) commands([%cmd_1])
%status_3 = hal.fence.await until([%fence_2]) timeout_millis(%c-1_i32) : i32
util.status.check_ok %status_3, "failed to wait on timepoint"
// Two 512x256 views over the same buffer: %view covers the dispatch output half,
// %view_4 covers the pattern-filled half. The test passes when they compare equal.
%view = hal.buffer_view.create buffer(%buffer_0 : !hal.buffer)[%c0, %c524288] shape([%c512, %c256]) type(%c268435488_i32) encoding(%c1_i32) : !hal.buffer_view
%view_4 = hal.buffer_view.create buffer(%buffer_0 : !hal.buffer)[%c524288, %c524288] shape([%c512, %c256]) type(%c268435488_i32) encoding(%c1_i32) : !hal.buffer_view
check.expect_eq(%view, %view_4) : !hal.buffer_view
return
^bb2: // pred: ^bb0
// Reached only when the device query was false.
util.unreachable "device not supported in the compiled configuration"
}
}
// -----// IR Dump After ApplyPatterns (iree-util-apply-patterns) ('builtin.module' operation) //----- //
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
// Globals caching the result of a device capability query, plus the initializer that
// fills them in.
util.global private @_device_query_0 : i1
util.global private @_device_query_0_ok : i1
util.initializer {
%device = hal.ex.shared_device : !hal.device
// Query the shared device for key "hal.executable.format" :: "embedded-elf-x86_64";
// the queried value defaults to false (`= false`).
%ok, %value = hal.device.query<%device : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false
// Store both results: the queried value and the separate %ok flag.
util.global.store %value, @_device_query_0 : i1
util.global.store %ok, @_device_query_0_ok : i1
util.initializer.return
}
// Global holding the descriptor set layout, plus the initializer that creates it.
util.global private @_descriptor_set_layout_0 : !hal.descriptor_set_layout
util.initializer {
%device = hal.ex.shared_device : !hal.device
// Two storage_buffer bindings: binding 0 is marked ReadOnly, binding 1 carries no
// access qualifier.
%descriptor_set_layout = hal.descriptor_set_layout.create device(%device : !hal.device) flags("None") bindings([#hal.descriptor_set.binding<0, storage_buffer, ReadOnly>, #hal.descriptor_set.binding<1, storage_buffer>]) : !hal.descriptor_set_layout
util.global.store %descriptor_set_layout, @_descriptor_set_layout_0 : !hal.descriptor_set_layout
util.initializer.return
}
// Global holding the pipeline layout, plus the initializer that creates it.
util.global private @_pipeline_layout_0 : !hal.pipeline_layout
util.initializer {
// Reads the descriptor set layout global, so this initializer depends on
// @_descriptor_set_layout_0 having been stored already.
%_descriptor_set_layout_0 = util.global.load @_descriptor_set_layout_0 : !hal.descriptor_set_layout
%device = hal.ex.shared_device : !hal.device
// Pipeline layout with zero push constants and the single descriptor set layout.
%pipeline_layout = hal.pipeline_layout.create device(%device : !hal.device) push_constants(0) layouts([%_descriptor_set_layout_0]) : !hal.pipeline_layout
util.global.store %pipeline_layout, @_pipeline_layout_0 : !hal.pipeline_layout
util.initializer.return
}
// Global holding the loaded executable, plus the initializer that creates it.
// The executable is created only when @_device_query_0 is true; otherwise a null
// !hal.executable is stored.
util.global private @_executable__split_reduction_pass2_dispatch_0 : !hal.executable
util.initializer {
%_device_query_0 = util.global.load @_device_query_0 : i1
%device = hal.ex.shared_device : !hal.device
cf.cond_br %_device_query_0, ^bb1, ^bb2
^bb1: // pred: ^bb0
// Query succeeded: create the executable from the @embedded_elf_x86_64 variant using
// the pipeline layout global.
%_pipeline_layout_0 = util.global.load @_pipeline_layout_0 : !hal.pipeline_layout
%exe = hal.executable.create device(%device : !hal.device) target(@_split_reduction_pass2_dispatch_0::@embedded_elf_x86_64) layouts([%_pipeline_layout_0]) : !hal.executable
cf.br ^bb3(%exe : !hal.executable)
^bb2: // pred: ^bb0
// Query was false: fall back to a null executable.
%0 = util.null : !hal.executable
cf.br ^bb3(%0 : !hal.executable)
^bb3(%1: !hal.executable): // 2 preds: ^bb1, ^bb2
// Merge point: store whichever value arrived from the two predecessors.
util.global.store %1, @_executable__split_reduction_pass2_dispatch_0 : !hal.executable
util.initializer.return
}
// Public entry point (tagged iree.abi.stub): takes no arguments, returns nothing, and
// only forwards to the private @_split_reduction_pass2.
func.func @split_reduction_pass2() attributes {iree.abi.stub} {
call @_split_reduction_pass2() : () -> ()
return
}
// Executable containing the single dispatch of this test, with one variant for the
// embedded-elf-x86_64 target. It holds an export region and the lowered LLVM-dialect
// function of the same name.
hal.executable private @_split_reduction_pass2_dispatch_0 {
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
// Export ordinal 0. The region does not use its block arguments and returns the
// constant triple (8, 16, 1).
// NOTE(review): presumably this triple is the workgroup count; it matches the
// workgroups([%c8, %c16, %c1]) operand of the host-side dispatch — confirm.
hal.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 ordinal(0) layout(#pipeline_layout) attributes {translation_info = #translation} {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index):
%c1 = arith.constant 1 : index
%c16 = arith.constant 16 : index
%c8 = arith.constant 8 : index
hal.return %c8, %c16, %c1 : index, index, index
}
builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} {
// Lowered dispatch body. Arguments are pointers to the executable environment (%arg0,
// unused below), the dispatch state (%arg1) and the workgroup state (%arg2).
// The function reads two buffer pointers from the dispatch state, derives a tile
// origin from two i32 fields of the workgroup state, and for each element of a 32x32
// output tile sums 128 consecutive i32 values from the first buffer into the second
// buffer. It always returns 0.
llvm.func internal @_split_reduction_pass2_dispatch_0_generic_512x256x128(%arg0: !llvm.ptr<struct<"iree_hal_executable_environment_v0_t", (ptr<i32>, ptr<func<i32 (ptr<func<i32 (ptr<i8>)>>, ptr<i8>)>>, ptr<ptr<func<i32 (ptr<i8>)>>>, struct<"iree_hal_processor_v0_t", (array<8 x i64>)>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>> {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {sym_visibility = "private"} {
// Constants. Several values appear twice because they were materialized once from an
// i64 attribute and once from an index attribute (e.g. %4/%8 = 128, %5/%9 = 256).
%0 = llvm.mlir.constant(0 : i32) : i32
%1 = llvm.mlir.constant(3 : i64) : i64
%2 = llvm.mlir.constant(2 : i64) : i64
%3 = llvm.mlir.constant(32768 : i64) : i64
%4 = llvm.mlir.constant(128 : i64) : i64
%5 = llvm.mlir.constant(256 : i64) : i64
%6 = llvm.mlir.constant(1 : i64) : i64
%7 = llvm.mlir.constant(63 : index) : i64
%8 = llvm.mlir.constant(128 : index) : i64
%9 = llvm.mlir.constant(256 : index) : i64
%10 = llvm.mlir.constant(32768 : index) : i64
%11 = llvm.mlir.constant(0 : i64) : i64
%12 = llvm.mlir.constant(dense<0> : vector<4xi32>) : vector<4xi32>
%13 = llvm.mlir.constant(2 : index) : i64
%14 = llvm.mlir.constant(3 : index) : i64
%15 = llvm.mlir.constant(dense<0> : vector<1xi32>) : vector<1xi32>
%16 = llvm.mlir.constant(dense<0> : vector<1x1x4xi32>) : !llvm.array<1 x array<1 x vector<4xi32>>>
%17 = llvm.mlir.constant(4 : index) : i64
%18 = llvm.mlir.constant(1 : index) : i64
%19 = llvm.mlir.constant(32 : index) : i64
%20 = llvm.mlir.constant(0 : index) : i64
// First buffer pointer: element 0 of the ptr<ptr<i8>> stored in field 10 of the
// dispatch state, viewed as an i32 pointer (%24). This pointer is only loaded from
// below. The assume states that (pointer & 63) == 0, i.e. 64-byte alignment.
// NOTE(review): field 10 is presumably the binding pointer table — confirm against
// the iree_hal_executable_dispatch_state_v0_t definition.
%21 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>>
%22 = llvm.extractvalue %21[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>
%23 = llvm.load %22 : !llvm.ptr<ptr<i8>>
%24 = llvm.bitcast %23 : !llvm.ptr<i8> to !llvm.ptr<i32>
%25 = llvm.ptrtoint %24 : !llvm.ptr<i32> to i64
%26 = llvm.and %25, %7 : i64
%27 = llvm.icmp "eq" %26, %20 : i64
"llvm.intr.assume"(%27) : (i1) -> ()
// Second buffer pointer: element 1 of the same table, viewed as an i32 pointer (%32),
// with the same 64-byte alignment assumption. This pointer is both loaded from and
// stored to below.
%28 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>>
%29 = llvm.extractvalue %28[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>
%30 = llvm.getelementptr %29[1] : (!llvm.ptr<ptr<i8>>) -> !llvm.ptr<ptr<i8>>
%31 = llvm.load %30 : !llvm.ptr<ptr<i8>>
%32 = llvm.bitcast %31 : !llvm.ptr<i8> to !llvm.ptr<i32>
%33 = llvm.ptrtoint %32 : !llvm.ptr<i32> to i64
%34 = llvm.and %33, %7 : i64
%35 = llvm.icmp "eq" %34, %20 : i64
"llvm.intr.assume"(%35) : (i1) -> ()
// Fields 0 and 1 of the workgroup state, zero-extended to i64 and each scaled by 32.
// NOTE(review): presumably workgroup id x (field 0) and y (field 1) — confirm.
%36 = llvm.load %arg2 : !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>>
%37 = llvm.extractvalue %36[0] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>
%38 = llvm.zext %37 : i32 to i64
%39 = llvm.extractvalue %36[1] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>
%40 = llvm.zext %39 : i32 to i64
%41 = llvm.mul %40, %19 : i64
%42 = llvm.mul %38, %19 : i64
// %46: tile base offset into the second buffer = %41 * 256 + %42 * 1.
%43 = llvm.mul %41, %9 : i64
%44 = llvm.add %43, %20 : i64
%45 = llvm.mul %42, %18 : i64
%46 = llvm.add %44, %45 : i64
// %52: tile base offset into the first buffer = %41 * 32768 + %42 * 128 (+ 0).
%47 = llvm.mul %41, %10 : i64
%48 = llvm.add %47, %20 : i64
%49 = llvm.mul %42, %8 : i64
%50 = llvm.add %48, %49 : i64
%51 = llvm.mul %11, %18 : i64
%52 = llvm.add %50, %51 : i64
llvm.br ^bb1(%20 : i64)
// Loop 1: %53 runs from 0 while %53 < 32, incremented by 1 in ^bb10.
^bb1(%53: i64): // 2 preds: ^bb0, ^bb10
%54 = llvm.icmp "slt" %53, %19 : i64
llvm.cond_br %54, ^bb2(%20 : i64), ^bb11
// Loop 2: %55 runs from 0 while %55 < 32, incremented by 4 in ^bb9.
^bb2(%55: i64): // 2 preds: ^bb1, ^bb9
%56 = llvm.icmp "slt" %55, %19 : i64
llvm.cond_br %56, ^bb3, ^bb10
^bb3: // pred: ^bb2
// %62: first-buffer offset for (%53, %55) = %52 + %53 * 32768 + %55 * 128.
%57 = llvm.mul %53, %3 : i64
%58 = llvm.add %52, %57 : i64
%59 = llvm.mul %55, %4 : i64
%60 = llvm.add %58, %59 : i64
%61 = llvm.mul %11, %6 : i64
%62 = llvm.add %60, %61 : i64
// %66: second-buffer offset for (%53, %55) = %46 + %53 * 256 + %55.
%63 = llvm.mul %53, %5 : i64
%64 = llvm.add %46, %63 : i64
%65 = llvm.mul %55, %6 : i64
%66 = llvm.add %64, %65 : i64
llvm.br ^bb4(%20 : i64)
// Loop 3: %67 runs from 0 while %67 < 4, incremented by 1 in ^bb8. It walks the four
// elements covered by one step of loop 2.
^bb4(%67: i64): // 2 preds: ^bb3, ^bb8
%68 = llvm.icmp "slt" %67, %17 : i64
llvm.cond_br %68, ^bb5, ^bb9
^bb5: // pred: ^bb4
// %73: first-buffer offset of the 128-element run being reduced = %62 + %67 * 128.
%69 = llvm.mul %11, %3 : i64
%70 = llvm.add %62, %69 : i64
%71 = llvm.mul %67, %4 : i64
%72 = llvm.add %70, %71 : i64
%73 = llvm.add %72, %61 : i64
llvm.br ^bb6(%20, %16 : i64, !llvm.array<1 x array<1 x vector<4xi32>>>)
// Loop 4 (reduction): %74 runs from 0 while %74 < 32, step 1. %75 carries a
// vector<4xi32> accumulator (wrapped in a 1x1 array) that starts as all zeros (%16).
^bb6(%74: i64, %75: !llvm.array<1 x array<1 x vector<4xi32>>>): // 2 preds: ^bb5, ^bb7
%76 = llvm.icmp "slt" %74, %19 : i64
llvm.cond_br %76, ^bb7, ^bb8
^bb7: // pred: ^bb6
// Load four consecutive i32 values at first-buffer offset %73 + %74 * 4. The load is
// annotated with alignment 4.
%77 = llvm.mul %20, %10 : i64
%78 = llvm.add %73, %77 : i64
%79 = llvm.mul %20, %8 : i64
%80 = llvm.add %78, %79 : i64
%81 = llvm.mul %74, %17 : i64
%82 = llvm.add %80, %81 : i64
%83 = llvm.add %82, %20 : i64
%84 = llvm.getelementptr %24[%83] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32>
%85 = llvm.bitcast %84 : !llvm.ptr<i32> to !llvm.ptr<vector<4xi32>>
%86 = llvm.load %85 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>>
// Lane-by-lane add of the loaded vector (%86) and the accumulator (%87), written as
// scalar extract / add / insert for lanes 0..3; the result is %103.
%87 = llvm.extractvalue %75[0, 0] : !llvm.array<1 x array<1 x vector<4xi32>>>
%88 = llvm.extractelement %87[%11 : i64] : vector<4xi32>
%89 = llvm.extractelement %86[%11 : i64] : vector<4xi32>
%90 = llvm.add %89, %88 : i32
%91 = llvm.insertelement %90, %12[%20 : i64] : vector<4xi32>
%92 = llvm.extractelement %87[%6 : i64] : vector<4xi32>
%93 = llvm.extractelement %86[%6 : i64] : vector<4xi32>
%94 = llvm.add %93, %92 : i32
%95 = llvm.insertelement %94, %91[%18 : i64] : vector<4xi32>
%96 = llvm.extractelement %87[%2 : i64] : vector<4xi32>
%97 = llvm.extractelement %86[%2 : i64] : vector<4xi32>
%98 = llvm.add %97, %96 : i32
%99 = llvm.insertelement %98, %95[%13 : i64] : vector<4xi32>
%100 = llvm.extractelement %87[%1 : i64] : vector<4xi32>
%101 = llvm.extractelement %86[%1 : i64] : vector<4xi32>
%102 = llvm.add %101, %100 : i32
%103 = llvm.insertelement %102, %99[%14 : i64] : vector<4xi32>
// Copy the four lanes of %103 one at a time into a fresh 1x1x4 array value (%116),
// which becomes the accumulator for the next iteration.
%104 = llvm.extractelement %103[%11 : i64] : vector<4xi32>
%105 = llvm.extractvalue %16[0, 0] : !llvm.array<1 x array<1 x vector<4xi32>>>
%106 = llvm.insertelement %104, %105[%11 : i64] : vector<4xi32>
%107 = llvm.insertvalue %106, %16[0, 0] : !llvm.array<1 x array<1 x vector<4xi32>>>
%108 = llvm.extractelement %103[%6 : i64] : vector<4xi32>
%109 = llvm.insertelement %108, %106[%6 : i64] : vector<4xi32>
%110 = llvm.insertvalue %109, %107[0, 0] : !llvm.array<1 x array<1 x vector<4xi32>>>
%111 = llvm.extractelement %103[%2 : i64] : vector<4xi32>
%112 = llvm.insertelement %111, %109[%2 : i64] : vector<4xi32>
%113 = llvm.insertvalue %112, %110[0, 0] : !llvm.array<1 x array<1 x vector<4xi32>>>
%114 = llvm.extractelement %103[%1 : i64] : vector<4xi32>
%115 = llvm.insertelement %114, %112[%1 : i64] : vector<4xi32>
%116 = llvm.insertvalue %115, %113[0, 0] : !llvm.array<1 x array<1 x vector<4xi32>>>
%117 = llvm.add %74, %18 : i64
llvm.br ^bb6(%117, %116 : i64, !llvm.array<1 x array<1 x vector<4xi32>>>)
^bb8: // pred: ^bb6
// Reduction finished: horizontally add the four accumulator lanes, add that to the
// value currently stored at second-buffer offset %66 + %67, and store the sum back.
// The existing content of the output element is therefore part of the result.
%118 = llvm.mul %20, %9 : i64
%119 = llvm.add %66, %118 : i64
%120 = llvm.add %119, %67 : i64
%121 = llvm.getelementptr %32[%120] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32>
%122 = llvm.load %121 : !llvm.ptr<i32>
%123 = llvm.extractvalue %75[0, 0] : !llvm.array<1 x array<1 x vector<4xi32>>>
%124 = "llvm.intr.vector.reduce.add"(%123) : (vector<4xi32>) -> i32
%125 = llvm.add %122, %124 : i32
// The scalar goes through a vector<1xi32> insert/extract round trip before the store.
%126 = llvm.insertelement %125, %15[%20 : i64] : vector<1xi32>
%127 = llvm.extractelement %126[%11 : i64] : vector<1xi32>
llvm.store %127, %121 : !llvm.ptr<i32>
%128 = llvm.add %67, %18 : i64
llvm.br ^bb4(%128 : i64)
^bb9: // pred: ^bb4
// Latch of loop 2: advance %55 by 4.
%129 = llvm.add %55, %17 : i64
llvm.br ^bb2(%129 : i64)
^bb10: // pred: ^bb2
// Latch of loop 1: advance %53 by 1.
%130 = llvm.add %53, %18 : i64
llvm.br ^bb1(%130 : i64)
^bb11: // pred: ^bb1
// All loops done: return the constant 0.
llvm.return %0 : i32
}
}
}
}
// Host-side body of the test. It fills an input buffer with the i32 pattern 1, runs
// the dispatch into the first half of a second buffer, fills the second half of that
// buffer with the i32 pattern 128, and checks that the two halves compare equal.
// In this dump the constants come first in the entry block, followed by the three
// global loads.
func.func private @_split_reduction_pass2() {
%c1 = arith.constant 1 : index
%c256 = arith.constant 256 : index
%c512 = arith.constant 512 : index
%c128_i32 = arith.constant 128 : i32
%c1_i32 = arith.constant 1 : i32
// 67108864 = 512 * 256 * 128 * 4 bytes (one 512x256x128 i32 tensor).
%c67108864 = arith.constant 67108864 : index
// 524288 = 512 * 256 * 4 bytes (one 512x256 i32 tensor); 1048576 is twice that.
%c524288 = arith.constant 524288 : index
%c0 = arith.constant 0 : index
%c1048576 = arith.constant 1048576 : index
%c-1_i64 = arith.constant -1 : i64
%c-1_i32 = arith.constant -1 : i32
%c16 = arith.constant 16 : index
%c8 = arith.constant 8 : index
// NOTE(review): 268435488 is 0x10000020, presumably the HAL element-type code used
// for the i32 buffer views below — confirm.
%c268435488_i32 = arith.constant 268435488 : i32
%_device_query_0 = util.global.load @_device_query_0 : i1
%_pipeline_layout_0 = util.global.load @_pipeline_layout_0 : !hal.pipeline_layout
%_executable__split_reduction_pass2_dispatch_0 = util.global.load @_executable__split_reduction_pass2_dispatch_0 : !hal.executable
%device = hal.ex.shared_device : !hal.device
%allocator = hal.device.allocator<%device : !hal.device> : !hal.allocator
// Input buffer: allocate 67108864 bytes and record a command buffer that fills the
// whole range with the 32-bit pattern 1.
%buffer = hal.allocator.allocate<%allocator : !hal.allocator> type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c67108864}
%cmd = hal.command_buffer.create device(%device : !hal.device) mode("OneShot|AllowInlineExecution") categories(Transfer) : !hal.command_buffer
hal.command_buffer.fill_buffer<%cmd : !hal.command_buffer> target(%buffer : !hal.buffer)[%c0, %c67108864] pattern(%c1_i32 : i32)
hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None")
hal.command_buffer.finalize<%cmd : !hal.command_buffer>
// Submit with a null wait fence and block on the signal fence; a timeout of -1 is
// passed to the await.
%0 = util.null : !hal.fence
%fence = hal.fence.create device(%device : !hal.device) flags("None") : !hal.fence
hal.device.queue.execute<%device : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) commands([%cmd])
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.status.check_ok %status, "failed to wait on timepoint"
// The filled buffer is passed through util.do_not_optimize and its length is then
// queried from the result rather than reusing the allocation-size constant.
%1 = util.do_not_optimize(%buffer) : !hal.buffer
%len = hal.buffer.length<%1 : !hal.buffer> : index
// Result buffer: 1048576 bytes, host-visible and mappable. Bytes [0, 524288) receive
// the dispatch output; bytes [524288, 1048576) receive the expected values.
// NOTE(review): no fill of [0, 524288) is recorded before the dispatch below, while
// the kernel in this module loads each output element and adds to it before storing.
// Confirm that the initial contents of this range are defined (e.g. zeroed).
%buffer_0 = hal.allocator.allocate<%allocator : !hal.allocator> type("HostVisible|DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage|MappingScoped|MappingAccessRandom|Mapping") : !hal.buffer{%c1048576}
%cmd_1 = hal.command_buffer.create device(%device : !hal.device) mode("OneShot|AllowInlineExecution") categories("Transfer|Dispatch") : !hal.command_buffer
// Only record and run the dispatch when the executable-format query was true.
cf.cond_br %_device_query_0, ^bb1, ^bb2
^bb1: // pred: ^bb0
// Binding 0 = the whole input buffer, binding 1 = the whole result buffer.
hal.command_buffer.push_descriptor_set<%cmd_1 : !hal.command_buffer> layout(%_pipeline_layout_0 : !hal.pipeline_layout)[%c0] bindings([
%c0 = (%1 : !hal.buffer)[%c0, %len],
%c1 = (%buffer_0 : !hal.buffer)[%c0, %c1048576]
])
// Dispatch export ordinal 0 with workgroups (8, 16, 1), then write the expected
// pattern 128 into the second half of the result buffer in the same command buffer.
hal.command_buffer.dispatch<%cmd_1 : !hal.command_buffer> target(%_executable__split_reduction_pass2_dispatch_0 : !hal.executable)[0] workgroups([%c8, %c16, %c1])
hal.command_buffer.fill_buffer<%cmd_1 : !hal.command_buffer> target(%buffer_0 : !hal.buffer)[%c524288, %c524288] pattern(%c128_i32 : i32)
hal.command_buffer.execution_barrier<%cmd_1 : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None")
hal.command_buffer.finalize<%cmd_1 : !hal.command_buffer>
%fence_2 = hal.fence.create device(%device : !hal.device) flags("None") : !hal.fence
hal.device.queue.execute<%device : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence_2) commands([%cmd_1])
%status_3 = hal.fence.await until([%fence_2]) timeout_millis(%c-1_i32) : i32
util.status.check_ok %status_3, "failed to wait on timepoint"
// Two 512x256 views over the same buffer: %view covers the dispatch output half,
// %view_4 covers the pattern-filled half. The test passes when they compare equal.
%view = hal.buffer_view.create buffer(%buffer_0 : !hal.buffer)[%c0, %c524288] shape([%c512, %c256]) type(%c268435488_i32) encoding(%c1_i32) : !hal.buffer_view
%view_4 = hal.buffer_view.create buffer(%buffer_0 : !hal.buffer)[%c524288, %c524288] shape([%c512, %c256]) type(%c268435488_i32) encoding(%c1_i32) : !hal.buffer_view
check.expect_eq(%view, %view_4) : !hal.buffer_view
return
^bb2: // pred: ^bb0
// Reached only when the device query was false.
util.unreachable "device not supported in the compiled configuration"
}
}
// -----// IR Dump After FoldGlobals (iree-util-fold-globals) ('builtin.module' operation) //----- //
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
// Cached result of asking the shared device about the
// "hal.executable.format" :: "embedded-elf-x86_64" key.
util.global private @_device_query_0 : i1
// Initializer: runs the device query and stores its value result in the global.
util.initializer {
%device = hal.ex.shared_device : !hal.device
// %ok is not used in this initializer; only %value is stored.
// NOTE(review): the trailing `= false` is presumably the default used when the
// key is not recognized — confirm against the HAL dialect documentation.
%ok, %value = hal.device.query<%device : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false
util.global.store %value, @_device_query_0 : i1
util.initializer.return
}
// Descriptor set layout shared by the dispatch: binding 0 is a ReadOnly
// storage buffer, binding 1 is a storage buffer with no access qualifier.
util.global private @_descriptor_set_layout_0 : !hal.descriptor_set_layout
// Initializer: creates the layout on the shared device and stores it in the global.
util.initializer {
%device = hal.ex.shared_device : !hal.device
%descriptor_set_layout = hal.descriptor_set_layout.create device(%device : !hal.device) flags("None") bindings([#hal.descriptor_set.binding<0, storage_buffer, ReadOnly>, #hal.descriptor_set.binding<1, storage_buffer>]) : !hal.descriptor_set_layout
util.global.store %descriptor_set_layout, @_descriptor_set_layout_0 : !hal.descriptor_set_layout
util.initializer.return
}
// Pipeline layout built from the single descriptor set layout above, with
// zero push constants.
util.global private @_pipeline_layout_0 : !hal.pipeline_layout
// Initializer: loads @_descriptor_set_layout_0, so it depends on that global
// having been stored before this initializer runs.
util.initializer {
%_descriptor_set_layout_0 = util.global.load @_descriptor_set_layout_0 : !hal.descriptor_set_layout
%device = hal.ex.shared_device : !hal.device
%pipeline_layout = hal.pipeline_layout.create device(%device : !hal.device) push_constants(0) layouts([%_descriptor_set_layout_0]) : !hal.pipeline_layout
util.global.store %pipeline_layout, @_pipeline_layout_0 : !hal.pipeline_layout
util.initializer.return
}
// Handle to the loaded executable for the dispatch; holds a null executable
// when the device query reported false.
util.global private @_executable__split_reduction_pass2_dispatch_0 : !hal.executable
// Initializer: reads @_device_query_0 and @_pipeline_layout_0, so both must
// already have been stored when this runs.
util.initializer {
%_device_query_0 = util.global.load @_device_query_0 : i1
%device = hal.ex.shared_device : !hal.device
// Branch on the cached format query.
cf.cond_br %_device_query_0, ^bb1, ^bb2
^bb1: // pred: ^bb0
// Query was true: create the executable from the embedded ELF variant.
%_pipeline_layout_0 = util.global.load @_pipeline_layout_0 : !hal.pipeline_layout
%exe = hal.executable.create device(%device : !hal.device) target(@_split_reduction_pass2_dispatch_0::@embedded_elf_x86_64) layouts([%_pipeline_layout_0]) : !hal.executable
cf.br ^bb3(%exe : !hal.executable)
^bb2: // pred: ^bb0
// Query was false: fall through with a null executable.
%0 = util.null : !hal.executable
cf.br ^bb3(%0 : !hal.executable)
^bb3(%1: !hal.executable): // 2 preds: ^bb1, ^bb2
// Join point: store whichever value arrived (real executable or null).
util.global.store %1, @_executable__split_reduction_pass2_dispatch_0 : !hal.executable
util.initializer.return
}
// Public entry point carrying the iree.abi.stub attribute; it takes no
// arguments, returns nothing, and only forwards to the private
// @_split_reduction_pass2 implementation.
func.func @split_reduction_pass2() attributes {iree.abi.stub} {
call @_split_reduction_pass2() : () -> ()
return
}
// Executable holding the single dispatch of this program: one variant
// (embedded ELF, x86_64) with one export (ordinal 0) and its LLVM-dialect body.
hal.executable private @_split_reduction_pass2_dispatch_0 {
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
// Export region: returns the constant workgroup count (8, 16, 1). The device
// and the three index arguments are not used.
hal.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 ordinal(0) layout(#pipeline_layout) attributes {translation_info = #translation} {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index):
%c1 = arith.constant 1 : index
%c16 = arith.constant 16 : index
%c8 = arith.constant 8 : index
hal.return %c8, %c16, %c1 : index, index, index
}
builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} {
// Kernel body. %arg0 (environment) is not read; %arg1 is the dispatch state,
// %arg2 the workgroup state. Always returns 0.
// Loop structure (all bounds are the constants defined below):
//   ^bb1: %53 in [0, 32) step 1
//   ^bb2: %55 in [0, 32) step 4
//   ^bb4: %67 in [0, 4)  step 1
//   ^bb6: %74 in [0, 32) step 1, loading 4 x i32 per iteration (128 values)
// For each (%53, %55 + %67) pair, 128 consecutive i32 values from the first
// pointer are summed and added into one i32 slot of the second pointer.
llvm.func internal @_split_reduction_pass2_dispatch_0_generic_512x256x128(%arg0: !llvm.ptr<struct<"iree_hal_executable_environment_v0_t", (ptr<i32>, ptr<func<i32 (ptr<func<i32 (ptr<i8>)>>, ptr<i8>)>>, ptr<ptr<func<i32 (ptr<i8>)>>>, struct<"iree_hal_processor_v0_t", (array<8 x i64>)>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>> {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {sym_visibility = "private"} {
// Constant pool. Several values appear twice because the `i64`-attributed and
// `index`-attributed forms were materialized separately.
%0 = llvm.mlir.constant(0 : i32) : i32
%1 = llvm.mlir.constant(3 : i64) : i64
%2 = llvm.mlir.constant(2 : i64) : i64
%3 = llvm.mlir.constant(32768 : i64) : i64
%4 = llvm.mlir.constant(128 : i64) : i64
%5 = llvm.mlir.constant(256 : i64) : i64
%6 = llvm.mlir.constant(1 : i64) : i64
%7 = llvm.mlir.constant(63 : index) : i64
%8 = llvm.mlir.constant(128 : index) : i64
%9 = llvm.mlir.constant(256 : index) : i64
%10 = llvm.mlir.constant(32768 : index) : i64
%11 = llvm.mlir.constant(0 : i64) : i64
%12 = llvm.mlir.constant(dense<0> : vector<4xi32>) : vector<4xi32>
%13 = llvm.mlir.constant(2 : index) : i64
%14 = llvm.mlir.constant(3 : index) : i64
%15 = llvm.mlir.constant(dense<0> : vector<1xi32>) : vector<1xi32>
%16 = llvm.mlir.constant(dense<0> : vector<1x1x4xi32>) : !llvm.array<1 x array<1 x vector<4xi32>>>
%17 = llvm.mlir.constant(4 : index) : i64
%18 = llvm.mlir.constant(1 : index) : i64
%19 = llvm.mlir.constant(32 : index) : i64
%20 = llvm.mlir.constant(0 : index) : i64
// First pointer (%24): entry 0 of the pointer table at dispatch-state field 10.
// NOTE(review): presumably the base pointer of binding 0 (the input) — confirm
// against the IREE executable ABI struct definition.
%21 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>>
%22 = llvm.extractvalue %21[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>
%23 = llvm.load %22 : !llvm.ptr<ptr<i8>>
%24 = llvm.bitcast %23 : !llvm.ptr<i8> to !llvm.ptr<i32>
// Tell LLVM the pointer is 64-byte aligned: (ptr & 63) == 0.
%25 = llvm.ptrtoint %24 : !llvm.ptr<i32> to i64
%26 = llvm.and %25, %7 : i64
%27 = llvm.icmp "eq" %26, %20 : i64
"llvm.intr.assume"(%27) : (i1) -> ()
// Second pointer (%32): entry 1 of the same table, with the same alignment
// assumption. NOTE(review): presumably binding 1 (the output) — confirm.
%28 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>>
%29 = llvm.extractvalue %28[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>
%30 = llvm.getelementptr %29[1] : (!llvm.ptr<ptr<i8>>) -> !llvm.ptr<ptr<i8>>
%31 = llvm.load %30 : !llvm.ptr<ptr<i8>>
%32 = llvm.bitcast %31 : !llvm.ptr<i8> to !llvm.ptr<i32>
%33 = llvm.ptrtoint %32 : !llvm.ptr<i32> to i64
%34 = llvm.and %33, %7 : i64
%35 = llvm.icmp "eq" %34, %20 : i64
"llvm.intr.assume"(%35) : (i1) -> ()
// Fields 0 and 1 of the workgroup state, zero-extended to i64.
// NOTE(review): presumably the workgroup id along x and y — confirm.
%36 = llvm.load %arg2 : !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>>
%37 = llvm.extractvalue %36[0] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>
%38 = llvm.zext %37 : i32 to i64
%39 = llvm.extractvalue %36[1] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>
%40 = llvm.zext %39 : i32 to i64
// Tile origin: each workgroup-state value is scaled by 32.
%41 = llvm.mul %40, %19 : i64
%42 = llvm.mul %38, %19 : i64
// %46 = %41 * 256 + %42: element offset of the tile in the second pointer.
%43 = llvm.mul %41, %9 : i64
%44 = llvm.add %43, %20 : i64
%45 = llvm.mul %42, %18 : i64
%46 = llvm.add %44, %45 : i64
// %52 = %41 * 32768 + %42 * 128: element offset of the tile in the first pointer.
%47 = llvm.mul %41, %10 : i64
%48 = llvm.add %47, %20 : i64
%49 = llvm.mul %42, %8 : i64
%50 = llvm.add %48, %49 : i64
%51 = llvm.mul %11, %18 : i64
%52 = llvm.add %50, %51 : i64
llvm.br ^bb1(%20 : i64)
// Outer loop header: %53 counts 0..31.
^bb1(%53: i64): // 2 preds: ^bb0, ^bb10
%54 = llvm.icmp "slt" %53, %19 : i64
llvm.cond_br %54, ^bb2(%20 : i64), ^bb11
// Second loop header: %55 counts 0, 4, ..., 28 (incremented by 4 in ^bb9).
^bb2(%55: i64): // 2 preds: ^bb1, ^bb9
%56 = llvm.icmp "slt" %55, %19 : i64
llvm.cond_br %56, ^bb3, ^bb10
^bb3: // pred: ^bb2
// %62 = %52 + %53 * 32768 + %55 * 128: offset into the first pointer.
%57 = llvm.mul %53, %3 : i64
%58 = llvm.add %52, %57 : i64
%59 = llvm.mul %55, %4 : i64
%60 = llvm.add %58, %59 : i64
%61 = llvm.mul %11, %6 : i64
%62 = llvm.add %60, %61 : i64
// %66 = %46 + %53 * 256 + %55: offset into the second pointer.
%63 = llvm.mul %53, %5 : i64
%64 = llvm.add %46, %63 : i64
%65 = llvm.mul %55, %6 : i64
%66 = llvm.add %64, %65 : i64
llvm.br ^bb4(%20 : i64)
// Third loop header: %67 counts 0..3, covering the 4 slots of the ^bb2 step.
^bb4(%67: i64): // 2 preds: ^bb3, ^bb8
%68 = llvm.icmp "slt" %67, %17 : i64
llvm.cond_br %68, ^bb5, ^bb9
^bb5: // pred: ^bb4
// %73 = %62 + %67 * 128: start of the 128-element run to be summed.
%69 = llvm.mul %11, %3 : i64
%70 = llvm.add %62, %69 : i64
%71 = llvm.mul %67, %4 : i64
%72 = llvm.add %70, %71 : i64
%73 = llvm.add %72, %61 : i64
// Enter the reduction loop with a zero vector accumulator (%16).
llvm.br ^bb6(%20, %16 : i64, !llvm.array<1 x array<1 x vector<4xi32>>>)
// Reduction loop header: %74 counts 0..31, %75 carries the 4-lane partial sums.
^bb6(%74: i64, %75: !llvm.array<1 x array<1 x vector<4xi32>>>): // 2 preds: ^bb5, ^bb7
%76 = llvm.icmp "slt" %74, %19 : i64
llvm.cond_br %76, ^bb7, ^bb8
^bb7: // pred: ^bb6
// %83 = %73 + %74 * 4 (the multiplications by %20 contribute zero).
%77 = llvm.mul %20, %10 : i64
%78 = llvm.add %73, %77 : i64
%79 = llvm.mul %20, %8 : i64
%80 = llvm.add %78, %79 : i64
%81 = llvm.mul %74, %17 : i64
%82 = llvm.add %80, %81 : i64
%83 = llvm.add %82, %20 : i64
// Load 4 consecutive i32 values; the load only claims 4-byte alignment.
%84 = llvm.getelementptr %24[%83] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32>
%85 = llvm.bitcast %84 : !llvm.ptr<i32> to !llvm.ptr<vector<4xi32>>
%86 = llvm.load %85 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>>
// Lane-by-lane add of the loaded vector (%86) and the accumulator (%87);
// the four sums are assembled into %103.
%87 = llvm.extractvalue %75[0, 0] : !llvm.array<1 x array<1 x vector<4xi32>>>
%88 = llvm.extractelement %87[%11 : i64] : vector<4xi32>
%89 = llvm.extractelement %86[%11 : i64] : vector<4xi32>
%90 = llvm.add %89, %88 : i32
%91 = llvm.insertelement %90, %12[%20 : i64] : vector<4xi32>
%92 = llvm.extractelement %87[%6 : i64] : vector<4xi32>
%93 = llvm.extractelement %86[%6 : i64] : vector<4xi32>
%94 = llvm.add %93, %92 : i32
%95 = llvm.insertelement %94, %91[%18 : i64] : vector<4xi32>
%96 = llvm.extractelement %87[%2 : i64] : vector<4xi32>
%97 = llvm.extractelement %86[%2 : i64] : vector<4xi32>
%98 = llvm.add %97, %96 : i32
%99 = llvm.insertelement %98, %95[%13 : i64] : vector<4xi32>
%100 = llvm.extractelement %87[%1 : i64] : vector<4xi32>
%101 = llvm.extractelement %86[%1 : i64] : vector<4xi32>
%102 = llvm.add %101, %100 : i32
%103 = llvm.insertelement %102, %99[%14 : i64] : vector<4xi32>
// Copy %103 lane by lane into a fresh array value built from the zero
// constant %16; %116 ends up holding the same four lanes as %103.
%104 = llvm.extractelement %103[%11 : i64] : vector<4xi32>
%105 = llvm.extractvalue %16[0, 0] : !llvm.array<1 x array<1 x vector<4xi32>>>
%106 = llvm.insertelement %104, %105[%11 : i64] : vector<4xi32>
%107 = llvm.insertvalue %106, %16[0, 0] : !llvm.array<1 x array<1 x vector<4xi32>>>
%108 = llvm.extractelement %103[%6 : i64] : vector<4xi32>
%109 = llvm.insertelement %108, %106[%6 : i64] : vector<4xi32>
%110 = llvm.insertvalue %109, %107[0, 0] : !llvm.array<1 x array<1 x vector<4xi32>>>
%111 = llvm.extractelement %103[%2 : i64] : vector<4xi32>
%112 = llvm.insertelement %111, %109[%2 : i64] : vector<4xi32>
%113 = llvm.insertvalue %112, %110[0, 0] : !llvm.array<1 x array<1 x vector<4xi32>>>
%114 = llvm.extractelement %103[%1 : i64] : vector<4xi32>
%115 = llvm.insertelement %114, %112[%1 : i64] : vector<4xi32>
%116 = llvm.insertvalue %115, %113[0, 0] : !llvm.array<1 x array<1 x vector<4xi32>>>
// Next reduction iteration with the updated accumulator.
%117 = llvm.add %74, %18 : i64
llvm.br ^bb6(%117, %116 : i64, !llvm.array<1 x array<1 x vector<4xi32>>>)
^bb8: // pred: ^bb6
// Reduction finished: %120 = %66 + %67 is the slot in the second pointer.
%118 = llvm.mul %20, %9 : i64
%119 = llvm.add %66, %118 : i64
%120 = llvm.add %119, %67 : i64
%121 = llvm.getelementptr %32[%120] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32>
// Read-modify-write: the value already stored in the slot is loaded and the
// horizontal sum of the 4 accumulator lanes is added to it, so the result
// depends on what the slot held before this kernel ran.
%122 = llvm.load %121 : !llvm.ptr<i32>
%123 = llvm.extractvalue %75[0, 0] : !llvm.array<1 x array<1 x vector<4xi32>>>
%124 = "llvm.intr.vector.reduce.add"(%123) : (vector<4xi32>) -> i32
%125 = llvm.add %122, %124 : i32
// Round trip through vector<1xi32>: %127 is the same value as %125.
%126 = llvm.insertelement %125, %15[%20 : i64] : vector<1xi32>
%127 = llvm.extractelement %126[%11 : i64] : vector<1xi32>
llvm.store %127, %121 : !llvm.ptr<i32>
%128 = llvm.add %67, %18 : i64
llvm.br ^bb4(%128 : i64)
^bb9: // pred: ^bb4
// Advance the second loop by 4.
%129 = llvm.add %55, %17 : i64
llvm.br ^bb2(%129 : i64)
^bb10: // pred: ^bb2
// Advance the outer loop by 1.
%130 = llvm.add %53, %18 : i64
llvm.br ^bb1(%130 : i64)
^bb11: // pred: ^bb1
// All loops done: return the i32 constant 0.
llvm.return %0 : i32
}
}
}
}
// Host-side test driver. Steps, in order:
//   1. Allocate a 67108864-byte buffer and fill it with the i32 pattern 1.
//   2. Allocate a 1048576-byte buffer (%buffer_0) and, in one command buffer,
//      dispatch the kernel over it and fill its upper 524288 bytes with the
//      i32 pattern 128.
//   3. Compare the lower half of %buffer_0 against the upper half.
// NOTE(review): the kernel in @_split_reduction_pass2_dispatch_0 loads each
// output slot and adds to it, but no fill of bytes [0, 524288) of %buffer_0 is
// visible in this function before the dispatch. Unless the allocator returns
// zeroed memory, the comparison depends on uninitialized contents — confirm.
func.func private @_split_reduction_pass2() {
%c1 = arith.constant 1 : index
%c256 = arith.constant 256 : index
%c512 = arith.constant 512 : index
%c128_i32 = arith.constant 128 : i32
%c1_i32 = arith.constant 1 : i32
%c67108864 = arith.constant 67108864 : index
%c524288 = arith.constant 524288 : index
%c0 = arith.constant 0 : index
%c1048576 = arith.constant 1048576 : index
%c-1_i64 = arith.constant -1 : i64
%c-1_i32 = arith.constant -1 : i32
%c16 = arith.constant 16 : index
%c8 = arith.constant 8 : index
// NOTE(review): 268435488 is passed as the buffer view element type below;
// presumably the HAL encoding for i32 — confirm.
%c268435488_i32 = arith.constant 268435488 : i32
// Resources prepared by the module initializers.
%_device_query_0 = util.global.load @_device_query_0 : i1
%_pipeline_layout_0 = util.global.load @_pipeline_layout_0 : !hal.pipeline_layout
%_executable__split_reduction_pass2_dispatch_0 = util.global.load @_executable__split_reduction_pass2_dispatch_0 : !hal.executable
%device = hal.ex.shared_device : !hal.device
%allocator = hal.device.allocator<%device : !hal.device> : !hal.allocator
// Input buffer: 67108864 bytes (= 512 * 256 * 128 * 4), filled entirely with 1.
%buffer = hal.allocator.allocate<%allocator : !hal.allocator> type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c67108864}
%cmd = hal.command_buffer.create device(%device : !hal.device) mode("OneShot|AllowInlineExecution") categories(Transfer) : !hal.command_buffer
hal.command_buffer.fill_buffer<%cmd : !hal.command_buffer> target(%buffer : !hal.buffer)[%c0, %c67108864] pattern(%c1_i32 : i32)
hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None")
hal.command_buffer.finalize<%cmd : !hal.command_buffer>
// Submit with a null wait fence and block on the signal fence
// (timeout_millis is -1).
%0 = util.null : !hal.fence
%fence = hal.fence.create device(%device : !hal.device) flags("None") : !hal.fence
hal.device.queue.execute<%device : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) commands([%cmd])
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.status.check_ok %status, "failed to wait on timepoint"
// The filled buffer is routed through do_not_optimize, and its length is
// queried at runtime instead of reusing the constant.
%1 = util.do_not_optimize(%buffer) : !hal.buffer
%len = hal.buffer.length<%1 : !hal.buffer> : index
// Result buffer: 1048576 bytes = two 524288-byte halves (512 * 256 * 4 each).
%buffer_0 = hal.allocator.allocate<%allocator : !hal.allocator> type("HostVisible|DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage|MappingScoped|MappingAccessRandom|Mapping") : !hal.buffer{%c1048576}
%cmd_1 = hal.command_buffer.create device(%device : !hal.device) mode("OneShot|AllowInlineExecution") categories("Transfer|Dispatch") : !hal.command_buffer
// Only record and run the dispatch when the executable format is supported.
cf.cond_br %_device_query_0, ^bb1, ^bb2
^bb1: // pred: ^bb0
// Binding 0 = the whole input buffer, binding 1 = the whole result buffer.
hal.command_buffer.push_descriptor_set<%cmd_1 : !hal.command_buffer> layout(%_pipeline_layout_0 : !hal.pipeline_layout)[%c0] bindings([
%c0 = (%1 : !hal.buffer)[%c0, %len],
%c1 = (%buffer_0 : !hal.buffer)[%c0, %c1048576]
])
// Export ordinal 0 with workgroup count (8, 16, 1).
hal.command_buffer.dispatch<%cmd_1 : !hal.command_buffer> target(%_executable__split_reduction_pass2_dispatch_0 : !hal.executable)[0] workgroups([%c8, %c16, %c1])
// Expected values: fill the upper half of %buffer_0 with 128. No barrier is
// recorded between the dispatch and this fill.
hal.command_buffer.fill_buffer<%cmd_1 : !hal.command_buffer> target(%buffer_0 : !hal.buffer)[%c524288, %c524288] pattern(%c128_i32 : i32)
hal.command_buffer.execution_barrier<%cmd_1 : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None")
hal.command_buffer.finalize<%cmd_1 : !hal.command_buffer>
%fence_2 = hal.fence.create device(%device : !hal.device) flags("None") : !hal.fence
hal.device.queue.execute<%device : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence_2) commands([%cmd_1])
%status_3 = hal.fence.await until([%fence_2]) timeout_millis(%c-1_i32) : i32
util.status.check_ok %status_3, "failed to wait on timepoint"
// %view covers the lower half (dispatch result), %view_4 the upper half
// (fill pattern); both are shaped 512x256.
%view = hal.buffer_view.create buffer(%buffer_0 : !hal.buffer)[%c0, %c524288] shape([%c512, %c256]) type(%c268435488_i32) encoding(%c1_i32) : !hal.buffer_view
%view_4 = hal.buffer_view.create buffer(%buffer_0 : !hal.buffer)[%c524288, %c524288] shape([%c512, %c256]) type(%c268435488_i32) encoding(%c1_i32) : !hal.buffer_view
check.expect_eq(%view, %view_4) : !hal.buffer_view
return
^bb2: // pred: ^bb0
// Taken when the device query reported false.
util.unreachable "device not supported in the compiled configuration"
}
}
// -----// IR Dump After CombineInitializers (iree-util-combine-initializers) ('builtin.module' operation) //----- //
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
// Module globals. All four are stored by the single combined initializer below.
util.global private @_device_query_0 : i1
util.global private @_descriptor_set_layout_0 : !hal.descriptor_set_layout
util.global private @_pipeline_layout_0 : !hal.pipeline_layout
util.global private @_executable__split_reduction_pass2_dispatch_0 : !hal.executable
// Combined initializer. It runs four steps in sequence: device format query,
// descriptor set layout, pipeline layout, executable. Later steps reload the
// globals stored by earlier ones, and each step fetches the shared device
// again (%device, %device_0, %device_1, %device_2).
util.initializer {
// Step 1: cache the "embedded-elf-x86_64" format query. %ok is not used.
%device = hal.ex.shared_device : !hal.device
%ok, %value = hal.device.query<%device : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false
util.global.store %value, @_device_query_0 : i1
// Step 2: descriptor set layout with a ReadOnly storage buffer at binding 0
// and a storage buffer at binding 1.
%device_0 = hal.ex.shared_device : !hal.device
%descriptor_set_layout = hal.descriptor_set_layout.create device(%device_0 : !hal.device) flags("None") bindings([#hal.descriptor_set.binding<0, storage_buffer, ReadOnly>, #hal.descriptor_set.binding<1, storage_buffer>]) : !hal.descriptor_set_layout
util.global.store %descriptor_set_layout, @_descriptor_set_layout_0 : !hal.descriptor_set_layout
// Step 3: pipeline layout over the layout just stored (reloaded from the
// global), with zero push constants.
%_descriptor_set_layout_0 = util.global.load @_descriptor_set_layout_0 : !hal.descriptor_set_layout
%device_1 = hal.ex.shared_device : !hal.device
%pipeline_layout = hal.pipeline_layout.create device(%device_1 : !hal.device) push_constants(0) layouts([%_descriptor_set_layout_0]) : !hal.pipeline_layout
util.global.store %pipeline_layout, @_pipeline_layout_0 : !hal.pipeline_layout
// Step 4: create the executable only if the format query (reloaded from the
// global) was true; otherwise store a null executable.
%_device_query_0 = util.global.load @_device_query_0 : i1
%device_2 = hal.ex.shared_device : !hal.device
cf.cond_br %_device_query_0, ^bb1, ^bb2
^bb1: // pred: ^bb0
%_pipeline_layout_0 = util.global.load @_pipeline_layout_0 : !hal.pipeline_layout
%exe = hal.executable.create device(%device_2 : !hal.device) target(@_split_reduction_pass2_dispatch_0::@embedded_elf_x86_64) layouts([%_pipeline_layout_0]) : !hal.executable
cf.br ^bb3(%exe : !hal.executable)
^bb2: // pred: ^bb0
%0 = util.null : !hal.executable
cf.br ^bb3(%0 : !hal.executable)
^bb3(%1: !hal.executable): // 2 preds: ^bb1, ^bb2
util.global.store %1, @_executable__split_reduction_pass2_dispatch_0 : !hal.executable
// Unconditional branch to a block that only returns.
cf.br ^bb4
^bb4: // pred: ^bb3
util.initializer.return
}
// Public entry point carrying the iree.abi.stub attribute; it takes no
// arguments, returns nothing, and only forwards to the private
// @_split_reduction_pass2 implementation.
func.func @split_reduction_pass2() attributes {iree.abi.stub} {
call @_split_reduction_pass2() : () -> ()
return
}
// Executable holding the single dispatch of this program: one variant
// (embedded ELF, x86_64) with one export (ordinal 0) and its LLVM-dialect body.
hal.executable private @_split_reduction_pass2_dispatch_0 {
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
// Export region: returns the constant workgroup count (8, 16, 1). The device
// and the three index arguments are not used.
hal.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 ordinal(0) layout(#pipeline_layout) attributes {translation_info = #translation} {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index):
%c1 = arith.constant 1 : index
%c16 = arith.constant 16 : index
%c8 = arith.constant 8 : index
hal.return %c8, %c16, %c1 : index, index, index
}
builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} {
// Kernel body. %arg0 (environment) is not read; %arg1 is the dispatch state,
// %arg2 the workgroup state. Always returns 0.
// Loop structure (all bounds are the constants defined below):
//   ^bb1: %53 in [0, 32) step 1
//   ^bb2: %55 in [0, 32) step 4
//   ^bb4: %67 in [0, 4)  step 1
//   ^bb6: %74 in [0, 32) step 1, loading 4 x i32 per iteration (128 values)
// For each (%53, %55 + %67) pair, 128 consecutive i32 values from the first
// pointer are summed and added into one i32 slot of the second pointer.
llvm.func internal @_split_reduction_pass2_dispatch_0_generic_512x256x128(%arg0: !llvm.ptr<struct<"iree_hal_executable_environment_v0_t", (ptr<i32>, ptr<func<i32 (ptr<func<i32 (ptr<i8>)>>, ptr<i8>)>>, ptr<ptr<func<i32 (ptr<i8>)>>>, struct<"iree_hal_processor_v0_t", (array<8 x i64>)>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>> {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {sym_visibility = "private"} {
// Constant pool. Several values appear twice because the `i64`-attributed and
// `index`-attributed forms were materialized separately.
%0 = llvm.mlir.constant(0 : i32) : i32
%1 = llvm.mlir.constant(3 : i64) : i64
%2 = llvm.mlir.constant(2 : i64) : i64
%3 = llvm.mlir.constant(32768 : i64) : i64
%4 = llvm.mlir.constant(128 : i64) : i64
%5 = llvm.mlir.constant(256 : i64) : i64
%6 = llvm.mlir.constant(1 : i64) : i64
%7 = llvm.mlir.constant(63 : index) : i64
%8 = llvm.mlir.constant(128 : index) : i64
%9 = llvm.mlir.constant(256 : index) : i64
%10 = llvm.mlir.constant(32768 : index) : i64
%11 = llvm.mlir.constant(0 : i64) : i64
%12 = llvm.mlir.constant(dense<0> : vector<4xi32>) : vector<4xi32>
%13 = llvm.mlir.constant(2 : index) : i64
%14 = llvm.mlir.constant(3 : index) : i64
%15 = llvm.mlir.constant(dense<0> : vector<1xi32>) : vector<1xi32>
%16 = llvm.mlir.constant(dense<0> : vector<1x1x4xi32>) : !llvm.array<1 x array<1 x vector<4xi32>>>
%17 = llvm.mlir.constant(4 : index) : i64
%18 = llvm.mlir.constant(1 : index) : i64
%19 = llvm.mlir.constant(32 : index) : i64
%20 = llvm.mlir.constant(0 : index) : i64
// First pointer (%24): entry 0 of the pointer table at dispatch-state field 10.
// NOTE(review): presumably the base pointer of binding 0 (the input) — confirm
// against the IREE executable ABI struct definition.
%21 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>>
%22 = llvm.extractvalue %21[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>
%23 = llvm.load %22 : !llvm.ptr<ptr<i8>>
%24 = llvm.bitcast %23 : !llvm.ptr<i8> to !llvm.ptr<i32>
// Tell LLVM the pointer is 64-byte aligned: (ptr & 63) == 0.
%25 = llvm.ptrtoint %24 : !llvm.ptr<i32> to i64
%26 = llvm.and %25, %7 : i64
%27 = llvm.icmp "eq" %26, %20 : i64
"llvm.intr.assume"(%27) : (i1) -> ()
// Second pointer (%32): entry 1 of the same table, with the same alignment
// assumption. NOTE(review): presumably binding 1 (the output) — confirm.
%28 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>>
%29 = llvm.extractvalue %28[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>
%30 = llvm.getelementptr %29[1] : (!llvm.ptr<ptr<i8>>) -> !llvm.ptr<ptr<i8>>
%31 = llvm.load %30 : !llvm.ptr<ptr<i8>>
%32 = llvm.bitcast %31 : !llvm.ptr<i8> to !llvm.ptr<i32>
%33 = llvm.ptrtoint %32 : !llvm.ptr<i32> to i64
%34 = llvm.and %33, %7 : i64
%35 = llvm.icmp "eq" %34, %20 : i64
"llvm.intr.assume"(%35) : (i1) -> ()
// Fields 0 and 1 of the workgroup state, zero-extended to i64.
// NOTE(review): presumably the workgroup id along x and y — confirm.
%36 = llvm.load %arg2 : !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>>
%37 = llvm.extractvalue %36[0] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>
%38 = llvm.zext %37 : i32 to i64
%39 = llvm.extractvalue %36[1] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>
%40 = llvm.zext %39 : i32 to i64
// Tile origin: each workgroup-state value is scaled by 32.
%41 = llvm.mul %40, %19 : i64
%42 = llvm.mul %38, %19 : i64
// %46 = %41 * 256 + %42: element offset of the tile in the second pointer.
%43 = llvm.mul %41, %9 : i64
%44 = llvm.add %43, %20 : i64
%45 = llvm.mul %42, %18 : i64
%46 = llvm.add %44, %45 : i64
// %52 = %41 * 32768 + %42 * 128: element offset of the tile in the first pointer.
%47 = llvm.mul %41, %10 : i64
%48 = llvm.add %47, %20 : i64
%49 = llvm.mul %42, %8 : i64
%50 = llvm.add %48, %49 : i64
%51 = llvm.mul %11, %18 : i64
%52 = llvm.add %50, %51 : i64
llvm.br ^bb1(%20 : i64)
// Outer loop header: %53 counts 0..31.
^bb1(%53: i64): // 2 preds: ^bb0, ^bb10
%54 = llvm.icmp "slt" %53, %19 : i64
llvm.cond_br %54, ^bb2(%20 : i64), ^bb11
// Second loop header: %55 counts 0, 4, ..., 28 (incremented by 4 in ^bb9).
^bb2(%55: i64): // 2 preds: ^bb1, ^bb9
%56 = llvm.icmp "slt" %55, %19 : i64
llvm.cond_br %56, ^bb3, ^bb10
^bb3: // pred: ^bb2
// %62 = %52 + %53 * 32768 + %55 * 128: offset into the first pointer.
%57 = llvm.mul %53, %3 : i64
%58 = llvm.add %52, %57 : i64
%59 = llvm.mul %55, %4 : i64
%60 = llvm.add %58, %59 : i64
%61 = llvm.mul %11, %6 : i64
%62 = llvm.add %60, %61 : i64
// %66 = %46 + %53 * 256 + %55: offset into the second pointer.
%63 = llvm.mul %53, %5 : i64
%64 = llvm.add %46, %63 : i64
%65 = llvm.mul %55, %6 : i64
%66 = llvm.add %64, %65 : i64
llvm.br ^bb4(%20 : i64)
// Third loop header: %67 counts 0..3, covering the 4 slots of the ^bb2 step.
^bb4(%67: i64): // 2 preds: ^bb3, ^bb8
%68 = llvm.icmp "slt" %67, %17 : i64
llvm.cond_br %68, ^bb5, ^bb9
^bb5: // pred: ^bb4
// %73 = %62 + %67 * 128: start of the 128-element run to be summed.
%69 = llvm.mul %11, %3 : i64
%70 = llvm.add %62, %69 : i64
%71 = llvm.mul %67, %4 : i64
%72 = llvm.add %70, %71 : i64
%73 = llvm.add %72, %61 : i64
// Enter the reduction loop with a zero vector accumulator (%16).
llvm.br ^bb6(%20, %16 : i64, !llvm.array<1 x array<1 x vector<4xi32>>>)
// Reduction loop header: %74 counts 0..31, %75 carries the 4-lane partial sums.
^bb6(%74: i64, %75: !llvm.array<1 x array<1 x vector<4xi32>>>): // 2 preds: ^bb5, ^bb7
%76 = llvm.icmp "slt" %74, %19 : i64
llvm.cond_br %76, ^bb7, ^bb8
^bb7: // pred: ^bb6
// %83 = %73 + %74 * 4 (the multiplications by %20 contribute zero).
%77 = llvm.mul %20, %10 : i64
%78 = llvm.add %73, %77 : i64
%79 = llvm.mul %20, %8 : i64
%80 = llvm.add %78, %79 : i64
%81 = llvm.mul %74, %17 : i64
%82 = llvm.add %80, %81 : i64
%83 = llvm.add %82, %20 : i64
// Load 4 consecutive i32 values; the load only claims 4-byte alignment.
%84 = llvm.getelementptr %24[%83] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32>
%85 = llvm.bitcast %84 : !llvm.ptr<i32> to !llvm.ptr<vector<4xi32>>
%86 = llvm.load %85 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>>
// Lane-by-lane add of the loaded vector (%86) and the accumulator (%87);
// the four sums are assembled into %103.
%87 = llvm.extractvalue %75[0, 0] : !llvm.array<1 x array<1 x vector<4xi32>>>
%88 = llvm.extractelement %87[%11 : i64] : vector<4xi32>
%89 = llvm.extractelement %86[%11 : i64] : vector<4xi32>
%90 = llvm.add %89, %88 : i32
%91 = llvm.insertelement %90, %12[%20 : i64] : vector<4xi32>
%92 = llvm.extractelement %87[%6 : i64] : vector<4xi32>
%93 = llvm.extractelement %86[%6 : i64] : vector<4xi32>
%94 = llvm.add %93, %92 : i32
%95 = llvm.insertelement %94, %91[%18 : i64] : vector<4xi32>
%96 = llvm.extractelement %87[%2 : i64] : vector<4xi32>
%97 = llvm.extractelement %86[%2 : i64] : vector<4xi32>
%98 = llvm.add %97, %96 : i32
%99 = llvm.insertelement %98, %95[%13 : i64] : vector<4xi32>
%100 = llvm.extractelement %87[%1 : i64] : vector<4xi32>
%101 = llvm.extractelement %86[%1 : i64] : vector<4xi32>
%102 = llvm.add %101, %100 : i32
%103 = llvm.insertelement %102, %99[%14 : i64] : vector<4xi32>
// Copy %103 lane by lane into a fresh array value built from the zero
// constant %16; %116 ends up holding the same four lanes as %103.
%104 = llvm.extractelement %103[%11 : i64] : vector<4xi32>
%105 = llvm.extractvalue %16[0, 0] : !llvm.array<1 x array<1 x vector<4xi32>>>
%106 = llvm.insertelement %104, %105[%11 : i64] : vector<4xi32>
%107 = llvm.insertvalue %106, %16[0, 0] : !llvm.array<1 x array<1 x vector<4xi32>>>
%108 = llvm.extractelement %103[%6 : i64] : vector<4xi32>
%109 = llvm.insertelement %108, %106[%6 : i64] : vector<4xi32>
%110 = llvm.insertvalue %109, %107[0, 0] : !llvm.array<1 x array<1 x vector<4xi32>>>
%111 = llvm.extractelement %103[%2 : i64] : vector<4xi32>
%112 = llvm.insertelement %111, %109[%2 : i64] : vector<4xi32>
%113 = llvm.insertvalue %112, %110[0, 0] : !llvm.array<1 x array<1 x vector<4xi32>>>
%114 = llvm.extractelement %103[%1 : i64] : vector<4xi32>
%115 = llvm.insertelement %114, %112[%1 : i64] : vector<4xi32>
%116 = llvm.insertvalue %115, %113[0, 0] : !llvm.array<1 x array<1 x vector<4xi32>>>
// Next reduction iteration with the updated accumulator.
%117 = llvm.add %74, %18 : i64
llvm.br ^bb6(%117, %116 : i64, !llvm.array<1 x array<1 x vector<4xi32>>>)
^bb8: // pred: ^bb6
// Reduction finished: %120 = %66 + %67 is the slot in the second pointer.
%118 = llvm.mul %20, %9 : i64
%119 = llvm.add %66, %118 : i64
%120 = llvm.add %119, %67 : i64
%121 = llvm.getelementptr %32[%120] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32>
// Read-modify-write: the value already stored in the slot is loaded and the
// horizontal sum of the 4 accumulator lanes is added to it, so the result
// depends on what the slot held before this kernel ran.
%122 = llvm.load %121 : !llvm.ptr<i32>
%123 = llvm.extractvalue %75[0, 0] : !llvm.array<1 x array<1 x vector<4xi32>>>
%124 = "llvm.intr.vector.reduce.add"(%123) : (vector<4xi32>) -> i32
%125 = llvm.add %122, %124 : i32
// Round trip through vector<1xi32>: %127 is the same value as %125.
%126 = llvm.insertelement %125, %15[%20 : i64] : vector<1xi32>
%127 = llvm.extractelement %126[%11 : i64] : vector<1xi32>
llvm.store %127, %121 : !llvm.ptr<i32>
%128 = llvm.add %67, %18 : i64
llvm.br ^bb4(%128 : i64)
^bb9: // pred: ^bb4
// Advance the second loop by 4.
%129 = llvm.add %55, %17 : i64
llvm.br ^bb2(%129 : i64)
^bb10: // pred: ^bb2
// Advance the outer loop by 1.
%130 = llvm.add %53, %18 : i64
llvm.br ^bb1(%130 : i64)
^bb11: // pred: ^bb1
// All loops done: return the i32 constant 0.
llvm.return %0 : i32
}
}
}
}
// Host-side test driver. Steps, in order:
//   1. Allocate a 67108864-byte buffer and fill it with the i32 pattern 1.
//   2. Allocate a 1048576-byte buffer (%buffer_0) and, in one command buffer,
//      dispatch the kernel over it and fill its upper 524288 bytes with the
//      i32 pattern 128.
//   3. Compare the lower half of %buffer_0 against the upper half.
// NOTE(review): the kernel in @_split_reduction_pass2_dispatch_0 loads each
// output slot and adds to it, but no fill of bytes [0, 524288) of %buffer_0 is
// visible in this function before the dispatch. Unless the allocator returns
// zeroed memory, the comparison depends on uninitialized contents — confirm.
func.func private @_split_reduction_pass2() {
%c1 = arith.constant 1 : index
%c256 = arith.constant 256 : index
%c512 = arith.constant 512 : index
%c128_i32 = arith.constant 128 : i32
%c1_i32 = arith.constant 1 : i32
%c67108864 = arith.constant 67108864 : index
%c524288 = arith.constant 524288 : index
%c0 = arith.constant 0 : index
%c1048576 = arith.constant 1048576 : index
%c-1_i64 = arith.constant -1 : i64
%c-1_i32 = arith.constant -1 : i32
%c16 = arith.constant 16 : index
%c8 = arith.constant 8 : index
// NOTE(review): 268435488 is passed as the buffer view element type below;
// presumably the HAL encoding for i32 — confirm.
%c268435488_i32 = arith.constant 268435488 : i32
// Resources prepared by the module initializer.
%_device_query_0 = util.global.load @_device_query_0 : i1
%_pipeline_layout_0 = util.global.load @_pipeline_layout_0 : !hal.pipeline_layout
%_executable__split_reduction_pass2_dispatch_0 = util.global.load @_executable__split_reduction_pass2_dispatch_0 : !hal.executable
%device = hal.ex.shared_device : !hal.device
%allocator = hal.device.allocator<%device : !hal.device> : !hal.allocator
// Input buffer: 67108864 bytes (= 512 * 256 * 128 * 4), filled entirely with 1.
%buffer = hal.allocator.allocate<%allocator : !hal.allocator> type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c67108864}
%cmd = hal.command_buffer.create device(%device : !hal.device) mode("OneShot|AllowInlineExecution") categories(Transfer) : !hal.command_buffer
hal.command_buffer.fill_buffer<%cmd : !hal.command_buffer> target(%buffer : !hal.buffer)[%c0, %c67108864] pattern(%c1_i32 : i32)
hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None")
hal.command_buffer.finalize<%cmd : !hal.command_buffer>
// Submit with a null wait fence and block on the signal fence
// (timeout_millis is -1).
%0 = util.null : !hal.fence
%fence = hal.fence.create device(%device : !hal.device) flags("None") : !hal.fence
hal.device.queue.execute<%device : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) commands([%cmd])
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.status.check_ok %status, "failed to wait on timepoint"
// The filled buffer is routed through do_not_optimize, and its length is
// queried at runtime instead of reusing the constant.
%1 = util.do_not_optimize(%buffer) : !hal.buffer
%len = hal.buffer.length<%1 : !hal.buffer> : index
// Result buffer: 1048576 bytes = two 524288-byte halves (512 * 256 * 4 each).
%buffer_0 = hal.allocator.allocate<%allocator : !hal.allocator> type("HostVisible|DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage|MappingScoped|MappingAccessRandom|Mapping") : !hal.buffer{%c1048576}
%cmd_1 = hal.command_buffer.create device(%device : !hal.device) mode("OneShot|AllowInlineExecution") categories("Transfer|Dispatch") : !hal.command_buffer
// Only record and run the dispatch when the executable format is supported.
cf.cond_br %_device_query_0, ^bb1, ^bb2
^bb1: // pred: ^bb0
// Binding 0 = the whole input buffer, binding 1 = the whole result buffer.
hal.command_buffer.push_descriptor_set<%cmd_1 : !hal.command_buffer> layout(%_pipeline_layout_0 : !hal.pipeline_layout)[%c0] bindings([
%c0 = (%1 : !hal.buffer)[%c0, %len],
%c1 = (%buffer_0 : !hal.buffer)[%c0, %c1048576]
])
// Export ordinal 0 with workgroup count (8, 16, 1).
hal.command_buffer.dispatch<%cmd_1 : !hal.command_buffer> target(%_executable__split_reduction_pass2_dispatch_0 : !hal.executable)[0] workgroups([%c8, %c16, %c1])
// Expected values: fill the upper half of %buffer_0 with 128. No barrier is
// recorded between the dispatch and this fill.
hal.command_buffer.fill_buffer<%cmd_1 : !hal.command_buffer> target(%buffer_0 : !hal.buffer)[%c524288, %c524288] pattern(%c128_i32 : i32)
hal.command_buffer.execution_barrier<%cmd_1 : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None")
hal.command_buffer.finalize<%cmd_1 : !hal.command_buffer>
%fence_2 = hal.fence.create device(%device : !hal.device) flags("None") : !hal.fence
hal.device.queue.execute<%device : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence_2) commands([%cmd_1])
%status_3 = hal.fence.await until([%fence_2]) timeout_millis(%c-1_i32) : i32
util.status.check_ok %status_3, "failed to wait on timepoint"
// %view covers the lower half (dispatch result), %view_4 the upper half
// (fill pattern); both are shaped 512x256.
%view = hal.buffer_view.create buffer(%buffer_0 : !hal.buffer)[%c0, %c524288] shape([%c512, %c256]) type(%c268435488_i32) encoding(%c1_i32) : !hal.buffer_view
%view_4 = hal.buffer_view.create buffer(%buffer_0 : !hal.buffer)[%c524288, %c524288] shape([%c512, %c256]) type(%c268435488_i32) encoding(%c1_i32) : !hal.buffer_view
check.expect_eq(%view, %view_4) : !hal.buffer_view
return
^bb2: // pred: ^bb0
// Taken when the device query reported false.
util.unreachable "device not supported in the compiled configuration"
}
}
// -----// IR Dump After Canonicalizer (canonicalize) ('builtin.module' operation) //----- //
// Attribute aliases referenced by the module that follows.
// Device target: the "llvm-cpu" HAL device with one "embedded-elf-x86_64" executable target; carries the legacy_sync flag.
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
// Executable target used by the @embedded_elf_x86_64 variant; same settings as the entry nested in the device target above.
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
// Pipeline layout: no push constants; descriptor set 0 with binding 0 (read-only storage buffer) and binding 1 (storage buffer).
#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>
// Codegen translation info attached to the executable export.
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
// Module-level slots written by the util.initializer; @_split_reduction_pass2 loads the query flag, pipeline layout and executable.
// Result of the "hal.executable.format" :: "embedded-elf-x86_64" device query.
util.global private @_device_query_0 : i1
// Descriptor set layout created in the initializer.
util.global private @_descriptor_set_layout_0 : !hal.descriptor_set_layout
// Pipeline layout built from the descriptor set layout above.
util.global private @_pipeline_layout_0 : !hal.pipeline_layout
// Executable handle for the dispatch; the initializer stores null here when the device query is false.
util.global private @_executable__split_reduction_pass2_dispatch_0 : !hal.executable
// Module initializer. Queries whether the device supports the "embedded-elf-x86_64"
// executable format, creates the descriptor set layout and pipeline layout, then either
// creates the executable or selects null, and stores each result into its util.global.
util.initializer {
%device = hal.ex.shared_device : !hal.device
// %value is the query result (false by default); %ok is not used anywhere in this block.
%ok, %value = hal.device.query<%device : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false
util.global.store %value, @_device_query_0 : i1
// The shared device is fetched again before each creation step in this dump.
%device_0 = hal.ex.shared_device : !hal.device
%descriptor_set_layout = hal.descriptor_set_layout.create device(%device_0 : !hal.device) flags("None") bindings([#hal.descriptor_set.binding<0, storage_buffer, ReadOnly>, #hal.descriptor_set.binding<1, storage_buffer>]) : !hal.descriptor_set_layout
util.global.store %descriptor_set_layout, @_descriptor_set_layout_0 : !hal.descriptor_set_layout
// The layout just stored is loaded back from its global before being used.
%_descriptor_set_layout_0 = util.global.load @_descriptor_set_layout_0 : !hal.descriptor_set_layout
%device_1 = hal.ex.shared_device : !hal.device
%pipeline_layout = hal.pipeline_layout.create device(%device_1 : !hal.device) push_constants(0) layouts([%_descriptor_set_layout_0]) : !hal.pipeline_layout
util.global.store %pipeline_layout, @_pipeline_layout_0 : !hal.pipeline_layout
%_device_query_0 = util.global.load @_device_query_0 : i1
%device_2 = hal.ex.shared_device : !hal.device
// Branch on the query result: ^bb1 creates the executable, ^bb2 produces null.
cf.cond_br %_device_query_0, ^bb1, ^bb2
^bb1: // pred: ^bb0
%_pipeline_layout_0 = util.global.load @_pipeline_layout_0 : !hal.pipeline_layout
%exe = hal.executable.create device(%device_2 : !hal.device) target(@_split_reduction_pass2_dispatch_0::@embedded_elf_x86_64) layouts([%_pipeline_layout_0]) : !hal.executable
cf.br ^bb3(%exe : !hal.executable)
^bb2: // pred: ^bb0
%0 = util.null : !hal.executable
cf.br ^bb3(%0 : !hal.executable)
// Join point: %1 is either the created executable or null.
^bb3(%1: !hal.executable): // 2 preds: ^bb1, ^bb2
util.global.store %1, @_executable__split_reduction_pass2_dispatch_0 : !hal.executable
util.initializer.return
}
// Public entry point carrying the iree.abi.stub attribute; it only forwards to the
// private @_split_reduction_pass2, which takes no arguments and returns nothing.
func.func @split_reduction_pass2() attributes {iree.abi.stub} {
call @_split_reduction_pass2() : () -> ()
return
}
// Executable holding the single dispatch of this module: one variant for the
// embedded-elf-x86_64 target, with one export (ordinal 0) and its LLVM-dialect body.
hal.executable private @_split_reduction_pass2_dispatch_0 {
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
// Export region: ignores its arguments and returns the constants (8, 16, 1). These are the
// same values passed as workgroups([%c8, %c16, %c1]) by the dispatch in @_split_reduction_pass2.
hal.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 ordinal(0) layout(#pipeline_layout) attributes {translation_info = #translation} {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index):
%c1 = arith.constant 1 : index
%c16 = arith.constant 16 : index
%c8 = arith.constant 8 : index
hal.return %c8, %c16, %c1 : index, index, index
}
builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} {
// Dispatch body. Reads two buffer pointers from the binding table in the dispatch state
// (%arg1, struct field 10) and two coordinates from the workgroup state (%arg2, fields 0
// and 1). For each element of a 32x32 output tile it sums 128 consecutive i32 values from
// the first buffer and adds that sum to the element already present in the second buffer.
// Always returns 0. %arg0 is not used in the body.
llvm.func internal @_split_reduction_pass2_dispatch_0_generic_512x256x128(%arg0: !llvm.ptr<struct<"iree_hal_executable_environment_v0_t", (ptr<i32>, ptr<func<i32 (ptr<func<i32 (ptr<i8>)>>, ptr<i8>)>>, ptr<ptr<func<i32 (ptr<i8>)>>>, struct<"iree_hal_processor_v0_t", (array<8 x i64>)>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>> {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {sym_visibility = "private"} {
// Constants. Several values appear twice, once with an i64 attribute and once with an
// index attribute (for example 128, 256, 32768, 0, 1, 2, 3).
%0 = llvm.mlir.constant(0 : i32) : i32
%1 = llvm.mlir.constant(3 : i64) : i64
%2 = llvm.mlir.constant(2 : i64) : i64
%3 = llvm.mlir.constant(32768 : i64) : i64
%4 = llvm.mlir.constant(128 : i64) : i64
%5 = llvm.mlir.constant(256 : i64) : i64
%6 = llvm.mlir.constant(1 : i64) : i64
%7 = llvm.mlir.constant(63 : index) : i64
%8 = llvm.mlir.constant(128 : index) : i64
%9 = llvm.mlir.constant(256 : index) : i64
%10 = llvm.mlir.constant(32768 : index) : i64
%11 = llvm.mlir.constant(0 : i64) : i64
%12 = llvm.mlir.constant(dense<0> : vector<4xi32>) : vector<4xi32>
%13 = llvm.mlir.constant(2 : index) : i64
%14 = llvm.mlir.constant(3 : index) : i64
%15 = llvm.mlir.constant(dense<0> : vector<1xi32>) : vector<1xi32>
%16 = llvm.mlir.constant(dense<0> : vector<1x1x4xi32>) : !llvm.array<1 x array<1 x vector<4xi32>>>
%17 = llvm.mlir.constant(4 : index) : i64
%18 = llvm.mlir.constant(1 : index) : i64
%19 = llvm.mlir.constant(32 : index) : i64
%20 = llvm.mlir.constant(0 : index) : i64
// Binding 0: first pointer in the binding table, viewed as i32* (%24). The assume states
// that (address & 63) == 0.
%21 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>>
%22 = llvm.extractvalue %21[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>
%23 = llvm.load %22 : !llvm.ptr<ptr<i8>>
%24 = llvm.bitcast %23 : !llvm.ptr<i8> to !llvm.ptr<i32>
%25 = llvm.ptrtoint %24 : !llvm.ptr<i32> to i64
%26 = llvm.and %25, %7 : i64
%27 = llvm.icmp "eq" %26, %20 : i64
"llvm.intr.assume"(%27) : (i1) -> ()
// Binding 1: second pointer in the binding table (GEP index 1), viewed as i32* (%32),
// with the same (address & 63) == 0 assumption. The dispatch state is loaded again here.
%28 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>>
%29 = llvm.extractvalue %28[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>
%30 = llvm.getelementptr %29[1] : (!llvm.ptr<ptr<i8>>) -> !llvm.ptr<ptr<i8>>
%31 = llvm.load %30 : !llvm.ptr<ptr<i8>>
%32 = llvm.bitcast %31 : !llvm.ptr<i8> to !llvm.ptr<i32>
%33 = llvm.ptrtoint %32 : !llvm.ptr<i32> to i64
%34 = llvm.and %33, %7 : i64
%35 = llvm.icmp "eq" %34, %20 : i64
"llvm.intr.assume"(%35) : (i1) -> ()
// Fields 0 and 1 of the workgroup state, zero-extended to i64 (presumably the workgroup
// id in x and y; confirm against the struct definition).
%36 = llvm.load %arg2 : !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>>
%37 = llvm.extractvalue %36[0] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>
%38 = llvm.zext %37 : i32 to i64
%39 = llvm.extractvalue %36[1] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>
%40 = llvm.zext %39 : i32 to i64
// Tile origin: %41 = field1 * 32, %42 = field0 * 32.
// %46 = %41 * 256 + %42 is the tile's base element offset into binding 1.
// %52 = %41 * 32768 + %42 * 128 is the tile's base element offset into binding 0.
%41 = llvm.mul %40, %19 : i64
%42 = llvm.mul %38, %19 : i64
%43 = llvm.mul %41, %9 : i64
%44 = llvm.add %43, %20 : i64
%45 = llvm.mul %42, %18 : i64
%46 = llvm.add %44, %45 : i64
%47 = llvm.mul %41, %10 : i64
%48 = llvm.add %47, %20 : i64
%49 = llvm.mul %42, %8 : i64
%50 = llvm.add %48, %49 : i64
%51 = llvm.mul %11, %18 : i64
%52 = llvm.add %50, %51 : i64
llvm.br ^bb1(%20 : i64)
// Loop header: %53 counts from 0 while < 32, step 1 (incremented in ^bb10).
^bb1(%53: i64): // 2 preds: ^bb0, ^bb10
%54 = llvm.icmp "slt" %53, %19 : i64
llvm.cond_br %54, ^bb2(%20 : i64), ^bb11
// Loop header: %55 counts from 0 while < 32, step 4 (incremented in ^bb9).
^bb2(%55: i64): // 2 preds: ^bb1, ^bb9
%56 = llvm.icmp "slt" %55, %19 : i64
llvm.cond_br %56, ^bb3, ^bb10
^bb3: // pred: ^bb2
// Offsets for this (%53, %55): %62 = %52 + %53 * 32768 + %55 * 128 into binding 0,
// %66 = %46 + %53 * 256 + %55 into binding 1.
%57 = llvm.mul %53, %3 : i64
%58 = llvm.add %52, %57 : i64
%59 = llvm.mul %55, %4 : i64
%60 = llvm.add %58, %59 : i64
%61 = llvm.mul %11, %6 : i64
%62 = llvm.add %60, %61 : i64
%63 = llvm.mul %53, %5 : i64
%64 = llvm.add %46, %63 : i64
%65 = llvm.mul %55, %6 : i64
%66 = llvm.add %64, %65 : i64
llvm.br ^bb4(%20 : i64)
// Loop header: %67 counts from 0 while < 4, step 1 (incremented in ^bb8).
^bb4(%67: i64): // 2 preds: ^bb3, ^bb8
%68 = llvm.icmp "slt" %67, %17 : i64
llvm.cond_br %68, ^bb5, ^bb9
^bb5: // pred: ^bb4
// %73 = %62 + %67 * 128 is the start of the 128-element run reduced for this output
// element. The accumulator handed to ^bb6 starts as the all-zero constant %16.
%69 = llvm.mul %11, %3 : i64
%70 = llvm.add %62, %69 : i64
%71 = llvm.mul %67, %4 : i64
%72 = llvm.add %70, %71 : i64
%73 = llvm.add %72, %61 : i64
llvm.br ^bb6(%20, %16 : i64, !llvm.array<1 x array<1 x vector<4xi32>>>)
// Loop header: %74 counts from 0 while < 32, step 1; %75 carries the 4-lane running sum.
^bb6(%74: i64, %75: !llvm.array<1 x array<1 x vector<4xi32>>>): // 2 preds: ^bb5, ^bb7
%76 = llvm.icmp "slt" %74, %19 : i64
llvm.cond_br %76, ^bb7, ^bb8
^bb7: // pred: ^bb6
// Load 4 consecutive i32 from binding 0 at element offset %73 + %74 * 4 (the load is
// annotated with alignment 4) and add them to the accumulator one lane at a time.
%77 = llvm.mul %20, %10 : i64
%78 = llvm.add %73, %77 : i64
%79 = llvm.mul %20, %8 : i64
%80 = llvm.add %78, %79 : i64
%81 = llvm.mul %74, %17 : i64
%82 = llvm.add %80, %81 : i64
%83 = llvm.add %82, %20 : i64
%84 = llvm.getelementptr %24[%83] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32>
%85 = llvm.bitcast %84 : !llvm.ptr<i32> to !llvm.ptr<vector<4xi32>>
%86 = llvm.load %85 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>>
%87 = llvm.extractvalue %75[0, 0] : !llvm.array<1 x array<1 x vector<4xi32>>>
// Lanes 0..3: loaded lane + accumulator lane, gathered into %103.
%88 = llvm.extractelement %87[%11 : i64] : vector<4xi32>
%89 = llvm.extractelement %86[%11 : i64] : vector<4xi32>
%90 = llvm.add %89, %88 : i32
%91 = llvm.insertelement %90, %12[%20 : i64] : vector<4xi32>
%92 = llvm.extractelement %87[%6 : i64] : vector<4xi32>
%93 = llvm.extractelement %86[%6 : i64] : vector<4xi32>
%94 = llvm.add %93, %92 : i32
%95 = llvm.insertelement %94, %91[%18 : i64] : vector<4xi32>
%96 = llvm.extractelement %87[%2 : i64] : vector<4xi32>
%97 = llvm.extractelement %86[%2 : i64] : vector<4xi32>
%98 = llvm.add %97, %96 : i32
%99 = llvm.insertelement %98, %95[%13 : i64] : vector<4xi32>
%100 = llvm.extractelement %87[%1 : i64] : vector<4xi32>
%101 = llvm.extractelement %86[%1 : i64] : vector<4xi32>
%102 = llvm.add %101, %100 : i32
%103 = llvm.insertelement %102, %99[%14 : i64] : vector<4xi32>
// Copy the four lanes of %103 into a fresh array-wrapped vector (%116), the next accumulator.
%104 = llvm.extractelement %103[%11 : i64] : vector<4xi32>
%105 = llvm.extractvalue %16[0, 0] : !llvm.array<1 x array<1 x vector<4xi32>>>
%106 = llvm.insertelement %104, %105[%11 : i64] : vector<4xi32>
%107 = llvm.insertvalue %106, %16[0, 0] : !llvm.array<1 x array<1 x vector<4xi32>>>
%108 = llvm.extractelement %103[%6 : i64] : vector<4xi32>
%109 = llvm.insertelement %108, %106[%6 : i64] : vector<4xi32>
%110 = llvm.insertvalue %109, %107[0, 0] : !llvm.array<1 x array<1 x vector<4xi32>>>
%111 = llvm.extractelement %103[%2 : i64] : vector<4xi32>
%112 = llvm.insertelement %111, %109[%2 : i64] : vector<4xi32>
%113 = llvm.insertvalue %112, %110[0, 0] : !llvm.array<1 x array<1 x vector<4xi32>>>
%114 = llvm.extractelement %103[%1 : i64] : vector<4xi32>
%115 = llvm.insertelement %114, %112[%1 : i64] : vector<4xi32>
%116 = llvm.insertvalue %115, %113[0, 0] : !llvm.array<1 x array<1 x vector<4xi32>>>
%117 = llvm.add %74, %18 : i64
llvm.br ^bb6(%117, %116 : i64, !llvm.array<1 x array<1 x vector<4xi32>>>)
^bb8: // pred: ^bb6
// Sum the 4 accumulator lanes, add the value already stored at binding 1 element
// %66 + %67, and store the total back to that element.
// NOTE(review): the prior contents of binding 1 are read here (%122), so the stored
// result depends on how the caller initialized that buffer.
%118 = llvm.mul %20, %9 : i64
%119 = llvm.add %66, %118 : i64
%120 = llvm.add %119, %67 : i64
%121 = llvm.getelementptr %32[%120] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32>
%122 = llvm.load %121 : !llvm.ptr<i32>
%123 = llvm.extractvalue %75[0, 0] : !llvm.array<1 x array<1 x vector<4xi32>>>
%124 = "llvm.intr.vector.reduce.add"(%123) : (vector<4xi32>) -> i32
%125 = llvm.add %122, %124 : i32
%126 = llvm.insertelement %125, %15[%20 : i64] : vector<1xi32>
%127 = llvm.extractelement %126[%11 : i64] : vector<1xi32>
llvm.store %127, %121 : !llvm.ptr<i32>
%128 = llvm.add %67, %18 : i64
llvm.br ^bb4(%128 : i64)
^bb9: // pred: ^bb4
%129 = llvm.add %55, %17 : i64
llvm.br ^bb2(%129 : i64)
^bb10: // pred: ^bb2
%130 = llvm.add %53, %18 : i64
llvm.br ^bb1(%130 : i64)
^bb11: // pred: ^bb1
// All loops finished: return the i32 constant 0.
llvm.return %0 : i32
}
}
}
}
// Test body. Fills a 67108864-byte buffer with the i32 pattern 1, dispatches the kernel
// with that buffer as binding 0 and a 1048576-byte buffer as binding 1, fills the second
// half of the 1048576-byte buffer with the i32 pattern 128, and checks that a 512x256
// view of the first half equals a 512x256 view of the second half.
func.func private @_split_reduction_pass2() {
%c1 = arith.constant 1 : index
%c256 = arith.constant 256 : index
%c512 = arith.constant 512 : index
%c128_i32 = arith.constant 128 : i32
%c1_i32 = arith.constant 1 : i32
%c67108864 = arith.constant 67108864 : index
%c524288 = arith.constant 524288 : index
%c0 = arith.constant 0 : index
%c1048576 = arith.constant 1048576 : index
%c-1_i64 = arith.constant -1 : i64
%c-1_i32 = arith.constant -1 : i32
%c16 = arith.constant 16 : index
%c8 = arith.constant 8 : index
%c268435488_i32 = arith.constant 268435488 : i32
// State published by the module initializer.
%_device_query_0 = util.global.load @_device_query_0 : i1
%_pipeline_layout_0 = util.global.load @_pipeline_layout_0 : !hal.pipeline_layout
%_executable__split_reduction_pass2_dispatch_0 = util.global.load @_executable__split_reduction_pass2_dispatch_0 : !hal.executable
%device = hal.ex.shared_device : !hal.device
%allocator = hal.device.allocator<%device : !hal.device> : !hal.allocator
// Input buffer: filled with the i32 pattern 1 by a transfer-only command buffer, which is
// submitted and awaited before anything else runs.
%buffer = hal.allocator.allocate<%allocator : !hal.allocator> type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c67108864}
%cmd = hal.command_buffer.create device(%device : !hal.device) mode("OneShot|AllowInlineExecution") categories(Transfer) : !hal.command_buffer
hal.command_buffer.fill_buffer<%cmd : !hal.command_buffer> target(%buffer : !hal.buffer)[%c0, %c67108864] pattern(%c1_i32 : i32)
hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None")
hal.command_buffer.finalize<%cmd : !hal.command_buffer>
// Null fence used as the wait fence for both queue submissions in this function.
%0 = util.null : !hal.fence
%fence = hal.fence.create device(%device : !hal.device) flags("None") : !hal.fence
hal.device.queue.execute<%device : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) commands([%cmd])
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.status.check_ok %status, "failed to wait on timepoint"
// The filled buffer goes through util.do_not_optimize and its length is queried at
// runtime (%len) instead of reusing the constant size.
%1 = util.do_not_optimize(%buffer) : !hal.buffer
%len = hal.buffer.length<%1 : !hal.buffer> : index
// Result buffer: bytes [0, 524288) are later viewed as the dispatch output (%view) and
// bytes [524288, 1048576) as the expected values (%view_4).
%buffer_0 = hal.allocator.allocate<%allocator : !hal.allocator> type("HostVisible|DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage|MappingScoped|MappingAccessRandom|Mapping") : !hal.buffer{%c1048576}
%cmd_1 = hal.command_buffer.create device(%device : !hal.device) mode("OneShot|AllowInlineExecution") categories("Transfer|Dispatch") : !hal.command_buffer
// Only record the dispatch when the device reported support for the executable format.
cf.cond_br %_device_query_0, ^bb1, ^bb2
^bb1: // pred: ^bb0
// Binding 0 = whole input buffer, binding 1 = whole result buffer.
// NOTE(review): no op in this function writes bytes [0, 524288) of %buffer_0 before the
// dispatch, while the llvm.func in @_split_reduction_pass2_dispatch_0 loads each binding 1
// element before adding to it and storing. Confirm the initial contents are defined.
hal.command_buffer.push_descriptor_set<%cmd_1 : !hal.command_buffer> layout(%_pipeline_layout_0 : !hal.pipeline_layout)[%c0] bindings([
%c0 = (%1 : !hal.buffer)[%c0, %len],
%c1 = (%buffer_0 : !hal.buffer)[%c0, %c1048576]
])
hal.command_buffer.dispatch<%cmd_1 : !hal.command_buffer> target(%_executable__split_reduction_pass2_dispatch_0 : !hal.executable)[0] workgroups([%c8, %c16, %c1])
// Write the expected value (i32 pattern 128) into the second half of the result buffer.
hal.command_buffer.fill_buffer<%cmd_1 : !hal.command_buffer> target(%buffer_0 : !hal.buffer)[%c524288, %c524288] pattern(%c128_i32 : i32)
hal.command_buffer.execution_barrier<%cmd_1 : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None")
hal.command_buffer.finalize<%cmd_1 : !hal.command_buffer>
%fence_2 = hal.fence.create device(%device : !hal.device) flags("None") : !hal.fence
hal.device.queue.execute<%device : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence_2) commands([%cmd_1])
%status_3 = hal.fence.await until([%fence_2]) timeout_millis(%c-1_i32) : i32
util.status.check_ok %status_3, "failed to wait on timepoint"
// Two 512x256 views over the same buffer (element type code 268435488, encoding 1):
// %view covers the first 524288 bytes, %view_4 the second 524288 bytes.
%view = hal.buffer_view.create buffer(%buffer_0 : !hal.buffer)[%c0, %c524288] shape([%c512, %c256]) type(%c268435488_i32) encoding(%c1_i32) : !hal.buffer_view
%view_4 = hal.buffer_view.create buffer(%buffer_0 : !hal.buffer)[%c524288, %c524288] shape([%c512, %c256]) type(%c268435488_i32) encoding(%c1_i32) : !hal.buffer_view
check.expect_eq(%view, %view_4) : !hal.buffer_view
return
^bb2: // pred: ^bb0
// Reached when the device query was false.
util.unreachable "device not supported in the compiled configuration"
}
}
// -----// IR Dump After CSE (cse) ('builtin.module' operation) //----- //
// Attribute aliases referenced by the module that follows.
// Device target: the "llvm-cpu" HAL device with one "embedded-elf-x86_64" executable target; carries the legacy_sync flag.
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
// Executable target used by the @embedded_elf_x86_64 variant; same settings as the entry nested in the device target above.
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
// Pipeline layout: no push constants; descriptor set 0 with binding 0 (read-only storage buffer) and binding 1 (storage buffer).
#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>
// Codegen translation info attached to the executable export.
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
// Module-level slots written by the util.initializer; @_split_reduction_pass2 loads the query flag, pipeline layout and executable.
// Result of the "hal.executable.format" :: "embedded-elf-x86_64" device query.
util.global private @_device_query_0 : i1
// Descriptor set layout created in the initializer.
util.global private @_descriptor_set_layout_0 : !hal.descriptor_set_layout
// Pipeline layout built from the descriptor set layout above.
util.global private @_pipeline_layout_0 : !hal.pipeline_layout
// Executable handle for the dispatch; the initializer stores null here when the device query is false.
util.global private @_executable__split_reduction_pass2_dispatch_0 : !hal.executable
// Module initializer. Queries whether the device supports the "embedded-elf-x86_64"
// executable format, creates the descriptor set layout and pipeline layout, then either
// creates the executable or selects null, and stores each result into its util.global.
// In this dump a single %device value is used by every op that needs the device.
util.initializer {
%device = hal.ex.shared_device : !hal.device
// %value is the query result (false by default); %ok is not used anywhere in this block.
%ok, %value = hal.device.query<%device : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false
util.global.store %value, @_device_query_0 : i1
%descriptor_set_layout = hal.descriptor_set_layout.create device(%device : !hal.device) flags("None") bindings([#hal.descriptor_set.binding<0, storage_buffer, ReadOnly>, #hal.descriptor_set.binding<1, storage_buffer>]) : !hal.descriptor_set_layout
util.global.store %descriptor_set_layout, @_descriptor_set_layout_0 : !hal.descriptor_set_layout
// The layout just stored is loaded back from its global before being used.
%_descriptor_set_layout_0 = util.global.load @_descriptor_set_layout_0 : !hal.descriptor_set_layout
%pipeline_layout = hal.pipeline_layout.create device(%device : !hal.device) push_constants(0) layouts([%_descriptor_set_layout_0]) : !hal.pipeline_layout
util.global.store %pipeline_layout, @_pipeline_layout_0 : !hal.pipeline_layout
%_device_query_0 = util.global.load @_device_query_0 : i1
// Branch on the query result: ^bb1 creates the executable, ^bb2 produces null.
cf.cond_br %_device_query_0, ^bb1, ^bb2
^bb1: // pred: ^bb0
%_pipeline_layout_0 = util.global.load @_pipeline_layout_0 : !hal.pipeline_layout
%exe = hal.executable.create device(%device : !hal.device) target(@_split_reduction_pass2_dispatch_0::@embedded_elf_x86_64) layouts([%_pipeline_layout_0]) : !hal.executable
cf.br ^bb3(%exe : !hal.executable)
^bb2: // pred: ^bb0
%0 = util.null : !hal.executable
cf.br ^bb3(%0 : !hal.executable)
// Join point: %1 is either the created executable or null.
^bb3(%1: !hal.executable): // 2 preds: ^bb1, ^bb2
util.global.store %1, @_executable__split_reduction_pass2_dispatch_0 : !hal.executable
util.initializer.return
}
// Public entry point carrying the iree.abi.stub attribute; it only forwards to the
// private @_split_reduction_pass2, which takes no arguments and returns nothing.
func.func @split_reduction_pass2() attributes {iree.abi.stub} {
call @_split_reduction_pass2() : () -> ()
return
}
// Executable holding the single dispatch of this module: one variant for the
// embedded-elf-x86_64 target, with one export (ordinal 0) and its LLVM-dialect body.
hal.executable private @_split_reduction_pass2_dispatch_0 {
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
// Export region: ignores its arguments and returns the constants (8, 16, 1). These are the
// same values passed as workgroups([%c8, %c16, %c1]) by the dispatch in @_split_reduction_pass2.
hal.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 ordinal(0) layout(#pipeline_layout) attributes {translation_info = #translation} {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index):
%c1 = arith.constant 1 : index
%c16 = arith.constant 16 : index
%c8 = arith.constant 8 : index
hal.return %c8, %c16, %c1 : index, index, index
}
builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} {
// Dispatch body. Reads two buffer pointers from the binding table in the dispatch state
// (%arg1, struct field 10) and two coordinates from the workgroup state (%arg2, fields 0
// and 1). For each element of a 32x32 output tile it sums 128 consecutive i32 values from
// the first buffer and adds that sum to the element already present in the second buffer.
// Always returns 0. %arg0 is not used in the body.
llvm.func internal @_split_reduction_pass2_dispatch_0_generic_512x256x128(%arg0: !llvm.ptr<struct<"iree_hal_executable_environment_v0_t", (ptr<i32>, ptr<func<i32 (ptr<func<i32 (ptr<i8>)>>, ptr<i8>)>>, ptr<ptr<func<i32 (ptr<i8>)>>>, struct<"iree_hal_processor_v0_t", (array<8 x i64>)>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>> {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {sym_visibility = "private"} {
// Constants. Several values appear twice, once with an i64 attribute and once with an
// index attribute (for example 128, 256, 32768, 0, 1, 2, 3).
%0 = llvm.mlir.constant(0 : i32) : i32
%1 = llvm.mlir.constant(3 : i64) : i64
%2 = llvm.mlir.constant(2 : i64) : i64
%3 = llvm.mlir.constant(32768 : i64) : i64
%4 = llvm.mlir.constant(128 : i64) : i64
%5 = llvm.mlir.constant(256 : i64) : i64
%6 = llvm.mlir.constant(1 : i64) : i64
%7 = llvm.mlir.constant(63 : index) : i64
%8 = llvm.mlir.constant(128 : index) : i64
%9 = llvm.mlir.constant(256 : index) : i64
%10 = llvm.mlir.constant(32768 : index) : i64
%11 = llvm.mlir.constant(0 : i64) : i64
%12 = llvm.mlir.constant(dense<0> : vector<4xi32>) : vector<4xi32>
%13 = llvm.mlir.constant(2 : index) : i64
%14 = llvm.mlir.constant(3 : index) : i64
%15 = llvm.mlir.constant(dense<0> : vector<1xi32>) : vector<1xi32>
%16 = llvm.mlir.constant(dense<0> : vector<1x1x4xi32>) : !llvm.array<1 x array<1 x vector<4xi32>>>
%17 = llvm.mlir.constant(4 : index) : i64
%18 = llvm.mlir.constant(1 : index) : i64
%19 = llvm.mlir.constant(32 : index) : i64
%20 = llvm.mlir.constant(0 : index) : i64
// Binding 0: first pointer in the binding table, viewed as i32* (%24). The assume states
// that (address & 63) == 0.
%21 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>>
%22 = llvm.extractvalue %21[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>
%23 = llvm.load %22 : !llvm.ptr<ptr<i8>>
%24 = llvm.bitcast %23 : !llvm.ptr<i8> to !llvm.ptr<i32>
%25 = llvm.ptrtoint %24 : !llvm.ptr<i32> to i64
%26 = llvm.and %25, %7 : i64
%27 = llvm.icmp "eq" %26, %20 : i64
"llvm.intr.assume"(%27) : (i1) -> ()
// Binding 1: second pointer in the binding table (GEP index 1), viewed as i32* (%32),
// with the same (address & 63) == 0 assumption. The dispatch state is loaded again here.
%28 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>>
%29 = llvm.extractvalue %28[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>
%30 = llvm.getelementptr %29[1] : (!llvm.ptr<ptr<i8>>) -> !llvm.ptr<ptr<i8>>
%31 = llvm.load %30 : !llvm.ptr<ptr<i8>>
%32 = llvm.bitcast %31 : !llvm.ptr<i8> to !llvm.ptr<i32>
%33 = llvm.ptrtoint %32 : !llvm.ptr<i32> to i64
%34 = llvm.and %33, %7 : i64
%35 = llvm.icmp "eq" %34, %20 : i64
"llvm.intr.assume"(%35) : (i1) -> ()
// Fields 0 and 1 of the workgroup state, zero-extended to i64 (presumably the workgroup
// id in x and y; confirm against the struct definition).
%36 = llvm.load %arg2 : !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>>
%37 = llvm.extractvalue %36[0] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>
%38 = llvm.zext %37 : i32 to i64
%39 = llvm.extractvalue %36[1] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>
%40 = llvm.zext %39 : i32 to i64
// Tile origin: %41 = field1 * 32, %42 = field0 * 32.
// %46 = %41 * 256 + %42 is the tile's base element offset into binding 1.
// %52 = %41 * 32768 + %42 * 128 is the tile's base element offset into binding 0.
%41 = llvm.mul %40, %19 : i64
%42 = llvm.mul %38, %19 : i64
%43 = llvm.mul %41, %9 : i64
%44 = llvm.add %43, %20 : i64
%45 = llvm.mul %42, %18 : i64
%46 = llvm.add %44, %45 : i64
%47 = llvm.mul %41, %10 : i64
%48 = llvm.add %47, %20 : i64
%49 = llvm.mul %42, %8 : i64
%50 = llvm.add %48, %49 : i64
%51 = llvm.mul %11, %18 : i64
%52 = llvm.add %50, %51 : i64
llvm.br ^bb1(%20 : i64)
// Loop header: %53 counts from 0 while < 32, step 1 (incremented in ^bb10).
^bb1(%53: i64): // 2 preds: ^bb0, ^bb10
%54 = llvm.icmp "slt" %53, %19 : i64
llvm.cond_br %54, ^bb2(%20 : i64), ^bb11
// Loop header: %55 counts from 0 while < 32, step 4 (incremented in ^bb9).
^bb2(%55: i64): // 2 preds: ^bb1, ^bb9
%56 = llvm.icmp "slt" %55, %19 : i64
llvm.cond_br %56, ^bb3, ^bb10
^bb3: // pred: ^bb2
// Offsets for this (%53, %55): %62 = %52 + %53 * 32768 + %55 * 128 into binding 0,
// %66 = %46 + %53 * 256 + %55 into binding 1.
%57 = llvm.mul %53, %3 : i64
%58 = llvm.add %52, %57 : i64
%59 = llvm.mul %55, %4 : i64
%60 = llvm.add %58, %59 : i64
%61 = llvm.mul %11, %6 : i64
%62 = llvm.add %60, %61 : i64
%63 = llvm.mul %53, %5 : i64
%64 = llvm.add %46, %63 : i64
%65 = llvm.mul %55, %6 : i64
%66 = llvm.add %64, %65 : i64
llvm.br ^bb4(%20 : i64)
// Loop header: %67 counts from 0 while < 4, step 1 (incremented in ^bb8).
^bb4(%67: i64): // 2 preds: ^bb3, ^bb8
%68 = llvm.icmp "slt" %67, %17 : i64
llvm.cond_br %68, ^bb5, ^bb9
^bb5: // pred: ^bb4
// %73 = %62 + %67 * 128 is the start of the 128-element run reduced for this output
// element. The accumulator handed to ^bb6 starts as the all-zero constant %16.
%69 = llvm.mul %11, %3 : i64
%70 = llvm.add %62, %69 : i64
%71 = llvm.mul %67, %4 : i64
%72 = llvm.add %70, %71 : i64
%73 = llvm.add %72, %61 : i64
llvm.br ^bb6(%20, %16 : i64, !llvm.array<1 x array<1 x vector<4xi32>>>)
// Loop header: %74 counts from 0 while < 32, step 1; %75 carries the 4-lane running sum.
^bb6(%74: i64, %75: !llvm.array<1 x array<1 x vector<4xi32>>>): // 2 preds: ^bb5, ^bb7
%76 = llvm.icmp "slt" %74, %19 : i64
llvm.cond_br %76, ^bb7, ^bb8
^bb7: // pred: ^bb6
// Load 4 consecutive i32 from binding 0 at element offset %73 + %74 * 4 (the load is
// annotated with alignment 4) and add them to the accumulator one lane at a time.
%77 = llvm.mul %20, %10 : i64
%78 = llvm.add %73, %77 : i64
%79 = llvm.mul %20, %8 : i64
%80 = llvm.add %78, %79 : i64
%81 = llvm.mul %74, %17 : i64
%82 = llvm.add %80, %81 : i64
%83 = llvm.add %82, %20 : i64
%84 = llvm.getelementptr %24[%83] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32>
%85 = llvm.bitcast %84 : !llvm.ptr<i32> to !llvm.ptr<vector<4xi32>>
%86 = llvm.load %85 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>>
%87 = llvm.extractvalue %75[0, 0] : !llvm.array<1 x array<1 x vector<4xi32>>>
// Lanes 0..3: loaded lane + accumulator lane, gathered into %103.
%88 = llvm.extractelement %87[%11 : i64] : vector<4xi32>
%89 = llvm.extractelement %86[%11 : i64] : vector<4xi32>
%90 = llvm.add %89, %88 : i32
%91 = llvm.insertelement %90, %12[%20 : i64] : vector<4xi32>
%92 = llvm.extractelement %87[%6 : i64] : vector<4xi32>
%93 = llvm.extractelement %86[%6 : i64] : vector<4xi32>
%94 = llvm.add %93, %92 : i32
%95 = llvm.insertelement %94, %91[%18 : i64] : vector<4xi32>
%96 = llvm.extractelement %87[%2 : i64] : vector<4xi32>
%97 = llvm.extractelement %86[%2 : i64] : vector<4xi32>
%98 = llvm.add %97, %96 : i32
%99 = llvm.insertelement %98, %95[%13 : i64] : vector<4xi32>
%100 = llvm.extractelement %87[%1 : i64] : vector<4xi32>
%101 = llvm.extractelement %86[%1 : i64] : vector<4xi32>
%102 = llvm.add %101, %100 : i32
%103 = llvm.insertelement %102, %99[%14 : i64] : vector<4xi32>
// Copy the four lanes of %103 into a fresh array-wrapped vector (%116), the next accumulator.
%104 = llvm.extractelement %103[%11 : i64] : vector<4xi32>
%105 = llvm.extractvalue %16[0, 0] : !llvm.array<1 x array<1 x vector<4xi32>>>
%106 = llvm.insertelement %104, %105[%11 : i64] : vector<4xi32>
%107 = llvm.insertvalue %106, %16[0, 0] : !llvm.array<1 x array<1 x vector<4xi32>>>
%108 = llvm.extractelement %103[%6 : i64] : vector<4xi32>
%109 = llvm.insertelement %108, %106[%6 : i64] : vector<4xi32>
%110 = llvm.insertvalue %109, %107[0, 0] : !llvm.array<1 x array<1 x vector<4xi32>>>
%111 = llvm.extractelement %103[%2 : i64] : vector<4xi32>
%112 = llvm.insertelement %111, %109[%2 : i64] : vector<4xi32>
%113 = llvm.insertvalue %112, %110[0, 0] : !llvm.array<1 x array<1 x vector<4xi32>>>
%114 = llvm.extractelement %103[%1 : i64] : vector<4xi32>
%115 = llvm.insertelement %114, %112[%1 : i64] : vector<4xi32>
%116 = llvm.insertvalue %115, %113[0, 0] : !llvm.array<1 x array<1 x vector<4xi32>>>
%117 = llvm.add %74, %18 : i64
llvm.br ^bb6(%117, %116 : i64, !llvm.array<1 x array<1 x vector<4xi32>>>)
^bb8: // pred: ^bb6
// Sum the 4 accumulator lanes, add the value already stored at binding 1 element
// %66 + %67, and store the total back to that element.
// NOTE(review): the prior contents of binding 1 are read here (%122), so the stored
// result depends on how the caller initialized that buffer.
%118 = llvm.mul %20, %9 : i64
%119 = llvm.add %66, %118 : i64
%120 = llvm.add %119, %67 : i64
%121 = llvm.getelementptr %32[%120] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32>
%122 = llvm.load %121 : !llvm.ptr<i32>
%123 = llvm.extractvalue %75[0, 0] : !llvm.array<1 x array<1 x vector<4xi32>>>
%124 = "llvm.intr.vector.reduce.add"(%123) : (vector<4xi32>) -> i32
%125 = llvm.add %122, %124 : i32
%126 = llvm.insertelement %125, %15[%20 : i64] : vector<1xi32>
%127 = llvm.extractelement %126[%11 : i64] : vector<1xi32>
llvm.store %127, %121 : !llvm.ptr<i32>
%128 = llvm.add %67, %18 : i64
llvm.br ^bb4(%128 : i64)
^bb9: // pred: ^bb4
%129 = llvm.add %55, %17 : i64
llvm.br ^bb2(%129 : i64)
^bb10: // pred: ^bb2
%130 = llvm.add %53, %18 : i64
llvm.br ^bb1(%130 : i64)
^bb11: // pred: ^bb1
// All loops finished: return the i32 constant 0.
llvm.return %0 : i32
}
}
}
}
// Test body. Fills a 67108864-byte buffer with the i32 pattern 1, dispatches the kernel
// with that buffer as binding 0 and a 1048576-byte buffer as binding 1, fills the second
// half of the 1048576-byte buffer with the i32 pattern 128, and checks that a 512x256
// view of the first half equals a 512x256 view of the second half.
func.func private @_split_reduction_pass2() {
%c1 = arith.constant 1 : index
%c256 = arith.constant 256 : index
%c512 = arith.constant 512 : index
%c128_i32 = arith.constant 128 : i32
%c1_i32 = arith.constant 1 : i32
%c67108864 = arith.constant 67108864 : index
%c524288 = arith.constant 524288 : index
%c0 = arith.constant 0 : index
%c1048576 = arith.constant 1048576 : index
%c-1_i64 = arith.constant -1 : i64
%c-1_i32 = arith.constant -1 : i32
%c16 = arith.constant 16 : index
%c8 = arith.constant 8 : index
%c268435488_i32 = arith.constant 268435488 : i32
// State published by the module initializer.
%_device_query_0 = util.global.load @_device_query_0 : i1
%_pipeline_layout_0 = util.global.load @_pipeline_layout_0 : !hal.pipeline_layout
%_executable__split_reduction_pass2_dispatch_0 = util.global.load @_executable__split_reduction_pass2_dispatch_0 : !hal.executable
%device = hal.ex.shared_device : !hal.device
%allocator = hal.device.allocator<%device : !hal.device> : !hal.allocator
// Input buffer: filled with the i32 pattern 1 by a transfer-only command buffer, which is
// submitted and awaited before anything else runs.
%buffer = hal.allocator.allocate<%allocator : !hal.allocator> type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c67108864}
%cmd = hal.command_buffer.create device(%device : !hal.device) mode("OneShot|AllowInlineExecution") categories(Transfer) : !hal.command_buffer
hal.command_buffer.fill_buffer<%cmd : !hal.command_buffer> target(%buffer : !hal.buffer)[%c0, %c67108864] pattern(%c1_i32 : i32)
hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None")
hal.command_buffer.finalize<%cmd : !hal.command_buffer>
// Null fence used as the wait fence for both queue submissions in this function.
%0 = util.null : !hal.fence
%fence = hal.fence.create device(%device : !hal.device) flags("None") : !hal.fence
hal.device.queue.execute<%device : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) commands([%cmd])
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.status.check_ok %status, "failed to wait on timepoint"
// The filled buffer goes through util.do_not_optimize and its length is queried at
// runtime (%len) instead of reusing the constant size.
%1 = util.do_not_optimize(%buffer) : !hal.buffer
%len = hal.buffer.length<%1 : !hal.buffer> : index
// Result buffer: bytes [0, 524288) are later viewed as the dispatch output (%view) and
// bytes [524288, 1048576) as the expected values (%view_4).
%buffer_0 = hal.allocator.allocate<%allocator : !hal.allocator> type("HostVisible|DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage|MappingScoped|MappingAccessRandom|Mapping") : !hal.buffer{%c1048576}
%cmd_1 = hal.command_buffer.create device(%device : !hal.device) mode("OneShot|AllowInlineExecution") categories("Transfer|Dispatch") : !hal.command_buffer
// Only record the dispatch when the device reported support for the executable format.
cf.cond_br %_device_query_0, ^bb1, ^bb2
^bb1: // pred: ^bb0
// Binding 0 = whole input buffer, binding 1 = whole result buffer.
// NOTE(review): no op in this function writes bytes [0, 524288) of %buffer_0 before the
// dispatch, while the llvm.func in @_split_reduction_pass2_dispatch_0 loads each binding 1
// element before adding to it and storing. Confirm the initial contents are defined.
hal.command_buffer.push_descriptor_set<%cmd_1 : !hal.command_buffer> layout(%_pipeline_layout_0 : !hal.pipeline_layout)[%c0] bindings([
%c0 = (%1 : !hal.buffer)[%c0, %len],
%c1 = (%buffer_0 : !hal.buffer)[%c0, %c1048576]
])
hal.command_buffer.dispatch<%cmd_1 : !hal.command_buffer> target(%_executable__split_reduction_pass2_dispatch_0 : !hal.executable)[0] workgroups([%c8, %c16, %c1])
// Write the expected value (i32 pattern 128) into the second half of the result buffer.
hal.command_buffer.fill_buffer<%cmd_1 : !hal.command_buffer> target(%buffer_0 : !hal.buffer)[%c524288, %c524288] pattern(%c128_i32 : i32)
hal.command_buffer.execution_barrier<%cmd_1 : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None")
hal.command_buffer.finalize<%cmd_1 : !hal.command_buffer>
%fence_2 = hal.fence.create device(%device : !hal.device) flags("None") : !hal.fence
hal.device.queue.execute<%device : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence_2) commands([%cmd_1])
%status_3 = hal.fence.await until([%fence_2]) timeout_millis(%c-1_i32) : i32
util.status.check_ok %status_3, "failed to wait on timepoint"
// Two 512x256 views over the same buffer (element type code 268435488, encoding 1):
// %view covers the first 524288 bytes, %view_4 the second 524288 bytes.
%view = hal.buffer_view.create buffer(%buffer_0 : !hal.buffer)[%c0, %c524288] shape([%c512, %c256]) type(%c268435488_i32) encoding(%c1_i32) : !hal.buffer_view
%view_4 = hal.buffer_view.create buffer(%buffer_0 : !hal.buffer)[%c524288, %c524288] shape([%c512, %c256]) type(%c268435488_i32) encoding(%c1_i32) : !hal.buffer_view
check.expect_eq(%view, %view_4) : !hal.buffer_view
return
^bb2: // pred: ^bb0
// Reached when the device query was false.
util.unreachable "device not supported in the compiled configuration"
}
}
// -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) ('util.initializer' operation) //----- //
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
// Module-level state written by the util.initializer below and read back by
// @_split_reduction_pass2.
// Result of the "hal.executable.format" :: "embedded-elf-x86_64" device query.
util.global private @_device_query_0 : i1
// Descriptor set layout with two storage-buffer bindings (0 read-only, 1).
util.global private @_descriptor_set_layout_0 : !hal.descriptor_set_layout
// Pipeline layout built from the descriptor set layout above, 0 push constants.
util.global private @_pipeline_layout_0 : !hal.pipeline_layout
// Loaded executable, or null when the device query reports false.
util.global private @_executable__split_reduction_pass2_dispatch_0 : !hal.executable
// Module initializer: queries the device, builds the descriptor-set and
// pipeline layouts, and creates the executable (or null) into the globals.
util.initializer {
%device = hal.ex.shared_device : !hal.device
// %value is the queried flag; it defaults to false ("= false") per the op syntax.
%ok, %value = hal.device.query<%device : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false
%descriptor_set_layout = hal.descriptor_set_layout.create device(%device : !hal.device) flags("None") bindings([#hal.descriptor_set.binding<0, storage_buffer, ReadOnly>, #hal.descriptor_set.binding<1, storage_buffer>]) : !hal.descriptor_set_layout
%pipeline_layout = hal.pipeline_layout.create device(%device : !hal.device) push_constants(0) layouts([%descriptor_set_layout]) : !hal.pipeline_layout
util.global.store %descriptor_set_layout, @_descriptor_set_layout_0 : !hal.descriptor_set_layout
util.global.store %value, @_device_query_0 : i1
util.global.store %pipeline_layout, @_pipeline_layout_0 : !hal.pipeline_layout
// Only create the executable when the format query succeeded.
cf.cond_br %value, ^bb1, ^bb2
^bb1: // pred: ^bb0
// Reloads the pipeline layout that was stored a few ops above.
%_pipeline_layout_0 = util.global.load @_pipeline_layout_0 : !hal.pipeline_layout
%exe = hal.executable.create device(%device : !hal.device) target(@_split_reduction_pass2_dispatch_0::@embedded_elf_x86_64) layouts([%_pipeline_layout_0]) : !hal.executable
cf.br ^bb3(%exe : !hal.executable)
^bb2: // pred: ^bb0
// Unsupported format: the executable global is set to null.
%0 = util.null : !hal.executable
cf.br ^bb3(%0 : !hal.executable)
^bb3(%1: !hal.executable): // 2 preds: ^bb1, ^bb2
util.global.store %1, @_executable__split_reduction_pass2_dispatch_0 : !hal.executable
util.initializer.return
}
// Public entry point (tagged iree.abi.stub); it only forwards to the private
// @_split_reduction_pass2 implementation and takes/returns nothing.
func.func @split_reduction_pass2() attributes {iree.abi.stub} {
call @_split_reduction_pass2() : () -> ()
return
}
// Executable containing the single dispatch entry point, with one variant
// (@embedded_elf_x86_64) already lowered to the LLVM dialect.
hal.executable private @_split_reduction_pass2_dispatch_0 {
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
// Export region: returns the constant triple (8, 16, 1); the block arguments
// are not used.
hal.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 ordinal(0) layout(#pipeline_layout) attributes {translation_info = #translation} {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index):
%c1 = arith.constant 1 : index
%c16 = arith.constant 16 : index
%c8 = arith.constant 8 : index
hal.return %c8, %c16, %c1 : index, index, index
}
builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} {
// Dispatch body. %arg1 is the dispatch state and %arg2 the workgroup state;
// %arg0 (environment) is not read. Always returns the i32 constant 0.
// Loop nest (all bounds are constants defined below):
//   ^bb1: %53 = 0..31 step 1
//   ^bb2: %55 = 0..31 step 4
//   ^bb4: %67 = 0..3  step 1
//   ^bb6: %74 = 0..31 step 1, carrying a 4 x i32 accumulator (%75)
llvm.func internal @_split_reduction_pass2_dispatch_0_generic_512x256x128(%arg0: !llvm.ptr<struct<"iree_hal_executable_environment_v0_t", (ptr<i32>, ptr<func<i32 (ptr<func<i32 (ptr<i8>)>>, ptr<i8>)>>, ptr<ptr<func<i32 (ptr<i8>)>>>, struct<"iree_hal_processor_v0_t", (array<8 x i64>)>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>> {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {sym_visibility = "private"} {
%0 = llvm.mlir.constant(0 : i32) : i32
%1 = llvm.mlir.constant(3 : i64) : i64
%2 = llvm.mlir.constant(2 : i64) : i64
%3 = llvm.mlir.constant(32768 : i64) : i64
%4 = llvm.mlir.constant(128 : i64) : i64
%5 = llvm.mlir.constant(256 : i64) : i64
%6 = llvm.mlir.constant(1 : i64) : i64
%7 = llvm.mlir.constant(63 : index) : i64
%8 = llvm.mlir.constant(128 : index) : i64
%9 = llvm.mlir.constant(256 : index) : i64
%10 = llvm.mlir.constant(32768 : index) : i64
%11 = llvm.mlir.constant(0 : i64) : i64
%12 = llvm.mlir.constant(dense<0> : vector<4xi32>) : vector<4xi32>
%13 = llvm.mlir.constant(2 : index) : i64
%14 = llvm.mlir.constant(3 : index) : i64
%15 = llvm.mlir.constant(dense<0> : vector<1xi32>) : vector<1xi32>
%16 = llvm.mlir.constant(dense<0> : vector<1x1x4xi32>) : !llvm.array<1 x array<1 x vector<4xi32>>>
%17 = llvm.mlir.constant(4 : index) : i64
%18 = llvm.mlir.constant(1 : index) : i64
%19 = llvm.mlir.constant(32 : index) : i64
%20 = llvm.mlir.constant(0 : index) : i64
// First pointer: entry 0 of the pointer table in dispatch-state field 10,
// viewed as i32*. This is the pointer the vector loads in ^bb7 read from.
%21 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>>
%22 = llvm.extractvalue %21[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>
%23 = llvm.load %22 : !llvm.ptr<ptr<i8>>
%24 = llvm.bitcast %23 : !llvm.ptr<i8> to !llvm.ptr<i32>
// Assume (address & 63) == 0, i.e. the pointer is 64-byte aligned.
%25 = llvm.ptrtoint %24 : !llvm.ptr<i32> to i64
%26 = llvm.and %25, %7 : i64
%27 = llvm.icmp "eq" %26, %20 : i64
"llvm.intr.assume"(%27) : (i1) -> ()
// Second pointer: entry 1 of the same table, viewed as i32*. This is the
// pointer that ^bb8 loads from and stores to. Same 64-byte alignment assumption.
%28 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>>
%29 = llvm.extractvalue %28[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>
%30 = llvm.getelementptr %29[1] : (!llvm.ptr<ptr<i8>>) -> !llvm.ptr<ptr<i8>>
%31 = llvm.load %30 : !llvm.ptr<ptr<i8>>
%32 = llvm.bitcast %31 : !llvm.ptr<i8> to !llvm.ptr<i32>
%33 = llvm.ptrtoint %32 : !llvm.ptr<i32> to i64
%34 = llvm.and %33, %7 : i64
%35 = llvm.icmp "eq" %34, %20 : i64
"llvm.intr.assume"(%35) : (i1) -> ()
// Fields 0 and 1 of the workgroup state, zero-extended to i64
// (presumably the workgroup x/y ids -- TODO confirm against the HAL ABI).
%36 = llvm.load %arg2 : !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>>
%37 = llvm.extractvalue %36[0] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>
%38 = llvm.zext %37 : i32 to i64
%39 = llvm.extractvalue %36[1] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>
%40 = llvm.zext %39 : i32 to i64
// Both fields are scaled by 32: %41 = field1 * 32, %42 = field0 * 32.
%41 = llvm.mul %40, %19 : i64
%42 = llvm.mul %38, %19 : i64
// %46 = %41 * 256 + %42: base element offset used with %32.
%43 = llvm.mul %41, %9 : i64
%44 = llvm.add %43, %20 : i64
%45 = llvm.mul %42, %18 : i64
%46 = llvm.add %44, %45 : i64
// %52 = %41 * 32768 + %42 * 128: base element offset used with %24.
%47 = llvm.mul %41, %10 : i64
%48 = llvm.add %47, %20 : i64
%49 = llvm.mul %42, %8 : i64
%50 = llvm.add %48, %49 : i64
%51 = llvm.mul %11, %18 : i64
%52 = llvm.add %50, %51 : i64
llvm.br ^bb1(%20 : i64)
^bb1(%53: i64): // 2 preds: ^bb0, ^bb10
%54 = llvm.icmp "slt" %53, %19 : i64
llvm.cond_br %54, ^bb2(%20 : i64), ^bb11
^bb2(%55: i64): // 2 preds: ^bb1, ^bb9
%56 = llvm.icmp "slt" %55, %19 : i64
llvm.cond_br %56, ^bb3, ^bb10
^bb3: // pred: ^bb2
// %62 = %52 + %53 * 32768 + %55 * 128 (offset for %24);
// %66 = %46 + %53 * 256 + %55 (offset for %32).
%57 = llvm.mul %53, %3 : i64
%58 = llvm.add %52, %57 : i64
%59 = llvm.mul %55, %4 : i64
%60 = llvm.add %58, %59 : i64
%61 = llvm.mul %11, %6 : i64
%62 = llvm.add %60, %61 : i64
%63 = llvm.mul %53, %5 : i64
%64 = llvm.add %46, %63 : i64
%65 = llvm.mul %55, %6 : i64
%66 = llvm.add %64, %65 : i64
llvm.br ^bb4(%20 : i64)
^bb4(%67: i64): // 2 preds: ^bb3, ^bb8
%68 = llvm.icmp "slt" %67, %17 : i64
llvm.cond_br %68, ^bb5, ^bb9
^bb5: // pred: ^bb4
// %73 = %62 + %67 * 128; the innermost loop starts with a zero accumulator (%16).
%69 = llvm.mul %11, %3 : i64
%70 = llvm.add %62, %69 : i64
%71 = llvm.mul %67, %4 : i64
%72 = llvm.add %70, %71 : i64
%73 = llvm.add %72, %61 : i64
llvm.br ^bb6(%20, %16 : i64, !llvm.array<1 x array<1 x vector<4xi32>>>)
^bb6(%74: i64, %75: !llvm.array<1 x array<1 x vector<4xi32>>>): // 2 preds: ^bb5, ^bb7
%76 = llvm.icmp "slt" %74, %19 : i64
llvm.cond_br %76, ^bb7, ^bb8
^bb7: // pred: ^bb6
// Load 4 x i32 from %24 at element offset %73 + %74 * 4 (the other terms
// are multiplications by the zero constant %20).
%77 = llvm.mul %20, %10 : i64
%78 = llvm.add %73, %77 : i64
%79 = llvm.mul %20, %8 : i64
%80 = llvm.add %78, %79 : i64
%81 = llvm.mul %74, %17 : i64
%82 = llvm.add %80, %81 : i64
%83 = llvm.add %82, %20 : i64
%84 = llvm.getelementptr %24[%83] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32>
%85 = llvm.bitcast %84 : !llvm.ptr<i32> to !llvm.ptr<vector<4xi32>>
%86 = llvm.load %85 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>>
// Lane-by-lane add of the loaded vector (%86) and the accumulator (%87),
// rebuilt into %103 starting from the zero vector %12.
%87 = llvm.extractvalue %75[0, 0] : !llvm.array<1 x array<1 x vector<4xi32>>>
%88 = llvm.extractelement %87[%11 : i64] : vector<4xi32>
%89 = llvm.extractelement %86[%11 : i64] : vector<4xi32>
%90 = llvm.add %89, %88 : i32
%91 = llvm.insertelement %90, %12[%20 : i64] : vector<4xi32>
%92 = llvm.extractelement %87[%6 : i64] : vector<4xi32>
%93 = llvm.extractelement %86[%6 : i64] : vector<4xi32>
%94 = llvm.add %93, %92 : i32
%95 = llvm.insertelement %94, %91[%18 : i64] : vector<4xi32>
%96 = llvm.extractelement %87[%2 : i64] : vector<4xi32>
%97 = llvm.extractelement %86[%2 : i64] : vector<4xi32>
%98 = llvm.add %97, %96 : i32
%99 = llvm.insertelement %98, %95[%13 : i64] : vector<4xi32>
%100 = llvm.extractelement %87[%1 : i64] : vector<4xi32>
%101 = llvm.extractelement %86[%1 : i64] : vector<4xi32>
%102 = llvm.add %101, %100 : i32
%103 = llvm.insertelement %102, %99[%14 : i64] : vector<4xi32>
// Copy the four lanes of %103 into a fresh 1x1x4 array value (%116), which
// becomes the accumulator for the next iteration.
%104 = llvm.extractelement %103[%11 : i64] : vector<4xi32>
%105 = llvm.extractvalue %16[0, 0] : !llvm.array<1 x array<1 x vector<4xi32>>>
%106 = llvm.insertelement %104, %105[%11 : i64] : vector<4xi32>
%107 = llvm.insertvalue %106, %16[0, 0] : !llvm.array<1 x array<1 x vector<4xi32>>>
%108 = llvm.extractelement %103[%6 : i64] : vector<4xi32>
%109 = llvm.insertelement %108, %106[%6 : i64] : vector<4xi32>
%110 = llvm.insertvalue %109, %107[0, 0] : !llvm.array<1 x array<1 x vector<4xi32>>>
%111 = llvm.extractelement %103[%2 : i64] : vector<4xi32>
%112 = llvm.insertelement %111, %109[%2 : i64] : vector<4xi32>
%113 = llvm.insertvalue %112, %110[0, 0] : !llvm.array<1 x array<1 x vector<4xi32>>>
%114 = llvm.extractelement %103[%1 : i64] : vector<4xi32>
%115 = llvm.insertelement %114, %112[%1 : i64] : vector<4xi32>
%116 = llvm.insertvalue %115, %113[0, 0] : !llvm.array<1 x array<1 x vector<4xi32>>>
%117 = llvm.add %74, %18 : i64
llvm.br ^bb6(%117, %116 : i64, !llvm.array<1 x array<1 x vector<4xi32>>>)
^bb8: // pred: ^bb6
// Read-modify-write of one i32 at %32[%66 + %67]: the existing value is
// loaded, the horizontal sum of the 4-lane accumulator is added, and the
// result is stored back.
// NOTE(review): because of the load, the stored value depends on what the
// memory behind %32 held before this kernel ran -- confirm the caller
// initializes it.
%118 = llvm.mul %20, %9 : i64
%119 = llvm.add %66, %118 : i64
%120 = llvm.add %119, %67 : i64
%121 = llvm.getelementptr %32[%120] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32>
%122 = llvm.load %121 : !llvm.ptr<i32>
%123 = llvm.extractvalue %75[0, 0] : !llvm.array<1 x array<1 x vector<4xi32>>>
%124 = "llvm.intr.vector.reduce.add"(%123) : (vector<4xi32>) -> i32
%125 = llvm.add %122, %124 : i32
%126 = llvm.insertelement %125, %15[%20 : i64] : vector<1xi32>
%127 = llvm.extractelement %126[%11 : i64] : vector<1xi32>
llvm.store %127, %121 : !llvm.ptr<i32>
%128 = llvm.add %67, %18 : i64
llvm.br ^bb4(%128 : i64)
^bb9: // pred: ^bb4
// Advance %55 by 4.
%129 = llvm.add %55, %17 : i64
llvm.br ^bb2(%129 : i64)
^bb10: // pred: ^bb2
// Advance %53 by 1.
%130 = llvm.add %53, %18 : i64
llvm.br ^bb1(%130 : i64)
^bb11: // pred: ^bb1
llvm.return %0 : i32
}
}
}
}
// Host-side driver: fills an input buffer with 1s, runs the dispatch into the
// first half of a second buffer, fills the second half with 128, and checks
// that views over the two halves compare equal.
func.func private @_split_reduction_pass2() {
%c1 = arith.constant 1 : index
%c256 = arith.constant 256 : index
%c512 = arith.constant 512 : index
%c128_i32 = arith.constant 128 : i32
%c1_i32 = arith.constant 1 : i32
%c67108864 = arith.constant 67108864 : index
%c524288 = arith.constant 524288 : index
%c0 = arith.constant 0 : index
%c1048576 = arith.constant 1048576 : index
%c-1_i64 = arith.constant -1 : i64
%c-1_i32 = arith.constant -1 : i32
%c16 = arith.constant 16 : index
%c8 = arith.constant 8 : index
%c268435488_i32 = arith.constant 268435488 : i32
%_device_query_0 = util.global.load @_device_query_0 : i1
%_pipeline_layout_0 = util.global.load @_pipeline_layout_0 : !hal.pipeline_layout
%_executable__split_reduction_pass2_dispatch_0 = util.global.load @_executable__split_reduction_pass2_dispatch_0 : !hal.executable
%device = hal.ex.shared_device : !hal.device
%allocator = hal.device.allocator<%device : !hal.device> : !hal.allocator
// Input buffer: 67108864 bytes (= 512 * 256 * 128 four-byte elements),
// filled entirely with the i32 pattern 1 by the first command buffer.
%buffer = hal.allocator.allocate<%allocator : !hal.allocator> type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c67108864}
%cmd = hal.command_buffer.create device(%device : !hal.device) mode("OneShot|AllowInlineExecution") categories(Transfer) : !hal.command_buffer
hal.command_buffer.fill_buffer<%cmd : !hal.command_buffer> target(%buffer : !hal.buffer)[%c0, %c67108864] pattern(%c1_i32 : i32)
hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None")
hal.command_buffer.finalize<%cmd : !hal.command_buffer>
// Submit with a null wait fence and block on %fence (timeout value -1).
%0 = util.null : !hal.fence
%fence = hal.fence.create device(%device : !hal.device) flags("None") : !hal.fence
hal.device.queue.execute<%device : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) commands([%cmd])
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.status.check_ok %status, "failed to wait on timepoint"
%1 = util.do_not_optimize(%buffer) : !hal.buffer
%len = hal.buffer.length<%1 : !hal.buffer> : index
// Result buffer: 1048576 bytes, used below as two 524288-byte halves
// (524288 = 512 * 256 four-byte elements).
%buffer_0 = hal.allocator.allocate<%allocator : !hal.allocator> type("HostVisible|DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage|MappingScoped|MappingAccessRandom|Mapping") : !hal.buffer{%c1048576}
%cmd_1 = hal.command_buffer.create device(%device : !hal.device) mode("OneShot|AllowInlineExecution") categories("Transfer|Dispatch") : !hal.command_buffer
// Take the dispatch path only if the executable format query succeeded.
cf.cond_br %_device_query_0, ^bb1, ^bb2
^bb1: // pred: ^bb0
// Binding 0 = the whole input buffer, binding 1 = the whole result buffer.
hal.command_buffer.push_descriptor_set<%cmd_1 : !hal.command_buffer> layout(%_pipeline_layout_0 : !hal.pipeline_layout)[%c0] bindings([
%c0 = (%1 : !hal.buffer)[%c0, %len],
%c1 = (%buffer_0 : !hal.buffer)[%c0, %c1048576]
])
// Export ordinal 0 with workgroup counts (8, 16, 1).
// NOTE(review): no command in this function writes bytes [0, 524288) of
// %buffer_0 before this dispatch. If the kernel accumulates into binding 1
// (its ^bb8 block loads, adds, and stores back), the initial contents of
// that region affect the result -- confirm they are defined.
hal.command_buffer.dispatch<%cmd_1 : !hal.command_buffer> target(%_executable__split_reduction_pass2_dispatch_0 : !hal.executable)[0] workgroups([%c8, %c16, %c1])
// Second half of the result buffer is filled with the i32 pattern 128.
hal.command_buffer.fill_buffer<%cmd_1 : !hal.command_buffer> target(%buffer_0 : !hal.buffer)[%c524288, %c524288] pattern(%c128_i32 : i32)
hal.command_buffer.execution_barrier<%cmd_1 : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None")
hal.command_buffer.finalize<%cmd_1 : !hal.command_buffer>
%fence_2 = hal.fence.create device(%device : !hal.device) flags("None") : !hal.fence
hal.device.queue.execute<%device : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence_2) commands([%cmd_1])
%status_3 = hal.fence.await until([%fence_2]) timeout_millis(%c-1_i32) : i32
util.status.check_ok %status_3, "failed to wait on timepoint"
// %view covers the first half (offset 0), %view_4 the second half
// (offset 524288); both are shaped 512 x 256 and compared for equality.
%view = hal.buffer_view.create buffer(%buffer_0 : !hal.buffer)[%c0, %c524288] shape([%c512, %c256]) type(%c268435488_i32) encoding(%c1_i32) : !hal.buffer_view
%view_4 = hal.buffer_view.create buffer(%buffer_0 : !hal.buffer)[%c524288, %c524288] shape([%c512, %c256]) type(%c268435488_i32) encoding(%c1_i32) : !hal.buffer_view
check.expect_eq(%view, %view_4) : !hal.buffer_view
return
^bb2: // pred: ^bb0
util.unreachable "device not supported in the compiled configuration"
}
}
// -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) ('func.func' operation: @_split_reduction_pass2) //----- //
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
// Module-level state written by the util.initializer below and read back by
// @_split_reduction_pass2.
// Result of the "hal.executable.format" :: "embedded-elf-x86_64" device query.
util.global private @_device_query_0 : i1
// Descriptor set layout with two storage-buffer bindings (0 read-only, 1).
util.global private @_descriptor_set_layout_0 : !hal.descriptor_set_layout
// Pipeline layout built from the descriptor set layout above, 0 push constants.
util.global private @_pipeline_layout_0 : !hal.pipeline_layout
// Loaded executable, or null when the device query reports false.
util.global private @_executable__split_reduction_pass2_dispatch_0 : !hal.executable
// Module initializer: queries the device, builds the descriptor-set and
// pipeline layouts, and creates the executable (or null) into the globals.
util.initializer {
%device = hal.ex.shared_device : !hal.device
// %value is the queried flag; it defaults to false ("= false") per the op syntax.
%ok, %value = hal.device.query<%device : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false
%descriptor_set_layout = hal.descriptor_set_layout.create device(%device : !hal.device) flags("None") bindings([#hal.descriptor_set.binding<0, storage_buffer, ReadOnly>, #hal.descriptor_set.binding<1, storage_buffer>]) : !hal.descriptor_set_layout
%pipeline_layout = hal.pipeline_layout.create device(%device : !hal.device) push_constants(0) layouts([%descriptor_set_layout]) : !hal.pipeline_layout
util.global.store %descriptor_set_layout, @_descriptor_set_layout_0 : !hal.descriptor_set_layout
util.global.store %value, @_device_query_0 : i1
util.global.store %pipeline_layout, @_pipeline_layout_0 : !hal.pipeline_layout
// Only create the executable when the format query succeeded.
cf.cond_br %value, ^bb1, ^bb2
^bb1: // pred: ^bb0
// Reloads the pipeline layout that was stored a few ops above.
%_pipeline_layout_0 = util.global.load @_pipeline_layout_0 : !hal.pipeline_layout
%exe = hal.executable.create device(%device : !hal.device) target(@_split_reduction_pass2_dispatch_0::@embedded_elf_x86_64) layouts([%_pipeline_layout_0]) : !hal.executable
cf.br ^bb3(%exe : !hal.executable)
^bb2: // pred: ^bb0
// Unsupported format: the executable global is set to null.
%0 = util.null : !hal.executable
cf.br ^bb3(%0 : !hal.executable)
^bb3(%1: !hal.executable): // 2 preds: ^bb1, ^bb2
util.global.store %1, @_executable__split_reduction_pass2_dispatch_0 : !hal.executable
util.initializer.return
}
// Public entry point (tagged iree.abi.stub); it only forwards to the private
// @_split_reduction_pass2 implementation and takes/returns nothing.
func.func @split_reduction_pass2() attributes {iree.abi.stub} {
call @_split_reduction_pass2() : () -> ()
return
}
// Executable containing the single dispatch entry point, with one variant
// (@embedded_elf_x86_64) already lowered to the LLVM dialect.
hal.executable private @_split_reduction_pass2_dispatch_0 {
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
// Export region: returns the constant triple (8, 16, 1); the block arguments
// are not used.
hal.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 ordinal(0) layout(#pipeline_layout) attributes {translation_info = #translation} {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index):
%c1 = arith.constant 1 : index
%c16 = arith.constant 16 : index
%c8 = arith.constant 8 : index
hal.return %c8, %c16, %c1 : index, index, index
}
builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} {
// Dispatch body. %arg1 is the dispatch state and %arg2 the workgroup state;
// %arg0 (environment) is not read. Always returns the i32 constant 0.
// Loop nest (all bounds are constants defined below):
//   ^bb1: %53 = 0..31 step 1
//   ^bb2: %55 = 0..31 step 4
//   ^bb4: %67 = 0..3  step 1
//   ^bb6: %74 = 0..31 step 1, carrying a 4 x i32 accumulator (%75)
llvm.func internal @_split_reduction_pass2_dispatch_0_generic_512x256x128(%arg0: !llvm.ptr<struct<"iree_hal_executable_environment_v0_t", (ptr<i32>, ptr<func<i32 (ptr<func<i32 (ptr<i8>)>>, ptr<i8>)>>, ptr<ptr<func<i32 (ptr<i8>)>>>, struct<"iree_hal_processor_v0_t", (array<8 x i64>)>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>> {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {sym_visibility = "private"} {
%0 = llvm.mlir.constant(0 : i32) : i32
%1 = llvm.mlir.constant(3 : i64) : i64
%2 = llvm.mlir.constant(2 : i64) : i64
%3 = llvm.mlir.constant(32768 : i64) : i64
%4 = llvm.mlir.constant(128 : i64) : i64
%5 = llvm.mlir.constant(256 : i64) : i64
%6 = llvm.mlir.constant(1 : i64) : i64
%7 = llvm.mlir.constant(63 : index) : i64
%8 = llvm.mlir.constant(128 : index) : i64
%9 = llvm.mlir.constant(256 : index) : i64
%10 = llvm.mlir.constant(32768 : index) : i64
%11 = llvm.mlir.constant(0 : i64) : i64
%12 = llvm.mlir.constant(dense<0> : vector<4xi32>) : vector<4xi32>
%13 = llvm.mlir.constant(2 : index) : i64
%14 = llvm.mlir.constant(3 : index) : i64
%15 = llvm.mlir.constant(dense<0> : vector<1xi32>) : vector<1xi32>
%16 = llvm.mlir.constant(dense<0> : vector<1x1x4xi32>) : !llvm.array<1 x array<1 x vector<4xi32>>>
%17 = llvm.mlir.constant(4 : index) : i64
%18 = llvm.mlir.constant(1 : index) : i64
%19 = llvm.mlir.constant(32 : index) : i64
%20 = llvm.mlir.constant(0 : index) : i64
// First pointer: entry 0 of the pointer table in dispatch-state field 10,
// viewed as i32*. This is the pointer the vector loads in ^bb7 read from.
%21 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>>
%22 = llvm.extractvalue %21[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>
%23 = llvm.load %22 : !llvm.ptr<ptr<i8>>
%24 = llvm.bitcast %23 : !llvm.ptr<i8> to !llvm.ptr<i32>
// Assume (address & 63) == 0, i.e. the pointer is 64-byte aligned.
%25 = llvm.ptrtoint %24 : !llvm.ptr<i32> to i64
%26 = llvm.and %25, %7 : i64
%27 = llvm.icmp "eq" %26, %20 : i64
"llvm.intr.assume"(%27) : (i1) -> ()
// Second pointer: entry 1 of the same table, viewed as i32*. This is the
// pointer that ^bb8 loads from and stores to. Same 64-byte alignment assumption.
%28 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>>
%29 = llvm.extractvalue %28[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>
%30 = llvm.getelementptr %29[1] : (!llvm.ptr<ptr<i8>>) -> !llvm.ptr<ptr<i8>>
%31 = llvm.load %30 : !llvm.ptr<ptr<i8>>
%32 = llvm.bitcast %31 : !llvm.ptr<i8> to !llvm.ptr<i32>
%33 = llvm.ptrtoint %32 : !llvm.ptr<i32> to i64
%34 = llvm.and %33, %7 : i64
%35 = llvm.icmp "eq" %34, %20 : i64
"llvm.intr.assume"(%35) : (i1) -> ()
// Fields 0 and 1 of the workgroup state, zero-extended to i64
// (presumably the workgroup x/y ids -- TODO confirm against the HAL ABI).
%36 = llvm.load %arg2 : !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>>
%37 = llvm.extractvalue %36[0] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>
%38 = llvm.zext %37 : i32 to i64
%39 = llvm.extractvalue %36[1] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>
%40 = llvm.zext %39 : i32 to i64
// Both fields are scaled by 32: %41 = field1 * 32, %42 = field0 * 32.
%41 = llvm.mul %40, %19 : i64
%42 = llvm.mul %38, %19 : i64
// %46 = %41 * 256 + %42: base element offset used with %32.
%43 = llvm.mul %41, %9 : i64
%44 = llvm.add %43, %20 : i64
%45 = llvm.mul %42, %18 : i64
%46 = llvm.add %44, %45 : i64
// %52 = %41 * 32768 + %42 * 128: base element offset used with %24.
%47 = llvm.mul %41, %10 : i64
%48 = llvm.add %47, %20 : i64
%49 = llvm.mul %42, %8 : i64
%50 = llvm.add %48, %49 : i64
%51 = llvm.mul %11, %18 : i64
%52 = llvm.add %50, %51 : i64
llvm.br ^bb1(%20 : i64)
^bb1(%53: i64): // 2 preds: ^bb0, ^bb10
%54 = llvm.icmp "slt" %53, %19 : i64
llvm.cond_br %54, ^bb2(%20 : i64), ^bb11
^bb2(%55: i64): // 2 preds: ^bb1, ^bb9
%56 = llvm.icmp "slt" %55, %19 : i64
llvm.cond_br %56, ^bb3, ^bb10
^bb3: // pred: ^bb2
// %62 = %52 + %53 * 32768 + %55 * 128 (offset for %24);
// %66 = %46 + %53 * 256 + %55 (offset for %32).
%57 = llvm.mul %53, %3 : i64
%58 = llvm.add %52, %57 : i64
%59 = llvm.mul %55, %4 : i64
%60 = llvm.add %58, %59 : i64
%61 = llvm.mul %11, %6 : i64
%62 = llvm.add %60, %61 : i64
%63 = llvm.mul %53, %5 : i64
%64 = llvm.add %46, %63 : i64
%65 = llvm.mul %55, %6 : i64
%66 = llvm.add %64, %65 : i64
llvm.br ^bb4(%20 : i64)
^bb4(%67: i64): // 2 preds: ^bb3, ^bb8
%68 = llvm.icmp "slt" %67, %17 : i64
llvm.cond_br %68, ^bb5, ^bb9
^bb5: // pred: ^bb4
// %73 = %62 + %67 * 128; the innermost loop starts with a zero accumulator (%16).
%69 = llvm.mul %11, %3 : i64
%70 = llvm.add %62, %69 : i64
%71 = llvm.mul %67, %4 : i64
%72 = llvm.add %70, %71 : i64
%73 = llvm.add %72, %61 : i64
llvm.br ^bb6(%20, %16 : i64, !llvm.array<1 x array<1 x vector<4xi32>>>)
^bb6(%74: i64, %75: !llvm.array<1 x array<1 x vector<4xi32>>>): // 2 preds: ^bb5, ^bb7
%76 = llvm.icmp "slt" %74, %19 : i64
llvm.cond_br %76, ^bb7, ^bb8
^bb7: // pred: ^bb6
// Load 4 x i32 from %24 at element offset %73 + %74 * 4 (the other terms
// are multiplications by the zero constant %20).
%77 = llvm.mul %20, %10 : i64
%78 = llvm.add %73, %77 : i64
%79 = llvm.mul %20, %8 : i64
%80 = llvm.add %78, %79 : i64
%81 = llvm.mul %74, %17 : i64
%82 = llvm.add %80, %81 : i64
%83 = llvm.add %82, %20 : i64
%84 = llvm.getelementptr %24[%83] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32>
%85 = llvm.bitcast %84 : !llvm.ptr<i32> to !llvm.ptr<vector<4xi32>>
%86 = llvm.load %85 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>>
// Lane-by-lane add of the loaded vector (%86) and the accumulator (%87),
// rebuilt into %103 starting from the zero vector %12.
%87 = llvm.extractvalue %75[0, 0] : !llvm.array<1 x array<1 x vector<4xi32>>>
%88 = llvm.extractelement %87[%11 : i64] : vector<4xi32>
%89 = llvm.extractelement %86[%11 : i64] : vector<4xi32>
%90 = llvm.add %89, %88 : i32
%91 = llvm.insertelement %90, %12[%20 : i64] : vector<4xi32>
%92 = llvm.extractelement %87[%6 : i64] : vector<4xi32>
%93 = llvm.extractelement %86[%6 : i64] : vector<4xi32>
%94 = llvm.add %93, %92 : i32
%95 = llvm.insertelement %94, %91[%18 : i64] : vector<4xi32>
%96 = llvm.extractelement %87[%2 : i64] : vector<4xi32>
%97 = llvm.extractelement %86[%2 : i64] : vector<4xi32>
%98 = llvm.add %97, %96 : i32
%99 = llvm.insertelement %98, %95[%13 : i64] : vector<4xi32>
%100 = llvm.extractelement %87[%1 : i64] : vector<4xi32>
%101 = llvm.extractelement %86[%1 : i64] : vector<4xi32>
%102 = llvm.add %101, %100 : i32
%103 = llvm.insertelement %102, %99[%14 : i64] : vector<4xi32>
// Copy the four lanes of %103 into a fresh 1x1x4 array value (%116), which
// becomes the accumulator for the next iteration.
%104 = llvm.extractelement %103[%11 : i64] : vector<4xi32>
%105 = llvm.extractvalue %16[0, 0] : !llvm.array<1 x array<1 x vector<4xi32>>>
%106 = llvm.insertelement %104, %105[%11 : i64] : vector<4xi32>
%107 = llvm.insertvalue %106, %16[0, 0] : !llvm.array<1 x array<1 x vector<4xi32>>>
%108 = llvm.extractelement %103[%6 : i64] : vector<4xi32>
%109 = llvm.insertelement %108, %106[%6 : i64] : vector<4xi32>
%110 = llvm.insertvalue %109, %107[0, 0] : !llvm.array<1 x array<1 x vector<4xi32>>>
%111 = llvm.extractelement %103[%2 : i64] : vector<4xi32>
%112 = llvm.insertelement %111, %109[%2 : i64] : vector<4xi32>
%113 = llvm.insertvalue %112, %110[0, 0] : !llvm.array<1 x array<1 x vector<4xi32>>>
%114 = llvm.extractelement %103[%1 : i64] : vector<4xi32>
%115 = llvm.insertelement %114, %112[%1 : i64] : vector<4xi32>
%116 = llvm.insertvalue %115, %113[0, 0] : !llvm.array<1 x array<1 x vector<4xi32>>>
%117 = llvm.add %74, %18 : i64
llvm.br ^bb6(%117, %116 : i64, !llvm.array<1 x array<1 x vector<4xi32>>>)
^bb8: // pred: ^bb6
// Read-modify-write of one i32 at %32[%66 + %67]: the existing value is
// loaded, the horizontal sum of the 4-lane accumulator is added, and the
// result is stored back.
// NOTE(review): because of the load, the stored value depends on what the
// memory behind %32 held before this kernel ran -- confirm the caller
// initializes it.
%118 = llvm.mul %20, %9 : i64
%119 = llvm.add %66, %118 : i64
%120 = llvm.add %119, %67 : i64
%121 = llvm.getelementptr %32[%120] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32>
%122 = llvm.load %121 : !llvm.ptr<i32>
%123 = llvm.extractvalue %75[0, 0] : !llvm.array<1 x array<1 x vector<4xi32>>>
%124 = "llvm.intr.vector.reduce.add"(%123) : (vector<4xi32>) -> i32
%125 = llvm.add %122, %124 : i32
%126 = llvm.insertelement %125, %15[%20 : i64] : vector<1xi32>
%127 = llvm.extractelement %126[%11 : i64] : vector<1xi32>
llvm.store %127, %121 : !llvm.ptr<i32>
%128 = llvm.add %67, %18 : i64
llvm.br ^bb4(%128 : i64)
^bb9: // pred: ^bb4
// Advance %55 by 4.
%129 = llvm.add %55, %17 : i64
llvm.br ^bb2(%129 : i64)
^bb10: // pred: ^bb2
// Advance %53 by 1.
%130 = llvm.add %53, %18 : i64
llvm.br ^bb1(%130 : i64)
^bb11: // pred: ^bb1
llvm.return %0 : i32
}
}
}
}
// Host-side driver: fills an input buffer with 1s, runs the dispatch into the
// first half of a second buffer, fills the second half with 128, and checks
// that views over the two halves compare equal.
func.func private @_split_reduction_pass2() {
// In this dump the three global loads come first, ahead of the constants.
%_device_query_0 = util.global.load @_device_query_0 : i1
%_pipeline_layout_0 = util.global.load @_pipeline_layout_0 : !hal.pipeline_layout
%_executable__split_reduction_pass2_dispatch_0 = util.global.load @_executable__split_reduction_pass2_dispatch_0 : !hal.executable
%c1 = arith.constant 1 : index
%c256 = arith.constant 256 : index
%c512 = arith.constant 512 : index
%c128_i32 = arith.constant 128 : i32
%c1_i32 = arith.constant 1 : i32
%c67108864 = arith.constant 67108864 : index
%c524288 = arith.constant 524288 : index
%c0 = arith.constant 0 : index
%c1048576 = arith.constant 1048576 : index
%c-1_i64 = arith.constant -1 : i64
%c-1_i32 = arith.constant -1 : i32
%c16 = arith.constant 16 : index
%c8 = arith.constant 8 : index
%c268435488_i32 = arith.constant 268435488 : i32
%device = hal.ex.shared_device : !hal.device
%allocator = hal.device.allocator<%device : !hal.device> : !hal.allocator
// Input buffer: 67108864 bytes (= 512 * 256 * 128 four-byte elements),
// filled entirely with the i32 pattern 1 by the first command buffer.
%buffer = hal.allocator.allocate<%allocator : !hal.allocator> type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c67108864}
%cmd = hal.command_buffer.create device(%device : !hal.device) mode("OneShot|AllowInlineExecution") categories(Transfer) : !hal.command_buffer
hal.command_buffer.fill_buffer<%cmd : !hal.command_buffer> target(%buffer : !hal.buffer)[%c0, %c67108864] pattern(%c1_i32 : i32)
hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None")
hal.command_buffer.finalize<%cmd : !hal.command_buffer>
// Submit with a null wait fence and block on %fence (timeout value -1).
%0 = util.null : !hal.fence
%fence = hal.fence.create device(%device : !hal.device) flags("None") : !hal.fence
hal.device.queue.execute<%device : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) commands([%cmd])
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.status.check_ok %status, "failed to wait on timepoint"
%1 = util.do_not_optimize(%buffer) : !hal.buffer
%len = hal.buffer.length<%1 : !hal.buffer> : index
// Result buffer: 1048576 bytes, used below as two 524288-byte halves
// (524288 = 512 * 256 four-byte elements).
%buffer_0 = hal.allocator.allocate<%allocator : !hal.allocator> type("HostVisible|DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage|MappingScoped|MappingAccessRandom|Mapping") : !hal.buffer{%c1048576}
%cmd_1 = hal.command_buffer.create device(%device : !hal.device) mode("OneShot|AllowInlineExecution") categories("Transfer|Dispatch") : !hal.command_buffer
// Take the dispatch path only if the executable format query succeeded.
cf.cond_br %_device_query_0, ^bb1, ^bb2
^bb1: // pred: ^bb0
// Binding 0 = the whole input buffer, binding 1 = the whole result buffer.
hal.command_buffer.push_descriptor_set<%cmd_1 : !hal.command_buffer> layout(%_pipeline_layout_0 : !hal.pipeline_layout)[%c0] bindings([
%c0 = (%1 : !hal.buffer)[%c0, %len],
%c1 = (%buffer_0 : !hal.buffer)[%c0, %c1048576]
])
// Export ordinal 0 with workgroup counts (8, 16, 1).
// NOTE(review): no command in this function writes bytes [0, 524288) of
// %buffer_0 before this dispatch. If the kernel accumulates into binding 1
// (its ^bb8 block loads, adds, and stores back), the initial contents of
// that region affect the result -- confirm they are defined.
hal.command_buffer.dispatch<%cmd_1 : !hal.command_buffer> target(%_executable__split_reduction_pass2_dispatch_0 : !hal.executable)[0] workgroups([%c8, %c16, %c1])
// Second half of the result buffer is filled with the i32 pattern 128.
hal.command_buffer.fill_buffer<%cmd_1 : !hal.command_buffer> target(%buffer_0 : !hal.buffer)[%c524288, %c524288] pattern(%c128_i32 : i32)
hal.command_buffer.execution_barrier<%cmd_1 : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None")
hal.command_buffer.finalize<%cmd_1 : !hal.command_buffer>
%fence_2 = hal.fence.create device(%device : !hal.device) flags("None") : !hal.fence
hal.device.queue.execute<%device : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence_2) commands([%cmd_1])
%status_3 = hal.fence.await until([%fence_2]) timeout_millis(%c-1_i32) : i32
util.status.check_ok %status_3, "failed to wait on timepoint"
// %view covers the first half (offset 0), %view_4 the second half
// (offset 524288); both are shaped 512 x 256 and compared for equality.
%view = hal.buffer_view.create buffer(%buffer_0 : !hal.buffer)[%c0, %c524288] shape([%c512, %c256]) type(%c268435488_i32) encoding(%c1_i32) : !hal.buffer_view
%view_4 = hal.buffer_view.create buffer(%buffer_0 : !hal.buffer)[%c524288, %c524288] shape([%c512, %c256]) type(%c268435488_i32) encoding(%c1_i32) : !hal.buffer_view
check.expect_eq(%view, %view_4) : !hal.buffer_view
return
^bb2: // pred: ^bb0
util.unreachable "device not supported in the compiled configuration"
}
}
// -----// IR Dump After ApplyPatterns (iree-util-apply-patterns) ('builtin.module' operation) //----- //
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
// Module-level state. Each global is stored by the util.initializer in this module.
// Holds the result of the "hal.executable.format" :: "embedded-elf-x86_64" device query.
util.global private @_device_query_0 : i1
// Stored by the initializer; no util.global.load of this symbol appears in this module.
util.global private @_descriptor_set_layout_0 : !hal.descriptor_set_layout
// Pipeline layout shared by executable creation and by the descriptor-set push in @_split_reduction_pass2.
util.global private @_pipeline_layout_0 : !hal.pipeline_layout
// Executable handle for the dispatch; the initializer stores null here when the format query is false.
util.global private @_executable__split_reduction_pass2_dispatch_0 : !hal.executable
// Module initializer: queries the device for the embedded-elf-x86_64 executable format,
// builds the descriptor-set and pipeline layouts, and creates the dispatch executable
// (or stores a null handle when the format query is false).
util.initializer {
%device = hal.ex.shared_device : !hal.device
// %value is the queried boolean; the op carries "= false" as its default value.
%ok, %value = hal.device.query<%device : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false
// Two storage-buffer bindings in set 0: binding 0 is ReadOnly, binding 1 has no access restriction.
%descriptor_set_layout = hal.descriptor_set_layout.create device(%device : !hal.device) flags("None") bindings([#hal.descriptor_set.binding<0, storage_buffer, ReadOnly>, #hal.descriptor_set.binding<1, storage_buffer>]) : !hal.descriptor_set_layout
// No push constants; a single descriptor-set layout.
%pipeline_layout = hal.pipeline_layout.create device(%device : !hal.device) push_constants(0) layouts([%descriptor_set_layout]) : !hal.pipeline_layout
util.global.store %descriptor_set_layout, @_descriptor_set_layout_0 : !hal.descriptor_set_layout
util.global.store %value, @_device_query_0 : i1
util.global.store %pipeline_layout, @_pipeline_layout_0 : !hal.pipeline_layout
// Only create the executable when the device reported the format.
cf.cond_br %value, ^bb1, ^bb2
^bb1: // pred: ^bb0
// Reloads the pipeline layout that was stored just above.
%_pipeline_layout_0 = util.global.load @_pipeline_layout_0 : !hal.pipeline_layout
%exe = hal.executable.create device(%device : !hal.device) target(@_split_reduction_pass2_dispatch_0::@embedded_elf_x86_64) layouts([%_pipeline_layout_0]) : !hal.executable
cf.br ^bb3(%exe : !hal.executable)
^bb2: // pred: ^bb0
// Format not reported: fall through with a null executable handle.
%0 = util.null : !hal.executable
cf.br ^bb3(%0 : !hal.executable)
^bb3(%1: !hal.executable): // 2 preds: ^bb1, ^bb2
// Join point: %1 is either the created executable or null.
util.global.store %1, @_executable__split_reduction_pass2_dispatch_0 : !hal.executable
util.initializer.return
}
// Public entry point (tagged iree.abi.stub): takes no arguments, returns nothing,
// and only forwards to the private @_split_reduction_pass2.
func.func @split_reduction_pass2() attributes {iree.abi.stub} {
call @_split_reduction_pass2() : () -> ()
return
}
hal.executable private @_split_reduction_pass2_dispatch_0 {
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
// Export entry for ordinal 0. The region returns three constant index values,
// (8, 16, 1), and does not read %arg0..%arg3. These match the
// workgroups([%c8, %c16, %c1]) operands on the host-side dispatch.
hal.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 ordinal(0) layout(#pipeline_layout) attributes {translation_info = #translation} {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index):
%c1 = arith.constant 1 : index
%c16 = arith.constant 16 : index
%c8 = arith.constant 8 : index
hal.return %c8, %c16, %c1 : index, index, index
}
builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} {
// Dispatch kernel lowered to the LLVM dialect.
//   %arg0: executable environment struct pointer (not read in this body).
//   %arg1: dispatch state struct pointer; field 10 (ptr<ptr<i8>>) is read to obtain two buffer pointers.
//   %arg2: workgroup state struct pointer; fields 0 and 1 (i32) are read.
// Each invocation walks a 32 x 32 tile of i32 outputs. For every output element it sums
// 128 consecutive i32 inputs (32 loads of vector<4xi32>) and adds that sum to the value
// already stored at the output location. Always returns 0.
// NOTE(review): fields 0/1 of the workgroup state are presumably workgroup id x/y and
// field 10 of the dispatch state presumably the binding pointer table - confirm against
// the IREE executable library header.
llvm.func internal @_split_reduction_pass2_dispatch_0_generic_512x256x128(%arg0: !llvm.ptr<struct<"iree_hal_executable_environment_v0_t", (ptr<i32>, ptr<func<i32 (ptr<func<i32 (ptr<i8>)>>, ptr<i8>)>>, ptr<ptr<func<i32 (ptr<i8>)>>>, struct<"iree_hal_processor_v0_t", (array<8 x i64>)>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>> {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {sym_visibility = "private"} {
// Constant pool. Several values appear twice (once typed i64, once typed index), e.g. 128, 256, 32768.
%0 = llvm.mlir.constant(0 : i32) : i32
%1 = llvm.mlir.constant(3 : i64) : i64
%2 = llvm.mlir.constant(2 : i64) : i64
%3 = llvm.mlir.constant(32768 : i64) : i64
%4 = llvm.mlir.constant(128 : i64) : i64
%5 = llvm.mlir.constant(256 : i64) : i64
%6 = llvm.mlir.constant(1 : i64) : i64
%7 = llvm.mlir.constant(63 : index) : i64
%8 = llvm.mlir.constant(128 : index) : i64
%9 = llvm.mlir.constant(256 : index) : i64
%10 = llvm.mlir.constant(32768 : index) : i64
%11 = llvm.mlir.constant(0 : i64) : i64
%12 = llvm.mlir.constant(dense<0> : vector<4xi32>) : vector<4xi32>
%13 = llvm.mlir.constant(2 : index) : i64
%14 = llvm.mlir.constant(3 : index) : i64
%15 = llvm.mlir.constant(dense<0> : vector<1xi32>) : vector<1xi32>
%16 = llvm.mlir.constant(dense<0> : vector<1x1x4xi32>) : !llvm.array<1 x array<1 x vector<4xi32>>>
%17 = llvm.mlir.constant(4 : index) : i64
%18 = llvm.mlir.constant(1 : index) : i64
%19 = llvm.mlir.constant(32 : index) : i64
%20 = llvm.mlir.constant(0 : index) : i64
// First buffer pointer (entry 0 of dispatch-state field 10), viewed as i32*. Used only for loads below.
%21 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>>
%22 = llvm.extractvalue %21[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>
%23 = llvm.load %22 : !llvm.ptr<ptr<i8>>
%24 = llvm.bitcast %23 : !llvm.ptr<i8> to !llvm.ptr<i32>
// Assume (address & 63) == 0, i.e. the pointer is 64-byte aligned.
%25 = llvm.ptrtoint %24 : !llvm.ptr<i32> to i64
%26 = llvm.and %25, %7 : i64
%27 = llvm.icmp "eq" %26, %20 : i64
"llvm.intr.assume"(%27) : (i1) -> ()
// Second buffer pointer (entry 1 of the same table), viewed as i32*. Loaded from and stored to in ^bb8.
%28 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>>
%29 = llvm.extractvalue %28[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>
%30 = llvm.getelementptr %29[1] : (!llvm.ptr<ptr<i8>>) -> !llvm.ptr<ptr<i8>>
%31 = llvm.load %30 : !llvm.ptr<ptr<i8>>
%32 = llvm.bitcast %31 : !llvm.ptr<i8> to !llvm.ptr<i32>
// Same 64-byte alignment assumption for the second pointer.
%33 = llvm.ptrtoint %32 : !llvm.ptr<i32> to i64
%34 = llvm.and %33, %7 : i64
%35 = llvm.icmp "eq" %34, %20 : i64
"llvm.intr.assume"(%35) : (i1) -> ()
// Workgroup state fields 0 and 1, zero-extended to i64 (%38 and %40).
%36 = llvm.load %arg2 : !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>>
%37 = llvm.extractvalue %36[0] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>
%38 = llvm.zext %37 : i32 to i64
%39 = llvm.extractvalue %36[1] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>
%40 = llvm.zext %39 : i32 to i64
// Tile origin: %41 = field1 * 32, %42 = field0 * 32.
%41 = llvm.mul %40, %19 : i64
%42 = llvm.mul %38, %19 : i64
// %46 = %41 * 256 + %42: base element offset into the second (written) buffer.
%43 = llvm.mul %41, %9 : i64
%44 = llvm.add %43, %20 : i64
%45 = llvm.mul %42, %18 : i64
%46 = llvm.add %44, %45 : i64
// %52 = %41 * 32768 + %42 * 128: base element offset into the first (read) buffer.
%47 = llvm.mul %41, %10 : i64
%48 = llvm.add %47, %20 : i64
%49 = llvm.mul %42, %8 : i64
%50 = llvm.add %48, %49 : i64
%51 = llvm.mul %11, %18 : i64
%52 = llvm.add %50, %51 : i64
llvm.br ^bb1(%20 : i64)
// Loop 1: %53 = 0 .. 31, step 1 (increment in ^bb10).
^bb1(%53: i64): // 2 preds: ^bb0, ^bb10
%54 = llvm.icmp "slt" %53, %19 : i64
llvm.cond_br %54, ^bb2(%20 : i64), ^bb11
// Loop 2: %55 = 0 .. 28, step 4 (increment in ^bb9).
^bb2(%55: i64): // 2 preds: ^bb1, ^bb9
%56 = llvm.icmp "slt" %55, %19 : i64
llvm.cond_br %56, ^bb3, ^bb10
^bb3: // pred: ^bb2
// %62 = %52 + %53 * 32768 + %55 * 128: read offset for this (%53, %55) position.
%57 = llvm.mul %53, %3 : i64
%58 = llvm.add %52, %57 : i64
%59 = llvm.mul %55, %4 : i64
%60 = llvm.add %58, %59 : i64
%61 = llvm.mul %11, %6 : i64
%62 = llvm.add %60, %61 : i64
// %66 = %46 + %53 * 256 + %55: write offset for this (%53, %55) position.
%63 = llvm.mul %53, %5 : i64
%64 = llvm.add %46, %63 : i64
%65 = llvm.mul %55, %6 : i64
%66 = llvm.add %64, %65 : i64
llvm.br ^bb4(%20 : i64)
// Loop 3: %67 = 0 .. 3, step 1 (increment in ^bb8); covers the 4 elements of one loop-2 step.
^bb4(%67: i64): // 2 preds: ^bb3, ^bb8
%68 = llvm.icmp "slt" %67, %17 : i64
llvm.cond_br %68, ^bb5, ^bb9
^bb5: // pred: ^bb4
// %73 = %62 + %67 * 128: start of the 128-element run that is reduced for this output element.
%69 = llvm.mul %11, %3 : i64
%70 = llvm.add %62, %69 : i64
%71 = llvm.mul %67, %4 : i64
%72 = llvm.add %70, %71 : i64
%73 = llvm.add %72, %61 : i64
// Enter the reduction loop with counter 0 and an all-zero 4-lane accumulator (%16).
llvm.br ^bb6(%20, %16 : i64, !llvm.array<1 x array<1 x vector<4xi32>>>)
// Loop 4 (reduction): %74 = 0 .. 31, step 1; %75 carries the 4-lane partial sums.
^bb6(%74: i64, %75: !llvm.array<1 x array<1 x vector<4xi32>>>): // 2 preds: ^bb5, ^bb7
%76 = llvm.icmp "slt" %74, %19 : i64
llvm.cond_br %76, ^bb7, ^bb8
^bb7: // pred: ^bb6
// %83 = %73 + %74 * 4: element offset of the next 4-wide chunk (the other added terms are 0).
%77 = llvm.mul %20, %10 : i64
%78 = llvm.add %73, %77 : i64
%79 = llvm.mul %20, %8 : i64
%80 = llvm.add %78, %79 : i64
%81 = llvm.mul %74, %17 : i64
%82 = llvm.add %80, %81 : i64
%83 = llvm.add %82, %20 : i64
// Load 4 x i32 from the first buffer; the load is annotated with 4-byte alignment only.
%84 = llvm.getelementptr %24[%83] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32>
%85 = llvm.bitcast %84 : !llvm.ptr<i32> to !llvm.ptr<vector<4xi32>>
%86 = llvm.load %85 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>>
// Lane-by-lane add of the loaded vector (%86) and the accumulator (%87), rebuilt into %103.
%87 = llvm.extractvalue %75[0, 0] : !llvm.array<1 x array<1 x vector<4xi32>>>
%88 = llvm.extractelement %87[%11 : i64] : vector<4xi32>
%89 = llvm.extractelement %86[%11 : i64] : vector<4xi32>
%90 = llvm.add %89, %88 : i32
%91 = llvm.insertelement %90, %12[%20 : i64] : vector<4xi32>
%92 = llvm.extractelement %87[%6 : i64] : vector<4xi32>
%93 = llvm.extractelement %86[%6 : i64] : vector<4xi32>
%94 = llvm.add %93, %92 : i32
%95 = llvm.insertelement %94, %91[%18 : i64] : vector<4xi32>
%96 = llvm.extractelement %87[%2 : i64] : vector<4xi32>
%97 = llvm.extractelement %86[%2 : i64] : vector<4xi32>
%98 = llvm.add %97, %96 : i32
%99 = llvm.insertelement %98, %95[%13 : i64] : vector<4xi32>
%100 = llvm.extractelement %87[%1 : i64] : vector<4xi32>
%101 = llvm.extractelement %86[%1 : i64] : vector<4xi32>
%102 = llvm.add %101, %100 : i32
%103 = llvm.insertelement %102, %99[%14 : i64] : vector<4xi32>
// Repack the four lanes of %103 into the array<1 x array<1 x vector<4xi32>>> carrier; %116 is the new accumulator.
%104 = llvm.extractelement %103[%11 : i64] : vector<4xi32>
%105 = llvm.extractvalue %16[0, 0] : !llvm.array<1 x array<1 x vector<4xi32>>>
%106 = llvm.insertelement %104, %105[%11 : i64] : vector<4xi32>
%107 = llvm.insertvalue %106, %16[0, 0] : !llvm.array<1 x array<1 x vector<4xi32>>>
%108 = llvm.extractelement %103[%6 : i64] : vector<4xi32>
%109 = llvm.insertelement %108, %106[%6 : i64] : vector<4xi32>
%110 = llvm.insertvalue %109, %107[0, 0] : !llvm.array<1 x array<1 x vector<4xi32>>>
%111 = llvm.extractelement %103[%2 : i64] : vector<4xi32>
%112 = llvm.insertelement %111, %109[%2 : i64] : vector<4xi32>
%113 = llvm.insertvalue %112, %110[0, 0] : !llvm.array<1 x array<1 x vector<4xi32>>>
%114 = llvm.extractelement %103[%1 : i64] : vector<4xi32>
%115 = llvm.insertelement %114, %112[%1 : i64] : vector<4xi32>
%116 = llvm.insertvalue %115, %113[0, 0] : !llvm.array<1 x array<1 x vector<4xi32>>>
%117 = llvm.add %74, %18 : i64
llvm.br ^bb6(%117, %116 : i64, !llvm.array<1 x array<1 x vector<4xi32>>>)
^bb8: // pred: ^bb6
// Reduction finished: address the output element at %66 + %67 in the second buffer.
%118 = llvm.mul %20, %9 : i64
%119 = llvm.add %66, %118 : i64
%120 = llvm.add %119, %67 : i64
%121 = llvm.getelementptr %32[%120] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32>
// The existing output value is read and the horizontal sum is added to it, so the
// stored result depends on whatever the output buffer held before this kernel ran.
%122 = llvm.load %121 : !llvm.ptr<i32>
%123 = llvm.extractvalue %75[0, 0] : !llvm.array<1 x array<1 x vector<4xi32>>>
%124 = "llvm.intr.vector.reduce.add"(%123) : (vector<4xi32>) -> i32
%125 = llvm.add %122, %124 : i32
// Round trip through a vector<1xi32> before the scalar store.
%126 = llvm.insertelement %125, %15[%20 : i64] : vector<1xi32>
%127 = llvm.extractelement %126[%11 : i64] : vector<1xi32>
llvm.store %127, %121 : !llvm.ptr<i32>
%128 = llvm.add %67, %18 : i64
llvm.br ^bb4(%128 : i64)
^bb9: // pred: ^bb4
// Loop 2 latch: advance %55 by 4.
%129 = llvm.add %55, %17 : i64
llvm.br ^bb2(%129 : i64)
^bb10: // pred: ^bb2
// Loop 1 latch: advance %53 by 1.
%130 = llvm.add %53, %18 : i64
llvm.br ^bb1(%130 : i64)
^bb11: // pred: ^bb1
// All loops done; return the i32 constant 0.
llvm.return %0 : i32
}
}
}
}
// Host-side test body. It fills a 67108864-byte buffer with the i32 pattern 1, dispatches
// the executable over that buffer and a 1048576-byte result buffer, fills the upper half
// of the result buffer with the i32 pattern 128, and finally compares the lower half
// against the upper half with check.expect_eq.
// NOTE(review): no fill or other write of %buffer_0[0, 524288) is recorded before the
// dispatch, while the kernel adds into the existing contents of binding 1 - confirm that
// the allocation is guaranteed to be zeroed, otherwise the comparison is not deterministic.
func.func private @_split_reduction_pass2() {
// NOTE(review): 268435488 (0x10000020) is passed as type(...) to hal.buffer_view.create;
// presumably the HAL element-type code for i32 - confirm.
%c268435488_i32 = arith.constant 268435488 : i32
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c-1_i32 = arith.constant -1 : i32
%c-1_i64 = arith.constant -1 : i64
%c1048576 = arith.constant 1048576 : index
%c0 = arith.constant 0 : index
%c524288 = arith.constant 524288 : index
%c67108864 = arith.constant 67108864 : index
%c1_i32 = arith.constant 1 : i32
%c128_i32 = arith.constant 128 : i32
%c512 = arith.constant 512 : index
%c256 = arith.constant 256 : index
%c1 = arith.constant 1 : index
// State prepared by the module initializer.
%_device_query_0 = util.global.load @_device_query_0 : i1
%_pipeline_layout_0 = util.global.load @_pipeline_layout_0 : !hal.pipeline_layout
%_executable__split_reduction_pass2_dispatch_0 = util.global.load @_executable__split_reduction_pass2_dispatch_0 : !hal.executable
%device = hal.ex.shared_device : !hal.device
%allocator = hal.device.allocator<%device : !hal.device> : !hal.allocator
// Input buffer: 67108864 bytes (512 * 256 * 128 i32 elements).
%buffer = hal.allocator.allocate<%allocator : !hal.allocator> type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c67108864}
// First command buffer (transfer only): fill the whole input buffer with the i32 pattern 1.
%cmd = hal.command_buffer.create device(%device : !hal.device) mode("OneShot|AllowInlineExecution") categories(Transfer) : !hal.command_buffer
hal.command_buffer.fill_buffer<%cmd : !hal.command_buffer> target(%buffer : !hal.buffer)[%c0, %c67108864] pattern(%c1_i32 : i32)
hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None")
hal.command_buffer.finalize<%cmd : !hal.command_buffer>
// Null fence used as the wait operand of both queue submissions.
%0 = util.null : !hal.fence
%fence = hal.fence.create device(%device : !hal.device) flags("None") : !hal.fence
hal.device.queue.execute<%device : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) commands([%cmd])
// Wait on the signal fence with timeout -1 and check the returned status.
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.status.check_ok %status, "failed to wait on timepoint"
// The filled buffer is passed through util.do_not_optimize; its length is then queried at runtime.
%1 = util.do_not_optimize(%buffer) : !hal.buffer
%len = hal.buffer.length<%1 : !hal.buffer> : index
// Result buffer: 1048576 bytes = two 524288-byte halves (512 * 256 i32 each), host-visible and mappable.
%buffer_0 = hal.allocator.allocate<%allocator : !hal.allocator> type("HostVisible|DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage|MappingScoped|MappingAccessRandom|Mapping") : !hal.buffer{%c1048576}
%cmd_1 = hal.command_buffer.create device(%device : !hal.device) mode("OneShot|AllowInlineExecution") categories("Transfer|Dispatch") : !hal.command_buffer
// Record the dispatch only if the initializer's format query was true.
cf.cond_br %_device_query_0, ^bb1, ^bb2
^bb1: // pred: ^bb0
// Binding 0 = the input buffer over [0, %len); binding 1 = the whole result buffer.
hal.command_buffer.push_descriptor_set<%cmd_1 : !hal.command_buffer> layout(%_pipeline_layout_0 : !hal.pipeline_layout)[%c0] bindings([
%c0 = (%1 : !hal.buffer)[%c0, %len],
%c1 = (%buffer_0 : !hal.buffer)[%c0, %c1048576]
])
// Entry point ordinal 0 with workgroup count (8, 16, 1).
hal.command_buffer.dispatch<%cmd_1 : !hal.command_buffer> target(%_executable__split_reduction_pass2_dispatch_0 : !hal.executable)[0] workgroups([%c8, %c16, %c1])
// Fill the upper half of the result buffer with the i32 pattern 128 (the reference values).
hal.command_buffer.fill_buffer<%cmd_1 : !hal.command_buffer> target(%buffer_0 : !hal.buffer)[%c524288, %c524288] pattern(%c128_i32 : i32)
hal.command_buffer.execution_barrier<%cmd_1 : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None")
hal.command_buffer.finalize<%cmd_1 : !hal.command_buffer>
%fence_2 = hal.fence.create device(%device : !hal.device) flags("None") : !hal.fence
hal.device.queue.execute<%device : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence_2) commands([%cmd_1])
%status_3 = hal.fence.await until([%fence_2]) timeout_millis(%c-1_i32) : i32
util.status.check_ok %status_3, "failed to wait on timepoint"
// Two 512 x 256 views over the same buffer: lower half (dispatch output) and upper half (fill pattern).
%view = hal.buffer_view.create buffer(%buffer_0 : !hal.buffer)[%c0, %c524288] shape([%c512, %c256]) type(%c268435488_i32) encoding(%c1_i32) : !hal.buffer_view
%view_4 = hal.buffer_view.create buffer(%buffer_0 : !hal.buffer)[%c524288, %c524288] shape([%c512, %c256]) type(%c268435488_i32) encoding(%c1_i32) : !hal.buffer_view
check.expect_eq(%view, %view_4) : !hal.buffer_view
return
^bb2: // pred: ^bb0
// Taken when the format query stored by the initializer is false.
util.unreachable "device not supported in the compiled configuration"
}
}
// -----// IR Dump After FoldGlobals (iree-util-fold-globals) ('builtin.module' operation) //----- //
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
// Module-level state. Each global is stored by the util.initializer in this module.
// Holds the result of the "hal.executable.format" :: "embedded-elf-x86_64" device query.
util.global private @_device_query_0 : i1
// Pipeline layout shared by executable creation and by the descriptor-set push in @_split_reduction_pass2.
util.global private @_pipeline_layout_0 : !hal.pipeline_layout
// Executable handle for the dispatch; the initializer stores null here when the format query is false.
util.global private @_executable__split_reduction_pass2_dispatch_0 : !hal.executable
// Module initializer: queries the device for the embedded-elf-x86_64 executable format,
// builds the descriptor-set and pipeline layouts, and creates the dispatch executable
// (or stores a null handle when the format query is false).
util.initializer {
%device = hal.ex.shared_device : !hal.device
// %value is the queried boolean; the op carries "= false" as its default value.
%ok, %value = hal.device.query<%device : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false
// Two storage-buffer bindings in set 0: binding 0 is ReadOnly, binding 1 has no access restriction.
// The descriptor-set layout is used only to build the pipeline layout; it is not stored to a global here.
%descriptor_set_layout = hal.descriptor_set_layout.create device(%device : !hal.device) flags("None") bindings([#hal.descriptor_set.binding<0, storage_buffer, ReadOnly>, #hal.descriptor_set.binding<1, storage_buffer>]) : !hal.descriptor_set_layout
// No push constants; a single descriptor-set layout.
%pipeline_layout = hal.pipeline_layout.create device(%device : !hal.device) push_constants(0) layouts([%descriptor_set_layout]) : !hal.pipeline_layout
util.global.store %value, @_device_query_0 : i1
util.global.store %pipeline_layout, @_pipeline_layout_0 : !hal.pipeline_layout
// Only create the executable when the device reported the format.
cf.cond_br %value, ^bb1, ^bb2
^bb1: // pred: ^bb0
// Reloads the pipeline layout that was stored just above.
%_pipeline_layout_0 = util.global.load @_pipeline_layout_0 : !hal.pipeline_layout
%exe = hal.executable.create device(%device : !hal.device) target(@_split_reduction_pass2_dispatch_0::@embedded_elf_x86_64) layouts([%_pipeline_layout_0]) : !hal.executable
cf.br ^bb3(%exe : !hal.executable)
^bb2: // pred: ^bb0
// Format not reported: fall through with a null executable handle.
%0 = util.null : !hal.executable
cf.br ^bb3(%0 : !hal.executable)
^bb3(%1: !hal.executable): // 2 preds: ^bb1, ^bb2
// Join point: %1 is either the created executable or null.
util.global.store %1, @_executable__split_reduction_pass2_dispatch_0 : !hal.executable
util.initializer.return
}
// Public entry point (tagged iree.abi.stub): takes no arguments, returns nothing,
// and only forwards to the private @_split_reduction_pass2.
func.func @split_reduction_pass2() attributes {iree.abi.stub} {
call @_split_reduction_pass2() : () -> ()
return
}
hal.executable private @_split_reduction_pass2_dispatch_0 {
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
// Export entry for ordinal 0. The region returns three constant index values,
// (8, 16, 1), and does not read %arg0..%arg3. These match the
// workgroups([%c8, %c16, %c1]) operands on the host-side dispatch.
hal.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 ordinal(0) layout(#pipeline_layout) attributes {translation_info = #translation} {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index):
%c1 = arith.constant 1 : index
%c16 = arith.constant 16 : index
%c8 = arith.constant 8 : index
hal.return %c8, %c16, %c1 : index, index, index
}
builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} {
// Dispatch kernel lowered to the LLVM dialect.
//   %arg0: executable environment struct pointer (not read in this body).
//   %arg1: dispatch state struct pointer; field 10 (ptr<ptr<i8>>) is read to obtain two buffer pointers.
//   %arg2: workgroup state struct pointer; fields 0 and 1 (i32) are read.
// Each invocation walks a 32 x 32 tile of i32 outputs. For every output element it sums
// 128 consecutive i32 inputs (32 loads of vector<4xi32>) and adds that sum to the value
// already stored at the output location. Always returns 0.
// NOTE(review): fields 0/1 of the workgroup state are presumably workgroup id x/y and
// field 10 of the dispatch state presumably the binding pointer table - confirm against
// the IREE executable library header.
llvm.func internal @_split_reduction_pass2_dispatch_0_generic_512x256x128(%arg0: !llvm.ptr<struct<"iree_hal_executable_environment_v0_t", (ptr<i32>, ptr<func<i32 (ptr<func<i32 (ptr<i8>)>>, ptr<i8>)>>, ptr<ptr<func<i32 (ptr<i8>)>>>, struct<"iree_hal_processor_v0_t", (array<8 x i64>)>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>> {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {sym_visibility = "private"} {
// Constant pool. Several values appear twice (once typed i64, once typed index), e.g. 128, 256, 32768.
%0 = llvm.mlir.constant(0 : i32) : i32
%1 = llvm.mlir.constant(3 : i64) : i64
%2 = llvm.mlir.constant(2 : i64) : i64
%3 = llvm.mlir.constant(32768 : i64) : i64
%4 = llvm.mlir.constant(128 : i64) : i64
%5 = llvm.mlir.constant(256 : i64) : i64
%6 = llvm.mlir.constant(1 : i64) : i64
%7 = llvm.mlir.constant(63 : index) : i64
%8 = llvm.mlir.constant(128 : index) : i64
%9 = llvm.mlir.constant(256 : index) : i64
%10 = llvm.mlir.constant(32768 : index) : i64
%11 = llvm.mlir.constant(0 : i64) : i64
%12 = llvm.mlir.constant(dense<0> : vector<4xi32>) : vector<4xi32>
%13 = llvm.mlir.constant(2 : index) : i64
%14 = llvm.mlir.constant(3 : index) : i64
%15 = llvm.mlir.constant(dense<0> : vector<1xi32>) : vector<1xi32>
%16 = llvm.mlir.constant(dense<0> : vector<1x1x4xi32>) : !llvm.array<1 x array<1 x vector<4xi32>>>
%17 = llvm.mlir.constant(4 : index) : i64
%18 = llvm.mlir.constant(1 : index) : i64
%19 = llvm.mlir.constant(32 : index) : i64
%20 = llvm.mlir.constant(0 : index) : i64
// First buffer pointer (entry 0 of dispatch-state field 10), viewed as i32*. Used only for loads below.
%21 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>>
%22 = llvm.extractvalue %21[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>
%23 = llvm.load %22 : !llvm.ptr<ptr<i8>>
%24 = llvm.bitcast %23 : !llvm.ptr<i8> to !llvm.ptr<i32>
// Assume (address & 63) == 0, i.e. the pointer is 64-byte aligned.
%25 = llvm.ptrtoint %24 : !llvm.ptr<i32> to i64
%26 = llvm.and %25, %7 : i64
%27 = llvm.icmp "eq" %26, %20 : i64
"llvm.intr.assume"(%27) : (i1) -> ()
// Second buffer pointer (entry 1 of the same table), viewed as i32*. Loaded from and stored to in ^bb8.
%28 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>>
%29 = llvm.extractvalue %28[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>
%30 = llvm.getelementptr %29[1] : (!llvm.ptr<ptr<i8>>) -> !llvm.ptr<ptr<i8>>
%31 = llvm.load %30 : !llvm.ptr<ptr<i8>>
%32 = llvm.bitcast %31 : !llvm.ptr<i8> to !llvm.ptr<i32>
// Same 64-byte alignment assumption for the second pointer.
%33 = llvm.ptrtoint %32 : !llvm.ptr<i32> to i64
%34 = llvm.and %33, %7 : i64
%35 = llvm.icmp "eq" %34, %20 : i64
"llvm.intr.assume"(%35) : (i1) -> ()
// Workgroup state fields 0 and 1, zero-extended to i64 (%38 and %40).
%36 = llvm.load %arg2 : !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>>
%37 = llvm.extractvalue %36[0] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>
%38 = llvm.zext %37 : i32 to i64
%39 = llvm.extractvalue %36[1] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>
%40 = llvm.zext %39 : i32 to i64
// Tile origin: %41 = field1 * 32, %42 = field0 * 32.
%41 = llvm.mul %40, %19 : i64
%42 = llvm.mul %38, %19 : i64
// %46 = %41 * 256 + %42: base element offset into the second (written) buffer.
%43 = llvm.mul %41, %9 : i64
%44 = llvm.add %43, %20 : i64
%45 = llvm.mul %42, %18 : i64
%46 = llvm.add %44, %45 : i64
// %52 = %41 * 32768 + %42 * 128: base element offset into the first (read) buffer.
%47 = llvm.mul %41, %10 : i64
%48 = llvm.add %47, %20 : i64
%49 = llvm.mul %42, %8 : i64
%50 = llvm.add %48, %49 : i64
%51 = llvm.mul %11, %18 : i64
%52 = llvm.add %50, %51 : i64
llvm.br ^bb1(%20 : i64)
// Loop 1: %53 = 0 .. 31, step 1 (increment in ^bb10).
^bb1(%53: i64): // 2 preds: ^bb0, ^bb10
%54 = llvm.icmp "slt" %53, %19 : i64
llvm.cond_br %54, ^bb2(%20 : i64), ^bb11
// Loop 2: %55 = 0 .. 28, step 4 (increment in ^bb9).
^bb2(%55: i64): // 2 preds: ^bb1, ^bb9
%56 = llvm.icmp "slt" %55, %19 : i64
llvm.cond_br %56, ^bb3, ^bb10
^bb3: // pred: ^bb2
// %62 = %52 + %53 * 32768 + %55 * 128: read offset for this (%53, %55) position.
%57 = llvm.mul %53, %3 : i64
%58 = llvm.add %52, %57 : i64
%59 = llvm.mul %55, %4 : i64
%60 = llvm.add %58, %59 : i64
%61 = llvm.mul %11, %6 : i64
%62 = llvm.add %60, %61 : i64
// %66 = %46 + %53 * 256 + %55: write offset for this (%53, %55) position.
%63 = llvm.mul %53, %5 : i64
%64 = llvm.add %46, %63 : i64
%65 = llvm.mul %55, %6 : i64
%66 = llvm.add %64, %65 : i64
llvm.br ^bb4(%20 : i64)
// Loop 3: %67 = 0 .. 3, step 1 (increment in ^bb8); covers the 4 elements of one loop-2 step.
^bb4(%67: i64): // 2 preds: ^bb3, ^bb8
%68 = llvm.icmp "slt" %67, %17 : i64
llvm.cond_br %68, ^bb5, ^bb9
^bb5: // pred: ^bb4
// %73 = %62 + %67 * 128: start of the 128-element run that is reduced for this output element.
%69 = llvm.mul %11, %3 : i64
%70 = llvm.add %62, %69 : i64
%71 = llvm.mul %67, %4 : i64
%72 = llvm.add %70, %71 : i64
%73 = llvm.add %72, %61 : i64
// Enter the reduction loop with counter 0 and an all-zero 4-lane accumulator (%16).
llvm.br ^bb6(%20, %16 : i64, !llvm.array<1 x array<1 x vector<4xi32>>>)
// Loop 4 (reduction): %74 = 0 .. 31, step 1; %75 carries the 4-lane partial sums.
^bb6(%74: i64, %75: !llvm.array<1 x array<1 x vector<4xi32>>>): // 2 preds: ^bb5, ^bb7
%76 = llvm.icmp "slt" %74, %19 : i64
llvm.cond_br %76, ^bb7, ^bb8
^bb7: // pred: ^bb6
// %83 = %73 + %74 * 4: element offset of the next 4-wide chunk (the other added terms are 0).
%77 = llvm.mul %20, %10 : i64
%78 = llvm.add %73, %77 : i64
%79 = llvm.mul %20, %8 : i64
%80 = llvm.add %78, %79 : i64
%81 = llvm.mul %74, %17 : i64
%82 = llvm.add %80, %81 : i64
%83 = llvm.add %82, %20 : i64
// Load 4 x i32 from the first buffer; the load is annotated with 4-byte alignment only.
%84 = llvm.getelementptr %24[%83] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32>
%85 = llvm.bitcast %84 : !llvm.ptr<i32> to !llvm.ptr<vector<4xi32>>
%86 = llvm.load %85 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>>
// Lane-by-lane add of the loaded vector (%86) and the accumulator (%87), rebuilt into %103.
%87 = llvm.extractvalue %75[0, 0] : !llvm.array<1 x array<1 x vector<4xi32>>>
%88 = llvm.extractelement %87[%11 : i64] : vector<4xi32>
%89 = llvm.extractelement %86[%11 : i64] : vector<4xi32>
%90 = llvm.add %89, %88 : i32
%91 = llvm.insertelement %90, %12[%20 : i64] : vector<4xi32>
%92 = llvm.extractelement %87[%6 : i64] : vector<4xi32>
%93 = llvm.extractelement %86[%6 : i64] : vector<4xi32>
%94 = llvm.add %93, %92 : i32
%95 = llvm.insertelement %94, %91[%18 : i64] : vector<4xi32>
%96 = llvm.extractelement %87[%2 : i64] : vector<4xi32>
%97 = llvm.extractelement %86[%2 : i64] : vector<4xi32>
%98 = llvm.add %97, %96 : i32
%99 = llvm.insertelement %98, %95[%13 : i64] : vector<4xi32>
%100 = llvm.extractelement %87[%1 : i64] : vector<4xi32>
%101 = llvm.extractelement %86[%1 : i64] : vector<4xi32>
%102 = llvm.add %101, %100 : i32
%103 = llvm.insertelement %102, %99[%14 : i64] : vector<4xi32>
// Repack the four lanes of %103 into the array<1 x array<1 x vector<4xi32>>> carrier; %116 is the new accumulator.
%104 = llvm.extractelement %103[%11 : i64] : vector<4xi32>
%105 = llvm.extractvalue %16[0, 0] : !llvm.array<1 x array<1 x vector<4xi32>>>
%106 = llvm.insertelement %104, %105[%11 : i64] : vector<4xi32>
%107 = llvm.insertvalue %106, %16[0, 0] : !llvm.array<1 x array<1 x vector<4xi32>>>
%108 = llvm.extractelement %103[%6 : i64] : vector<4xi32>
%109 = llvm.insertelement %108, %106[%6 : i64] : vector<4xi32>
%110 = llvm.insertvalue %109, %107[0, 0] : !llvm.array<1 x array<1 x vector<4xi32>>>
%111 = llvm.extractelement %103[%2 : i64] : vector<4xi32>
%112 = llvm.insertelement %111, %109[%2 : i64] : vector<4xi32>
%113 = llvm.insertvalue %112, %110[0, 0] : !llvm.array<1 x array<1 x vector<4xi32>>>
%114 = llvm.extractelement %103[%1 : i64] : vector<4xi32>
%115 = llvm.insertelement %114, %112[%1 : i64] : vector<4xi32>
%116 = llvm.insertvalue %115, %113[0, 0] : !llvm.array<1 x array<1 x vector<4xi32>>>
%117 = llvm.add %74, %18 : i64
llvm.br ^bb6(%117, %116 : i64, !llvm.array<1 x array<1 x vector<4xi32>>>)
^bb8: // pred: ^bb6
// Reduction finished: address the output element at %66 + %67 in the second buffer.
%118 = llvm.mul %20, %9 : i64
%119 = llvm.add %66, %118 : i64
%120 = llvm.add %119, %67 : i64
%121 = llvm.getelementptr %32[%120] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32>
// The existing output value is read and the horizontal sum is added to it, so the
// stored result depends on whatever the output buffer held before this kernel ran.
%122 = llvm.load %121 : !llvm.ptr<i32>
%123 = llvm.extractvalue %75[0, 0] : !llvm.array<1 x array<1 x vector<4xi32>>>
%124 = "llvm.intr.vector.reduce.add"(%123) : (vector<4xi32>) -> i32
%125 = llvm.add %122, %124 : i32
// Round trip through a vector<1xi32> before the scalar store.
%126 = llvm.insertelement %125, %15[%20 : i64] : vector<1xi32>
%127 = llvm.extractelement %126[%11 : i64] : vector<1xi32>
llvm.store %127, %121 : !llvm.ptr<i32>
%128 = llvm.add %67, %18 : i64
llvm.br ^bb4(%128 : i64)
^bb9: // pred: ^bb4
// Loop 2 latch: advance %55 by 4.
%129 = llvm.add %55, %17 : i64
llvm.br ^bb2(%129 : i64)
^bb10: // pred: ^bb2
// Loop 1 latch: advance %53 by 1.
%130 = llvm.add %53, %18 : i64
llvm.br ^bb1(%130 : i64)
^bb11: // pred: ^bb1
// All loops done; return the i32 constant 0.
llvm.return %0 : i32
}
}
}
}
// Host-side test body. It fills a 67108864-byte buffer with the i32 pattern 1, dispatches
// the executable over that buffer and a 1048576-byte result buffer, fills the upper half
// of the result buffer with the i32 pattern 128, and finally compares the lower half
// against the upper half with check.expect_eq.
// NOTE(review): no fill or other write of %buffer_0[0, 524288) is recorded before the
// dispatch, while the kernel adds into the existing contents of binding 1 - confirm that
// the allocation is guaranteed to be zeroed, otherwise the comparison is not deterministic.
func.func private @_split_reduction_pass2() {
// NOTE(review): 268435488 (0x10000020) is passed as type(...) to hal.buffer_view.create;
// presumably the HAL element-type code for i32 - confirm.
%c268435488_i32 = arith.constant 268435488 : i32
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c-1_i32 = arith.constant -1 : i32
%c-1_i64 = arith.constant -1 : i64
%c1048576 = arith.constant 1048576 : index
%c0 = arith.constant 0 : index
%c524288 = arith.constant 524288 : index
%c67108864 = arith.constant 67108864 : index
%c1_i32 = arith.constant 1 : i32
%c128_i32 = arith.constant 128 : i32
%c512 = arith.constant 512 : index
%c256 = arith.constant 256 : index
%c1 = arith.constant 1 : index
// State prepared by the module initializer.
%_device_query_0 = util.global.load @_device_query_0 : i1
%_pipeline_layout_0 = util.global.load @_pipeline_layout_0 : !hal.pipeline_layout
%_executable__split_reduction_pass2_dispatch_0 = util.global.load @_executable__split_reduction_pass2_dispatch_0 : !hal.executable
%device = hal.ex.shared_device : !hal.device
%allocator = hal.device.allocator<%device : !hal.device> : !hal.allocator
// Input buffer: 67108864 bytes (512 * 256 * 128 i32 elements).
%buffer = hal.allocator.allocate<%allocator : !hal.allocator> type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c67108864}
// First command buffer (transfer only): fill the whole input buffer with the i32 pattern 1.
%cmd = hal.command_buffer.create device(%device : !hal.device) mode("OneShot|AllowInlineExecution") categories(Transfer) : !hal.command_buffer
hal.command_buffer.fill_buffer<%cmd : !hal.command_buffer> target(%buffer : !hal.buffer)[%c0, %c67108864] pattern(%c1_i32 : i32)
hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None")
hal.command_buffer.finalize<%cmd : !hal.command_buffer>
// Null fence used as the wait operand of both queue submissions.
%0 = util.null : !hal.fence
%fence = hal.fence.create device(%device : !hal.device) flags("None") : !hal.fence
hal.device.queue.execute<%device : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) commands([%cmd])
// Wait on the signal fence with timeout -1 and check the returned status.
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.status.check_ok %status, "failed to wait on timepoint"
// The filled buffer is passed through util.do_not_optimize; its length is then queried at runtime.
%1 = util.do_not_optimize(%buffer) : !hal.buffer
%len = hal.buffer.length<%1 : !hal.buffer> : index
// Result buffer: 1048576 bytes = two 524288-byte halves (512 * 256 i32 each), host-visible and mappable.
%buffer_0 = hal.allocator.allocate<%allocator : !hal.allocator> type("HostVisible|DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage|MappingScoped|MappingAccessRandom|Mapping") : !hal.buffer{%c1048576}
%cmd_1 = hal.command_buffer.create device(%device : !hal.device) mode("OneShot|AllowInlineExecution") categories("Transfer|Dispatch") : !hal.command_buffer
// Record the dispatch only if the initializer's format query was true.
cf.cond_br %_device_query_0, ^bb1, ^bb2
^bb1: // pred: ^bb0
// Binding 0 = the input buffer over [0, %len); binding 1 = the whole result buffer.
hal.command_buffer.push_descriptor_set<%cmd_1 : !hal.command_buffer> layout(%_pipeline_layout_0 : !hal.pipeline_layout)[%c0] bindings([
%c0 = (%1 : !hal.buffer)[%c0, %len],
%c1 = (%buffer_0 : !hal.buffer)[%c0, %c1048576]
])
// Entry point ordinal 0 with workgroup count (8, 16, 1).
hal.command_buffer.dispatch<%cmd_1 : !hal.command_buffer> target(%_executable__split_reduction_pass2_dispatch_0 : !hal.executable)[0] workgroups([%c8, %c16, %c1])
// Fill the upper half of the result buffer with the i32 pattern 128 (the reference values).
hal.command_buffer.fill_buffer<%cmd_1 : !hal.command_buffer> target(%buffer_0 : !hal.buffer)[%c524288, %c524288] pattern(%c128_i32 : i32)
hal.command_buffer.execution_barrier<%cmd_1 : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None")
hal.command_buffer.finalize<%cmd_1 : !hal.command_buffer>
%fence_2 = hal.fence.create device(%device : !hal.device) flags("None") : !hal.fence
hal.device.queue.execute<%device : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence_2) commands([%cmd_1])
%status_3 = hal.fence.await until([%fence_2]) timeout_millis(%c-1_i32) : i32
util.status.check_ok %status_3, "failed to wait on timepoint"
// Two 512 x 256 views over the same buffer: lower half (dispatch output) and upper half (fill pattern).
%view = hal.buffer_view.create buffer(%buffer_0 : !hal.buffer)[%c0, %c524288] shape([%c512, %c256]) type(%c268435488_i32) encoding(%c1_i32) : !hal.buffer_view
%view_4 = hal.buffer_view.create buffer(%buffer_0 : !hal.buffer)[%c524288, %c524288] shape([%c512, %c256]) type(%c268435488_i32) encoding(%c1_i32) : !hal.buffer_view
check.expect_eq(%view, %view_4) : !hal.buffer_view
return
^bb2: // pred: ^bb0
// Taken when the format query stored by the initializer is false.
util.unreachable "device not supported in the compiled configuration"
}
}
// -----// IR Dump After mlir::iree_compiler::IREE::HAL::SerializeTargetExecutablesPass (iree-hal-serialize-target-executables) ('hal.executable' operation: @_split_reduction_pass2_dispatch_0) //----- //
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
// Module-level state: each global is stored by the util.initializer in this module and
// loaded at the top of @_split_reduction_pass2.
// i1 value returned by the "hal.executable.format" device query.
util.global private @_device_query_0 : i1
// Pipeline layout shared by executable creation and descriptor-set binding.
util.global private @_pipeline_layout_0 : !hal.pipeline_layout
// Executable handle for dispatch 0; the initializer stores a null handle when the format query is false.
util.global private @_executable__split_reduction_pass2_dispatch_0 : !hal.executable
// Module initializer: queries the shared device for executable-format support, builds the
// pipeline layout, and stores the query result, the layout, and the executable (or null) into globals.
util.initializer {
%device = hal.ex.shared_device : !hal.device
// %value is the i1 answer for the "embedded-elf-x86_64" format; %ok is not used afterwards.
// NOTE(review): `= false` is presumably the default taken when the query cannot be answered - confirm.
%ok, %value = hal.device.query<%device : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false
// One descriptor set with two storage buffers: binding 0 is marked ReadOnly, binding 1 is not.
%descriptor_set_layout = hal.descriptor_set_layout.create device(%device : !hal.device) flags("None") bindings([#hal.descriptor_set.binding<0, storage_buffer, ReadOnly>, #hal.descriptor_set.binding<1, storage_buffer>]) : !hal.descriptor_set_layout
// Pipeline layout: zero push constants plus the single descriptor set layout created above.
%pipeline_layout = hal.pipeline_layout.create device(%device : !hal.device) push_constants(0) layouts([%descriptor_set_layout]) : !hal.pipeline_layout
util.global.store %value, @_device_query_0 : i1
util.global.store %pipeline_layout, @_pipeline_layout_0 : !hal.pipeline_layout
// Create the executable only when the format query returned true; otherwise fall through to null.
cf.cond_br %value, ^bb1, ^bb2
^bb1: // pred: ^bb0
// Re-loads the layout that was stored just above and uses it to create the executable variant.
%_pipeline_layout_0 = util.global.load @_pipeline_layout_0 : !hal.pipeline_layout
%exe = hal.executable.create device(%device : !hal.device) target(@_split_reduction_pass2_dispatch_0::@embedded_elf_x86_64) layouts([%_pipeline_layout_0]) : !hal.executable
cf.br ^bb3(%exe : !hal.executable)
^bb2: // pred: ^bb0
// Format query was false: forward a null executable handle instead.
%0 = util.null : !hal.executable
cf.br ^bb3(%0 : !hal.executable)
^bb3(%1: !hal.executable): // 2 preds: ^bb1, ^bb2
// Merge point: %1 is either the created executable or null.
util.global.store %1, @_executable__split_reduction_pass2_dispatch_0 : !hal.executable
util.initializer.return
}
// Public entry point carrying the iree.abi.stub attribute; takes no arguments, returns nothing,
// and only forwards to the private @_split_reduction_pass2 implementation.
func.func @split_reduction_pass2() attributes {iree.abi.stub} {
call @_split_reduction_pass2() : () -> ()
return
}
// Serialized executable for dispatch 0: one public binary named @embedded_elf_x86_64, tagged with
// format "embedded-elf-x86_64" and MIME type "application/x-elf".
// NOTE(review): the `data = dense<"...` attribute below has no closing quote or `>`; the payload
// bytes appear to have been stripped from this listing, leaving only the `vector<4616xi8>` type.
// As written the line will not re-parse - restore the payload from the original dump if needed.
hal.executable private @_split_reduction_pass2_dispatch_0 {
hal.executable.binary public @embedded_elf_x86_64 attributes {data = dense<"vector<4616xi8>, format = "embedded-elf-x86_64", mime_type = "application/x-elf"}
}
// Test body after lowering to HAL ops: fills an input buffer with 1s, runs dispatch 0 into the
// first half of a result buffer, fills the second half with the constant 128, and checks that the
// two halves compare equal.
func.func private @_split_reduction_pass2() {
// NOTE(review): 268435488 == 0x10000020, presumably the HAL element-type code for 32-bit integers - confirm.
%c268435488_i32 = arith.constant 268435488 : i32
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c-1_i32 = arith.constant -1 : i32
%c-1_i64 = arith.constant -1 : i64
// 1048576 == 2 * 524288: total size of the result buffer (two equally sized halves).
%c1048576 = arith.constant 1048576 : index
%c0 = arith.constant 0 : index
// 524288 == 512 * 256 * 4: byte size of one 512x256 view of 4-byte elements.
%c524288 = arith.constant 524288 : index
// 67108864 == 512 * 256 * 128 * 4: matches the tensor<512x256x128xi32> input seen at the top of the dump.
%c67108864 = arith.constant 67108864 : index
%c1_i32 = arith.constant 1 : i32
%c128_i32 = arith.constant 128 : i32
%c512 = arith.constant 512 : index
%c256 = arith.constant 256 : index
%c1 = arith.constant 1 : index
// Pick up the values written by the module initializer.
%_device_query_0 = util.global.load @_device_query_0 : i1
%_pipeline_layout_0 = util.global.load @_pipeline_layout_0 : !hal.pipeline_layout
%_executable__split_reduction_pass2_dispatch_0 = util.global.load @_executable__split_reduction_pass2_dispatch_0 : !hal.executable
%device = hal.ex.shared_device : !hal.device
%allocator = hal.device.allocator<%device : !hal.device> : !hal.allocator
// Input buffer, sized %c67108864.
%buffer = hal.allocator.allocate<%allocator : !hal.allocator> type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c67108864}
// First command buffer (Transfer only): fill the whole input buffer with the i32 pattern 1.
%cmd = hal.command_buffer.create device(%device : !hal.device) mode("OneShot|AllowInlineExecution") categories(Transfer) : !hal.command_buffer
hal.command_buffer.fill_buffer<%cmd : !hal.command_buffer> target(%buffer : !hal.buffer)[%c0, %c67108864] pattern(%c1_i32 : i32)
hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None")
hal.command_buffer.finalize<%cmd : !hal.command_buffer>
// Null fence used as the wait operand of both queue submissions in this function.
%0 = util.null : !hal.fence
%fence = hal.fence.create device(%device : !hal.device) flags("None") : !hal.fence
// Submit the fill and wait on its signal fence before continuing.
// NOTE(review): timeout_millis(-1) presumably means "wait without a deadline" - confirm.
hal.device.queue.execute<%device : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) commands([%cmd])
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.status.check_ok %status, "failed to wait on timepoint"
// The filled buffer is routed through do_not_optimize and its length is re-queried from the result.
%1 = util.do_not_optimize(%buffer) : !hal.buffer
%len = hal.buffer.length<%1 : !hal.buffer> : index
// Result buffer (HostVisible, mappable), sized %c1048576; holds the computed and expected halves.
%buffer_0 = hal.allocator.allocate<%allocator : !hal.allocator> type("HostVisible|DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage|MappingScoped|MappingAccessRandom|Mapping") : !hal.buffer{%c1048576}
// Second command buffer (Transfer|Dispatch), recorded only if the device query stored by the initializer is true.
%cmd_1 = hal.command_buffer.create device(%device : !hal.device) mode("OneShot|AllowInlineExecution") categories("Transfer|Dispatch") : !hal.command_buffer
cf.cond_br %_device_query_0, ^bb1, ^bb2
^bb1: // pred: ^bb0
// Set 0: binding 0 = input buffer [0, %len), binding 1 = result buffer [0, 1048576).
// NOTE(review): binding 1 spans the entire result buffer, including the second half that the
// fill_buffer below writes, and no barrier is recorded between the dispatch and that fill.
// Presumably the dispatch only writes the first 524288 bytes - confirm.
hal.command_buffer.push_descriptor_set<%cmd_1 : !hal.command_buffer> layout(%_pipeline_layout_0 : !hal.pipeline_layout)[%c0] bindings([
%c0 = (%1 : !hal.buffer)[%c0, %len],
%c1 = (%buffer_0 : !hal.buffer)[%c0, %c1048576]
])
// Dispatch entry point 0 of the executable with workgroup counts [8, 16, 1].
hal.command_buffer.dispatch<%cmd_1 : !hal.command_buffer> target(%_executable__split_reduction_pass2_dispatch_0 : !hal.executable)[0] workgroups([%c8, %c16, %c1])
// Expected values: fill the second half [524288, +524288) of the result buffer with i32 128.
hal.command_buffer.fill_buffer<%cmd_1 : !hal.command_buffer> target(%buffer_0 : !hal.buffer)[%c524288, %c524288] pattern(%c128_i32 : i32)
hal.command_buffer.execution_barrier<%cmd_1 : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None")
hal.command_buffer.finalize<%cmd_1 : !hal.command_buffer>
%fence_2 = hal.fence.create device(%device : !hal.device) flags("None") : !hal.fence
// Submit and wait for completion before creating views over the result buffer.
hal.device.queue.execute<%device : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence_2) commands([%cmd_1])
%status_3 = hal.fence.await until([%fence_2]) timeout_millis(%c-1_i32) : i32
util.status.check_ok %status_3, "failed to wait on timepoint"
// %view covers the first half and %view_4 the second half; both use shape [512, 256] and the same type/encoding codes.
%view = hal.buffer_view.create buffer(%buffer_0 : !hal.buffer)[%c0, %c524288] shape([%c512, %c256]) type(%c268435488_i32) encoding(%c1_i32) : !hal.buffer_view
%view_4 = hal.buffer_view.create buffer(%buffer_0 : !hal.buffer)[%c524288, %c524288] shape([%c512, %c256]) type(%c268435488_i32) encoding(%c1_i32) : !hal.buffer_view
check.expect_eq(%view, %view_4) : !hal.buffer_view
return
^bb2: // pred: ^bb0
// Taken when the stored device query is false.
util.unreachable "device not supported in the compiled configuration"
}
}
// -----// IR Dump After mlir::iree_compiler::IREE::HAL::SerializeExecutablesPass (iree-hal-serialize-executables) ('hal.executable' operation: @_split_reduction_pass2_dispatch_0) //----- //
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
// Module-level state: each global is stored by the util.initializer in this module and
// loaded at the top of @_split_reduction_pass2.
// i1 value returned by the "hal.executable.format" device query.
util.global private @_device_query_0 : i1
// Pipeline layout shared by executable creation and descriptor-set binding.
util.global private @_pipeline_layout_0 : !hal.pipeline_layout
// Executable handle for dispatch 0; the initializer stores a null handle when the format query is false.
util.global private @_executable__split_reduction_pass2_dispatch_0 : !hal.executable
// Module initializer: queries the shared device for executable-format support, builds the
// pipeline layout, and stores the query result, the layout, and the executable (or null) into globals.
util.initializer {
%device = hal.ex.shared_device : !hal.device
// %value is the i1 answer for the "embedded-elf-x86_64" format; %ok is not used afterwards.
// NOTE(review): `= false` is presumably the default taken when the query cannot be answered - confirm.
%ok, %value = hal.device.query<%device : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false
// One descriptor set with two storage buffers: binding 0 is marked ReadOnly, binding 1 is not.
%descriptor_set_layout = hal.descriptor_set_layout.create device(%device : !hal.device) flags("None") bindings([#hal.descriptor_set.binding<0, storage_buffer, ReadOnly>, #hal.descriptor_set.binding<1, storage_buffer>]) : !hal.descriptor_set_layout
// Pipeline layout: zero push constants plus the single descriptor set layout created above.
%pipeline_layout = hal.pipeline_layout.create device(%device : !hal.device) push_constants(0) layouts([%descriptor_set_layout]) : !hal.pipeline_layout
util.global.store %value, @_device_query_0 : i1
util.global.store %pipeline_layout, @_pipeline_layout_0 : !hal.pipeline_layout
// Create the executable only when the format query returned true; otherwise fall through to null.
cf.cond_br %value, ^bb1, ^bb2
^bb1: // pred: ^bb0
// Re-loads the layout that was stored just above and uses it to create the executable variant.
%_pipeline_layout_0 = util.global.load @_pipeline_layout_0 : !hal.pipeline_layout
%exe = hal.executable.create device(%device : !hal.device) target(@_split_reduction_pass2_dispatch_0::@embedded_elf_x86_64) layouts([%_pipeline_layout_0]) : !hal.executable
cf.br ^bb3(%exe : !hal.executable)
^bb2: // pred: ^bb0
// Format query was false: forward a null executable handle instead.
%0 = util.null : !hal.executable
cf.br ^bb3(%0 : !hal.executable)
^bb3(%1: !hal.executable): // 2 preds: ^bb1, ^bb2
// Merge point: %1 is either the created executable or null.
util.global.store %1, @_executable__split_reduction_pass2_dispatch_0 : !hal.executable
util.initializer.return
}
// Public entry point carrying the iree.abi.stub attribute; takes no arguments, returns nothing,
// and only forwards to the private @_split_reduction_pass2 implementation.
func.func @split_reduction_pass2() attributes {iree.abi.stub} {
call @_split_reduction_pass2() : () -> ()
return
}
// Serialized executable for dispatch 0: one public binary named @embedded_elf_x86_64, tagged with
// format "embedded-elf-x86_64" and MIME type "application/x-elf".
// NOTE(review): the `data = dense<"...` attribute below has no closing quote or `>`; the payload
// bytes appear to have been stripped from this listing, leaving only the `vector<4616xi8>` type.
// As written the line will not re-parse - restore the payload from the original dump if needed.
hal.executable private @_split_reduction_pass2_dispatch_0 {
hal.executable.binary public @embedded_elf_x86_64 attributes {data = dense<"vector<4616xi8>, format = "embedded-elf-x86_64", mime_type = "application/x-elf"}
}
// Test body after lowering to HAL ops: fills an input buffer with 1s, runs dispatch 0 into the
// first half of a result buffer, fills the second half with the constant 128, and checks that the
// two halves compare equal.
func.func private @_split_reduction_pass2() {
// NOTE(review): 268435488 == 0x10000020, presumably the HAL element-type code for 32-bit integers - confirm.
%c268435488_i32 = arith.constant 268435488 : i32
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c-1_i32 = arith.constant -1 : i32
%c-1_i64 = arith.constant -1 : i64
// 1048576 == 2 * 524288: total size of the result buffer (two equally sized halves).
%c1048576 = arith.constant 1048576 : index
%c0 = arith.constant 0 : index
// 524288 == 512 * 256 * 4: byte size of one 512x256 view of 4-byte elements.
%c524288 = arith.constant 524288 : index
// 67108864 == 512 * 256 * 128 * 4: matches the tensor<512x256x128xi32> input seen at the top of the dump.
%c67108864 = arith.constant 67108864 : index
%c1_i32 = arith.constant 1 : i32
%c128_i32 = arith.constant 128 : i32
%c512 = arith.constant 512 : index
%c256 = arith.constant 256 : index
%c1 = arith.constant 1 : index
// Pick up the values written by the module initializer.
%_device_query_0 = util.global.load @_device_query_0 : i1
%_pipeline_layout_0 = util.global.load @_pipeline_layout_0 : !hal.pipeline_layout
%_executable__split_reduction_pass2_dispatch_0 = util.global.load @_executable__split_reduction_pass2_dispatch_0 : !hal.executable
%device = hal.ex.shared_device : !hal.device
%allocator = hal.device.allocator<%device : !hal.device> : !hal.allocator
// Input buffer, sized %c67108864.
%buffer = hal.allocator.allocate<%allocator : !hal.allocator> type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c67108864}
// First command buffer (Transfer only): fill the whole input buffer with the i32 pattern 1.
%cmd = hal.command_buffer.create device(%device : !hal.device) mode("OneShot|AllowInlineExecution") categories(Transfer) : !hal.command_buffer
hal.command_buffer.fill_buffer<%cmd : !hal.command_buffer> target(%buffer : !hal.buffer)[%c0, %c67108864] pattern(%c1_i32 : i32)
hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None")
hal.command_buffer.finalize<%cmd : !hal.command_buffer>
// Null fence used as the wait operand of both queue submissions in this function.
%0 = util.null : !hal.fence
%fence = hal.fence.create device(%device : !hal.device) flags("None") : !hal.fence
// Submit the fill and wait on its signal fence before continuing.
// NOTE(review): timeout_millis(-1) presumably means "wait without a deadline" - confirm.
hal.device.queue.execute<%device : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) commands([%cmd])
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.status.check_ok %status, "failed to wait on timepoint"
// The filled buffer is routed through do_not_optimize and its length is re-queried from the result.
%1 = util.do_not_optimize(%buffer) : !hal.buffer
%len = hal.buffer.length<%1 : !hal.buffer> : index
// Result buffer (HostVisible, mappable), sized %c1048576; holds the computed and expected halves.
%buffer_0 = hal.allocator.allocate<%allocator : !hal.allocator> type("HostVisible|DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage|MappingScoped|MappingAccessRandom|Mapping") : !hal.buffer{%c1048576}
// Second command buffer (Transfer|Dispatch), recorded only if the device query stored by the initializer is true.
%cmd_1 = hal.command_buffer.create device(%device : !hal.device) mode("OneShot|AllowInlineExecution") categories("Transfer|Dispatch") : !hal.command_buffer
cf.cond_br %_device_query_0, ^bb1, ^bb2
^bb1: // pred: ^bb0
// Set 0: binding 0 = input buffer [0, %len), binding 1 = result buffer [0, 1048576).
// NOTE(review): binding 1 spans the entire result buffer, including the second half that the
// fill_buffer below writes, and no barrier is recorded between the dispatch and that fill.
// Presumably the dispatch only writes the first 524288 bytes - confirm.
hal.command_buffer.push_descriptor_set<%cmd_1 : !hal.command_buffer> layout(%_pipeline_layout_0 : !hal.pipeline_layout)[%c0] bindings([
%c0 = (%1 : !hal.buffer)[%c0, %len],
%c1 = (%buffer_0 : !hal.buffer)[%c0, %c1048576]
])
// Dispatch entry point 0 of the executable with workgroup counts [8, 16, 1].
hal.command_buffer.dispatch<%cmd_1 : !hal.command_buffer> target(%_executable__split_reduction_pass2_dispatch_0 : !hal.executable)[0] workgroups([%c8, %c16, %c1])
// Expected values: fill the second half [524288, +524288) of the result buffer with i32 128.
hal.command_buffer.fill_buffer<%cmd_1 : !hal.command_buffer> target(%buffer_0 : !hal.buffer)[%c524288, %c524288] pattern(%c128_i32 : i32)
hal.command_buffer.execution_barrier<%cmd_1 : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None")
hal.command_buffer.finalize<%cmd_1 : !hal.command_buffer>
%fence_2 = hal.fence.create device(%device : !hal.device) flags("None") : !hal.fence
// Submit and wait for completion before creating views over the result buffer.
hal.device.queue.execute<%device : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence_2) commands([%cmd_1])
%status_3 = hal.fence.await until([%fence_2]) timeout_millis(%c-1_i32) : i32
util.status.check_ok %status_3, "failed to wait on timepoint"
// %view covers the first half and %view_4 the second half; both use shape [512, 256] and the same type/encoding codes.
%view = hal.buffer_view.create buffer(%buffer_0 : !hal.buffer)[%c0, %c524288] shape([%c512, %c256]) type(%c268435488_i32) encoding(%c1_i32) : !hal.buffer_view
%view_4 = hal.buffer_view.create buffer(%buffer_0 : !hal.buffer)[%c524288, %c524288] shape([%c512, %c256]) type(%c268435488_i32) encoding(%c1_i32) : !hal.buffer_view
check.expect_eq(%view, %view_4) : !hal.buffer_view
return
^bb2: // pred: ^bb0
// Taken when the stored device query is false.
util.unreachable "device not supported in the compiled configuration"
}
}
// -----// IR Dump After PropagateSubranges (iree-util-propagate-subranges) ('builtin.module' operation) //----- //
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
// Module-level state; each global is stored by the util.initializer in this module.
// i1 value returned by the "hal.executable.format" device query.
util.global private @_device_query_0 : i1
// Pipeline layout created by the initializer and re-loaded when the executable is created.
util.global private @_pipeline_layout_0 : !hal.pipeline_layout
// Executable handle for dispatch 0; the initializer stores a null handle when the format query is false.
util.global private @_executable__split_reduction_pass2_dispatch_0 : !hal.executable
// Module initializer: queries the shared device for executable-format support, builds the
// pipeline layout, and stores the query result, the layout, and the executable (or null) into globals.
util.initializer {
%device = hal.ex.shared_device : !hal.device
// %value is the i1 answer for the "embedded-elf-x86_64" format; %ok is not used afterwards.
// NOTE(review): `= false` is presumably the default taken when the query cannot be answered - confirm.
%ok, %value = hal.device.query<%device : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false
// One descriptor set with two storage buffers: binding 0 is marked ReadOnly, binding 1 is not.
%descriptor_set_layout = hal.descriptor_set_layout.create device(%device : !hal.device) flags("None") bindings([#hal.descriptor_set.binding<0, storage_buffer, ReadOnly>, #hal.descriptor_set.binding<1, storage_buffer>]) : !hal.descriptor_set_layout
// Pipeline layout: zero push constants plus the single descriptor set layout created above.
%pipeline_layout = hal.pipeline_layout.create device(%device : !hal.device) push_constants(0) layouts([%descriptor_set_layout]) : !hal.pipeline_layout
util.global.store %value, @_device_query_0 : i1
util.global.store %pipeline_layout, @_pipeline_layout_0 : !hal.pipeline_layout
// Create the executable only when the format query returned true; otherwise fall through to null.
cf.cond_br %value, ^bb1, ^bb2
^bb1: // pred: ^bb0
// Re-loads the layout that was stored just above and uses it to create the executable variant.
%_pipeline_layout_0 = util.global.load @_pipeline_layout_0 : !hal.pipeline_layout
%exe = hal.executable.create device(%device : !hal.device) target(@_split_reduction_pass2_dispatch_0::@embedded_elf_x86_64) layouts([%_pipeline_layout_0]) : !hal.executable
cf.br ^bb3(%exe : !hal.executable)
^bb2: // pred: ^bb0
// Format query was false: forward a null executable handle instead.
%0 = util.null : !hal.executable
cf.br ^bb3(%0 : !hal.executable)
^bb3(%1: !hal.executable): // 2 preds: ^bb1, ^bb2
// Merge point: %1 is either the created executable or null.
util.global.store %1, @_executable__split_reduction_pass2_dispatch_0 : !hal.executable
util.initializer.return
}
// Public entry point carrying the iree.abi.stub attribute; takes no arguments, returns nothing,
// and only forwards to the private @_split_reduction_pass2 implementation.
func.func @split_reduction_pass2() attributes {iree.abi.stub} {
call @_split_reduction_pass2() : () -> ()
return
}
hal.executable private @_split_reduction_pass2_dispatch_0 {
hal.executable.binary public @embedded_elf_x86_64 attributes {data = dense<"0x
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment