@vmurali
Created September 27, 2022 22:37
This file has been truncated.
// -----// IR Dump After mlir::iree_compiler::IREE::ABI::WrapEntryPointsPass (iree-abi-wrap-entry-points) ('builtin.module' operation) //----- //
#map0 = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1)>
module {
func.func @split_reduction_pass2() attributes {iree.abi.stub} {
call @_split_reduction_pass2() : () -> ()
return
}
func.func private @_split_reduction_pass2() {
%0 = util.unfoldable_constant dense<1> : tensor<512x256x128xi32>
%c0_i32 = arith.constant 0 : i32
%1 = linalg.init_tensor [512, 256] : tensor<512x256xi32>
%2 = linalg.fill ins(%c0_i32 : i32) outs(%1 : tensor<512x256xi32>) -> tensor<512x256xi32>
%3 = linalg.generic {indexing_maps = [#map0, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%0 : tensor<512x256x128xi32>) outs(%1 : tensor<512x256xi32>) {
^bb0(%arg0: i32, %arg1: i32):
%4 = arith.addi %arg0, %arg1 : i32
linalg.yield %4 : i32
} -> tensor<512x256xi32>
check.expect_eq_const(%3, dense<128> : tensor<512x256xi32>) : tensor<512x256xi32>
return
}
}
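// The test above sum-reduces a dense<1> tensor of shape 512x256x128 along its innermost
// (reduction) dimension, so every element of the 512x256 result should equal 128, which is
// exactly what check.expect_eq_const asserts. A minimal NumPy sketch of the same computation
// (an illustration of the semantics only, not part of the dump):
//
//   import numpy as np
//   x = np.ones((512, 256, 128), dtype=np.int32)  # matches util.unfoldable_constant dense<1>
//   y = x.sum(axis=2, dtype=np.int32)             # the linalg.generic reduction over d2
//   assert (y == 128).all()                       # mirrors check.expect_eq_const(dense<128>)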
// -----// IR Dump After Canonicalizer (canonicalize) ('func.func' operation: @_split_reduction_pass2) //----- //
#map0 = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1)>
module {
func.func @split_reduction_pass2() attributes {iree.abi.stub} {
call @_split_reduction_pass2() : () -> ()
return
}
func.func private @_split_reduction_pass2() {
%cst = arith.constant dense<128> : tensor<512x256xi32>
%cst_0 = arith.constant dense<1> : tensor<512x256x128xi32>
%0 = util.do_not_optimize(%cst_0) : tensor<512x256x128xi32>
%1 = linalg.init_tensor [512, 256] : tensor<512x256xi32>
%2 = linalg.generic {indexing_maps = [#map0, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%0 : tensor<512x256x128xi32>) outs(%1 : tensor<512x256xi32>) {
^bb0(%arg0: i32, %arg1: i32):
%3 = arith.addi %arg0, %arg1 : i32
linalg.yield %3 : i32
} -> tensor<512x256xi32>
check.expect_eq(%2, %cst) : tensor<512x256xi32>
return
}
}
// -----// IR Dump After Inliner (inline) ('builtin.module' operation) //----- //
#map0 = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1)>
module {
func.func @split_reduction_pass2() attributes {iree.abi.stub} {
call @_split_reduction_pass2() : () -> ()
return
}
func.func private @_split_reduction_pass2() {
%cst = arith.constant dense<128> : tensor<512x256xi32>
%cst_0 = arith.constant dense<1> : tensor<512x256x128xi32>
%0 = util.do_not_optimize(%cst_0) : tensor<512x256x128xi32>
%1 = linalg.init_tensor [512, 256] : tensor<512x256xi32>
%2 = linalg.generic {indexing_maps = [#map0, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%0 : tensor<512x256x128xi32>) outs(%1 : tensor<512x256xi32>) {
^bb0(%arg0: i32, %arg1: i32):
%3 = arith.addi %arg0, %arg1 : i32
linalg.yield %3 : i32
} -> tensor<512x256xi32>
check.expect_eq(%2, %cst) : tensor<512x256xi32>
return
}
}
// -----// IR Dump After DispatchLinalgOnTensors (iree-flow-dispatch-linalg-on-tensors-pass) ('func.func' operation: @_split_reduction_pass2) //----- //
#map0 = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1)>
module {
func.func @split_reduction_pass2() attributes {iree.abi.stub} {
call @_split_reduction_pass2() : () -> ()
return
}
func.func private @_split_reduction_pass2() {
%c512 = arith.constant 512 : index
%c256 = arith.constant 256 : index
%c1 = arith.constant 1 : index
%cst = arith.constant dense<128> : tensor<512x256xi32>
%cst_0 = arith.constant dense<1> : tensor<512x256x128xi32>
%0 = util.do_not_optimize(%cst_0) : tensor<512x256x128xi32>
%1 = flow.dispatch.workgroups[%c512, %c256, %c1](%0) : (tensor<512x256x128xi32>) -> tensor<512x256xi32> =
(%arg0: !flow.dispatch.tensor<readonly:512x256x128xi32>, %arg1: !flow.dispatch.tensor<writeonly:512x256xi32>) {
%2 = flow.dispatch.tensor.load %arg0, offsets = [0, 0, 0], sizes = [512, 256, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:512x256x128xi32> -> tensor<512x256x128xi32>
%3 = linalg.init_tensor [512, 256] : tensor<512x256xi32>
%4 = linalg.generic {indexing_maps = [#map0, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%2 : tensor<512x256x128xi32>) outs(%3 : tensor<512x256xi32>) {
^bb0(%arg2: i32, %arg3: i32):
%5 = arith.addi %arg2, %arg3 : i32
linalg.yield %5 : i32
} -> tensor<512x256xi32>
flow.dispatch.tensor.store %4, %arg1, offsets = [0, 0], sizes = [512, 256], strides = [1, 1] : tensor<512x256xi32> -> !flow.dispatch.tensor<writeonly:512x256xi32>
flow.return
} count(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2
flow.return %x, %y, %z : index, index, index
}
check.expect_eq(%1, %cst) : tensor<512x256xi32>
return
}
}
// -----// IR Dump After OutlineDispatchRegions (iree-flow-outline-dispatch-regions) ('builtin.module' operation) //----- //
#map0 = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1)>
module {
func.func @split_reduction_pass2() attributes {iree.abi.stub} {
call @_split_reduction_pass2() : () -> ()
return
}
flow.executable private @_split_reduction_pass2_dispatch_0 {
flow.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2
flow.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @_split_reduction_pass2_dispatch_0_generic_512x256x128(%arg0: !flow.dispatch.tensor<readonly:512x256x128xi32>, %arg1: !flow.dispatch.tensor<writeonly:512x256xi32>) {
%0 = flow.dispatch.tensor.load %arg0, offsets = [0, 0, 0], sizes = [512, 256, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:512x256x128xi32> -> tensor<512x256x128xi32>
%1 = linalg.init_tensor [512, 256] : tensor<512x256xi32>
%2 = linalg.generic {indexing_maps = [#map0, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%0 : tensor<512x256x128xi32>) outs(%1 : tensor<512x256xi32>) {
^bb0(%arg2: i32, %arg3: i32):
%3 = arith.addi %arg2, %arg3 : i32
linalg.yield %3 : i32
} -> tensor<512x256xi32>
flow.dispatch.tensor.store %2, %arg1, offsets = [0, 0], sizes = [512, 256], strides = [1, 1] : tensor<512x256xi32> -> !flow.dispatch.tensor<writeonly:512x256xi32>
return
}
}
}
func.func private @_split_reduction_pass2() {
%c512 = arith.constant 512 : index
%c256 = arith.constant 256 : index
%c1 = arith.constant 1 : index
%cst = arith.constant dense<128> : tensor<512x256xi32>
%cst_0 = arith.constant dense<1> : tensor<512x256x128xi32>
%0 = util.do_not_optimize(%cst_0) : tensor<512x256x128xi32>
%1 = flow.dispatch @_split_reduction_pass2_dispatch_0::@_split_reduction_pass2_dispatch_0_generic_512x256x128[%c512, %c256, %c1](%0) : (tensor<512x256x128xi32>) -> tensor<512x256xi32>
check.expect_eq(%1, %cst) : tensor<512x256xi32>
return
}
}
// -----// IR Dump After ConvertToStream (iree-stream-conversion) ('builtin.module' operation) //----- //
#map0 = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1)>
module {
func.func @split_reduction_pass2() attributes {iree.abi.stub} {
call @_split_reduction_pass2() : () -> ()
return
}
stream.executable private @_split_reduction_pass2_dispatch_0 {
stream.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @_split_reduction_pass2_dispatch_0_generic_512x256x128(%arg0: !stream.binding, %arg1: !stream.binding) {
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:512x256x128xi32>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:512x256xi32>
%2 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [512, 256, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:512x256x128xi32> -> tensor<512x256x128xi32>
%3 = linalg.init_tensor [512, 256] : tensor<512x256xi32>
%4 = linalg.generic {indexing_maps = [#map0, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%2 : tensor<512x256x128xi32>) outs(%3 : tensor<512x256xi32>) {
^bb0(%arg2: i32, %arg3: i32):
%5 = arith.addi %arg2, %arg3 : i32
linalg.yield %5 : i32
} -> tensor<512x256xi32>
flow.dispatch.tensor.store %4, %1, offsets = [0, 0], sizes = [512, 256], strides = [1, 1] : tensor<512x256xi32> -> !flow.dispatch.tensor<writeonly:512x256xi32>
return
}
}
}
func.func private @_split_reduction_pass2() {
%c512 = arith.constant 512 : index
%c256 = arith.constant 256 : index
%c1 = arith.constant 1 : index
%cst = stream.tensor.constant : tensor<512x256xi32> in !stream.resource<constant> = dense<128> : tensor<512x256xi32>
%0 = stream.resource.size %cst : !stream.resource<constant>
%1 = stream.async.transfer %cst : !stream.resource<constant>{%0} -> !stream.resource<*>{%0}
%cst_0 = stream.tensor.constant : tensor<512x256x128xi32> in !stream.resource<constant> = dense<1> : tensor<512x256x128xi32>
%2 = stream.resource.size %cst_0 : !stream.resource<constant>
%3 = stream.async.transfer %cst_0 : !stream.resource<constant>{%2} -> !stream.resource<*>{%2}
%4 = util.do_not_optimize(%3) : !stream.resource<*>
%5 = stream.resource.size %4 : !stream.resource<*>
%6 = stream.tensor.sizeof tensor<512x256xi32> : index
%7 = stream.async.dispatch @_split_reduction_pass2_dispatch_0::@_split_reduction_pass2_dispatch_0_generic_512x256x128[%c512, %c256, %c1](%4) : (!stream.resource<*>{%5}) -> !stream.resource<*>{%6}
%8 = stream.async.transfer %7 : !stream.resource<*>{%6} -> !stream.resource<external>{%6}
%9 = stream.tensor.export %8 : tensor<512x256xi32> in !stream.resource<external>{%6} -> tensor<512x256xi32>
%10 = stream.async.transfer %1 : !stream.resource<*>{%0} -> !stream.resource<external>{%0}
%11 = stream.tensor.export %10 : tensor<512x256xi32> in !stream.resource<external>{%0} -> tensor<512x256xi32>
check.expect_eq(%9, %11) : tensor<512x256xi32>
return
}
}
// -----// IR Dump After Canonicalizer (canonicalize) ('func.func' operation: @_split_reduction_pass2) //----- //
#map0 = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1)>
module {
func.func @split_reduction_pass2() attributes {iree.abi.stub} {
call @_split_reduction_pass2() : () -> ()
return
}
stream.executable private @_split_reduction_pass2_dispatch_0 {
stream.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @_split_reduction_pass2_dispatch_0_generic_512x256x128(%arg0: !stream.binding, %arg1: !stream.binding) {
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:512x256x128xi32>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:512x256xi32>
%2 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [512, 256, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:512x256x128xi32> -> tensor<512x256x128xi32>
%3 = linalg.init_tensor [512, 256] : tensor<512x256xi32>
%4 = linalg.generic {indexing_maps = [#map0, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%2 : tensor<512x256x128xi32>) outs(%3 : tensor<512x256xi32>) {
^bb0(%arg2: i32, %arg3: i32):
%5 = arith.addi %arg2, %arg3 : i32
linalg.yield %5 : i32
} -> tensor<512x256xi32>
flow.dispatch.tensor.store %4, %1, offsets = [0, 0], sizes = [512, 256], strides = [1, 1] : tensor<512x256xi32> -> !flow.dispatch.tensor<writeonly:512x256xi32>
return
}
}
}
func.func private @_split_reduction_pass2() {
%c1_i32 = arith.constant 1 : i32
%c128_i32 = arith.constant 128 : i32
%c512 = arith.constant 512 : index
%c256 = arith.constant 256 : index
%c1 = arith.constant 1 : index
%0 = stream.tensor.sizeof tensor<512x256xi32> : index
%1 = stream.tensor.splat %c128_i32 : i32 -> tensor<512x256xi32> in !stream.resource<*>{%0}
%2 = stream.tensor.sizeof tensor<512x256x128xi32> : index
%3 = stream.tensor.splat %c1_i32 : i32 -> tensor<512x256x128xi32> in !stream.resource<*>{%2}
%4 = util.do_not_optimize(%3) : !stream.resource<*>
%5 = stream.resource.size %4 : !stream.resource<*>
%6 = stream.tensor.sizeof tensor<512x256xi32> : index
%7 = stream.async.dispatch @_split_reduction_pass2_dispatch_0::@_split_reduction_pass2_dispatch_0_generic_512x256x128[%c512, %c256, %c1](%4) : (!stream.resource<*>{%5}) -> !stream.resource<*>{%6}
%8 = stream.async.transfer %7 : !stream.resource<*>{%6} -> !stream.resource<external>{%6}
%9 = stream.tensor.export %8 : tensor<512x256xi32> in !stream.resource<external>{%6} -> tensor<512x256xi32>
%10 = stream.async.transfer %1 : !stream.resource<*>{%0} -> !stream.resource<external>{%0}
%11 = stream.tensor.export %10 : tensor<512x256xi32> in !stream.resource<external>{%0} -> tensor<512x256xi32>
check.expect_eq(%9, %11) : tensor<512x256xi32>
return
}
}
// -----// IR Dump After CSE (cse) ('func.func' operation: @_split_reduction_pass2) //----- //
#map0 = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1)>
module {
func.func @split_reduction_pass2() attributes {iree.abi.stub} {
call @_split_reduction_pass2() : () -> ()
return
}
stream.executable private @_split_reduction_pass2_dispatch_0 {
stream.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @_split_reduction_pass2_dispatch_0_generic_512x256x128(%arg0: !stream.binding, %arg1: !stream.binding) {
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:512x256x128xi32>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:512x256xi32>
%2 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [512, 256, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:512x256x128xi32> -> tensor<512x256x128xi32>
%3 = linalg.init_tensor [512, 256] : tensor<512x256xi32>
%4 = linalg.generic {indexing_maps = [#map0, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%2 : tensor<512x256x128xi32>) outs(%3 : tensor<512x256xi32>) {
^bb0(%arg2: i32, %arg3: i32):
%5 = arith.addi %arg2, %arg3 : i32
linalg.yield %5 : i32
} -> tensor<512x256xi32>
flow.dispatch.tensor.store %4, %1, offsets = [0, 0], sizes = [512, 256], strides = [1, 1] : tensor<512x256xi32> -> !flow.dispatch.tensor<writeonly:512x256xi32>
return
}
}
}
func.func private @_split_reduction_pass2() {
%c1_i32 = arith.constant 1 : i32
%c128_i32 = arith.constant 128 : i32
%c512 = arith.constant 512 : index
%c256 = arith.constant 256 : index
%c1 = arith.constant 1 : index
%0 = stream.tensor.sizeof tensor<512x256xi32> : index
%1 = stream.tensor.splat %c128_i32 : i32 -> tensor<512x256xi32> in !stream.resource<*>{%0}
%2 = stream.tensor.sizeof tensor<512x256x128xi32> : index
%3 = stream.tensor.splat %c1_i32 : i32 -> tensor<512x256x128xi32> in !stream.resource<*>{%2}
%4 = util.do_not_optimize(%3) : !stream.resource<*>
%5 = stream.resource.size %4 : !stream.resource<*>
%6 = stream.async.dispatch @_split_reduction_pass2_dispatch_0::@_split_reduction_pass2_dispatch_0_generic_512x256x128[%c512, %c256, %c1](%4) : (!stream.resource<*>{%5}) -> !stream.resource<*>{%0}
%7 = stream.async.transfer %6 : !stream.resource<*>{%0} -> !stream.resource<external>{%0}
%8 = stream.tensor.export %7 : tensor<512x256xi32> in !stream.resource<external>{%0} -> tensor<512x256xi32>
%9 = stream.async.transfer %1 : !stream.resource<*>{%0} -> !stream.resource<external>{%0}
%10 = stream.tensor.export %9 : tensor<512x256xi32> in !stream.resource<external>{%0} -> tensor<512x256xi32>
check.expect_eq(%8, %10) : tensor<512x256xi32>
return
}
}
// -----// IR Dump After EncodeHostTensors (iree-stream-encode-host-tensors) ('func.func' operation: @_split_reduction_pass2) //----- //
#map0 = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1)>
module {
func.func @split_reduction_pass2() attributes {iree.abi.stub} {
call @_split_reduction_pass2() : () -> ()
return
}
stream.executable private @_split_reduction_pass2_dispatch_0 {
stream.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @_split_reduction_pass2_dispatch_0_generic_512x256x128(%arg0: !stream.binding, %arg1: !stream.binding) {
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:512x256x128xi32>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:512x256xi32>
%2 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [512, 256, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:512x256x128xi32> -> tensor<512x256x128xi32>
%3 = linalg.init_tensor [512, 256] : tensor<512x256xi32>
%4 = linalg.generic {indexing_maps = [#map0, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%2 : tensor<512x256x128xi32>) outs(%3 : tensor<512x256xi32>) {
^bb0(%arg2: i32, %arg3: i32):
%5 = arith.addi %arg2, %arg3 : i32
linalg.yield %5 : i32
} -> tensor<512x256xi32>
flow.dispatch.tensor.store %4, %1, offsets = [0, 0], sizes = [512, 256], strides = [1, 1] : tensor<512x256xi32> -> !flow.dispatch.tensor<writeonly:512x256xi32>
return
}
}
}
func.func private @_split_reduction_pass2() {
%c524288 = arith.constant 524288 : index
%c67108864 = arith.constant 67108864 : index
%c1_i32 = arith.constant 1 : i32
%c128_i32 = arith.constant 128 : i32
%c512 = arith.constant 512 : index
%c256 = arith.constant 256 : index
%c1 = arith.constant 1 : index
%0 = stream.async.splat %c128_i32 : i32 -> !stream.resource<*>{%c524288}
%1 = stream.async.splat %c1_i32 : i32 -> !stream.resource<*>{%c67108864}
%2 = util.do_not_optimize(%1) : !stream.resource<*>
%3 = stream.resource.size %2 : !stream.resource<*>
%4 = stream.async.dispatch @_split_reduction_pass2_dispatch_0::@_split_reduction_pass2_dispatch_0_generic_512x256x128[%c512, %c256, %c1](%2) : (!stream.resource<*>{%3}) -> !stream.resource<*>{%c524288}
%5 = stream.async.transfer %4 : !stream.resource<*>{%c524288} -> !stream.resource<external>{%c524288}
%6 = stream.tensor.export %5 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
%7 = stream.async.transfer %0 : !stream.resource<*>{%c524288} -> !stream.resource<external>{%c524288}
%8 = stream.tensor.export %7 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
check.expect_eq(%6, %8) : tensor<512x256xi32>
return
}
}
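// The index constants introduced by iree-stream-encode-host-tensors above are the tensor
// storage sizes in bytes: 512*256 i32 elements at 4 bytes each is 524288, and 512*256*128
// elements is 67108864. A quick sanity check of that arithmetic (illustration only):
//
//   assert 512 * 256 * 4 == 524288           # tensor<512x256xi32> result / expected splat
//   assert 512 * 256 * 128 * 4 == 67108864   # tensor<512x256x128xi32> input splat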
// -----// IR Dump After Canonicalizer (canonicalize) ('func.func' operation: @_split_reduction_pass2) //----- //
#map0 = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1)>
module {
func.func @split_reduction_pass2() attributes {iree.abi.stub} {
call @_split_reduction_pass2() : () -> ()
return
}
stream.executable private @_split_reduction_pass2_dispatch_0 {
stream.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @_split_reduction_pass2_dispatch_0_generic_512x256x128(%arg0: !stream.binding, %arg1: !stream.binding) {
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:512x256x128xi32>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:512x256xi32>
%2 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [512, 256, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:512x256x128xi32> -> tensor<512x256x128xi32>
%3 = linalg.init_tensor [512, 256] : tensor<512x256xi32>
%4 = linalg.generic {indexing_maps = [#map0, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%2 : tensor<512x256x128xi32>) outs(%3 : tensor<512x256xi32>) {
^bb0(%arg2: i32, %arg3: i32):
%5 = arith.addi %arg2, %arg3 : i32
linalg.yield %5 : i32
} -> tensor<512x256xi32>
flow.dispatch.tensor.store %4, %1, offsets = [0, 0], sizes = [512, 256], strides = [1, 1] : tensor<512x256xi32> -> !flow.dispatch.tensor<writeonly:512x256xi32>
return
}
}
}
func.func private @_split_reduction_pass2() {
%c524288 = arith.constant 524288 : index
%c67108864 = arith.constant 67108864 : index
%c1_i32 = arith.constant 1 : i32
%c128_i32 = arith.constant 128 : i32
%c512 = arith.constant 512 : index
%c256 = arith.constant 256 : index
%c1 = arith.constant 1 : index
%0 = stream.async.splat %c1_i32 : i32 -> !stream.resource<*>{%c67108864}
%1 = util.do_not_optimize(%0) : !stream.resource<*>
%2 = stream.resource.size %1 : !stream.resource<*>
%3 = stream.async.dispatch @_split_reduction_pass2_dispatch_0::@_split_reduction_pass2_dispatch_0_generic_512x256x128[%c512, %c256, %c1](%1) : (!stream.resource<*>{%2}) -> !stream.resource<*>{%c524288}
%4 = stream.async.transfer %3 : !stream.resource<*>{%c524288} -> !stream.resource<external>{%c524288}
%5 = stream.tensor.export %4 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
%6 = stream.async.splat %c128_i32 : i32 -> !stream.resource<*>{%c524288}
%7 = stream.async.transfer %6 : !stream.resource<*>{%c524288} -> !stream.resource<external>{%c524288}
%8 = stream.tensor.export %7 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
check.expect_eq(%5, %8) : tensor<512x256xi32>
return
}
}
// -----// IR Dump After RefineUsage (iree-stream-refine-usage) ('builtin.module' operation) //----- //
#map0 = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1)>
module {
func.func @split_reduction_pass2() attributes {iree.abi.stub} {
call @_split_reduction_pass2() : () -> ()
return
}
stream.executable private @_split_reduction_pass2_dispatch_0 {
stream.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @_split_reduction_pass2_dispatch_0_generic_512x256x128(%arg0: !stream.binding, %arg1: !stream.binding) {
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:512x256x128xi32>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:512x256xi32>
%2 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [512, 256, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:512x256x128xi32> -> tensor<512x256x128xi32>
%3 = linalg.init_tensor [512, 256] : tensor<512x256xi32>
%4 = linalg.generic {indexing_maps = [#map0, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%2 : tensor<512x256x128xi32>) outs(%3 : tensor<512x256xi32>) {
^bb0(%arg2: i32, %arg3: i32):
%5 = arith.addi %arg2, %arg3 : i32
linalg.yield %5 : i32
} -> tensor<512x256xi32>
flow.dispatch.tensor.store %4, %1, offsets = [0, 0], sizes = [512, 256], strides = [1, 1] : tensor<512x256xi32> -> !flow.dispatch.tensor<writeonly:512x256xi32>
return
}
}
}
func.func private @_split_reduction_pass2() {
%c524288 = arith.constant 524288 : index
%c67108864 = arith.constant 67108864 : index
%c1_i32 = arith.constant 1 : i32
%c128_i32 = arith.constant 128 : i32
%c512 = arith.constant 512 : index
%c256 = arith.constant 256 : index
%c1 = arith.constant 1 : index
%0 = stream.async.splat %c1_i32 : i32 -> !stream.resource<transient>{%c67108864}
%1 = util.do_not_optimize(%0) : !stream.resource<transient>
%2 = stream.resource.size %1 : !stream.resource<transient>
%3 = stream.async.dispatch @_split_reduction_pass2_dispatch_0::@_split_reduction_pass2_dispatch_0_generic_512x256x128[%c512, %c256, %c1](%1) : (!stream.resource<transient>{%2}) -> !stream.resource<external>{%c524288}
%4 = stream.tensor.export %3 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
%5 = stream.async.splat %c128_i32 : i32 -> !stream.resource<external>{%c524288}
%6 = stream.tensor.export %5 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
check.expect_eq(%4, %6) : tensor<512x256xi32>
return
}
}
// -----// IR Dump After ScheduleExecution (iree-stream-schedule-execution) ('func.func' operation: @_split_reduction_pass2) //----- //
#map0 = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1)>
module {
func.func @split_reduction_pass2() attributes {iree.abi.stub} {
call @_split_reduction_pass2() : () -> ()
return
}
stream.executable private @_split_reduction_pass2_dispatch_0 {
stream.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @_split_reduction_pass2_dispatch_0_generic_512x256x128(%arg0: !stream.binding, %arg1: !stream.binding) {
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:512x256x128xi32>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:512x256xi32>
%2 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [512, 256, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:512x256x128xi32> -> tensor<512x256x128xi32>
%3 = linalg.init_tensor [512, 256] : tensor<512x256xi32>
%4 = linalg.generic {indexing_maps = [#map0, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%2 : tensor<512x256x128xi32>) outs(%3 : tensor<512x256xi32>) {
^bb0(%arg2: i32, %arg3: i32):
%5 = arith.addi %arg2, %arg3 : i32
linalg.yield %5 : i32
} -> tensor<512x256xi32>
flow.dispatch.tensor.store %4, %1, offsets = [0, 0], sizes = [512, 256], strides = [1, 1] : tensor<512x256xi32> -> !flow.dispatch.tensor<writeonly:512x256xi32>
return
}
}
}
func.func private @_split_reduction_pass2() {
%c524288 = arith.constant 524288 : index
%c67108864 = arith.constant 67108864 : index
%c1_i32 = arith.constant 1 : i32
%c128_i32 = arith.constant 128 : i32
%c512 = arith.constant 512 : index
%c256 = arith.constant 256 : index
%c1 = arith.constant 1 : index
%results, %result_timepoint = stream.async.execute with() -> !stream.resource<transient>{%c67108864} {
%6 = stream.async.splat %c1_i32 : i32 -> !stream.resource<transient>{%c67108864}
stream.yield %6 : !stream.resource<transient>{%c67108864}
} => !stream.timepoint
%0 = stream.timepoint.await %result_timepoint => %results : !stream.resource<transient>{%c67108864}
%1 = util.do_not_optimize(%0) : !stream.resource<transient>
%2 = stream.resource.size %1 : !stream.resource<transient>
%results_0:2, %result_timepoint_1 = stream.async.execute with(%1 as %arg0: !stream.resource<transient>{%2}) -> (!stream.resource<external>{%c524288}, !stream.resource<external>{%c524288}) {
%6 = stream.async.dispatch @_split_reduction_pass2_dispatch_0::@_split_reduction_pass2_dispatch_0_generic_512x256x128[%c512, %c256, %c1](%arg0) : (!stream.resource<transient>{%2}) -> !stream.resource<external>{%c524288}
%7 = stream.async.splat %c128_i32 : i32 -> !stream.resource<external>{%c524288}
stream.yield %6, %7 : !stream.resource<external>{%c524288}, !stream.resource<external>{%c524288}
} => !stream.timepoint
%3:2 = stream.timepoint.await %result_timepoint_1 => %results_0#0, %results_0#1 : !stream.resource<external>{%c524288}, !stream.resource<external>{%c524288}
%4 = stream.tensor.export %3#0 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
%5 = stream.tensor.export %3#1 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
check.expect_eq(%4, %5) : tensor<512x256xi32>
return
}
}
// -----// IR Dump After ScheduleConcurrency (iree-stream-schedule-concurrency) ('func.func' operation: @_split_reduction_pass2) //----- //
#map0 = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1)>
module {
func.func @split_reduction_pass2() attributes {iree.abi.stub} {
call @_split_reduction_pass2() : () -> ()
return
}
stream.executable private @_split_reduction_pass2_dispatch_0 {
stream.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @_split_reduction_pass2_dispatch_0_generic_512x256x128(%arg0: !stream.binding, %arg1: !stream.binding) {
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:512x256x128xi32>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:512x256xi32>
%2 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [512, 256, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:512x256x128xi32> -> tensor<512x256x128xi32>
%3 = linalg.init_tensor [512, 256] : tensor<512x256xi32>
%4 = linalg.generic {indexing_maps = [#map0, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%2 : tensor<512x256x128xi32>) outs(%3 : tensor<512x256xi32>) {
^bb0(%arg2: i32, %arg3: i32):
%5 = arith.addi %arg2, %arg3 : i32
linalg.yield %5 : i32
} -> tensor<512x256xi32>
flow.dispatch.tensor.store %4, %1, offsets = [0, 0], sizes = [512, 256], strides = [1, 1] : tensor<512x256xi32> -> !flow.dispatch.tensor<writeonly:512x256xi32>
return
}
}
}
func.func private @_split_reduction_pass2() {
%c524288 = arith.constant 524288 : index
%c67108864 = arith.constant 67108864 : index
%c1_i32 = arith.constant 1 : i32
%c128_i32 = arith.constant 128 : i32
%c512 = arith.constant 512 : index
%c256 = arith.constant 256 : index
%c1 = arith.constant 1 : index
%results, %result_timepoint = stream.async.execute with() -> !stream.resource<transient>{%c67108864} {
%6 = stream.async.splat %c1_i32 : i32 -> !stream.resource<transient>{%c67108864}
stream.yield %6 : !stream.resource<transient>{%c67108864}
} => !stream.timepoint
%0 = stream.timepoint.await %result_timepoint => %results : !stream.resource<transient>{%c67108864}
%1 = util.do_not_optimize(%0) : !stream.resource<transient>
%2 = stream.resource.size %1 : !stream.resource<transient>
%results_0:2, %result_timepoint_1 = stream.async.execute with(%1 as %arg0: !stream.resource<transient>{%2}) -> (!stream.resource<external>{%c524288}, !stream.resource<external>{%c524288}) {
%6:2 = stream.async.concurrent with(%arg0 as %arg1: !stream.resource<transient>{%2}) -> (!stream.resource<external>{%c524288}, !stream.resource<external>{%c524288}) {
%7 = stream.async.dispatch @_split_reduction_pass2_dispatch_0::@_split_reduction_pass2_dispatch_0_generic_512x256x128[%c512, %c256, %c1](%arg1) : (!stream.resource<transient>{%2}) -> !stream.resource<external>{%c524288}
%8 = stream.async.splat %c128_i32 : i32 -> !stream.resource<external>{%c524288}
stream.yield %7, %8 : !stream.resource<external>{%c524288}, !stream.resource<external>{%c524288}
}
stream.yield %6#0, %6#1 : !stream.resource<external>{%c524288}, !stream.resource<external>{%c524288}
} => !stream.timepoint
%3:2 = stream.timepoint.await %result_timepoint_1 => %results_0#0, %results_0#1 : !stream.resource<external>{%c524288}, !stream.resource<external>{%c524288}
%4 = stream.tensor.export %3#0 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
%5 = stream.tensor.export %3#1 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
check.expect_eq(%4, %5) : tensor<512x256xi32>
return
}
}
// -----// IR Dump After PropagateTimepoints (iree-stream-propagate-timepoints) ('builtin.module' operation) //----- //
#map0 = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1)>
module {
func.func @split_reduction_pass2() attributes {iree.abi.stub} {
call @_split_reduction_pass2() : () -> ()
return
}
stream.executable private @_split_reduction_pass2_dispatch_0 {
stream.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @_split_reduction_pass2_dispatch_0_generic_512x256x128(%arg0: !stream.binding, %arg1: !stream.binding) {
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:512x256x128xi32>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:512x256xi32>
%2 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [512, 256, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:512x256x128xi32> -> tensor<512x256x128xi32>
%3 = linalg.init_tensor [512, 256] : tensor<512x256xi32>
%4 = linalg.generic {indexing_maps = [#map0, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%2 : tensor<512x256x128xi32>) outs(%3 : tensor<512x256xi32>) {
^bb0(%arg2: i32, %arg3: i32):
%5 = arith.addi %arg2, %arg3 : i32
linalg.yield %5 : i32
} -> tensor<512x256xi32>
flow.dispatch.tensor.store %4, %1, offsets = [0, 0], sizes = [512, 256], strides = [1, 1] : tensor<512x256xi32> -> !flow.dispatch.tensor<writeonly:512x256xi32>
return
}
}
}
func.func private @_split_reduction_pass2() {
%c524288 = arith.constant 524288 : index
%c67108864 = arith.constant 67108864 : index
%c1_i32 = arith.constant 1 : i32
%c128_i32 = arith.constant 128 : i32
%c512 = arith.constant 512 : index
%c256 = arith.constant 256 : index
%c1 = arith.constant 1 : index
%results, %result_timepoint = stream.async.execute with() -> !stream.resource<transient>{%c67108864} {
%8 = stream.async.splat %c1_i32 : i32 -> !stream.resource<transient>{%c67108864}
stream.yield %8 : !stream.resource<transient>{%c67108864}
} => !stream.timepoint
%0 = stream.timepoint.await %result_timepoint => %results : !stream.resource<transient>{%c67108864}
%1 = util.do_not_optimize(%0) : !stream.resource<transient>
%2 = stream.resource.size %1 : !stream.resource<transient>
%3 = stream.timepoint.immediate => !stream.timepoint
%4 = stream.timepoint.immediate => !stream.timepoint
%results_0:2, %result_timepoint_1 = stream.async.execute await(%4) => with(%1 as %arg0: !stream.resource<transient>{%2}) -> (!stream.resource<external>{%c524288}, !stream.resource<external>{%c524288}) {
%8:2 = stream.async.concurrent with(%arg0 as %arg1: !stream.resource<transient>{%2}) -> (!stream.resource<external>{%c524288}, !stream.resource<external>{%c524288}) {
%9 = stream.async.dispatch @_split_reduction_pass2_dispatch_0::@_split_reduction_pass2_dispatch_0_generic_512x256x128[%c512, %c256, %c1](%arg1) : (!stream.resource<transient>{%2}) -> !stream.resource<external>{%c524288}
%10 = stream.async.splat %c128_i32 : i32 -> !stream.resource<external>{%c524288}
stream.yield %9, %10 : !stream.resource<external>{%c524288}, !stream.resource<external>{%c524288}
}
stream.yield %8#0, %8#1 : !stream.resource<external>{%c524288}, !stream.resource<external>{%c524288}
} => !stream.timepoint
%5:2 = stream.timepoint.await %result_timepoint_1 => %results_0#0, %results_0#1 : !stream.resource<external>{%c524288}, !stream.resource<external>{%c524288}
%6 = stream.tensor.export %5#0 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
%7 = stream.tensor.export %5#1 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
check.expect_eq(%6, %7) : tensor<512x256xi32>
return
}
}
// -----// IR Dump After Canonicalizer (canonicalize) ('func.func' operation: @_split_reduction_pass2) //----- //
#map0 = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1)>
module {
func.func @split_reduction_pass2() attributes {iree.abi.stub} {
call @_split_reduction_pass2() : () -> ()
return
}
stream.executable private @_split_reduction_pass2_dispatch_0 {
stream.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @_split_reduction_pass2_dispatch_0_generic_512x256x128(%arg0: !stream.binding, %arg1: !stream.binding) {
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:512x256x128xi32>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:512x256xi32>
%2 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [512, 256, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:512x256x128xi32> -> tensor<512x256x128xi32>
%3 = linalg.init_tensor [512, 256] : tensor<512x256xi32>
%4 = linalg.generic {indexing_maps = [#map0, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%2 : tensor<512x256x128xi32>) outs(%3 : tensor<512x256xi32>) {
^bb0(%arg2: i32, %arg3: i32):
%5 = arith.addi %arg2, %arg3 : i32
linalg.yield %5 : i32
} -> tensor<512x256xi32>
flow.dispatch.tensor.store %4, %1, offsets = [0, 0], sizes = [512, 256], strides = [1, 1] : tensor<512x256xi32> -> !flow.dispatch.tensor<writeonly:512x256xi32>
return
}
}
}
func.func private @_split_reduction_pass2() {
%c524288 = arith.constant 524288 : index
%c67108864 = arith.constant 67108864 : index
%c1_i32 = arith.constant 1 : i32
%c128_i32 = arith.constant 128 : i32
%c512 = arith.constant 512 : index
%c256 = arith.constant 256 : index
%c1 = arith.constant 1 : index
%results, %result_timepoint = stream.async.execute with() -> !stream.resource<transient>{%c67108864} {
%6 = stream.async.splat %c1_i32 : i32 -> !stream.resource<transient>{%c67108864}
stream.yield %6 : !stream.resource<transient>{%c67108864}
} => !stream.timepoint
%0 = stream.timepoint.await %result_timepoint => %results : !stream.resource<transient>{%c67108864}
%1 = util.do_not_optimize(%0) : !stream.resource<transient>
%2 = stream.resource.size %1 : !stream.resource<transient>
%results_0:2, %result_timepoint_1 = stream.async.execute with(%1 as %arg0: !stream.resource<transient>{%2}) -> (!stream.resource<external>{%c524288}, !stream.resource<external>{%c524288}) {
%6:2 = stream.async.concurrent with(%arg0 as %arg1: !stream.resource<transient>{%2}) -> (!stream.resource<external>{%c524288}, !stream.resource<external>{%c524288}) {
%7 = stream.async.dispatch @_split_reduction_pass2_dispatch_0::@_split_reduction_pass2_dispatch_0_generic_512x256x128[%c512, %c256, %c1](%arg1) : (!stream.resource<transient>{%2}) -> !stream.resource<external>{%c524288}
%8 = stream.async.splat %c128_i32 : i32 -> !stream.resource<external>{%c524288}
stream.yield %7, %8 : !stream.resource<external>{%c524288}, !stream.resource<external>{%c524288}
}
stream.yield %6#0, %6#1 : !stream.resource<external>{%c524288}, !stream.resource<external>{%c524288}
} => !stream.timepoint
%3:2 = stream.timepoint.await %result_timepoint_1 => %results_0#0, %results_0#1 : !stream.resource<external>{%c524288}, !stream.resource<external>{%c524288}
%4 = stream.tensor.export %3#0 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
%5 = stream.tensor.export %3#1 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
check.expect_eq(%4, %5) : tensor<512x256xi32>
return
}
}
// -----// IR Dump After ScheduleAllocation (iree-stream-schedule-allocation) ('func.func' operation: @_split_reduction_pass2) //----- //
#map0 = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1)>
module {
func.func @split_reduction_pass2() attributes {iree.abi.stub} {
call @_split_reduction_pass2() : () -> ()
return
}
stream.executable private @_split_reduction_pass2_dispatch_0 {
stream.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @_split_reduction_pass2_dispatch_0_generic_512x256x128(%arg0: !stream.binding, %arg1: !stream.binding) {
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:512x256x128xi32>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:512x256xi32>
%2 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [512, 256, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:512x256x128xi32> -> tensor<512x256x128xi32>
%3 = linalg.init_tensor [512, 256] : tensor<512x256xi32>
%4 = linalg.generic {indexing_maps = [#map0, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%2 : tensor<512x256x128xi32>) outs(%3 : tensor<512x256xi32>) {
^bb0(%arg2: i32, %arg3: i32):
%5 = arith.addi %arg2, %arg3 : i32
linalg.yield %5 : i32
} -> tensor<512x256xi32>
flow.dispatch.tensor.store %4, %1, offsets = [0, 0], sizes = [512, 256], strides = [1, 1] : tensor<512x256xi32> -> !flow.dispatch.tensor<writeonly:512x256xi32>
return
}
}
}
func.func private @_split_reduction_pass2() {
%c524288 = arith.constant 524288 : index
%c67108864 = arith.constant 67108864 : index
%c1_i32 = arith.constant 1 : i32
%c128_i32 = arith.constant 128 : i32
%c512 = arith.constant 512 : index
%c256 = arith.constant 256 : index
%c1 = arith.constant 1 : index
%0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c67108864}
%c0 = arith.constant 0 : index
%1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c67108864}) {
stream.cmd.fill %c1_i32, %arg0[%c0 for %c67108864] : i32 -> !stream.resource<transient>{%c67108864}
} => !stream.timepoint
%2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c67108864}
%3 = util.do_not_optimize(%2) : !stream.resource<transient>
%4 = stream.resource.size %3 : !stream.resource<transient>
%c0_0 = arith.constant 0 : index
%5:2 = stream.resource.alloc uninitialized : !stream.resource<external>{%c524288}, !stream.resource<external>{%c524288}
%6 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%4}, %5#0 as %arg1: !stream.resource<external>{%c524288}, %5#1 as %arg2: !stream.resource<external>{%c524288}) {
stream.cmd.concurrent {
stream.cmd.dispatch @_split_reduction_pass2_dispatch_0::@_split_reduction_pass2_dispatch_0_generic_512x256x128[%c512, %c256, %c1] {
ro %arg0[%c0_0 for %4] : !stream.resource<transient>{%4},
wo %arg1[%c0_0 for %c524288] : !stream.resource<external>{%c524288}
}
stream.cmd.fill %c128_i32, %arg2[%c0_0 for %c524288] : i32 -> !stream.resource<external>{%c524288}
}
} => !stream.timepoint
%7:2 = stream.timepoint.await %6 => %5#0, %5#1 : !stream.resource<external>{%c524288}, !stream.resource<external>{%c524288}
%8 = stream.tensor.export %7#0 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
%9 = stream.tensor.export %7#1 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
check.expect_eq(%8, %9) : tensor<512x256xi32>
return
}
}
// -----// IR Dump After PackAllocations (iree-stream-pack-allocations) ('func.func' operation: @_split_reduction_pass2) //----- //
#map0 = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1)>
module {
func.func @split_reduction_pass2() attributes {iree.abi.stub} {
call @_split_reduction_pass2() : () -> ()
return
}
stream.executable private @_split_reduction_pass2_dispatch_0 {
stream.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @_split_reduction_pass2_dispatch_0_generic_512x256x128(%arg0: !stream.binding, %arg1: !stream.binding) {
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:512x256x128xi32>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:512x256xi32>
%2 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [512, 256, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:512x256x128xi32> -> tensor<512x256x128xi32>
%3 = linalg.init_tensor [512, 256] : tensor<512x256xi32>
%4 = linalg.generic {indexing_maps = [#map0, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%2 : tensor<512x256x128xi32>) outs(%3 : tensor<512x256xi32>) {
^bb0(%arg2: i32, %arg3: i32):
%5 = arith.addi %arg2, %arg3 : i32
linalg.yield %5 : i32
} -> tensor<512x256xi32>
flow.dispatch.tensor.store %4, %1, offsets = [0, 0], sizes = [512, 256], strides = [1, 1] : tensor<512x256xi32> -> !flow.dispatch.tensor<writeonly:512x256xi32>
return
}
}
}
func.func private @_split_reduction_pass2() {
%c524288 = arith.constant 524288 : index
%c67108864 = arith.constant 67108864 : index
%c1_i32 = arith.constant 1 : i32
%c128_i32 = arith.constant 128 : i32
%c512 = arith.constant 512 : index
%c256 = arith.constant 256 : index
%c1 = arith.constant 1 : index
%0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c67108864}
%c0 = arith.constant 0 : index
%1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c67108864}) {
stream.cmd.fill %c1_i32, %arg0[%c0 for %c67108864] : i32 -> !stream.resource<transient>{%c67108864}
} => !stream.timepoint
%2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c67108864}
%3 = util.do_not_optimize(%2) : !stream.resource<transient>
%4 = stream.resource.size %3 : !stream.resource<transient>
%c0_0 = arith.constant 0 : index
%5:3 = stream.resource.pack slices({
[0, 0] = %c524288,
[0, 0] = %c524288
}) : index
%6 = stream.resource.alloc uninitialized : !stream.resource<external>{%5#0}
%7 = stream.resource.subview %6[%5#1] : !stream.resource<external>{%5#0} -> !stream.resource<external>{%c524288}
%8 = stream.resource.subview %6[%5#2] : !stream.resource<external>{%5#0} -> !stream.resource<external>{%c524288}
%9 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%4}, %7 as %arg1: !stream.resource<external>{%c524288}, %8 as %arg2: !stream.resource<external>{%c524288}) {
stream.cmd.concurrent {
stream.cmd.dispatch @_split_reduction_pass2_dispatch_0::@_split_reduction_pass2_dispatch_0_generic_512x256x128[%c512, %c256, %c1] {
ro %arg0[%c0_0 for %4] : !stream.resource<transient>{%4},
wo %arg1[%c0_0 for %c524288] : !stream.resource<external>{%c524288}
}
stream.cmd.fill %c128_i32, %arg2[%c0_0 for %c524288] : i32 -> !stream.resource<external>{%c524288}
}
} => !stream.timepoint
%10:2 = stream.timepoint.await %9 => %7, %8 : !stream.resource<external>{%c524288}, !stream.resource<external>{%c524288}
%11 = stream.tensor.export %10#0 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
%12 = stream.tensor.export %10#1 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
check.expect_eq(%11, %12) : tensor<512x256xi32>
return
}
}
// -----// IR Dump After LayoutSlices (iree-stream-layout-slices) ('func.func' operation: @_split_reduction_pass2) //----- //
#map0 = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1)>
module {
func.func @split_reduction_pass2() attributes {iree.abi.stub} {
call @_split_reduction_pass2() : () -> ()
return
}
stream.executable private @_split_reduction_pass2_dispatch_0 {
stream.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @_split_reduction_pass2_dispatch_0_generic_512x256x128(%arg0: !stream.binding, %arg1: !stream.binding) {
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:512x256x128xi32>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:512x256xi32>
%2 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [512, 256, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:512x256x128xi32> -> tensor<512x256x128xi32>
%3 = linalg.init_tensor [512, 256] : tensor<512x256xi32>
%4 = linalg.generic {indexing_maps = [#map0, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%2 : tensor<512x256x128xi32>) outs(%3 : tensor<512x256xi32>) {
^bb0(%arg2: i32, %arg3: i32):
%5 = arith.addi %arg2, %arg3 : i32
linalg.yield %5 : i32
} -> tensor<512x256xi32>
flow.dispatch.tensor.store %4, %1, offsets = [0, 0], sizes = [512, 256], strides = [1, 1] : tensor<512x256xi32> -> !flow.dispatch.tensor<writeonly:512x256xi32>
return
}
}
}
func.func private @_split_reduction_pass2() {
%c524288 = arith.constant 524288 : index
%c67108864 = arith.constant 67108864 : index
%c1_i32 = arith.constant 1 : i32
%c128_i32 = arith.constant 128 : i32
%c512 = arith.constant 512 : index
%c256 = arith.constant 256 : index
%c1 = arith.constant 1 : index
%0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c67108864}
%c0 = arith.constant 0 : index
%1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c67108864}) {
stream.cmd.fill %c1_i32, %arg0[%c0 for %c67108864] : i32 -> !stream.resource<transient>{%c67108864}
} => !stream.timepoint
%2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c67108864}
%3 = util.do_not_optimize(%2) : !stream.resource<transient>
%4 = stream.resource.size %3 : !stream.resource<transient>
%c0_0 = arith.constant 0 : index
%c0_1 = arith.constant 0 : index
%c524288_2 = arith.constant 524288 : index
%c524288_3 = arith.constant 524288 : index
%c1048576 = arith.constant 1048576 : index
%c1048576_4 = arith.constant 1048576 : index
%5 = stream.resource.alloc uninitialized : !stream.resource<external>{%c1048576_4}
%6 = stream.resource.subview %5[%c0_1] : !stream.resource<external>{%c1048576_4} -> !stream.resource<external>{%c524288}
%7 = stream.resource.subview %5[%c524288_3] : !stream.resource<external>{%c1048576_4} -> !stream.resource<external>{%c524288}
%8 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%4}, %6 as %arg1: !stream.resource<external>{%c524288}, %7 as %arg2: !stream.resource<external>{%c524288}) {
stream.cmd.concurrent {
stream.cmd.dispatch @_split_reduction_pass2_dispatch_0::@_split_reduction_pass2_dispatch_0_generic_512x256x128[%c512, %c256, %c1] {
ro %arg0[%c0_0 for %4] : !stream.resource<transient>{%4},
wo %arg1[%c0_0 for %c524288] : !stream.resource<external>{%c524288}
}
stream.cmd.fill %c128_i32, %arg2[%c0_0 for %c524288] : i32 -> !stream.resource<external>{%c524288}
}
} => !stream.timepoint
%9:2 = stream.timepoint.await %8 => %6, %7 : !stream.resource<external>{%c524288}, !stream.resource<external>{%c524288}
%10 = stream.tensor.export %9#0 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
%11 = stream.tensor.export %9#1 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
check.expect_eq(%10, %11) : tensor<512x256xi32>
return
}
}
// -----// IR Dump After PropagateSubranges (iree-util-propagate-subranges) ('builtin.module' operation) //----- //
#map0 = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1)>
module {
func.func @split_reduction_pass2() attributes {iree.abi.stub} {
call @_split_reduction_pass2() : () -> ()
return
}
stream.executable private @_split_reduction_pass2_dispatch_0 {
stream.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @_split_reduction_pass2_dispatch_0_generic_512x256x128(%arg0: !stream.binding, %arg1: !stream.binding) {
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:512x256x128xi32>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:512x256xi32>
%2 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [512, 256, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:512x256x128xi32> -> tensor<512x256x128xi32>
%3 = linalg.init_tensor [512, 256] : tensor<512x256xi32>
%4 = linalg.generic {indexing_maps = [#map0, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%2 : tensor<512x256x128xi32>) outs(%3 : tensor<512x256xi32>) {
^bb0(%arg2: i32, %arg3: i32):
%5 = arith.addi %arg2, %arg3 : i32
linalg.yield %5 : i32
} -> tensor<512x256xi32>
flow.dispatch.tensor.store %4, %1, offsets = [0, 0], sizes = [512, 256], strides = [1, 1] : tensor<512x256xi32> -> !flow.dispatch.tensor<writeonly:512x256xi32>
return
}
}
}
func.func private @_split_reduction_pass2() {
%c0 = arith.constant 0 : index
%c524288 = arith.constant 524288 : index
%c67108864 = arith.constant 67108864 : index
%c1_i32 = arith.constant 1 : i32
%c128_i32 = arith.constant 128 : i32
%c512 = arith.constant 512 : index
%c256 = arith.constant 256 : index
%c1 = arith.constant 1 : index
%0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c67108864}
%c0_0 = arith.constant 0 : index
%1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c67108864}) {
stream.cmd.fill %c1_i32, %arg0[%c0_0 for %c67108864] : i32 -> !stream.resource<transient>{%c67108864}
} => !stream.timepoint
%2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c67108864}
%3 = util.do_not_optimize(%2) : !stream.resource<transient>
%4 = stream.resource.size %3 : !stream.resource<transient>
%c0_1 = arith.constant 0 : index
%c0_2 = arith.constant 0 : index
%c524288_3 = arith.constant 524288 : index
%c524288_4 = arith.constant 524288 : index
%c1048576 = arith.constant 1048576 : index
%c1048576_5 = arith.constant 1048576 : index
%5 = stream.resource.alloc uninitialized : !stream.resource<external>{%c1048576_5}
%6 = stream.resource.subview %5[%c0_2] : !stream.resource<external>{%c1048576_5} -> !stream.resource<external>{%c524288}
%7 = stream.resource.subview %5[%c524288_4] : !stream.resource<external>{%c1048576_5} -> !stream.resource<external>{%c524288}
%8 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%4}, %6 as %arg1: !stream.resource<external>{%c524288}, %7 as %arg2: !stream.resource<external>{%c524288}) {
stream.cmd.concurrent {
stream.cmd.dispatch @_split_reduction_pass2_dispatch_0::@_split_reduction_pass2_dispatch_0_generic_512x256x128[%c512, %c256, %c1] {
ro %arg0[%c0_1 for %4] : !stream.resource<transient>{%4},
wo %arg1[%c0_1 for %c524288] : !stream.resource<external>{%c524288}
}
stream.cmd.fill %c128_i32, %arg2[%c0_1 for %c524288] : i32 -> !stream.resource<external>{%c524288}
}
} => !stream.timepoint
%9:2 = stream.timepoint.await %8 => %6, %7 : !stream.resource<external>{%c524288}, !stream.resource<external>{%c524288}
%10 = stream.tensor.export %9#0 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
%11 = stream.tensor.export %9#1 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
check.expect_eq(%10, %11) : tensor<512x256xi32>
return
}
}
// -----// IR Dump After Canonicalizer (canonicalize) ('func.func' operation: @_split_reduction_pass2) //----- //
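// Note: relative to the previous dump, the duplicated index constants are deduplicated and the pre-execution subviews of the 1048576-byte external buffer are folded into plain offsets (the fill now targets %arg1[%c524288]); the two 524288-byte subviews are taken only after the timepoint await.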
#map0 = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1)>
module {
func.func @split_reduction_pass2() attributes {iree.abi.stub} {
call @_split_reduction_pass2() : () -> ()
return
}
stream.executable private @_split_reduction_pass2_dispatch_0 {
stream.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @_split_reduction_pass2_dispatch_0_generic_512x256x128(%arg0: !stream.binding, %arg1: !stream.binding) {
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:512x256x128xi32>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:512x256xi32>
%2 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [512, 256, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:512x256x128xi32> -> tensor<512x256x128xi32>
%3 = linalg.init_tensor [512, 256] : tensor<512x256xi32>
%4 = linalg.generic {indexing_maps = [#map0, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%2 : tensor<512x256x128xi32>) outs(%3 : tensor<512x256xi32>) {
^bb0(%arg2: i32, %arg3: i32):
%5 = arith.addi %arg2, %arg3 : i32
linalg.yield %5 : i32
} -> tensor<512x256xi32>
flow.dispatch.tensor.store %4, %1, offsets = [0, 0], sizes = [512, 256], strides = [1, 1] : tensor<512x256xi32> -> !flow.dispatch.tensor<writeonly:512x256xi32>
return
}
}
}
func.func private @_split_reduction_pass2() {
%c1048576 = arith.constant 1048576 : index
%c0 = arith.constant 0 : index
%c524288 = arith.constant 524288 : index
%c67108864 = arith.constant 67108864 : index
%c1_i32 = arith.constant 1 : i32
%c128_i32 = arith.constant 128 : i32
%c512 = arith.constant 512 : index
%c256 = arith.constant 256 : index
%c1 = arith.constant 1 : index
%0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c67108864}
%1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c67108864}) {
stream.cmd.fill %c1_i32, %arg0[%c0 for %c67108864] : i32 -> !stream.resource<transient>{%c67108864}
} => !stream.timepoint
%2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c67108864}
%3 = util.do_not_optimize(%2) : !stream.resource<transient>
%4 = stream.resource.size %3 : !stream.resource<transient>
%5 = stream.resource.alloc uninitialized : !stream.resource<external>{%c1048576}
%6 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%4}, %5 as %arg1: !stream.resource<external>{%c1048576}) {
stream.cmd.concurrent {
stream.cmd.dispatch @_split_reduction_pass2_dispatch_0::@_split_reduction_pass2_dispatch_0_generic_512x256x128[%c512, %c256, %c1] {
ro %arg0[%c0 for %4] : !stream.resource<transient>{%4},
wo %arg1[%c0 for %c524288] : !stream.resource<external>{%c1048576}
}
stream.cmd.fill %c128_i32, %arg1[%c524288 for %c524288] : i32 -> !stream.resource<external>{%c1048576}
}
} => !stream.timepoint
%7 = stream.timepoint.await %6 => %5 : !stream.resource<external>{%c1048576}
%8 = stream.resource.subview %7[%c0] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288}
%9 = stream.resource.subview %7[%c524288] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288}
%10 = stream.tensor.export %8 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
%11 = stream.tensor.export %9 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
check.expect_eq(%10, %11) : tensor<512x256xi32>
return
}
}
// -----// IR Dump After FuseDispatchBindings (iree-stream-fuse-dispatch-bindings) ('builtin.module' operation) //----- //
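// Note: the dispatch entry point now takes two extra index operands carrying per-binding byte offsets (both zero here), which are added to the binding subspans inside the executable; the writable binding range is widened to the full 1048576-byte resource.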
#map0 = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1)>
module {
func.func @split_reduction_pass2() attributes {iree.abi.stub} {
call @_split_reduction_pass2() : () -> ()
return
}
stream.executable private @_split_reduction_pass2_dispatch_0 {
stream.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @_split_reduction_pass2_dispatch_0_generic_512x256x128(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: index, %arg3: index) {
%c0 = arith.constant 0 : index
%0 = arith.addi %c0, %arg2 : index
%1 = stream.binding.subspan %arg0[%0] : !stream.binding -> !flow.dispatch.tensor<readonly:512x256x128xi32>
%2 = arith.addi %c0, %arg3 : index
%3 = stream.binding.subspan %arg1[%2] : !stream.binding -> !flow.dispatch.tensor<writeonly:512x256xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [512, 256, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:512x256x128xi32> -> tensor<512x256x128xi32>
%5 = linalg.init_tensor [512, 256] : tensor<512x256xi32>
%6 = linalg.generic {indexing_maps = [#map0, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%4 : tensor<512x256x128xi32>) outs(%5 : tensor<512x256xi32>) {
^bb0(%arg4: i32, %arg5: i32):
%7 = arith.addi %arg4, %arg5 : i32
linalg.yield %7 : i32
} -> tensor<512x256xi32>
flow.dispatch.tensor.store %6, %3, offsets = [0, 0], sizes = [512, 256], strides = [1, 1] : tensor<512x256xi32> -> !flow.dispatch.tensor<writeonly:512x256xi32>
return
}
}
}
func.func private @_split_reduction_pass2() {
%c1048576 = arith.constant 1048576 : index
%c0 = arith.constant 0 : index
%c524288 = arith.constant 524288 : index
%c67108864 = arith.constant 67108864 : index
%c1_i32 = arith.constant 1 : i32
%c128_i32 = arith.constant 128 : i32
%c512 = arith.constant 512 : index
%c256 = arith.constant 256 : index
%c1 = arith.constant 1 : index
%0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c67108864}
%1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c67108864}) {
stream.cmd.fill %c1_i32, %arg0[%c0 for %c67108864] : i32 -> !stream.resource<transient>{%c67108864}
} => !stream.timepoint
%2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c67108864}
%3 = util.do_not_optimize(%2) : !stream.resource<transient>
%4 = stream.resource.size %3 : !stream.resource<transient>
%5 = stream.resource.alloc uninitialized : !stream.resource<external>{%c1048576}
%c0_0 = arith.constant 0 : index
%6 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%4}, %5 as %arg1: !stream.resource<external>{%c1048576}) {
stream.cmd.concurrent {
stream.cmd.dispatch @_split_reduction_pass2_dispatch_0::@_split_reduction_pass2_dispatch_0_generic_512x256x128[%c512, %c256, %c1](%c0, %c0 : index, index) {
ro %arg0[%c0_0 for %4] : !stream.resource<transient>{%4},
wo %arg1[%c0_0 for %c1048576] : !stream.resource<external>{%c1048576}
}
stream.cmd.fill %c128_i32, %arg1[%c524288 for %c524288] : i32 -> !stream.resource<external>{%c1048576}
}
} => !stream.timepoint
%7 = stream.timepoint.await %6 => %5 : !stream.resource<external>{%c1048576}
%8 = stream.resource.subview %7[%c0] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288}
%9 = stream.resource.subview %7[%c524288] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288}
%10 = stream.tensor.export %8 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
%11 = stream.tensor.export %9 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
check.expect_eq(%10, %11) : tensor<512x256xi32>
return
}
}
// -----// IR Dump After PackDispatchOperands (iree-stream-pack-dispatch-operands) ('builtin.module' operation) //----- //
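// Note: the index dispatch operands are packed as i32 values; the executable converts them back with arith.index_cast, and the caller now passes %c0_i32 constants instead of index constants.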
#map0 = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1)>
module {
func.func @split_reduction_pass2() attributes {iree.abi.stub} {
call @_split_reduction_pass2() : () -> ()
return
}
stream.executable private @_split_reduction_pass2_dispatch_0 {
stream.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @_split_reduction_pass2_dispatch_0_generic_512x256x128(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: i32, %arg3: i32) {
%0 = arith.index_cast %arg2 : i32 to index
%1 = arith.index_cast %arg3 : i32 to index
%c0 = arith.constant 0 : index
%2 = arith.addi %c0, %0 : index
%3 = stream.binding.subspan %arg0[%2] : !stream.binding -> !flow.dispatch.tensor<readonly:512x256x128xi32>
%4 = arith.addi %c0, %1 : index
%5 = stream.binding.subspan %arg1[%4] : !stream.binding -> !flow.dispatch.tensor<writeonly:512x256xi32>
%6 = flow.dispatch.tensor.load %3, offsets = [0, 0, 0], sizes = [512, 256, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:512x256x128xi32> -> tensor<512x256x128xi32>
%7 = linalg.init_tensor [512, 256] : tensor<512x256xi32>
%8 = linalg.generic {indexing_maps = [#map0, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%6 : tensor<512x256x128xi32>) outs(%7 : tensor<512x256xi32>) {
^bb0(%arg4: i32, %arg5: i32):
%9 = arith.addi %arg4, %arg5 : i32
linalg.yield %9 : i32
} -> tensor<512x256xi32>
flow.dispatch.tensor.store %8, %5, offsets = [0, 0], sizes = [512, 256], strides = [1, 1] : tensor<512x256xi32> -> !flow.dispatch.tensor<writeonly:512x256xi32>
return
}
}
}
func.func private @_split_reduction_pass2() {
%c1048576 = arith.constant 1048576 : index
%c0 = arith.constant 0 : index
%c524288 = arith.constant 524288 : index
%c67108864 = arith.constant 67108864 : index
%c1_i32 = arith.constant 1 : i32
%c128_i32 = arith.constant 128 : i32
%c512 = arith.constant 512 : index
%c256 = arith.constant 256 : index
%c1 = arith.constant 1 : index
%0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c67108864}
%1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c67108864}) {
stream.cmd.fill %c1_i32, %arg0[%c0 for %c67108864] : i32 -> !stream.resource<transient>{%c67108864}
} => !stream.timepoint
%2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c67108864}
%3 = util.do_not_optimize(%2) : !stream.resource<transient>
%4 = stream.resource.size %3 : !stream.resource<transient>
%5 = stream.resource.alloc uninitialized : !stream.resource<external>{%c1048576}
%c0_0 = arith.constant 0 : index
%c0_i32 = arith.constant 0 : i32
%c0_i32_1 = arith.constant 0 : i32
%6 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%4}, %5 as %arg1: !stream.resource<external>{%c1048576}) {
stream.cmd.concurrent {
stream.cmd.dispatch @_split_reduction_pass2_dispatch_0::@_split_reduction_pass2_dispatch_0_generic_512x256x128[%c512, %c256, %c1](%c0_i32, %c0_i32_1 : i32, i32) {
ro %arg0[%c0_0 for %4] : !stream.resource<transient>{%4},
wo %arg1[%c0_0 for %c1048576] : !stream.resource<external>{%c1048576}
}
stream.cmd.fill %c128_i32, %arg1[%c524288 for %c524288] : i32 -> !stream.resource<external>{%c1048576}
}
} => !stream.timepoint
%7 = stream.timepoint.await %6 => %5 : !stream.resource<external>{%c1048576}
%8 = stream.resource.subview %7[%c0] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288}
%9 = stream.resource.subview %7[%c524288] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288}
%10 = stream.tensor.export %8 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
%11 = stream.tensor.export %9 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
check.expect_eq(%10, %11) : tensor<512x256xi32>
return
}
}
// -----// IR Dump After CSE (cse) ('builtin.module' operation) //----- //
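// Note: common-subexpression elimination merges the duplicated zero constants, so a single %c0_i32 feeds both dispatch operands and %c0 is reused for the resource ranges.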
#map0 = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1)>
module {
func.func @split_reduction_pass2() attributes {iree.abi.stub} {
call @_split_reduction_pass2() : () -> ()
return
}
stream.executable private @_split_reduction_pass2_dispatch_0 {
stream.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @_split_reduction_pass2_dispatch_0_generic_512x256x128(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: i32, %arg3: i32) {
%0 = arith.index_cast %arg2 : i32 to index
%1 = arith.index_cast %arg3 : i32 to index
%c0 = arith.constant 0 : index
%2 = arith.addi %c0, %0 : index
%3 = stream.binding.subspan %arg0[%2] : !stream.binding -> !flow.dispatch.tensor<readonly:512x256x128xi32>
%4 = arith.addi %c0, %1 : index
%5 = stream.binding.subspan %arg1[%4] : !stream.binding -> !flow.dispatch.tensor<writeonly:512x256xi32>
%6 = flow.dispatch.tensor.load %3, offsets = [0, 0, 0], sizes = [512, 256, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:512x256x128xi32> -> tensor<512x256x128xi32>
%7 = linalg.init_tensor [512, 256] : tensor<512x256xi32>
%8 = linalg.generic {indexing_maps = [#map0, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%6 : tensor<512x256x128xi32>) outs(%7 : tensor<512x256xi32>) {
^bb0(%arg4: i32, %arg5: i32):
%9 = arith.addi %arg4, %arg5 : i32
linalg.yield %9 : i32
} -> tensor<512x256xi32>
flow.dispatch.tensor.store %8, %5, offsets = [0, 0], sizes = [512, 256], strides = [1, 1] : tensor<512x256xi32> -> !flow.dispatch.tensor<writeonly:512x256xi32>
return
}
}
}
func.func private @_split_reduction_pass2() {
%c1048576 = arith.constant 1048576 : index
%c0 = arith.constant 0 : index
%c524288 = arith.constant 524288 : index
%c67108864 = arith.constant 67108864 : index
%c1_i32 = arith.constant 1 : i32
%c128_i32 = arith.constant 128 : i32
%c512 = arith.constant 512 : index
%c256 = arith.constant 256 : index
%c1 = arith.constant 1 : index
%0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c67108864}
%1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c67108864}) {
stream.cmd.fill %c1_i32, %arg0[%c0 for %c67108864] : i32 -> !stream.resource<transient>{%c67108864}
} => !stream.timepoint
%2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c67108864}
%3 = util.do_not_optimize(%2) : !stream.resource<transient>
%4 = stream.resource.size %3 : !stream.resource<transient>
%5 = stream.resource.alloc uninitialized : !stream.resource<external>{%c1048576}
%c0_i32 = arith.constant 0 : i32
%6 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%4}, %5 as %arg1: !stream.resource<external>{%c1048576}) {
stream.cmd.concurrent {
stream.cmd.dispatch @_split_reduction_pass2_dispatch_0::@_split_reduction_pass2_dispatch_0_generic_512x256x128[%c512, %c256, %c1](%c0_i32, %c0_i32 : i32, i32) {
ro %arg0[%c0 for %4] : !stream.resource<transient>{%4},
wo %arg1[%c0 for %c1048576] : !stream.resource<external>{%c1048576}
}
stream.cmd.fill %c128_i32, %arg1[%c524288 for %c524288] : i32 -> !stream.resource<external>{%c1048576}
}
} => !stream.timepoint
%7 = stream.timepoint.await %6 => %5 : !stream.resource<external>{%c1048576}
%8 = stream.resource.subview %7[%c0] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288}
%9 = stream.resource.subview %7[%c524288] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288}
%10 = stream.tensor.export %8 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
%11 = stream.tensor.export %9 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
check.expect_eq(%10, %11) : tensor<512x256xi32>
return
}
}
// -----// IR Dump After FoldUniformOperands (iree-stream-fold-uniform-operands) ('builtin.module' operation) //----- //
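// Note: because both i32 operands are uniformly zero at every call site, they are dropped from the dispatch signature and materialized as a %c0_i32 constant inside the executable.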
#map0 = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1)>
module {
func.func @split_reduction_pass2() attributes {iree.abi.stub} {
call @_split_reduction_pass2() : () -> ()
return
}
stream.executable private @_split_reduction_pass2_dispatch_0 {
stream.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @_split_reduction_pass2_dispatch_0_generic_512x256x128(%arg0: !stream.binding, %arg1: !stream.binding) {
%c0_i32 = arith.constant 0 : i32
%0 = arith.index_cast %c0_i32 : i32 to index
%1 = arith.index_cast %c0_i32 : i32 to index
%c0 = arith.constant 0 : index
%2 = arith.addi %c0, %0 : index
%3 = stream.binding.subspan %arg0[%2] : !stream.binding -> !flow.dispatch.tensor<readonly:512x256x128xi32>
%4 = arith.addi %c0, %1 : index
%5 = stream.binding.subspan %arg1[%4] : !stream.binding -> !flow.dispatch.tensor<writeonly:512x256xi32>
%6 = flow.dispatch.tensor.load %3, offsets = [0, 0, 0], sizes = [512, 256, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:512x256x128xi32> -> tensor<512x256x128xi32>
%7 = linalg.init_tensor [512, 256] : tensor<512x256xi32>
%8 = linalg.generic {indexing_maps = [#map0, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%6 : tensor<512x256x128xi32>) outs(%7 : tensor<512x256xi32>) {
^bb0(%arg2: i32, %arg3: i32):
%9 = arith.addi %arg2, %arg3 : i32
linalg.yield %9 : i32
} -> tensor<512x256xi32>
flow.dispatch.tensor.store %8, %5, offsets = [0, 0], sizes = [512, 256], strides = [1, 1] : tensor<512x256xi32> -> !flow.dispatch.tensor<writeonly:512x256xi32>
return
}
}
}
func.func private @_split_reduction_pass2() {
%c1048576 = arith.constant 1048576 : index
%c0 = arith.constant 0 : index
%c524288 = arith.constant 524288 : index
%c67108864 = arith.constant 67108864 : index
%c1_i32 = arith.constant 1 : i32
%c128_i32 = arith.constant 128 : i32
%c512 = arith.constant 512 : index
%c256 = arith.constant 256 : index
%c1 = arith.constant 1 : index
%0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c67108864}
%1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c67108864}) {
stream.cmd.fill %c1_i32, %arg0[%c0 for %c67108864] : i32 -> !stream.resource<transient>{%c67108864}
} => !stream.timepoint
%2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c67108864}
%3 = util.do_not_optimize(%2) : !stream.resource<transient>
%4 = stream.resource.size %3 : !stream.resource<transient>
%5 = stream.resource.alloc uninitialized : !stream.resource<external>{%c1048576}
%c0_i32 = arith.constant 0 : i32
%6 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%4}, %5 as %arg1: !stream.resource<external>{%c1048576}) {
stream.cmd.concurrent {
stream.cmd.dispatch @_split_reduction_pass2_dispatch_0::@_split_reduction_pass2_dispatch_0_generic_512x256x128[%c512, %c256, %c1] {
ro %arg0[%c0 for %4] : !stream.resource<transient>{%4},
wo %arg1[%c0 for %c1048576] : !stream.resource<external>{%c1048576}
}
stream.cmd.fill %c128_i32, %arg1[%c524288 for %c524288] : i32 -> !stream.resource<external>{%c1048576}
}
} => !stream.timepoint
%7 = stream.timepoint.await %6 => %5 : !stream.resource<external>{%c1048576}
%8 = stream.resource.subview %7[%c0] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288}
%9 = stream.resource.subview %7[%c524288] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288}
%10 = stream.tensor.export %8 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
%11 = stream.tensor.export %9 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
check.expect_eq(%10, %11) : tensor<512x256xi32>
return
}
}
// -----// IR Dump After AnnotateDispatchArguments (iree-stream-annotate-dispatch-arguments) ('builtin.module' operation) //----- //
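// Note: the binding arguments are annotated with their deduced alignment ({stream.alignment = 64 : index}); the IR is otherwise unchanged.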
#map0 = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1)>
module {
func.func @split_reduction_pass2() attributes {iree.abi.stub} {
call @_split_reduction_pass2() : () -> ()
return
}
stream.executable private @_split_reduction_pass2_dispatch_0 {
stream.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @_split_reduction_pass2_dispatch_0_generic_512x256x128(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}) {
%c0_i32 = arith.constant 0 : i32
%0 = arith.index_cast %c0_i32 : i32 to index
%1 = arith.index_cast %c0_i32 : i32 to index
%c0 = arith.constant 0 : index
%2 = arith.addi %c0, %0 : index
%3 = stream.binding.subspan %arg0[%2] : !stream.binding -> !flow.dispatch.tensor<readonly:512x256x128xi32>
%4 = arith.addi %c0, %1 : index
%5 = stream.binding.subspan %arg1[%4] : !stream.binding -> !flow.dispatch.tensor<writeonly:512x256xi32>
%6 = flow.dispatch.tensor.load %3, offsets = [0, 0, 0], sizes = [512, 256, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:512x256x128xi32> -> tensor<512x256x128xi32>
%7 = linalg.init_tensor [512, 256] : tensor<512x256xi32>
%8 = linalg.generic {indexing_maps = [#map0, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%6 : tensor<512x256x128xi32>) outs(%7 : tensor<512x256xi32>) {
^bb0(%arg2: i32, %arg3: i32):
%9 = arith.addi %arg2, %arg3 : i32
linalg.yield %9 : i32
} -> tensor<512x256xi32>
flow.dispatch.tensor.store %8, %5, offsets = [0, 0], sizes = [512, 256], strides = [1, 1] : tensor<512x256xi32> -> !flow.dispatch.tensor<writeonly:512x256xi32>
return
}
}
}
func.func private @_split_reduction_pass2() {
%c1048576 = arith.constant 1048576 : index
%c0 = arith.constant 0 : index
%c524288 = arith.constant 524288 : index
%c67108864 = arith.constant 67108864 : index
%c1_i32 = arith.constant 1 : i32
%c128_i32 = arith.constant 128 : i32
%c512 = arith.constant 512 : index
%c256 = arith.constant 256 : index
%c1 = arith.constant 1 : index
%0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c67108864}
%1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c67108864}) {
stream.cmd.fill %c1_i32, %arg0[%c0 for %c67108864] : i32 -> !stream.resource<transient>{%c67108864}
} => !stream.timepoint
%2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c67108864}
%3 = util.do_not_optimize(%2) : !stream.resource<transient>
%4 = stream.resource.size %3 : !stream.resource<transient>
%5 = stream.resource.alloc uninitialized : !stream.resource<external>{%c1048576}
%c0_i32 = arith.constant 0 : i32
%6 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%4}, %5 as %arg1: !stream.resource<external>{%c1048576}) {
stream.cmd.concurrent {
stream.cmd.dispatch @_split_reduction_pass2_dispatch_0::@_split_reduction_pass2_dispatch_0_generic_512x256x128[%c512, %c256, %c1] {
ro %arg0[%c0 for %4] : !stream.resource<transient>{%4},
wo %arg1[%c0 for %c1048576] : !stream.resource<external>{%c1048576}
}
stream.cmd.fill %c128_i32, %arg1[%c524288 for %c524288] : i32 -> !stream.resource<external>{%c1048576}
}
} => !stream.timepoint
%7 = stream.timepoint.await %6 => %5 : !stream.resource<external>{%c1048576}
%8 = stream.resource.subview %7[%c0] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288}
%9 = stream.resource.subview %7[%c524288] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288}
%10 = stream.tensor.export %8 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
%11 = stream.tensor.export %9 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
check.expect_eq(%10, %11) : tensor<512x256xi32>
return
}
}
// -----// IR Dump After Canonicalizer (canonicalize) ('func.func' operation: @_split_reduction_pass2) //----- //
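// Note: this run only touches the host function, where the leftover unused %c0_i32 constant from operand folding is removed.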
#map0 = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1)>
module {
func.func @split_reduction_pass2() attributes {iree.abi.stub} {
call @_split_reduction_pass2() : () -> ()
return
}
stream.executable private @_split_reduction_pass2_dispatch_0 {
stream.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @_split_reduction_pass2_dispatch_0_generic_512x256x128(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}) {
%c0_i32 = arith.constant 0 : i32
%0 = arith.index_cast %c0_i32 : i32 to index
%1 = arith.index_cast %c0_i32 : i32 to index
%c0 = arith.constant 0 : index
%2 = arith.addi %c0, %0 : index
%3 = stream.binding.subspan %arg0[%2] : !stream.binding -> !flow.dispatch.tensor<readonly:512x256x128xi32>
%4 = arith.addi %c0, %1 : index
%5 = stream.binding.subspan %arg1[%4] : !stream.binding -> !flow.dispatch.tensor<writeonly:512x256xi32>
%6 = flow.dispatch.tensor.load %3, offsets = [0, 0, 0], sizes = [512, 256, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:512x256x128xi32> -> tensor<512x256x128xi32>
%7 = linalg.init_tensor [512, 256] : tensor<512x256xi32>
%8 = linalg.generic {indexing_maps = [#map0, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%6 : tensor<512x256x128xi32>) outs(%7 : tensor<512x256xi32>) {
^bb0(%arg2: i32, %arg3: i32):
%9 = arith.addi %arg2, %arg3 : i32
linalg.yield %9 : i32
} -> tensor<512x256xi32>
flow.dispatch.tensor.store %8, %5, offsets = [0, 0], sizes = [512, 256], strides = [1, 1] : tensor<512x256xi32> -> !flow.dispatch.tensor<writeonly:512x256xi32>
return
}
}
}
func.func private @_split_reduction_pass2() {
%c1048576 = arith.constant 1048576 : index
%c0 = arith.constant 0 : index
%c524288 = arith.constant 524288 : index
%c67108864 = arith.constant 67108864 : index
%c1_i32 = arith.constant 1 : i32
%c128_i32 = arith.constant 128 : i32
%c512 = arith.constant 512 : index
%c256 = arith.constant 256 : index
%c1 = arith.constant 1 : index
%0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c67108864}
%1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c67108864}) {
stream.cmd.fill %c1_i32, %arg0[%c0 for %c67108864] : i32 -> !stream.resource<transient>{%c67108864}
} => !stream.timepoint
%2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c67108864}
%3 = util.do_not_optimize(%2) : !stream.resource<transient>
%4 = stream.resource.size %3 : !stream.resource<transient>
%5 = stream.resource.alloc uninitialized : !stream.resource<external>{%c1048576}
%6 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%4}, %5 as %arg1: !stream.resource<external>{%c1048576}) {
stream.cmd.concurrent {
stream.cmd.dispatch @_split_reduction_pass2_dispatch_0::@_split_reduction_pass2_dispatch_0_generic_512x256x128[%c512, %c256, %c1] {
ro %arg0[%c0 for %4] : !stream.resource<transient>{%4},
wo %arg1[%c0 for %c1048576] : !stream.resource<external>{%c1048576}
}
stream.cmd.fill %c128_i32, %arg1[%c524288 for %c524288] : i32 -> !stream.resource<external>{%c1048576}
}
} => !stream.timepoint
%7 = stream.timepoint.await %6 => %5 : !stream.resource<external>{%c1048576}
%8 = stream.resource.subview %7[%c0] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288}
%9 = stream.resource.subview %7[%c524288] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288}
%10 = stream.tensor.export %8 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
%11 = stream.tensor.export %9 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
check.expect_eq(%10, %11) : tensor<512x256xi32>
return
}
}
// -----// IR Dump After ApplyPatterns (iree-util-apply-patterns) ('builtin.module' operation) //----- //
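// Note: inside the executable, the index_cast/addi offset arithmetic folds away and both bindings are subspanned directly at offset %c0.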
#map0 = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1)>
module {
func.func @split_reduction_pass2() attributes {iree.abi.stub} {
call @_split_reduction_pass2() : () -> ()
return
}
stream.executable private @_split_reduction_pass2_dispatch_0 {
stream.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @_split_reduction_pass2_dispatch_0_generic_512x256x128(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}) {
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:512x256x128xi32>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:512x256xi32>
%2 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [512, 256, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:512x256x128xi32> -> tensor<512x256x128xi32>
%3 = linalg.init_tensor [512, 256] : tensor<512x256xi32>
%4 = linalg.generic {indexing_maps = [#map0, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%2 : tensor<512x256x128xi32>) outs(%3 : tensor<512x256xi32>) {
^bb0(%arg2: i32, %arg3: i32):
%5 = arith.addi %arg2, %arg3 : i32
linalg.yield %5 : i32
} -> tensor<512x256xi32>
flow.dispatch.tensor.store %4, %1, offsets = [0, 0], sizes = [512, 256], strides = [1, 1] : tensor<512x256xi32> -> !flow.dispatch.tensor<writeonly:512x256xi32>
return
}
}
}
func.func private @_split_reduction_pass2() {
%c1048576 = arith.constant 1048576 : index
%c0 = arith.constant 0 : index
%c524288 = arith.constant 524288 : index
%c67108864 = arith.constant 67108864 : index
%c1_i32 = arith.constant 1 : i32
%c128_i32 = arith.constant 128 : i32
%c512 = arith.constant 512 : index
%c256 = arith.constant 256 : index
%c1 = arith.constant 1 : index
%0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c67108864}
%1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c67108864}) {
stream.cmd.fill %c1_i32, %arg0[%c0 for %c67108864] : i32 -> !stream.resource<transient>{%c67108864}
} => !stream.timepoint
%2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c67108864}
%3 = util.do_not_optimize(%2) : !stream.resource<transient>
%4 = stream.resource.size %3 : !stream.resource<transient>
%5 = stream.resource.alloc uninitialized : !stream.resource<external>{%c1048576}
%6 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%4}, %5 as %arg1: !stream.resource<external>{%c1048576}) {
stream.cmd.concurrent {
stream.cmd.dispatch @_split_reduction_pass2_dispatch_0::@_split_reduction_pass2_dispatch_0_generic_512x256x128[%c512, %c256, %c1] {
ro %arg0[%c0 for %4] : !stream.resource<transient>{%4},
wo %arg1[%c0 for %c1048576] : !stream.resource<external>{%c1048576}
}
stream.cmd.fill %c128_i32, %arg1[%c524288 for %c524288] : i32 -> !stream.resource<external>{%c1048576}
}
} => !stream.timepoint
%7 = stream.timepoint.await %6 => %5 : !stream.resource<external>{%c1048576}
%8 = stream.resource.subview %7[%c0] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288}
%9 = stream.resource.subview %7[%c524288] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288}
%10 = stream.tensor.export %8 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
%11 = stream.tensor.export %9 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
check.expect_eq(%10, %11) : tensor<512x256xi32>
return
}
}
// -----// IR Dump After mlir::iree_compiler::IREE::HAL::AssignTargetDevicesPass (iree-hal-assign-target-devices) ('builtin.module' operation) //----- //
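// Note: the module gains a hal.device.targets attribute selecting the llvm-cpu target (embedded-elf-x86_64); the function bodies are unchanged.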
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
#map0 = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1)>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
func.func @split_reduction_pass2() attributes {iree.abi.stub} {
call @_split_reduction_pass2() : () -> ()
return
}
stream.executable private @_split_reduction_pass2_dispatch_0 {
stream.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @_split_reduction_pass2_dispatch_0_generic_512x256x128(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}) {
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:512x256x128xi32>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:512x256xi32>
%2 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [512, 256, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:512x256x128xi32> -> tensor<512x256x128xi32>
%3 = linalg.init_tensor [512, 256] : tensor<512x256xi32>
%4 = linalg.generic {indexing_maps = [#map0, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%2 : tensor<512x256x128xi32>) outs(%3 : tensor<512x256xi32>) {
^bb0(%arg2: i32, %arg3: i32):
%5 = arith.addi %arg2, %arg3 : i32
linalg.yield %5 : i32
} -> tensor<512x256xi32>
flow.dispatch.tensor.store %4, %1, offsets = [0, 0], sizes = [512, 256], strides = [1, 1] : tensor<512x256xi32> -> !flow.dispatch.tensor<writeonly:512x256xi32>
return
}
}
}
func.func private @_split_reduction_pass2() {
%c1048576 = arith.constant 1048576 : index
%c0 = arith.constant 0 : index
%c524288 = arith.constant 524288 : index
%c67108864 = arith.constant 67108864 : index
%c1_i32 = arith.constant 1 : i32
%c128_i32 = arith.constant 128 : i32
%c512 = arith.constant 512 : index
%c256 = arith.constant 256 : index
%c1 = arith.constant 1 : index
%0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c67108864}
%1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c67108864}) {
stream.cmd.fill %c1_i32, %arg0[%c0 for %c67108864] : i32 -> !stream.resource<transient>{%c67108864}
} => !stream.timepoint
%2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c67108864}
%3 = util.do_not_optimize(%2) : !stream.resource<transient>
%4 = stream.resource.size %3 : !stream.resource<transient>
%5 = stream.resource.alloc uninitialized : !stream.resource<external>{%c1048576}
%6 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%4}, %5 as %arg1: !stream.resource<external>{%c1048576}) {
stream.cmd.concurrent {
stream.cmd.dispatch @_split_reduction_pass2_dispatch_0::@_split_reduction_pass2_dispatch_0_generic_512x256x128[%c512, %c256, %c1] {
ro %arg0[%c0 for %4] : !stream.resource<transient>{%4},
wo %arg1[%c0 for %c1048576] : !stream.resource<external>{%c1048576}
}
stream.cmd.fill %c128_i32, %arg1[%c524288 for %c524288] : i32 -> !stream.resource<external>{%c1048576}
}
} => !stream.timepoint
%7 = stream.timepoint.await %6 => %5 : !stream.resource<external>{%c1048576}
%8 = stream.resource.subview %7[%c0] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288}
%9 = stream.resource.subview %7[%c524288] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288}
%10 = stream.tensor.export %8 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
%11 = stream.tensor.export %9 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
check.expect_eq(%10, %11) : tensor<512x256xi32>
return
}
}
// -----// IR Dump After mlir::iree_compiler::IREE::HAL::(anonymous namespace)::MaterializeInterfacesPass (iree-hal-materialize-interfaces) ('builtin.module' operation) //----- //
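// Note: the stream.executable is rewritten as a hal.executable with an embedded_elf_x86_64 variant and an explicit pipeline layout (set 0: binding 0 read-only storage buffer, binding 1 storage buffer); the dispatch body now accesses its I/O through hal.interface.binding.subspan, and the stream.cmd.dispatch records the interface bindings it uses.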
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#map0 = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1)>
#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
func.func @split_reduction_pass2() attributes {iree.abi.stub} {
call @_split_reduction_pass2() : () -> ()
return
}
hal.executable private @_split_reduction_pass2_dispatch_0 {
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
hal.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 ordinal(0) layout(#pipeline_layout) {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index):
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg2, %arg3
hal.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @_split_reduction_pass2_dispatch_0_generic_512x256x128() {
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:512x256x128xi32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:512x256xi32>
%2 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [512, 256, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:512x256x128xi32> -> tensor<512x256x128xi32>
%3 = linalg.init_tensor [512, 256] : tensor<512x256xi32>
%4 = linalg.generic {indexing_maps = [#map0, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%2 : tensor<512x256x128xi32>) outs(%3 : tensor<512x256xi32>) {
^bb0(%arg0: i32, %arg1: i32):
%5 = arith.addi %arg0, %arg1 : i32
linalg.yield %5 : i32
} -> tensor<512x256xi32>
flow.dispatch.tensor.store %4, %1, offsets = [0, 0], sizes = [512, 256], strides = [1, 1] : tensor<512x256xi32> -> !flow.dispatch.tensor<writeonly:512x256xi32>
return
}
}
}
}
func.func private @_split_reduction_pass2() {
%c1048576 = arith.constant 1048576 : index
%c0 = arith.constant 0 : index
%c524288 = arith.constant 524288 : index
%c67108864 = arith.constant 67108864 : index
%c1_i32 = arith.constant 1 : i32
%c128_i32 = arith.constant 128 : i32
%c512 = arith.constant 512 : index
%c256 = arith.constant 256 : index
%c1 = arith.constant 1 : index
%0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c67108864}
%1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c67108864}) {
stream.cmd.fill %c1_i32, %arg0[%c0 for %c67108864] : i32 -> !stream.resource<transient>{%c67108864}
} => !stream.timepoint
%2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c67108864}
%3 = util.do_not_optimize(%2) : !stream.resource<transient>
%4 = stream.resource.size %3 : !stream.resource<transient>
%5 = stream.resource.alloc uninitialized : !stream.resource<external>{%c1048576}
%6 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%4}, %5 as %arg1: !stream.resource<external>{%c1048576}) {
stream.cmd.concurrent {
stream.cmd.dispatch @_split_reduction_pass2_dispatch_0::@_split_reduction_pass2_dispatch_0_generic_512x256x128[%c512, %c256, %c1] {
ro %arg0[%c0 for %4] : !stream.resource<transient>{%4},
wo %arg1[%c0 for %c1048576] : !stream.resource<external>{%c1048576}
} attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>]}
stream.cmd.fill %c128_i32, %arg1[%c524288 for %c524288] : i32 -> !stream.resource<external>{%c1048576}
}
} => !stream.timepoint
%7 = stream.timepoint.await %6 => %5 : !stream.resource<external>{%c1048576}
%8 = stream.resource.subview %7[%c0] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288}
%9 = stream.resource.subview %7[%c524288] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288}
%10 = stream.tensor.export %8 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
%11 = stream.tensor.export %9 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
check.expect_eq(%10, %11) : tensor<512x256xi32>
return
}
}
// -----// IR Dump After TileAndDistributeToWorkgroups (iree-codegen-tile-and-distribute-to-workgroups) ('hal.executable.variant' operation: @embedded_elf_x86_64) //----- //
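// Note: the export is assigned the CPUDoubleTilingExpert translation and a static 8x16x1 workgroup count; the 512x256 reduction is tiled into 32x32 workgroup tiles (lowering_config tile_sizes [[32, 32, 0], [1, 4, 0], [0, 0, 4]]) and distributed over workgroup ids with scf.for loops.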
#config = #iree_codegen.lowering_config<tile_sizes = [[32, 32, 0], [1, 4, 0], [0, 0, 4]]>
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#map0 = affine_map<()[s0] -> (s0 * 32)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map2 = affine_map<(d0, d1, d2) -> (d0, d1)>
#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
func.func @split_reduction_pass2() attributes {iree.abi.stub} {
call @_split_reduction_pass2() : () -> ()
return
}
hal.executable private @_split_reduction_pass2_dispatch_0 {
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
hal.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 ordinal(0) layout(#pipeline_layout) attributes {translation_info = #translation} {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index):
%c1 = arith.constant 1 : index
%c16 = arith.constant 16 : index
%c8 = arith.constant 8 : index
hal.return %c8, %c16, %c1 : index, index, index
}
builtin.module {
func.func @_split_reduction_pass2_dispatch_0_generic_512x256x128() {
%c32 = arith.constant 32 : index
%c512 = arith.constant 512 : index
%c256 = arith.constant 256 : index
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:512x256x128xi32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:512x256xi32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%2 = affine.apply #map0()[%workgroup_id_y]
%3 = affine.apply #map0()[%workgroup_count_y]
scf.for %arg0 = %2 to %c512 step %3 {
%4 = affine.apply #map0()[%workgroup_id_x]
%5 = affine.apply #map0()[%workgroup_count_x]
scf.for %arg1 = %4 to %c256 step %5 {
%6 = flow.dispatch.tensor.load %0, offsets = [%arg0, %arg1, 0], sizes = [%c32, %c32, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:512x256x128xi32> -> tensor<?x?x128xi32>
%7 = linalg.init_tensor [32, 32] : tensor<32x32xi32>
%8 = tensor.cast %6 : tensor<?x?x128xi32> to tensor<32x32x128xi32>
%9 = linalg.generic {indexing_maps = [#map1, #map2], iterator_types = ["parallel", "parallel", "reduction"]} ins(%8 : tensor<32x32x128xi32>) outs(%7 : tensor<32x32xi32>) attrs = {lowering_config = #config} {
^bb0(%arg2: i32, %arg3: i32):
%11 = arith.addi %arg2, %arg3 : i32
linalg.yield %11 : i32
} -> tensor<32x32xi32>
%10 = tensor.cast %9 : tensor<32x32xi32> to tensor<?x?xi32>
flow.dispatch.tensor.store %10, %1, offsets = [%arg0, %arg1], sizes = [%c32, %c32], strides = [1, 1] : tensor<?x?xi32> -> !flow.dispatch.tensor<writeonly:512x256xi32>
}
}
return
}
}
}
}
func.func private @_split_reduction_pass2() {
%c1048576 = arith.constant 1048576 : index
%c0 = arith.constant 0 : index
%c524288 = arith.constant 524288 : index
%c67108864 = arith.constant 67108864 : index
%c1_i32 = arith.constant 1 : i32
%c128_i32 = arith.constant 128 : i32
%c512 = arith.constant 512 : index
%c256 = arith.constant 256 : index
%c1 = arith.constant 1 : index
%0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c67108864}
%1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c67108864}) {
stream.cmd.fill %c1_i32, %arg0[%c0 for %c67108864] : i32 -> !stream.resource<transient>{%c67108864}
} => !stream.timepoint
%2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c67108864}
%3 = util.do_not_optimize(%2) : !stream.resource<transient>
%4 = stream.resource.size %3 : !stream.resource<transient>
%5 = stream.resource.alloc uninitialized : !stream.resource<external>{%c1048576}
%6 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%4}, %5 as %arg1: !stream.resource<external>{%c1048576}) {
stream.cmd.concurrent {
stream.cmd.dispatch @_split_reduction_pass2_dispatch_0::@_split_reduction_pass2_dispatch_0_generic_512x256x128[%c512, %c256, %c1] {
ro %arg0[%c0 for %4] : !stream.resource<transient>{%4},
wo %arg1[%c0 for %c1048576] : !stream.resource<external>{%c1048576}
} attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>]}
stream.cmd.fill %c128_i32, %arg1[%c524288 for %c524288] : i32 -> !stream.resource<external>{%c1048576}
}
} => !stream.timepoint
%7 = stream.timepoint.await %6 => %5 : !stream.resource<external>{%c1048576}
%8 = stream.resource.subview %7[%c0] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288}
%9 = stream.resource.subview %7[%c524288] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288}
%10 = stream.tensor.export %8 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
%11 = stream.tensor.export %9 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
check.expect_eq(%10, %11) : tensor<512x256xi32>
return
}
}
// -----// IR Dump After ConvertToDestinationPassingStyle (iree-codegen-convert-to-destination-passing-style) ('func.func' operation: @_split_reduction_pass2_dispatch_0_generic_512x256x128) //----- //
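// Note: the linalg.generic now writes into a 32x32 tile loaded from the writeonly output binding (outs(%7)) instead of the freshly created init tensor, which is left behind as dead IR.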
#config = #iree_codegen.lowering_config<tile_sizes = [[32, 32, 0], [1, 4, 0], [0, 0, 4]]>
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#map0 = affine_map<()[s0] -> (s0 * 32)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map2 = affine_map<(d0, d1, d2) -> (d0, d1)>
#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
func.func @split_reduction_pass2() attributes {iree.abi.stub} {
call @_split_reduction_pass2() : () -> ()
return
}
hal.executable private @_split_reduction_pass2_dispatch_0 {
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
hal.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 ordinal(0) layout(#pipeline_layout) attributes {translation_info = #translation} {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index):
%c1 = arith.constant 1 : index
%c16 = arith.constant 16 : index
%c8 = arith.constant 8 : index
hal.return %c8, %c16, %c1 : index, index, index
}
builtin.module {
func.func @_split_reduction_pass2_dispatch_0_generic_512x256x128() {
%c32 = arith.constant 32 : index
%c512 = arith.constant 512 : index
%c256 = arith.constant 256 : index
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:512x256x128xi32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:512x256xi32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%2 = affine.apply #map0()[%workgroup_id_y]
%3 = affine.apply #map0()[%workgroup_count_y]
scf.for %arg0 = %2 to %c512 step %3 {
%4 = affine.apply #map0()[%workgroup_id_x]
%5 = affine.apply #map0()[%workgroup_count_x]
scf.for %arg1 = %4 to %c256 step %5 {
%c32_0 = arith.constant 32 : index
%6 = flow.dispatch.tensor.load %1, offsets = [%arg0, %arg1], sizes = [%c32_0, %c32_0], strides = [1, 1] : !flow.dispatch.tensor<writeonly:512x256xi32> -> tensor<?x?xi32>
%7 = tensor.cast %6 : tensor<?x?xi32> to tensor<32x32xi32>
%8 = flow.dispatch.tensor.load %0, offsets = [%arg0, %arg1, 0], sizes = [%c32, %c32, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:512x256x128xi32> -> tensor<?x?x128xi32>
%9 = linalg.init_tensor [32, 32] : tensor<32x32xi32>
%10 = tensor.cast %8 : tensor<?x?x128xi32> to tensor<32x32x128xi32>
%11 = linalg.generic {indexing_maps = [#map1, #map2], iterator_types = ["parallel", "parallel", "reduction"]} ins(%10 : tensor<32x32x128xi32>) outs(%7 : tensor<32x32xi32>) attrs = {lowering_config = #config} {
^bb0(%arg2: i32, %arg3: i32):
%13 = arith.addi %arg2, %arg3 : i32
linalg.yield %13 : i32
} -> tensor<32x32xi32>
%12 = tensor.cast %11 : tensor<32x32xi32> to tensor<?x?xi32>
flow.dispatch.tensor.store %12, %1, offsets = [%arg0, %arg1], sizes = [%c32, %c32], strides = [1, 1] : tensor<?x?xi32> -> !flow.dispatch.tensor<writeonly:512x256xi32>
}
}
return
}
}
}
}
func.func private @_split_reduction_pass2() {
%c1048576 = arith.constant 1048576 : index
%c0 = arith.constant 0 : index
%c524288 = arith.constant 524288 : index
%c67108864 = arith.constant 67108864 : index
%c1_i32 = arith.constant 1 : i32
%c128_i32 = arith.constant 128 : i32
%c512 = arith.constant 512 : index
%c256 = arith.constant 256 : index
%c1 = arith.constant 1 : index
%0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c67108864}
%1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c67108864}) {
stream.cmd.fill %c1_i32, %arg0[%c0 for %c67108864] : i32 -> !stream.resource<transient>{%c67108864}
} => !stream.timepoint
%2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c67108864}
%3 = util.do_not_optimize(%2) : !stream.resource<transient>
%4 = stream.resource.size %3 : !stream.resource<transient>
%5 = stream.resource.alloc uninitialized : !stream.resource<external>{%c1048576}
%6 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%4}, %5 as %arg1: !stream.resource<external>{%c1048576}) {
stream.cmd.concurrent {
stream.cmd.dispatch @_split_reduction_pass2_dispatch_0::@_split_reduction_pass2_dispatch_0_generic_512x256x128[%c512, %c256, %c1] {
ro %arg0[%c0 for %4] : !stream.resource<transient>{%4},
wo %arg1[%c0 for %c1048576] : !stream.resource<external>{%c1048576}
} attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>]}
stream.cmd.fill %c128_i32, %arg1[%c524288 for %c524288] : i32 -> !stream.resource<external>{%c1048576}
}
} => !stream.timepoint
%7 = stream.timepoint.await %6 => %5 : !stream.resource<external>{%c1048576}
%8 = stream.resource.subview %7[%c0] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288}
%9 = stream.resource.subview %7[%c524288] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288}
%10 = stream.tensor.export %8 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
%11 = stream.tensor.export %9 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
check.expect_eq(%10, %11) : tensor<512x256xi32>
return
}
}
// -----// IR Dump After FoldAffineMinInDistributedLoops (iree-codegen-fold-affinemin-in-distributed-loops) ('func.func' operation: @_split_reduction_pass2_dispatch_0_generic_512x256x128) //----- //
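// Note: there are no affine.min ops to fold in this dispatch; compared with the previous dump only dead IR is cleaned up (the unused init tensor and the duplicated %c32 constant are gone).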
#config = #iree_codegen.lowering_config<tile_sizes = [[32, 32, 0], [1, 4, 0], [0, 0, 4]]>
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#map0 = affine_map<()[s0] -> (s0 * 32)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map2 = affine_map<(d0, d1, d2) -> (d0, d1)>
#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
func.func @split_reduction_pass2() attributes {iree.abi.stub} {
call @_split_reduction_pass2() : () -> ()
return
}
hal.executable private @_split_reduction_pass2_dispatch_0 {
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
hal.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 ordinal(0) layout(#pipeline_layout) attributes {translation_info = #translation} {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index):
%c1 = arith.constant 1 : index
%c16 = arith.constant 16 : index
%c8 = arith.constant 8 : index
hal.return %c8, %c16, %c1 : index, index, index
}
builtin.module {
func.func @_split_reduction_pass2_dispatch_0_generic_512x256x128() {
%c32 = arith.constant 32 : index
%c512 = arith.constant 512 : index
%c256 = arith.constant 256 : index
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:512x256x128xi32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:512x256xi32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%2 = affine.apply #map0()[%workgroup_id_y]
%3 = affine.apply #map0()[%workgroup_count_y]
scf.for %arg0 = %2 to %c512 step %3 {
%4 = affine.apply #map0()[%workgroup_id_x]
%5 = affine.apply #map0()[%workgroup_count_x]
scf.for %arg1 = %4 to %c256 step %5 {
%6 = flow.dispatch.tensor.load %1, offsets = [%arg0, %arg1], sizes = [%c32, %c32], strides = [1, 1] : !flow.dispatch.tensor<writeonly:512x256xi32> -> tensor<?x?xi32>
%7 = tensor.cast %6 : tensor<?x?xi32> to tensor<32x32xi32>
%8 = flow.dispatch.tensor.load %0, offsets = [%arg0, %arg1, 0], sizes = [%c32, %c32, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:512x256x128xi32> -> tensor<?x?x128xi32>
%9 = tensor.cast %8 : tensor<?x?x128xi32> to tensor<32x32x128xi32>
%10 = linalg.generic {indexing_maps = [#map1, #map2], iterator_types = ["parallel", "parallel", "reduction"]} ins(%9 : tensor<32x32x128xi32>) outs(%7 : tensor<32x32xi32>) attrs = {lowering_config = #config} {
^bb0(%arg2: i32, %arg3: i32):
%12 = arith.addi %arg2, %arg3 : i32
linalg.yield %12 : i32
} -> tensor<32x32xi32>
%11 = tensor.cast %10 : tensor<32x32xi32> to tensor<?x?xi32>
flow.dispatch.tensor.store %11, %1, offsets = [%arg0, %arg1], sizes = [%c32, %c32], strides = [1, 1] : tensor<?x?xi32> -> !flow.dispatch.tensor<writeonly:512x256xi32>
}
}
return
}
}
}
}
func.func private @_split_reduction_pass2() {
%c1048576 = arith.constant 1048576 : index
%c0 = arith.constant 0 : index
%c524288 = arith.constant 524288 : index
%c67108864 = arith.constant 67108864 : index
%c1_i32 = arith.constant 1 : i32
%c128_i32 = arith.constant 128 : i32
%c512 = arith.constant 512 : index
%c256 = arith.constant 256 : index
%c1 = arith.constant 1 : index
%0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c67108864}
%1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c67108864}) {
stream.cmd.fill %c1_i32, %arg0[%c0 for %c67108864] : i32 -> !stream.resource<transient>{%c67108864}
} => !stream.timepoint
%2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c67108864}
%3 = util.do_not_optimize(%2) : !stream.resource<transient>
%4 = stream.resource.size %3 : !stream.resource<transient>
%5 = stream.resource.alloc uninitialized : !stream.resource<external>{%c1048576}
%6 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%4}, %5 as %arg1: !stream.resource<external>{%c1048576}) {
stream.cmd.concurrent {
stream.cmd.dispatch @_split_reduction_pass2_dispatch_0::@_split_reduction_pass2_dispatch_0_generic_512x256x128[%c512, %c256, %c1] {
ro %arg0[%c0 for %4] : !stream.resource<transient>{%4},
wo %arg1[%c0 for %c1048576] : !stream.resource<external>{%c1048576}
} attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>]}
stream.cmd.fill %c128_i32, %arg1[%c524288 for %c524288] : i32 -> !stream.resource<external>{%c1048576}
}
} => !stream.timepoint
%7 = stream.timepoint.await %6 => %5 : !stream.resource<external>{%c1048576}
%8 = stream.resource.subview %7[%c0] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288}
%9 = stream.resource.subview %7[%c524288] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288}
%10 = stream.tensor.export %8 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
%11 = stream.tensor.export %9 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
check.expect_eq(%10, %11) : tensor<512x256xi32>
return
}
}
// -----// IR Dump After Canonicalizer (canonicalize) ('builtin.module' operation) //----- //
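// NOTE: canonicalization folds the tensor.cast ops away; flow.dispatch.tensor.load/store
// now use static sizes [32, 32] (and [32, 32, 128]) directly and the dead %c32 constant is
// gone. The stream-level @_split_reduction_pass2 function is unchanged.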
#config = #iree_codegen.lowering_config<tile_sizes = [[32, 32, 0], [1, 4, 0], [0, 0, 4]]>
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#map0 = affine_map<()[s0] -> (s0 * 32)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map2 = affine_map<(d0, d1, d2) -> (d0, d1)>
#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
func.func @split_reduction_pass2() attributes {iree.abi.stub} {
call @_split_reduction_pass2() : () -> ()
return
}
hal.executable private @_split_reduction_pass2_dispatch_0 {
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
hal.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 ordinal(0) layout(#pipeline_layout) attributes {translation_info = #translation} {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index):
%c1 = arith.constant 1 : index
%c16 = arith.constant 16 : index
%c8 = arith.constant 8 : index
hal.return %c8, %c16, %c1 : index, index, index
}
builtin.module {
func.func @_split_reduction_pass2_dispatch_0_generic_512x256x128() {
%c512 = arith.constant 512 : index
%c256 = arith.constant 256 : index
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:512x256x128xi32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:512x256xi32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%2 = affine.apply #map0()[%workgroup_id_y]
%3 = affine.apply #map0()[%workgroup_count_y]
scf.for %arg0 = %2 to %c512 step %3 {
%4 = affine.apply #map0()[%workgroup_id_x]
%5 = affine.apply #map0()[%workgroup_count_x]
scf.for %arg1 = %4 to %c256 step %5 {
%6 = flow.dispatch.tensor.load %1, offsets = [%arg0, %arg1], sizes = [32, 32], strides = [1, 1] : !flow.dispatch.tensor<writeonly:512x256xi32> -> tensor<32x32xi32>
%7 = flow.dispatch.tensor.load %0, offsets = [%arg0, %arg1, 0], sizes = [32, 32, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:512x256x128xi32> -> tensor<32x32x128xi32>
%8 = linalg.generic {indexing_maps = [#map1, #map2], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7 : tensor<32x32x128xi32>) outs(%6 : tensor<32x32xi32>) attrs = {lowering_config = #config} {
^bb0(%arg2: i32, %arg3: i32):
%9 = arith.addi %arg2, %arg3 : i32
linalg.yield %9 : i32
} -> tensor<32x32xi32>
flow.dispatch.tensor.store %8, %1, offsets = [%arg0, %arg1], sizes = [32, 32], strides = [1, 1] : tensor<32x32xi32> -> !flow.dispatch.tensor<writeonly:512x256xi32>
}
}
return
}
}
}
}
func.func private @_split_reduction_pass2() {
%c1048576 = arith.constant 1048576 : index
%c0 = arith.constant 0 : index
%c524288 = arith.constant 524288 : index
%c67108864 = arith.constant 67108864 : index
%c1_i32 = arith.constant 1 : i32
%c128_i32 = arith.constant 128 : i32
%c512 = arith.constant 512 : index
%c256 = arith.constant 256 : index
%c1 = arith.constant 1 : index
%0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c67108864}
%1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c67108864}) {
stream.cmd.fill %c1_i32, %arg0[%c0 for %c67108864] : i32 -> !stream.resource<transient>{%c67108864}
} => !stream.timepoint
%2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c67108864}
%3 = util.do_not_optimize(%2) : !stream.resource<transient>
%4 = stream.resource.size %3 : !stream.resource<transient>
%5 = stream.resource.alloc uninitialized : !stream.resource<external>{%c1048576}
%6 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%4}, %5 as %arg1: !stream.resource<external>{%c1048576}) {
stream.cmd.concurrent {
stream.cmd.dispatch @_split_reduction_pass2_dispatch_0::@_split_reduction_pass2_dispatch_0_generic_512x256x128[%c512, %c256, %c1] {
ro %arg0[%c0 for %4] : !stream.resource<transient>{%4},
wo %arg1[%c0 for %c1048576] : !stream.resource<external>{%c1048576}
} attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>]}
stream.cmd.fill %c128_i32, %arg1[%c524288 for %c524288] : i32 -> !stream.resource<external>{%c1048576}
}
} => !stream.timepoint
%7 = stream.timepoint.await %6 => %5 : !stream.resource<external>{%c1048576}
%8 = stream.resource.subview %7[%c0] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288}
%9 = stream.resource.subview %7[%c524288] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288}
%10 = stream.tensor.export %8 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
%11 = stream.tensor.export %9 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
check.expect_eq(%10, %11) : tensor<512x256xi32>
return
}
}
// -----// IR Dump After LinalgStrategyTileAndFusePass (iree-linalg-strategy-tile-and-fuse-pass) ('func.func' operation: @_split_reduction_pass2_dispatch_0_generic_512x256x128) //----- //
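// NOTE: second-level tiling per #config ([1, 4, 0]): inside each 32x32 workgroup tile, two
// new scf.for loops step by 1 and 4 over the parallel dimensions, extract 1x4x128 / 1x4
// slices, and the linalg.generic now reduces a tensor<1x4x128xi32> into a tensor<1x4xi32>.
// The tiled op is tagged with __internal_linalg_transform__ = "1" for the next stage.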
#config = #iree_codegen.lowering_config<tile_sizes = [[32, 32, 0], [1, 4, 0], [0, 0, 4]]>
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#map0 = affine_map<()[s0] -> (s0 * 32)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map2 = affine_map<(d0, d1, d2) -> (d0, d1)>
#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
func.func @split_reduction_pass2() attributes {iree.abi.stub} {
call @_split_reduction_pass2() : () -> ()
return
}
hal.executable private @_split_reduction_pass2_dispatch_0 {
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
hal.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 ordinal(0) layout(#pipeline_layout) attributes {translation_info = #translation} {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index):
%c1 = arith.constant 1 : index
%c16 = arith.constant 16 : index
%c8 = arith.constant 8 : index
hal.return %c8, %c16, %c1 : index, index, index
}
builtin.module {
func.func @_split_reduction_pass2_dispatch_0_generic_512x256x128() {
%c4 = arith.constant 4 : index
%c1 = arith.constant 1 : index
%c32 = arith.constant 32 : index
%c512 = arith.constant 512 : index
%c256 = arith.constant 256 : index
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:512x256x128xi32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:512x256xi32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%2 = affine.apply #map0()[%workgroup_id_y]
%3 = affine.apply #map0()[%workgroup_count_y]
scf.for %arg0 = %2 to %c512 step %3 {
%4 = affine.apply #map0()[%workgroup_id_x]
%5 = affine.apply #map0()[%workgroup_count_x]
scf.for %arg1 = %4 to %c256 step %5 {
%6 = flow.dispatch.tensor.load %1, offsets = [%arg0, %arg1], sizes = [32, 32], strides = [1, 1] : !flow.dispatch.tensor<writeonly:512x256xi32> -> tensor<32x32xi32>
%7 = flow.dispatch.tensor.load %0, offsets = [%arg0, %arg1, 0], sizes = [32, 32, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:512x256x128xi32> -> tensor<32x32x128xi32>
%8 = scf.for %arg2 = %c0 to %c32 step %c1 iter_args(%arg3 = %6) -> (tensor<32x32xi32>) {
%9 = scf.for %arg4 = %c0 to %c32 step %c4 iter_args(%arg5 = %arg3) -> (tensor<32x32xi32>) {
%10 = tensor.extract_slice %7[%arg2, %arg4, 0] [1, 4, 128] [1, 1, 1] : tensor<32x32x128xi32> to tensor<1x4x128xi32>
%11 = tensor.extract_slice %arg5[%arg2, %arg4] [1, 4] [1, 1] : tensor<32x32xi32> to tensor<1x4xi32>
%12 = linalg.generic {indexing_maps = [#map1, #map2], iterator_types = ["parallel", "parallel", "reduction"]} ins(%10 : tensor<1x4x128xi32>) outs(%11 : tensor<1x4xi32>) attrs = {__internal_linalg_transform__ = "1", lowering_config = #config} {
^bb0(%arg6: i32, %arg7: i32):
%14 = arith.addi %arg6, %arg7 : i32
linalg.yield %14 : i32
} -> tensor<1x4xi32>
%13 = tensor.insert_slice %12 into %arg5[%arg2, %arg4] [1, 4] [1, 1] : tensor<1x4xi32> into tensor<32x32xi32>
scf.yield %13 : tensor<32x32xi32>
}
scf.yield %9 : tensor<32x32xi32>
}
flow.dispatch.tensor.store %8, %1, offsets = [%arg0, %arg1], sizes = [32, 32], strides = [1, 1] : tensor<32x32xi32> -> !flow.dispatch.tensor<writeonly:512x256xi32>
}
}
return
}
}
}
}
func.func private @_split_reduction_pass2() {
%c1048576 = arith.constant 1048576 : index
%c0 = arith.constant 0 : index
%c524288 = arith.constant 524288 : index
%c67108864 = arith.constant 67108864 : index
%c1_i32 = arith.constant 1 : i32
%c128_i32 = arith.constant 128 : i32
%c512 = arith.constant 512 : index
%c256 = arith.constant 256 : index
%c1 = arith.constant 1 : index
%0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c67108864}
%1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c67108864}) {
stream.cmd.fill %c1_i32, %arg0[%c0 for %c67108864] : i32 -> !stream.resource<transient>{%c67108864}
} => !stream.timepoint
%2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c67108864}
%3 = util.do_not_optimize(%2) : !stream.resource<transient>
%4 = stream.resource.size %3 : !stream.resource<transient>
%5 = stream.resource.alloc uninitialized : !stream.resource<external>{%c1048576}
%6 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%4}, %5 as %arg1: !stream.resource<external>{%c1048576}) {
stream.cmd.concurrent {
stream.cmd.dispatch @_split_reduction_pass2_dispatch_0::@_split_reduction_pass2_dispatch_0_generic_512x256x128[%c512, %c256, %c1] {
ro %arg0[%c0 for %4] : !stream.resource<transient>{%4},
wo %arg1[%c0 for %c1048576] : !stream.resource<external>{%c1048576}
} attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>]}
stream.cmd.fill %c128_i32, %arg1[%c524288 for %c524288] : i32 -> !stream.resource<external>{%c1048576}
}
} => !stream.timepoint
%7 = stream.timepoint.await %6 => %5 : !stream.resource<external>{%c1048576}
%8 = stream.resource.subview %7[%c0] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288}
%9 = stream.resource.subview %7[%c524288] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288}
%10 = stream.tensor.export %8 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
%11 = stream.tensor.export %9 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
check.expect_eq(%10, %11) : tensor<512x256xi32>
return
}
}
// -----// IR Dump After LinalgStrategyRemoveMarkersPass (iree-linalg-strategy-remove-markers-pass) ('func.func' operation: @_split_reduction_pass2_dispatch_0_generic_512x256x128) //----- //
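// NOTE: the __internal_linalg_transform__ marker is stripped from the tiled generic; the
// loop-invariant affine.apply ops for the x dimension also end up hoisted above the
// workgroup loops in this dump. No structural change otherwise.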
#config = #iree_codegen.lowering_config<tile_sizes = [[32, 32, 0], [1, 4, 0], [0, 0, 4]]>
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#map0 = affine_map<()[s0] -> (s0 * 32)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map2 = affine_map<(d0, d1, d2) -> (d0, d1)>
#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
func.func @split_reduction_pass2() attributes {iree.abi.stub} {
call @_split_reduction_pass2() : () -> ()
return
}
hal.executable private @_split_reduction_pass2_dispatch_0 {
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
hal.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 ordinal(0) layout(#pipeline_layout) attributes {translation_info = #translation} {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index):
%c1 = arith.constant 1 : index
%c16 = arith.constant 16 : index
%c8 = arith.constant 8 : index
hal.return %c8, %c16, %c1 : index, index, index
}
builtin.module {
func.func @_split_reduction_pass2_dispatch_0_generic_512x256x128() {
%c4 = arith.constant 4 : index
%c1 = arith.constant 1 : index
%c32 = arith.constant 32 : index
%c512 = arith.constant 512 : index
%c256 = arith.constant 256 : index
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:512x256x128xi32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:512x256xi32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%2 = affine.apply #map0()[%workgroup_id_y]
%3 = affine.apply #map0()[%workgroup_count_y]
%4 = affine.apply #map0()[%workgroup_id_x]
%5 = affine.apply #map0()[%workgroup_count_x]
scf.for %arg0 = %2 to %c512 step %3 {
scf.for %arg1 = %4 to %c256 step %5 {
%6 = flow.dispatch.tensor.load %1, offsets = [%arg0, %arg1], sizes = [32, 32], strides = [1, 1] : !flow.dispatch.tensor<writeonly:512x256xi32> -> tensor<32x32xi32>
%7 = flow.dispatch.tensor.load %0, offsets = [%arg0, %arg1, 0], sizes = [32, 32, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:512x256x128xi32> -> tensor<32x32x128xi32>
%8 = scf.for %arg2 = %c0 to %c32 step %c1 iter_args(%arg3 = %6) -> (tensor<32x32xi32>) {
%9 = scf.for %arg4 = %c0 to %c32 step %c4 iter_args(%arg5 = %arg3) -> (tensor<32x32xi32>) {
%10 = tensor.extract_slice %7[%arg2, %arg4, 0] [1, 4, 128] [1, 1, 1] : tensor<32x32x128xi32> to tensor<1x4x128xi32>
%11 = tensor.extract_slice %arg5[%arg2, %arg4] [1, 4] [1, 1] : tensor<32x32xi32> to tensor<1x4xi32>
%12 = linalg.generic {indexing_maps = [#map1, #map2], iterator_types = ["parallel", "parallel", "reduction"]} ins(%10 : tensor<1x4x128xi32>) outs(%11 : tensor<1x4xi32>) attrs = {lowering_config = #config} {
^bb0(%arg6: i32, %arg7: i32):
%14 = arith.addi %arg6, %arg7 : i32
linalg.yield %14 : i32
} -> tensor<1x4xi32>
%13 = tensor.insert_slice %12 into %arg5[%arg2, %arg4] [1, 4] [1, 1] : tensor<1x4xi32> into tensor<32x32xi32>
scf.yield %13 : tensor<32x32xi32>
}
scf.yield %9 : tensor<32x32xi32>
}
flow.dispatch.tensor.store %8, %1, offsets = [%arg0, %arg1], sizes = [32, 32], strides = [1, 1] : tensor<32x32xi32> -> !flow.dispatch.tensor<writeonly:512x256xi32>
}
}
return
}
}
}
}
func.func private @_split_reduction_pass2() {
%c1048576 = arith.constant 1048576 : index
%c0 = arith.constant 0 : index
%c524288 = arith.constant 524288 : index
%c67108864 = arith.constant 67108864 : index
%c1_i32 = arith.constant 1 : i32
%c128_i32 = arith.constant 128 : i32
%c512 = arith.constant 512 : index
%c256 = arith.constant 256 : index
%c1 = arith.constant 1 : index
%0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c67108864}
%1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c67108864}) {
stream.cmd.fill %c1_i32, %arg0[%c0 for %c67108864] : i32 -> !stream.resource<transient>{%c67108864}
} => !stream.timepoint
%2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c67108864}
%3 = util.do_not_optimize(%2) : !stream.resource<transient>
%4 = stream.resource.size %3 : !stream.resource<transient>
%5 = stream.resource.alloc uninitialized : !stream.resource<external>{%c1048576}
%6 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%4}, %5 as %arg1: !stream.resource<external>{%c1048576}) {
stream.cmd.concurrent {
stream.cmd.dispatch @_split_reduction_pass2_dispatch_0::@_split_reduction_pass2_dispatch_0_generic_512x256x128[%c512, %c256, %c1] {
ro %arg0[%c0 for %4] : !stream.resource<transient>{%4},
wo %arg1[%c0 for %c1048576] : !stream.resource<external>{%c1048576}
} attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>]}
stream.cmd.fill %c128_i32, %arg1[%c524288 for %c524288] : i32 -> !stream.resource<external>{%c1048576}
}
} => !stream.timepoint
%7 = stream.timepoint.await %6 => %5 : !stream.resource<external>{%c1048576}
%8 = stream.resource.subview %7[%c0] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288}
%9 = stream.resource.subview %7[%c524288] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288}
%10 = stream.tensor.export %8 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
%11 = stream.tensor.export %9 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
check.expect_eq(%10, %11) : tensor<512x256xi32>
return
}
}
// -----// IR Dump After LinalgFuse (linalg-fuse) ('func.func' operation: @_split_reduction_pass2_dispatch_0_generic_512x256x128) //----- //
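// NOTE: no change from the previous dump; the loop nest contains a single linalg.generic,
// so there is nothing to fuse at this level.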
#config = #iree_codegen.lowering_config<tile_sizes = [[32, 32, 0], [1, 4, 0], [0, 0, 4]]>
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#map0 = affine_map<()[s0] -> (s0 * 32)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map2 = affine_map<(d0, d1, d2) -> (d0, d1)>
#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
func.func @split_reduction_pass2() attributes {iree.abi.stub} {
call @_split_reduction_pass2() : () -> ()
return
}
hal.executable private @_split_reduction_pass2_dispatch_0 {
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
hal.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 ordinal(0) layout(#pipeline_layout) attributes {translation_info = #translation} {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index):
%c1 = arith.constant 1 : index
%c16 = arith.constant 16 : index
%c8 = arith.constant 8 : index
hal.return %c8, %c16, %c1 : index, index, index
}
builtin.module {
func.func @_split_reduction_pass2_dispatch_0_generic_512x256x128() {
%c4 = arith.constant 4 : index
%c1 = arith.constant 1 : index
%c32 = arith.constant 32 : index
%c512 = arith.constant 512 : index
%c256 = arith.constant 256 : index
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:512x256x128xi32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:512x256xi32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%2 = affine.apply #map0()[%workgroup_id_y]
%3 = affine.apply #map0()[%workgroup_count_y]
%4 = affine.apply #map0()[%workgroup_id_x]
%5 = affine.apply #map0()[%workgroup_count_x]
scf.for %arg0 = %2 to %c512 step %3 {
scf.for %arg1 = %4 to %c256 step %5 {
%6 = flow.dispatch.tensor.load %1, offsets = [%arg0, %arg1], sizes = [32, 32], strides = [1, 1] : !flow.dispatch.tensor<writeonly:512x256xi32> -> tensor<32x32xi32>
%7 = flow.dispatch.tensor.load %0, offsets = [%arg0, %arg1, 0], sizes = [32, 32, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:512x256x128xi32> -> tensor<32x32x128xi32>
%8 = scf.for %arg2 = %c0 to %c32 step %c1 iter_args(%arg3 = %6) -> (tensor<32x32xi32>) {
%9 = scf.for %arg4 = %c0 to %c32 step %c4 iter_args(%arg5 = %arg3) -> (tensor<32x32xi32>) {
%10 = tensor.extract_slice %7[%arg2, %arg4, 0] [1, 4, 128] [1, 1, 1] : tensor<32x32x128xi32> to tensor<1x4x128xi32>
%11 = tensor.extract_slice %arg5[%arg2, %arg4] [1, 4] [1, 1] : tensor<32x32xi32> to tensor<1x4xi32>
%12 = linalg.generic {indexing_maps = [#map1, #map2], iterator_types = ["parallel", "parallel", "reduction"]} ins(%10 : tensor<1x4x128xi32>) outs(%11 : tensor<1x4xi32>) attrs = {lowering_config = #config} {
^bb0(%arg6: i32, %arg7: i32):
%14 = arith.addi %arg6, %arg7 : i32
linalg.yield %14 : i32
} -> tensor<1x4xi32>
%13 = tensor.insert_slice %12 into %arg5[%arg2, %arg4] [1, 4] [1, 1] : tensor<1x4xi32> into tensor<32x32xi32>
scf.yield %13 : tensor<32x32xi32>
}
scf.yield %9 : tensor<32x32xi32>
}
flow.dispatch.tensor.store %8, %1, offsets = [%arg0, %arg1], sizes = [32, 32], strides = [1, 1] : tensor<32x32xi32> -> !flow.dispatch.tensor<writeonly:512x256xi32>
}
}
return
}
}
}
}
func.func private @_split_reduction_pass2() {
%c1048576 = arith.constant 1048576 : index
%c0 = arith.constant 0 : index
%c524288 = arith.constant 524288 : index
%c67108864 = arith.constant 67108864 : index
%c1_i32 = arith.constant 1 : i32
%c128_i32 = arith.constant 128 : i32
%c512 = arith.constant 512 : index
%c256 = arith.constant 256 : index
%c1 = arith.constant 1 : index
%0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c67108864}
%1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c67108864}) {
stream.cmd.fill %c1_i32, %arg0[%c0 for %c67108864] : i32 -> !stream.resource<transient>{%c67108864}
} => !stream.timepoint
%2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c67108864}
%3 = util.do_not_optimize(%2) : !stream.resource<transient>
%4 = stream.resource.size %3 : !stream.resource<transient>
%5 = stream.resource.alloc uninitialized : !stream.resource<external>{%c1048576}
%6 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%4}, %5 as %arg1: !stream.resource<external>{%c1048576}) {
stream.cmd.concurrent {
stream.cmd.dispatch @_split_reduction_pass2_dispatch_0::@_split_reduction_pass2_dispatch_0_generic_512x256x128[%c512, %c256, %c1] {
ro %arg0[%c0 for %4] : !stream.resource<transient>{%4},
wo %arg1[%c0 for %c1048576] : !stream.resource<external>{%c1048576}
} attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>]}
stream.cmd.fill %c128_i32, %arg1[%c524288 for %c524288] : i32 -> !stream.resource<external>{%c1048576}
}
} => !stream.timepoint
%7 = stream.timepoint.await %6 => %5 : !stream.resource<external>{%c1048576}
%8 = stream.resource.subview %7[%c0] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288}
%9 = stream.resource.subview %7[%c524288] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288}
%10 = stream.tensor.export %8 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
%11 = stream.tensor.export %9 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
check.expect_eq(%10, %11) : tensor<512x256xi32>
return
}
}
// -----// IR Dump After LinalgStrategyTileAndFusePass (iree-linalg-strategy-tile-and-fuse-pass) ('func.func' operation: @_split_reduction_pass2_dispatch_0_generic_512x256x128) //----- //
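// NOTE: third-level tiling per #config ([0, 0, 4]): a new scf.for steps over the 128-wide
// reduction dimension in chunks of 4, so the generic now accumulates a tensor<1x4x4xi32>
// slice into the tensor<1x4xi32> carried as an iter_arg. The op is re-tagged with
// __internal_linalg_transform__ = "1".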
#config = #iree_codegen.lowering_config<tile_sizes = [[32, 32, 0], [1, 4, 0], [0, 0, 4]]>
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#map0 = affine_map<()[s0] -> (s0 * 32)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map2 = affine_map<(d0, d1, d2) -> (d0, d1)>
#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
func.func @split_reduction_pass2() attributes {iree.abi.stub} {
call @_split_reduction_pass2() : () -> ()
return
}
hal.executable private @_split_reduction_pass2_dispatch_0 {
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
hal.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 ordinal(0) layout(#pipeline_layout) attributes {translation_info = #translation} {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index):
%c1 = arith.constant 1 : index
%c16 = arith.constant 16 : index
%c8 = arith.constant 8 : index
hal.return %c8, %c16, %c1 : index, index, index
}
builtin.module {
func.func @_split_reduction_pass2_dispatch_0_generic_512x256x128() {
%c128 = arith.constant 128 : index
%c4 = arith.constant 4 : index
%c1 = arith.constant 1 : index
%c32 = arith.constant 32 : index
%c512 = arith.constant 512 : index
%c256 = arith.constant 256 : index
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:512x256x128xi32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:512x256xi32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%2 = affine.apply #map0()[%workgroup_id_y]
%3 = affine.apply #map0()[%workgroup_count_y]
%4 = affine.apply #map0()[%workgroup_id_x]
%5 = affine.apply #map0()[%workgroup_count_x]
scf.for %arg0 = %2 to %c512 step %3 {
scf.for %arg1 = %4 to %c256 step %5 {
%6 = flow.dispatch.tensor.load %1, offsets = [%arg0, %arg1], sizes = [32, 32], strides = [1, 1] : !flow.dispatch.tensor<writeonly:512x256xi32> -> tensor<32x32xi32>
%7 = flow.dispatch.tensor.load %0, offsets = [%arg0, %arg1, 0], sizes = [32, 32, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:512x256x128xi32> -> tensor<32x32x128xi32>
%8 = scf.for %arg2 = %c0 to %c32 step %c1 iter_args(%arg3 = %6) -> (tensor<32x32xi32>) {
%9 = scf.for %arg4 = %c0 to %c32 step %c4 iter_args(%arg5 = %arg3) -> (tensor<32x32xi32>) {
%10 = tensor.extract_slice %7[%arg2, %arg4, 0] [1, 4, 128] [1, 1, 1] : tensor<32x32x128xi32> to tensor<1x4x128xi32>
%11 = tensor.extract_slice %arg5[%arg2, %arg4] [1, 4] [1, 1] : tensor<32x32xi32> to tensor<1x4xi32>
%12 = scf.for %arg6 = %c0 to %c128 step %c4 iter_args(%arg7 = %11) -> (tensor<1x4xi32>) {
%14 = tensor.extract_slice %10[0, 0, %arg6] [1, 4, 4] [1, 1, 1] : tensor<1x4x128xi32> to tensor<1x4x4xi32>
%15 = linalg.generic {indexing_maps = [#map1, #map2], iterator_types = ["parallel", "parallel", "reduction"]} ins(%14 : tensor<1x4x4xi32>) outs(%arg7 : tensor<1x4xi32>) attrs = {__internal_linalg_transform__ = "1", lowering_config = #config} {
^bb0(%arg8: i32, %arg9: i32):
%16 = arith.addi %arg8, %arg9 : i32
linalg.yield %16 : i32
} -> tensor<1x4xi32>
scf.yield %15 : tensor<1x4xi32>
}
%13 = tensor.insert_slice %12 into %arg5[%arg2, %arg4] [1, 4] [1, 1] : tensor<1x4xi32> into tensor<32x32xi32>
scf.yield %13 : tensor<32x32xi32>
}
scf.yield %9 : tensor<32x32xi32>
}
flow.dispatch.tensor.store %8, %1, offsets = [%arg0, %arg1], sizes = [32, 32], strides = [1, 1] : tensor<32x32xi32> -> !flow.dispatch.tensor<writeonly:512x256xi32>
}
}
return
}
}
}
}
func.func private @_split_reduction_pass2() {
%c1048576 = arith.constant 1048576 : index
%c0 = arith.constant 0 : index
%c524288 = arith.constant 524288 : index
%c67108864 = arith.constant 67108864 : index
%c1_i32 = arith.constant 1 : i32
%c128_i32 = arith.constant 128 : i32
%c512 = arith.constant 512 : index
%c256 = arith.constant 256 : index
%c1 = arith.constant 1 : index
%0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c67108864}
%1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c67108864}) {
stream.cmd.fill %c1_i32, %arg0[%c0 for %c67108864] : i32 -> !stream.resource<transient>{%c67108864}
} => !stream.timepoint
%2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c67108864}
%3 = util.do_not_optimize(%2) : !stream.resource<transient>
%4 = stream.resource.size %3 : !stream.resource<transient>
%5 = stream.resource.alloc uninitialized : !stream.resource<external>{%c1048576}
%6 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%4}, %5 as %arg1: !stream.resource<external>{%c1048576}) {
stream.cmd.concurrent {
stream.cmd.dispatch @_split_reduction_pass2_dispatch_0::@_split_reduction_pass2_dispatch_0_generic_512x256x128[%c512, %c256, %c1] {
ro %arg0[%c0 for %4] : !stream.resource<transient>{%4},
wo %arg1[%c0 for %c1048576] : !stream.resource<external>{%c1048576}
} attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>]}
stream.cmd.fill %c128_i32, %arg1[%c524288 for %c524288] : i32 -> !stream.resource<external>{%c1048576}
}
} => !stream.timepoint
%7 = stream.timepoint.await %6 => %5 : !stream.resource<external>{%c1048576}
%8 = stream.resource.subview %7[%c0] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288}
%9 = stream.resource.subview %7[%c524288] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288}
%10 = stream.tensor.export %8 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
%11 = stream.tensor.export %9 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
check.expect_eq(%10, %11) : tensor<512x256xi32>
return
}
}
// -----// IR Dump After LinalgStrategyRemoveMarkersPass (iree-linalg-strategy-remove-markers-pass) ('func.func' operation: @_split_reduction_pass2_dispatch_0_generic_512x256x128) //----- //
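// NOTE: as before, this only strips the __internal_linalg_transform__ marker from the
// reduction-tiled generic.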
#config = #iree_codegen.lowering_config<tile_sizes = [[32, 32, 0], [1, 4, 0], [0, 0, 4]]>
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#map0 = affine_map<()[s0] -> (s0 * 32)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map2 = affine_map<(d0, d1, d2) -> (d0, d1)>
#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
func.func @split_reduction_pass2() attributes {iree.abi.stub} {
call @_split_reduction_pass2() : () -> ()
return
}
hal.executable private @_split_reduction_pass2_dispatch_0 {
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
hal.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 ordinal(0) layout(#pipeline_layout) attributes {translation_info = #translation} {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index):
%c1 = arith.constant 1 : index
%c16 = arith.constant 16 : index
%c8 = arith.constant 8 : index
hal.return %c8, %c16, %c1 : index, index, index
}
builtin.module {
func.func @_split_reduction_pass2_dispatch_0_generic_512x256x128() {
%c128 = arith.constant 128 : index
%c4 = arith.constant 4 : index
%c1 = arith.constant 1 : index
%c32 = arith.constant 32 : index
%c512 = arith.constant 512 : index
%c256 = arith.constant 256 : index
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:512x256x128xi32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:512x256xi32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%2 = affine.apply #map0()[%workgroup_id_y]
%3 = affine.apply #map0()[%workgroup_count_y]
%4 = affine.apply #map0()[%workgroup_id_x]
%5 = affine.apply #map0()[%workgroup_count_x]
scf.for %arg0 = %2 to %c512 step %3 {
scf.for %arg1 = %4 to %c256 step %5 {
%6 = flow.dispatch.tensor.load %1, offsets = [%arg0, %arg1], sizes = [32, 32], strides = [1, 1] : !flow.dispatch.tensor<writeonly:512x256xi32> -> tensor<32x32xi32>
%7 = flow.dispatch.tensor.load %0, offsets = [%arg0, %arg1, 0], sizes = [32, 32, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:512x256x128xi32> -> tensor<32x32x128xi32>
%8 = scf.for %arg2 = %c0 to %c32 step %c1 iter_args(%arg3 = %6) -> (tensor<32x32xi32>) {
%9 = scf.for %arg4 = %c0 to %c32 step %c4 iter_args(%arg5 = %arg3) -> (tensor<32x32xi32>) {
%10 = tensor.extract_slice %7[%arg2, %arg4, 0] [1, 4, 128] [1, 1, 1] : tensor<32x32x128xi32> to tensor<1x4x128xi32>
%11 = tensor.extract_slice %arg5[%arg2, %arg4] [1, 4] [1, 1] : tensor<32x32xi32> to tensor<1x4xi32>
%12 = scf.for %arg6 = %c0 to %c128 step %c4 iter_args(%arg7 = %11) -> (tensor<1x4xi32>) {
%14 = tensor.extract_slice %10[0, 0, %arg6] [1, 4, 4] [1, 1, 1] : tensor<1x4x128xi32> to tensor<1x4x4xi32>
%15 = linalg.generic {indexing_maps = [#map1, #map2], iterator_types = ["parallel", "parallel", "reduction"]} ins(%14 : tensor<1x4x4xi32>) outs(%arg7 : tensor<1x4xi32>) attrs = {lowering_config = #config} {
^bb0(%arg8: i32, %arg9: i32):
%16 = arith.addi %arg8, %arg9 : i32
linalg.yield %16 : i32
} -> tensor<1x4xi32>
scf.yield %15 : tensor<1x4xi32>
}
%13 = tensor.insert_slice %12 into %arg5[%arg2, %arg4] [1, 4] [1, 1] : tensor<1x4xi32> into tensor<32x32xi32>
scf.yield %13 : tensor<32x32xi32>
}
scf.yield %9 : tensor<32x32xi32>
}
flow.dispatch.tensor.store %8, %1, offsets = [%arg0, %arg1], sizes = [32, 32], strides = [1, 1] : tensor<32x32xi32> -> !flow.dispatch.tensor<writeonly:512x256xi32>
}
}
return
}
}
}
}
func.func private @_split_reduction_pass2() {
%c1048576 = arith.constant 1048576 : index
%c0 = arith.constant 0 : index
%c524288 = arith.constant 524288 : index
%c67108864 = arith.constant 67108864 : index
%c1_i32 = arith.constant 1 : i32
%c128_i32 = arith.constant 128 : i32
%c512 = arith.constant 512 : index
%c256 = arith.constant 256 : index
%c1 = arith.constant 1 : index
%0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c67108864}
%1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c67108864}) {
stream.cmd.fill %c1_i32, %arg0[%c0 for %c67108864] : i32 -> !stream.resource<transient>{%c67108864}
} => !stream.timepoint
%2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c67108864}
%3 = util.do_not_optimize(%2) : !stream.resource<transient>
%4 = stream.resource.size %3 : !stream.resource<transient>
%5 = stream.resource.alloc uninitialized : !stream.resource<external>{%c1048576}
%6 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%4}, %5 as %arg1: !stream.resource<external>{%c1048576}) {
stream.cmd.concurrent {
stream.cmd.dispatch @_split_reduction_pass2_dispatch_0::@_split_reduction_pass2_dispatch_0_generic_512x256x128[%c512, %c256, %c1] {
ro %arg0[%c0 for %4] : !stream.resource<transient>{%4},
wo %arg1[%c0 for %c1048576] : !stream.resource<external>{%c1048576}
} attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>]}
stream.cmd.fill %c128_i32, %arg1[%c524288 for %c524288] : i32 -> !stream.resource<external>{%c1048576}
}
} => !stream.timepoint
%7 = stream.timepoint.await %6 => %5 : !stream.resource<external>{%c1048576}
%8 = stream.resource.subview %7[%c0] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288}
%9 = stream.resource.subview %7[%c524288] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288}
%10 = stream.tensor.export %8 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
%11 = stream.tensor.export %9 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
check.expect_eq(%10, %11) : tensor<512x256xi32>
return
}
}
// -----// IR Dump After LinalgFuse (linalg-fuse) ('func.func' operation: @_split_reduction_pass2_dispatch_0_generic_512x256x128) //----- //
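// NOTE: again a no-op for this dispatch; the dump below is identical to the previous one.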
#config = #iree_codegen.lowering_config<tile_sizes = [[32, 32, 0], [1, 4, 0], [0, 0, 4]]>
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#map0 = affine_map<()[s0] -> (s0 * 32)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map2 = affine_map<(d0, d1, d2) -> (d0, d1)>
#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
func.func @split_reduction_pass2() attributes {iree.abi.stub} {
call @_split_reduction_pass2() : () -> ()
return
}
hal.executable private @_split_reduction_pass2_dispatch_0 {
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
hal.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 ordinal(0) layout(#pipeline_layout) attributes {translation_info = #translation} {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index):
%c1 = arith.constant 1 : index
%c16 = arith.constant 16 : index
%c8 = arith.constant 8 : index
hal.return %c8, %c16, %c1 : index, index, index
}
builtin.module {
func.func @_split_reduction_pass2_dispatch_0_generic_512x256x128() {
%c128 = arith.constant 128 : index
%c4 = arith.constant 4 : index
%c1 = arith.constant 1 : index
%c32 = arith.constant 32 : index
%c512 = arith.constant 512 : index
%c256 = arith.constant 256 : index
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:512x256x128xi32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:512x256xi32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%2 = affine.apply #map0()[%workgroup_id_y]
%3 = affine.apply #map0()[%workgroup_count_y]
%4 = affine.apply #map0()[%workgroup_id_x]
%5 = affine.apply #map0()[%workgroup_count_x]
scf.for %arg0 = %2 to %c512 step %3 {
scf.for %arg1 = %4 to %c256 step %5 {
%6 = flow.dispatch.tensor.load %1, offsets = [%arg0, %arg1], sizes = [32, 32], strides = [1, 1] : !flow.dispatch.tensor<writeonly:512x256xi32> -> tensor<32x32xi32>
%7 = flow.dispatch.tensor.load %0, offsets = [%arg0, %arg1, 0], sizes = [32, 32, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:512x256x128xi32> -> tensor<32x32x128xi32>
%8 = scf.for %arg2 = %c0 to %c32 step %c1 iter_args(%arg3 = %6) -> (tensor<32x32xi32>) {
%9 = scf.for %arg4 = %c0 to %c32 step %c4 iter_args(%arg5 = %arg3) -> (tensor<32x32xi32>) {
%10 = tensor.extract_slice %7[%arg2, %arg4, 0] [1, 4, 128] [1, 1, 1] : tensor<32x32x128xi32> to tensor<1x4x128xi32>
%11 = tensor.extract_slice %arg5[%arg2, %arg4] [1, 4] [1, 1] : tensor<32x32xi32> to tensor<1x4xi32>
%12 = scf.for %arg6 = %c0 to %c128 step %c4 iter_args(%arg7 = %11) -> (tensor<1x4xi32>) {
%14 = tensor.extract_slice %10[0, 0, %arg6] [1, 4, 4] [1, 1, 1] : tensor<1x4x128xi32> to tensor<1x4x4xi32>
%15 = linalg.generic {indexing_maps = [#map1, #map2], iterator_types = ["parallel", "parallel", "reduction"]} ins(%14 : tensor<1x4x4xi32>) outs(%arg7 : tensor<1x4xi32>) attrs = {lowering_config = #config} {
^bb0(%arg8: i32, %arg9: i32):
%16 = arith.addi %arg8, %arg9 : i32
linalg.yield %16 : i32
} -> tensor<1x4xi32>
scf.yield %15 : tensor<1x4xi32>
}
%13 = tensor.insert_slice %12 into %arg5[%arg2, %arg4] [1, 4] [1, 1] : tensor<1x4xi32> into tensor<32x32xi32>
scf.yield %13 : tensor<32x32xi32>
}
scf.yield %9 : tensor<32x32xi32>
}
flow.dispatch.tensor.store %8, %1, offsets = [%arg0, %arg1], sizes = [32, 32], strides = [1, 1] : tensor<32x32xi32> -> !flow.dispatch.tensor<writeonly:512x256xi32>
}
}
return
}
}
}
}
func.func private @_split_reduction_pass2() {
%c1048576 = arith.constant 1048576 : index
%c0 = arith.constant 0 : index
%c524288 = arith.constant 524288 : index
%c67108864 = arith.constant 67108864 : index
%c1_i32 = arith.constant 1 : i32
%c128_i32 = arith.constant 128 : i32
%c512 = arith.constant 512 : index
%c256 = arith.constant 256 : index
%c1 = arith.constant 1 : index
%0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c67108864}
%1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c67108864}) {
stream.cmd.fill %c1_i32, %arg0[%c0 for %c67108864] : i32 -> !stream.resource<transient>{%c67108864}
} => !stream.timepoint
%2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c67108864}
%3 = util.do_not_optimize(%2) : !stream.resource<transient>
%4 = stream.resource.size %3 : !stream.resource<transient>
%5 = stream.resource.alloc uninitialized : !stream.resource<external>{%c1048576}
%6 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%4}, %5 as %arg1: !stream.resource<external>{%c1048576}) {
stream.cmd.concurrent {
stream.cmd.dispatch @_split_reduction_pass2_dispatch_0::@_split_reduction_pass2_dispatch_0_generic_512x256x128[%c512, %c256, %c1] {
ro %arg0[%c0 for %4] : !stream.resource<transient>{%4},
wo %arg1[%c0 for %c1048576] : !stream.resource<external>{%c1048576}
} attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>]}
stream.cmd.fill %c128_i32, %arg1[%c524288 for %c524288] : i32 -> !stream.resource<external>{%c1048576}
}
} => !stream.timepoint
%7 = stream.timepoint.await %6 => %5 : !stream.resource<external>{%c1048576}
%8 = stream.resource.subview %7[%c0] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288}
%9 = stream.resource.subview %7[%c524288] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288}
%10 = stream.tensor.export %8 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
%11 = stream.tensor.export %9 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
check.expect_eq(%10, %11) : tensor<512x256xi32>
return
}
}
// -----// IR Dump After LinalgSplitReduction (linalg-split-reduction) ('func.func' operation: @_split_reduction_pass2_dispatch_0_generic_512x256x128) //----- //
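// NOTE: this is the transformation the test is named after. The single 3-D reduction
// generic is rewritten into a chain of smaller ops: expand_shape/collapse_shape reshapes
// expose the 4-wide split, zero-filled accumulators (linalg.init_tensor + linalg.fill with
// %c0_i32) are updated by purely parallel 1-D add generics over tensor<4xi32>, and a final
// 1-D reduction generic folds the partial sums into the tensor<i32> that is inserted back
// into the 32x32 tile. The lowering_config attribute is dropped in the process and the
// indexing maps shrink to #map1 = (d0) -> (d0) and #map2 = (d0) -> ().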
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#map0 = affine_map<()[s0] -> (s0 * 32)>
#map1 = affine_map<(d0) -> (d0)>
#map2 = affine_map<(d0) -> ()>
#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
func.func @split_reduction_pass2() attributes {iree.abi.stub} {
call @_split_reduction_pass2() : () -> ()
return
}
hal.executable private @_split_reduction_pass2_dispatch_0 {
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
hal.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 ordinal(0) layout(#pipeline_layout) attributes {translation_info = #translation} {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index):
%c1 = arith.constant 1 : index
%c16 = arith.constant 16 : index
%c8 = arith.constant 8 : index
hal.return %c8, %c16, %c1 : index, index, index
}
builtin.module {
func.func @_split_reduction_pass2_dispatch_0_generic_512x256x128() {
%c0_i32 = arith.constant 0 : i32
%c128 = arith.constant 128 : index
%c4 = arith.constant 4 : index
%c1 = arith.constant 1 : index
%c32 = arith.constant 32 : index
%c512 = arith.constant 512 : index
%c256 = arith.constant 256 : index
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:512x256x128xi32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:512x256xi32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%2 = affine.apply #map0()[%workgroup_id_y]
%3 = affine.apply #map0()[%workgroup_count_y]
%4 = affine.apply #map0()[%workgroup_id_x]
%5 = affine.apply #map0()[%workgroup_count_x]
scf.for %arg0 = %2 to %c512 step %3 {
scf.for %arg1 = %4 to %c256 step %5 {
%6 = flow.dispatch.tensor.load %1, offsets = [%arg0, %arg1], sizes = [32, 32], strides = [1, 1] : !flow.dispatch.tensor<writeonly:512x256xi32> -> tensor<32x32xi32>
%7 = flow.dispatch.tensor.load %0, offsets = [%arg0, %arg1, 0], sizes = [32, 32, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:512x256x128xi32> -> tensor<32x32x128xi32>
%8 = scf.for %arg2 = %c0 to %c32 step %c1 iter_args(%arg3 = %6) -> (tensor<32x32xi32>) {
%9 = scf.for %arg4 = %c0 to %c32 step %c4 iter_args(%arg5 = %arg3) -> (tensor<32x32xi32>) {
%10 = tensor.extract_slice %7[%arg2, %arg4, 0] [1, 4, 128] [1, 1, 1] : tensor<32x32x128xi32> to tensor<4x128xi32>
%11 = tensor.expand_shape %10 [[0, 1], [2]] : tensor<4x128xi32> into tensor<1x4x128xi32>
%12 = tensor.extract_slice %arg5[%arg2, %arg4] [1, 4] [1, 1] : tensor<32x32xi32> to tensor<4xi32>
%13 = tensor.expand_shape %12 [[0, 1]] : tensor<4xi32> into tensor<1x4xi32>
%14 = scf.for %arg6 = %c0 to %c128 step %c4 iter_args(%arg7 = %13) -> (tensor<1x4xi32>) {
%17 = tensor.extract_slice %11[0, 0, %arg6] [1, 4, 4] [1, 1, 1] : tensor<1x4x128xi32> to tensor<4x4xi32>
%18 = tensor.expand_shape %17 [[0, 1], [2]] : tensor<4x4xi32> into tensor<1x4x4xi32>
%19 = scf.for %arg8 = %c0 to %c1 step %c1 iter_args(%arg9 = %arg7) -> (tensor<1x4xi32>) {
%20 = scf.for %arg10 = %c0 to %c4 step %c1 iter_args(%arg11 = %arg9) -> (tensor<1x4xi32>) {
%21 = tensor.extract_slice %18[%arg8, %arg10, 0] [1, 1, 4] [1, 1, 1] : tensor<1x4x4xi32> to tensor<4xi32>
%22 = tensor.extract_slice %arg11[%arg8, %arg10] [1, 1] [1, 1] : tensor<1x4xi32> to tensor<i32>
%23 = tensor.expand_shape %21 [[0, 1, 2, 3]] : tensor<4xi32> into tensor<1x1x1x4xi32>
%24 = linalg.init_tensor [1, 1, 4] : tensor<1x1x4xi32>
%25 = linalg.fill ins(%c0_i32 : i32) outs(%24 : tensor<1x1x4xi32>) -> tensor<1x1x4xi32>
%26 = scf.for %arg12 = %c0 to %c1 step %c1 iter_args(%arg13 = %25) -> (tensor<1x1x4xi32>) {
%33 = tensor.extract_slice %23[0, 0, %arg12, 0] [1, 1, 1, 4] [1, 1, 1, 1] : tensor<1x1x1x4xi32> to tensor<4xi32>
%34 = tensor.collapse_shape %arg13 [[0, 1, 2]] : tensor<1x1x4xi32> into tensor<4xi32>
%35 = linalg.generic {indexing_maps = [#map1, #map1], iterator_types = ["parallel"]} ins(%33 : tensor<4xi32>) outs(%34 : tensor<4xi32>) {
^bb0(%arg14: i32, %arg15: i32):
%37 = arith.addi %arg14, %arg15 : i32
linalg.yield %37 : i32
} -> tensor<4xi32>
%36 = tensor.expand_shape %35 [[0, 1, 2]] : tensor<4xi32> into tensor<1x1x4xi32>
scf.yield %36 : tensor<1x1x4xi32>
}
%27 = tensor.collapse_shape %26 [[0], [1, 2]] : tensor<1x1x4xi32> into tensor<1x4xi32>
%28 = linalg.init_tensor [4] : tensor<4xi32>
%29 = linalg.fill ins(%c0_i32 : i32) outs(%28 : tensor<4xi32>) -> tensor<4xi32>
%30 = scf.for %arg12 = %c0 to %c1 step %c1 iter_args(%arg13 = %29) -> (tensor<4xi32>) {
%33 = tensor.extract_slice %27[%arg12, 0] [1, 4] [1, 1] : tensor<1x4xi32> to tensor<4xi32>
%34 = linalg.generic {indexing_maps = [#map1, #map1], iterator_types = ["parallel"]} ins(%33 : tensor<4xi32>) outs(%arg13 : tensor<4xi32>) {
^bb0(%arg14: i32, %arg15: i32):
%35 = arith.addi %arg14, %arg15 : i32
linalg.yield %35 : i32
} -> tensor<4xi32>
scf.yield %34 : tensor<4xi32>
}
%31 = linalg.generic {indexing_maps = [#map1, #map2], iterator_types = ["reduction"]} ins(%30 : tensor<4xi32>) outs(%22 : tensor<i32>) {
^bb0(%arg12: i32, %arg13: i32):
%33 = arith.addi %arg12, %arg13 : i32
linalg.yield %33 : i32
} -> tensor<i32>
%32 = tensor.insert_slice %31 into %arg11[%arg8, %arg10] [1, 1] [1, 1] : tensor<i32> into tensor<1x4xi32>
scf.yield %32 : tensor<1x4xi32>
}
scf.yield %20 : tensor<1x4xi32>
}
scf.yield %19 : tensor<1x4xi32>
}
%15 = tensor.collapse_shape %14 [[0, 1]] : tensor<1x4xi32> into tensor<4xi32>
%16 = tensor.insert_slice %15 into %arg5[%arg2, %arg4] [1, 4] [1, 1] : tensor<4xi32> into tensor<32x32xi32>
scf.yield %16 : tensor<32x32xi32>
}
scf.yield %9 : tensor<32x32xi32>
}
flow.dispatch.tensor.store %8, %1, offsets = [%arg0, %arg1], sizes = [32, 32], strides = [1, 1] : tensor<32x32xi32> -> !flow.dispatch.tensor<writeonly:512x256xi32>
}
}
return
}
}
}
}
func.func private @_split_reduction_pass2() {
%c1048576 = arith.constant 1048576 : index
%c0 = arith.constant 0 : index
%c524288 = arith.constant 524288 : index
%c67108864 = arith.constant 67108864 : index
%c1_i32 = arith.constant 1 : i32
%c128_i32 = arith.constant 128 : i32
%c512 = arith.constant 512 : index
%c256 = arith.constant 256 : index
%c1 = arith.constant 1 : index
%0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c67108864}
%1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c67108864}) {
stream.cmd.fill %c1_i32, %arg0[%c0 for %c67108864] : i32 -> !stream.resource<transient>{%c67108864}
} => !stream.timepoint
%2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c67108864}
%3 = util.do_not_optimize(%2) : !stream.resource<transient>
%4 = stream.resource.size %3 : !stream.resource<transient>
%5 = stream.resource.alloc uninitialized : !stream.resource<external>{%c1048576}
%6 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%4}, %5 as %arg1: !stream.resource<external>{%c1048576}) {
stream.cmd.concurrent {
stream.cmd.dispatch @_split_reduction_pass2_dispatch_0::@_split_reduction_pass2_dispatch_0_generic_512x256x128[%c512, %c256, %c1] {
ro %arg0[%c0 for %4] : !stream.resource<transient>{%4},
wo %arg1[%c0 for %c1048576] : !stream.resource<external>{%c1048576}
} attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>]}
stream.cmd.fill %c128_i32, %arg1[%c524288 for %c524288] : i32 -> !stream.resource<external>{%c1048576}
}
} => !stream.timepoint
%7 = stream.timepoint.await %6 => %5 : !stream.resource<external>{%c1048576}
%8 = stream.resource.subview %7[%c0] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288}
%9 = stream.resource.subview %7[%c524288] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288}
%10 = stream.tensor.export %8 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
%11 = stream.tensor.export %9 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
check.expect_eq(%10, %11) : tensor<512x256xi32>
return
}
}
// -----// IR Dump After LinalgStrategyVectorizePass (iree-linalg-strategy-vectorize-pass) ('func.func' operation: @_split_reduction_pass2_dispatch_0_generic_512x256x128) //----- //
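// Annotation (not part of the compiler output): relative to the preceding dump, the innermost linalg.fill and linalg.generic reduction ops have been vectorized — zero-init is now a vector.transfer_write of dense<0>, the elementwise adds operate on vector<4xi32> via vector.transfer_read/arith.addi, and the final accumulation uses vector.multi_reduction <add> into the scalar carried in tensor<i32>.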
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#map = affine_map<()[s0] -> (s0 * 32)>
#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
func.func @split_reduction_pass2() attributes {iree.abi.stub} {
call @_split_reduction_pass2() : () -> ()
return
}
hal.executable private @_split_reduction_pass2_dispatch_0 {
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
hal.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 ordinal(0) layout(#pipeline_layout) attributes {translation_info = #translation} {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index):
%c1 = arith.constant 1 : index
%c16 = arith.constant 16 : index
%c8 = arith.constant 8 : index
hal.return %c8, %c16, %c1 : index, index, index
}
builtin.module {
func.func @_split_reduction_pass2_dispatch_0_generic_512x256x128() {
%cst = arith.constant dense<0> : vector<1x1x4xi32>
%cst_0 = arith.constant dense<0> : vector<4xi32>
%c0_i32 = arith.constant 0 : i32
%c128 = arith.constant 128 : index
%c4 = arith.constant 4 : index
%c1 = arith.constant 1 : index
%c32 = arith.constant 32 : index
%c512 = arith.constant 512 : index
%c256 = arith.constant 256 : index
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:512x256x128xi32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:512x256xi32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%2 = affine.apply #map()[%workgroup_id_y]
%3 = affine.apply #map()[%workgroup_count_y]
%4 = affine.apply #map()[%workgroup_id_x]
%5 = affine.apply #map()[%workgroup_count_x]
scf.for %arg0 = %2 to %c512 step %3 {
scf.for %arg1 = %4 to %c256 step %5 {
%6 = flow.dispatch.tensor.load %1, offsets = [%arg0, %arg1], sizes = [32, 32], strides = [1, 1] : !flow.dispatch.tensor<writeonly:512x256xi32> -> tensor<32x32xi32>
%7 = flow.dispatch.tensor.load %0, offsets = [%arg0, %arg1, 0], sizes = [32, 32, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:512x256x128xi32> -> tensor<32x32x128xi32>
%8 = scf.for %arg2 = %c0 to %c32 step %c1 iter_args(%arg3 = %6) -> (tensor<32x32xi32>) {
%9 = scf.for %arg4 = %c0 to %c32 step %c4 iter_args(%arg5 = %arg3) -> (tensor<32x32xi32>) {
%10 = tensor.extract_slice %7[%arg2, %arg4, 0] [1, 4, 128] [1, 1, 1] : tensor<32x32x128xi32> to tensor<4x128xi32>
%11 = tensor.expand_shape %10 [[0, 1], [2]] : tensor<4x128xi32> into tensor<1x4x128xi32>
%12 = tensor.extract_slice %arg5[%arg2, %arg4] [1, 4] [1, 1] : tensor<32x32xi32> to tensor<4xi32>
%13 = tensor.expand_shape %12 [[0, 1]] : tensor<4xi32> into tensor<1x4xi32>
%14 = scf.for %arg6 = %c0 to %c128 step %c4 iter_args(%arg7 = %13) -> (tensor<1x4xi32>) {
%17 = tensor.extract_slice %11[0, 0, %arg6] [1, 4, 4] [1, 1, 1] : tensor<1x4x128xi32> to tensor<4x4xi32>
%18 = tensor.expand_shape %17 [[0, 1], [2]] : tensor<4x4xi32> into tensor<1x4x4xi32>
%19 = scf.for %arg8 = %c0 to %c1 step %c1 iter_args(%arg9 = %arg7) -> (tensor<1x4xi32>) {
%20 = scf.for %arg10 = %c0 to %c4 step %c1 iter_args(%arg11 = %arg9) -> (tensor<1x4xi32>) {
%21 = tensor.extract_slice %18[%arg8, %arg10, 0] [1, 1, 4] [1, 1, 1] : tensor<1x4x4xi32> to tensor<4xi32>
%22 = tensor.extract_slice %arg11[%arg8, %arg10] [1, 1] [1, 1] : tensor<1x4xi32> to tensor<i32>
%23 = tensor.expand_shape %21 [[0, 1, 2, 3]] : tensor<4xi32> into tensor<1x1x1x4xi32>
%24 = linalg.init_tensor [1, 1, 4] : tensor<1x1x4xi32>
%25 = vector.transfer_write %cst, %24[%c0, %c0, %c0] {in_bounds = [true, true, true]} : vector<1x1x4xi32>, tensor<1x1x4xi32>
%26 = scf.for %arg12 = %c0 to %c1 step %c1 iter_args(%arg13 = %25) -> (tensor<1x1x4xi32>) {
%38 = tensor.collapse_shape %arg13 [[0, 1, 2]] : tensor<1x1x4xi32> into tensor<4xi32>
%39 = vector.transfer_read %23[%c0, %c0, %arg12, %c0], %c0_i32 {in_bounds = [true]} : tensor<1x1x1x4xi32>, vector<4xi32>
%40 = vector.transfer_read %38[%c0], %c0_i32 {in_bounds = [true]} : tensor<4xi32>, vector<4xi32>
%41 = arith.addi %39, %40 : vector<4xi32>
%42 = vector.transfer_write %41, %38[%c0] {in_bounds = [true]} : vector<4xi32>, tensor<4xi32>
%43 = tensor.expand_shape %42 [[0, 1, 2]] : tensor<4xi32> into tensor<1x1x4xi32>
scf.yield %43 : tensor<1x1x4xi32>
}
%27 = tensor.collapse_shape %26 [[0], [1, 2]] : tensor<1x1x4xi32> into tensor<1x4xi32>
%28 = linalg.init_tensor [4] : tensor<4xi32>
%29 = vector.transfer_write %cst_0, %28[%c0] {in_bounds = [true]} : vector<4xi32>, tensor<4xi32>
%30 = scf.for %arg12 = %c0 to %c1 step %c1 iter_args(%arg13 = %29) -> (tensor<4xi32>) {
%38 = vector.transfer_read %27[%arg12, %c0], %c0_i32 {in_bounds = [true]} : tensor<1x4xi32>, vector<4xi32>
%39 = vector.transfer_read %arg13[%c0], %c0_i32 {in_bounds = [true]} : tensor<4xi32>, vector<4xi32>
%40 = arith.addi %38, %39 : vector<4xi32>
%41 = vector.transfer_write %40, %arg13[%c0] {in_bounds = [true]} : vector<4xi32>, tensor<4xi32>
scf.yield %41 : tensor<4xi32>
}
%31 = vector.transfer_read %30[%c0], %c0_i32 {in_bounds = [true]} : tensor<4xi32>, vector<4xi32>
%32 = vector.transfer_read %22[], %c0_i32 : tensor<i32>, vector<i32>
%33 = vector.extractelement %32[] : vector<i32>
%34 = vector.multi_reduction <add>, %31, %33 [0] : vector<4xi32> to i32
%35 = vector.broadcast %34 : i32 to vector<i32>
%36 = vector.transfer_write %35, %22[] : vector<i32>, tensor<i32>
%37 = tensor.insert_slice %36 into %arg11[%arg8, %arg10] [1, 1] [1, 1] : tensor<i32> into tensor<1x4xi32>
scf.yield %37 : tensor<1x4xi32>
}
scf.yield %20 : tensor<1x4xi32>
}
scf.yield %19 : tensor<1x4xi32>
}
%15 = tensor.collapse_shape %14 [[0, 1]] : tensor<1x4xi32> into tensor<4xi32>
%16 = tensor.insert_slice %15 into %arg5[%arg2, %arg4] [1, 4] [1, 1] : tensor<4xi32> into tensor<32x32xi32>
scf.yield %16 : tensor<32x32xi32>
}
scf.yield %9 : tensor<32x32xi32>
}
flow.dispatch.tensor.store %8, %1, offsets = [%arg0, %arg1], sizes = [32, 32], strides = [1, 1] : tensor<32x32xi32> -> !flow.dispatch.tensor<writeonly:512x256xi32>
}
}
return
}
}
}
}
func.func private @_split_reduction_pass2() {
%c1048576 = arith.constant 1048576 : index
%c0 = arith.constant 0 : index
%c524288 = arith.constant 524288 : index
%c67108864 = arith.constant 67108864 : index
%c1_i32 = arith.constant 1 : i32
%c128_i32 = arith.constant 128 : i32
%c512 = arith.constant 512 : index
%c256 = arith.constant 256 : index
%c1 = arith.constant 1 : index
%0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c67108864}
%1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c67108864}) {
stream.cmd.fill %c1_i32, %arg0[%c0 for %c67108864] : i32 -> !stream.resource<transient>{%c67108864}
} => !stream.timepoint
%2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c67108864}
%3 = util.do_not_optimize(%2) : !stream.resource<transient>
%4 = stream.resource.size %3 : !stream.resource<transient>
%5 = stream.resource.alloc uninitialized : !stream.resource<external>{%c1048576}
%6 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%4}, %5 as %arg1: !stream.resource<external>{%c1048576}) {
stream.cmd.concurrent {
stream.cmd.dispatch @_split_reduction_pass2_dispatch_0::@_split_reduction_pass2_dispatch_0_generic_512x256x128[%c512, %c256, %c1] {
ro %arg0[%c0 for %4] : !stream.resource<transient>{%4},
wo %arg1[%c0 for %c1048576] : !stream.resource<external>{%c1048576}
} attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>]}
stream.cmd.fill %c128_i32, %arg1[%c524288 for %c524288] : i32 -> !stream.resource<external>{%c1048576}
}
} => !stream.timepoint
%7 = stream.timepoint.await %6 => %5 : !stream.resource<external>{%c1048576}
%8 = stream.resource.subview %7[%c0] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288}
%9 = stream.resource.subview %7[%c524288] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288}
%10 = stream.tensor.export %8 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
%11 = stream.tensor.export %9 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
check.expect_eq(%10, %11) : tensor<512x256xi32>
return
}
}
// -----// IR Dump After LinalgStrategyEnablePass (iree-linalg-strategy-enable-pass) ('func.func' operation: @_split_reduction_pass2_dispatch_0_generic_512x256x128) //----- //
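// Annotation (not part of the compiler output): in the dump below, the loop-invariant zero-initialization chain (linalg.init_tensor, vector.transfer_write of dense<0>, tensor.collapse_shape, vector.transfer_read) has been hoisted above the workgroup loops, and the single-iteration scf.for loops from the previous dump have been folded away, leaving a direct scf.for over %c4 for the 4-element tile.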
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#map = affine_map<()[s0] -> (s0 * 32)>
#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
func.func @split_reduction_pass2() attributes {iree.abi.stub} {
call @_split_reduction_pass2() : () -> ()
return
}
hal.executable private @_split_reduction_pass2_dispatch_0 {
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
hal.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 ordinal(0) layout(#pipeline_layout) attributes {translation_info = #translation} {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index):
%c1 = arith.constant 1 : index
%c16 = arith.constant 16 : index
%c8 = arith.constant 8 : index
hal.return %c8, %c16, %c1 : index, index, index
}
builtin.module {
func.func @_split_reduction_pass2_dispatch_0_generic_512x256x128() {
%cst = arith.constant dense<0> : vector<1x1x4xi32>
%c0_i32 = arith.constant 0 : i32
%c128 = arith.constant 128 : index
%c4 = arith.constant 4 : index
%c1 = arith.constant 1 : index
%c32 = arith.constant 32 : index
%c512 = arith.constant 512 : index
%c256 = arith.constant 256 : index
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:512x256x128xi32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:512x256xi32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%2 = affine.apply #map()[%workgroup_id_y]
%3 = affine.apply #map()[%workgroup_count_y]
%4 = affine.apply #map()[%workgroup_id_x]
%5 = affine.apply #map()[%workgroup_count_x]
%6 = linalg.init_tensor [1, 1, 4] : tensor<1x1x4xi32>
%7 = vector.transfer_write %cst, %6[%c0, %c0, %c0] {in_bounds = [true, true, true]} : vector<1x1x4xi32>, tensor<1x1x4xi32>
%8 = tensor.collapse_shape %7 [[0, 1, 2]] : tensor<1x1x4xi32> into tensor<4xi32>
%9 = vector.transfer_read %8[%c0], %c0_i32 {in_bounds = [true]} : tensor<4xi32>, vector<4xi32>
scf.for %arg0 = %2 to %c512 step %3 {
scf.for %arg1 = %4 to %c256 step %5 {
%10 = flow.dispatch.tensor.load %1, offsets = [%arg0, %arg1], sizes = [32, 32], strides = [1, 1] : !flow.dispatch.tensor<writeonly:512x256xi32> -> tensor<32x32xi32>
%11 = flow.dispatch.tensor.load %0, offsets = [%arg0, %arg1, 0], sizes = [32, 32, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:512x256x128xi32> -> tensor<32x32x128xi32>
%12 = scf.for %arg2 = %c0 to %c32 step %c1 iter_args(%arg3 = %10) -> (tensor<32x32xi32>) {
%13 = scf.for %arg4 = %c0 to %c32 step %c4 iter_args(%arg5 = %arg3) -> (tensor<32x32xi32>) {
%14 = tensor.extract_slice %11[%arg2, %arg4, 0] [1, 4, 128] [1, 1, 1] : tensor<32x32x128xi32> to tensor<4x128xi32>
%15 = tensor.expand_shape %14 [[0, 1], [2]] : tensor<4x128xi32> into tensor<1x4x128xi32>
%16 = tensor.extract_slice %arg5[%arg2, %arg4] [1, 4] [1, 1] : tensor<32x32xi32> to tensor<4xi32>
%17 = tensor.expand_shape %16 [[0, 1]] : tensor<4xi32> into tensor<1x4xi32>
%18 = scf.for %arg6 = %c0 to %c128 step %c4 iter_args(%arg7 = %17) -> (tensor<1x4xi32>) {
%21 = tensor.extract_slice %15[0, 0, %arg6] [1, 4, 4] [1, 1, 1] : tensor<1x4x128xi32> to tensor<4x4xi32>
%22 = tensor.expand_shape %21 [[0, 1], [2]] : tensor<4x4xi32> into tensor<1x4x4xi32>
%23 = scf.for %arg8 = %c0 to %c4 step %c1 iter_args(%arg9 = %arg7) -> (tensor<1x4xi32>) {
%24 = tensor.extract_slice %22[0, %arg8, 0] [1, 1, 4] [1, 1, 1] : tensor<1x4x4xi32> to tensor<4xi32>
%25 = tensor.extract_slice %arg9[0, %arg8] [1, 1] [1, 1] : tensor<1x4xi32> to tensor<i32>
%26 = tensor.expand_shape %24 [[0, 1, 2, 3]] : tensor<4xi32> into tensor<1x1x1x4xi32>
%27 = vector.transfer_read %26[%c0, %c0, %c0, %c0], %c0_i32 {in_bounds = [true]} : tensor<1x1x1x4xi32>, vector<4xi32>
%28 = arith.addi %27, %9 : vector<4xi32>
%29 = vector.transfer_write %28, %8[%c0] {in_bounds = [true]} : vector<4xi32>, tensor<4xi32>
%30 = tensor.expand_shape %29 [[0, 1, 2]] : tensor<4xi32> into tensor<1x1x4xi32>
%31 = tensor.collapse_shape %30 [[0], [1, 2]] : tensor<1x1x4xi32> into tensor<1x4xi32>
%32 = vector.transfer_read %31[%c0, %c0], %c0_i32 {in_bounds = [true]} : tensor<1x4xi32>, vector<4xi32>
%33 = vector.transfer_read %25[], %c0_i32 : tensor<i32>, vector<i32>
%34 = vector.extractelement %33[] : vector<i32>
%35 = vector.multi_reduction <add>, %32, %34 [0] : vector<4xi32> to i32
%36 = vector.broadcast %35 : i32 to vector<i32>
%37 = vector.transfer_write %36, %25[] : vector<i32>, tensor<i32>
%38 = tensor.insert_slice %37 into %arg9[0, %arg8] [1, 1] [1, 1] : tensor<i32> into tensor<1x4xi32>
scf.yield %38 : tensor<1x4xi32>
}
scf.yield %23 : tensor<1x4xi32>
}
%19 = tensor.collapse_shape %18 [[0, 1]] : tensor<1x4xi32> into tensor<4xi32>
%20 = tensor.insert_slice %19 into %arg5[%arg2, %arg4] [1, 4] [1, 1] : tensor<4xi32> into tensor<32x32xi32>
scf.yield %20 : tensor<32x32xi32>
}
scf.yield %13 : tensor<32x32xi32>
}
flow.dispatch.tensor.store %12, %1, offsets = [%arg0, %arg1], sizes = [32, 32], strides = [1, 1] : tensor<32x32xi32> -> !flow.dispatch.tensor<writeonly:512x256xi32>
}
}
return
}
}
}
}
func.func private @_split_reduction_pass2() {
%c1048576 = arith.constant 1048576 : index
%c0 = arith.constant 0 : index
%c524288 = arith.constant 524288 : index
%c67108864 = arith.constant 67108864 : index
%c1_i32 = arith.constant 1 : i32
%c128_i32 = arith.constant 128 : i32
%c512 = arith.constant 512 : index
%c256 = arith.constant 256 : index
%c1 = arith.constant 1 : index
%0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c67108864}
%1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c67108864}) {
stream.cmd.fill %c1_i32, %arg0[%c0 for %c67108864] : i32 -> !stream.resource<transient>{%c67108864}
} => !stream.timepoint
%2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c67108864}
%3 = util.do_not_optimize(%2) : !stream.resource<transient>
%4 = stream.resource.size %3 : !stream.resource<transient>
%5 = stream.resource.alloc uninitialized : !stream.resource<external>{%c1048576}
%6 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%4}, %5 as %arg1: !stream.resource<external>{%c1048576}) {
stream.cmd.concurrent {
stream.cmd.dispatch @_split_reduction_pass2_dispatch_0::@_split_reduction_pass2_dispatch_0_generic_512x256x128[%c512, %c256, %c1] {
ro %arg0[%c0 for %4] : !stream.resource<transient>{%4},
wo %arg1[%c0 for %c1048576] : !stream.resource<external>{%c1048576}
} attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>]}
stream.cmd.fill %c128_i32, %arg1[%c524288 for %c524288] : i32 -> !stream.resource<external>{%c1048576}
}
} => !stream.timepoint
%7 = stream.timepoint.await %6 => %5 : !stream.resource<external>{%c1048576}
%8 = stream.resource.subview %7[%c0] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288}
%9 = stream.resource.subview %7[%c524288] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288}
%10 = stream.tensor.export %8 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
%11 = stream.tensor.export %9 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
check.expect_eq(%10, %11) : tensor<512x256xi32>
return
}
}
// -----// IR Dump After LinalgSingleTilingExpert (linalg-single-tiling-expert-driver) ('func.func' operation: @_split_reduction_pass2_dispatch_0_generic_512x256x128) //----- //
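// Annotation (not part of the compiler output): the dispatch function in this dump appears unchanged from the preceding LinalgStrategyEnablePass dump; no further tiling was applied at this stage.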
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#map = affine_map<()[s0] -> (s0 * 32)>
#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
func.func @split_reduction_pass2() attributes {iree.abi.stub} {
call @_split_reduction_pass2() : () -> ()
return
}
hal.executable private @_split_reduction_pass2_dispatch_0 {
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
hal.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 ordinal(0) layout(#pipeline_layout) attributes {translation_info = #translation} {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index):
%c1 = arith.constant 1 : index
%c16 = arith.constant 16 : index
%c8 = arith.constant 8 : index
hal.return %c8, %c16, %c1 : index, index, index
}
builtin.module {
func.func @_split_reduction_pass2_dispatch_0_generic_512x256x128() {
%cst = arith.constant dense<0> : vector<1x1x4xi32>
%c0_i32 = arith.constant 0 : i32
%c128 = arith.constant 128 : index
%c4 = arith.constant 4 : index
%c1 = arith.constant 1 : index
%c32 = arith.constant 32 : index
%c512 = arith.constant 512 : index
%c256 = arith.constant 256 : index
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:512x256x128xi32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:512x256xi32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%2 = affine.apply #map()[%workgroup_id_y]
%3 = affine.apply #map()[%workgroup_count_y]
%4 = affine.apply #map()[%workgroup_id_x]
%5 = affine.apply #map()[%workgroup_count_x]
%6 = linalg.init_tensor [1, 1, 4] : tensor<1x1x4xi32>
%7 = vector.transfer_write %cst, %6[%c0, %c0, %c0] {in_bounds = [true, true, true]} : vector<1x1x4xi32>, tensor<1x1x4xi32>
%8 = tensor.collapse_shape %7 [[0, 1, 2]] : tensor<1x1x4xi32> into tensor<4xi32>
%9 = vector.transfer_read %8[%c0], %c0_i32 {in_bounds = [true]} : tensor<4xi32>, vector<4xi32>
scf.for %arg0 = %2 to %c512 step %3 {
scf.for %arg1 = %4 to %c256 step %5 {
%10 = flow.dispatch.tensor.load %1, offsets = [%arg0, %arg1], sizes = [32, 32], strides = [1, 1] : !flow.dispatch.tensor<writeonly:512x256xi32> -> tensor<32x32xi32>
%11 = flow.dispatch.tensor.load %0, offsets = [%arg0, %arg1, 0], sizes = [32, 32, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:512x256x128xi32> -> tensor<32x32x128xi32>
%12 = scf.for %arg2 = %c0 to %c32 step %c1 iter_args(%arg3 = %10) -> (tensor<32x32xi32>) {
%13 = scf.for %arg4 = %c0 to %c32 step %c4 iter_args(%arg5 = %arg3) -> (tensor<32x32xi32>) {
%14 = tensor.extract_slice %11[%arg2, %arg4, 0] [1, 4, 128] [1, 1, 1] : tensor<32x32x128xi32> to tensor<4x128xi32>
%15 = tensor.expand_shape %14 [[0, 1], [2]] : tensor<4x128xi32> into tensor<1x4x128xi32>
%16 = tensor.extract_slice %arg5[%arg2, %arg4] [1, 4] [1, 1] : tensor<32x32xi32> to tensor<4xi32>
%17 = tensor.expand_shape %16 [[0, 1]] : tensor<4xi32> into tensor<1x4xi32>
%18 = scf.for %arg6 = %c0 to %c128 step %c4 iter_args(%arg7 = %17) -> (tensor<1x4xi32>) {
%21 = tensor.extract_slice %15[0, 0, %arg6] [1, 4, 4] [1, 1, 1] : tensor<1x4x128xi32> to tensor<4x4xi32>
%22 = tensor.expand_shape %21 [[0, 1], [2]] : tensor<4x4xi32> into tensor<1x4x4xi32>
%23 = scf.for %arg8 = %c0 to %c4 step %c1 iter_args(%arg9 = %arg7) -> (tensor<1x4xi32>) {
%24 = tensor.extract_slice %22[0, %arg8, 0] [1, 1, 4] [1, 1, 1] : tensor<1x4x4xi32> to tensor<4xi32>
%25 = tensor.extract_slice %arg9[0, %arg8] [1, 1] [1, 1] : tensor<1x4xi32> to tensor<i32>
%26 = tensor.expand_shape %24 [[0, 1, 2, 3]] : tensor<4xi32> into tensor<1x1x1x4xi32>
%27 = vector.transfer_read %26[%c0, %c0, %c0, %c0], %c0_i32 {in_bounds = [true]} : tensor<1x1x1x4xi32>, vector<4xi32>
%28 = arith.addi %27, %9 : vector<4xi32>
%29 = vector.transfer_write %28, %8[%c0] {in_bounds = [true]} : vector<4xi32>, tensor<4xi32>
%30 = tensor.expand_shape %29 [[0, 1, 2]] : tensor<4xi32> into tensor<1x1x4xi32>
%31 = tensor.collapse_shape %30 [[0], [1, 2]] : tensor<1x1x4xi32> into tensor<1x4xi32>
%32 = vector.transfer_read %31[%c0, %c0], %c0_i32 {in_bounds = [true]} : tensor<1x4xi32>, vector<4xi32>
%33 = vector.transfer_read %25[], %c0_i32 : tensor<i32>, vector<i32>
%34 = vector.extractelement %33[] : vector<i32>
%35 = vector.multi_reduction <add>, %32, %34 [0] : vector<4xi32> to i32
%36 = vector.broadcast %35 : i32 to vector<i32>
%37 = vector.transfer_write %36, %25[] : vector<i32>, tensor<i32>
%38 = tensor.insert_slice %37 into %arg9[0, %arg8] [1, 1] [1, 1] : tensor<i32> into tensor<1x4xi32>
scf.yield %38 : tensor<1x4xi32>
}
scf.yield %23 : tensor<1x4xi32>
}
%19 = tensor.collapse_shape %18 [[0, 1]] : tensor<1x4xi32> into tensor<4xi32>
%20 = tensor.insert_slice %19 into %arg5[%arg2, %arg4] [1, 4] [1, 1] : tensor<4xi32> into tensor<32x32xi32>
scf.yield %20 : tensor<32x32xi32>
}
scf.yield %13 : tensor<32x32xi32>
}
flow.dispatch.tensor.store %12, %1, offsets = [%arg0, %arg1], sizes = [32, 32], strides = [1, 1] : tensor<32x32xi32> -> !flow.dispatch.tensor<writeonly:512x256xi32>
}
}
return
}
}
}
}
func.func private @_split_reduction_pass2() {
%c1048576 = arith.constant 1048576 : index
%c0 = arith.constant 0 : index
%c524288 = arith.constant 524288 : index
%c67108864 = arith.constant 67108864 : index
%c1_i32 = arith.constant 1 : i32
%c128_i32 = arith.constant 128 : i32
%c512 = arith.constant 512 : index
%c256 = arith.constant 256 : index
%c1 = arith.constant 1 : index
%0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c67108864}
%1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c67108864}) {
stream.cmd.fill %c1_i32, %arg0[%c0 for %c67108864] : i32 -> !stream.resource<transient>{%c67108864}
} => !stream.timepoint
%2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c67108864}
%3 = util.do_not_optimize(%2) : !stream.resource<transient>
%4 = stream.resource.size %3 : !stream.resource<transient>
%5 = stream.resource.alloc uninitialized : !stream.resource<external>{%c1048576}
%6 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%4}, %5 as %arg1: !stream.resource<external>{%c1048576}) {
stream.cmd.concurrent {
stream.cmd.dispatch @_split_reduction_pass2_dispatch_0::@_split_reduction_pass2_dispatch_0_generic_512x256x128[%c512, %c256, %c1] {
ro %arg0[%c0 for %4] : !stream.resource<transient>{%4},
wo %arg1[%c0 for %c1048576] : !stream.resource<external>{%c1048576}
} attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>]}
stream.cmd.fill %c128_i32, %arg1[%c524288 for %c524288] : i32 -> !stream.resource<external>{%c1048576}
}
} => !stream.timepoint
%7 = stream.timepoint.await %6 => %5 : !stream.resource<external>{%c1048576}
%8 = stream.resource.subview %7[%c0] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288}
%9 = stream.resource.subview %7[%c524288] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288}
%10 = stream.tensor.export %8 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
%11 = stream.tensor.export %9 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
check.expect_eq(%10, %11) : tensor<512x256xi32>
return
}
}
// -----// IR Dump After Canonicalizer (canonicalize) ('func.func' operation: @_split_reduction_pass2_dispatch_0_generic_512x256x128) //----- //
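// Annotation (not part of the compiler output): canonicalization folds the back-to-back tensor.expand_shape (tensor<4xi32> into tensor<1x1x4xi32>) and tensor.collapse_shape (into tensor<1x4xi32>) in the innermost loop into a single tensor.expand_shape [[0, 1]] : tensor<4xi32> into tensor<1x4xi32>.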
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#map = affine_map<()[s0] -> (s0 * 32)>
#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
func.func @split_reduction_pass2() attributes {iree.abi.stub} {
call @_split_reduction_pass2() : () -> ()
return
}
hal.executable private @_split_reduction_pass2_dispatch_0 {
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
hal.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 ordinal(0) layout(#pipeline_layout) attributes {translation_info = #translation} {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index):
%c1 = arith.constant 1 : index
%c16 = arith.constant 16 : index
%c8 = arith.constant 8 : index
hal.return %c8, %c16, %c1 : index, index, index
}
builtin.module {
func.func @_split_reduction_pass2_dispatch_0_generic_512x256x128() {
%cst = arith.constant dense<0> : vector<1x1x4xi32>
%c0_i32 = arith.constant 0 : i32
%c128 = arith.constant 128 : index
%c4 = arith.constant 4 : index
%c1 = arith.constant 1 : index
%c32 = arith.constant 32 : index
%c512 = arith.constant 512 : index
%c256 = arith.constant 256 : index
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:512x256x128xi32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:512x256xi32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%2 = affine.apply #map()[%workgroup_id_y]
%3 = affine.apply #map()[%workgroup_count_y]
%4 = affine.apply #map()[%workgroup_id_x]
%5 = affine.apply #map()[%workgroup_count_x]
%6 = linalg.init_tensor [1, 1, 4] : tensor<1x1x4xi32>
%7 = vector.transfer_write %cst, %6[%c0, %c0, %c0] {in_bounds = [true, true, true]} : vector<1x1x4xi32>, tensor<1x1x4xi32>
%8 = tensor.collapse_shape %7 [[0, 1, 2]] : tensor<1x1x4xi32> into tensor<4xi32>
%9 = vector.transfer_read %8[%c0], %c0_i32 {in_bounds = [true]} : tensor<4xi32>, vector<4xi32>
scf.for %arg0 = %2 to %c512 step %3 {
scf.for %arg1 = %4 to %c256 step %5 {
%10 = flow.dispatch.tensor.load %1, offsets = [%arg0, %arg1], sizes = [32, 32], strides = [1, 1] : !flow.dispatch.tensor<writeonly:512x256xi32> -> tensor<32x32xi32>
%11 = flow.dispatch.tensor.load %0, offsets = [%arg0, %arg1, 0], sizes = [32, 32, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:512x256x128xi32> -> tensor<32x32x128xi32>
%12 = scf.for %arg2 = %c0 to %c32 step %c1 iter_args(%arg3 = %10) -> (tensor<32x32xi32>) {
%13 = scf.for %arg4 = %c0 to %c32 step %c4 iter_args(%arg5 = %arg3) -> (tensor<32x32xi32>) {
%14 = tensor.extract_slice %11[%arg2, %arg4, 0] [1, 4, 128] [1, 1, 1] : tensor<32x32x128xi32> to tensor<4x128xi32>
%15 = tensor.expand_shape %14 [[0, 1], [2]] : tensor<4x128xi32> into tensor<1x4x128xi32>
%16 = tensor.extract_slice %arg5[%arg2, %arg4] [1, 4] [1, 1] : tensor<32x32xi32> to tensor<4xi32>
%17 = tensor.expand_shape %16 [[0, 1]] : tensor<4xi32> into tensor<1x4xi32>
%18 = scf.for %arg6 = %c0 to %c128 step %c4 iter_args(%arg7 = %17) -> (tensor<1x4xi32>) {
%21 = tensor.extract_slice %15[0, 0, %arg6] [1, 4, 4] [1, 1, 1] : tensor<1x4x128xi32> to tensor<4x4xi32>
%22 = tensor.expand_shape %21 [[0, 1], [2]] : tensor<4x4xi32> into tensor<1x4x4xi32>
%23 = scf.for %arg8 = %c0 to %c4 step %c1 iter_args(%arg9 = %arg7) -> (tensor<1x4xi32>) {
%24 = tensor.extract_slice %22[0, %arg8, 0] [1, 1, 4] [1, 1, 1] : tensor<1x4x4xi32> to tensor<4xi32>
%25 = tensor.extract_slice %arg9[0, %arg8] [1, 1] [1, 1] : tensor<1x4xi32> to tensor<i32>
%26 = tensor.expand_shape %24 [[0, 1, 2, 3]] : tensor<4xi32> into tensor<1x1x1x4xi32>
%27 = vector.transfer_read %26[%c0, %c0, %c0, %c0], %c0_i32 {in_bounds = [true]} : tensor<1x1x1x4xi32>, vector<4xi32>
%28 = arith.addi %27, %9 : vector<4xi32>
%29 = vector.transfer_write %28, %8[%c0] {in_bounds = [true]} : vector<4xi32>, tensor<4xi32>
%30 = tensor.expand_shape %29 [[0, 1]] : tensor<4xi32> into tensor<1x4xi32>
%31 = vector.transfer_read %30[%c0, %c0], %c0_i32 {in_bounds = [true]} : tensor<1x4xi32>, vector<4xi32>
%32 = vector.transfer_read %25[], %c0_i32 : tensor<i32>, vector<i32>
%33 = vector.extractelement %32[] : vector<i32>
%34 = vector.multi_reduction <add>, %31, %33 [0] : vector<4xi32> to i32
%35 = vector.broadcast %34 : i32 to vector<i32>
%36 = vector.transfer_write %35, %25[] : vector<i32>, tensor<i32>
%37 = tensor.insert_slice %36 into %arg9[0, %arg8] [1, 1] [1, 1] : tensor<i32> into tensor<1x4xi32>
scf.yield %37 : tensor<1x4xi32>
}
scf.yield %23 : tensor<1x4xi32>
}
%19 = tensor.collapse_shape %18 [[0, 1]] : tensor<1x4xi32> into tensor<4xi32>
%20 = tensor.insert_slice %19 into %arg5[%arg2, %arg4] [1, 4] [1, 1] : tensor<4xi32> into tensor<32x32xi32>
scf.yield %20 : tensor<32x32xi32>
}
scf.yield %13 : tensor<32x32xi32>
}
flow.dispatch.tensor.store %12, %1, offsets = [%arg0, %arg1], sizes = [32, 32], strides = [1, 1] : tensor<32x32xi32> -> !flow.dispatch.tensor<writeonly:512x256xi32>
}
}
return
}
}
}
}
func.func private @_split_reduction_pass2() {
%c1048576 = arith.constant 1048576 : index
%c0 = arith.constant 0 : index
%c524288 = arith.constant 524288 : index
%c67108864 = arith.constant 67108864 : index
%c1_i32 = arith.constant 1 : i32
%c128_i32 = arith.constant 128 : i32
%c512 = arith.constant 512 : index
%c256 = arith.constant 256 : index
%c1 = arith.constant 1 : index
%0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c67108864}
%1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c67108864}) {
stream.cmd.fill %c1_i32, %arg0[%c0 for %c67108864] : i32 -> !stream.resource<transient>{%c67108864}
} => !stream.timepoint
%2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c67108864}
%3 = util.do_not_optimize(%2) : !stream.resource<transient>
%4 = stream.resource.size %3 : !stream.resource<transient>
%5 = stream.resource.alloc uninitialized : !stream.resource<external>{%c1048576}
%6 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%4}, %5 as %arg1: !stream.resource<external>{%c1048576}) {
stream.cmd.concurrent {
stream.cmd.dispatch @_split_reduction_pass2_dispatch_0::@_split_reduction_pass2_dispatch_0_generic_512x256x128[%c512, %c256, %c1] {
ro %arg0[%c0 for %4] : !stream.resource<transient>{%4},
wo %arg1[%c0 for %c1048576] : !stream.resource<external>{%c1048576}
} attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>]}
stream.cmd.fill %c128_i32, %arg1[%c524288 for %c524288] : i32 -> !stream.resource<external>{%c1048576}
}
} => !stream.timepoint
%7 = stream.timepoint.await %6 => %5 : !stream.resource<external>{%c1048576}
%8 = stream.resource.subview %7[%c0] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288}
%9 = stream.resource.subview %7[%c524288] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288}
%10 = stream.tensor.export %8 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
%11 = stream.tensor.export %9 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
check.expect_eq(%10, %11) : tensor<512x256xi32>
return
}
}
// -----// IR Dump After LinalgInitTensorToAllocTensor (linalg-init-tensor-to-alloc-tensor) ('builtin.module' operation) //----- //
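// Annotation (not part of the compiler output): in the dump below, linalg.init_tensor [1, 1, 4] has been rewritten to bufferization.alloc_tensor() in preparation for bufferization; the rest of the dispatch appears unchanged.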
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#map = affine_map<()[s0] -> (s0 * 32)>
#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
func.func @split_reduction_pass2() attributes {iree.abi.stub} {
call @_split_reduction_pass2() : () -> ()
return
}
hal.executable private @_split_reduction_pass2_dispatch_0 {
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
hal.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 ordinal(0) layout(#pipeline_layout) attributes {translation_info = #translation} {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index):
%c1 = arith.constant 1 : index
%c16 = arith.constant 16 : index
%c8 = arith.constant 8 : index
hal.return %c8, %c16, %c1 : index, index, index
}
builtin.module {
func.func @_split_reduction_pass2_dispatch_0_generic_512x256x128() {
%cst = arith.constant dense<0> : vector<1x1x4xi32>
%c0_i32 = arith.constant 0 : i32
%c128 = arith.constant 128 : index
%c4 = arith.constant 4 : index
%c1 = arith.constant 1 : index
%c32 = arith.constant 32 : index
%c512 = arith.constant 512 : index
%c256 = arith.constant 256 : index
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:512x256x128xi32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:512x256xi32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%2 = affine.apply #map()[%workgroup_id_y]
%3 = affine.apply #map()[%workgroup_count_y]
%4 = affine.apply #map()[%workgroup_id_x]
%5 = affine.apply #map()[%workgroup_count_x]
%6 = bufferization.alloc_tensor() : tensor<1x1x4xi32>
%7 = vector.transfer_write %cst, %6[%c0, %c0, %c0] {in_bounds = [true, true, true]} : vector<1x1x4xi32>, tensor<1x1x4xi32>
%8 = tensor.collapse_shape %7 [[0, 1, 2]] : tensor<1x1x4xi32> into tensor<4xi32>
%9 = vector.transfer_read %8[%c0], %c0_i32 {in_bounds = [true]} : tensor<4xi32>, vector<4xi32>
scf.for %arg0 = %2 to %c512 step %3 {
scf.for %arg1 = %4 to %c256 step %5 {
%10 = flow.dispatch.tensor.load %1, offsets = [%arg0, %arg1], sizes = [32, 32], strides = [1, 1] : !flow.dispatch.tensor<writeonly:512x256xi32> -> tensor<32x32xi32>
%11 = flow.dispatch.tensor.load %0, offsets = [%arg0, %arg1, 0], sizes = [32, 32, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:512x256x128xi32> -> tensor<32x32x128xi32>
%12 = scf.for %arg2 = %c0 to %c32 step %c1 iter_args(%arg3 = %10) -> (tensor<32x32xi32>) {
%13 = scf.for %arg4 = %c0 to %c32 step %c4 iter_args(%arg5 = %arg3) -> (tensor<32x32xi32>) {
%14 = tensor.extract_slice %11[%arg2, %arg4, 0] [1, 4, 128] [1, 1, 1] : tensor<32x32x128xi32> to tensor<4x128xi32>
%15 = tensor.expand_shape %14 [[0, 1], [2]] : tensor<4x128xi32> into tensor<1x4x128xi32>
%16 = tensor.extract_slice %arg5[%arg2, %arg4] [1, 4] [1, 1] : tensor<32x32xi32> to tensor<4xi32>
%17 = tensor.expand_shape %16 [[0, 1]] : tensor<4xi32> into tensor<1x4xi32>
%18 = scf.for %arg6 = %c0 to %c128 step %c4 iter_args(%arg7 = %17) -> (tensor<1x4xi32>) {
%21 = tensor.extract_slice %15[0, 0, %arg6] [1, 4, 4] [1, 1, 1] : tensor<1x4x128xi32> to tensor<4x4xi32>
%22 = tensor.expand_shape %21 [[0, 1], [2]] : tensor<4x4xi32> into tensor<1x4x4xi32>
%23 = scf.for %arg8 = %c0 to %c4 step %c1 iter_args(%arg9 = %arg7) -> (tensor<1x4xi32>) {
%24 = tensor.extract_slice %22[0, %arg8, 0] [1, 1, 4] [1, 1, 1] : tensor<1x4x4xi32> to tensor<4xi32>
%25 = tensor.extract_slice %arg9[0, %arg8] [1, 1] [1, 1] : tensor<1x4xi32> to tensor<i32>
%26 = tensor.expand_shape %24 [[0, 1, 2, 3]] : tensor<4xi32> into tensor<1x1x1x4xi32>
%27 = vector.transfer_read %26[%c0, %c0, %c0, %c0], %c0_i32 {in_bounds = [true]} : tensor<1x1x1x4xi32>, vector<4xi32>
%28 = arith.addi %27, %9 : vector<4xi32>
%29 = vector.transfer_write %28, %8[%c0] {in_bounds = [true]} : vector<4xi32>, tensor<4xi32>
%30 = tensor.expand_shape %29 [[0, 1]] : tensor<4xi32> into tensor<1x4xi32>
%31 = vector.transfer_read %30[%c0, %c0], %c0_i32 {in_bounds = [true]} : tensor<1x4xi32>, vector<4xi32>
%32 = vector.transfer_read %25[], %c0_i32 : tensor<i32>, vector<i32>
%33 = vector.extractelement %32[] : vector<i32>
%34 = vector.multi_reduction <add>, %31, %33 [0] : vector<4xi32> to i32
%35 = vector.broadcast %34 : i32 to vector<i32>
%36 = vector.transfer_write %35, %25[] : vector<i32>, tensor<i32>
%37 = tensor.insert_slice %36 into %arg9[0, %arg8] [1, 1] [1, 1] : tensor<i32> into tensor<1x4xi32>
scf.yield %37 : tensor<1x4xi32>
}
scf.yield %23 : tensor<1x4xi32>
}
%19 = tensor.collapse_shape %18 [[0, 1]] : tensor<1x4xi32> into tensor<4xi32>
%20 = tensor.insert_slice %19 into %arg5[%arg2, %arg4] [1, 4] [1, 1] : tensor<4xi32> into tensor<32x32xi32>
scf.yield %20 : tensor<32x32xi32>
}
scf.yield %13 : tensor<32x32xi32>
}
flow.dispatch.tensor.store %12, %1, offsets = [%arg0, %arg1], sizes = [32, 32], strides = [1, 1] : tensor<32x32xi32> -> !flow.dispatch.tensor<writeonly:512x256xi32>
}
}
return
}
}
}
}
func.func private @_split_reduction_pass2() {
%c1048576 = arith.constant 1048576 : index
%c0 = arith.constant 0 : index
%c524288 = arith.constant 524288 : index
%c67108864 = arith.constant 67108864 : index
%c1_i32 = arith.constant 1 : i32
%c128_i32 = arith.constant 128 : i32
%c512 = arith.constant 512 : index
%c256 = arith.constant 256 : index
%c1 = arith.constant 1 : index
%0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c67108864}
%1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c67108864}) {
stream.cmd.fill %c1_i32, %arg0[%c0 for %c67108864] : i32 -> !stream.resource<transient>{%c67108864}
} => !stream.timepoint
%2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c67108864}
%3 = util.do_not_optimize(%2) : !stream.resource<transient>
%4 = stream.resource.size %3 : !stream.resource<transient>
%5 = stream.resource.alloc uninitialized : !stream.resource<external>{%c1048576}
%6 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%4}, %5 as %arg1: !stream.resource<external>{%c1048576}) {
stream.cmd.concurrent {
stream.cmd.dispatch @_split_reduction_pass2_dispatch_0::@_split_reduction_pass2_dispatch_0_generic_512x256x128[%c512, %c256, %c1] {
ro %arg0[%c0 for %4] : !stream.resource<transient>{%4},
wo %arg1[%c0 for %c1048576] : !stream.resource<external>{%c1048576}
} attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>]}
stream.cmd.fill %c128_i32, %arg1[%c524288 for %c524288] : i32 -> !stream.resource<external>{%c1048576}
}
} => !stream.timepoint
%7 = stream.timepoint.await %6 => %5 : !stream.resource<external>{%c1048576}
%8 = stream.resource.subview %7[%c0] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288}
%9 = stream.resource.subview %7[%c524288] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288}
%10 = stream.tensor.export %8 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
%11 = stream.tensor.export %9 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
check.expect_eq(%10, %11) : tensor<512x256xi32>
return
}
}
// -----// IR Dump After IREEComprehensiveBufferize (iree-codegen-iree-comprehensive-bufferize) ('builtin.module' operation) //----- //
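// Annotation (not part of the compiler output): comprehensive bufferization moves the dispatch onto memrefs — the binding subspans become memref<512x256x128xi32> and memref<512x256xi32> with memref.assume_alignment, the alloc_tensor becomes a 128-byte-aligned memref.alloca, tensor slices become memref.subview ops, and the tensor.insert_slice / flow.dispatch.tensor.store results are expressed as linalg.generic copies, alongside a number of interim bufferization.to_tensor ops.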
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#map0 = affine_map<()[s0] -> (s0 * 32)>
#map1 = affine_map<() -> ()>
#map2 = affine_map<(d0) -> (d0)>
#map3 = affine_map<(d0, d1) -> (d0, d1)>
#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
func.func @split_reduction_pass2() attributes {iree.abi.stub} {
call @_split_reduction_pass2() : () -> ()
return
}
hal.executable private @_split_reduction_pass2_dispatch_0 {
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
hal.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 ordinal(0) layout(#pipeline_layout) attributes {translation_info = #translation} {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index):
%c1 = arith.constant 1 : index
%c16 = arith.constant 16 : index
%c8 = arith.constant 8 : index
hal.return %c8, %c16, %c1 : index, index, index
}
builtin.module {
func.func @_split_reduction_pass2_dispatch_0_generic_512x256x128() {
%cst = arith.constant dense<0> : vector<1x1x4xi32>
%c0_i32 = arith.constant 0 : i32
%c128 = arith.constant 128 : index
%c4 = arith.constant 4 : index
%c1 = arith.constant 1 : index
%c32 = arith.constant 32 : index
%c512 = arith.constant 512 : index
%c256 = arith.constant 256 : index
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : memref<512x256x128xi32>
memref.assume_alignment %0, 64 : memref<512x256x128xi32>
%1 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:512x256x128xi32>
%2 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : memref<512x256xi32>
memref.assume_alignment %2, 64 : memref<512x256xi32>
%3 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:512x256xi32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%4 = affine.apply #map0()[%workgroup_id_y]
%5 = affine.apply #map0()[%workgroup_count_y]
%6 = affine.apply #map0()[%workgroup_id_x]
%7 = affine.apply #map0()[%workgroup_count_x]
%8 = memref.alloca() {alignment = 128 : i64} : memref<1x1x4xi32>
%9 = bufferization.to_tensor %8 : memref<1x1x4xi32>
vector.transfer_write %cst, %8[%c0, %c0, %c0] {in_bounds = [true, true, true]} : vector<1x1x4xi32>, memref<1x1x4xi32>
%10 = bufferization.to_tensor %8 : memref<1x1x4xi32>
%11 = memref.collapse_shape %8 [[0, 1, 2]] : memref<1x1x4xi32> into memref<4xi32>
%12 = bufferization.to_tensor %11 : memref<4xi32>
%13 = vector.transfer_read %11[%c0], %c0_i32 {in_bounds = [true]} : memref<4xi32>, vector<4xi32>
scf.for %arg0 = %4 to %c512 step %5 {
scf.for %arg1 = %6 to %c256 step %7 {
%14 = memref.subview %2[%arg0, %arg1] [32, 32] [1, 1] : memref<512x256xi32> to memref<32x32xi32, strided<[256, 1], offset: ?>>
%15 = bufferization.to_tensor %14 : memref<32x32xi32, strided<[256, 1], offset: ?>>
%16 = memref.subview %0[%arg0, %arg1, 0] [32, 32, 128] [1, 1, 1] : memref<512x256x128xi32> to memref<32x32x128xi32, strided<[32768, 128, 1], offset: ?>>
%17 = bufferization.to_tensor %16 : memref<32x32x128xi32, strided<[32768, 128, 1], offset: ?>>
%18 = scf.for %arg2 = %c0 to %c32 step %c1 iter_args(%arg3 = %14) -> (memref<32x32xi32, strided<[256, 1], offset: ?>>) {
%21 = bufferization.to_tensor %arg3 : memref<32x32xi32, strided<[256, 1], offset: ?>>
%22 = scf.for %arg4 = %c0 to %c32 step %c4 iter_args(%arg5 = %arg3) -> (memref<32x32xi32, strided<[256, 1], offset: ?>>) {
%24 = bufferization.to_tensor %arg5 : memref<32x32xi32, strided<[256, 1], offset: ?>>
%25 = memref.subview %16[%arg2, %arg4, 0] [1, 4, 128] [1, 1, 1] : memref<32x32x128xi32, strided<[32768, 128, 1], offset: ?>> to memref<4x128xi32, strided<[128, 1], offset: ?>>
%26 = bufferization.to_tensor %25 : memref<4x128xi32, strided<[128, 1], offset: ?>>
%27 = memref.expand_shape %25 [[0, 1], [2]] : memref<4x128xi32, strided<[128, 1], offset: ?>> into memref<1x4x128xi32, strided<[512, 128, 1], offset: ?>>
%28 = bufferization.to_tensor %27 : memref<1x4x128xi32, strided<[512, 128, 1], offset: ?>>
%29 = memref.subview %arg5[%arg2, %arg4] [1, 4] [1, 1] : memref<32x32xi32, strided<[256, 1], offset: ?>> to memref<4xi32, strided<[1], offset: ?>>
%30 = bufferization.to_tensor %29 : memref<4xi32, strided<[1], offset: ?>>
%31 = memref.expand_shape %29 [[0, 1]] : memref<4xi32, strided<[1], offset: ?>> into memref<1x4xi32, strided<[4, 1], offset: ?>>
%32 = bufferization.to_tensor %31 : memref<1x4xi32, strided<[4, 1], offset: ?>>
%33 = scf.for %arg6 = %c0 to %c128 step %c4 iter_args(%arg7 = %31) -> (memref<1x4xi32, strided<[4, 1], offset: ?>>) {
%39 = bufferization.to_tensor %arg7 : memref<1x4xi32, strided<[4, 1], offset: ?>>
%40 = memref.subview %27[0, 0, %arg6] [1, 4, 4] [1, 1, 1] : memref<1x4x128xi32, strided<[512, 128, 1], offset: ?>> to memref<4x4xi32, strided<[128, 1], offset: ?>>
%41 = bufferization.to_tensor %40 : memref<4x4xi32, strided<[128, 1], offset: ?>>
%42 = memref.expand_shape %40 [[0, 1], [2]] : memref<4x4xi32, strided<[128, 1], offset: ?>> into memref<1x4x4xi32, strided<[512, 128, 1], offset: ?>>
%43 = bufferization.to_tensor %42 : memref<1x4x4xi32, strided<[512, 128, 1], offset: ?>>
%44 = scf.for %arg8 = %c0 to %c4 step %c1 iter_args(%arg9 = %arg7) -> (memref<1x4xi32, strided<[4, 1], offset: ?>>) {
%46 = bufferization.to_tensor %arg9 : memref<1x4xi32, strided<[4, 1], offset: ?>>
%47 = memref.subview %42[0, %arg8, 0] [1, 1, 4] [1, 1, 1] : memref<1x4x4xi32, strided<[512, 128, 1], offset: ?>> to memref<4xi32, strided<[1], offset: ?>>
%48 = bufferization.to_tensor %47 : memref<4xi32, strided<[1], offset: ?>>
%49 = memref.subview %arg9[0, %arg8] [1, 1] [1, 1] : memref<1x4xi32, strided<[4, 1], offset: ?>> to memref<i32, strided<[], offset: ?>>
%50 = bufferization.to_tensor %49 : memref<i32, strided<[], offset: ?>>
%51 = memref.expand_shape %47 [[0, 1, 2, 3]] : memref<4xi32, strided<[1], offset: ?>> into memref<1x1x1x4xi32, strided<[4, 4, 4, 1], offset: ?>>
%52 = bufferization.to_tensor %51 : memref<1x1x1x4xi32, strided<[4, 4, 4, 1], offset: ?>>
%53 = vector.transfer_read %51[%c0, %c0, %c0, %c0], %c0_i32 {in_bounds = [true]} : memref<1x1x1x4xi32, strided<[4, 4, 4, 1], offset: ?>>, vector<4xi32>
%54 = arith.addi %53, %13 : vector<4xi32>
vector.transfer_write %54, %11[%c0] {in_bounds = [true]} : vector<4xi32>, memref<4xi32>
%55 = bufferization.to_tensor %11 : memref<4xi32>
%56 = memref.expand_shape %11 [[0, 1]] : memref<4xi32> into memref<1x4xi32>
%57 = bufferization.to_tensor %56 : memref<1x4xi32>
%58 = vector.transfer_read %56[%c0, %c0], %c0_i32 {in_bounds = [true]} : memref<1x4xi32>, vector<4xi32>
%59 = vector.transfer_read %49[], %c0_i32 : memref<i32, strided<[], offset: ?>>, vector<i32>
%60 = vector.extractelement %59[] : vector<i32>
%61 = vector.multi_reduction <add>, %58, %60 [0] : vector<4xi32> to i32
%62 = vector.broadcast %61 : i32 to vector<i32>
vector.transfer_write %62, %49[] : vector<i32>, memref<i32, strided<[], offset: ?>>
%63 = bufferization.to_tensor %49 : memref<i32, strided<[], offset: ?>>
%64 = memref.subview %arg9[0, %arg8] [1, 1] [1, 1] : memref<1x4xi32, strided<[4, 1], offset: ?>> to memref<i32, strided<[], offset: ?>>
linalg.generic {indexing_maps = [#map1, #map1], iterator_types = []} ins(%49 : memref<i32, strided<[], offset: ?>>) outs(%64 : memref<i32, strided<[], offset: ?>>) {
^bb0(%arg10: i32, %arg11: i32):
linalg.yield %arg10 : i32
}
%65 = bufferization.to_tensor %arg9 : memref<1x4xi32, strided<[4, 1], offset: ?>>
scf.yield %arg9 : memref<1x4xi32, strided<[4, 1], offset: ?>>
}
%45 = bufferization.to_tensor %44 : memref<1x4xi32, strided<[4, 1], offset: ?>>
scf.yield %44 : memref<1x4xi32, strided<[4, 1], offset: ?>>
}
%34 = bufferization.to_tensor %33 : memref<1x4xi32, strided<[4, 1], offset: ?>>
%35 = memref.collapse_shape %33 [[0, 1]] : memref<1x4xi32, strided<[4, 1], offset: ?>> into memref<4xi32, strided<[1], offset: ?>>
%36 = bufferization.to_tensor %35 : memref<4xi32, strided<[1], offset: ?>>
%37 = memref.subview %arg5[%arg2, %arg4] [1, 4] [1, 1] : memref<32x32xi32, strided<[256, 1], offset: ?>> to memref<4xi32, strided<[1], offset: ?>>
linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel"]} ins(%35 : memref<4xi32, strided<[1], offset: ?>>) outs(%37 : memref<4xi32, strided<[1], offset: ?>>) {
^bb0(%arg6: i32, %arg7: i32):
linalg.yield %arg6 : i32
}
%38 = bufferization.to_tensor %arg5 : memref<32x32xi32, strided<[256, 1], offset: ?>>
scf.yield %arg5 : memref<32x32xi32, strided<[256, 1], offset: ?>>
}
%23 = bufferization.to_tensor %22 : memref<32x32xi32, strided<[256, 1], offset: ?>>
scf.yield %22 : memref<32x32xi32, strided<[256, 1], offset: ?>>
}
%19 = bufferization.to_tensor %18 : memref<32x32xi32, strided<[256, 1], offset: ?>>
%20 = memref.subview %2[%arg0, %arg1] [32, 32] [1, 1] : memref<512x256xi32> to memref<32x32xi32, strided<[256, 1], offset: ?>>
linalg.generic {indexing_maps = [#map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%18 : memref<32x32xi32, strided<[256, 1], offset: ?>>) outs(%20 : memref<32x32xi32, strided<[256, 1], offset: ?>>) {
^bb0(%arg2: i32, %arg3: i32):
linalg.yield %arg2 : i32
}
}
}
return
}
}
}
}
func.func private @_split_reduction_pass2() {
%c1048576 = arith.constant 1048576 : index
%c0 = arith.constant 0 : index
%c524288 = arith.constant 524288 : index
%c67108864 = arith.constant 67108864 : index
%c1_i32 = arith.constant 1 : i32
%c128_i32 = arith.constant 128 : i32
%c512 = arith.constant 512 : index
%c256 = arith.constant 256 : index
%c1 = arith.constant 1 : index
%0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c67108864}
%1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c67108864}) {
stream.cmd.fill %c1_i32, %arg0[%c0 for %c67108864] : i32 -> !stream.resource<transient>{%c67108864}
} => !stream.timepoint
%2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c67108864}
%3 = util.do_not_optimize(%2) : !stream.resource<transient>
%4 = stream.resource.size %3 : !stream.resource<transient>
%5 = stream.resource.alloc uninitialized : !stream.resource<external>{%c1048576}
%6 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%4}, %5 as %arg1: !stream.resource<external>{%c1048576}) {
stream.cmd.concurrent {
stream.cmd.dispatch @_split_reduction_pass2_dispatch_0::@_split_reduction_pass2_dispatch_0_generic_512x256x128[%c512, %c256, %c1] {
ro %arg0[%c0 for %4] : !stream.resource<transient>{%4},
wo %arg1[%c0 for %c1048576] : !stream.resource<external>{%c1048576}
} attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>]}
stream.cmd.fill %c128_i32, %arg1[%c524288 for %c524288] : i32 -> !stream.resource<external>{%c1048576}
}
} => !stream.timepoint
%7 = stream.timepoint.await %6 => %5 : !stream.resource<external>{%c1048576}
%8 = stream.resource.subview %7[%c0] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288}
%9 = stream.resource.subview %7[%c524288] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288}
%10 = stream.tensor.export %8 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
%11 = stream.tensor.export %9 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
check.expect_eq(%10, %11) : tensor<512x256xi32>
return
}
}
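// Note: the tail of the dump above shows the scalar accumulation pattern this test
// exercises: four i32 lanes are read from the accumulator buffer and add-reduced into
// the running scalar held in a 0-d memref. A minimal standalone sketch of that reduction
// step (illustrative only, not compiler output; the function name is hypothetical, the
// op syntax mirrors the dump):
func.func @reduce_step_sketch(%lanes: vector<4xi32>, %acc: i32) -> i32 {
  // Add-reduce the four lanes into the running accumulator, as in the
  // vector.multi_reduction ops of the dump above.
  %r = vector.multi_reduction <add>, %lanes, %acc [0] : vector<4xi32> to i32
  return %r : i32
}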
// -----// IR Dump After ResolveShapedTypeResultDims (resolve-shaped-type-result-dims) ('builtin.module' operation) //----- //
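// Note: resolve-shaped-type-result-dims rewrites dim queries on op results in terms of
// the ops' operands; every shape in this module is static, so the pass has little to do
// here. An illustrative sketch of the kind of IR the pass targets (hypothetical function
// name, not taken from this module):
func.func @dim_of_result_sketch() -> index {
  %c0 = arith.constant 0 : index
  %0 = linalg.init_tensor [512, 256] : tensor<512x256xi32>
  // A dim of an op result with a fully static shape resolves to a constant (512 here).
  %d = tensor.dim %0, %c0 : tensor<512x256xi32>
  return %d : index
}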
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#map0 = affine_map<()[s0] -> (s0 * 32)>
#map1 = affine_map<() -> ()>
#map2 = affine_map<(d0) -> (d0)>
#map3 = affine_map<(d0, d1) -> (d0, d1)>
#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
func.func @split_reduction_pass2() attributes {iree.abi.stub} {
call @_split_reduction_pass2() : () -> ()
return
}
hal.executable private @_split_reduction_pass2_dispatch_0 {
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
hal.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 ordinal(0) layout(#pipeline_layout) attributes {translation_info = #translation} {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index):
%c1 = arith.constant 1 : index
%c16 = arith.constant 16 : index
%c8 = arith.constant 8 : index
hal.return %c8, %c16, %c1 : index, index, index
}
builtin.module {
func.func @_split_reduction_pass2_dispatch_0_generic_512x256x128() {
%cst = arith.constant dense<0> : vector<1x1x4xi32>
%c0_i32 = arith.constant 0 : i32
%c128 = arith.constant 128 : index
%c4 = arith.constant 4 : index
%c1 = arith.constant 1 : index
%c32 = arith.constant 32 : index
%c512 = arith.constant 512 : index
%c256 = arith.constant 256 : index
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : memref<512x256x128xi32>
memref.assume_alignment %0, 64 : memref<512x256x128xi32>
%1 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:512x256x128xi32>
%2 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : memref<512x256xi32>
memref.assume_alignment %2, 64 : memref<512x256xi32>
%3 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:512x256xi32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%4 = affine.apply #map0()[%workgroup_id_y]
%5 = affine.apply #map0()[%workgroup_count_y]
%6 = affine.apply #map0()[%workgroup_id_x]
%7 = affine.apply #map0()[%workgroup_count_x]
%8 = memref.alloca() {alignment = 128 : i64} : memref<1x1x4xi32>
vector.transfer_write %cst, %8[%c0, %c0, %c0] {in_bounds = [true, true, true]} : vector<1x1x4xi32>, memref<1x1x4xi32>
%9 = memref.collapse_shape %8 [[0, 1, 2]] : memref<1x1x4xi32> into memref<4xi32>
%10 = vector.transfer_read %9[%c0], %c0_i32 {in_bounds = [true]} : memref<4xi32>, vector<4xi32>
scf.for %arg0 = %4 to %c512 step %5 {
scf.for %arg1 = %6 to %c256 step %7 {
%11 = memref.subview %2[%arg0, %arg1] [32, 32] [1, 1] : memref<512x256xi32> to memref<32x32xi32, strided<[256, 1], offset: ?>>
%12 = memref.subview %0[%arg0, %arg1, 0] [32, 32, 128] [1, 1, 1] : memref<512x256x128xi32> to memref<32x32x128xi32, strided<[32768, 128, 1], offset: ?>>
%13 = scf.for %arg2 = %c0 to %c32 step %c1 iter_args(%arg3 = %11) -> (memref<32x32xi32, strided<[256, 1], offset: ?>>) {
%15 = scf.for %arg4 = %c0 to %c32 step %c4 iter_args(%arg5 = %arg3) -> (memref<32x32xi32, strided<[256, 1], offset: ?>>) {
%16 = memref.subview %12[%arg2, %arg4, 0] [1, 4, 128] [1, 1, 1] : memref<32x32x128xi32, strided<[32768, 128, 1], offset: ?>> to memref<4x128xi32, strided<[128, 1], offset: ?>>
%17 = memref.expand_shape %16 [[0, 1], [2]] : memref<4x128xi32, strided<[128, 1], offset: ?>> into memref<1x4x128xi32, strided<[512, 128, 1], offset: ?>>
%18 = memref.subview %arg5[%arg2, %arg4] [1, 4] [1, 1] : memref<32x32xi32, strided<[256, 1], offset: ?>> to memref<4xi32, strided<[1], offset: ?>>
%19 = memref.expand_shape %18 [[0, 1]] : memref<4xi32, strided<[1], offset: ?>> into memref<1x4xi32, strided<[4, 1], offset: ?>>
%20 = scf.for %arg6 = %c0 to %c128 step %c4 iter_args(%arg7 = %19) -> (memref<1x4xi32, strided<[4, 1], offset: ?>>) {
%23 = memref.subview %17[0, 0, %arg6] [1, 4, 4] [1, 1, 1] : memref<1x4x128xi32, strided<[512, 128, 1], offset: ?>> to memref<4x4xi32, strided<[128, 1], offset: ?>>
%24 = memref.expand_shape %23 [[0, 1], [2]] : memref<4x4xi32, strided<[128, 1], offset: ?>> into memref<1x4x4xi32, strided<[512, 128, 1], offset: ?>>
%25 = scf.for %arg8 = %c0 to %c4 step %c1 iter_args(%arg9 = %arg7) -> (memref<1x4xi32, strided<[4, 1], offset: ?>>) {
%26 = memref.subview %24[0, %arg8, 0] [1, 1, 4] [1, 1, 1] : memref<1x4x4xi32, strided<[512, 128, 1], offset: ?>> to memref<4xi32, strided<[1], offset: ?>>
%27 = memref.subview %arg9[0, %arg8] [1, 1] [1, 1] : memref<1x4xi32, strided<[4, 1], offset: ?>> to memref<i32, strided<[], offset: ?>>
%28 = memref.expand_shape %26 [[0, 1, 2, 3]] : memref<4xi32, strided<[1], offset: ?>> into memref<1x1x1x4xi32, strided<[4, 4, 4, 1], offset: ?>>
%29 = vector.transfer_read %28[%c0, %c0, %c0, %c0], %c0_i32 {in_bounds = [true]} : memref<1x1x1x4xi32, strided<[4, 4, 4, 1], offset: ?>>, vector<4xi32>
%30 = arith.addi %29, %10 : vector<4xi32>
vector.transfer_write %30, %9[%c0] {in_bounds = [true]} : vector<4xi32>, memref<4xi32>
%31 = memref.expand_shape %9 [[0, 1]] : memref<4xi32> into memref<1x4xi32>
%32 = vector.transfer_read %31[%c0, %c0], %c0_i32 {in_bounds = [true]} : memref<1x4xi32>, vector<4xi32>
%33 = vector.transfer_read %27[], %c0_i32 : memref<i32, strided<[], offset: ?>>, vector<i32>
%34 = vector.extractelement %33[] : vector<i32>
%35 = vector.multi_reduction <add>, %32, %34 [0] : vector<4xi32> to i32
%36 = vector.broadcast %35 : i32 to vector<i32>
vector.transfer_write %36, %27[] : vector<i32>, memref<i32, strided<[], offset: ?>>
%37 = memref.subview %arg9[0, %arg8] [1, 1] [1, 1] : memref<1x4xi32, strided<[4, 1], offset: ?>> to memref<i32, strided<[], offset: ?>>
linalg.generic {indexing_maps = [#map1, #map1], iterator_types = []} ins(%27 : memref<i32, strided<[], offset: ?>>) outs(%37 : memref<i32, strided<[], offset: ?>>) {
^bb0(%arg10: i32, %arg11: i32):
linalg.yield %arg10 : i32
}
scf.yield %arg9 : memref<1x4xi32, strided<[4, 1], offset: ?>>
}
scf.yield %25 : memref<1x4xi32, strided<[4, 1], offset: ?>>
}
%21 = memref.collapse_shape %20 [[0, 1]] : memref<1x4xi32, strided<[4, 1], offset: ?>> into memref<4xi32, strided<[1], offset: ?>>
%22 = memref.subview %arg5[%arg2, %arg4] [1, 4] [1, 1] : memref<32x32xi32, strided<[256, 1], offset: ?>> to memref<4xi32, strided<[1], offset: ?>>
linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel"]} ins(%21 : memref<4xi32, strided<[1], offset: ?>>) outs(%22 : memref<4xi32, strided<[1], offset: ?>>) {
^bb0(%arg6: i32, %arg7: i32):
linalg.yield %arg6 : i32
}
scf.yield %arg5 : memref<32x32xi32, strided<[256, 1], offset: ?>>
}
scf.yield %15 : memref<32x32xi32, strided<[256, 1], offset: ?>>
}
%14 = memref.subview %2[%arg0, %arg1] [32, 32] [1, 1] : memref<512x256xi32> to memref<32x32xi32, strided<[256, 1], offset: ?>>
linalg.generic {indexing_maps = [#map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%13 : memref<32x32xi32, strided<[256, 1], offset: ?>>) outs(%14 : memref<32x32xi32, strided<[256, 1], offset: ?>>) {
^bb0(%arg2: i32, %arg3: i32):
linalg.yield %arg2 : i32
}
}
}
return
}
}
}
}
func.func private @_split_reduction_pass2() {
%c1048576 = arith.constant 1048576 : index
%c0 = arith.constant 0 : index
%c524288 = arith.constant 524288 : index
%c67108864 = arith.constant 67108864 : index
%c1_i32 = arith.constant 1 : i32
%c128_i32 = arith.constant 128 : i32
%c512 = arith.constant 512 : index
%c256 = arith.constant 256 : index
%c1 = arith.constant 1 : index
%0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c67108864}
%1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c67108864}) {
stream.cmd.fill %c1_i32, %arg0[%c0 for %c67108864] : i32 -> !stream.resource<transient>{%c67108864}
} => !stream.timepoint
%2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c67108864}
%3 = util.do_not_optimize(%2) : !stream.resource<transient>
%4 = stream.resource.size %3 : !stream.resource<transient>
%5 = stream.resource.alloc uninitialized : !stream.resource<external>{%c1048576}
%6 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%4}, %5 as %arg1: !stream.resource<external>{%c1048576}) {
stream.cmd.concurrent {
stream.cmd.dispatch @_split_reduction_pass2_dispatch_0::@_split_reduction_pass2_dispatch_0_generic_512x256x128[%c512, %c256, %c1] {
ro %arg0[%c0 for %4] : !stream.resource<transient>{%4},
wo %arg1[%c0 for %c1048576] : !stream.resource<external>{%c1048576}
} attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>]}
stream.cmd.fill %c128_i32, %arg1[%c524288 for %c524288] : i32 -> !stream.resource<external>{%c1048576}
}
} => !stream.timepoint
%7 = stream.timepoint.await %6 => %5 : !stream.resource<external>{%c1048576}
%8 = stream.resource.subview %7[%c0] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288}
%9 = stream.resource.subview %7[%c524288] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288}
%10 = stream.tensor.export %8 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
%11 = stream.tensor.export %9 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
check.expect_eq(%10, %11) : tensor<512x256xi32>
return
}
}
// -----// IR Dump After Canonicalizer (canonicalize) ('func.func' operation: @_split_reduction_pass2_dispatch_0_generic_512x256x128) //----- //
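// Note: relative to the previous dump, canonicalization drops the scf.for results and
// iter_args that only threaded the output subview through unchanged, leaving plain
// loops. A minimal sketch of that pattern (hypothetical function name; memref-carrying
// loops as in the dump):
func.func @dead_iter_args_sketch(%out: memref<32x32xi32>) {
  %c0 = arith.constant 0 : index
  %c1 = arith.constant 1 : index
  %c32 = arith.constant 32 : index
  // %out is yielded unchanged every iteration, so the iter_args, the yield, and the
  // unused loop result can all be canonicalized away.
  %r = scf.for %i = %c0 to %c32 step %c1 iter_args(%arg = %out) -> (memref<32x32xi32>) {
    scf.yield %arg : memref<32x32xi32>
  }
  return
}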
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#map0 = affine_map<()[s0] -> (s0 * 32)>
#map1 = affine_map<() -> ()>
#map2 = affine_map<(d0) -> (d0)>
#map3 = affine_map<(d0, d1) -> (d0, d1)>
#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
func.func @split_reduction_pass2() attributes {iree.abi.stub} {
call @_split_reduction_pass2() : () -> ()
return
}
hal.executable private @_split_reduction_pass2_dispatch_0 {
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
hal.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 ordinal(0) layout(#pipeline_layout) attributes {translation_info = #translation} {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index):
%c1 = arith.constant 1 : index
%c16 = arith.constant 16 : index
%c8 = arith.constant 8 : index
hal.return %c8, %c16, %c1 : index, index, index
}
builtin.module {
func.func @_split_reduction_pass2_dispatch_0_generic_512x256x128() {
%cst = arith.constant dense<0> : vector<1x1x4xi32>
%c0_i32 = arith.constant 0 : i32
%c128 = arith.constant 128 : index
%c4 = arith.constant 4 : index
%c1 = arith.constant 1 : index
%c32 = arith.constant 32 : index
%c512 = arith.constant 512 : index
%c256 = arith.constant 256 : index
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : memref<512x256x128xi32>
memref.assume_alignment %0, 64 : memref<512x256x128xi32>
%1 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:512x256x128xi32>
%2 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : memref<512x256xi32>
memref.assume_alignment %2, 64 : memref<512x256xi32>
%3 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:512x256xi32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%4 = affine.apply #map0()[%workgroup_id_y]
%5 = affine.apply #map0()[%workgroup_count_y]
%6 = affine.apply #map0()[%workgroup_id_x]
%7 = affine.apply #map0()[%workgroup_count_x]
%8 = memref.alloca() {alignment = 128 : i64} : memref<1x1x4xi32>
vector.transfer_write %cst, %8[%c0, %c0, %c0] {in_bounds = [true, true, true]} : vector<1x1x4xi32>, memref<1x1x4xi32>
%9 = memref.collapse_shape %8 [[0, 1, 2]] : memref<1x1x4xi32> into memref<4xi32>
%10 = vector.transfer_read %9[%c0], %c0_i32 {in_bounds = [true]} : memref<4xi32>, vector<4xi32>
scf.for %arg0 = %4 to %c512 step %5 {
scf.for %arg1 = %6 to %c256 step %7 {
%11 = memref.subview %2[%arg0, %arg1] [32, 32] [1, 1] : memref<512x256xi32> to memref<32x32xi32, strided<[256, 1], offset: ?>>
%12 = memref.subview %0[%arg0, %arg1, 0] [32, 32, 128] [1, 1, 1] : memref<512x256x128xi32> to memref<32x32x128xi32, strided<[32768, 128, 1], offset: ?>>
scf.for %arg2 = %c0 to %c32 step %c1 {
scf.for %arg3 = %c0 to %c32 step %c4 {
%14 = memref.subview %12[%arg2, %arg3, 0] [1, 4, 128] [1, 1, 1] : memref<32x32x128xi32, strided<[32768, 128, 1], offset: ?>> to memref<4x128xi32, strided<[128, 1], offset: ?>>
%15 = memref.expand_shape %14 [[0, 1], [2]] : memref<4x128xi32, strided<[128, 1], offset: ?>> into memref<1x4x128xi32, strided<[512, 128, 1], offset: ?>>
%16 = memref.subview %11[%arg2, %arg3] [1, 4] [1, 1] : memref<32x32xi32, strided<[256, 1], offset: ?>> to memref<4xi32, strided<[1], offset: ?>>
%17 = memref.expand_shape %16 [[0, 1]] : memref<4xi32, strided<[1], offset: ?>> into memref<1x4xi32, strided<[4, 1], offset: ?>>
scf.for %arg4 = %c0 to %c128 step %c4 {
%19 = memref.subview %15[0, 0, %arg4] [1, 4, 4] [1, 1, 1] : memref<1x4x128xi32, strided<[512, 128, 1], offset: ?>> to memref<4x4xi32, strided<[128, 1], offset: ?>>
%20 = memref.expand_shape %19 [[0, 1], [2]] : memref<4x4xi32, strided<[128, 1], offset: ?>> into memref<1x4x4xi32, strided<[512, 128, 1], offset: ?>>
scf.for %arg5 = %c0 to %c4 step %c1 {
%21 = memref.subview %20[0, %arg5, 0] [1, 1, 4] [1, 1, 1] : memref<1x4x4xi32, strided<[512, 128, 1], offset: ?>> to memref<4xi32, strided<[1], offset: ?>>
%22 = memref.subview %17[0, %arg5] [1, 1] [1, 1] : memref<1x4xi32, strided<[4, 1], offset: ?>> to memref<i32, strided<[], offset: ?>>
%23 = memref.expand_shape %21 [[0, 1, 2, 3]] : memref<4xi32, strided<[1], offset: ?>> into memref<1x1x1x4xi32, strided<[4, 4, 4, 1], offset: ?>>
%24 = vector.transfer_read %23[%c0, %c0, %c0, %c0], %c0_i32 {in_bounds = [true]} : memref<1x1x1x4xi32, strided<[4, 4, 4, 1], offset: ?>>, vector<4xi32>
%25 = arith.addi %24, %10 : vector<4xi32>
vector.transfer_write %25, %9[%c0] {in_bounds = [true]} : vector<4xi32>, memref<4xi32>
%26 = memref.collapse_shape %8 [[0], [1, 2]] : memref<1x1x4xi32> into memref<1x4xi32>
%27 = vector.transfer_read %26[%c0, %c0], %c0_i32 {in_bounds = [true]} : memref<1x4xi32>, vector<4xi32>
%28 = vector.transfer_read %22[], %c0_i32 : memref<i32, strided<[], offset: ?>>, vector<i32>
%29 = vector.extractelement %28[] : vector<i32>
%30 = vector.multi_reduction <add>, %27, %29 [0] : vector<4xi32> to i32
%31 = vector.broadcast %30 : i32 to vector<i32>
vector.transfer_write %31, %22[] : vector<i32>, memref<i32, strided<[], offset: ?>>
%32 = memref.subview %17[0, %arg5] [1, 1] [1, 1] : memref<1x4xi32, strided<[4, 1], offset: ?>> to memref<i32, strided<[], offset: ?>>
linalg.generic {indexing_maps = [#map1, #map1], iterator_types = []} ins(%22 : memref<i32, strided<[], offset: ?>>) outs(%32 : memref<i32, strided<[], offset: ?>>) {
^bb0(%arg6: i32, %arg7: i32):
linalg.yield %arg6 : i32
}
}
}
%18 = memref.subview %11[%arg2, %arg3] [1, 4] [1, 1] : memref<32x32xi32, strided<[256, 1], offset: ?>> to memref<4xi32, strided<[1], offset: ?>>
linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel"]} ins(%16 : memref<4xi32, strided<[1], offset: ?>>) outs(%18 : memref<4xi32, strided<[1], offset: ?>>) {
^bb0(%arg4: i32, %arg5: i32):
linalg.yield %arg4 : i32
}
}
}
%13 = memref.subview %2[%arg0, %arg1] [32, 32] [1, 1] : memref<512x256xi32> to memref<32x32xi32, strided<[256, 1], offset: ?>>
linalg.generic {indexing_maps = [#map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%11 : memref<32x32xi32, strided<[256, 1], offset: ?>>) outs(%13 : memref<32x32xi32, strided<[256, 1], offset: ?>>) {
^bb0(%arg2: i32, %arg3: i32):
linalg.yield %arg2 : i32
}
}
}
return
}
}
}
}
func.func private @_split_reduction_pass2() {
%c1048576 = arith.constant 1048576 : index
%c0 = arith.constant 0 : index
%c524288 = arith.constant 524288 : index
%c67108864 = arith.constant 67108864 : index
%c1_i32 = arith.constant 1 : i32
%c128_i32 = arith.constant 128 : i32
%c512 = arith.constant 512 : index
%c256 = arith.constant 256 : index
%c1 = arith.constant 1 : index
%0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c67108864}
%1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c67108864}) {
stream.cmd.fill %c1_i32, %arg0[%c0 for %c67108864] : i32 -> !stream.resource<transient>{%c67108864}
} => !stream.timepoint
%2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c67108864}
%3 = util.do_not_optimize(%2) : !stream.resource<transient>
%4 = stream.resource.size %3 : !stream.resource<transient>
%5 = stream.resource.alloc uninitialized : !stream.resource<external>{%c1048576}
%6 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%4}, %5 as %arg1: !stream.resource<external>{%c1048576}) {
stream.cmd.concurrent {
stream.cmd.dispatch @_split_reduction_pass2_dispatch_0::@_split_reduction_pass2_dispatch_0_generic_512x256x128[%c512, %c256, %c1] {
ro %arg0[%c0 for %4] : !stream.resource<transient>{%4},
wo %arg1[%c0 for %c1048576] : !stream.resource<external>{%c1048576}
} attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>]}
stream.cmd.fill %c128_i32, %arg1[%c524288 for %c524288] : i32 -> !stream.resource<external>{%c1048576}
}
} => !stream.timepoint
%7 = stream.timepoint.await %6 => %5 : !stream.resource<external>{%c1048576}
%8 = stream.resource.subview %7[%c0] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288}
%9 = stream.resource.subview %7[%c524288] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288}
%10 = stream.tensor.export %8 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
%11 = stream.tensor.export %9 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
check.expect_eq(%10, %11) : tensor<512x256xi32>
return
}
}
// -----// IR Dump After CSE (cse) ('func.func' operation: @_split_reduction_pass2_dispatch_0_generic_512x256x128) //----- //
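// Note: CSE merges the textually identical memref.subview ops computed for the source
// and destination of each copy, so in the dump below the trailing linalg.generic copies
// end up with ins == outs (self-copies, cleaned up by the next canonicalization). The
// transformation itself is generic; a minimal sketch with plain arith ops (hypothetical
// function name):
func.func @cse_sketch(%a: i32, %b: i32) -> i32 {
  %0 = arith.addi %a, %b : i32
  // Identical to %0, so CSE replaces every use of %1 with %0 and erases it.
  %1 = arith.addi %a, %b : i32
  %2 = arith.muli %0, %1 : i32
  return %2 : i32
}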
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#map0 = affine_map<()[s0] -> (s0 * 32)>
#map1 = affine_map<() -> ()>
#map2 = affine_map<(d0) -> (d0)>
#map3 = affine_map<(d0, d1) -> (d0, d1)>
#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
func.func @split_reduction_pass2() attributes {iree.abi.stub} {
call @_split_reduction_pass2() : () -> ()
return
}
hal.executable private @_split_reduction_pass2_dispatch_0 {
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
hal.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 ordinal(0) layout(#pipeline_layout) attributes {translation_info = #translation} {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index):
%c1 = arith.constant 1 : index
%c16 = arith.constant 16 : index
%c8 = arith.constant 8 : index
hal.return %c8, %c16, %c1 : index, index, index
}
builtin.module {
func.func @_split_reduction_pass2_dispatch_0_generic_512x256x128() {
%cst = arith.constant dense<0> : vector<1x1x4xi32>
%c0_i32 = arith.constant 0 : i32
%c128 = arith.constant 128 : index
%c4 = arith.constant 4 : index
%c1 = arith.constant 1 : index
%c32 = arith.constant 32 : index
%c512 = arith.constant 512 : index
%c256 = arith.constant 256 : index
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : memref<512x256x128xi32>
memref.assume_alignment %0, 64 : memref<512x256x128xi32>
%1 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:512x256x128xi32>
%2 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : memref<512x256xi32>
memref.assume_alignment %2, 64 : memref<512x256xi32>
%3 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:512x256xi32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%4 = affine.apply #map0()[%workgroup_id_y]
%5 = affine.apply #map0()[%workgroup_count_y]
%6 = affine.apply #map0()[%workgroup_id_x]
%7 = affine.apply #map0()[%workgroup_count_x]
%8 = memref.alloca() {alignment = 128 : i64} : memref<1x1x4xi32>
vector.transfer_write %cst, %8[%c0, %c0, %c0] {in_bounds = [true, true, true]} : vector<1x1x4xi32>, memref<1x1x4xi32>
%9 = memref.collapse_shape %8 [[0, 1, 2]] : memref<1x1x4xi32> into memref<4xi32>
%10 = vector.transfer_read %9[%c0], %c0_i32 {in_bounds = [true]} : memref<4xi32>, vector<4xi32>
scf.for %arg0 = %4 to %c512 step %5 {
scf.for %arg1 = %6 to %c256 step %7 {
%11 = memref.subview %2[%arg0, %arg1] [32, 32] [1, 1] : memref<512x256xi32> to memref<32x32xi32, strided<[256, 1], offset: ?>>
%12 = memref.subview %0[%arg0, %arg1, 0] [32, 32, 128] [1, 1, 1] : memref<512x256x128xi32> to memref<32x32x128xi32, strided<[32768, 128, 1], offset: ?>>
scf.for %arg2 = %c0 to %c32 step %c1 {
scf.for %arg3 = %c0 to %c32 step %c4 {
%13 = memref.subview %12[%arg2, %arg3, 0] [1, 4, 128] [1, 1, 1] : memref<32x32x128xi32, strided<[32768, 128, 1], offset: ?>> to memref<4x128xi32, strided<[128, 1], offset: ?>>
%14 = memref.expand_shape %13 [[0, 1], [2]] : memref<4x128xi32, strided<[128, 1], offset: ?>> into memref<1x4x128xi32, strided<[512, 128, 1], offset: ?>>
%15 = memref.subview %11[%arg2, %arg3] [1, 4] [1, 1] : memref<32x32xi32, strided<[256, 1], offset: ?>> to memref<4xi32, strided<[1], offset: ?>>
%16 = memref.expand_shape %15 [[0, 1]] : memref<4xi32, strided<[1], offset: ?>> into memref<1x4xi32, strided<[4, 1], offset: ?>>
scf.for %arg4 = %c0 to %c128 step %c4 {
%17 = memref.subview %14[0, 0, %arg4] [1, 4, 4] [1, 1, 1] : memref<1x4x128xi32, strided<[512, 128, 1], offset: ?>> to memref<4x4xi32, strided<[128, 1], offset: ?>>
%18 = memref.expand_shape %17 [[0, 1], [2]] : memref<4x4xi32, strided<[128, 1], offset: ?>> into memref<1x4x4xi32, strided<[512, 128, 1], offset: ?>>
scf.for %arg5 = %c0 to %c4 step %c1 {
%19 = memref.subview %18[0, %arg5, 0] [1, 1, 4] [1, 1, 1] : memref<1x4x4xi32, strided<[512, 128, 1], offset: ?>> to memref<4xi32, strided<[1], offset: ?>>
%20 = memref.subview %16[0, %arg5] [1, 1] [1, 1] : memref<1x4xi32, strided<[4, 1], offset: ?>> to memref<i32, strided<[], offset: ?>>
%21 = memref.expand_shape %19 [[0, 1, 2, 3]] : memref<4xi32, strided<[1], offset: ?>> into memref<1x1x1x4xi32, strided<[4, 4, 4, 1], offset: ?>>
%22 = vector.transfer_read %21[%c0, %c0, %c0, %c0], %c0_i32 {in_bounds = [true]} : memref<1x1x1x4xi32, strided<[4, 4, 4, 1], offset: ?>>, vector<4xi32>
%23 = arith.addi %22, %10 : vector<4xi32>
vector.transfer_write %23, %9[%c0] {in_bounds = [true]} : vector<4xi32>, memref<4xi32>
%24 = memref.collapse_shape %8 [[0], [1, 2]] : memref<1x1x4xi32> into memref<1x4xi32>
%25 = vector.transfer_read %24[%c0, %c0], %c0_i32 {in_bounds = [true]} : memref<1x4xi32>, vector<4xi32>
%26 = vector.transfer_read %20[], %c0_i32 : memref<i32, strided<[], offset: ?>>, vector<i32>
%27 = vector.extractelement %26[] : vector<i32>
%28 = vector.multi_reduction <add>, %25, %27 [0] : vector<4xi32> to i32
%29 = vector.broadcast %28 : i32 to vector<i32>
vector.transfer_write %29, %20[] : vector<i32>, memref<i32, strided<[], offset: ?>>
linalg.generic {indexing_maps = [#map1, #map1], iterator_types = []} ins(%20 : memref<i32, strided<[], offset: ?>>) outs(%20 : memref<i32, strided<[], offset: ?>>) {
^bb0(%arg6: i32, %arg7: i32):
linalg.yield %arg6 : i32
}
}
}
linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel"]} ins(%15 : memref<4xi32, strided<[1], offset: ?>>) outs(%15 : memref<4xi32, strided<[1], offset: ?>>) {
^bb0(%arg4: i32, %arg5: i32):
linalg.yield %arg4 : i32
}
}
}
linalg.generic {indexing_maps = [#map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%11 : memref<32x32xi32, strided<[256, 1], offset: ?>>) outs(%11 : memref<32x32xi32, strided<[256, 1], offset: ?>>) {
^bb0(%arg2: i32, %arg3: i32):
linalg.yield %arg2 : i32
}
}
}
return
}
}
}
}
func.func private @_split_reduction_pass2() {
%c1048576 = arith.constant 1048576 : index
%c0 = arith.constant 0 : index
%c524288 = arith.constant 524288 : index
%c67108864 = arith.constant 67108864 : index
%c1_i32 = arith.constant 1 : i32
%c128_i32 = arith.constant 128 : i32
%c512 = arith.constant 512 : index
%c256 = arith.constant 256 : index
%c1 = arith.constant 1 : index
%0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c67108864}
%1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c67108864}) {
stream.cmd.fill %c1_i32, %arg0[%c0 for %c67108864] : i32 -> !stream.resource<transient>{%c67108864}
} => !stream.timepoint
%2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c67108864}
%3 = util.do_not_optimize(%2) : !stream.resource<transient>
%4 = stream.resource.size %3 : !stream.resource<transient>
%5 = stream.resource.alloc uninitialized : !stream.resource<external>{%c1048576}
%6 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%4}, %5 as %arg1: !stream.resource<external>{%c1048576}) {
stream.cmd.concurrent {
stream.cmd.dispatch @_split_reduction_pass2_dispatch_0::@_split_reduction_pass2_dispatch_0_generic_512x256x128[%c512, %c256, %c1] {
ro %arg0[%c0 for %4] : !stream.resource<transient>{%4},
wo %arg1[%c0 for %c1048576] : !stream.resource<external>{%c1048576}
} attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>]}
stream.cmd.fill %c128_i32, %arg1[%c524288 for %c524288] : i32 -> !stream.resource<external>{%c1048576}
}
} => !stream.timepoint
%7 = stream.timepoint.await %6 => %5 : !stream.resource<external>{%c1048576}
%8 = stream.resource.subview %7[%c0] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288}
%9 = stream.resource.subview %7[%c524288] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288}
%10 = stream.tensor.export %8 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
%11 = stream.tensor.export %9 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
check.expect_eq(%10, %11) : tensor<512x256xi32>
return
}
}
// -----// IR Dump After Canonicalizer (canonicalize) ('func.func' operation: @_split_reduction_pass2_dispatch_0_generic_512x256x128) //----- //
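// Note: with ins == outs after CSE, the copy linalg.generic ops are self-copies on
// buffers and the canonicalizer erases them in the dump below, which is why only the
// #map alias for the workgroup offset survives. A minimal sketch of such a removable
// self-copy (hypothetical function and alias names; same identity-map pattern as the
// erased ops):
#id_map_sketch = affine_map<(d0) -> (d0)>
func.func @self_copy_sketch(%m: memref<4xi32>) {
  // Reads %m and writes the same values back into %m: a no-op copy.
  linalg.generic {indexing_maps = [#id_map_sketch, #id_map_sketch], iterator_types = ["parallel"]}
      ins(%m : memref<4xi32>) outs(%m : memref<4xi32>) {
  ^bb0(%in: i32, %out: i32):
    linalg.yield %in : i32
  }
  return
}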
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#map = affine_map<()[s0] -> (s0 * 32)>
#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
func.func @split_reduction_pass2() attributes {iree.abi.stub} {
call @_split_reduction_pass2() : () -> ()
return
}
hal.executable private @_split_reduction_pass2_dispatch_0 {
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
hal.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 ordinal(0) layout(#pipeline_layout) attributes {translation_info = #translation} {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index):
%c1 = arith.constant 1 : index
%c16 = arith.constant 16 : index
%c8 = arith.constant 8 : index
hal.return %c8, %c16, %c1 : index, index, index
}
builtin.module {
func.func @_split_reduction_pass2_dispatch_0_generic_512x256x128() {
%cst = arith.constant dense<0> : vector<1x1x4xi32>
%c0_i32 = arith.constant 0 : i32
%c128 = arith.constant 128 : index
%c4 = arith.constant 4 : index
%c1 = arith.constant 1 : index
%c32 = arith.constant 32 : index
%c512 = arith.constant 512 : index
%c256 = arith.constant 256 : index
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : memref<512x256x128xi32>
memref.assume_alignment %0, 64 : memref<512x256x128xi32>
%1 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:512x256x128xi32>
%2 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : memref<512x256xi32>
memref.assume_alignment %2, 64 : memref<512x256xi32>
%3 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:512x256xi32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%4 = affine.apply #map()[%workgroup_id_y]
%5 = affine.apply #map()[%workgroup_count_y]
%6 = affine.apply #map()[%workgroup_id_x]
%7 = affine.apply #map()[%workgroup_count_x]
%8 = memref.alloca() {alignment = 128 : i64} : memref<1x1x4xi32>
vector.transfer_write %cst, %8[%c0, %c0, %c0] {in_bounds = [true, true, true]} : vector<1x1x4xi32>, memref<1x1x4xi32>
%9 = memref.collapse_shape %8 [[0, 1, 2]] : memref<1x1x4xi32> into memref<4xi32>
%10 = vector.transfer_read %9[%c0], %c0_i32 {in_bounds = [true]} : memref<4xi32>, vector<4xi32>
scf.for %arg0 = %4 to %c512 step %5 {
scf.for %arg1 = %6 to %c256 step %7 {
%11 = memref.subview %2[%arg0, %arg1] [32, 32] [1, 1] : memref<512x256xi32> to memref<32x32xi32, strided<[256, 1], offset: ?>>
%12 = memref.subview %0[%arg0, %arg1, 0] [32, 32, 128] [1, 1, 1] : memref<512x256x128xi32> to memref<32x32x128xi32, strided<[32768, 128, 1], offset: ?>>
scf.for %arg2 = %c0 to %c32 step %c1 {
scf.for %arg3 = %c0 to %c32 step %c4 {
%13 = memref.subview %12[%arg2, %arg3, 0] [1, 4, 128] [1, 1, 1] : memref<32x32x128xi32, strided<[32768, 128, 1], offset: ?>> to memref<4x128xi32, strided<[128, 1], offset: ?>>
%14 = memref.expand_shape %13 [[0, 1], [2]] : memref<4x128xi32, strided<[128, 1], offset: ?>> into memref<1x4x128xi32, strided<[512, 128, 1], offset: ?>>
%15 = memref.subview %11[%arg2, %arg3] [1, 4] [1, 1] : memref<32x32xi32, strided<[256, 1], offset: ?>> to memref<4xi32, strided<[1], offset: ?>>
%16 = memref.expand_shape %15 [[0, 1]] : memref<4xi32, strided<[1], offset: ?>> into memref<1x4xi32, strided<[4, 1], offset: ?>>
scf.for %arg4 = %c0 to %c128 step %c4 {
%17 = memref.subview %14[0, 0, %arg4] [1, 4, 4] [1, 1, 1] : memref<1x4x128xi32, strided<[512, 128, 1], offset: ?>> to memref<4x4xi32, strided<[128, 1], offset: ?>>
%18 = memref.expand_shape %17 [[0, 1], [2]] : memref<4x4xi32, strided<[128, 1], offset: ?>> into memref<1x4x4xi32, strided<[512, 128, 1], offset: ?>>
scf.for %arg5 = %c0 to %c4 step %c1 {
%19 = memref.subview %18[0, %arg5, 0] [1, 1, 4] [1, 1, 1] : memref<1x4x4xi32, strided<[512, 128, 1], offset: ?>> to memref<4xi32, strided<[1], offset: ?>>
%20 = memref.subview %16[0, %arg5] [1, 1] [1, 1] : memref<1x4xi32, strided<[4, 1], offset: ?>> to memref<i32, strided<[], offset: ?>>
%21 = memref.expand_shape %19 [[0, 1, 2, 3]] : memref<4xi32, strided<[1], offset: ?>> into memref<1x1x1x4xi32, strided<[4, 4, 4, 1], offset: ?>>
%22 = vector.transfer_read %21[%c0, %c0, %c0, %c0], %c0_i32 {in_bounds = [true]} : memref<1x1x1x4xi32, strided<[4, 4, 4, 1], offset: ?>>, vector<4xi32>
%23 = arith.addi %22, %10 : vector<4xi32>
vector.transfer_write %23, %9[%c0] {in_bounds = [true]} : vector<4xi32>, memref<4xi32>
%24 = memref.collapse_shape %8 [[0], [1, 2]] : memref<1x1x4xi32> into memref<1x4xi32>
%25 = vector.transfer_read %24[%c0, %c0], %c0_i32 {in_bounds = [true]} : memref<1x4xi32>, vector<4xi32>
%26 = vector.transfer_read %20[], %c0_i32 : memref<i32, strided<[], offset: ?>>, vector<i32>
%27 = vector.extractelement %26[] : vector<i32>
%28 = vector.multi_reduction <add>, %25, %27 [0] : vector<4xi32> to i32
%29 = vector.broadcast %28 : i32 to vector<i32>
vector.transfer_write %29, %20[] : vector<i32>, memref<i32, strided<[], offset: ?>>
}
}
}
}
}
}
return
}
}
}
}
func.func private @_split_reduction_pass2() {
%c1048576 = arith.constant 1048576 : index
%c0 = arith.constant 0 : index
%c524288 = arith.constant 524288 : index
%c67108864 = arith.constant 67108864 : index
%c1_i32 = arith.constant 1 : i32
%c128_i32 = arith.constant 128 : i32
%c512 = arith.constant 512 : index
%c256 = arith.constant 256 : index
%c1 = arith.constant 1 : index
%0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c67108864}
%1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c67108864}) {
stream.cmd.fill %c1_i32, %arg0[%c0 for %c67108864] : i32 -> !stream.resource<transient>{%c67108864}
} => !stream.timepoint
%2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c67108864}
%3 = util.do_not_optimize(%2) : !stream.resource<transient>
%4 = stream.resource.size %3 : !stream.resource<transient>
%5 = stream.resource.alloc uninitialized : !stream.resource<external>{%c1048576}
%6 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%4}, %5 as %arg1: !stream.resource<external>{%c1048576}) {
stream.cmd.concurrent {
stream.cmd.dispatch @_split_reduction_pass2_dispatch_0::@_split_reduction_pass2_dispatch_0_generic_512x256x128[%c512, %c256, %c1] {
ro %arg0[%c0 for %4] : !stream.resource<transient>{%4},
wo %arg1[%c0 for %c1048576] : !stream.resource<external>{%c1048576}
} attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>]}
stream.cmd.fill %c128_i32, %arg1[%c524288 for %c524288] : i32 -> !stream.resource<external>{%c1048576}
}
} => !stream.timepoint
%7 = stream.timepoint.await %6 => %5 : !stream.resource<external>{%c1048576}
%8 = stream.resource.subview %7[%c0] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288}
%9 = stream.resource.subview %7[%c524288] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288}
%10 = stream.tensor.export %8 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
%11 = stream.tensor.export %9 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
check.expect_eq(%10, %11) : tensor<512x256xi32>
return
}
}
// -----// IR Dump After CleanupBufferAllocView (iree-codegen-cleanup-buffer-alloc-view) ('func.func' operation: @_split_reduction_pass2_dispatch_0_generic_512x256x128) //----- //
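// Note: the visible change below is that the unused hal.interface.binding.subspan ops
// still producing !flow.dispatch.tensor views are dropped; only the memref views of the
// two bindings remain. A minimal sketch of such a dead binding view (hypothetical
// function name; the subspan op is copied from the dump, and whether it is removed by
// this cleanup pass specifically or by plain dead-code elimination is an assumption):
func.func @dead_binding_view_sketch() {
  %c0 = arith.constant 0 : index
  // No uses, so the view can simply be erased.
  %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:512x256x128xi32>
  return
}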
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#map = affine_map<()[s0] -> (s0 * 32)>
#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
func.func @split_reduction_pass2() attributes {iree.abi.stub} {
call @_split_reduction_pass2() : () -> ()
return
}
hal.executable private @_split_reduction_pass2_dispatch_0 {
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
hal.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 ordinal(0) layout(#pipeline_layout) attributes {translation_info = #translation} {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index):
%c1 = arith.constant 1 : index
%c16 = arith.constant 16 : index
%c8 = arith.constant 8 : index
hal.return %c8, %c16, %c1 : index, index, index
}
builtin.module {
func.func @_split_reduction_pass2_dispatch_0_generic_512x256x128() {
%cst = arith.constant dense<0> : vector<1x1x4xi32>
%c0_i32 = arith.constant 0 : i32
%c128 = arith.constant 128 : index
%c4 = arith.constant 4 : index
%c1 = arith.constant 1 : index
%c32 = arith.constant 32 : index
%c512 = arith.constant 512 : index
%c256 = arith.constant 256 : index
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : memref<512x256x128xi32>
memref.assume_alignment %0, 64 : memref<512x256x128xi32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : memref<512x256xi32>
memref.assume_alignment %1, 64 : memref<512x256xi32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%2 = affine.apply #map()[%workgroup_id_y]
%3 = affine.apply #map()[%workgroup_count_y]
%4 = affine.apply #map()[%workgroup_id_x]
%5 = affine.apply #map()[%workgroup_count_x]
%6 = memref.alloca() {alignment = 128 : i64} : memref<1x1x4xi32>
vector.transfer_write %cst, %6[%c0, %c0, %c0] {in_bounds = [true, true, true]} : vector<1x1x4xi32>, memref<1x1x4xi32>
%7 = memref.collapse_shape %6 [[0, 1, 2]] : memref<1x1x4xi32> into memref<4xi32>
%8 = vector.transfer_read %7[%c0], %c0_i32 {in_bounds = [true]} : memref<4xi32>, vector<4xi32>
scf.for %arg0 = %2 to %c512 step %3 {
scf.for %arg1 = %4 to %c256 step %5 {
%9 = memref.subview %1[%arg0, %arg1] [32, 32] [1, 1] : memref<512x256xi32> to memref<32x32xi32, strided<[256, 1], offset: ?>>
%10 = memref.subview %0[%arg0, %arg1, 0] [32, 32, 128] [1, 1, 1] : memref<512x256x128xi32> to memref<32x32x128xi32, strided<[32768, 128, 1], offset: ?>>
scf.for %arg2 = %c0 to %c32 step %c1 {
scf.for %arg3 = %c0 to %c32 step %c4 {
%11 = memref.subview %10[%arg2, %arg3, 0] [1, 4, 128] [1, 1, 1] : memref<32x32x128xi32, strided<[32768, 128, 1], offset: ?>> to memref<4x128xi32, strided<[128, 1], offset: ?>>
%12 = memref.expand_shape %11 [[0, 1], [2]] : memref<4x128xi32, strided<[128, 1], offset: ?>> into memref<1x4x128xi32, strided<[512, 128, 1], offset: ?>>
%13 = memref.subview %9[%arg2, %arg3] [1, 4] [1, 1] : memref<32x32xi32, strided<[256, 1], offset: ?>> to memref<4xi32, strided<[1], offset: ?>>
%14 = memref.expand_shape %13 [[0, 1]] : memref<4xi32, strided<[1], offset: ?>> into memref<1x4xi32, strided<[4, 1], offset: ?>>
scf.for %arg4 = %c0 to %c128 step %c4 {
%15 = memref.subview %12[0, 0, %arg4] [1, 4, 4] [1, 1, 1] : memref<1x4x128xi32, strided<[512, 128, 1], offset: ?>> to memref<4x4xi32, strided<[128, 1], offset: ?>>
%16 = memref.expand_shape %15 [[0, 1], [2]] : memref<4x4xi32, strided<[128, 1], offset: ?>> into memref<1x4x4xi32, strided<[512, 128, 1], offset: ?>>
scf.for %arg5 = %c0 to %c4 step %c1 {
%17 = memref.subview %16[0, %arg5, 0] [1, 1, 4] [1, 1, 1] : memref<1x4x4xi32, strided<[512, 128, 1], offset: ?>> to memref<4xi32, strided<[1], offset: ?>>
%18 = memref.subview %14[0, %arg5] [1, 1] [1, 1] : memref<1x4xi32, strided<[4, 1], offset: ?>> to memref<i32, strided<[], offset: ?>>
%19 = memref.expand_shape %17 [[0, 1, 2, 3]] : memref<4xi32, strided<[1], offset: ?>> into memref<1x1x1x4xi32, strided<[4, 4, 4, 1], offset: ?>>
%20 = vector.transfer_read %19[%c0, %c0, %c0, %c0], %c0_i32 {in_bounds = [true]} : memref<1x1x1x4xi32, strided<[4, 4, 4, 1], offset: ?>>, vector<4xi32>
%21 = arith.addi %20, %8 : vector<4xi32>
vector.transfer_write %21, %7[%c0] {in_bounds = [true]} : vector<4xi32>, memref<4xi32>
%22 = memref.collapse_shape %6 [[0], [1, 2]] : memref<1x1x4xi32> into memref<1x4xi32>
%23 = vector.transfer_read %22[%c0, %c0], %c0_i32 {in_bounds = [true]} : memref<1x4xi32>, vector<4xi32>
%24 = vector.transfer_read %18[], %c0_i32 : memref<i32, strided<[], offset: ?>>, vector<i32>
%25 = vector.extractelement %24[] : vector<i32>
%26 = vector.multi_reduction <add>, %23, %25 [0] : vector<4xi32> to i32
%27 = vector.broadcast %26 : i32 to vector<i32>
vector.transfer_write %27, %18[] : vector<i32>, memref<i32, strided<[], offset: ?>>
}
}
}
}
}
}
return
}
}
}
}
func.func private @_split_reduction_pass2() {
%c1048576 = arith.constant 1048576 : index
%c0 = arith.constant 0 : index
%c524288 = arith.constant 524288 : index
%c67108864 = arith.constant 67108864 : index
%c1_i32 = arith.constant 1 : i32
%c128_i32 = arith.constant 128 : i32
%c512 = arith.constant 512 : index
%c256 = arith.constant 256 : index
%c1 = arith.constant 1 : index
%0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c67108864}
%1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c67108864}) {
stream.cmd.fill %c1_i32, %arg0[%c0 for %c67108864] : i32 -> !stream.resource<transient>{%c67108864}
} => !stream.timepoint
%2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c67108864}
%3 = util.do_not_optimize(%2) : !stream.resource<transient>
%4 = stream.resource.size %3 : !stream.resource<transient>
%5 = stream.resource.alloc uninitialized : !stream.resource<external>{%c1048576}
%6 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%4}, %5 as %arg1: !stream.resource<external>{%c1048576}) {
stream.cmd.concurrent {
stream.cmd.dispatch @_split_reduction_pass2_dispatch_0::@_split_reduction_pass2_dispatch_0_generic_512x256x128[%c512, %c256, %c1] {
ro %arg0[%c0 for %4] : !stream.resource<transient>{%4},
wo %arg1[%c0 for %c1048576] : !stream.resource<external>{%c1048576}
} attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>]}
stream.cmd.fill %c128_i32, %arg1[%c524288 for %c524288] : i32 -> !stream.resource<external>{%c1048576}
}
} => !stream.timepoint
%7 = stream.timepoint.await %6 => %5 : !stream.resource<external>{%c1048576}
%8 = stream.resource.subview %7[%c0] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288}
%9 = stream.resource.subview %7[%c524288] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288}
%10 = stream.tensor.export %8 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
%11 = stream.tensor.export %9 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
check.expect_eq(%10, %11) : tensor<512x256xi32>
return
}
}
// -----// IR Dump After RemoveSingleIterationLoop (iree-codegen-remove-single-iteration-loop) ('func.func' operation: @_split_reduction_pass2_dispatch_0_generic_512x256x128) //----- //
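// Note: the export region above dispatches an 8x16x1 workgroup grid and the tile size is
// 32, so the two outer workgroup loops cover 256/32 = 8 and 512/32 = 16 tiles: exactly
// one trip per workgroup. The pass proves this from the workgroup id/count bounds and
// replaces each loop with its body at the workgroup's offset, as seen below. A generic
// single-trip analogue of the pattern (hypothetical function name, not the pass's
// actual bound reasoning):
func.func @single_trip_sketch(%m: memref<?xindex>, %base: index) {
  %c32 = arith.constant 32 : index
  %ub = arith.addi %base, %c32 : index
  // Lower bound %base, upper bound %base + 32, step 32: exactly one iteration, so the
  // loop can be replaced by its body with %i = %base.
  scf.for %i = %base to %ub step %c32 {
    memref.store %i, %m[%i] : memref<?xindex>
  }
  return
}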
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#map = affine_map<()[s0] -> (s0 * 32)>
#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
func.func @split_reduction_pass2() attributes {iree.abi.stub} {
call @_split_reduction_pass2() : () -> ()
return
}
hal.executable private @_split_reduction_pass2_dispatch_0 {
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
hal.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 ordinal(0) layout(#pipeline_layout) attributes {translation_info = #translation} {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index):
%c1 = arith.constant 1 : index
%c16 = arith.constant 16 : index
%c8 = arith.constant 8 : index
hal.return %c8, %c16, %c1 : index, index, index
}
builtin.module {
func.func @_split_reduction_pass2_dispatch_0_generic_512x256x128() {
%cst = arith.constant dense<0> : vector<1x1x4xi32>
%c0_i32 = arith.constant 0 : i32
%c128 = arith.constant 128 : index
%c4 = arith.constant 4 : index
%c1 = arith.constant 1 : index
%c32 = arith.constant 32 : index
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : memref<512x256x128xi32>
memref.assume_alignment %0, 64 : memref<512x256x128xi32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : memref<512x256xi32>
memref.assume_alignment %1, 64 : memref<512x256xi32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%2 = affine.apply #map()[%workgroup_id_y]
%3 = affine.apply #map()[%workgroup_id_x]
%4 = memref.alloca() {alignment = 128 : i64} : memref<1x1x4xi32>
vector.transfer_write %cst, %4[%c0, %c0, %c0] {in_bounds = [true, true, true]} : vector<1x1x4xi32>, memref<1x1x4xi32>
%5 = memref.collapse_shape %4 [[0, 1, 2]] : memref<1x1x4xi32> into memref<4xi32>
%6 = vector.transfer_read %5[%c0], %c0_i32 {in_bounds = [true]} : memref<4xi32>, vector<4xi32>
%7 = memref.subview %1[%2, %3] [32, 32] [1, 1] : memref<512x256xi32> to memref<32x32xi32, strided<[256, 1], offset: ?>>
%8 = memref.subview %0[%2, %3, 0] [32, 32, 128] [1, 1, 1] : memref<512x256x128xi32> to memref<32x32x128xi32, strided<[32768, 128, 1], offset: ?>>
scf.for %arg0 = %c0 to %c32 step %c1 {
scf.for %arg1 = %c0 to %c32 step %c4 {
%9 = memref.subview %8[%arg0, %arg1, 0] [1, 4, 128] [1, 1, 1] : memref<32x32x128xi32, strided<[32768, 128, 1], offset: ?>> to memref<4x128xi32, strided<[128, 1], offset: ?>>
%10 = memref.expand_shape %9 [[0, 1], [2]] : memref<4x128xi32, strided<[128, 1], offset: ?>> into memref<1x4x128xi32, strided<[512, 128, 1], offset: ?>>
%11 = memref.subview %7[%arg0, %arg1] [1, 4] [1, 1] : memref<32x32xi32, strided<[256, 1], offset: ?>> to memref<4xi32, strided<[1], offset: ?>>
%12 = memref.expand_shape %11 [[0, 1]] : memref<4xi32, strided<[1], offset: ?>> into memref<1x4xi32, strided<[4, 1], offset: ?>>
scf.for %arg2 = %c0 to %c128 step %c4 {
%13 = memref.subview %10[0, 0, %arg2] [1, 4, 4] [1, 1, 1] : memref<1x4x128xi32, strided<[512, 128, 1], offset: ?>> to memref<4x4xi32, strided<[128, 1], offset: ?>>
%14 = memref.expand_shape %13 [[0, 1], [2]] : memref<4x4xi32, strided<[128, 1], offset: ?>> into memref<1x4x4xi32, strided<[512, 128, 1], offset: ?>>
scf.for %arg3 = %c0 to %c4 step %c1 {
%15 = memref.subview %14[0, %arg3, 0] [1, 1, 4] [1, 1, 1] : memref<1x4x4xi32, strided<[512, 128, 1], offset: ?>> to memref<4xi32, strided<[1], offset: ?>>
%16 = memref.subview %12[0, %arg3] [1, 1] [1, 1] : memref<1x4xi32, strided<[4, 1], offset: ?>> to memref<i32, strided<[], offset: ?>>
%17 = memref.expand_shape %15 [[0, 1, 2, 3]] : memref<4xi32, strided<[1], offset: ?>> into memref<1x1x1x4xi32, strided<[4, 4, 4, 1], offset: ?>>
%18 = vector.transfer_read %17[%c0, %c0, %c0, %c0], %c0_i32 {in_bounds = [true]} : memref<1x1x1x4xi32, strided<[4, 4, 4, 1], offset: ?>>, vector<4xi32>
%19 = arith.addi %18, %6 : vector<4xi32>
vector.transfer_write %19, %5[%c0] {in_bounds = [true]} : vector<4xi32>, memref<4xi32>
%20 = memref.collapse_shape %4 [[0], [1, 2]] : memref<1x1x4xi32> into memref<1x4xi32>
%21 = vector.transfer_read %20[%c0, %c0], %c0_i32 {in_bounds = [true]} : memref<1x4xi32>, vector<4xi32>
%22 = vector.transfer_read %16[], %c0_i32 : memref<i32, strided<[], offset: ?>>, vector<i32>
%23 = vector.extractelement %22[] : vector<i32>
%24 = vector.multi_reduction <add>, %21, %23 [0] : vector<4xi32> to i32
%25 = vector.broadcast %24 : i32 to vector<i32>
vector.transfer_write %25, %16[] : vector<i32>, memref<i32, strided<[], offset: ?>>
}
}
}
}
return
}
}
}
}
func.func private @_split_reduction_pass2() {
%c1048576 = arith.constant 1048576 : index
%c0 = arith.constant 0 : index
%c524288 = arith.constant 524288 : index
%c67108864 = arith.constant 67108864 : index
%c1_i32 = arith.constant 1 : i32
%c128_i32 = arith.constant 128 : i32
%c512 = arith.constant 512 : index
%c256 = arith.constant 256 : index
%c1 = arith.constant 1 : index
%0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c67108864}
%1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c67108864}) {
stream.cmd.fill %c1_i32, %arg0[%c0 for %c67108864] : i32 -> !stream.resource<transient>{%c67108864}
} => !stream.timepoint
%2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c67108864}
%3 = util.do_not_optimize(%2) : !stream.resource<transient>
%4 = stream.resource.size %3 : !stream.resource<transient>
%5 = stream.resource.alloc uninitialized : !stream.resource<external>{%c1048576}
%6 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%4}, %5 as %arg1: !stream.resource<external>{%c1048576}) {
stream.cmd.concurrent {
stream.cmd.dispatch @_split_reduction_pass2_dispatch_0::@_split_reduction_pass2_dispatch_0_generic_512x256x128[%c512, %c256, %c1] {
ro %arg0[%c0 for %4] : !stream.resource<transient>{%4},
wo %arg1[%c0 for %c1048576] : !stream.resource<external>{%c1048576}
} attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>]}
stream.cmd.fill %c128_i32, %arg1[%c524288 for %c524288] : i32 -> !stream.resource<external>{%c1048576}
}
} => !stream.timepoint
%7 = stream.timepoint.await %6 => %5 : !stream.resource<external>{%c1048576}
%8 = stream.resource.subview %7[%c0] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288}
%9 = stream.resource.subview %7[%c524288] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288}
%10 = stream.tensor.export %8 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
%11 = stream.tensor.export %9 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
check.expect_eq(%10, %11) : tensor<512x256xi32>
return
}
}
// -----// IR Dump After LinalgStrategyEnablePass (iree-linalg-strategy-enable-pass) ('func.func' operation: @_split_reduction_pass2_dispatch_0_generic_512x256x128) //----- //
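// Note: in the visible (truncated) portion of the dump below, the loop-invariant
// memref.collapse_shape of the accumulator alloca is computed once ahead of the loop
// nest instead of inside the innermost loop, i.e. it has been hoisted; that this
// enabling step includes loop-invariant code motion is an assumption. A minimal sketch
// of the hoistable pattern (hypothetical function name):
func.func @hoist_sketch(%m: memref<1x4xi32>) {
  %c0 = arith.constant 0 : index
  %c1 = arith.constant 1 : index
  %c4 = arith.constant 4 : index
  scf.for %i = %c0 to %c4 step %c1 {
    // Depends only on %m, so it is loop-invariant and may be hoisted above the loop.
    %flat = memref.collapse_shape %m [[0, 1]] : memref<1x4xi32> into memref<4xi32>
    %v = memref.load %flat[%i] : memref<4xi32>
    memref.store %v, %flat[%i] : memref<4xi32>
  }
  return
}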
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#map = affine_map<()[s0] -> (s0 * 32)>
#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
func.func @split_reduction_pass2() attributes {iree.abi.stub} {
call @_split_reduction_pass2() : () -> ()
return
}
hal.executable private @_split_reduction_pass2_dispatch_0 {
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
hal.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 ordinal(0) layout(#pipeline_layout) attributes {translation_info = #translation} {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index):
%c1 = arith.constant 1 : index
%c16 = arith.constant 16 : index
%c8 = arith.constant 8 : index
hal.return %c8, %c16, %c1 : index, index, index
}
builtin.module {
func.func @_split_reduction_pass2_dispatch_0_generic_512x256x128() {
%cst = arith.constant dense<0> : vector<1x1x4xi32>
%c0_i32 = arith.constant 0 : i32
%c128 = arith.constant 128 : index
%c4 = arith.constant 4 : index
%c1 = arith.constant 1 : index
%c32 = arith.constant 32 : index
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : memref<512x256x128xi32>
memref.assume_alignment %0, 64 : memref<512x256x128xi32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : memref<512x256xi32>
memref.assume_alignment %1, 64 : memref<512x256xi32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%2 = affine.apply #map()[%workgroup_id_y]
%3 = affine.apply #map()[%workgroup_id_x]
%4 = memref.alloca() {alignment = 128 : i64} : memref<1x1x4xi32>
vector.transfer_write %cst, %4[%c0, %c0, %c0] {in_bounds = [true, true, true]} : vector<1x1x4xi32>, memref<1x1x4xi32>
%5 = memref.collapse_shape %4 [[0, 1, 2]] : memref<1x1x4xi32> into memref<4xi32>
%6 = vector.transfer_read %5[%c0], %c0_i32 {in_bounds = [true]} : memref<4xi32>, vector<4xi32>
%7 = memref.subview %1[%2, %3] [32, 32] [1, 1] : memref<512x256xi32> to memref<32x32xi32, strided<[256, 1], offset: ?>>
%8 = memref.subview %0[%2, %3, 0] [32, 32, 128] [1, 1, 1] : memref<512x256x128xi32> to memref<32x32x128xi32, strided<[32768, 128, 1], offset: ?>>
%9 = memref.collapse_shape %4 [[0], [1, 2]] : memref<1x1x4xi32> into memref<1x4xi32>
scf.for %arg0 = %c0 to %c32 step %c1 {
scf.for %arg1 = %c0 to %c32 step %c4 {
%10 = memref.subview %8[%arg0, %arg1, 0] [1, 4, 128] [1, 1, 1] : memref<32x32x128xi32, strided<[32768, 128, 1], offset: ?>> to memref<4x128xi32, strided<[128, 1], offset: ?>>
%11 = memref.expand_shape %10 [[0, 1], [2]] : memref<4x128xi32, strided<[128, 1], offset: ?>> into memref<1x4x128xi32, strided<[512, 128, 1], offset: ?>>
%12 = memref.subview %7[%arg0, %arg1] [1, 4] [1, 1] : memref<32x32xi32, strided<[256, 1], offset: ?>> to memref<4xi32, strided<[1], offset: ?>>
%13 = memref.expand_shape %12 [[0, 1]] : memref<4xi32, strided<[1], offset: ?>> into memref<1x4xi32, strided<[4, 1], offset: ?>>
scf.for %arg2 = %c0 to %c128 step %c4 {
%14 = memref.subview %11[0, 0, %arg2] [1, 4, 4] [1, 1, 1] : memref<1x4x128xi32, strided<[512, 128, 1], offset: ?>> to memref<4x4xi32, strided<[128, 1], offset: ?>>
%15 = memref.expand_shape %14 [[0, 1], [2]] : memref<4x4xi32, strided<[128, 1], offset: ?>> into memref<1x4x4xi32, strided<[512, 128, 1], offset: ?>>
scf.for %arg3 = %c0 to %c4 step %c1 {
%16 = memref.subview %15[0, %arg3, 0] [1, 1, 4] [1, 1, 1] : memref<1x4x4xi32, strided<[512, 128, 1], offset: ?>> to memref<4xi32, strided<[1], offset: ?>>
%17 = memref.subview %13[0, %arg3] [1, 1] [1, 1] : memref<1x4xi32, strided<[4, 1], offset: ?>> to memref<i32, strided<[], offset: ?>>
%18 = memref.expand_shape %16 [[0, 1, 2, 3]] : memref<4xi32, strided<[1], offset: ?>> into memref<1x1x1x4xi32, strided<[4, 4, 4, 1], offset: ?>>
%19 = vector.transfer_read %18[%c0, %c0, %c0, %c0], %c0_i32 {in_bounds = [true]} : memref<1x1x1x4xi32, strided<[4, 4, 4, 1], offset: ?>>, vector<4xi32>
%20 = arith.addi %19, %6 : vector<4xi32>
vector.transfer_write %20, %5[%c0] {in_bounds = [true]} : vector<4xi32>, memref<4xi32>
%21 = vector.transfer_read %9[%c0, %c0], %c0_i32 {in_bounds = [true]} : memref<1x4xi32>, vector<4xi32>
%22 = vector.transfer_read %17[], %c0_i32 : memref<i32, strided<[], offset: ?>>, vector<i32>
%23 = vector.extractelement %22[] : vector<i32>
%24 = vector.multi_reduction <add>, %21, %23 [0] : vector<4xi32> to i32
%25 = vector.broadcast %24 : i32 to vector<i32>
vector.transfer_write %25, %17[] : vector<i32>, memref<i32, strided<[], offset: ?>>
}
}
}
}
return
}
}
}
}
func.func private @_split_reduction_pass2() {
%c1048576 = arith.constant 1048576 : index
%c0 = arith.constant 0 : index
%c524288 = arith.constant 524288 : index
%c67108864 = arith.constant 67108864 : index
%c1_i32 = arith.constant 1 : i32
%c128_i32 = arith.constant 128 : i32
%c512 = arith.constant 512 : index
%c256 = arith.constant 256 : index
%c1 = arith.constant 1 : index
%0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c67108864}
%1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c67108864}) {
stream.cmd.fill %c1_i32, %arg0[%c0 for %c67108864] : i32 -> !stream.resource<transient>{%c67108864}
} => !stream.timepoint
%2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c67108864}
%3 = util.do_not_optimize(%2) : !stream.resource<transient>
%4 = stream.resource.size %3 : !stream.resource<transient>
%5 = stream.resource.alloc uninitialized : !stream.resource<external>{%c1048576}
%6 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%4}, %5 as %arg1: !stream.resource<external>{%c1048576}) {
stream.cmd.concurrent {
stream.cmd.dispatch @_split_reduction_pass2_dispatch_0::@_split_reduction_pass2_dispatch_0_generic_512x256x128[%c512, %c256, %c1] {
ro %arg0[%c0 for %4] : !stream.resource<transient>{%4},
wo %arg1[%c0 for %c1048576] : !stream.resource<external>{%c1048576}
} attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>]}
stream.cmd.fill %c128_i32, %arg1[%c524288 for %c524288] : i32 -> !stream.resource<external>{%c1048576}
}
} => !stream.timepoint
%7 = stream.timepoint.await %6 => %5 : !stream.resource<external>{%c1048576}
%8 = stream.resource.subview %7[%c0] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288}
%9 = stream.resource.subview %7[%c524288] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288}
%10 = stream.tensor.export %8 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
%11 = stream.tensor.export %9 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
check.expect_eq(%10, %11) : tensor<512x256xi32>
return
}
}
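// -----// Editor's note (annotation, not compiler output) //----- //
// In the LinalgStrategyEnablePass dump above, the dispatch is already tiled and vectorized:
// each workgroup owns a 32x32 block of the 512x256 output, the scf.for nest walks 1x4
// output strips while stepping over the 128-long reduction dimension 4 elements at a time,
// and the innermost body folds a vector<4xi32> slice into the scalar accumulator with
// vector.multi_reduction. A standalone sketch of that accumulation step is shown below;
// %v, %acc, and the function name are assumed names used for illustration only.
func.func @editor_note_accumulate(%v: vector<4xi32>, %acc: i32) -> i32 {
  %r = vector.multi_reduction <add>, %v, %acc [0] : vector<4xi32> to i32
  return %r : i32
}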
// -----// IR Dump After LinalgVectorLowering (linalg-vector-lowering) ('func.func' operation: @_split_reduction_pass2_dispatch_0_generic_512x256x128) //----- //
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#map = affine_map<()[s0] -> (s0 * 32)>
#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
func.func @split_reduction_pass2() attributes {iree.abi.stub} {
call @_split_reduction_pass2() : () -> ()
return
}
hal.executable private @_split_reduction_pass2_dispatch_0 {
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
hal.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 ordinal(0) layout(#pipeline_layout) attributes {translation_info = #translation} {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index):
%c1 = arith.constant 1 : index
%c16 = arith.constant 16 : index
%c8 = arith.constant 8 : index
hal.return %c8, %c16, %c1 : index, index, index
}
builtin.module {
func.func @_split_reduction_pass2_dispatch_0_generic_512x256x128() {
%cst = arith.constant dense<0> : vector<1x1x4xi32>
%c0_i32 = arith.constant 0 : i32
%c128 = arith.constant 128 : index
%c4 = arith.constant 4 : index
%c1 = arith.constant 1 : index
%c32 = arith.constant 32 : index
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : memref<512x256x128xi32>
memref.assume_alignment %0, 64 : memref<512x256x128xi32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : memref<512x256xi32>
memref.assume_alignment %1, 64 : memref<512x256xi32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%2 = affine.apply #map()[%workgroup_id_y]
%3 = affine.apply #map()[%workgroup_id_x]
%4 = memref.alloca() {alignment = 128 : i64} : memref<1x1x4xi32>
vector.transfer_write %cst, %4[%c0, %c0, %c0] {in_bounds = [true, true, true]} : vector<1x1x4xi32>, memref<1x1x4xi32>
%5 = memref.collapse_shape %4 [[0, 1, 2]] : memref<1x1x4xi32> into memref<4xi32>
%6 = vector.transfer_read %5[%c0], %c0_i32 {in_bounds = [true]} : memref<4xi32>, vector<4xi32>
%7 = memref.subview %1[%2, %3] [32, 32] [1, 1] : memref<512x256xi32> to memref<32x32xi32, strided<[256, 1], offset: ?>>
%8 = memref.subview %0[%2, %3, 0] [32, 32, 128] [1, 1, 1] : memref<512x256x128xi32> to memref<32x32x128xi32, strided<[32768, 128, 1], offset: ?>>
%9 = memref.collapse_shape %4 [[0], [1, 2]] : memref<1x1x4xi32> into memref<1x4xi32>
scf.for %arg0 = %c0 to %c32 step %c1 {
scf.for %arg1 = %c0 to %c32 step %c4 {
%10 = memref.subview %8[%arg0, %arg1, 0] [1, 4, 128] [1, 1, 1] : memref<32x32x128xi32, strided<[32768, 128, 1], offset: ?>> to memref<4x128xi32, strided<[128, 1], offset: ?>>
%11 = memref.expand_shape %10 [[0, 1], [2]] : memref<4x128xi32, strided<[128, 1], offset: ?>> into memref<1x4x128xi32, strided<[512, 128, 1], offset: ?>>
%12 = memref.subview %7[%arg0, %arg1] [1, 4] [1, 1] : memref<32x32xi32, strided<[256, 1], offset: ?>> to memref<4xi32, strided<[1], offset: ?>>
%13 = memref.expand_shape %12 [[0, 1]] : memref<4xi32, strided<[1], offset: ?>> into memref<1x4xi32, strided<[4, 1], offset: ?>>
scf.for %arg2 = %c0 to %c128 step %c4 {
%14 = memref.subview %11[0, 0, %arg2] [1, 4, 4] [1, 1, 1] : memref<1x4x128xi32, strided<[512, 128, 1], offset: ?>> to memref<4x4xi32, strided<[128, 1], offset: ?>>
%15 = memref.expand_shape %14 [[0, 1], [2]] : memref<4x4xi32, strided<[128, 1], offset: ?>> into memref<1x4x4xi32, strided<[512, 128, 1], offset: ?>>
scf.for %arg3 = %c0 to %c4 step %c1 {
%16 = memref.subview %15[0, %arg3, 0] [1, 1, 4] [1, 1, 1] : memref<1x4x4xi32, strided<[512, 128, 1], offset: ?>> to memref<4xi32, strided<[1], offset: ?>>
%17 = memref.subview %13[0, %arg3] [1, 1] [1, 1] : memref<1x4xi32, strided<[4, 1], offset: ?>> to memref<i32, strided<[], offset: ?>>
%18 = memref.expand_shape %16 [[0, 1, 2, 3]] : memref<4xi32, strided<[1], offset: ?>> into memref<1x1x1x4xi32, strided<[4, 4, 4, 1], offset: ?>>
%19 = vector.transfer_read %18[%c0, %c0, %c0, %c0], %c0_i32 {in_bounds = [true]} : memref<1x1x1x4xi32, strided<[4, 4, 4, 1], offset: ?>>, vector<4xi32>
%20 = arith.addi %19, %6 : vector<4xi32>
vector.transfer_write %20, %5[%c0] {in_bounds = [true]} : vector<4xi32>, memref<4xi32>
%21 = vector.transfer_read %9[%c0, %c0], %c0_i32 {in_bounds = [true]} : memref<1x4xi32>, vector<4xi32>
%22 = vector.transfer_read %17[], %c0_i32 : memref<i32, strided<[], offset: ?>>, vector<i32>
%23 = vector.extractelement %22[] : vector<i32>
%24 = vector.multi_reduction <add>, %21, %23 [0] : vector<4xi32> to i32
%25 = vector.broadcast %24 : i32 to vector<i32>
vector.transfer_write %25, %17[] : vector<i32>, memref<i32, strided<[], offset: ?>>
}
}
}
}
return
}
}
}
}
func.func private @_split_reduction_pass2() {
%c1048576 = arith.constant 1048576 : index
%c0 = arith.constant 0 : index
%c524288 = arith.constant 524288 : index
%c67108864 = arith.constant 67108864 : index
%c1_i32 = arith.constant 1 : i32
%c128_i32 = arith.constant 128 : i32
%c512 = arith.constant 512 : index
%c256 = arith.constant 256 : index
%c1 = arith.constant 1 : index
%0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c67108864}
%1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c67108864}) {
stream.cmd.fill %c1_i32, %arg0[%c0 for %c67108864] : i32 -> !stream.resource<transient>{%c67108864}
} => !stream.timepoint
%2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c67108864}
%3 = util.do_not_optimize(%2) : !stream.resource<transient>
%4 = stream.resource.size %3 : !stream.resource<transient>
%5 = stream.resource.alloc uninitialized : !stream.resource<external>{%c1048576}
%6 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%4}, %5 as %arg1: !stream.resource<external>{%c1048576}) {
stream.cmd.concurrent {
stream.cmd.dispatch @_split_reduction_pass2_dispatch_0::@_split_reduction_pass2_dispatch_0_generic_512x256x128[%c512, %c256, %c1] {
ro %arg0[%c0 for %4] : !stream.resource<transient>{%4},
wo %arg1[%c0 for %c1048576] : !stream.resource<external>{%c1048576}
} attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>]}
stream.cmd.fill %c128_i32, %arg1[%c524288 for %c524288] : i32 -> !stream.resource<external>{%c1048576}
}
} => !stream.timepoint
%7 = stream.timepoint.await %6 => %5 : !stream.resource<external>{%c1048576}
%8 = stream.resource.subview %7[%c0] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288}
%9 = stream.resource.subview %7[%c524288] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288}
%10 = stream.tensor.export %8 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
%11 = stream.tensor.export %9 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
check.expect_eq(%10, %11) : tensor<512x256xi32>
return
}
}
// -----// IR Dump After LinalgStrategyLowerVectorsPass (iree-linalg-strategy-lower-vectors-pass) ('func.func' operation: @_split_reduction_pass2_dispatch_0_generic_512x256x128) //----- //
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#map = affine_map<()[s0] -> (s0 * 32)>
#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
func.func @split_reduction_pass2() attributes {iree.abi.stub} {
call @_split_reduction_pass2() : () -> ()
return
}
hal.executable private @_split_reduction_pass2_dispatch_0 {
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
hal.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 ordinal(0) layout(#pipeline_layout) attributes {translation_info = #translation} {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index):
%c1 = arith.constant 1 : index
%c16 = arith.constant 16 : index
%c8 = arith.constant 8 : index
hal.return %c8, %c16, %c1 : index, index, index
}
builtin.module {
func.func @_split_reduction_pass2_dispatch_0_generic_512x256x128() {
%cst = arith.constant dense<0> : vector<1xi32>
%cst_0 = arith.constant dense<0> : vector<1x1x4xi32>
%c0_i32 = arith.constant 0 : i32
%c128 = arith.constant 128 : index
%c4 = arith.constant 4 : index
%c1 = arith.constant 1 : index
%c32 = arith.constant 32 : index
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : memref<512x256x128xi32>
memref.assume_alignment %0, 64 : memref<512x256x128xi32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : memref<512x256xi32>
memref.assume_alignment %1, 64 : memref<512x256xi32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%2 = affine.apply #map()[%workgroup_id_y]
%3 = affine.apply #map()[%workgroup_id_x]
%4 = memref.alloca() {alignment = 128 : i64} : memref<1x1x4xi32>
vector.transfer_write %cst_0, %4[%c0, %c0, %c0] {in_bounds = [true, true, true]} : vector<1x1x4xi32>, memref<1x1x4xi32>
%5 = memref.collapse_shape %4 [[0, 1, 2]] : memref<1x1x4xi32> into memref<4xi32>
%6 = vector.transfer_read %5[%c0], %c0_i32 {in_bounds = [true]} : memref<4xi32>, vector<4xi32>
%7 = memref.subview %1[%2, %3] [32, 32] [1, 1] : memref<512x256xi32> to memref<32x32xi32, strided<[256, 1], offset: ?>>
%8 = memref.subview %0[%2, %3, 0] [32, 32, 128] [1, 1, 1] : memref<512x256x128xi32> to memref<32x32x128xi32, strided<[32768, 128, 1], offset: ?>>
%9 = memref.collapse_shape %4 [[0], [1, 2]] : memref<1x1x4xi32> into memref<1x4xi32>
scf.for %arg0 = %c0 to %c32 step %c1 {
scf.for %arg1 = %c0 to %c32 step %c4 {
%10 = memref.subview %8[%arg0, %arg1, 0] [1, 4, 128] [1, 1, 1] : memref<32x32x128xi32, strided<[32768, 128, 1], offset: ?>> to memref<4x128xi32, strided<[128, 1], offset: ?>>
%11 = memref.expand_shape %10 [[0, 1], [2]] : memref<4x128xi32, strided<[128, 1], offset: ?>> into memref<1x4x128xi32, strided<[512, 128, 1], offset: ?>>
%12 = memref.subview %7[%arg0, %arg1] [1, 4] [1, 1] : memref<32x32xi32, strided<[256, 1], offset: ?>> to memref<4xi32, strided<[1], offset: ?>>
%13 = memref.expand_shape %12 [[0, 1]] : memref<4xi32, strided<[1], offset: ?>> into memref<1x4xi32, strided<[4, 1], offset: ?>>
scf.for %arg2 = %c0 to %c128 step %c4 {
%14 = memref.subview %11[0, 0, %arg2] [1, 4, 4] [1, 1, 1] : memref<1x4x128xi32, strided<[512, 128, 1], offset: ?>> to memref<4x4xi32, strided<[128, 1], offset: ?>>
%15 = memref.expand_shape %14 [[0, 1], [2]] : memref<4x4xi32, strided<[128, 1], offset: ?>> into memref<1x4x4xi32, strided<[512, 128, 1], offset: ?>>
scf.for %arg3 = %c0 to %c4 step %c1 {
%16 = memref.subview %15[0, %arg3, 0] [1, 1, 4] [1, 1, 1] : memref<1x4x4xi32, strided<[512, 128, 1], offset: ?>> to memref<4xi32, strided<[1], offset: ?>>
%17 = memref.subview %13[0, %arg3] [1, 1] [1, 1] : memref<1x4xi32, strided<[4, 1], offset: ?>> to memref<i32, strided<[], offset: ?>>
%18 = memref.expand_shape %16 [[0, 1, 2, 3]] : memref<4xi32, strided<[1], offset: ?>> into memref<1x1x1x4xi32, strided<[4, 4, 4, 1], offset: ?>>
%19 = vector.transfer_read %18[%c0, %c0, %c0, %c0], %c0_i32 {in_bounds = [true]} : memref<1x1x1x4xi32, strided<[4, 4, 4, 1], offset: ?>>, vector<4xi32>
%20 = arith.addi %19, %6 : vector<4xi32>
vector.transfer_write %20, %5[%c0] {in_bounds = [true]} : vector<4xi32>, memref<4xi32>
%21 = vector.transfer_read %9[%c0, %c0], %c0_i32 {in_bounds = [true]} : memref<1x4xi32>, vector<4xi32>
%22 = vector.transfer_read %17[], %c0_i32 : memref<i32, strided<[], offset: ?>>, vector<i32>
%23 = vector.extractelement %22[] : vector<i32>
%24 = vector.reduction <add>, %21, %23 : vector<4xi32> into i32
%25 = vector.insertelement %24, %cst[%c0 : index] : vector<1xi32>
%26 = vector.extract %25[0] : vector<1xi32>
%27 = vector.broadcast %26 : i32 to vector<i32>
vector.transfer_write %27, %17[] : vector<i32>, memref<i32, strided<[], offset: ?>>
}
}
}
}
return
}
}
}
}
func.func private @_split_reduction_pass2() {
%c1048576 = arith.constant 1048576 : index
%c0 = arith.constant 0 : index
%c524288 = arith.constant 524288 : index
%c67108864 = arith.constant 67108864 : index
%c1_i32 = arith.constant 1 : i32
%c128_i32 = arith.constant 128 : i32
%c512 = arith.constant 512 : index
%c256 = arith.constant 256 : index
%c1 = arith.constant 1 : index
%0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c67108864}
%1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c67108864}) {
stream.cmd.fill %c1_i32, %arg0[%c0 for %c67108864] : i32 -> !stream.resource<transient>{%c67108864}
} => !stream.timepoint
%2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c67108864}
%3 = util.do_not_optimize(%2) : !stream.resource<transient>
%4 = stream.resource.size %3 : !stream.resource<transient>
%5 = stream.resource.alloc uninitialized : !stream.resource<external>{%c1048576}
%6 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%4}, %5 as %arg1: !stream.resource<external>{%c1048576}) {
stream.cmd.concurrent {
stream.cmd.dispatch @_split_reduction_pass2_dispatch_0::@_split_reduction_pass2_dispatch_0_generic_512x256x128[%c512, %c256, %c1] {
ro %arg0[%c0 for %4] : !stream.resource<transient>{%4},
wo %arg1[%c0 for %c1048576] : !stream.resource<external>{%c1048576}
} attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>]}
stream.cmd.fill %c128_i32, %arg1[%c524288 for %c524288] : i32 -> !stream.resource<external>{%c1048576}
}
} => !stream.timepoint
%7 = stream.timepoint.await %6 => %5 : !stream.resource<external>{%c1048576}
%8 = stream.resource.subview %7[%c0] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288}
%9 = stream.resource.subview %7[%c524288] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288}
%10 = stream.tensor.export %8 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
%11 = stream.tensor.export %9 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
check.expect_eq(%10, %11) : tensor<512x256xi32>
return
}
}
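// -----// Editor's note (annotation, not compiler output) //----- //
// Relative to the two preceding dumps (the intervening LinalgVectorLowering dump repeats
// the LinalgStrategyEnablePass output unchanged), this LinalgStrategyLowerVectorsPass
// stage lowers the 1-D vector.multi_reduction to a plain vector.reduction, routing the
// scalar result through a dense<0> : vector<1xi32> constant via vector.insertelement and
// vector.extract before broadcasting it back for the store.
// A minimal before/after sketch of just the reduction op, with assumed names:
func.func @editor_note_lowered_reduction(%v: vector<4xi32>, %acc: i32) -> i32 {
  // Before: %r = vector.multi_reduction <add>, %v, %acc [0] : vector<4xi32> to i32
  // After this stage the same accumulation reads as:
  %r = vector.reduction <add>, %v, %acc : vector<4xi32> into i32
  return %r : i32
}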
// -----// IR Dump After LinalgVectorLowering (linalg-vector-lowering) ('func.func' operation: @_split_reduction_pass2_dispatch_0_generic_512x256x128) //----- //
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#map = affine_map<()[s0] -> (s0 * 32)>
#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
func.func @split_reduction_pass2() attributes {iree.abi.stub} {
call @_split_reduction_pass2() : () -> ()
return
}
hal.executable private @_split_reduction_pass2_dispatch_0 {
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
hal.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 ordinal(0) layout(#pipeline_layout) attributes {translation_info = #translation} {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index):
%c1 = arith.constant 1 : index
%c16 = arith.constant 16 : index
%c8 = arith.constant 8 : index
hal.return %c8, %c16, %c1 : index, index, index
}
builtin.module {
func.func @_split_reduction_pass2_dispatch_0_generic_512x256x128() {
%cst = arith.constant dense<0> : vector<1xi32>
%cst_0 = arith.constant dense<0> : vector<1x1x4xi32>
%c0_i32 = arith.constant 0 : i32
%c128 = arith.constant 128 : index
%c4 = arith.constant 4 : index
%c1 = arith.constant 1 : index
%c32 = arith.constant 32 : index
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : memref<512x256x128xi32>
memref.assume_alignment %0, 64 : memref<512x256x128xi32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : memref<512x256xi32>
memref.assume_alignment %1, 64 : memref<512x256xi32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%2 = affine.apply #map()[%workgroup_id_y]
%3 = affine.apply #map()[%workgroup_id_x]
%4 = memref.alloca() {alignment = 128 : i64} : memref<1x1x4xi32>
vector.transfer_write %cst_0, %4[%c0, %c0, %c0] {in_bounds = [true, true, true]} : vector<1x1x4xi32>, memref<1x1x4xi32>
%5 = memref.collapse_shape %4 [[0, 1, 2]] : memref<1x1x4xi32> into memref<4xi32>
%6 = vector.transfer_read %5[%c0], %c0_i32 {in_bounds = [true]} : memref<4xi32>, vector<4xi32>
%7 = memref.subview %1[%2, %3] [32, 32] [1, 1] : memref<512x256xi32> to memref<32x32xi32, strided<[256, 1], offset: ?>>
%8 = memref.subview %0[%2, %3, 0] [32, 32, 128] [1, 1, 1] : memref<512x256x128xi32> to memref<32x32x128xi32, strided<[32768, 128, 1], offset: ?>>
%9 = memref.collapse_shape %4 [[0], [1, 2]] : memref<1x1x4xi32> into memref<1x4xi32>
scf.for %arg0 = %c0 to %c32 step %c1 {
scf.for %arg1 = %c0 to %c32 step %c4 {
%10 = memref.subview %8[%arg0, %arg1, 0] [1, 4, 128] [1, 1, 1] : memref<32x32x128xi32, strided<[32768, 128, 1], offset: ?>> to memref<4x128xi32, strided<[128, 1], offset: ?>>
%11 = memref.expand_shape %10 [[0, 1], [2]] : memref<4x128xi32, strided<[128, 1], offset: ?>> into memref<1x4x128xi32, strided<[512, 128, 1], offset: ?>>
%12 = memref.subview %7[%arg0, %arg1] [1, 4] [1, 1] : memref<32x32xi32, strided<[256, 1], offset: ?>> to memref<4xi32, strided<[1], offset: ?>>
%13 = memref.expand_shape %12 [[0, 1]] : memref<4xi32, strided<[1], offset: ?>> into memref<1x4xi32, strided<[4, 1], offset: ?>>
scf.for %arg2 = %c0 to %c128 step %c4 {
%14 = memref.subview %11[0, 0, %arg2] [1, 4, 4] [1, 1, 1] : memref<1x4x128xi32, strided<[512, 128, 1], offset: ?>> to memref<4x4xi32, strided<[128, 1], offset: ?>>
%15 = memref.expand_shape %14 [[0, 1], [2]] : memref<4x4xi32, strided<[128, 1], offset: ?>> into memref<1x4x4xi32, strided<[512, 128, 1], offset: ?>>
scf.for %arg3 = %c0 to %c4 step %c1 {
%16 = memref.subview %15[0, %arg3, 0] [1, 1, 4] [1, 1, 1] : memref<1x4x4xi32, strided<[512, 128, 1], offset: ?>> to memref<4xi32, strided<[1], offset: ?>>
%17 = memref.subview %13[0, %arg3] [1, 1] [1, 1] : memref<1x4xi32, strided<[4, 1], offset: ?>> to memref<i32, strided<[], offset: ?>>
%18 = memref.expand_shape %16 [[0, 1, 2, 3]] : memref<4xi32, strided<[1], offset: ?>> into memref<1x1x1x4xi32, strided<[4, 4, 4, 1], offset: ?>>
%19 = vector.transfer_read %18[%c0, %c0, %c0, %c0], %c0_i32 {in_bounds = [true]} : memref<1x1x1x4xi32, strided<[4, 4, 4, 1], offset: ?>>, vector<4xi32>
%20 = arith.addi %19, %6 : vector<4xi32>
vector.transfer_write %20, %5[%c0] {in_bounds = [true]} : vector<4xi32>, memref<4xi32>
%21 = vector.transfer_read %9[%c0, %c0], %c0_i32 {in_bounds = [true]} : memref<1x4xi32>, vector<4xi32>
%22 = vector.transfer_read %17[], %c0_i32 : memref<i32, strided<[], offset: ?>>, vector<i32>
%23 = vector.extractelement %22[] : vector<i32>
%24 = vector.reduction <add>, %21, %23 : vector<4xi32> into i32
%25 = vector.insertelement %24, %cst[%c0 : index] : vector<1xi32>
%26 = vector.extract %25[0] : vector<1xi32>
%27 = vector.broadcast %26 : i32 to vector<i32>
vector.transfer_write %27, %17[] : vector<i32>, memref<i32, strided<[], offset: ?>>
}
}
}
}
return
}
}
}
}
func.func private @_split_reduction_pass2() {
%c1048576 = arith.constant 1048576 : index
%c0 = arith.constant 0 : index
%c524288 = arith.constant 524288 : index
%c67108864 = arith.constant 67108864 : index
%c1_i32 = arith.constant 1 : i32
%c128_i32 = arith.constant 128 : i32
%c512 = arith.constant 512 : index
%c256 = arith.constant 256 : index
%c1 = arith.constant 1 : index
%0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c67108864}
%1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c67108864}) {
stream.cmd.fill %c1_i32, %arg0[%c0 for %c67108864] : i32 -> !stream.resource<transient>{%c67108864}
} => !stream.timepoint
%2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c67108864}
%3 = util.do_not_optimize(%2) : !stream.resource<transient>
%4 = stream.resource.size %3 : !stream.resource<transient>
%5 = stream.resource.alloc uninitialized : !stream.resource<external>{%c1048576}
%6 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%4}, %5 as %arg1: !stream.resource<external>{%c1048576}) {
stream.cmd.concurrent {
stream.cmd.dispatch @_split_reduction_pass2_dispatch_0::@_split_reduction_pass2_dispatch_0_generic_512x256x128[%c512, %c256, %c1] {
ro %arg0[%c0 for %4] : !stream.resource<transient>{%4},
wo %arg1[%c0 for %c1048576] : !stream.resource<external>{%c1048576}
} attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>]}
stream.cmd.fill %c128_i32, %arg1[%c524288 for %c524288] : i32 -> !stream.resource<external>{%c1048576}
}
} => !stream.timepoint
%7 = stream.timepoint.await %6 => %5 : !stream.resource<external>{%c1048576}
%8 = stream.resource.subview %7[%c0] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288}
%9 = stream.resource.subview %7[%c524288] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288}
%10 = stream.tensor.export %8 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
%11 = stream.tensor.export %9 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
check.expect_eq(%10, %11) : tensor<512x256xi32>
return
}
}
// -----// IR Dump After LinalgStrategyLowerVectorsPass (iree-linalg-strategy-lower-vectors-pass) ('func.func' operation: @_split_reduction_pass2_dispatch_0_generic_512x256x128) //----- //
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#map = affine_map<()[s0] -> (s0 * 32)>
#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
func.func @split_reduction_pass2() attributes {iree.abi.stub} {
call @_split_reduction_pass2() : () -> ()
return
}
hal.executable private @_split_reduction_pass2_dispatch_0 {
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
hal.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 ordinal(0) layout(#pipeline_layout) attributes {translation_info = #translation} {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index):
%c1 = arith.constant 1 : index
%c16 = arith.constant 16 : index
%c8 = arith.constant 8 : index
hal.return %c8, %c16, %c1 : index, index, index
}
builtin.module {
func.func @_split_reduction_pass2_dispatch_0_generic_512x256x128() {
%cst = arith.constant dense<0> : vector<1xi32>
%cst_0 = arith.constant dense<0> : vector<1x1x4xi32>
%c128 = arith.constant 128 : index
%c4 = arith.constant 4 : index
%c1 = arith.constant 1 : index
%c32 = arith.constant 32 : index
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : memref<512x256x128xi32>
memref.assume_alignment %0, 64 : memref<512x256x128xi32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : memref<512x256xi32>
memref.assume_alignment %1, 64 : memref<512x256xi32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%2 = affine.apply #map()[%workgroup_id_y]
%3 = affine.apply #map()[%workgroup_id_x]
%4 = memref.alloca() {alignment = 128 : i64} : memref<1x1x4xi32>
vector.transfer_write %cst_0, %4[%c0, %c0, %c0] {in_bounds = [true, true, true]} : vector<1x1x4xi32>, memref<1x1x4xi32>
%5 = memref.collapse_shape %4 [[0, 1, 2]] : memref<1x1x4xi32> into memref<4xi32>
%6 = vector.load %5[%c0] : memref<4xi32>, vector<4xi32>
%7 = memref.subview %1[%2, %3] [32, 32] [1, 1] : memref<512x256xi32> to memref<32x32xi32, strided<[256, 1], offset: ?>>
%8 = memref.subview %0[%2, %3, 0] [32, 32, 128] [1, 1, 1] : memref<512x256x128xi32> to memref<32x32x128xi32, strided<[32768, 128, 1], offset: ?>>
%9 = memref.collapse_shape %4 [[0], [1, 2]] : memref<1x1x4xi32> into memref<1x4xi32>
scf.for %arg0 = %c0 to %c32 step %c1 {
scf.for %arg1 = %c0 to %c32 step %c4 {
%10 = memref.subview %8[%arg0, %arg1, 0] [1, 4, 128] [1, 1, 1] : memref<32x32x128xi32, strided<[32768, 128, 1], offset: ?>> to memref<4x128xi32, strided<[128, 1], offset: ?>>
%11 = memref.expand_shape %10 [[0, 1], [2]] : memref<4x128xi32, strided<[128, 1], offset: ?>> into memref<1x4x128xi32, strided<[512, 128, 1], offset: ?>>
%12 = memref.subview %7[%arg0, %arg1] [1, 4] [1, 1] : memref<32x32xi32, strided<[256, 1], offset: ?>> to memref<4xi32, strided<[1], offset: ?>>
%13 = memref.expand_shape %12 [[0, 1]] : memref<4xi32, strided<[1], offset: ?>> into memref<1x4xi32, strided<[4, 1], offset: ?>>
scf.for %arg2 = %c0 to %c128 step %c4 {
%14 = memref.subview %11[0, 0, %arg2] [1, 4, 4] [1, 1, 1] : memref<1x4x128xi32, strided<[512, 128, 1], offset: ?>> to memref<4x4xi32, strided<[128, 1], offset: ?>>
%15 = memref.expand_shape %14 [[0, 1], [2]] : memref<4x4xi32, strided<[128, 1], offset: ?>> into memref<1x4x4xi32, strided<[512, 128, 1], offset: ?>>
scf.for %arg3 = %c0 to %c4 step %c1 {
%16 = memref.subview %15[0, %arg3, 0] [1, 1, 4] [1, 1, 1] : memref<1x4x4xi32, strided<[512, 128, 1], offset: ?>> to memref<4xi32, strided<[1], offset: ?>>
%17 = memref.subview %13[0, %arg3] [1, 1] [1, 1] : memref<1x4xi32, strided<[4, 1], offset: ?>> to memref<i32, strided<[], offset: ?>>
%18 = memref.expand_shape %16 [[0, 1, 2, 3]] : memref<4xi32, strided<[1], offset: ?>> into memref<1x1x1x4xi32, strided<[4, 4, 4, 1], offset: ?>>
%19 = vector.load %18[%c0, %c0, %c0, %c0] : memref<1x1x1x4xi32, strided<[4, 4, 4, 1], offset: ?>>, vector<4xi32>
%20 = arith.addi %19, %6 : vector<4xi32>
vector.store %20, %5[%c0] : memref<4xi32>, vector<4xi32>
%21 = vector.load %9[%c0, %c0] : memref<1x4xi32>, vector<4xi32>
%22 = memref.load %17[] : memref<i32, strided<[], offset: ?>>
%23 = vector.broadcast %22 : i32 to vector<i32>
%24 = vector.extractelement %23[] : vector<i32>
%25 = vector.reduction <add>, %21, %24 : vector<4xi32> into i32
%26 = vector.insertelement %25, %cst[%c0 : index] : vector<1xi32>
%27 = vector.extract %26[0] : vector<1xi32>
%28 = vector.broadcast %27 : i32 to vector<i32>
%29 = vector.extractelement %28[] : vector<i32>
memref.store %29, %17[] : memref<i32, strided<[], offset: ?>>
}
}
}
}
return
}
}
}
}
func.func private @_split_reduction_pass2() {
%c1048576 = arith.constant 1048576 : index
%c0 = arith.constant 0 : index
%c524288 = arith.constant 524288 : index
%c67108864 = arith.constant 67108864 : index
%c1_i32 = arith.constant 1 : i32
%c128_i32 = arith.constant 128 : i32
%c512 = arith.constant 512 : index
%c256 = arith.constant 256 : index
%c1 = arith.constant 1 : index
%0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c67108864}
%1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c67108864}) {
stream.cmd.fill %c1_i32, %arg0[%c0 for %c67108864] : i32 -> !stream.resource<transient>{%c67108864}
} => !stream.timepoint
%2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c67108864}
%3 = util.do_not_optimize(%2) : !stream.resource<transient>
%4 = stream.resource.size %3 : !stream.resource<transient>
%5 = stream.resource.alloc uninitialized : !stream.resource<external>{%c1048576}
%6 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%4}, %5 as %arg1: !stream.resource<external>{%c1048576}) {
stream.cmd.concurrent {
stream.cmd.dispatch @_split_reduction_pass2_dispatch_0::@_split_reduction_pass2_dispatch_0_generic_512x256x128[%c512, %c256, %c1] {
ro %arg0[%c0 for %4] : !stream.resource<transient>{%4},
wo %arg1[%c0 for %c1048576] : !stream.resource<external>{%c1048576}
} attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>]}
stream.cmd.fill %c128_i32, %arg1[%c524288 for %c524288] : i32 -> !stream.resource<external>{%c1048576}
}
} => !stream.timepoint
%7 = stream.timepoint.await %6 => %5 : !stream.resource<external>{%c1048576}
%8 = stream.resource.subview %7[%c0] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288}
%9 = stream.resource.subview %7[%c524288] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288}
%10 = stream.tensor.export %8 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
%11 = stream.tensor.export %9 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
check.expect_eq(%10, %11) : tensor<512x256xi32>
return
}
}
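// -----// Editor's note (annotation, not compiler output) //----- //
// This LinalgStrategyLowerVectorsPass dump lowers the remaining vector transfers in the
// loop body: the in-bounds 1-D vector.transfer_read/transfer_write pairs become
// vector.load/vector.store, and the rank-0 transfers on memref<i32> become memref.load
// plus vector.broadcast on the read side and memref.store on the write side, so the
// %c0_i32 padding constant disappears. A self-contained sketch of the lowered forms,
// with assumed argument names and plain (non-strided) memref types:
func.func @editor_note_lowered_transfers(%m: memref<4xi32>, %s: memref<i32>) {
  %c0 = arith.constant 0 : index
  // In-bounds 1-D transfers become direct vector loads/stores.
  %v = vector.load %m[%c0] : memref<4xi32>, vector<4xi32>
  vector.store %v, %m[%c0] : memref<4xi32>, vector<4xi32>
  // Rank-0 transfers become scalar memref accesses.
  %x = memref.load %s[] : memref<i32>
  memref.store %x, %s[] : memref<i32>
  return
}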
// -----// IR Dump After LinalgVectorLowering (linalg-vector-lowering) ('func.func' operation: @_split_reduction_pass2_dispatch_0_generic_512x256x128) //----- //
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#map = affine_map<()[s0] -> (s0 * 32)>
#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
func.func @split_reduction_pass2() attributes {iree.abi.stub} {
call @_split_reduction_pass2() : () -> ()
return
}
hal.executable private @_split_reduction_pass2_dispatch_0 {
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
hal.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 ordinal(0) layout(#pipeline_layout) attributes {translation_info = #translation} {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index):
%c1 = arith.constant 1 : index
%c16 = arith.constant 16 : index
%c8 = arith.constant 8 : index
hal.return %c8, %c16, %c1 : index, index, index
}
builtin.module {
func.func @_split_reduction_pass2_dispatch_0_generic_512x256x128() {
%cst = arith.constant dense<0> : vector<1xi32>
%cst_0 = arith.constant dense<0> : vector<1x1x4xi32>
%c128 = arith.constant 128 : index
%c4 = arith.constant 4 : index
%c1 = arith.constant 1 : index
%c32 = arith.constant 32 : index
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : memref<512x256x128xi32>
memref.assume_alignment %0, 64 : memref<512x256x128xi32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : memref<512x256xi32>
memref.assume_alignment %1, 64 : memref<512x256xi32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%2 = affine.apply #map()[%workgroup_id_y]
%3 = affine.apply #map()[%workgroup_id_x]
%4 = memref.alloca() {alignment = 128 : i64} : memref<1x1x4xi32>
vector.transfer_write %cst_0, %4[%c0, %c0, %c0] {in_bounds = [true, true, true]} : vector<1x1x4xi32>, memref<1x1x4xi32>
%5 = memref.collapse_shape %4 [[0, 1, 2]] : memref<1x1x4xi32> into memref<4xi32>
%6 = vector.load %5[%c0] : memref<4xi32>, vector<4xi32>
%7 = memref.subview %1[%2, %3] [32, 32] [1, 1] : memref<512x256xi32> to memref<32x32xi32, strided<[256, 1], offset: ?>>
%8 = memref.subview %0[%2, %3, 0] [32, 32, 128] [1, 1, 1] : memref<512x256x128xi32> to memref<32x32x128xi32, strided<[32768, 128, 1], offset: ?>>
%9 = memref.collapse_shape %4 [[0], [1, 2]] : memref<1x1x4xi32> into memref<1x4xi32>
scf.for %arg0 = %c0 to %c32 step %c1 {
scf.for %arg1 = %c0 to %c32 step %c4 {
%10 = memref.subview %8[%arg0, %arg1, 0] [1, 4, 128] [1, 1, 1] : memref<32x32x128xi32, strided<[32768, 128, 1], offset: ?>> to memref<4x128xi32, strided<[128, 1], offset: ?>>
%11 = memref.expand_shape %10 [[0, 1], [2]] : memref<4x128xi32, strided<[128, 1], offset: ?>> into memref<1x4x128xi32, strided<[512, 128, 1], offset: ?>>
%12 = memref.subview %7[%arg0, %arg1] [1, 4] [1, 1] : memref<32x32xi32, strided<[256, 1], offset: ?>> to memref<4xi32, strided<[1], offset: ?>>
%13 = memref.expand_shape %12 [[0, 1]] : memref<4xi32, strided<[1], offset: ?>> into memref<1x4xi32, strided<[4, 1], offset: ?>>
scf.for %arg2 = %c0 to %c128 step %c4 {
%14 = memref.subview %11[0, 0, %arg2] [1, 4, 4] [1, 1, 1] : memref<1x4x128xi32, strided<[512, 128, 1], offset: ?>> to memref<4x4xi32, strided<[128, 1], offset: ?>>
%15 = memref.expand_shape %14 [[0, 1], [2]] : memref<4x4xi32, strided<[128, 1], offset: ?>> into memref<1x4x4xi32, strided<[512, 128, 1], offset: ?>>
scf.for %arg3 = %c0 to %c4 step %c1 {
%16 = memref.subview %15[0, %arg3, 0] [1, 1, 4] [1, 1, 1] : memref<1x4x4xi32, strided<[512, 128, 1], offset: ?>> to memref<4xi32, strided<[1], offset: ?>>
%17 = memref.subview %13[0, %arg3] [1, 1] [1, 1] : memref<1x4xi32, strided<[4, 1], offset: ?>> to memref<i32, strided<[], offset: ?>>
%18 = memref.expand_shape %16 [[0, 1, 2, 3]] : memref<4xi32, strided<[1], offset: ?>> into memref<1x1x1x4xi32, strided<[4, 4, 4, 1], offset: ?>>
%19 = vector.load %18[%c0, %c0, %c0, %c0] : memref<1x1x1x4xi32, strided<[4, 4, 4, 1], offset: ?>>, vector<4xi32>
%20 = arith.addi %19, %6 : vector<4xi32>
vector.store %20, %5[%c0] : memref<4xi32>, vector<4xi32>
%21 = vector.load %9[%c0, %c0] : memref<1x4xi32>, vector<4xi32>
%22 = memref.load %17[] : memref<i32, strided<[], offset: ?>>
%23 = vector.broadcast %22 : i32 to vector<i32>
%24 = vector.extractelement %23[] : vector<i32>
%25 = vector.reduction <add>, %21, %24 : vector<4xi32> into i32
%26 = vector.insertelement %25, %cst[%c0 : index] : vector<1xi32>
%27 = vector.extract %26[0] : vector<1xi32>
%28 = vector.broadcast %27 : i32 to vector<i32>
%29 = vector.extractelement %28[] : vector<i32>
memref.store %29, %17[] : memref<i32, strided<[], offset: ?>>
}
}
}
}
return
}
}
}
}
func.func private @_split_reduction_pass2() {
%c1048576 = arith.constant 1048576 : index
%c0 = arith.constant 0 : index
%c524288 = arith.constant 524288 : index
%c67108864 = arith.constant 67108864 : index
%c1_i32 = arith.constant 1 : i32
%c128_i32 = arith.constant 128 : i32
%c512 = arith.constant 512 : index
%c256 = arith.constant 256 : index
%c1 = arith.constant 1 : index
%0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c67108864}
%1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c67108864}) {
stream.cmd.fill %c1_i32, %arg0[%c0 for %c67108864] : i32 -> !stream.resource<transient>{%c67108864}
} => !stream.timepoint
%2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c67108864}
%3 = util.do_not_optimize(%2) : !stream.resource<transient>
%4 = stream.resource.size %3 : !stream.resource<transient>
%5 = stream.resource.alloc uninitialized : !stream.resource<external>{%c1048576}
%6 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%4}, %5 as %arg1: !stream.resource<external>{%c1048576}) {
stream.cmd.concurrent {
stream.cmd.dispatch @_split_reduction_pass2_dispatch_0::@_split_reduction_pass2_dispatch_0_generic_512x256x128[%c512, %c256, %c1] {
ro %arg0[%c0 for %4] : !stream.resource<transient>{%4},
wo %arg1[%c0 for %c1048576] : !stream.resource<external>{%c1048576}
} attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>]}
stream.cmd.fill %c128_i32, %arg1[%c524288 for %c524288] : i32 -> !stream.resource<external>{%c1048576}
}
} => !stream.timepoint
%7 = stream.timepoint.await %6 => %5 : !stream.resource<external>{%c1048576}
%8 = stream.resource.subview %7[%c0] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288}
%9 = stream.resource.subview %7[%c524288] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288}
%10 = stream.tensor.export %8 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
%11 = stream.tensor.export %9 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
check.expect_eq(%10, %11) : tensor<512x256xi32>
return
}
}
// -----// IR Dump After LinalgStrategyLowerVectorsPass (iree-linalg-strategy-lower-vectors-pass) ('func.func' operation: @_split_reduction_pass2_dispatch_0_generic_512x256x128) //----- //
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#map = affine_map<()[s0] -> (s0 * 32)>
#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
func.func @split_reduction_pass2() attributes {iree.abi.stub} {
call @_split_reduction_pass2() : () -> ()
return
}
hal.executable private @_split_reduction_pass2_dispatch_0 {
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
hal.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 ordinal(0) layout(#pipeline_layout) attributes {translation_info = #translation} {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index):
%c1 = arith.constant 1 : index
%c16 = arith.constant 16 : index
%c8 = arith.constant 8 : index
hal.return %c8, %c16, %c1 : index, index, index
}
builtin.module {
func.func @_split_reduction_pass2_dispatch_0_generic_512x256x128() {
%c0 = arith.constant 0 : index
%cst = arith.constant dense<0> : vector<1xi32>
%cst_0 = arith.constant dense<0> : vector<1x1x4xi32>
%c128 = arith.constant 128 : index
%c4 = arith.constant 4 : index
%c1 = arith.constant 1 : index
%c32 = arith.constant 32 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : memref<512x256x128xi32>
memref.assume_alignment %0, 64 : memref<512x256x128xi32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : memref<512x256xi32>
memref.assume_alignment %1, 64 : memref<512x256xi32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%2 = affine.apply #map()[%workgroup_id_y]
%3 = affine.apply #map()[%workgroup_id_x]
%4 = memref.alloca() {alignment = 128 : i64} : memref<1x1x4xi32>
%5 = vector.extract %cst_0[0, 0] : vector<1x1x4xi32>
vector.store %5, %4[%c0, %c0, %c0] : memref<1x1x4xi32>, vector<4xi32>
%6 = memref.collapse_shape %4 [[0, 1, 2]] : memref<1x1x4xi32> into memref<4xi32>
%7 = vector.load %6[%c0] : memref<4xi32>, vector<4xi32>
%8 = memref.subview %1[%2, %3] [32, 32] [1, 1] : memref<512x256xi32> to memref<32x32xi32, strided<[256, 1], offset: ?>>
%9 = memref.subview %0[%2, %3, 0] [32, 32, 128] [1, 1, 1] : memref<512x256x128xi32> to memref<32x32x128xi32, strided<[32768, 128, 1], offset: ?>>
%10 = memref.collapse_shape %4 [[0], [1, 2]] : memref<1x1x4xi32> into memref<1x4xi32>
scf.for %arg0 = %c0 to %c32 step %c1 {
scf.for %arg1 = %c0 to %c32 step %c4 {
%11 = memref.subview %9[%arg0, %arg1, 0] [1, 4, 128] [1, 1, 1] : memref<32x32x128xi32, strided<[32768, 128, 1], offset: ?>> to memref<4x128xi32, strided<[128, 1], offset: ?>>
%12 = memref.expand_shape %11 [[0, 1], [2]] : memref<4x128xi32, strided<[128, 1], offset: ?>> into memref<1x4x128xi32, strided<[512, 128, 1], offset: ?>>
%13 = memref.subview %8[%arg0, %arg1] [1, 4] [1, 1] : memref<32x32xi32, strided<[256, 1], offset: ?>> to memref<4xi32, strided<[1], offset: ?>>
%14 = memref.expand_shape %13 [[0, 1]] : memref<4xi32, strided<[1], offset: ?>> into memref<1x4xi32, strided<[4, 1], offset: ?>>
scf.for %arg2 = %c0 to %c128 step %c4 {
%15 = memref.subview %12[0, 0, %arg2] [1, 4, 4] [1, 1, 1] : memref<1x4x128xi32, strided<[512, 128, 1], offset: ?>> to memref<4x4xi32, strided<[128, 1], offset: ?>>
%16 = memref.expand_shape %15 [[0, 1], [2]] : memref<4x4xi32, strided<[128, 1], offset: ?>> into memref<1x4x4xi32, strided<[512, 128, 1], offset: ?>>
scf.for %arg3 = %c0 to %c4 step %c1 {
%17 = memref.subview %16[0, %arg3, 0] [1, 1, 4] [1, 1, 1] : memref<1x4x4xi32, strided<[512, 128, 1], offset: ?>> to memref<4xi32, strided<[1], offset: ?>>
%18 = memref.subview %14[0, %arg3] [1, 1] [1, 1] : memref<1x4xi32, strided<[4, 1], offset: ?>> to memref<i32, strided<[], offset: ?>>
%19 = memref.expand_shape %17 [[0, 1, 2, 3]] : memref<4xi32, strided<[1], offset: ?>> into memref<1x1x1x4xi32, strided<[4, 4, 4, 1], offset: ?>>
%20 = vector.load %19[%c0, %c0, %c0, %c0] : memref<1x1x1x4xi32, strided<[4, 4, 4, 1], offset: ?>>, vector<4xi32>
%21 = arith.addi %20, %7 : vector<4xi32>
vector.store %21, %6[%c0] : memref<4xi32>, vector<4xi32>
%22 = vector.load %10[%c0, %c0] : memref<1x4xi32>, vector<4xi32>
%23 = memref.load %18[] : memref<i32, strided<[], offset: ?>>
%24 = vector.broadcast %23 : i32 to vector<i32>
%25 = vector.extractelement %24[] : vector<i32>
%26 = vector.reduction <add>, %22, %25 : vector<4xi32> into i32
%27 = vector.insertelement %26, %cst[%c0 : index] : vector<1xi32>
%28 = vector.extract %27[0] : vector<1xi32>
%29 = vector.broadcast %28 : i32 to vector<i32>
%30 = vector.extractelement %29[] : vector<i32>
memref.store %30, %18[] : memref<i32, strided<[], offset: ?>>
}
}
}
}
return
}
}
}
}
func.func private @_split_reduction_pass2() {
%c1048576 = arith.constant 1048576 : index
%c0 = arith.constant 0 : index
%c524288 = arith.constant 524288 : index
%c67108864 = arith.constant 67108864 : index
%c1_i32 = arith.constant 1 : i32
%c128_i32 = arith.constant 128 : i32
%c512 = arith.constant 512 : index
%c256 = arith.constant 256 : index
%c1 = arith.constant 1 : index
%0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c67108864}
%1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c67108864}) {
stream.cmd.fill %c1_i32, %arg0[%c0 for %c67108864] : i32 -> !stream.resource<transient>{%c67108864}
} => !stream.timepoint
%2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c67108864}
%3 = util.do_not_optimize(%2) : !stream.resource<transient>
%4 = stream.resource.size %3 : !stream.resource<transient>
%5 = stream.resource.alloc uninitialized : !stream.resource<external>{%c1048576}
%6 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%4}, %5 as %arg1: !stream.resource<external>{%c1048576}) {
stream.cmd.concurrent {
stream.cmd.dispatch @_split_reduction_pass2_dispatch_0::@_split_reduction_pass2_dispatch_0_generic_512x256x128[%c512, %c256, %c1] {
ro %arg0[%c0 for %4] : !stream.resource<transient>{%4},
wo %arg1[%c0 for %c1048576] : !stream.resource<external>{%c1048576}
} attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>]}
stream.cmd.fill %c128_i32, %arg1[%c524288 for %c524288] : i32 -> !stream.resource<external>{%c1048576}
}
} => !stream.timepoint
%7 = stream.timepoint.await %6 => %5 : !stream.resource<external>{%c1048576}
%8 = stream.resource.subview %7[%c0] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288}
%9 = stream.resource.subview %7[%c524288] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288}
%10 = stream.tensor.export %8 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
%11 = stream.tensor.export %9 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
check.expect_eq(%10, %11) : tensor<512x256xi32>
return
}
}
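// -----// Editor's note (annotation, not compiler output) //----- //
// In this LinalgStrategyLowerVectorsPass dump the last transfer op is gone: the 3-D
// vector.transfer_write of the dense<0> : vector<1x1x4xi32> constant into the alloca is
// rewritten as a vector.extract of the inner vector<4xi32> followed by a vector.store.
// A minimal sketch of that rewrite, with an assumed function name and %buf argument:
func.func @editor_note_constant_store(%buf: memref<1x1x4xi32>) {
  %c0 = arith.constant 0 : index
  %cst = arith.constant dense<0> : vector<1x1x4xi32>
  // Peel the unit leading dimensions off the constant, then store the 1-D vector.
  %inner = vector.extract %cst[0, 0] : vector<1x1x4xi32>
  vector.store %inner, %buf[%c0, %c0, %c0] : memref<1x1x4xi32>, vector<4xi32>
  return
}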
// -----// IR Dump After LinalgVectorLowering (linalg-vector-lowering) ('func.func' operation: @_split_reduction_pass2_dispatch_0_generic_512x256x128) //----- //
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#map = affine_map<()[s0] -> (s0 * 32)>
#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
func.func @split_reduction_pass2() attributes {iree.abi.stub} {
call @_split_reduction_pass2() : () -> ()
return
}
hal.executable private @_split_reduction_pass2_dispatch_0 {
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
hal.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 ordinal(0) layout(#pipeline_layout) attributes {translation_info = #translation} {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index):
%c1 = arith.constant 1 : index
%c16 = arith.constant 16 : index
%c8 = arith.constant 8 : index
hal.return %c8, %c16, %c1 : index, index, index
}
builtin.module {
func.func @_split_reduction_pass2_dispatch_0_generic_512x256x128() {
%c0 = arith.constant 0 : index
%cst = arith.constant dense<0> : vector<1xi32>
%cst_0 = arith.constant dense<0> : vector<1x1x4xi32>
%c128 = arith.constant 128 : index
%c4 = arith.constant 4 : index
%c1 = arith.constant 1 : index
%c32 = arith.constant 32 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : memref<512x256x128xi32>
memref.assume_alignment %0, 64 : memref<512x256x128xi32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : memref<512x256xi32>
memref.assume_alignment %1, 64 : memref<512x256xi32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%2 = affine.apply #map()[%workgroup_id_y]
%3 = affine.apply #map()[%workgroup_id_x]
%4 = memref.alloca() {alignment = 128 : i64} : memref<1x1x4xi32>
%5 = vector.extract %cst_0[0, 0] : vector<1x1x4xi32>
vector.store %5, %4[%c0, %c0, %c0] : memref<1x1x4xi32>, vector<4xi32>
%6 = memref.collapse_shape %4 [[0, 1, 2]] : memref<1x1x4xi32> into memref<4xi32>
%7 = vector.load %6[%c0] : memref<4xi32>, vector<4xi32>
%8 = memref.subview %1[%2, %3] [32, 32] [1, 1] : memref<512x256xi32> to memref<32x32xi32, strided<[256, 1], offset: ?>>
%9 = memref.subview %0[%2, %3, 0] [32, 32, 128] [1, 1, 1] : memref<512x256x128xi32> to memref<32x32x128xi32, strided<[32768, 128, 1], offset: ?>>
%10 = memref.collapse_shape %4 [[0], [1, 2]] : memref<1x1x4xi32> into memref<1x4xi32>
scf.for %arg0 = %c0 to %c32 step %c1 {
scf.for %arg1 = %c0 to %c32 step %c4 {
%11 = memref.subview %9[%arg0, %arg1, 0] [1, 4, 128] [1, 1, 1] : memref<32x32x128xi32, strided<[32768, 128, 1], offset: ?>> to memref<4x128xi32, strided<[128, 1], offset: ?>>
%12 = memref.expand_shape %11 [[0, 1], [2]] : memref<4x128xi32, strided<[128, 1], offset: ?>> into memref<1x4x128xi32, strided<[512, 128, 1], offset: ?>>
%13 = memref.subview %8[%arg0, %arg1] [1, 4] [1, 1] : memref<32x32xi32, strided<[256, 1], offset: ?>> to memref<4xi32, strided<[1], offset: ?>>
%14 = memref.expand_shape %13 [[0, 1]] : memref<4xi32, strided<[1], offset: ?>> into memref<1x4xi32, strided<[4, 1], offset: ?>>
scf.for %arg2 = %c0 to %c128 step %c4 {
%15 = memref.subview %12[0, 0, %arg2] [1, 4, 4] [1, 1, 1] : memref<1x4x128xi32, strided<[512, 128, 1], offset: ?>> to memref<4x4xi32, strided<[128, 1], offset: ?>>
%16 = memref.expand_shape %15 [[0, 1], [2]] : memref<4x4xi32, strided<[128, 1], offset: ?>> into memref<1x4x4xi32, strided<[512, 128, 1], offset: ?>>
scf.for %arg3 = %c0 to %c4 step %c1 {
%17 = memref.subview %16[0, %arg3, 0] [1, 1, 4] [1, 1, 1] : memref<1x4x4xi32, strided<[512, 128, 1], offset: ?>> to memref<4xi32, strided<[1], offset: ?>>
%18 = memref.subview %14[0, %arg3] [1, 1] [1, 1] : memref<1x4xi32, strided<[4, 1], offset: ?>> to memref<i32, strided<[], offset: ?>>
%19 = memref.expand_shape %17 [[0, 1, 2, 3]] : memref<4xi32, strided<[1], offset: ?>> into memref<1x1x1x4xi32, strided<[4, 4, 4, 1], offset: ?>>
%20 = vector.load %19[%c0, %c0, %c0, %c0] : memref<1x1x1x4xi32, strided<[4, 4, 4, 1], offset: ?>>, vector<4xi32>
%21 = arith.addi %20, %7 : vector<4xi32>
vector.store %21, %6[%c0] : memref<4xi32>, vector<4xi32>
%22 = vector.load %10[%c0, %c0] : memref<1x4xi32>, vector<4xi32>
%23 = memref.load %18[] : memref<i32, strided<[], offset: ?>>
%24 = vector.broadcast %23 : i32 to vector<i32>
%25 = vector.extractelement %24[] : vector<i32>
%26 = vector.reduction <add>, %22, %25 : vector<4xi32> into i32
%27 = vector.insertelement %26, %cst[%c0 : index] : vector<1xi32>
%28 = vector.extract %27[0] : vector<1xi32>
%29 = vector.broadcast %28 : i32 to vector<i32>
%30 = vector.extractelement %29[] : vector<i32>
memref.store %30, %18[] : memref<i32, strided<[], offset: ?>>
}
}
}
}
return
}
}
}
}
func.func private @_split_reduction_pass2() {
%c1048576 = arith.constant 1048576 : index
%c0 = arith.constant 0 : index
%c524288 = arith.constant 524288 : index
%c67108864 = arith.constant 67108864 : index
%c1_i32 = arith.constant 1 : i32
%c128_i32 = arith.constant 128 : i32
%c512 = arith.constant 512 : index
%c256 = arith.constant 256 : index
%c1 = arith.constant 1 : index
%0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c67108864}
%1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c67108864}) {
stream.cmd.fill %c1_i32, %arg0[%c0 for %c67108864] : i32 -> !stream.resource<transient>{%c67108864}
} => !stream.timepoint
%2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c67108864}
%3 = util.do_not_optimize(%2) : !stream.resource<transient>
%4 = stream.resource.size %3 : !stream.resource<transient>
%5 = stream.resource.alloc uninitialized : !stream.resource<external>{%c1048576}
%6 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%4}, %5 as %arg1: !stream.resource<external>{%c1048576}) {
stream.cmd.concurrent {
stream.cmd.dispatch @_split_reduction_pass2_dispatch_0::@_split_reduction_pass2_dispatch_0_generic_512x256x128[%c512, %c256, %c1] {
ro %arg0[%c0 for %4] : !stream.resource<transient>{%4},
wo %arg1[%c0 for %c1048576] : !stream.resource<external>{%c1048576}
} attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>]}
stream.cmd.fill %c128_i32, %arg1[%c524288 for %c524288] : i32 -> !stream.resource<external>{%c1048576}
}
} => !stream.timepoint
%7 = stream.timepoint.await %6 => %5 : !stream.resource<external>{%c1048576}
%8 = stream.resource.subview %7[%c0] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288}
%9 = stream.resource.subview %7[%c524288] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288}
%10 = stream.tensor.export %8 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
%11 = stream.tensor.export %9 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
check.expect_eq(%10, %11) : tensor<512x256xi32>
return
}
}
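
After LinalgVectorLowering the dispatch body is a fully tiled, 4-wide vectorized loop nest: each workgroup owns a 32x32 output tile, the k dimension is walked four elements at a time, and each 4-element chunk is horizontally reduced into the corresponding output scalar. The C function below is a minimal model of that schedule; tile_reduce, base_i, and base_j are illustrative names (base_i/base_j play the role of #map applied to workgroup_id_y/workgroup_id_x), and the memref<1x1x4xi32> scratch round-trip, which adds a zero vector and reloads it, is omitted as a no-op.

#include <stdint.h>

/* C model of the vectorized tile loop: one workgroup reduces a 32x32 output
 * tile, consuming k in groups of 4 and folding each group into the output
 * element with a horizontal add (vector.reduction <add>). */
static void tile_reduce(const int32_t in[512][256][128], int32_t out[512][256],
                        int base_i, int base_j) {
  for (int i = 0; i < 32; ++i)            /* scf.for %arg0, step 1 */
    for (int j0 = 0; j0 < 32; j0 += 4)    /* scf.for %arg1, step 4 */
      for (int k = 0; k < 128; k += 4)    /* scf.for %arg2, step 4 */
        for (int j = 0; j < 4; ++j) {     /* scf.for %arg3, step 1 */
          int32_t lane[4];                                  /* vector.load of vector<4xi32> */
          int32_t acc = out[base_i + i][base_j + j0 + j];   /* memref.load of the out scalar */
          for (int l = 0; l < 4; ++l) lane[l] = in[base_i + i][base_j + j0 + j][k + l];
          for (int l = 0; l < 4; ++l) acc += lane[l];       /* horizontal reduction */
          out[base_i + i][base_j + j0 + j] = acc;           /* memref.store back */
        }
}
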
// -----// IR Dump After Canonicalizer (canonicalize) ('func.func' operation: @_split_reduction_pass2_dispatch_0_generic_512x256x128) //----- //
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#map = affine_map<()[s0] -> (s0 * 32)>
#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
func.func @split_reduction_pass2() attributes {iree.abi.stub} {
call @_split_reduction_pass2() : () -> ()
return
}
hal.executable private @_split_reduction_pass2_dispatch_0 {
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
hal.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 ordinal(0) layout(#pipeline_layout) attributes {translation_info = #translation} {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index):
%c1 = arith.constant 1 : index
%c16 = arith.constant 16 : index
%c8 = arith.constant 8 : index
hal.return %c8, %c16, %c1 : index, index, index
}
builtin.module {
func.func @_split_reduction_pass2_dispatch_0_generic_512x256x128() {
%cst = arith.constant dense<0> : vector<4xi32>
%c0 = arith.constant 0 : index
%cst_0 = arith.constant dense<0> : vector<1xi32>
%c128 = arith.constant 128 : index
%c4 = arith.constant 4 : index
%c1 = arith.constant 1 : index
%c32 = arith.constant 32 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : memref<512x256x128xi32>
memref.assume_alignment %0, 64 : memref<512x256x128xi32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : memref<512x256xi32>
memref.assume_alignment %1, 64 : memref<512x256xi32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%2 = affine.apply #map()[%workgroup_id_y]
%3 = affine.apply #map()[%workgroup_id_x]
%4 = memref.alloca() {alignment = 128 : i64} : memref<1x1x4xi32>
vector.store %cst, %4[%c0, %c0, %c0] : memref<1x1x4xi32>, vector<4xi32>
%5 = memref.collapse_shape %4 [[0, 1, 2]] : memref<1x1x4xi32> into memref<4xi32>
%6 = vector.load %5[%c0] : memref<4xi32>, vector<4xi32>
%7 = memref.subview %1[%2, %3] [32, 32] [1, 1] : memref<512x256xi32> to memref<32x32xi32, strided<[256, 1], offset: ?>>
%8 = memref.subview %0[%2, %3, 0] [32, 32, 128] [1, 1, 1] : memref<512x256x128xi32> to memref<32x32x128xi32, strided<[32768, 128, 1], offset: ?>>
%9 = memref.collapse_shape %4 [[0], [1, 2]] : memref<1x1x4xi32> into memref<1x4xi32>
scf.for %arg0 = %c0 to %c32 step %c1 {
scf.for %arg1 = %c0 to %c32 step %c4 {
%10 = memref.subview %8[%arg0, %arg1, 0] [1, 4, 128] [1, 1, 1] : memref<32x32x128xi32, strided<[32768, 128, 1], offset: ?>> to memref<4x128xi32, strided<[128, 1], offset: ?>>
%11 = memref.expand_shape %10 [[0, 1], [2]] : memref<4x128xi32, strided<[128, 1], offset: ?>> into memref<1x4x128xi32, strided<[512, 128, 1], offset: ?>>
%12 = memref.subview %7[%arg0, %arg1] [1, 4] [1, 1] : memref<32x32xi32, strided<[256, 1], offset: ?>> to memref<4xi32, strided<[1], offset: ?>>
%13 = memref.expand_shape %12 [[0, 1]] : memref<4xi32, strided<[1], offset: ?>> into memref<1x4xi32, strided<[4, 1], offset: ?>>
scf.for %arg2 = %c0 to %c128 step %c4 {
%14 = memref.subview %11[0, 0, %arg2] [1, 4, 4] [1, 1, 1] : memref<1x4x128xi32, strided<[512, 128, 1], offset: ?>> to memref<4x4xi32, strided<[128, 1], offset: ?>>
%15 = memref.expand_shape %14 [[0, 1], [2]] : memref<4x4xi32, strided<[128, 1], offset: ?>> into memref<1x4x4xi32, strided<[512, 128, 1], offset: ?>>
scf.for %arg3 = %c0 to %c4 step %c1 {
%16 = memref.subview %15[0, %arg3, 0] [1, 1, 4] [1, 1, 1] : memref<1x4x4xi32, strided<[512, 128, 1], offset: ?>> to memref<4xi32, strided<[1], offset: ?>>
%17 = memref.subview %13[0, %arg3] [1, 1] [1, 1] : memref<1x4xi32, strided<[4, 1], offset: ?>> to memref<i32, strided<[], offset: ?>>
%18 = memref.expand_shape %16 [[0, 1, 2, 3]] : memref<4xi32, strided<[1], offset: ?>> into memref<1x1x1x4xi32, strided<[4, 4, 4, 1], offset: ?>>
%19 = vector.load %18[%c0, %c0, %c0, %c0] : memref<1x1x1x4xi32, strided<[4, 4, 4, 1], offset: ?>>, vector<4xi32>
%20 = arith.addi %19, %6 : vector<4xi32>
vector.store %20, %5[%c0] : memref<4xi32>, vector<4xi32>
%21 = vector.load %9[%c0, %c0] : memref<1x4xi32>, vector<4xi32>
%22 = memref.load %17[] : memref<i32, strided<[], offset: ?>>
%23 = vector.broadcast %22 : i32 to vector<i32>
%24 = vector.extractelement %23[] : vector<i32>
%25 = vector.reduction <add>, %21, %24 : vector<4xi32> into i32
%26 = vector.insertelement %25, %cst_0[%c0 : index] : vector<1xi32>
%27 = vector.extract %26[0] : vector<1xi32>
%28 = vector.broadcast %27 : i32 to vector<i32>
%29 = vector.extractelement %28[] : vector<i32>
memref.store %29, %17[] : memref<i32, strided<[], offset: ?>>
}
}
}
}
return
}
}
}
}
func.func private @_split_reduction_pass2() {
%c1048576 = arith.constant 1048576 : index
%c0 = arith.constant 0 : index
%c524288 = arith.constant 524288 : index
%c67108864 = arith.constant 67108864 : index
%c1_i32 = arith.constant 1 : i32
%c128_i32 = arith.constant 128 : i32
%c512 = arith.constant 512 : index
%c256 = arith.constant 256 : index
%c1 = arith.constant 1 : index
%0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c67108864}
%1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c67108864}) {
stream.cmd.fill %c1_i32, %arg0[%c0 for %c67108864] : i32 -> !stream.resource<transient>{%c67108864}
} => !stream.timepoint
%2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c67108864}
%3 = util.do_not_optimize(%2) : !stream.resource<transient>
%4 = stream.resource.size %3 : !stream.resource<transient>
%5 = stream.resource.alloc uninitialized : !stream.resource<external>{%c1048576}
%6 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%4}, %5 as %arg1: !stream.resource<external>{%c1048576}) {
stream.cmd.concurrent {
stream.cmd.dispatch @_split_reduction_pass2_dispatch_0::@_split_reduction_pass2_dispatch_0_generic_512x256x128[%c512, %c256, %c1] {
ro %arg0[%c0 for %4] : !stream.resource<transient>{%4},
wo %arg1[%c0 for %c1048576] : !stream.resource<external>{%c1048576}
} attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>]}
stream.cmd.fill %c128_i32, %arg1[%c524288 for %c524288] : i32 -> !stream.resource<external>{%c1048576}
}
} => !stream.timepoint
%7 = stream.timepoint.await %6 => %5 : !stream.resource<external>{%c1048576}
%8 = stream.resource.subview %7[%c0] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288}
%9 = stream.resource.subview %7[%c524288] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288}
%10 = stream.tensor.export %8 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
%11 = stream.tensor.export %9 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
check.expect_eq(%10, %11) : tensor<512x256xi32>
return
}
}
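
The stream constants in the host function follow directly from the tensor shapes: the transient input is 512x256x128 x i32 and the external buffer holds two 512x256 x i32 views back to back. A few compile-time checks of that arithmetic, in plain C and independent of IREE (the macro names are illustrative):

#include <stdint.h>

/* Byte sizes behind the constants in @_split_reduction_pass2. */
#define ELEM      sizeof(int32_t)                  /* i32 */
#define IN_BYTES  (512u * 256u * 128u * ELEM)      /* %c67108864 */
#define OUT_BYTES (512u * 256u * ELEM)             /* %c524288: one 512x256xi32 view */
#define EXT_BYTES (2u * OUT_BYTES)                 /* %c1048576: result + reference */

_Static_assert(IN_BYTES == 67108864u, "transient resource size");
_Static_assert(OUT_BYTES == 524288u, "one exported tensor view");
_Static_assert(EXT_BYTES == 1048576u, "external resource holds two views");
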
// -----// IR Dump After LLVMCPULowerExecutableTarget (iree-llvmcpu-lower-executable-target) ('hal.executable.variant' operation: @embedded_elf_x86_64) //----- //
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#map = affine_map<()[s0] -> (s0 * 32)>
#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
func.func @split_reduction_pass2() attributes {iree.abi.stub} {
call @_split_reduction_pass2() : () -> ()
return
}
hal.executable private @_split_reduction_pass2_dispatch_0 {
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
hal.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 ordinal(0) layout(#pipeline_layout) attributes {translation_info = #translation} {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index):
%c1 = arith.constant 1 : index
%c16 = arith.constant 16 : index
%c8 = arith.constant 8 : index
hal.return %c8, %c16, %c1 : index, index, index
}
builtin.module {
func.func @_split_reduction_pass2_dispatch_0_generic_512x256x128() {
%cst = arith.constant dense<0> : vector<4xi32>
%c0 = arith.constant 0 : index
%cst_0 = arith.constant dense<0> : vector<1xi32>
%c128 = arith.constant 128 : index
%c4 = arith.constant 4 : index
%c1 = arith.constant 1 : index
%c32 = arith.constant 32 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : memref<512x256x128xi32>
memref.assume_alignment %0, 64 : memref<512x256x128xi32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : memref<512x256xi32>
memref.assume_alignment %1, 64 : memref<512x256xi32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%2 = affine.apply #map()[%workgroup_id_y]
%3 = affine.apply #map()[%workgroup_id_x]
%4 = memref.alloca() {alignment = 128 : i64} : memref<1x1x4xi32>
vector.store %cst, %4[%c0, %c0, %c0] : memref<1x1x4xi32>, vector<4xi32>
%5 = memref.collapse_shape %4 [[0, 1, 2]] : memref<1x1x4xi32> into memref<4xi32>
%6 = vector.load %5[%c0] : memref<4xi32>, vector<4xi32>
%7 = memref.subview %1[%2, %3] [32, 32] [1, 1] : memref<512x256xi32> to memref<32x32xi32, strided<[256, 1], offset: ?>>
%8 = memref.subview %0[%2, %3, 0] [32, 32, 128] [1, 1, 1] : memref<512x256x128xi32> to memref<32x32x128xi32, strided<[32768, 128, 1], offset: ?>>
%9 = memref.collapse_shape %4 [[0], [1, 2]] : memref<1x1x4xi32> into memref<1x4xi32>
scf.for %arg0 = %c0 to %c32 step %c1 {
scf.for %arg1 = %c0 to %c32 step %c4 {
%10 = memref.subview %8[%arg0, %arg1, 0] [1, 4, 128] [1, 1, 1] : memref<32x32x128xi32, strided<[32768, 128, 1], offset: ?>> to memref<4x128xi32, strided<[128, 1], offset: ?>>
%11 = memref.expand_shape %10 [[0, 1], [2]] : memref<4x128xi32, strided<[128, 1], offset: ?>> into memref<1x4x128xi32, strided<[512, 128, 1], offset: ?>>
%12 = memref.subview %7[%arg0, %arg1] [1, 4] [1, 1] : memref<32x32xi32, strided<[256, 1], offset: ?>> to memref<4xi32, strided<[1], offset: ?>>
%13 = memref.expand_shape %12 [[0, 1]] : memref<4xi32, strided<[1], offset: ?>> into memref<1x4xi32, strided<[4, 1], offset: ?>>
scf.for %arg2 = %c0 to %c128 step %c4 {
%14 = memref.subview %11[0, 0, %arg2] [1, 4, 4] [1, 1, 1] : memref<1x4x128xi32, strided<[512, 128, 1], offset: ?>> to memref<4x4xi32, strided<[128, 1], offset: ?>>
%15 = memref.expand_shape %14 [[0, 1], [2]] : memref<4x4xi32, strided<[128, 1], offset: ?>> into memref<1x4x4xi32, strided<[512, 128, 1], offset: ?>>
scf.for %arg3 = %c0 to %c4 step %c1 {
%16 = memref.subview %15[0, %arg3, 0] [1, 1, 4] [1, 1, 1] : memref<1x4x4xi32, strided<[512, 128, 1], offset: ?>> to memref<4xi32, strided<[1], offset: ?>>
%17 = memref.subview %13[0, %arg3] [1, 1] [1, 1] : memref<1x4xi32, strided<[4, 1], offset: ?>> to memref<i32, strided<[], offset: ?>>
%18 = memref.expand_shape %16 [[0, 1, 2, 3]] : memref<4xi32, strided<[1], offset: ?>> into memref<1x1x1x4xi32, strided<[4, 4, 4, 1], offset: ?>>
%19 = vector.load %18[%c0, %c0, %c0, %c0] : memref<1x1x1x4xi32, strided<[4, 4, 4, 1], offset: ?>>, vector<4xi32>
%20 = arith.addi %19, %6 : vector<4xi32>
vector.store %20, %5[%c0] : memref<4xi32>, vector<4xi32>
%21 = vector.load %9[%c0, %c0] : memref<1x4xi32>, vector<4xi32>
%22 = memref.load %17[] : memref<i32, strided<[], offset: ?>>
%23 = vector.broadcast %22 : i32 to vector<i32>
%24 = vector.extractelement %23[] : vector<i32>
%25 = vector.reduction <add>, %21, %24 : vector<4xi32> into i32
%26 = vector.insertelement %25, %cst_0[%c0 : index] : vector<1xi32>
%27 = vector.extract %26[0] : vector<1xi32>
%28 = vector.broadcast %27 : i32 to vector<i32>
%29 = vector.extractelement %28[] : vector<i32>
memref.store %29, %17[] : memref<i32, strided<[], offset: ?>>
}
}
}
}
return
}
}
}
}
func.func private @_split_reduction_pass2() {
%c1048576 = arith.constant 1048576 : index
%c0 = arith.constant 0 : index
%c524288 = arith.constant 524288 : index
%c67108864 = arith.constant 67108864 : index
%c1_i32 = arith.constant 1 : i32
%c128_i32 = arith.constant 128 : i32
%c512 = arith.constant 512 : index
%c256 = arith.constant 256 : index
%c1 = arith.constant 1 : index
%0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c67108864}
%1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c67108864}) {
stream.cmd.fill %c1_i32, %arg0[%c0 for %c67108864] : i32 -> !stream.resource<transient>{%c67108864}
} => !stream.timepoint
%2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c67108864}
%3 = util.do_not_optimize(%2) : !stream.resource<transient>
%4 = stream.resource.size %3 : !stream.resource<transient>
%5 = stream.resource.alloc uninitialized : !stream.resource<external>{%c1048576}
%6 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%4}, %5 as %arg1: !stream.resource<external>{%c1048576}) {
stream.cmd.concurrent {
stream.cmd.dispatch @_split_reduction_pass2_dispatch_0::@_split_reduction_pass2_dispatch_0_generic_512x256x128[%c512, %c256, %c1] {
ro %arg0[%c0 for %4] : !stream.resource<transient>{%4},
wo %arg1[%c0 for %c1048576] : !stream.resource<external>{%c1048576}
} attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>]}
stream.cmd.fill %c128_i32, %arg1[%c524288 for %c524288] : i32 -> !stream.resource<external>{%c1048576}
}
} => !stream.timepoint
%7 = stream.timepoint.await %6 => %5 : !stream.resource<external>{%c1048576}
%8 = stream.resource.subview %7[%c0] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288}
%9 = stream.resource.subview %7[%c524288] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288}
%10 = stream.tensor.export %8 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
%11 = stream.tensor.export %9 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
check.expect_eq(%10, %11) : tensor<512x256xi32>
return
}
}
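
The export region returns a workgroup count of 8 x 16 x 1, and #map scales each workgroup id by 32, so the 16 workgroups along y cover the 512 rows and the 8 along x cover the 256 columns with no remainder, one 32x32 tile per workgroup. The C sketch below just walks that grid arithmetic; it is a model of the export bookkeeping, not generated code.

#include <stdio.h>

/* Model of the workgroup grid chosen by the export region:
 * hal.return %c8, %c16, %c1 with #map = affine_map<()[s0] -> (s0 * 32)>. */
int main(void) {
  const int grid_x = 8, grid_y = 16, tile = 32;
  for (int wy = 0; wy < grid_y; ++wy)
    for (int wx = 0; wx < grid_x; ++wx) {
      int row0 = wy * tile;   /* %2 = #map()[%workgroup_id_y] */
      int col0 = wx * tile;   /* %3 = #map()[%workgroup_id_x] */
      printf("workgroup (%d,%d) -> rows [%d,%d), cols [%d,%d)\n",
             wx, wy, row0, row0 + tile, col0, col0 + tile);
    }
  /* grid_y * tile == 512 rows and grid_x * tile == 256 cols: full coverage. */
  return 0;
}
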
// -----// IR Dump After SCFToControlFlow (convert-scf-to-cf) ('func.func' operation: @_split_reduction_pass2_dispatch_0_generic_512x256x128) //----- //
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#map = affine_map<()[s0] -> (s0 * 32)>
#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
func.func @split_reduction_pass2() attributes {iree.abi.stub} {
call @_split_reduction_pass2() : () -> ()
return
}
hal.executable private @_split_reduction_pass2_dispatch_0 {
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
hal.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 ordinal(0) layout(#pipeline_layout) attributes {translation_info = #translation} {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index):
%c1 = arith.constant 1 : index
%c16 = arith.constant 16 : index
%c8 = arith.constant 8 : index
hal.return %c8, %c16, %c1 : index, index, index
}
builtin.module {
func.func @_split_reduction_pass2_dispatch_0_generic_512x256x128() {
%cst = arith.constant dense<0> : vector<4xi32>
%c0 = arith.constant 0 : index
%cst_0 = arith.constant dense<0> : vector<1xi32>
%c128 = arith.constant 128 : index
%c4 = arith.constant 4 : index
%c1 = arith.constant 1 : index
%c32 = arith.constant 32 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : memref<512x256x128xi32>
memref.assume_alignment %0, 64 : memref<512x256x128xi32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : memref<512x256xi32>
memref.assume_alignment %1, 64 : memref<512x256xi32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%2 = affine.apply #map()[%workgroup_id_y]
%3 = affine.apply #map()[%workgroup_id_x]
%4 = memref.alloca() {alignment = 128 : i64} : memref<1x1x4xi32>
vector.store %cst, %4[%c0, %c0, %c0] : memref<1x1x4xi32>, vector<4xi32>
%5 = memref.collapse_shape %4 [[0, 1, 2]] : memref<1x1x4xi32> into memref<4xi32>
%6 = vector.load %5[%c0] : memref<4xi32>, vector<4xi32>
%7 = memref.subview %1[%2, %3] [32, 32] [1, 1] : memref<512x256xi32> to memref<32x32xi32, strided<[256, 1], offset: ?>>
%8 = memref.subview %0[%2, %3, 0] [32, 32, 128] [1, 1, 1] : memref<512x256x128xi32> to memref<32x32x128xi32, strided<[32768, 128, 1], offset: ?>>
%9 = memref.collapse_shape %4 [[0], [1, 2]] : memref<1x1x4xi32> into memref<1x4xi32>
cf.br ^bb1(%c0 : index)
^bb1(%10: index): // 2 preds: ^bb0, ^bb11
%11 = arith.cmpi slt, %10, %c32 : index
cf.cond_br %11, ^bb2, ^bb12
^bb2: // pred: ^bb1
cf.br ^bb3(%c0 : index)
^bb3(%12: index): // 2 preds: ^bb2, ^bb10
%13 = arith.cmpi slt, %12, %c32 : index
cf.cond_br %13, ^bb4, ^bb11
^bb4: // pred: ^bb3
%14 = memref.subview %8[%10, %12, 0] [1, 4, 128] [1, 1, 1] : memref<32x32x128xi32, strided<[32768, 128, 1], offset: ?>> to memref<4x128xi32, strided<[128, 1], offset: ?>>
%15 = memref.expand_shape %14 [[0, 1], [2]] : memref<4x128xi32, strided<[128, 1], offset: ?>> into memref<1x4x128xi32, strided<[512, 128, 1], offset: ?>>
%16 = memref.subview %7[%10, %12] [1, 4] [1, 1] : memref<32x32xi32, strided<[256, 1], offset: ?>> to memref<4xi32, strided<[1], offset: ?>>
%17 = memref.expand_shape %16 [[0, 1]] : memref<4xi32, strided<[1], offset: ?>> into memref<1x4xi32, strided<[4, 1], offset: ?>>
cf.br ^bb5(%c0 : index)
^bb5(%18: index): // 2 preds: ^bb4, ^bb9
%19 = arith.cmpi slt, %18, %c128 : index
cf.cond_br %19, ^bb6, ^bb10
^bb6: // pred: ^bb5
%20 = memref.subview %15[0, 0, %18] [1, 4, 4] [1, 1, 1] : memref<1x4x128xi32, strided<[512, 128, 1], offset: ?>> to memref<4x4xi32, strided<[128, 1], offset: ?>>
%21 = memref.expand_shape %20 [[0, 1], [2]] : memref<4x4xi32, strided<[128, 1], offset: ?>> into memref<1x4x4xi32, strided<[512, 128, 1], offset: ?>>
cf.br ^bb7(%c0 : index)
^bb7(%22: index): // 2 preds: ^bb6, ^bb8
%23 = arith.cmpi slt, %22, %c4 : index
cf.cond_br %23, ^bb8, ^bb9
^bb8: // pred: ^bb7
%24 = memref.subview %21[0, %22, 0] [1, 1, 4] [1, 1, 1] : memref<1x4x4xi32, strided<[512, 128, 1], offset: ?>> to memref<4xi32, strided<[1], offset: ?>>
%25 = memref.subview %17[0, %22] [1, 1] [1, 1] : memref<1x4xi32, strided<[4, 1], offset: ?>> to memref<i32, strided<[], offset: ?>>
%26 = memref.expand_shape %24 [[0, 1, 2, 3]] : memref<4xi32, strided<[1], offset: ?>> into memref<1x1x1x4xi32, strided<[4, 4, 4, 1], offset: ?>>
%27 = vector.load %26[%c0, %c0, %c0, %c0] : memref<1x1x1x4xi32, strided<[4, 4, 4, 1], offset: ?>>, vector<4xi32>
%28 = arith.addi %27, %6 : vector<4xi32>
vector.store %28, %5[%c0] : memref<4xi32>, vector<4xi32>
%29 = vector.load %9[%c0, %c0] : memref<1x4xi32>, vector<4xi32>
%30 = memref.load %25[] : memref<i32, strided<[], offset: ?>>
%31 = vector.broadcast %30 : i32 to vector<i32>
%32 = vector.extractelement %31[] : vector<i32>
%33 = vector.reduction <add>, %29, %32 : vector<4xi32> into i32
%34 = vector.insertelement %33, %cst_0[%c0 : index] : vector<1xi32>
%35 = vector.extract %34[0] : vector<1xi32>
%36 = vector.broadcast %35 : i32 to vector<i32>
%37 = vector.extractelement %36[] : vector<i32>
memref.store %37, %25[] : memref<i32, strided<[], offset: ?>>
%38 = arith.addi %22, %c1 : index
cf.br ^bb7(%38 : index)
^bb9: // pred: ^bb7
%39 = arith.addi %18, %c4 : index
cf.br ^bb5(%39 : index)
^bb10: // pred: ^bb5
%40 = arith.addi %12, %c4 : index
cf.br ^bb3(%40 : index)
^bb11: // pred: ^bb3
%41 = arith.addi %10, %c1 : index
cf.br ^bb1(%41 : index)
^bb12: // pred: ^bb1
return
}
}
}
}
func.func private @_split_reduction_pass2() {
%c1048576 = arith.constant 1048576 : index
%c0 = arith.constant 0 : index
%c524288 = arith.constant 524288 : index
%c67108864 = arith.constant 67108864 : index
%c1_i32 = arith.constant 1 : i32
%c128_i32 = arith.constant 128 : i32
%c512 = arith.constant 512 : index
%c256 = arith.constant 256 : index
%c1 = arith.constant 1 : index
%0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c67108864}
%1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c67108864}) {
stream.cmd.fill %c1_i32, %arg0[%c0 for %c67108864] : i32 -> !stream.resource<transient>{%c67108864}
} => !stream.timepoint
%2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c67108864}
%3 = util.do_not_optimize(%2) : !stream.resource<transient>
%4 = stream.resource.size %3 : !stream.resource<transient>
%5 = stream.resource.alloc uninitialized : !stream.resource<external>{%c1048576}
%6 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%4}, %5 as %arg1: !stream.resource<external>{%c1048576}) {
stream.cmd.concurrent {
stream.cmd.dispatch @_split_reduction_pass2_dispatch_0::@_split_reduction_pass2_dispatch_0_generic_512x256x128[%c512, %c256, %c1] {
ro %arg0[%c0 for %4] : !stream.resource<transient>{%4},
wo %arg1[%c0 for %c1048576] : !stream.resource<external>{%c1048576}
} attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>]}
stream.cmd.fill %c128_i32, %arg1[%c524288 for %c524288] : i32 -> !stream.resource<external>{%c1048576}
}
} => !stream.timepoint
%7 = stream.timepoint.await %6 => %5 : !stream.resource<external>{%c1048576}
%8 = stream.resource.subview %7[%c0] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288}
%9 = stream.resource.subview %7[%c524288] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288}
%10 = stream.tensor.export %8 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
%11 = stream.tensor.export %9 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
check.expect_eq(%10, %11) : tensor<512x256xi32>
return
}
}
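
SCFToControlFlow has replaced the four nested scf.for loops with explicit blocks (^bb1 through ^bb12 above): each loop becomes a header block that carries its induction variable as a block argument, an arith.cmpi plus cf.cond_br pair, and a back edge that passes the incremented value. The same rewrite for a single loop, sketched in C with goto; loop_as_cfg and body are illustrative names, and this is only a picture of the pattern, not generated code.

/* scf.for %arg0 = %c0 to %c32 step %c1 { body(%arg0) }
 * after convert-scf-to-cf, modeled as an explicit CFG: */
void loop_as_cfg(void (*body)(long)) {
  long iv = 0;                   /* block argument of the header block */
header:
  if (!(iv < 32)) goto exit;     /* arith.cmpi slt + cf.cond_br */
  body(iv);                      /* loop body block */
  iv = iv + 1;                   /* arith.addi %iv, %c1 */
  goto header;                   /* back edge: cf.br ^header(%next) */
exit:
  return;                        /* exit block */
}
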
// -----// IR Dump After Canonicalizer (canonicalize) ('func.func' operation: @_split_reduction_pass2_dispatch_0_generic_512x256x128) //----- //
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#map = affine_map<()[s0] -> (s0 * 32)>
#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
func.func @split_reduction_pass2() attributes {iree.abi.stub} {
call @_split_reduction_pass2() : () -> ()
return
}
hal.executable private @_split_reduction_pass2_dispatch_0 {
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
hal.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 ordinal(0) layout(#pipeline_layout) attributes {translation_info = #translation} {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index):
%c1 = arith.constant 1 : index
%c16 = arith.constant 16 : index
%c8 = arith.constant 8 : index
hal.return %c8, %c16, %c1 : index, index, index
}
builtin.module {
func.func @_split_reduction_pass2_dispatch_0_generic_512x256x128() {
%cst = arith.constant dense<0> : vector<4xi32>
%c0 = arith.constant 0 : index
%cst_0 = arith.constant dense<0> : vector<1xi32>
%c128 = arith.constant 128 : index
%c4 = arith.constant 4 : index
%c1 = arith.constant 1 : index
%c32 = arith.constant 32 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : memref<512x256x128xi32>
memref.assume_alignment %0, 64 : memref<512x256x128xi32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : memref<512x256xi32>
memref.assume_alignment %1, 64 : memref<512x256xi32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%2 = affine.apply #map()[%workgroup_id_y]
%3 = affine.apply #map()[%workgroup_id_x]
%4 = memref.alloca() {alignment = 128 : i64} : memref<1x1x4xi32>
vector.store %cst, %4[%c0, %c0, %c0] : memref<1x1x4xi32>, vector<4xi32>
%5 = memref.collapse_shape %4 [[0, 1, 2]] : memref<1x1x4xi32> into memref<4xi32>
%6 = vector.load %5[%c0] : memref<4xi32>, vector<4xi32>
%7 = memref.subview %1[%2, %3] [32, 32] [1, 1] : memref<512x256xi32> to memref<32x32xi32, strided<[256, 1], offset: ?>>
%8 = memref.subview %0[%2, %3, 0] [32, 32, 128] [1, 1, 1] : memref<512x256x128xi32> to memref<32x32x128xi32, strided<[32768, 128, 1], offset: ?>>
%9 = memref.collapse_shape %4 [[0], [1, 2]] : memref<1x1x4xi32> into memref<1x4xi32>
cf.br ^bb1(%c0 : index)
^bb1(%10: index): // 2 preds: ^bb0, ^bb10
%11 = arith.cmpi slt, %10, %c32 : index
cf.cond_br %11, ^bb2(%c0 : index), ^bb11
^bb2(%12: index): // 2 preds: ^bb1, ^bb9
%13 = arith.cmpi slt, %12, %c32 : index
cf.cond_br %13, ^bb3, ^bb10
^bb3: // pred: ^bb2
%14 = memref.subview %8[%10, %12, 0] [1, 4, 128] [1, 1, 1] : memref<32x32x128xi32, strided<[32768, 128, 1], offset: ?>> to memref<4x128xi32, strided<[128, 1], offset: ?>>
%15 = memref.expand_shape %14 [[0, 1], [2]] : memref<4x128xi32, strided<[128, 1], offset: ?>> into memref<1x4x128xi32, strided<[512, 128, 1], offset: ?>>
%16 = memref.subview %7[%10, %12] [1, 4] [1, 1] : memref<32x32xi32, strided<[256, 1], offset: ?>> to memref<4xi32, strided<[1], offset: ?>>
%17 = memref.expand_shape %16 [[0, 1]] : memref<4xi32, strided<[1], offset: ?>> into memref<1x4xi32, strided<[4, 1], offset: ?>>
cf.br ^bb4(%c0 : index)
^bb4(%18: index): // 2 preds: ^bb3, ^bb8
%19 = arith.cmpi slt, %18, %c128 : index
cf.cond_br %19, ^bb5, ^bb9
^bb5: // pred: ^bb4
%20 = memref.subview %15[0, 0, %18] [1, 4, 4] [1, 1, 1] : memref<1x4x128xi32, strided<[512, 128, 1], offset: ?>> to memref<4x4xi32, strided<[128, 1], offset: ?>>
%21 = memref.expand_shape %20 [[0, 1], [2]] : memref<4x4xi32, strided<[128, 1], offset: ?>> into memref<1x4x4xi32, strided<[512, 128, 1], offset: ?>>
cf.br ^bb6(%c0 : index)
^bb6(%22: index): // 2 preds: ^bb5, ^bb7
%23 = arith.cmpi slt, %22, %c4 : index
cf.cond_br %23, ^bb7, ^bb8
^bb7: // pred: ^bb6
%24 = memref.subview %21[0, %22, 0] [1, 1, 4] [1, 1, 1] : memref<1x4x4xi32, strided<[512, 128, 1], offset: ?>> to memref<4xi32, strided<[1], offset: ?>>
%25 = memref.subview %17[0, %22] [1, 1] [1, 1] : memref<1x4xi32, strided<[4, 1], offset: ?>> to memref<i32, strided<[], offset: ?>>
%26 = memref.expand_shape %24 [[0, 1, 2, 3]] : memref<4xi32, strided<[1], offset: ?>> into memref<1x1x1x4xi32, strided<[4, 4, 4, 1], offset: ?>>
%27 = vector.load %26[%c0, %c0, %c0, %c0] : memref<1x1x1x4xi32, strided<[4, 4, 4, 1], offset: ?>>, vector<4xi32>
%28 = arith.addi %27, %6 : vector<4xi32>
vector.store %28, %5[%c0] : memref<4xi32>, vector<4xi32>
%29 = vector.load %9[%c0, %c0] : memref<1x4xi32>, vector<4xi32>
%30 = memref.load %25[] : memref<i32, strided<[], offset: ?>>
%31 = vector.broadcast %30 : i32 to vector<i32>
%32 = vector.extractelement %31[] : vector<i32>
%33 = vector.reduction <add>, %29, %32 : vector<4xi32> into i32
%34 = vector.insertelement %33, %cst_0[%c0 : index] : vector<1xi32>
%35 = vector.extract %34[0] : vector<1xi32>
%36 = vector.broadcast %35 : i32 to vector<i32>
%37 = vector.extractelement %36[] : vector<i32>
memref.store %37, %25[] : memref<i32, strided<[], offset: ?>>
%38 = arith.addi %22, %c1 : index
cf.br ^bb6(%38 : index)
^bb8: // pred: ^bb6
%39 = arith.addi %18, %c4 : index
cf.br ^bb4(%39 : index)
^bb9: // pred: ^bb4
%40 = arith.addi %12, %c4 : index
cf.br ^bb2(%40 : index)
^bb10: // pred: ^bb2
%41 = arith.addi %10, %c1 : index
cf.br ^bb1(%41 : index)
^bb11: // pred: ^bb1
return
}
}
}
}
func.func private @_split_reduction_pass2() {
%c1048576 = arith.constant 1048576 : index
%c0 = arith.constant 0 : index
%c524288 = arith.constant 524288 : index
%c67108864 = arith.constant 67108864 : index
%c1_i32 = arith.constant 1 : i32
%c128_i32 = arith.constant 128 : i32
%c512 = arith.constant 512 : index
%c256 = arith.constant 256 : index
%c1 = arith.constant 1 : index
%0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c67108864}
%1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c67108864}) {
stream.cmd.fill %c1_i32, %arg0[%c0 for %c67108864] : i32 -> !stream.resource<transient>{%c67108864}
} => !stream.timepoint
%2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c67108864}
%3 = util.do_not_optimize(%2) : !stream.resource<transient>
%4 = stream.resource.size %3 : !stream.resource<transient>
%5 = stream.resource.alloc uninitialized : !stream.resource<external>{%c1048576}
%6 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%4}, %5 as %arg1: !stream.resource<external>{%c1048576}) {
stream.cmd.concurrent {
stream.cmd.dispatch @_split_reduction_pass2_dispatch_0::@_split_reduction_pass2_dispatch_0_generic_512x256x128[%c512, %c256, %c1] {
ro %arg0[%c0 for %4] : !stream.resource<transient>{%4},
wo %arg1[%c0 for %c1048576] : !stream.resource<external>{%c1048576}
} attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>]}
stream.cmd.fill %c128_i32, %arg1[%c524288 for %c524288] : i32 -> !stream.resource<external>{%c1048576}
}
} => !stream.timepoint
%7 = stream.timepoint.await %6 => %5 : !stream.resource<external>{%c1048576}
%8 = stream.resource.subview %7[%c0] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288}
%9 = stream.resource.subview %7[%c524288] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288}
%10 = stream.tensor.export %8 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
%11 = stream.tensor.export %9 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
check.expect_eq(%10, %11) : tensor<512x256xi32>
return
}
}
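
The ConvertToLLVM dump that follows rebuilds every memref as the standard MLIR descriptor, an allocated pointer, an aligned pointer, an offset, and per-rank size and stride arrays, which is why the IR below is dominated by llvm.insertvalue/llvm.extractvalue on !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>. The C struct below models that rank-3 descriptor for the 512x256x128xi32 binding; the field and function names (memref_3d_i32, load_elem) are illustrative, since the lowering only manipulates the anonymous struct shown in the IR.

#include <stdint.h>

/* C model of the rank-3 memref descriptor materialized by ConvertToLLVM:
 *   !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)> */
struct memref_3d_i32 {
  int32_t *allocated;   /* field 0: allocated pointer */
  int32_t *aligned;     /* field 1: aligned pointer   */
  int64_t  offset;      /* field 2: element offset    */
  int64_t  sizes[3];    /* field 3: {512, 256, 128}   */
  int64_t  strides[3];  /* field 4: {32768, 128, 1}   */
};

/* Element address the descriptor encodes:
 * aligned + offset + i*strides[0] + j*strides[1] + k*strides[2]. */
static int32_t load_elem(const struct memref_3d_i32 *m,
                         int64_t i, int64_t j, int64_t k) {
  return m->aligned[m->offset + i * m->strides[0] + j * m->strides[1] + k * m->strides[2]];
}
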
// -----// IR Dump After ConvertToLLVM (iree-convert-to-llvm) ('builtin.module' operation) //----- //
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
func.func @split_reduction_pass2() attributes {iree.abi.stub} {
call @_split_reduction_pass2() : () -> ()
return
}
hal.executable private @_split_reduction_pass2_dispatch_0 {
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
hal.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 ordinal(0) layout(#pipeline_layout) attributes {translation_info = #translation} {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index):
%c1 = arith.constant 1 : index
%c16 = arith.constant 16 : index
%c8 = arith.constant 8 : index
hal.return %c8, %c16, %c1 : index, index, index
}
builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} {
llvm.func internal @_split_reduction_pass2_dispatch_0_generic_512x256x128(%arg0: !llvm.ptr<struct<"iree_hal_executable_environment_v0_t", (ptr<i32>, ptr<func<i32 (ptr<func<i32 (ptr<i8>)>>, ptr<i8>)>>, ptr<ptr<func<i32 (ptr<i8>)>>>, struct<"iree_hal_processor_v0_t", (array<8 x i64>)>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>> {llvm.align = 16 : i64, llvm.noalias}) -> i32 {
%0 = llvm.mlir.constant(dense<0> : vector<4xi32>) : vector<4xi32>
%1 = llvm.mlir.constant(0 : index) : i64
%2 = llvm.mlir.constant(dense<0> : vector<1xi32>) : vector<1xi32>
%3 = llvm.mlir.constant(128 : index) : i64
%4 = llvm.mlir.constant(4 : index) : i64
%5 = llvm.mlir.constant(1 : index) : i64
%6 = llvm.mlir.constant(32 : index) : i64
%7 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>>
%8 = llvm.extractvalue %7[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>
%9 = llvm.mlir.constant(0 : i64) : i64
%10 = llvm.load %8 : !llvm.ptr<ptr<i8>>
%11 = llvm.bitcast %10 : !llvm.ptr<i8> to !llvm.ptr<i32>
%12 = llvm.mlir.undef : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%13 = llvm.insertvalue %11, %12[0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%14 = llvm.insertvalue %11, %13[1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%15 = llvm.mlir.constant(0 : index) : i64
%16 = llvm.insertvalue %15, %14[2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%17 = llvm.mlir.constant(512 : index) : i64
%18 = llvm.insertvalue %17, %16[3, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%19 = llvm.mlir.constant(32768 : index) : i64
%20 = llvm.insertvalue %19, %18[4, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%21 = llvm.mlir.constant(256 : index) : i64
%22 = llvm.insertvalue %21, %20[3, 1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%23 = llvm.mlir.constant(128 : index) : i64
%24 = llvm.insertvalue %23, %22[4, 1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%25 = llvm.mlir.constant(128 : index) : i64
%26 = llvm.insertvalue %25, %24[3, 2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%27 = llvm.mlir.constant(1 : index) : i64
%28 = llvm.insertvalue %27, %26[4, 2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%29 = llvm.extractvalue %28[1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%30 = llvm.mlir.constant(0 : index) : i64
%31 = llvm.mlir.constant(63 : index) : i64
%32 = llvm.ptrtoint %29 : !llvm.ptr<i32> to i64
%33 = llvm.and %32, %31 : i64
%34 = llvm.icmp "eq" %33, %30 : i64
"llvm.intr.assume"(%34) : (i1) -> ()
%35 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>>
%36 = llvm.extractvalue %35[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>
%37 = llvm.mlir.constant(1 : i64) : i64
%38 = llvm.getelementptr %36[1] : (!llvm.ptr<ptr<i8>>) -> !llvm.ptr<ptr<i8>>
%39 = llvm.load %38 : !llvm.ptr<ptr<i8>>
%40 = llvm.bitcast %39 : !llvm.ptr<i8> to !llvm.ptr<i32>
%41 = llvm.mlir.undef : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%42 = llvm.insertvalue %40, %41[0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%43 = llvm.insertvalue %40, %42[1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%44 = llvm.mlir.constant(0 : index) : i64
%45 = llvm.insertvalue %44, %43[2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%46 = llvm.mlir.constant(512 : index) : i64
%47 = llvm.insertvalue %46, %45[3, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%48 = llvm.mlir.constant(256 : index) : i64
%49 = llvm.insertvalue %48, %47[4, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%50 = llvm.mlir.constant(256 : index) : i64
%51 = llvm.insertvalue %50, %49[3, 1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%52 = llvm.mlir.constant(1 : index) : i64
%53 = llvm.insertvalue %52, %51[4, 1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%54 = llvm.extractvalue %53[1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%55 = llvm.mlir.constant(0 : index) : i64
%56 = llvm.mlir.constant(63 : index) : i64
%57 = llvm.ptrtoint %54 : !llvm.ptr<i32> to i64
%58 = llvm.and %57, %56 : i64
%59 = llvm.icmp "eq" %58, %55 : i64
"llvm.intr.assume"(%59) : (i1) -> ()
%60 = llvm.load %arg2 : !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>>
%61 = llvm.extractvalue %60[0] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>
%62 = llvm.zext %61 : i32 to i64
%63 = llvm.load %arg2 : !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>>
%64 = llvm.extractvalue %63[1] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>
%65 = llvm.zext %64 : i32 to i64
%66 = llvm.mlir.constant(32 : index) : i64
%67 = llvm.mul %65, %66 : i64
%68 = llvm.mlir.constant(32 : index) : i64
%69 = llvm.mul %62, %68 : i64
%70 = llvm.mlir.constant(1 : index) : i64
%71 = llvm.mlir.constant(1 : index) : i64
%72 = llvm.mlir.constant(4 : index) : i64
%73 = llvm.mlir.constant(1 : index) : i64
%74 = llvm.mlir.constant(4 : index) : i64
%75 = llvm.mlir.constant(4 : index) : i64
%76 = llvm.mlir.null : !llvm.ptr<i32>
%77 = llvm.getelementptr %76[%75] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32>
%78 = llvm.ptrtoint %77 : !llvm.ptr<i32> to i64
%79 = llvm.alloca %78 x i32 {alignment = 128 : i64} : (i64) -> !llvm.ptr<i32>
%80 = llvm.mlir.undef : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%81 = llvm.insertvalue %79, %80[0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%82 = llvm.insertvalue %79, %81[1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%83 = llvm.mlir.constant(0 : index) : i64
%84 = llvm.insertvalue %83, %82[2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%85 = llvm.insertvalue %70, %84[3, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%86 = llvm.insertvalue %71, %85[3, 1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%87 = llvm.insertvalue %72, %86[3, 2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%88 = llvm.insertvalue %74, %87[4, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%89 = llvm.insertvalue %72, %88[4, 1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%90 = llvm.insertvalue %73, %89[4, 2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%91 = llvm.extractvalue %90[1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%92 = llvm.mlir.constant(4 : index) : i64
%93 = llvm.mul %1, %92 : i64
%94 = llvm.mlir.constant(4 : index) : i64
%95 = llvm.mul %1, %94 : i64
%96 = llvm.add %93, %95 : i64
%97 = llvm.add %96, %1 : i64
%98 = llvm.getelementptr %91[%97] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32>
%99 = llvm.bitcast %98 : !llvm.ptr<i32> to !llvm.ptr<vector<4xi32>>
llvm.store %0, %99 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>>
%100 = llvm.mlir.undef : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<1 x i64>, array<1 x i64>)>
%101 = llvm.extractvalue %90[0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%102 = llvm.insertvalue %101, %100[0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<1 x i64>, array<1 x i64>)>
%103 = llvm.extractvalue %90[1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%104 = llvm.insertvalue %103, %102[1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<1 x i64>, array<1 x i64>)>
%105 = llvm.extractvalue %90[2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%106 = llvm.insertvalue %105, %104[2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<1 x i64>, array<1 x i64>)>
%107 = llvm.mlir.constant(4 : index) : i64
%108 = llvm.insertvalue %107, %106[3, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<1 x i64>, array<1 x i64>)>
%109 = llvm.mlir.constant(1 : index) : i64
%110 = llvm.insertvalue %109, %108[4, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<1 x i64>, array<1 x i64>)>
%111 = llvm.extractvalue %110[1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<1 x i64>, array<1 x i64>)>
%112 = llvm.getelementptr %111[%1] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32>
%113 = llvm.bitcast %112 : !llvm.ptr<i32> to !llvm.ptr<vector<4xi32>>
%114 = llvm.load %113 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>>
%115 = llvm.mlir.undef : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%116 = llvm.extractvalue %53[0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%117 = llvm.bitcast %116 : !llvm.ptr<i32> to !llvm.ptr<i32>
%118 = llvm.insertvalue %117, %115[0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%119 = llvm.extractvalue %53[1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%120 = llvm.bitcast %119 : !llvm.ptr<i32> to !llvm.ptr<i32>
%121 = llvm.insertvalue %120, %118[1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%122 = llvm.extractvalue %53[4, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%123 = llvm.extractvalue %53[4, 1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%124 = llvm.extractvalue %53[2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%125 = llvm.mul %67, %122 : i64
%126 = llvm.add %124, %125 : i64
%127 = llvm.mul %69, %123 : i64
%128 = llvm.add %126, %127 : i64
%129 = llvm.insertvalue %128, %121[2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%130 = llvm.mlir.constant(32 : i64) : i64
%131 = llvm.mlir.constant(1 : i64) : i64
%132 = llvm.insertvalue %130, %129[3, 1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%133 = llvm.insertvalue %131, %132[4, 1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%134 = llvm.mlir.constant(32 : i64) : i64
%135 = llvm.mlir.constant(256 : i64) : i64
%136 = llvm.insertvalue %134, %133[3, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%137 = llvm.insertvalue %135, %136[4, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%138 = llvm.mlir.undef : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%139 = llvm.extractvalue %28[0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%140 = llvm.bitcast %139 : !llvm.ptr<i32> to !llvm.ptr<i32>
%141 = llvm.insertvalue %140, %138[0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%142 = llvm.extractvalue %28[1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%143 = llvm.bitcast %142 : !llvm.ptr<i32> to !llvm.ptr<i32>
%144 = llvm.insertvalue %143, %141[1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%145 = llvm.extractvalue %28[4, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%146 = llvm.extractvalue %28[4, 1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%147 = llvm.extractvalue %28[4, 2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%148 = llvm.extractvalue %28[2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%149 = llvm.mul %67, %145 : i64
%150 = llvm.add %148, %149 : i64
%151 = llvm.mul %69, %146 : i64
%152 = llvm.add %150, %151 : i64
%153 = llvm.mlir.constant(0 : i64) : i64
%154 = llvm.mul %153, %147 : i64
%155 = llvm.add %152, %154 : i64
%156 = llvm.insertvalue %155, %144[2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%157 = llvm.mlir.constant(128 : i64) : i64
%158 = llvm.mlir.constant(1 : i64) : i64
%159 = llvm.insertvalue %157, %156[3, 2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%160 = llvm.insertvalue %158, %159[4, 2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%161 = llvm.mlir.constant(32 : i64) : i64
%162 = llvm.mlir.constant(128 : i64) : i64
%163 = llvm.insertvalue %161, %160[3, 1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%164 = llvm.insertvalue %162, %163[4, 1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%165 = llvm.mlir.constant(32 : i64) : i64
%166 = llvm.mlir.constant(32768 : i64) : i64
%167 = llvm.insertvalue %165, %164[3, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%168 = llvm.insertvalue %166, %167[4, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%169 = llvm.mlir.undef : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%170 = llvm.extractvalue %90[0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%171 = llvm.insertvalue %170, %169[0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%172 = llvm.extractvalue %90[1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%173 = llvm.insertvalue %172, %171[1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%174 = llvm.extractvalue %90[2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%175 = llvm.insertvalue %174, %173[2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%176 = llvm.mlir.constant(1 : index) : i64
%177 = llvm.mlir.constant(4 : index) : i64
%178 = llvm.insertvalue %176, %175[3, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%179 = llvm.insertvalue %177, %178[3, 1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%180 = llvm.mlir.constant(4 : index) : i64
%181 = llvm.insertvalue %180, %179[4, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%182 = llvm.mlir.constant(1 : index) : i64
%183 = llvm.insertvalue %182, %181[4, 1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
llvm.br ^bb1(%1 : i64)
^bb1(%184: i64): // 2 preds: ^bb0, ^bb10
%185 = llvm.icmp "slt" %184, %6 : i64
llvm.cond_br %185, ^bb2(%1 : i64), ^bb11
^bb2(%186: i64): // 2 preds: ^bb1, ^bb9
%187 = llvm.icmp "slt" %186, %6 : i64
llvm.cond_br %187, ^bb3, ^bb10
^bb3: // pred: ^bb2
%188 = llvm.mlir.undef : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%189 = llvm.extractvalue %168[0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%190 = llvm.bitcast %189 : !llvm.ptr<i32> to !llvm.ptr<i32>
%191 = llvm.insertvalue %190, %188[0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%192 = llvm.extractvalue %168[1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%193 = llvm.bitcast %192 : !llvm.ptr<i32> to !llvm.ptr<i32>
%194 = llvm.insertvalue %193, %191[1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%195 = llvm.extractvalue %168[4, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%196 = llvm.extractvalue %168[4, 1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%197 = llvm.extractvalue %168[4, 2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%198 = llvm.extractvalue %168[2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%199 = llvm.mul %184, %195 : i64
%200 = llvm.add %198, %199 : i64
%201 = llvm.mul %186, %196 : i64
%202 = llvm.add %200, %201 : i64
%203 = llvm.mlir.constant(0 : i64) : i64
%204 = llvm.mul %203, %197 : i64
%205 = llvm.add %202, %204 : i64
%206 = llvm.insertvalue %205, %194[2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%207 = llvm.mlir.constant(128 : i64) : i64
%208 = llvm.mlir.constant(1 : i64) : i64
%209 = llvm.insertvalue %207, %206[3, 1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%210 = llvm.insertvalue %208, %209[4, 1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%211 = llvm.mlir.constant(4 : i64) : i64
%212 = llvm.mlir.constant(128 : i64) : i64
%213 = llvm.insertvalue %211, %210[3, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%214 = llvm.insertvalue %212, %213[4, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%215 = llvm.mlir.undef : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%216 = llvm.extractvalue %214[0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%217 = llvm.insertvalue %216, %215[0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%218 = llvm.extractvalue %214[1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%219 = llvm.insertvalue %218, %217[1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%220 = llvm.extractvalue %214[2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%221 = llvm.insertvalue %220, %219[2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%222 = llvm.mlir.constant(1 : index) : i64
%223 = llvm.mlir.constant(4 : index) : i64
%224 = llvm.mlir.constant(128 : index) : i64
%225 = llvm.insertvalue %222, %221[3, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%226 = llvm.insertvalue %223, %225[3, 1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%227 = llvm.insertvalue %224, %226[3, 2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%228 = llvm.mlir.constant(512 : index) : i64
%229 = llvm.insertvalue %228, %227[4, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%230 = llvm.mlir.constant(128 : index) : i64
%231 = llvm.insertvalue %230, %229[4, 1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%232 = llvm.mlir.constant(1 : index) : i64
%233 = llvm.insertvalue %232, %231[4, 2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%234 = llvm.mlir.undef : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<1 x i64>, array<1 x i64>)>
%235 = llvm.extractvalue %137[0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%236 = llvm.bitcast %235 : !llvm.ptr<i32> to !llvm.ptr<i32>
%237 = llvm.insertvalue %236, %234[0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<1 x i64>, array<1 x i64>)>
%238 = llvm.extractvalue %137[1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%239 = llvm.bitcast %238 : !llvm.ptr<i32> to !llvm.ptr<i32>
%240 = llvm.insertvalue %239, %237[1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<1 x i64>, array<1 x i64>)>
%241 = llvm.extractvalue %137[4, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%242 = llvm.extractvalue %137[4, 1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%243 = llvm.extractvalue %137[2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%244 = llvm.mul %184, %241 : i64
%245 = llvm.add %243, %244 : i64
%246 = llvm.mul %186, %242 : i64
%247 = llvm.add %245, %246 : i64
%248 = llvm.insertvalue %247, %240[2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<1 x i64>, array<1 x i64>)>
%249 = llvm.mlir.constant(4 : i64) : i64
%250 = llvm.mlir.constant(1 : i64) : i64
%251 = llvm.insertvalue %249, %248[3, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<1 x i64>, array<1 x i64>)>
%252 = llvm.insertvalue %250, %251[4, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<1 x i64>, array<1 x i64>)>
%253 = llvm.mlir.undef : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%254 = llvm.extractvalue %252[0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<1 x i64>, array<1 x i64>)>
%255 = llvm.insertvalue %254, %253[0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%256 = llvm.extractvalue %252[1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<1 x i64>, array<1 x i64>)>
%257 = llvm.insertvalue %256, %255[1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%258 = llvm.extractvalue %252[2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<1 x i64>, array<1 x i64>)>
%259 = llvm.insertvalue %258, %257[2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%260 = llvm.mlir.constant(1 : index) : i64
%261 = llvm.mlir.constant(4 : index) : i64
%262 = llvm.insertvalue %260, %259[3, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%263 = llvm.insertvalue %261, %262[3, 1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%264 = llvm.mlir.constant(4 : index) : i64
%265 = llvm.insertvalue %264, %263[4, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%266 = llvm.mlir.constant(1 : index) : i64
%267 = llvm.insertvalue %266, %265[4, 1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
llvm.br ^bb4(%1 : i64)
^bb4(%268: i64): // 2 preds: ^bb3, ^bb8
%269 = llvm.icmp "slt" %268, %3 : i64
llvm.cond_br %269, ^bb5, ^bb9
^bb5: // pred: ^bb4
%270 = llvm.mlir.undef : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%271 = llvm.extractvalue %233[0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%272 = llvm.bitcast %271 : !llvm.ptr<i32> to !llvm.ptr<i32>
%273 = llvm.insertvalue %272, %270[0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%274 = llvm.extractvalue %233[1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%275 = llvm.bitcast %274 : !llvm.ptr<i32> to !llvm.ptr<i32>
%276 = llvm.insertvalue %275, %273[1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%277 = llvm.extractvalue %233[4, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%278 = llvm.extractvalue %233[4, 1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%279 = llvm.extractvalue %233[4, 2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%280 = llvm.extractvalue %233[2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%281 = llvm.mlir.constant(0 : i64) : i64
%282 = llvm.mul %281, %277 : i64
%283 = llvm.add %280, %282 : i64
%284 = llvm.mlir.constant(0 : i64) : i64
%285 = llvm.mul %284, %278 : i64
%286 = llvm.add %283, %285 : i64
%287 = llvm.mul %268, %279 : i64
%288 = llvm.add %286, %287 : i64
%289 = llvm.insertvalue %288, %276[2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%290 = llvm.mlir.constant(4 : i64) : i64
%291 = llvm.mlir.constant(1 : i64) : i64
%292 = llvm.insertvalue %290, %289[3, 1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%293 = llvm.insertvalue %291, %292[4, 1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%294 = llvm.mlir.constant(4 : i64) : i64
%295 = llvm.mlir.constant(128 : i64) : i64
%296 = llvm.insertvalue %294, %293[3, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%297 = llvm.insertvalue %295, %296[4, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%298 = llvm.mlir.undef : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%299 = llvm.extractvalue %297[0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%300 = llvm.insertvalue %299, %298[0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%301 = llvm.extractvalue %297[1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%302 = llvm.insertvalue %301, %300[1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%303 = llvm.extractvalue %297[2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%304 = llvm.insertvalue %303, %302[2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%305 = llvm.mlir.constant(1 : index) : i64
%306 = llvm.mlir.constant(4 : index) : i64
%307 = llvm.mlir.constant(4 : index) : i64
%308 = llvm.insertvalue %305, %304[3, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%309 = llvm.insertvalue %306, %308[3, 1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%310 = llvm.insertvalue %307, %309[3, 2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%311 = llvm.mlir.constant(512 : index) : i64
%312 = llvm.insertvalue %311, %310[4, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%313 = llvm.mlir.constant(128 : index) : i64
%314 = llvm.insertvalue %313, %312[4, 1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%315 = llvm.mlir.constant(1 : index) : i64
%316 = llvm.insertvalue %315, %314[4, 2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
llvm.br ^bb6(%1 : i64)
^bb6(%317: i64): // 2 preds: ^bb5, ^bb7
%318 = llvm.icmp "slt" %317, %4 : i64
llvm.cond_br %318, ^bb7, ^bb8
^bb7: // pred: ^bb6
%319 = llvm.mlir.undef : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<1 x i64>, array<1 x i64>)>
%320 = llvm.extractvalue %316[0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%321 = llvm.bitcast %320 : !llvm.ptr<i32> to !llvm.ptr<i32>
%322 = llvm.insertvalue %321, %319[0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<1 x i64>, array<1 x i64>)>
%323 = llvm.extractvalue %316[1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%324 = llvm.bitcast %323 : !llvm.ptr<i32> to !llvm.ptr<i32>
%325 = llvm.insertvalue %324, %322[1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<1 x i64>, array<1 x i64>)>
%326 = llvm.extractvalue %316[4, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%327 = llvm.extractvalue %316[4, 1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%328 = llvm.extractvalue %316[4, 2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%329 = llvm.extractvalue %316[2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%330 = llvm.mlir.constant(0 : i64) : i64
%331 = llvm.mul %330, %326 : i64
%332 = llvm.add %329, %331 : i64
%333 = llvm.mul %317, %327 : i64
%334 = llvm.add %332, %333 : i64
%335 = llvm.mlir.constant(0 : i64) : i64
%336 = llvm.mul %335, %328 : i64
%337 = llvm.add %334, %336 : i64
%338 = llvm.insertvalue %337, %325[2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<1 x i64>, array<1 x i64>)>
%339 = llvm.mlir.constant(4 : i64) : i64
%340 = llvm.mlir.constant(1 : i64) : i64
%341 = llvm.insertvalue %339, %338[3, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<1 x i64>, array<1 x i64>)>
%342 = llvm.insertvalue %340, %341[4, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<1 x i64>, array<1 x i64>)>
%343 = llvm.mlir.undef : !llvm.struct<(ptr<i32>, ptr<i32>, i64)>
%344 = llvm.extractvalue %267[0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%345 = llvm.bitcast %344 : !llvm.ptr<i32> to !llvm.ptr<i32>
%346 = llvm.insertvalue %345, %343[0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64)>
%347 = llvm.extractvalue %267[1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%348 = llvm.bitcast %347 : !llvm.ptr<i32> to !llvm.ptr<i32>
%349 = llvm.insertvalue %348, %346[1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64)>
%350 = llvm.extractvalue %267[4, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%351 = llvm.extractvalue %267[4, 1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%352 = llvm.extractvalue %267[2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%353 = llvm.mlir.constant(0 : i64) : i64
%354 = llvm.mul %353, %350 : i64
%355 = llvm.add %352, %354 : i64
%356 = llvm.mul %317, %351 : i64
%357 = llvm.add %355, %356 : i64
%358 = llvm.insertvalue %357, %349[2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64)>
%359 = llvm.mlir.undef : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<4 x i64>, array<4 x i64>)>
%360 = llvm.extractvalue %342[0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<1 x i64>, array<1 x i64>)>
%361 = llvm.insertvalue %360, %359[0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<4 x i64>, array<4 x i64>)>
%362 = llvm.extractvalue %342[1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<1 x i64>, array<1 x i64>)>
%363 = llvm.insertvalue %362, %361[1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<4 x i64>, array<4 x i64>)>
%364 = llvm.extractvalue %342[2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<1 x i64>, array<1 x i64>)>
%365 = llvm.insertvalue %364, %363[2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<4 x i64>, array<4 x i64>)>
%366 = llvm.mlir.constant(1 : index) : i64
%367 = llvm.mlir.constant(1 : index) : i64
%368 = llvm.mlir.constant(1 : index) : i64
%369 = llvm.mlir.constant(4 : index) : i64
%370 = llvm.insertvalue %366, %365[3, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<4 x i64>, array<4 x i64>)>
%371 = llvm.insertvalue %367, %370[3, 1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<4 x i64>, array<4 x i64>)>
%372 = llvm.insertvalue %368, %371[3, 2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<4 x i64>, array<4 x i64>)>
%373 = llvm.insertvalue %369, %372[3, 3] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<4 x i64>, array<4 x i64>)>
%374 = llvm.mlir.constant(4 : index) : i64
%375 = llvm.insertvalue %374, %373[4, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<4 x i64>, array<4 x i64>)>
%376 = llvm.mlir.constant(4 : index) : i64
%377 = llvm.insertvalue %376, %375[4, 1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<4 x i64>, array<4 x i64>)>
%378 = llvm.mlir.constant(4 : index) : i64
%379 = llvm.insertvalue %378, %377[4, 2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<4 x i64>, array<4 x i64>)>
%380 = llvm.mlir.constant(1 : index) : i64
%381 = llvm.insertvalue %380, %379[4, 3] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<4 x i64>, array<4 x i64>)>
%382 = llvm.extractvalue %381[1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<4 x i64>, array<4 x i64>)>
%383 = llvm.extractvalue %381[2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<4 x i64>, array<4 x i64>)>
%384 = llvm.mlir.constant(4 : index) : i64
%385 = llvm.mul %1, %384 : i64
%386 = llvm.add %383, %385 : i64
%387 = llvm.mlir.constant(4 : index) : i64
%388 = llvm.mul %1, %387 : i64
%389 = llvm.add %386, %388 : i64
%390 = llvm.mlir.constant(4 : index) : i64
%391 = llvm.mul %1, %390 : i64
%392 = llvm.add %389, %391 : i64
%393 = llvm.add %392, %1 : i64
%394 = llvm.getelementptr %382[%393] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32>
%395 = llvm.bitcast %394 : !llvm.ptr<i32> to !llvm.ptr<vector<4xi32>>
%396 = llvm.load %395 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>>
%397 = llvm.add %396, %114 : vector<4xi32>
%398 = llvm.extractvalue %110[1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<1 x i64>, array<1 x i64>)>
%399 = llvm.getelementptr %398[%1] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32>
%400 = llvm.bitcast %399 : !llvm.ptr<i32> to !llvm.ptr<vector<4xi32>>
llvm.store %397, %400 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>>
%401 = llvm.extractvalue %183[1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%402 = llvm.mlir.constant(4 : index) : i64
%403 = llvm.mul %1, %402 : i64
%404 = llvm.add %403, %1 : i64
%405 = llvm.getelementptr %401[%404] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32>
%406 = llvm.bitcast %405 : !llvm.ptr<i32> to !llvm.ptr<vector<4xi32>>
%407 = llvm.load %406 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>>
%408 = llvm.extractvalue %358[1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64)>
%409 = llvm.extractvalue %358[2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64)>
%410 = llvm.getelementptr %408[%409] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32>
%411 = llvm.load %410 : !llvm.ptr<i32>
%412 = llvm.mlir.undef : vector<1xi32>
%413 = llvm.mlir.constant(0 : i32) : i32
%414 = llvm.insertelement %411, %412[%413 : i32] : vector<1xi32>
%415 = llvm.mlir.constant(0 : index) : i64
%416 = llvm.extractelement %414[%415 : i64] : vector<1xi32>
%417 = "llvm.intr.vector.reduce.add"(%407) : (vector<4xi32>) -> i32
%418 = llvm.add %416, %417 : i32
%419 = llvm.insertelement %418, %2[%1 : i64] : vector<1xi32>
%420 = llvm.mlir.constant(0 : i64) : i64
%421 = llvm.extractelement %419[%420 : i64] : vector<1xi32>
%422 = llvm.mlir.undef : vector<1xi32>
%423 = llvm.mlir.constant(0 : i32) : i32
%424 = llvm.insertelement %421, %422[%423 : i32] : vector<1xi32>
%425 = llvm.mlir.constant(0 : index) : i64
%426 = llvm.extractelement %424[%425 : i64] : vector<1xi32>
%427 = llvm.extractvalue %358[1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64)>
%428 = llvm.extractvalue %358[2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64)>
%429 = llvm.getelementptr %427[%428] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32>
llvm.store %426, %429 : !llvm.ptr<i32>
%430 = llvm.add %317, %5 : i64
llvm.br ^bb6(%430 : i64)
^bb8: // pred: ^bb6
%431 = llvm.add %268, %4 : i64
llvm.br ^bb4(%431 : i64)
^bb9: // pred: ^bb4
%432 = llvm.add %186, %4 : i64
llvm.br ^bb2(%432 : i64)
^bb10: // pred: ^bb2
%433 = llvm.add %184, %5 : i64
llvm.br ^bb1(%433 : i64)
^bb11: // pred: ^bb1
%434 = llvm.mlir.constant(0 : i32) : i32
llvm.return %434 : i32
}
}
}
}
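  // Host side of the test: a 64 MiB transient buffer is filled with 1s, the
  // 512x256x128 reduction dispatch writes its 512x256xi32 result into the first
  // half of a 1 MiB external buffer, the second half is filled with the expected
  // value 128, and the two halves are compared with check.expect_eq.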
func.func private @_split_reduction_pass2() {
%c1048576 = arith.constant 1048576 : index
%c0 = arith.constant 0 : index
%c524288 = arith.constant 524288 : index
%c67108864 = arith.constant 67108864 : index
%c1_i32 = arith.constant 1 : i32
%c128_i32 = arith.constant 128 : i32
%c512 = arith.constant 512 : index
%c256 = arith.constant 256 : index
%c1 = arith.constant 1 : index
%0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c67108864}
%1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c67108864}) {
stream.cmd.fill %c1_i32, %arg0[%c0 for %c67108864] : i32 -> !stream.resource<transient>{%c67108864}
} => !stream.timepoint
%2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c67108864}
%3 = util.do_not_optimize(%2) : !stream.resource<transient>
%4 = stream.resource.size %3 : !stream.resource<transient>
%5 = stream.resource.alloc uninitialized : !stream.resource<external>{%c1048576}
%6 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%4}, %5 as %arg1: !stream.resource<external>{%c1048576}) {
stream.cmd.concurrent {
stream.cmd.dispatch @_split_reduction_pass2_dispatch_0::@_split_reduction_pass2_dispatch_0_generic_512x256x128[%c512, %c256, %c1] {
ro %arg0[%c0 for %4] : !stream.resource<transient>{%4},
wo %arg1[%c0 for %c1048576] : !stream.resource<external>{%c1048576}
} attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>]}
stream.cmd.fill %c128_i32, %arg1[%c524288 for %c524288] : i32 -> !stream.resource<external>{%c1048576}
}
} => !stream.timepoint
%7 = stream.timepoint.await %6 => %5 : !stream.resource<external>{%c1048576}
%8 = stream.resource.subview %7[%c0] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288}
%9 = stream.resource.subview %7[%c524288] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288}
%10 = stream.tensor.export %8 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
%11 = stream.tensor.export %9 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
check.expect_eq(%10, %11) : tensor<512x256xi32>
return
}
}
// -----// IR Dump After ReconcileUnrealizedCasts (reconcile-unrealized-casts) ('builtin.module' operation) //----- //
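// reconcile-unrealized-casts removes the builtin.unrealized_conversion_cast ops
// left over from the preceding dialect conversions, so the executable body below
// is pure LLVM-dialect IR.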
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
func.func @split_reduction_pass2() attributes {iree.abi.stub} {
call @_split_reduction_pass2() : () -> ()
return
}
hal.executable private @_split_reduction_pass2_dispatch_0 {
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
hal.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 ordinal(0) layout(#pipeline_layout) attributes {translation_info = #translation} {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index):
%c1 = arith.constant 1 : index
%c16 = arith.constant 16 : index
%c8 = arith.constant 8 : index
hal.return %c8, %c16, %c1 : index, index, index
}
builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} {
llvm.func internal @_split_reduction_pass2_dispatch_0_generic_512x256x128(%arg0: !llvm.ptr<struct<"iree_hal_executable_environment_v0_t", (ptr<i32>, ptr<func<i32 (ptr<func<i32 (ptr<i8>)>>, ptr<i8>)>>, ptr<ptr<func<i32 (ptr<i8>)>>>, struct<"iree_hal_processor_v0_t", (array<8 x i64>)>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>> {llvm.align = 16 : i64, llvm.noalias}) -> i32 {
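          // Arguments follow the IREE CPU executable ABI: %arg0 is the executable
          // environment, %arg1 the dispatch state (field [10] is loaded below to
          // reach the buffer binding base pointers), and %arg2 the per-workgroup
          // state (fields [0] and [1] supply the workgroup ids).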
%0 = llvm.mlir.constant(dense<0> : vector<4xi32>) : vector<4xi32>
%1 = llvm.mlir.constant(0 : index) : i64
%2 = llvm.mlir.constant(dense<0> : vector<1xi32>) : vector<1xi32>
%3 = llvm.mlir.constant(128 : index) : i64
%4 = llvm.mlir.constant(4 : index) : i64
%5 = llvm.mlir.constant(1 : index) : i64
%6 = llvm.mlir.constant(32 : index) : i64
%7 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>>
%8 = llvm.extractvalue %7[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>
%9 = llvm.mlir.constant(0 : i64) : i64
%10 = llvm.load %8 : !llvm.ptr<ptr<i8>>
%11 = llvm.bitcast %10 : !llvm.ptr<i8> to !llvm.ptr<i32>
%12 = llvm.mlir.undef : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%13 = llvm.insertvalue %11, %12[0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%14 = llvm.insertvalue %11, %13[1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%15 = llvm.mlir.constant(0 : index) : i64
%16 = llvm.insertvalue %15, %14[2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%17 = llvm.mlir.constant(512 : index) : i64
%18 = llvm.insertvalue %17, %16[3, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%19 = llvm.mlir.constant(32768 : index) : i64
%20 = llvm.insertvalue %19, %18[4, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%21 = llvm.mlir.constant(256 : index) : i64
%22 = llvm.insertvalue %21, %20[3, 1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%23 = llvm.mlir.constant(128 : index) : i64
%24 = llvm.insertvalue %23, %22[4, 1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%25 = llvm.mlir.constant(128 : index) : i64
%26 = llvm.insertvalue %25, %24[3, 2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%27 = llvm.mlir.constant(1 : index) : i64
%28 = llvm.insertvalue %27, %26[4, 2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%29 = llvm.mlir.constant(0 : index) : i64
%30 = llvm.mlir.constant(63 : index) : i64
%31 = llvm.ptrtoint %11 : !llvm.ptr<i32> to i64
%32 = llvm.and %31, %30 : i64
%33 = llvm.icmp "eq" %32, %29 : i64
"llvm.intr.assume"(%33) : (i1) -> ()
%34 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>>
%35 = llvm.extractvalue %34[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>
%36 = llvm.mlir.constant(1 : i64) : i64
%37 = llvm.getelementptr %35[1] : (!llvm.ptr<ptr<i8>>) -> !llvm.ptr<ptr<i8>>
%38 = llvm.load %37 : !llvm.ptr<ptr<i8>>
%39 = llvm.bitcast %38 : !llvm.ptr<i8> to !llvm.ptr<i32>
%40 = llvm.mlir.undef : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%41 = llvm.insertvalue %39, %40[0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%42 = llvm.insertvalue %39, %41[1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%43 = llvm.mlir.constant(0 : index) : i64
%44 = llvm.insertvalue %43, %42[2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%45 = llvm.mlir.constant(512 : index) : i64
%46 = llvm.insertvalue %45, %44[3, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%47 = llvm.mlir.constant(256 : index) : i64
%48 = llvm.insertvalue %47, %46[4, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%49 = llvm.mlir.constant(256 : index) : i64
%50 = llvm.insertvalue %49, %48[3, 1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%51 = llvm.mlir.constant(1 : index) : i64
%52 = llvm.insertvalue %51, %50[4, 1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%53 = llvm.mlir.constant(0 : index) : i64
%54 = llvm.mlir.constant(63 : index) : i64
%55 = llvm.ptrtoint %39 : !llvm.ptr<i32> to i64
%56 = llvm.and %55, %54 : i64
%57 = llvm.icmp "eq" %56, %53 : i64
"llvm.intr.assume"(%57) : (i1) -> ()
%58 = llvm.load %arg2 : !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>>
%59 = llvm.extractvalue %58[0] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>
%60 = llvm.zext %59 : i32 to i64
%61 = llvm.load %arg2 : !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>>
%62 = llvm.extractvalue %61[1] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>
%63 = llvm.zext %62 : i32 to i64
%64 = llvm.mlir.constant(32 : index) : i64
%65 = llvm.mul %63, %64 : i64
%66 = llvm.mlir.constant(32 : index) : i64
%67 = llvm.mul %60, %66 : i64
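          // Workgroup ids are scaled by the 32x32 tile size: %65 (from workgroup
          // state field [1]) is the row base and %67 (from field [0]) the column
          // base into the 512x256 output.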
%68 = llvm.mlir.constant(1 : index) : i64
%69 = llvm.mlir.constant(1 : index) : i64
%70 = llvm.mlir.constant(4 : index) : i64
%71 = llvm.mlir.constant(1 : index) : i64
%72 = llvm.mlir.constant(4 : index) : i64
%73 = llvm.mlir.constant(4 : index) : i64
%74 = llvm.mlir.null : !llvm.ptr<i32>
%75 = llvm.getelementptr %74[4] : (!llvm.ptr<i32>) -> !llvm.ptr<i32>
%76 = llvm.ptrtoint %75 : !llvm.ptr<i32> to i64
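          // The null-GEP/ptrtoint pair computes a size in bytes; the alloca below
          // reserves a 128-byte-aligned stack slot that the kernel uses as its
          // vector<4xi32> reduction temporary.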
%77 = llvm.alloca %76 x i32 {alignment = 128 : i64} : (i64) -> !llvm.ptr<i32>
%78 = llvm.mlir.undef : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%79 = llvm.insertvalue %77, %78[0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%80 = llvm.insertvalue %77, %79[1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%81 = llvm.mlir.constant(0 : index) : i64
%82 = llvm.insertvalue %81, %80[2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%83 = llvm.insertvalue %68, %82[3, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%84 = llvm.insertvalue %69, %83[3, 1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%85 = llvm.insertvalue %70, %84[3, 2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%86 = llvm.insertvalue %72, %85[4, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%87 = llvm.insertvalue %70, %86[4, 1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%88 = llvm.insertvalue %71, %87[4, 2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%89 = llvm.mlir.constant(4 : index) : i64
%90 = llvm.mul %1, %89 : i64
%91 = llvm.mlir.constant(4 : index) : i64
%92 = llvm.mul %1, %91 : i64
%93 = llvm.add %90, %92 : i64
%94 = llvm.add %93, %1 : i64
%95 = llvm.getelementptr %77[%94] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32>
%96 = llvm.bitcast %95 : !llvm.ptr<i32> to !llvm.ptr<vector<4xi32>>
llvm.store %0, %96 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>>
%97 = llvm.mlir.undef : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<1 x i64>, array<1 x i64>)>
%98 = llvm.insertvalue %77, %97[0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<1 x i64>, array<1 x i64>)>
%99 = llvm.insertvalue %77, %98[1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<1 x i64>, array<1 x i64>)>
%100 = llvm.insertvalue %81, %99[2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<1 x i64>, array<1 x i64>)>
%101 = llvm.mlir.constant(4 : index) : i64
%102 = llvm.insertvalue %101, %100[3, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<1 x i64>, array<1 x i64>)>
%103 = llvm.mlir.constant(1 : index) : i64
%104 = llvm.insertvalue %103, %102[4, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<1 x i64>, array<1 x i64>)>
%105 = llvm.bitcast %77 : !llvm.ptr<i32> to !llvm.ptr<vector<4xi32>>
%106 = llvm.load %105 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>>
%107 = llvm.mlir.undef : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%108 = llvm.insertvalue %39, %107[0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%109 = llvm.insertvalue %39, %108[1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%110 = llvm.mul %65, %47 : i64
%111 = llvm.add %43, %110 : i64
%112 = llvm.mul %67, %51 : i64
%113 = llvm.add %111, %112 : i64
%114 = llvm.insertvalue %113, %109[2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%115 = llvm.mlir.constant(32 : i64) : i64
%116 = llvm.mlir.constant(1 : i64) : i64
%117 = llvm.insertvalue %115, %114[3, 1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%118 = llvm.insertvalue %116, %117[4, 1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%119 = llvm.mlir.constant(32 : i64) : i64
%120 = llvm.mlir.constant(256 : i64) : i64
%121 = llvm.insertvalue %119, %118[3, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%122 = llvm.insertvalue %120, %121[4, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%123 = llvm.mlir.undef : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%124 = llvm.insertvalue %11, %123[0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%125 = llvm.insertvalue %11, %124[1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%126 = llvm.mul %65, %19 : i64
%127 = llvm.add %15, %126 : i64
%128 = llvm.mul %67, %23 : i64
%129 = llvm.add %127, %128 : i64
%130 = llvm.mlir.constant(0 : i64) : i64
%131 = llvm.mul %130, %27 : i64
%132 = llvm.add %129, %131 : i64
%133 = llvm.insertvalue %132, %125[2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%134 = llvm.mlir.constant(128 : i64) : i64
%135 = llvm.mlir.constant(1 : i64) : i64
%136 = llvm.insertvalue %134, %133[3, 2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%137 = llvm.insertvalue %135, %136[4, 2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%138 = llvm.mlir.constant(32 : i64) : i64
%139 = llvm.mlir.constant(128 : i64) : i64
%140 = llvm.insertvalue %138, %137[3, 1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%141 = llvm.insertvalue %139, %140[4, 1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%142 = llvm.mlir.constant(32 : i64) : i64
%143 = llvm.mlir.constant(32768 : i64) : i64
%144 = llvm.insertvalue %142, %141[3, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%145 = llvm.insertvalue %143, %144[4, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%146 = llvm.mlir.undef : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%147 = llvm.insertvalue %77, %146[0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%148 = llvm.insertvalue %77, %147[1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%149 = llvm.insertvalue %81, %148[2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%150 = llvm.mlir.constant(1 : index) : i64
%151 = llvm.mlir.constant(4 : index) : i64
%152 = llvm.insertvalue %150, %149[3, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%153 = llvm.insertvalue %151, %152[3, 1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%154 = llvm.mlir.constant(4 : index) : i64
%155 = llvm.insertvalue %154, %153[4, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%156 = llvm.mlir.constant(1 : index) : i64
%157 = llvm.insertvalue %156, %155[4, 1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
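          // Loop nest over a 32x32 output tile: ^bb1 steps rows by 1, ^bb2 steps
          // columns by 4, ^bb4 walks the 128-element reduction dimension in
          // steps of 4, and ^bb7 folds a vector<4xi32> slice of the input into
          // each of the 4 output columns via llvm.intr.vector.reduce.add.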
llvm.br ^bb1(%1 : i64)
^bb1(%158: i64): // 2 preds: ^bb0, ^bb10
%159 = llvm.icmp "slt" %158, %6 : i64
llvm.cond_br %159, ^bb2(%1 : i64), ^bb11
^bb2(%160: i64): // 2 preds: ^bb1, ^bb9
%161 = llvm.icmp "slt" %160, %6 : i64
llvm.cond_br %161, ^bb3, ^bb10
^bb3: // pred: ^bb2
%162 = llvm.mlir.undef : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%163 = llvm.insertvalue %11, %162[0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%164 = llvm.insertvalue %11, %163[1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%165 = llvm.mul %158, %143 : i64
%166 = llvm.add %132, %165 : i64
%167 = llvm.mul %160, %139 : i64
%168 = llvm.add %166, %167 : i64
%169 = llvm.mlir.constant(0 : i64) : i64
%170 = llvm.mul %169, %135 : i64
%171 = llvm.add %168, %170 : i64
%172 = llvm.insertvalue %171, %164[2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%173 = llvm.mlir.constant(128 : i64) : i64
%174 = llvm.mlir.constant(1 : i64) : i64
%175 = llvm.insertvalue %173, %172[3, 1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%176 = llvm.insertvalue %174, %175[4, 1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%177 = llvm.mlir.constant(4 : i64) : i64
%178 = llvm.mlir.constant(128 : i64) : i64
%179 = llvm.insertvalue %177, %176[3, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%180 = llvm.insertvalue %178, %179[4, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%181 = llvm.mlir.undef : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%182 = llvm.insertvalue %11, %181[0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%183 = llvm.insertvalue %11, %182[1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%184 = llvm.insertvalue %171, %183[2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%185 = llvm.mlir.constant(1 : index) : i64
%186 = llvm.mlir.constant(4 : index) : i64
%187 = llvm.mlir.constant(128 : index) : i64
%188 = llvm.insertvalue %185, %184[3, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%189 = llvm.insertvalue %186, %188[3, 1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%190 = llvm.insertvalue %187, %189[3, 2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%191 = llvm.mlir.constant(512 : index) : i64
%192 = llvm.insertvalue %191, %190[4, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%193 = llvm.mlir.constant(128 : index) : i64
%194 = llvm.insertvalue %193, %192[4, 1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%195 = llvm.mlir.constant(1 : index) : i64
%196 = llvm.insertvalue %195, %194[4, 2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%197 = llvm.mlir.undef : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<1 x i64>, array<1 x i64>)>
%198 = llvm.insertvalue %39, %197[0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<1 x i64>, array<1 x i64>)>
%199 = llvm.insertvalue %39, %198[1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<1 x i64>, array<1 x i64>)>
%200 = llvm.mul %158, %120 : i64
%201 = llvm.add %113, %200 : i64
%202 = llvm.mul %160, %116 : i64
%203 = llvm.add %201, %202 : i64
%204 = llvm.insertvalue %203, %199[2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<1 x i64>, array<1 x i64>)>
%205 = llvm.mlir.constant(4 : i64) : i64
%206 = llvm.mlir.constant(1 : i64) : i64
%207 = llvm.insertvalue %205, %204[3, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<1 x i64>, array<1 x i64>)>
%208 = llvm.insertvalue %206, %207[4, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<1 x i64>, array<1 x i64>)>
%209 = llvm.mlir.undef : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%210 = llvm.insertvalue %39, %209[0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%211 = llvm.insertvalue %39, %210[1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%212 = llvm.insertvalue %203, %211[2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%213 = llvm.mlir.constant(1 : index) : i64
%214 = llvm.mlir.constant(4 : index) : i64
%215 = llvm.insertvalue %213, %212[3, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%216 = llvm.insertvalue %214, %215[3, 1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%217 = llvm.mlir.constant(4 : index) : i64
%218 = llvm.insertvalue %217, %216[4, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%219 = llvm.mlir.constant(1 : index) : i64
%220 = llvm.insertvalue %219, %218[4, 1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
llvm.br ^bb4(%1 : i64)
^bb4(%221: i64): // 2 preds: ^bb3, ^bb8
%222 = llvm.icmp "slt" %221, %3 : i64
llvm.cond_br %222, ^bb5, ^bb9
^bb5: // pred: ^bb4
%223 = llvm.mlir.undef : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%224 = llvm.insertvalue %11, %223[0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%225 = llvm.insertvalue %11, %224[1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%226 = llvm.mlir.constant(0 : i64) : i64
%227 = llvm.mul %226, %191 : i64
%228 = llvm.add %171, %227 : i64
%229 = llvm.mlir.constant(0 : i64) : i64
%230 = llvm.mul %229, %193 : i64
%231 = llvm.add %228, %230 : i64
%232 = llvm.mul %221, %195 : i64
%233 = llvm.add %231, %232 : i64
%234 = llvm.insertvalue %233, %225[2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%235 = llvm.mlir.constant(4 : i64) : i64
%236 = llvm.mlir.constant(1 : i64) : i64
%237 = llvm.insertvalue %235, %234[3, 1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%238 = llvm.insertvalue %236, %237[4, 1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%239 = llvm.mlir.constant(4 : i64) : i64
%240 = llvm.mlir.constant(128 : i64) : i64
%241 = llvm.insertvalue %239, %238[3, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%242 = llvm.insertvalue %240, %241[4, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%243 = llvm.mlir.undef : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%244 = llvm.insertvalue %11, %243[0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%245 = llvm.insertvalue %11, %244[1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%246 = llvm.insertvalue %233, %245[2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%247 = llvm.mlir.constant(1 : index) : i64
%248 = llvm.mlir.constant(4 : index) : i64
%249 = llvm.mlir.constant(4 : index) : i64
%250 = llvm.insertvalue %247, %246[3, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%251 = llvm.insertvalue %248, %250[3, 1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%252 = llvm.insertvalue %249, %251[3, 2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%253 = llvm.mlir.constant(512 : index) : i64
%254 = llvm.insertvalue %253, %252[4, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%255 = llvm.mlir.constant(128 : index) : i64
%256 = llvm.insertvalue %255, %254[4, 1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%257 = llvm.mlir.constant(1 : index) : i64
%258 = llvm.insertvalue %257, %256[4, 2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
llvm.br ^bb6(%1 : i64)
^bb6(%259: i64): // 2 preds: ^bb5, ^bb7
%260 = llvm.icmp "slt" %259, %4 : i64
llvm.cond_br %260, ^bb7, ^bb8
^bb7: // pred: ^bb6
%261 = llvm.mlir.undef : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<1 x i64>, array<1 x i64>)>
%262 = llvm.insertvalue %11, %261[0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<1 x i64>, array<1 x i64>)>
%263 = llvm.insertvalue %11, %262[1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<1 x i64>, array<1 x i64>)>
%264 = llvm.mlir.constant(0 : i64) : i64
%265 = llvm.mul %264, %253 : i64
%266 = llvm.add %233, %265 : i64
%267 = llvm.mul %259, %255 : i64
%268 = llvm.add %266, %267 : i64
%269 = llvm.mlir.constant(0 : i64) : i64
%270 = llvm.mul %269, %257 : i64
%271 = llvm.add %268, %270 : i64
%272 = llvm.insertvalue %271, %263[2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<1 x i64>, array<1 x i64>)>
%273 = llvm.mlir.constant(4 : i64) : i64
%274 = llvm.mlir.constant(1 : i64) : i64
%275 = llvm.insertvalue %273, %272[3, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<1 x i64>, array<1 x i64>)>
%276 = llvm.insertvalue %274, %275[4, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<1 x i64>, array<1 x i64>)>
%277 = llvm.mlir.undef : !llvm.struct<(ptr<i32>, ptr<i32>, i64)>
%278 = llvm.insertvalue %39, %277[0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64)>
%279 = llvm.insertvalue %39, %278[1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64)>
%280 = llvm.mlir.constant(0 : i64) : i64
%281 = llvm.mul %280, %217 : i64
%282 = llvm.add %203, %281 : i64
%283 = llvm.mul %259, %219 : i64
%284 = llvm.add %282, %283 : i64
%285 = llvm.insertvalue %284, %279[2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64)>
%286 = llvm.mlir.undef : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<4 x i64>, array<4 x i64>)>
%287 = llvm.insertvalue %11, %286[0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<4 x i64>, array<4 x i64>)>
%288 = llvm.insertvalue %11, %287[1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<4 x i64>, array<4 x i64>)>
%289 = llvm.insertvalue %271, %288[2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<4 x i64>, array<4 x i64>)>
%290 = llvm.mlir.constant(1 : index) : i64
%291 = llvm.mlir.constant(1 : index) : i64
%292 = llvm.mlir.constant(1 : index) : i64
%293 = llvm.mlir.constant(4 : index) : i64
%294 = llvm.insertvalue %290, %289[3, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<4 x i64>, array<4 x i64>)>
%295 = llvm.insertvalue %291, %294[3, 1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<4 x i64>, array<4 x i64>)>
%296 = llvm.insertvalue %292, %295[3, 2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<4 x i64>, array<4 x i64>)>
%297 = llvm.insertvalue %293, %296[3, 3] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<4 x i64>, array<4 x i64>)>
%298 = llvm.mlir.constant(4 : index) : i64
%299 = llvm.insertvalue %298, %297[4, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<4 x i64>, array<4 x i64>)>
%300 = llvm.mlir.constant(4 : index) : i64
%301 = llvm.insertvalue %300, %299[4, 1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<4 x i64>, array<4 x i64>)>
%302 = llvm.mlir.constant(4 : index) : i64
%303 = llvm.insertvalue %302, %301[4, 2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<4 x i64>, array<4 x i64>)>
%304 = llvm.mlir.constant(1 : index) : i64
%305 = llvm.insertvalue %304, %303[4, 3] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<4 x i64>, array<4 x i64>)>
%306 = llvm.mlir.constant(4 : index) : i64
%307 = llvm.mul %1, %306 : i64
%308 = llvm.add %271, %307 : i64
%309 = llvm.mlir.constant(4 : index) : i64
%310 = llvm.mul %1, %309 : i64
%311 = llvm.add %308, %310 : i64
%312 = llvm.mlir.constant(4 : index) : i64
%313 = llvm.mul %1, %312 : i64
%314 = llvm.add %311, %313 : i64
%315 = llvm.add %314, %1 : i64
%316 = llvm.getelementptr %11[%315] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32>
%317 = llvm.bitcast %316 : !llvm.ptr<i32> to !llvm.ptr<vector<4xi32>>
%318 = llvm.load %317 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>>
%319 = llvm.add %318, %106 : vector<4xi32>
%320 = llvm.bitcast %77 : !llvm.ptr<i32> to !llvm.ptr<vector<4xi32>>
llvm.store %319, %320 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>>
%321 = llvm.mlir.constant(4 : index) : i64
%322 = llvm.mul %1, %321 : i64
%323 = llvm.add %322, %1 : i64
%324 = llvm.getelementptr %77[%323] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32>
%325 = llvm.bitcast %324 : !llvm.ptr<i32> to !llvm.ptr<vector<4xi32>>
%326 = llvm.load %325 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>>
%327 = llvm.getelementptr %39[%284] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32>
%328 = llvm.load %327 : !llvm.ptr<i32>
%329 = llvm.mlir.undef : vector<1xi32>
%330 = llvm.mlir.constant(0 : i32) : i32
%331 = llvm.insertelement %328, %329[%330 : i32] : vector<1xi32>
%332 = llvm.mlir.constant(0 : index) : i64
%333 = llvm.extractelement %331[%332 : i64] : vector<1xi32>
%334 = "llvm.intr.vector.reduce.add"(%326) : (vector<4xi32>) -> i32
%335 = llvm.add %333, %334 : i32
%336 = llvm.insertelement %335, %2[%1 : i64] : vector<1xi32>
%337 = llvm.mlir.constant(0 : i64) : i64
%338 = llvm.extractelement %336[%337 : i64] : vector<1xi32>
%339 = llvm.mlir.undef : vector<1xi32>
%340 = llvm.mlir.constant(0 : i32) : i32
%341 = llvm.insertelement %338, %339[%340 : i32] : vector<1xi32>
%342 = llvm.mlir.constant(0 : index) : i64
%343 = llvm.extractelement %341[%342 : i64] : vector<1xi32>
%344 = llvm.getelementptr %39[%284] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32>
llvm.store %343, %344 : !llvm.ptr<i32>
%345 = llvm.add %259, %5 : i64
llvm.br ^bb6(%345 : i64)
^bb8: // pred: ^bb6
%346 = llvm.add %221, %4 : i64
llvm.br ^bb4(%346 : i64)
^bb9: // pred: ^bb4
%347 = llvm.add %160, %4 : i64
llvm.br ^bb2(%347 : i64)
^bb10: // pred: ^bb2
%348 = llvm.add %158, %5 : i64
llvm.br ^bb1(%348 : i64)
^bb11: // pred: ^bb1
%349 = llvm.mlir.constant(0 : i32) : i32
llvm.return %349 : i32
}
}
}
}
func.func private @_split_reduction_pass2() {
%c1048576 = arith.constant 1048576 : index
%c0 = arith.constant 0 : index
%c524288 = arith.constant 524288 : index
%c67108864 = arith.constant 67108864 : index
%c1_i32 = arith.constant 1 : i32
%c128_i32 = arith.constant 128 : i32
%c512 = arith.constant 512 : index
%c256 = arith.constant 256 : index
%c1 = arith.constant 1 : index
%0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c67108864}
%1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c67108864}) {
stream.cmd.fill %c1_i32, %arg0[%c0 for %c67108864] : i32 -> !stream.resource<transient>{%c67108864}
} => !stream.timepoint
%2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c67108864}
%3 = util.do_not_optimize(%2) : !stream.resource<transient>
%4 = stream.resource.size %3 : !stream.resource<transient>
%5 = stream.resource.alloc uninitialized : !stream.resource<external>{%c1048576}
%6 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%4}, %5 as %arg1: !stream.resource<external>{%c1048576}) {
stream.cmd.concurrent {
stream.cmd.dispatch @_split_reduction_pass2_dispatch_0::@_split_reduction_pass2_dispatch_0_generic_512x256x128[%c512, %c256, %c1] {
ro %arg0[%c0 for %4] : !stream.resource<transient>{%4},
wo %arg1[%c0 for %c1048576] : !stream.resource<external>{%c1048576}
} attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>]}
stream.cmd.fill %c128_i32, %arg1[%c524288 for %c524288] : i32 -> !stream.resource<external>{%c1048576}
}
} => !stream.timepoint
%7 = stream.timepoint.await %6 => %5 : !stream.resource<external>{%c1048576}
%8 = stream.resource.subview %7[%c0] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288}
%9 = stream.resource.subview %7[%c524288] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288}
%10 = stream.tensor.export %8 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
%11 = stream.tensor.export %9 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
check.expect_eq(%10, %11) : tensor<512x256xi32>
return
}
}
// -----// IR Dump After LLVMCPUSynchronizeSymbolVisibility (iree-llvmcpu-synchronize-symbol-visibility) ('builtin.module' operation) //----- //
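// LLVMCPUSynchronizeSymbolVisibility normalizes symbol visibility inside the
// executable; the visible effect below is the sym_visibility = "private"
// attribute added to the dispatch entry point's llvm.func.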
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
func.func @split_reduction_pass2() attributes {iree.abi.stub} {
call @_split_reduction_pass2() : () -> ()
return
}
hal.executable private @_split_reduction_pass2_dispatch_0 {
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
hal.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 ordinal(0) layout(#pipeline_layout) attributes {translation_info = #translation} {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index):
%c1 = arith.constant 1 : index
%c16 = arith.constant 16 : index
%c8 = arith.constant 8 : index
hal.return %c8, %c16, %c1 : index, index, index
}
builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} {
llvm.func internal @_split_reduction_pass2_dispatch_0_generic_512x256x128(%arg0: !llvm.ptr<struct<"iree_hal_executable_environment_v0_t", (ptr<i32>, ptr<func<i32 (ptr<func<i32 (ptr<i8>)>>, ptr<i8>)>>, ptr<ptr<func<i32 (ptr<i8>)>>>, struct<"iree_hal_processor_v0_t", (array<8 x i64>)>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>> {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {sym_visibility = "private"} {
%0 = llvm.mlir.constant(dense<0> : vector<4xi32>) : vector<4xi32>
%1 = llvm.mlir.constant(0 : index) : i64
%2 = llvm.mlir.constant(dense<0> : vector<1xi32>) : vector<1xi32>
%3 = llvm.mlir.constant(128 : index) : i64
%4 = llvm.mlir.constant(4 : index) : i64
%5 = llvm.mlir.constant(1 : index) : i64
%6 = llvm.mlir.constant(32 : index) : i64
%7 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>>
%8 = llvm.extractvalue %7[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>
%9 = llvm.mlir.constant(0 : i64) : i64
%10 = llvm.load %8 : !llvm.ptr<ptr<i8>>
%11 = llvm.bitcast %10 : !llvm.ptr<i8> to !llvm.ptr<i32>
%12 = llvm.mlir.undef : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%13 = llvm.insertvalue %11, %12[0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%14 = llvm.insertvalue %11, %13[1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%15 = llvm.mlir.constant(0 : index) : i64
%16 = llvm.insertvalue %15, %14[2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%17 = llvm.mlir.constant(512 : index) : i64
%18 = llvm.insertvalue %17, %16[3, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%19 = llvm.mlir.constant(32768 : index) : i64
%20 = llvm.insertvalue %19, %18[4, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%21 = llvm.mlir.constant(256 : index) : i64
%22 = llvm.insertvalue %21, %20[3, 1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%23 = llvm.mlir.constant(128 : index) : i64
%24 = llvm.insertvalue %23, %22[4, 1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%25 = llvm.mlir.constant(128 : index) : i64
%26 = llvm.insertvalue %25, %24[3, 2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%27 = llvm.mlir.constant(1 : index) : i64
%28 = llvm.insertvalue %27, %26[4, 2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%29 = llvm.mlir.constant(0 : index) : i64
%30 = llvm.mlir.constant(63 : index) : i64
%31 = llvm.ptrtoint %11 : !llvm.ptr<i32> to i64
%32 = llvm.and %31, %30 : i64
%33 = llvm.icmp "eq" %32, %29 : i64
"llvm.intr.assume"(%33) : (i1) -> ()
%34 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>>
%35 = llvm.extractvalue %34[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>
%36 = llvm.mlir.constant(1 : i64) : i64
%37 = llvm.getelementptr %35[1] : (!llvm.ptr<ptr<i8>>) -> !llvm.ptr<ptr<i8>>
%38 = llvm.load %37 : !llvm.ptr<ptr<i8>>
%39 = llvm.bitcast %38 : !llvm.ptr<i8> to !llvm.ptr<i32>
%40 = llvm.mlir.undef : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%41 = llvm.insertvalue %39, %40[0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%42 = llvm.insertvalue %39, %41[1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%43 = llvm.mlir.constant(0 : index) : i64
%44 = llvm.insertvalue %43, %42[2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%45 = llvm.mlir.constant(512 : index) : i64
%46 = llvm.insertvalue %45, %44[3, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%47 = llvm.mlir.constant(256 : index) : i64
%48 = llvm.insertvalue %47, %46[4, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%49 = llvm.mlir.constant(256 : index) : i64
%50 = llvm.insertvalue %49, %48[3, 1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%51 = llvm.mlir.constant(1 : index) : i64
%52 = llvm.insertvalue %51, %50[4, 1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%53 = llvm.mlir.constant(0 : index) : i64
%54 = llvm.mlir.constant(63 : index) : i64
%55 = llvm.ptrtoint %39 : !llvm.ptr<i32> to i64
%56 = llvm.and %55, %54 : i64
%57 = llvm.icmp "eq" %56, %53 : i64
"llvm.intr.assume"(%57) : (i1) -> ()
%58 = llvm.load %arg2 : !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>>
%59 = llvm.extractvalue %58[0] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>
%60 = llvm.zext %59 : i32 to i64
%61 = llvm.load %arg2 : !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>>
%62 = llvm.extractvalue %61[1] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>
%63 = llvm.zext %62 : i32 to i64
%64 = llvm.mlir.constant(32 : index) : i64
%65 = llvm.mul %63, %64 : i64
%66 = llvm.mlir.constant(32 : index) : i64
%67 = llvm.mul %60, %66 : i64
%68 = llvm.mlir.constant(1 : index) : i64
%69 = llvm.mlir.constant(1 : index) : i64
%70 = llvm.mlir.constant(4 : index) : i64
%71 = llvm.mlir.constant(1 : index) : i64
%72 = llvm.mlir.constant(4 : index) : i64
%73 = llvm.mlir.constant(4 : index) : i64
%74 = llvm.mlir.null : !llvm.ptr<i32>
%75 = llvm.getelementptr %74[4] : (!llvm.ptr<i32>) -> !llvm.ptr<i32>
%76 = llvm.ptrtoint %75 : !llvm.ptr<i32> to i64
%77 = llvm.alloca %76 x i32 {alignment = 128 : i64} : (i64) -> !llvm.ptr<i32>
%78 = llvm.mlir.undef : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%79 = llvm.insertvalue %77, %78[0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%80 = llvm.insertvalue %77, %79[1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%81 = llvm.mlir.constant(0 : index) : i64
%82 = llvm.insertvalue %81, %80[2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%83 = llvm.insertvalue %68, %82[3, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%84 = llvm.insertvalue %69, %83[3, 1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%85 = llvm.insertvalue %70, %84[3, 2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%86 = llvm.insertvalue %72, %85[4, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%87 = llvm.insertvalue %70, %86[4, 1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%88 = llvm.insertvalue %71, %87[4, 2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%89 = llvm.mlir.constant(4 : index) : i64
%90 = llvm.mul %1, %89 : i64
%91 = llvm.mlir.constant(4 : index) : i64
%92 = llvm.mul %1, %91 : i64
%93 = llvm.add %90, %92 : i64
%94 = llvm.add %93, %1 : i64
%95 = llvm.getelementptr %77[%94] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32>
%96 = llvm.bitcast %95 : !llvm.ptr<i32> to !llvm.ptr<vector<4xi32>>
llvm.store %0, %96 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>>
%97 = llvm.mlir.undef : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<1 x i64>, array<1 x i64>)>
%98 = llvm.insertvalue %77, %97[0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<1 x i64>, array<1 x i64>)>
%99 = llvm.insertvalue %77, %98[1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<1 x i64>, array<1 x i64>)>
%100 = llvm.insertvalue %81, %99[2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<1 x i64>, array<1 x i64>)>
%101 = llvm.mlir.constant(4 : index) : i64
%102 = llvm.insertvalue %101, %100[3, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<1 x i64>, array<1 x i64>)>
%103 = llvm.mlir.constant(1 : index) : i64
%104 = llvm.insertvalue %103, %102[4, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<1 x i64>, array<1 x i64>)>
%105 = llvm.bitcast %77 : !llvm.ptr<i32> to !llvm.ptr<vector<4xi32>>
%106 = llvm.load %105 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>>
%107 = llvm.mlir.undef : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%108 = llvm.insertvalue %39, %107[0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%109 = llvm.insertvalue %39, %108[1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%110 = llvm.mul %65, %47 : i64
%111 = llvm.add %43, %110 : i64
%112 = llvm.mul %67, %51 : i64
%113 = llvm.add %111, %112 : i64
%114 = llvm.insertvalue %113, %109[2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%115 = llvm.mlir.constant(32 : i64) : i64
%116 = llvm.mlir.constant(1 : i64) : i64
%117 = llvm.insertvalue %115, %114[3, 1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%118 = llvm.insertvalue %116, %117[4, 1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%119 = llvm.mlir.constant(32 : i64) : i64
%120 = llvm.mlir.constant(256 : i64) : i64
%121 = llvm.insertvalue %119, %118[3, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%122 = llvm.insertvalue %120, %121[4, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%123 = llvm.mlir.undef : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%124 = llvm.insertvalue %11, %123[0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%125 = llvm.insertvalue %11, %124[1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%126 = llvm.mul %65, %19 : i64
%127 = llvm.add %15, %126 : i64
%128 = llvm.mul %67, %23 : i64
%129 = llvm.add %127, %128 : i64
%130 = llvm.mlir.constant(0 : i64) : i64
%131 = llvm.mul %130, %27 : i64
%132 = llvm.add %129, %131 : i64
%133 = llvm.insertvalue %132, %125[2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%134 = llvm.mlir.constant(128 : i64) : i64
%135 = llvm.mlir.constant(1 : i64) : i64
%136 = llvm.insertvalue %134, %133[3, 2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%137 = llvm.insertvalue %135, %136[4, 2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%138 = llvm.mlir.constant(32 : i64) : i64
%139 = llvm.mlir.constant(128 : i64) : i64
%140 = llvm.insertvalue %138, %137[3, 1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%141 = llvm.insertvalue %139, %140[4, 1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%142 = llvm.mlir.constant(32 : i64) : i64
%143 = llvm.mlir.constant(32768 : i64) : i64
%144 = llvm.insertvalue %142, %141[3, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%145 = llvm.insertvalue %143, %144[4, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%146 = llvm.mlir.undef : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%147 = llvm.insertvalue %77, %146[0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%148 = llvm.insertvalue %77, %147[1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%149 = llvm.insertvalue %81, %148[2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%150 = llvm.mlir.constant(1 : index) : i64
%151 = llvm.mlir.constant(4 : index) : i64
%152 = llvm.insertvalue %150, %149[3, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%153 = llvm.insertvalue %151, %152[3, 1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%154 = llvm.mlir.constant(4 : index) : i64
%155 = llvm.insertvalue %154, %153[4, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%156 = llvm.mlir.constant(1 : index) : i64
%157 = llvm.insertvalue %156, %155[4, 1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
llvm.br ^bb1(%1 : i64)
^bb1(%158: i64): // 2 preds: ^bb0, ^bb10
%159 = llvm.icmp "slt" %158, %6 : i64
llvm.cond_br %159, ^bb2(%1 : i64), ^bb11
^bb2(%160: i64): // 2 preds: ^bb1, ^bb9
%161 = llvm.icmp "slt" %160, %6 : i64
llvm.cond_br %161, ^bb3, ^bb10
^bb3: // pred: ^bb2
%162 = llvm.mlir.undef : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%163 = llvm.insertvalue %11, %162[0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%164 = llvm.insertvalue %11, %163[1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%165 = llvm.mul %158, %143 : i64
%166 = llvm.add %132, %165 : i64
%167 = llvm.mul %160, %139 : i64
%168 = llvm.add %166, %167 : i64
%169 = llvm.mlir.constant(0 : i64) : i64
%170 = llvm.mul %169, %135 : i64
%171 = llvm.add %168, %170 : i64
%172 = llvm.insertvalue %171, %164[2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%173 = llvm.mlir.constant(128 : i64) : i64
%174 = llvm.mlir.constant(1 : i64) : i64
%175 = llvm.insertvalue %173, %172[3, 1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%176 = llvm.insertvalue %174, %175[4, 1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%177 = llvm.mlir.constant(4 : i64) : i64
%178 = llvm.mlir.constant(128 : i64) : i64
%179 = llvm.insertvalue %177, %176[3, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%180 = llvm.insertvalue %178, %179[4, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%181 = llvm.mlir.undef : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%182 = llvm.insertvalue %11, %181[0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%183 = llvm.insertvalue %11, %182[1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%184 = llvm.insertvalue %171, %183[2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%185 = llvm.mlir.constant(1 : index) : i64
%186 = llvm.mlir.constant(4 : index) : i64
%187 = llvm.mlir.constant(128 : index) : i64
%188 = llvm.insertvalue %185, %184[3, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%189 = llvm.insertvalue %186, %188[3, 1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%190 = llvm.insertvalue %187, %189[3, 2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%191 = llvm.mlir.constant(512 : index) : i64
%192 = llvm.insertvalue %191, %190[4, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%193 = llvm.mlir.constant(128 : index) : i64
%194 = llvm.insertvalue %193, %192[4, 1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%195 = llvm.mlir.constant(1 : index) : i64
%196 = llvm.insertvalue %195, %194[4, 2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%197 = llvm.mlir.undef : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<1 x i64>, array<1 x i64>)>
%198 = llvm.insertvalue %39, %197[0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<1 x i64>, array<1 x i64>)>
%199 = llvm.insertvalue %39, %198[1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<1 x i64>, array<1 x i64>)>
%200 = llvm.mul %158, %120 : i64
%201 = llvm.add %113, %200 : i64
%202 = llvm.mul %160, %116 : i64
%203 = llvm.add %201, %202 : i64
%204 = llvm.insertvalue %203, %199[2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<1 x i64>, array<1 x i64>)>
%205 = llvm.mlir.constant(4 : i64) : i64
%206 = llvm.mlir.constant(1 : i64) : i64
%207 = llvm.insertvalue %205, %204[3, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<1 x i64>, array<1 x i64>)>
%208 = llvm.insertvalue %206, %207[4, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<1 x i64>, array<1 x i64>)>
%209 = llvm.mlir.undef : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%210 = llvm.insertvalue %39, %209[0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%211 = llvm.insertvalue %39, %210[1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%212 = llvm.insertvalue %203, %211[2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%213 = llvm.mlir.constant(1 : index) : i64
%214 = llvm.mlir.constant(4 : index) : i64
%215 = llvm.insertvalue %213, %212[3, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%216 = llvm.insertvalue %214, %215[3, 1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%217 = llvm.mlir.constant(4 : index) : i64
%218 = llvm.insertvalue %217, %216[4, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%219 = llvm.mlir.constant(1 : index) : i64
%220 = llvm.insertvalue %219, %218[4, 1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
llvm.br ^bb4(%1 : i64)
^bb4(%221: i64): // 2 preds: ^bb3, ^bb8
%222 = llvm.icmp "slt" %221, %3 : i64
llvm.cond_br %222, ^bb5, ^bb9
^bb5: // pred: ^bb4
%223 = llvm.mlir.undef : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%224 = llvm.insertvalue %11, %223[0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%225 = llvm.insertvalue %11, %224[1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%226 = llvm.mlir.constant(0 : i64) : i64
%227 = llvm.mul %226, %191 : i64
%228 = llvm.add %171, %227 : i64
%229 = llvm.mlir.constant(0 : i64) : i64
%230 = llvm.mul %229, %193 : i64
%231 = llvm.add %228, %230 : i64
%232 = llvm.mul %221, %195 : i64
%233 = llvm.add %231, %232 : i64
%234 = llvm.insertvalue %233, %225[2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%235 = llvm.mlir.constant(4 : i64) : i64
%236 = llvm.mlir.constant(1 : i64) : i64
%237 = llvm.insertvalue %235, %234[3, 1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%238 = llvm.insertvalue %236, %237[4, 1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%239 = llvm.mlir.constant(4 : i64) : i64
%240 = llvm.mlir.constant(128 : i64) : i64
%241 = llvm.insertvalue %239, %238[3, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%242 = llvm.insertvalue %240, %241[4, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<2 x i64>, array<2 x i64>)>
%243 = llvm.mlir.undef : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%244 = llvm.insertvalue %11, %243[0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%245 = llvm.insertvalue %11, %244[1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%246 = llvm.insertvalue %233, %245[2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%247 = llvm.mlir.constant(1 : index) : i64
%248 = llvm.mlir.constant(4 : index) : i64
%249 = llvm.mlir.constant(4 : index) : i64
%250 = llvm.insertvalue %247, %246[3, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%251 = llvm.insertvalue %248, %250[3, 1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%252 = llvm.insertvalue %249, %251[3, 2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%253 = llvm.mlir.constant(512 : index) : i64
%254 = llvm.insertvalue %253, %252[4, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%255 = llvm.mlir.constant(128 : index) : i64
%256 = llvm.insertvalue %255, %254[4, 1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
%257 = llvm.mlir.constant(1 : index) : i64
%258 = llvm.insertvalue %257, %256[4, 2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<3 x i64>, array<3 x i64>)>
llvm.br ^bb6(%1 : i64)
^bb6(%259: i64): // 2 preds: ^bb5, ^bb7
%260 = llvm.icmp "slt" %259, %4 : i64
llvm.cond_br %260, ^bb7, ^bb8
^bb7: // pred: ^bb6
%261 = llvm.mlir.undef : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<1 x i64>, array<1 x i64>)>
%262 = llvm.insertvalue %11, %261[0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<1 x i64>, array<1 x i64>)>
%263 = llvm.insertvalue %11, %262[1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<1 x i64>, array<1 x i64>)>
%264 = llvm.mlir.constant(0 : i64) : i64
%265 = llvm.mul %264, %253 : i64
%266 = llvm.add %233, %265 : i64
%267 = llvm.mul %259, %255 : i64
%268 = llvm.add %266, %267 : i64
%269 = llvm.mlir.constant(0 : i64) : i64
%270 = llvm.mul %269, %257 : i64
%271 = llvm.add %268, %270 : i64
%272 = llvm.insertvalue %271, %263[2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<1 x i64>, array<1 x i64>)>
%273 = llvm.mlir.constant(4 : i64) : i64
%274 = llvm.mlir.constant(1 : i64) : i64
%275 = llvm.insertvalue %273, %272[3, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<1 x i64>, array<1 x i64>)>
%276 = llvm.insertvalue %274, %275[4, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<1 x i64>, array<1 x i64>)>
%277 = llvm.mlir.undef : !llvm.struct<(ptr<i32>, ptr<i32>, i64)>
%278 = llvm.insertvalue %39, %277[0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64)>
%279 = llvm.insertvalue %39, %278[1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64)>
%280 = llvm.mlir.constant(0 : i64) : i64
%281 = llvm.mul %280, %217 : i64
%282 = llvm.add %203, %281 : i64
%283 = llvm.mul %259, %219 : i64
%284 = llvm.add %282, %283 : i64
%285 = llvm.insertvalue %284, %279[2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64)>
%286 = llvm.mlir.undef : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<4 x i64>, array<4 x i64>)>
%287 = llvm.insertvalue %11, %286[0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<4 x i64>, array<4 x i64>)>
%288 = llvm.insertvalue %11, %287[1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<4 x i64>, array<4 x i64>)>
%289 = llvm.insertvalue %271, %288[2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<4 x i64>, array<4 x i64>)>
%290 = llvm.mlir.constant(1 : index) : i64
%291 = llvm.mlir.constant(1 : index) : i64
%292 = llvm.mlir.constant(1 : index) : i64
%293 = llvm.mlir.constant(4 : index) : i64
%294 = llvm.insertvalue %290, %289[3, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<4 x i64>, array<4 x i64>)>
%295 = llvm.insertvalue %291, %294[3, 1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<4 x i64>, array<4 x i64>)>
%296 = llvm.insertvalue %292, %295[3, 2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<4 x i64>, array<4 x i64>)>
%297 = llvm.insertvalue %293, %296[3, 3] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<4 x i64>, array<4 x i64>)>
%298 = llvm.mlir.constant(4 : index) : i64
%299 = llvm.insertvalue %298, %297[4, 0] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<4 x i64>, array<4 x i64>)>
%300 = llvm.mlir.constant(4 : index) : i64
%301 = llvm.insertvalue %300, %299[4, 1] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<4 x i64>, array<4 x i64>)>
%302 = llvm.mlir.constant(4 : index) : i64
%303 = llvm.insertvalue %302, %301[4, 2] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<4 x i64>, array<4 x i64>)>
%304 = llvm.mlir.constant(1 : index) : i64
%305 = llvm.insertvalue %304, %303[4, 3] : !llvm.struct<(ptr<i32>, ptr<i32>, i64, array<4 x i64>, array<4 x i64>)>
%306 = llvm.mlir.constant(4 : index) : i64
%307 = llvm.mul %1, %306 : i64
%308 = llvm.add %271, %307 : i64
%309 = llvm.mlir.constant(4 : index) : i64
%310 = llvm.mul %1, %309 : i64
%311 = llvm.add %308, %310 : i64
%312 = llvm.mlir.constant(4 : index) : i64
%313 = llvm.mul %1, %312 : i64
%314 = llvm.add %311, %313 : i64
%315 = llvm.add %314, %1 : i64
%316 = llvm.getelementptr %11[%315] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32>
%317 = llvm.bitcast %316 : !llvm.ptr<i32> to !llvm.ptr<vector<4xi32>>
%318 = llvm.load %317 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>>
%319 = llvm.add %318, %106 : vector<4xi32>
%320 = llvm.bitcast %77 : !llvm.ptr<i32> to !llvm.ptr<vector<4xi32>>
llvm.store %319, %320 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>>
%321 = llvm.mlir.constant(4 : index) : i64
%322 = llvm.mul %1, %321 : i64
%323 = llvm.add %322, %1 : i64
%324 = llvm.getelementptr %77[%323] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32>
%325 = llvm.bitcast %324 : !llvm.ptr<i32> to !llvm.ptr<vector<4xi32>>
%326 = llvm.load %325 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>>
%327 = llvm.getelementptr %39[%284] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32>
%328 = llvm.load %327 : !llvm.ptr<i32>
%329 = llvm.mlir.undef : vector<1xi32>
%330 = llvm.mlir.constant(0 : i32) : i32
%331 = llvm.insertelement %328, %329[%330 : i32] : vector<1xi32>
%332 = llvm.mlir.constant(0 : index) : i64
%333 = llvm.extractelement %331[%332 : i64] : vector<1xi32>
%334 = "llvm.intr.vector.reduce.add"(%326) : (vector<4xi32>) -> i32
%335 = llvm.add %333, %334 : i32
%336 = llvm.insertelement %335, %2[%1 : i64] : vector<1xi32>
%337 = llvm.mlir.constant(0 : i64) : i64
%338 = llvm.extractelement %336[%337 : i64] : vector<1xi32>
%339 = llvm.mlir.undef : vector<1xi32>
%340 = llvm.mlir.constant(0 : i32) : i32
%341 = llvm.insertelement %338, %339[%340 : i32] : vector<1xi32>
%342 = llvm.mlir.constant(0 : index) : i64
%343 = llvm.extractelement %341[%342 : i64] : vector<1xi32>
%344 = llvm.getelementptr %39[%284] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32>
llvm.store %343, %344 : !llvm.ptr<i32>
%345 = llvm.add %259, %5 : i64
llvm.br ^bb6(%345 : i64)
^bb8: // pred: ^bb6
%346 = llvm.add %221, %4 : i64
llvm.br ^bb4(%346 : i64)
^bb9: // pred: ^bb4
%347 = llvm.add %160, %4 : i64
llvm.br ^bb2(%347 : i64)
^bb10: // pred: ^bb2
%348 = llvm.add %158, %5 : i64
llvm.br ^bb1(%348 : i64)
^bb11: // pred: ^bb1
%349 = llvm.mlir.constant(0 : i32) : i32
llvm.return %349 : i32
}
}
}
}
func.func private @_split_reduction_pass2() {
%c1048576 = arith.constant 1048576 : index
%c0 = arith.constant 0 : index
%c524288 = arith.constant 524288 : index
%c67108864 = arith.constant 67108864 : index
%c1_i32 = arith.constant 1 : i32
%c128_i32 = arith.constant 128 : i32
%c512 = arith.constant 512 : index
%c256 = arith.constant 256 : index
%c1 = arith.constant 1 : index
%0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c67108864}
%1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c67108864}) {
stream.cmd.fill %c1_i32, %arg0[%c0 for %c67108864] : i32 -> !stream.resource<transient>{%c67108864}
} => !stream.timepoint
%2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c67108864}
%3 = util.do_not_optimize(%2) : !stream.resource<transient>
%4 = stream.resource.size %3 : !stream.resource<transient>
%5 = stream.resource.alloc uninitialized : !stream.resource<external>{%c1048576}
%6 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%4}, %5 as %arg1: !stream.resource<external>{%c1048576}) {
stream.cmd.concurrent {
stream.cmd.dispatch @_split_reduction_pass2_dispatch_0::@_split_reduction_pass2_dispatch_0_generic_512x256x128[%c512, %c256, %c1] {
ro %arg0[%c0 for %4] : !stream.resource<transient>{%4},
wo %arg1[%c0 for %c1048576] : !stream.resource<external>{%c1048576}
} attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>]}
stream.cmd.fill %c128_i32, %arg1[%c524288 for %c524288] : i32 -> !stream.resource<external>{%c1048576}
}
} => !stream.timepoint
%7 = stream.timepoint.await %6 => %5 : !stream.resource<external>{%c1048576}
%8 = stream.resource.subview %7[%c0] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288}
%9 = stream.resource.subview %7[%c524288] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288}
%10 = stream.tensor.export %8 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
%11 = stream.tensor.export %9 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
check.expect_eq(%10, %11) : tensor<512x256xi32>
return
}
}
// -----// IR Dump After Canonicalizer (canonicalize) ('builtin.module' operation) //----- //
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
func.func @split_reduction_pass2() attributes {iree.abi.stub} {
call @_split_reduction_pass2() : () -> ()
return
}
hal.executable private @_split_reduction_pass2_dispatch_0 {
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
hal.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 ordinal(0) layout(#pipeline_layout) attributes {translation_info = #translation} {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index):
%c1 = arith.constant 1 : index
%c16 = arith.constant 16 : index
%c8 = arith.constant 8 : index
hal.return %c8, %c16, %c1 : index, index, index
}
builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} {
llvm.func internal @_split_reduction_pass2_dispatch_0_generic_512x256x128(%arg0: !llvm.ptr<struct<"iree_hal_executable_environment_v0_t", (ptr<i32>, ptr<func<i32 (ptr<func<i32 (ptr<i8>)>>, ptr<i8>)>>, ptr<ptr<func<i32 (ptr<i8>)>>>, struct<"iree_hal_processor_v0_t", (array<8 x i64>)>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>> {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {sym_visibility = "private"} {
%0 = llvm.mlir.constant(0 : i32) : i32
%1 = llvm.mlir.constant(32768 : i64) : i64
%2 = llvm.mlir.constant(128 : i64) : i64
%3 = llvm.mlir.constant(256 : i64) : i64
%4 = llvm.mlir.constant(1 : i64) : i64
%5 = llvm.mlir.constant(63 : index) : i64
%6 = llvm.mlir.constant(256 : index) : i64
%7 = llvm.mlir.constant(32768 : index) : i64
%8 = llvm.mlir.constant(512 : index) : i64
%9 = llvm.mlir.constant(0 : i64) : i64
%10 = llvm.mlir.constant(dense<0> : vector<4xi32>) : vector<4xi32>
%11 = llvm.mlir.constant(0 : index) : i64
%12 = llvm.mlir.constant(dense<0> : vector<1xi32>) : vector<1xi32>
%13 = llvm.mlir.constant(128 : index) : i64
%14 = llvm.mlir.constant(4 : index) : i64
%15 = llvm.mlir.constant(1 : index) : i64
%16 = llvm.mlir.constant(32 : index) : i64
%17 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>>
%18 = llvm.extractvalue %17[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>
%19 = llvm.load %18 : !llvm.ptr<ptr<i8>>
%20 = llvm.bitcast %19 : !llvm.ptr<i8> to !llvm.ptr<i32>
%21 = llvm.ptrtoint %20 : !llvm.ptr<i32> to i64
%22 = llvm.and %21, %5 : i64
%23 = llvm.icmp "eq" %22, %11 : i64
"llvm.intr.assume"(%23) : (i1) -> ()
%24 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>>
%25 = llvm.extractvalue %24[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>
%26 = llvm.getelementptr %25[1] : (!llvm.ptr<ptr<i8>>) -> !llvm.ptr<ptr<i8>>
%27 = llvm.load %26 : !llvm.ptr<ptr<i8>>
%28 = llvm.bitcast %27 : !llvm.ptr<i8> to !llvm.ptr<i32>
%29 = llvm.ptrtoint %28 : !llvm.ptr<i32> to i64
%30 = llvm.and %29, %5 : i64
%31 = llvm.icmp "eq" %30, %11 : i64
"llvm.intr.assume"(%31) : (i1) -> ()
%32 = llvm.load %arg2 : !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>>
%33 = llvm.extractvalue %32[0] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>
%34 = llvm.zext %33 : i32 to i64
%35 = llvm.load %arg2 : !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>>
%36 = llvm.extractvalue %35[1] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>
%37 = llvm.zext %36 : i32 to i64
%38 = llvm.mul %37, %16 : i64
%39 = llvm.mul %34, %16 : i64
%40 = llvm.mlir.null : !llvm.ptr<i32>
%41 = llvm.getelementptr %40[4] : (!llvm.ptr<i32>) -> !llvm.ptr<i32>
%42 = llvm.ptrtoint %41 : !llvm.ptr<i32> to i64
%43 = llvm.alloca %42 x i32 {alignment = 128 : i64} : (i64) -> !llvm.ptr<i32>
%44 = llvm.mul %11, %14 : i64
%45 = llvm.mul %11, %14 : i64
%46 = llvm.add %44, %45 : i64
%47 = llvm.add %46, %11 : i64
%48 = llvm.getelementptr %43[%47] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32>
%49 = llvm.bitcast %48 : !llvm.ptr<i32> to !llvm.ptr<vector<4xi32>>
llvm.store %10, %49 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>>
%50 = llvm.bitcast %43 : !llvm.ptr<i32> to !llvm.ptr<vector<4xi32>>
%51 = llvm.load %50 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>>
%52 = llvm.mul %38, %6 : i64
%53 = llvm.add %52, %11 : i64
%54 = llvm.mul %39, %15 : i64
%55 = llvm.add %53, %54 : i64
%56 = llvm.mul %38, %7 : i64
%57 = llvm.add %56, %11 : i64
%58 = llvm.mul %39, %13 : i64
%59 = llvm.add %57, %58 : i64
%60 = llvm.mul %9, %15 : i64
%61 = llvm.add %59, %60 : i64
llvm.br ^bb1(%11 : i64)
^bb1(%62: i64): // 2 preds: ^bb0, ^bb10
%63 = llvm.icmp "slt" %62, %16 : i64
llvm.cond_br %63, ^bb2(%11 : i64), ^bb11
^bb2(%64: i64): // 2 preds: ^bb1, ^bb9
%65 = llvm.icmp "slt" %64, %16 : i64
llvm.cond_br %65, ^bb3, ^bb10
^bb3: // pred: ^bb2
%66 = llvm.mul %62, %1 : i64
%67 = llvm.add %61, %66 : i64
%68 = llvm.mul %64, %2 : i64
%69 = llvm.add %67, %68 : i64
%70 = llvm.mul %9, %4 : i64
%71 = llvm.add %69, %70 : i64
%72 = llvm.mul %62, %3 : i64
%73 = llvm.add %55, %72 : i64
%74 = llvm.mul %64, %4 : i64
%75 = llvm.add %73, %74 : i64
llvm.br ^bb4(%11 : i64)
^bb4(%76: i64): // 2 preds: ^bb3, ^bb8
%77 = llvm.icmp "slt" %76, %13 : i64
llvm.cond_br %77, ^bb5, ^bb9
^bb5: // pred: ^bb4
%78 = llvm.mul %9, %8 : i64
%79 = llvm.add %71, %78 : i64
%80 = llvm.mul %9, %13 : i64
%81 = llvm.add %79, %80 : i64
%82 = llvm.mul %76, %15 : i64
%83 = llvm.add %81, %82 : i64
llvm.br ^bb6(%11 : i64)
^bb6(%84: i64): // 2 preds: ^bb5, ^bb7
%85 = llvm.icmp "slt" %84, %14 : i64
llvm.cond_br %85, ^bb7, ^bb8
^bb7: // pred: ^bb6
%86 = llvm.mul %9, %8 : i64
%87 = llvm.add %83, %86 : i64
%88 = llvm.mul %84, %13 : i64
%89 = llvm.add %87, %88 : i64
%90 = llvm.mul %9, %15 : i64
%91 = llvm.add %89, %90 : i64
%92 = llvm.mul %9, %14 : i64
%93 = llvm.add %75, %92 : i64
%94 = llvm.mul %84, %15 : i64
%95 = llvm.add %93, %94 : i64
%96 = llvm.mul %11, %14 : i64
%97 = llvm.add %91, %96 : i64
%98 = llvm.mul %11, %14 : i64
%99 = llvm.add %97, %98 : i64
%100 = llvm.mul %11, %14 : i64
%101 = llvm.add %99, %100 : i64
%102 = llvm.add %101, %11 : i64
%103 = llvm.getelementptr %20[%102] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32>
%104 = llvm.bitcast %103 : !llvm.ptr<i32> to !llvm.ptr<vector<4xi32>>
%105 = llvm.load %104 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>>
%106 = llvm.add %105, %51 : vector<4xi32>
%107 = llvm.bitcast %43 : !llvm.ptr<i32> to !llvm.ptr<vector<4xi32>>
llvm.store %106, %107 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>>
%108 = llvm.mul %11, %14 : i64
%109 = llvm.add %108, %11 : i64
%110 = llvm.getelementptr %43[%109] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32>
%111 = llvm.bitcast %110 : !llvm.ptr<i32> to !llvm.ptr<vector<4xi32>>
%112 = llvm.load %111 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>>
%113 = llvm.getelementptr %28[%95] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32>
%114 = llvm.load %113 : !llvm.ptr<i32>
%115 = llvm.mlir.undef : vector<1xi32>
%116 = llvm.insertelement %114, %115[%0 : i32] : vector<1xi32>
%117 = llvm.extractelement %116[%11 : i64] : vector<1xi32>
%118 = "llvm.intr.vector.reduce.add"(%112) : (vector<4xi32>) -> i32
%119 = llvm.add %117, %118 : i32
%120 = llvm.insertelement %119, %12[%11 : i64] : vector<1xi32>
%121 = llvm.extractelement %120[%9 : i64] : vector<1xi32>
%122 = llvm.mlir.undef : vector<1xi32>
%123 = llvm.insertelement %121, %122[%0 : i32] : vector<1xi32>
%124 = llvm.extractelement %123[%11 : i64] : vector<1xi32>
%125 = llvm.getelementptr %28[%95] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32>
llvm.store %124, %125 : !llvm.ptr<i32>
%126 = llvm.add %84, %15 : i64
llvm.br ^bb6(%126 : i64)
^bb8: // pred: ^bb6
%127 = llvm.add %76, %14 : i64
llvm.br ^bb4(%127 : i64)
^bb9: // pred: ^bb4
%128 = llvm.add %64, %14 : i64
llvm.br ^bb2(%128 : i64)
^bb10: // pred: ^bb2
%129 = llvm.add %62, %15 : i64
llvm.br ^bb1(%129 : i64)
^bb11: // pred: ^bb1
llvm.return %0 : i32
}
}
}
}
func.func private @_split_reduction_pass2() {
%c1048576 = arith.constant 1048576 : index
%c0 = arith.constant 0 : index
%c524288 = arith.constant 524288 : index
%c67108864 = arith.constant 67108864 : index
%c1_i32 = arith.constant 1 : i32
%c128_i32 = arith.constant 128 : i32
%c512 = arith.constant 512 : index
%c256 = arith.constant 256 : index
%c1 = arith.constant 1 : index
%0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c67108864}
%1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c67108864}) {
stream.cmd.fill %c1_i32, %arg0[%c0 for %c67108864] : i32 -> !stream.resource<transient>{%c67108864}
} => !stream.timepoint
%2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c67108864}
%3 = util.do_not_optimize(%2) : !stream.resource<transient>
%4 = stream.resource.size %3 : !stream.resource<transient>
%5 = stream.resource.alloc uninitialized : !stream.resource<external>{%c1048576}
%6 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%4}, %5 as %arg1: !stream.resource<external>{%c1048576}) {
stream.cmd.concurrent {
stream.cmd.dispatch @_split_reduction_pass2_dispatch_0::@_split_reduction_pass2_dispatch_0_generic_512x256x128[%c512, %c256, %c1] {
ro %arg0[%c0 for %4] : !stream.resource<transient>{%4},
wo %arg1[%c0 for %c1048576] : !stream.resource<external>{%c1048576}
} attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>]}
stream.cmd.fill %c128_i32, %arg1[%c524288 for %c524288] : i32 -> !stream.resource<external>{%c1048576}
}
} => !stream.timepoint
%7 = stream.timepoint.await %6 => %5 : !stream.resource<external>{%c1048576}
%8 = stream.resource.subview %7[%c0] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288}
%9 = stream.resource.subview %7[%c524288] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288}
%10 = stream.tensor.export %8 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
%11 = stream.tensor.export %9 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
check.expect_eq(%10, %11) : tensor<512x256xi32>
return
}
}
// -----// IR Dump After CSE (cse) ('builtin.module' operation) //----- //
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
func.func @split_reduction_pass2() attributes {iree.abi.stub} {
call @_split_reduction_pass2() : () -> ()
return
}
hal.executable private @_split_reduction_pass2_dispatch_0 {
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
hal.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 ordinal(0) layout(#pipeline_layout) attributes {translation_info = #translation} {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index):
%c1 = arith.constant 1 : index
%c16 = arith.constant 16 : index
%c8 = arith.constant 8 : index
hal.return %c8, %c16, %c1 : index, index, index
}
builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} {
llvm.func internal @_split_reduction_pass2_dispatch_0_generic_512x256x128(%arg0: !llvm.ptr<struct<"iree_hal_executable_environment_v0_t", (ptr<i32>, ptr<func<i32 (ptr<func<i32 (ptr<i8>)>>, ptr<i8>)>>, ptr<ptr<func<i32 (ptr<i8>)>>>, struct<"iree_hal_processor_v0_t", (array<8 x i64>)>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>> {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {sym_visibility = "private"} {
%0 = llvm.mlir.constant(0 : i32) : i32
%1 = llvm.mlir.constant(32768 : i64) : i64
%2 = llvm.mlir.constant(128 : i64) : i64
%3 = llvm.mlir.constant(256 : i64) : i64
%4 = llvm.mlir.constant(1 : i64) : i64
%5 = llvm.mlir.constant(63 : index) : i64
%6 = llvm.mlir.constant(256 : index) : i64
%7 = llvm.mlir.constant(32768 : index) : i64
%8 = llvm.mlir.constant(512 : index) : i64
%9 = llvm.mlir.constant(0 : i64) : i64
%10 = llvm.mlir.constant(dense<0> : vector<4xi32>) : vector<4xi32>
%11 = llvm.mlir.constant(0 : index) : i64
%12 = llvm.mlir.constant(dense<0> : vector<1xi32>) : vector<1xi32>
%13 = llvm.mlir.constant(128 : index) : i64
%14 = llvm.mlir.constant(4 : index) : i64
%15 = llvm.mlir.constant(1 : index) : i64
%16 = llvm.mlir.constant(32 : index) : i64
%17 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>>
%18 = llvm.extractvalue %17[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>
%19 = llvm.load %18 : !llvm.ptr<ptr<i8>>
%20 = llvm.bitcast %19 : !llvm.ptr<i8> to !llvm.ptr<i32>
%21 = llvm.ptrtoint %20 : !llvm.ptr<i32> to i64
%22 = llvm.and %21, %5 : i64
%23 = llvm.icmp "eq" %22, %11 : i64
"llvm.intr.assume"(%23) : (i1) -> ()
%24 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>>
%25 = llvm.extractvalue %24[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>
%26 = llvm.getelementptr %25[1] : (!llvm.ptr<ptr<i8>>) -> !llvm.ptr<ptr<i8>>
%27 = llvm.load %26 : !llvm.ptr<ptr<i8>>
%28 = llvm.bitcast %27 : !llvm.ptr<i8> to !llvm.ptr<i32>
%29 = llvm.ptrtoint %28 : !llvm.ptr<i32> to i64
%30 = llvm.and %29, %5 : i64
%31 = llvm.icmp "eq" %30, %11 : i64
"llvm.intr.assume"(%31) : (i1) -> ()
%32 = llvm.load %arg2 : !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>>
%33 = llvm.extractvalue %32[0] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>
%34 = llvm.zext %33 : i32 to i64
%35 = llvm.extractvalue %32[1] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>
%36 = llvm.zext %35 : i32 to i64
%37 = llvm.mul %36, %16 : i64
%38 = llvm.mul %34, %16 : i64
%39 = llvm.mlir.null : !llvm.ptr<i32>
%40 = llvm.getelementptr %39[4] : (!llvm.ptr<i32>) -> !llvm.ptr<i32>
%41 = llvm.ptrtoint %40 : !llvm.ptr<i32> to i64
%42 = llvm.alloca %41 x i32 {alignment = 128 : i64} : (i64) -> !llvm.ptr<i32>
%43 = llvm.mul %11, %14 : i64
%44 = llvm.add %43, %43 : i64
%45 = llvm.add %44, %11 : i64
%46 = llvm.getelementptr %42[%45] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32>
%47 = llvm.bitcast %46 : !llvm.ptr<i32> to !llvm.ptr<vector<4xi32>>
llvm.store %10, %47 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>>
%48 = llvm.bitcast %42 : !llvm.ptr<i32> to !llvm.ptr<vector<4xi32>>
%49 = llvm.load %48 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>>
%50 = llvm.mul %37, %6 : i64
%51 = llvm.add %50, %11 : i64
%52 = llvm.mul %38, %15 : i64
%53 = llvm.add %51, %52 : i64
%54 = llvm.mul %37, %7 : i64
%55 = llvm.add %54, %11 : i64
%56 = llvm.mul %38, %13 : i64
%57 = llvm.add %55, %56 : i64
%58 = llvm.mul %9, %15 : i64
%59 = llvm.add %57, %58 : i64
llvm.br ^bb1(%11 : i64)
^bb1(%60: i64): // 2 preds: ^bb0, ^bb10
%61 = llvm.icmp "slt" %60, %16 : i64
llvm.cond_br %61, ^bb2(%11 : i64), ^bb11
^bb2(%62: i64): // 2 preds: ^bb1, ^bb9
%63 = llvm.icmp "slt" %62, %16 : i64
llvm.cond_br %63, ^bb3, ^bb10
^bb3: // pred: ^bb2
%64 = llvm.mul %60, %1 : i64
%65 = llvm.add %59, %64 : i64
%66 = llvm.mul %62, %2 : i64
%67 = llvm.add %65, %66 : i64
%68 = llvm.mul %9, %4 : i64
%69 = llvm.add %67, %68 : i64
%70 = llvm.mul %60, %3 : i64
%71 = llvm.add %53, %70 : i64
%72 = llvm.mul %62, %4 : i64
%73 = llvm.add %71, %72 : i64
llvm.br ^bb4(%11 : i64)
^bb4(%74: i64): // 2 preds: ^bb3, ^bb8
%75 = llvm.icmp "slt" %74, %13 : i64
llvm.cond_br %75, ^bb5, ^bb9
^bb5: // pred: ^bb4
%76 = llvm.mul %9, %8 : i64
%77 = llvm.add %69, %76 : i64
%78 = llvm.mul %9, %13 : i64
%79 = llvm.add %77, %78 : i64
%80 = llvm.mul %74, %15 : i64
%81 = llvm.add %79, %80 : i64
llvm.br ^bb6(%11 : i64)
^bb6(%82: i64): // 2 preds: ^bb5, ^bb7
%83 = llvm.icmp "slt" %82, %14 : i64
llvm.cond_br %83, ^bb7, ^bb8
^bb7: // pred: ^bb6
%84 = llvm.add %81, %76 : i64
%85 = llvm.mul %82, %13 : i64
%86 = llvm.add %84, %85 : i64
%87 = llvm.add %86, %58 : i64
%88 = llvm.mul %9, %14 : i64
%89 = llvm.add %73, %88 : i64
%90 = llvm.mul %82, %15 : i64
%91 = llvm.add %89, %90 : i64
%92 = llvm.add %87, %43 : i64
%93 = llvm.add %92, %43 : i64
%94 = llvm.add %93, %43 : i64
%95 = llvm.add %94, %11 : i64
%96 = llvm.getelementptr %20[%95] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32>
%97 = llvm.bitcast %96 : !llvm.ptr<i32> to !llvm.ptr<vector<4xi32>>
%98 = llvm.load %97 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>>
%99 = llvm.add %98, %49 : vector<4xi32>
llvm.store %99, %48 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>>
%100 = llvm.add %43, %11 : i64
%101 = llvm.getelementptr %42[%100] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32>
%102 = llvm.bitcast %101 : !llvm.ptr<i32> to !llvm.ptr<vector<4xi32>>
%103 = llvm.load %102 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>>
%104 = llvm.getelementptr %28[%91] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32>
%105 = llvm.load %104 : !llvm.ptr<i32>
%106 = llvm.mlir.undef : vector<1xi32>
%107 = llvm.insertelement %105, %106[%0 : i32] : vector<1xi32>
%108 = llvm.extractelement %107[%11 : i64] : vector<1xi32>
%109 = "llvm.intr.vector.reduce.add"(%103) : (vector<4xi32>) -> i32
%110 = llvm.add %108, %109 : i32
%111 = llvm.insertelement %110, %12[%11 : i64] : vector<1xi32>
%112 = llvm.extractelement %111[%9 : i64] : vector<1xi32>
%113 = llvm.insertelement %112, %106[%0 : i32] : vector<1xi32>
%114 = llvm.extractelement %113[%11 : i64] : vector<1xi32>
llvm.store %114, %104 : !llvm.ptr<i32>
%115 = llvm.add %82, %15 : i64
llvm.br ^bb6(%115 : i64)
^bb8: // pred: ^bb6
%116 = llvm.add %74, %14 : i64
llvm.br ^bb4(%116 : i64)
^bb9: // pred: ^bb4
%117 = llvm.add %62, %14 : i64
llvm.br ^bb2(%117 : i64)
^bb10: // pred: ^bb2
%118 = llvm.add %60, %15 : i64
llvm.br ^bb1(%118 : i64)
^bb11: // pred: ^bb1
llvm.return %0 : i32
}
}
}
}
func.func private @_split_reduction_pass2() {
%c1048576 = arith.constant 1048576 : index
%c0 = arith.constant 0 : index
%c524288 = arith.constant 524288 : index
%c67108864 = arith.constant 67108864 : index
%c1_i32 = arith.constant 1 : i32
%c128_i32 = arith.constant 128 : i32
%c512 = arith.constant 512 : index
%c256 = arith.constant 256 : index
%c1 = arith.constant 1 : index
%0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c67108864}
%1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c67108864}) {
stream.cmd.fill %c1_i32, %arg0[%c0 for %c67108864] : i32 -> !stream.resource<transient>{%c67108864}
} => !stream.timepoint
%2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c67108864}
%3 = util.do_not_optimize(%2) : !stream.resource<transient>
%4 = stream.resource.size %3 : !stream.resource<transient>
%5 = stream.resource.alloc uninitialized : !stream.resource<external>{%c1048576}
%6 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%4}, %5 as %arg1: !stream.resource<external>{%c1048576}) {
stream.cmd.concurrent {
stream.cmd.dispatch @_split_reduction_pass2_dispatch_0::@_split_reduction_pass2_dispatch_0_generic_512x256x128[%c512, %c256, %c1] {
ro %arg0[%c0 for %4] : !stream.resource<transient>{%4},
wo %arg1[%c0 for %c1048576] : !stream.resource<external>{%c1048576}
} attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>]}
stream.cmd.fill %c128_i32, %arg1[%c524288 for %c524288] : i32 -> !stream.resource<external>{%c1048576}
}
} => !stream.timepoint
%7 = stream.timepoint.await %6 => %5 : !stream.resource<external>{%c1048576}
%8 = stream.resource.subview %7[%c0] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288}
%9 = stream.resource.subview %7[%c524288] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288}
%10 = stream.tensor.export %8 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
%11 = stream.tensor.export %9 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
check.expect_eq(%10, %11) : tensor<512x256xi32>
return
}
}
// -----// IR Dump After mlir::iree_compiler::IREE::HAL::TranslateTargetExecutableVariantsPass (iree-hal-translate-target-executable-variants) ('hal.executable.variant' operation: @embedded_elf_x86_64) //----- //
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
func.func @split_reduction_pass2() attributes {iree.abi.stub} {
call @_split_reduction_pass2() : () -> ()
return
}
hal.executable private @_split_reduction_pass2_dispatch_0 {
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
hal.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 ordinal(0) layout(#pipeline_layout) attributes {translation_info = #translation} {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index):
%c1 = arith.constant 1 : index
%c16 = arith.constant 16 : index
%c8 = arith.constant 8 : index
hal.return %c8, %c16, %c1 : index, index, index
}
builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} {
llvm.func internal @_split_reduction_pass2_dispatch_0_generic_512x256x128(%arg0: !llvm.ptr<struct<"iree_hal_executable_environment_v0_t", (ptr<i32>, ptr<func<i32 (ptr<func<i32 (ptr<i8>)>>, ptr<i8>)>>, ptr<ptr<func<i32 (ptr<i8>)>>>, struct<"iree_hal_processor_v0_t", (array<8 x i64>)>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>> {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {sym_visibility = "private"} {
%0 = llvm.mlir.constant(0 : i32) : i32
%1 = llvm.mlir.constant(32768 : i64) : i64
%2 = llvm.mlir.constant(128 : i64) : i64
%3 = llvm.mlir.constant(256 : i64) : i64
%4 = llvm.mlir.constant(1 : i64) : i64
%5 = llvm.mlir.constant(63 : index) : i64
%6 = llvm.mlir.constant(256 : index) : i64
%7 = llvm.mlir.constant(32768 : index) : i64
%8 = llvm.mlir.constant(512 : index) : i64
%9 = llvm.mlir.constant(0 : i64) : i64
%10 = llvm.mlir.constant(dense<0> : vector<4xi32>) : vector<4xi32>
%11 = llvm.mlir.constant(0 : index) : i64
%12 = llvm.mlir.constant(dense<0> : vector<1xi32>) : vector<1xi32>
%13 = llvm.mlir.constant(128 : index) : i64
%14 = llvm.mlir.constant(4 : index) : i64
%15 = llvm.mlir.constant(1 : index) : i64
%16 = llvm.mlir.constant(32 : index) : i64
%17 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>>
%18 = llvm.extractvalue %17[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>
%19 = llvm.load %18 : !llvm.ptr<ptr<i8>>
%20 = llvm.bitcast %19 : !llvm.ptr<i8> to !llvm.ptr<i32>
%21 = llvm.ptrtoint %20 : !llvm.ptr<i32> to i64
%22 = llvm.and %21, %5 : i64
%23 = llvm.icmp "eq" %22, %11 : i64
"llvm.intr.assume"(%23) : (i1) -> ()
%24 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>>
%25 = llvm.extractvalue %24[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>
%26 = llvm.getelementptr %25[1] : (!llvm.ptr<ptr<i8>>) -> !llvm.ptr<ptr<i8>>
%27 = llvm.load %26 : !llvm.ptr<ptr<i8>>
%28 = llvm.bitcast %27 : !llvm.ptr<i8> to !llvm.ptr<i32>
%29 = llvm.ptrtoint %28 : !llvm.ptr<i32> to i64
%30 = llvm.and %29, %5 : i64
%31 = llvm.icmp "eq" %30, %11 : i64
"llvm.intr.assume"(%31) : (i1) -> ()
%32 = llvm.load %arg2 : !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>>
%33 = llvm.extractvalue %32[0] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>
%34 = llvm.zext %33 : i32 to i64
%35 = llvm.extractvalue %32[1] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>
%36 = llvm.zext %35 : i32 to i64
%37 = llvm.mul %36, %16 : i64
%38 = llvm.mul %34, %16 : i64
%39 = llvm.mlir.null : !llvm.ptr<i32>
%40 = llvm.getelementptr %39[4] : (!llvm.ptr<i32>) -> !llvm.ptr<i32>
%41 = llvm.ptrtoint %40 : !llvm.ptr<i32> to i64
%42 = llvm.alloca %41 x i32 {alignment = 128 : i64} : (i64) -> !llvm.ptr<i32>
%43 = llvm.mul %11, %14 : i64
%44 = llvm.add %43, %43 : i64
%45 = llvm.add %44, %11 : i64
%46 = llvm.getelementptr %42[%45] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32>
%47 = llvm.bitcast %46 : !llvm.ptr<i32> to !llvm.ptr<vector<4xi32>>
llvm.store %10, %47 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>>
%48 = llvm.bitcast %42 : !llvm.ptr<i32> to !llvm.ptr<vector<4xi32>>
%49 = llvm.load %48 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>>
%50 = llvm.mul %37, %6 : i64
%51 = llvm.add %50, %11 : i64
%52 = llvm.mul %38, %15 : i64
%53 = llvm.add %51, %52 : i64
%54 = llvm.mul %37, %7 : i64
%55 = llvm.add %54, %11 : i64
%56 = llvm.mul %38, %13 : i64
%57 = llvm.add %55, %56 : i64
%58 = llvm.mul %9, %15 : i64
%59 = llvm.add %57, %58 : i64
llvm.br ^bb1(%11 : i64)
^bb1(%60: i64): // 2 preds: ^bb0, ^bb10
%61 = llvm.icmp "slt" %60, %16 : i64
llvm.cond_br %61, ^bb2(%11 : i64), ^bb11
^bb2(%62: i64): // 2 preds: ^bb1, ^bb9
%63 = llvm.icmp "slt" %62, %16 : i64
llvm.cond_br %63, ^bb3, ^bb10
^bb3: // pred: ^bb2
%64 = llvm.mul %60, %1 : i64
%65 = llvm.add %59, %64 : i64
%66 = llvm.mul %62, %2 : i64
%67 = llvm.add %65, %66 : i64
%68 = llvm.mul %9, %4 : i64
%69 = llvm.add %67, %68 : i64
%70 = llvm.mul %60, %3 : i64
%71 = llvm.add %53, %70 : i64
%72 = llvm.mul %62, %4 : i64
%73 = llvm.add %71, %72 : i64
llvm.br ^bb4(%11 : i64)
^bb4(%74: i64): // 2 preds: ^bb3, ^bb8
%75 = llvm.icmp "slt" %74, %13 : i64
llvm.cond_br %75, ^bb5, ^bb9
^bb5: // pred: ^bb4
%76 = llvm.mul %9, %8 : i64
%77 = llvm.add %69, %76 : i64
%78 = llvm.mul %9, %13 : i64
%79 = llvm.add %77, %78 : i64
%80 = llvm.mul %74, %15 : i64
%81 = llvm.add %79, %80 : i64
llvm.br ^bb6(%11 : i64)
^bb6(%82: i64): // 2 preds: ^bb5, ^bb7
%83 = llvm.icmp "slt" %82, %14 : i64
llvm.cond_br %83, ^bb7, ^bb8
^bb7: // pred: ^bb6
%84 = llvm.add %81, %76 : i64
%85 = llvm.mul %82, %13 : i64
%86 = llvm.add %84, %85 : i64
%87 = llvm.add %86, %58 : i64
%88 = llvm.mul %9, %14 : i64
%89 = llvm.add %73, %88 : i64
%90 = llvm.mul %82, %15 : i64
%91 = llvm.add %89, %90 : i64
%92 = llvm.add %87, %43 : i64
%93 = llvm.add %92, %43 : i64
%94 = llvm.add %93, %43 : i64
%95 = llvm.add %94, %11 : i64
%96 = llvm.getelementptr %20[%95] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32>
%97 = llvm.bitcast %96 : !llvm.ptr<i32> to !llvm.ptr<vector<4xi32>>
%98 = llvm.load %97 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>>
%99 = llvm.add %98, %49 : vector<4xi32>
llvm.store %99, %48 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>>
%100 = llvm.add %43, %11 : i64
%101 = llvm.getelementptr %42[%100] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32>
%102 = llvm.bitcast %101 : !llvm.ptr<i32> to !llvm.ptr<vector<4xi32>>
%103 = llvm.load %102 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>>
%104 = llvm.getelementptr %28[%91] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32>
%105 = llvm.load %104 : !llvm.ptr<i32>
%106 = llvm.mlir.undef : vector<1xi32>
%107 = llvm.insertelement %105, %106[%0 : i32] : vector<1xi32>
%108 = llvm.extractelement %107[%11 : i64] : vector<1xi32>
%109 = "llvm.intr.vector.reduce.add"(%103) : (vector<4xi32>) -> i32
%110 = llvm.add %108, %109 : i32
%111 = llvm.insertelement %110, %12[%11 : i64] : vector<1xi32>
%112 = llvm.extractelement %111[%9 : i64] : vector<1xi32>
%113 = llvm.insertelement %112, %106[%0 : i32] : vector<1xi32>
%114 = llvm.extractelement %113[%11 : i64] : vector<1xi32>
llvm.store %114, %104 : !llvm.ptr<i32>
%115 = llvm.add %82, %15 : i64
llvm.br ^bb6(%115 : i64)
^bb8: // pred: ^bb6
%116 = llvm.add %74, %14 : i64
llvm.br ^bb4(%116 : i64)
^bb9: // pred: ^bb4
%117 = llvm.add %62, %14 : i64
llvm.br ^bb2(%117 : i64)
^bb10: // pred: ^bb2
%118 = llvm.add %60, %15 : i64
llvm.br ^bb1(%118 : i64)
^bb11: // pred: ^bb1
llvm.return %0 : i32
}
}
}
}
func.func private @_split_reduction_pass2() {
%c1048576 = arith.constant 1048576 : index
%c0 = arith.constant 0 : index
%c524288 = arith.constant 524288 : index
%c67108864 = arith.constant 67108864 : index
%c1_i32 = arith.constant 1 : i32
%c128_i32 = arith.constant 128 : i32
%c512 = arith.constant 512 : index
%c256 = arith.constant 256 : index
%c1 = arith.constant 1 : index
%0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c67108864}
%1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c67108864}) {
stream.cmd.fill %c1_i32, %arg0[%c0 for %c67108864] : i32 -> !stream.resource<transient>{%c67108864}
} => !stream.timepoint
%2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c67108864}
%3 = util.do_not_optimize(%2) : !stream.resource<transient>
%4 = stream.resource.size %3 : !stream.resource<transient>
%5 = stream.resource.alloc uninitialized : !stream.resource<external>{%c1048576}
%6 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%4}, %5 as %arg1: !stream.resource<external>{%c1048576}) {
stream.cmd.concurrent {
stream.cmd.dispatch @_split_reduction_pass2_dispatch_0::@_split_reduction_pass2_dispatch_0_generic_512x256x128[%c512, %c256, %c1] {
ro %arg0[%c0 for %4] : !stream.resource<transient>{%4},
wo %arg1[%c0 for %c1048576] : !stream.resource<external>{%c1048576}
} attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>]}
stream.cmd.fill %c128_i32, %arg1[%c524288 for %c524288] : i32 -> !stream.resource<external>{%c1048576}
}
} => !stream.timepoint
%7 = stream.timepoint.await %6 => %5 : !stream.resource<external>{%c1048576}
%8 = stream.resource.subview %7[%c0] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288}
%9 = stream.resource.subview %7[%c524288] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288}
%10 = stream.tensor.export %8 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
%11 = stream.tensor.export %9 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
check.expect_eq(%10, %11) : tensor<512x256xi32>
return
}
}
// -----// IR Dump After mlir::iree_compiler::IREE::HAL::TranslateExecutablesPass (iree-hal-translate-executables) ('hal.executable' operation: @_split_reduction_pass2_dispatch_0) //----- //
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
func.func @split_reduction_pass2() attributes {iree.abi.stub} {
call @_split_reduction_pass2() : () -> ()
return
}
hal.executable private @_split_reduction_pass2_dispatch_0 {
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
hal.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 ordinal(0) layout(#pipeline_layout) attributes {translation_info = #translation} {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index):
%c1 = arith.constant 1 : index
%c16 = arith.constant 16 : index
%c8 = arith.constant 8 : index
hal.return %c8, %c16, %c1 : index, index, index
}
builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} {
llvm.func internal @_split_reduction_pass2_dispatch_0_generic_512x256x128(%arg0: !llvm.ptr<struct<"iree_hal_executable_environment_v0_t", (ptr<i32>, ptr<func<i32 (ptr<func<i32 (ptr<i8>)>>, ptr<i8>)>>, ptr<ptr<func<i32 (ptr<i8>)>>>, struct<"iree_hal_processor_v0_t", (array<8 x i64>)>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>> {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {sym_visibility = "private"} {
%0 = llvm.mlir.constant(0 : i32) : i32
%1 = llvm.mlir.constant(32768 : i64) : i64
%2 = llvm.mlir.constant(128 : i64) : i64
%3 = llvm.mlir.constant(256 : i64) : i64
%4 = llvm.mlir.constant(1 : i64) : i64
%5 = llvm.mlir.constant(63 : index) : i64
%6 = llvm.mlir.constant(256 : index) : i64
%7 = llvm.mlir.constant(32768 : index) : i64
%8 = llvm.mlir.constant(512 : index) : i64
%9 = llvm.mlir.constant(0 : i64) : i64
%10 = llvm.mlir.constant(dense<0> : vector<4xi32>) : vector<4xi32>
%11 = llvm.mlir.constant(0 : index) : i64
%12 = llvm.mlir.constant(dense<0> : vector<1xi32>) : vector<1xi32>
%13 = llvm.mlir.constant(128 : index) : i64
%14 = llvm.mlir.constant(4 : index) : i64
%15 = llvm.mlir.constant(1 : index) : i64
%16 = llvm.mlir.constant(32 : index) : i64
%17 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>>
%18 = llvm.extractvalue %17[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>
%19 = llvm.load %18 : !llvm.ptr<ptr<i8>>
%20 = llvm.bitcast %19 : !llvm.ptr<i8> to !llvm.ptr<i32>
%21 = llvm.ptrtoint %20 : !llvm.ptr<i32> to i64
%22 = llvm.and %21, %5 : i64
%23 = llvm.icmp "eq" %22, %11 : i64
"llvm.intr.assume"(%23) : (i1) -> ()
%24 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>>
%25 = llvm.extractvalue %24[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>
%26 = llvm.getelementptr %25[1] : (!llvm.ptr<ptr<i8>>) -> !llvm.ptr<ptr<i8>>
%27 = llvm.load %26 : !llvm.ptr<ptr<i8>>
%28 = llvm.bitcast %27 : !llvm.ptr<i8> to !llvm.ptr<i32>
%29 = llvm.ptrtoint %28 : !llvm.ptr<i32> to i64
%30 = llvm.and %29, %5 : i64
%31 = llvm.icmp "eq" %30, %11 : i64
"llvm.intr.assume"(%31) : (i1) -> ()
%32 = llvm.load %arg2 : !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>>
%33 = llvm.extractvalue %32[0] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>
%34 = llvm.zext %33 : i32 to i64
%35 = llvm.extractvalue %32[1] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>
%36 = llvm.zext %35 : i32 to i64
%37 = llvm.mul %36, %16 : i64
%38 = llvm.mul %34, %16 : i64
%39 = llvm.mlir.null : !llvm.ptr<i32>
%40 = llvm.getelementptr %39[4] : (!llvm.ptr<i32>) -> !llvm.ptr<i32>
%41 = llvm.ptrtoint %40 : !llvm.ptr<i32> to i64
%42 = llvm.alloca %41 x i32 {alignment = 128 : i64} : (i64) -> !llvm.ptr<i32>
%43 = llvm.mul %11, %14 : i64
%44 = llvm.add %43, %43 : i64
%45 = llvm.add %44, %11 : i64
%46 = llvm.getelementptr %42[%45] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32>
%47 = llvm.bitcast %46 : !llvm.ptr<i32> to !llvm.ptr<vector<4xi32>>
llvm.store %10, %47 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>>
%48 = llvm.bitcast %42 : !llvm.ptr<i32> to !llvm.ptr<vector<4xi32>>
%49 = llvm.load %48 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>>
%50 = llvm.mul %37, %6 : i64
%51 = llvm.add %50, %11 : i64
%52 = llvm.mul %38, %15 : i64
%53 = llvm.add %51, %52 : i64
%54 = llvm.mul %37, %7 : i64
%55 = llvm.add %54, %11 : i64
%56 = llvm.mul %38, %13 : i64
%57 = llvm.add %55, %56 : i64
%58 = llvm.mul %9, %15 : i64
%59 = llvm.add %57, %58 : i64
llvm.br ^bb1(%11 : i64)
^bb1(%60: i64): // 2 preds: ^bb0, ^bb10
%61 = llvm.icmp "slt" %60, %16 : i64
llvm.cond_br %61, ^bb2(%11 : i64), ^bb11
^bb2(%62: i64): // 2 preds: ^bb1, ^bb9
%63 = llvm.icmp "slt" %62, %16 : i64
llvm.cond_br %63, ^bb3, ^bb10
^bb3: // pred: ^bb2
%64 = llvm.mul %60, %1 : i64
%65 = llvm.add %59, %64 : i64
%66 = llvm.mul %62, %2 : i64
%67 = llvm.add %65, %66 : i64
%68 = llvm.mul %9, %4 : i64
%69 = llvm.add %67, %68 : i64
%70 = llvm.mul %60, %3 : i64
%71 = llvm.add %53, %70 : i64
%72 = llvm.mul %62, %4 : i64
%73 = llvm.add %71, %72 : i64
llvm.br ^bb4(%11 : i64)
^bb4(%74: i64): // 2 preds: ^bb3, ^bb8
%75 = llvm.icmp "slt" %74, %13 : i64
llvm.cond_br %75, ^bb5, ^bb9
^bb5: // pred: ^bb4
%76 = llvm.mul %9, %8 : i64
%77 = llvm.add %69, %76 : i64
%78 = llvm.mul %9, %13 : i64
%79 = llvm.add %77, %78 : i64
%80 = llvm.mul %74, %15 : i64
%81 = llvm.add %79, %80 : i64
llvm.br ^bb6(%11 : i64)
^bb6(%82: i64): // 2 preds: ^bb5, ^bb7
%83 = llvm.icmp "slt" %82, %14 : i64
llvm.cond_br %83, ^bb7, ^bb8
^bb7: // pred: ^bb6
%84 = llvm.add %81, %76 : i64
%85 = llvm.mul %82, %13 : i64
%86 = llvm.add %84, %85 : i64
%87 = llvm.add %86, %58 : i64
%88 = llvm.mul %9, %14 : i64
%89 = llvm.add %73, %88 : i64
%90 = llvm.mul %82, %15 : i64
%91 = llvm.add %89, %90 : i64
%92 = llvm.add %87, %43 : i64
%93 = llvm.add %92, %43 : i64
%94 = llvm.add %93, %43 : i64
%95 = llvm.add %94, %11 : i64
%96 = llvm.getelementptr %20[%95] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32>
%97 = llvm.bitcast %96 : !llvm.ptr<i32> to !llvm.ptr<vector<4xi32>>
%98 = llvm.load %97 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>>
%99 = llvm.add %98, %49 : vector<4xi32>
llvm.store %99, %48 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>>
%100 = llvm.add %43, %11 : i64
%101 = llvm.getelementptr %42[%100] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32>
%102 = llvm.bitcast %101 : !llvm.ptr<i32> to !llvm.ptr<vector<4xi32>>
%103 = llvm.load %102 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>>
%104 = llvm.getelementptr %28[%91] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32>
%105 = llvm.load %104 : !llvm.ptr<i32>
%106 = llvm.mlir.undef : vector<1xi32>
%107 = llvm.insertelement %105, %106[%0 : i32] : vector<1xi32>
%108 = llvm.extractelement %107[%11 : i64] : vector<1xi32>
%109 = "llvm.intr.vector.reduce.add"(%103) : (vector<4xi32>) -> i32
%110 = llvm.add %108, %109 : i32
%111 = llvm.insertelement %110, %12[%11 : i64] : vector<1xi32>
%112 = llvm.extractelement %111[%9 : i64] : vector<1xi32>
%113 = llvm.insertelement %112, %106[%0 : i32] : vector<1xi32>
%114 = llvm.extractelement %113[%11 : i64] : vector<1xi32>
llvm.store %114, %104 : !llvm.ptr<i32>
%115 = llvm.add %82, %15 : i64
llvm.br ^bb6(%115 : i64)
^bb8: // pred: ^bb6
%116 = llvm.add %74, %14 : i64
llvm.br ^bb4(%116 : i64)
^bb9: // pred: ^bb4
%117 = llvm.add %62, %14 : i64
llvm.br ^bb2(%117 : i64)
^bb10: // pred: ^bb2
%118 = llvm.add %60, %15 : i64
llvm.br ^bb1(%118 : i64)
^bb11: // pred: ^bb1
llvm.return %0 : i32
}
}
}
}
func.func private @_split_reduction_pass2() {
%c1048576 = arith.constant 1048576 : index
%c0 = arith.constant 0 : index
%c524288 = arith.constant 524288 : index
%c67108864 = arith.constant 67108864 : index
%c1_i32 = arith.constant 1 : i32
%c128_i32 = arith.constant 128 : i32
%c512 = arith.constant 512 : index
%c256 = arith.constant 256 : index
%c1 = arith.constant 1 : index
%0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c67108864}
%1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c67108864}) {
stream.cmd.fill %c1_i32, %arg0[%c0 for %c67108864] : i32 -> !stream.resource<transient>{%c67108864}
} => !stream.timepoint
%2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c67108864}
%3 = util.do_not_optimize(%2) : !stream.resource<transient>
%4 = stream.resource.size %3 : !stream.resource<transient>
%5 = stream.resource.alloc uninitialized : !stream.resource<external>{%c1048576}
%6 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%4}, %5 as %arg1: !stream.resource<external>{%c1048576}) {
stream.cmd.concurrent {
stream.cmd.dispatch @_split_reduction_pass2_dispatch_0::@_split_reduction_pass2_dispatch_0_generic_512x256x128[%c512, %c256, %c1] {
ro %arg0[%c0 for %4] : !stream.resource<transient>{%4},
wo %arg1[%c0 for %c1048576] : !stream.resource<external>{%c1048576}
} attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>]}
stream.cmd.fill %c128_i32, %arg1[%c524288 for %c524288] : i32 -> !stream.resource<external>{%c1048576}
}
} => !stream.timepoint
%7 = stream.timepoint.await %6 => %5 : !stream.resource<external>{%c1048576}
%8 = stream.resource.subview %7[%c0] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288}
%9 = stream.resource.subview %7[%c524288] : !stream.resource<external>{%c1048576} -> !stream.resource<external>{%c524288}
%10 = stream.tensor.export %8 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
%11 = stream.tensor.export %9 : tensor<512x256xi32> in !stream.resource<external>{%c524288} -> tensor<512x256xi32>
check.expect_eq(%10, %11) : tensor<512x256xi32>
return
}
}
// -----// IR Dump After mlir::iree_compiler::IREE::HAL::(anonymous namespace)::ConvertToHALPass (iree-hal-conversion) ('builtin.module' operation) //----- //
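// [annotation, not part of the compiler dump] ConvertToHALPass rewrites the host program from the stream dialect into explicit HAL ops: stream.resource.alloc becomes hal.allocator.allocate, stream.cmd.execute becomes a recorded hal.command_buffer submitted via hal.device.queue.execute with fences, and the stream.tensor.export results become hal.buffer_view.create values consumed by check.expect_eq.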
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
func.func @split_reduction_pass2() attributes {iree.abi.stub} {
call @_split_reduction_pass2() : () -> ()
return
}
hal.executable private @_split_reduction_pass2_dispatch_0 {
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
hal.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 ordinal(0) layout(#pipeline_layout) attributes {translation_info = #translation} {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index):
%c1 = arith.constant 1 : index
%c16 = arith.constant 16 : index
%c8 = arith.constant 8 : index
hal.return %c8, %c16, %c1 : index, index, index
}
builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} {
llvm.func internal @_split_reduction_pass2_dispatch_0_generic_512x256x128(%arg0: !llvm.ptr<struct<"iree_hal_executable_environment_v0_t", (ptr<i32>, ptr<func<i32 (ptr<func<i32 (ptr<i8>)>>, ptr<i8>)>>, ptr<ptr<func<i32 (ptr<i8>)>>>, struct<"iree_hal_processor_v0_t", (array<8 x i64>)>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>> {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {sym_visibility = "private"} {
%0 = llvm.mlir.constant(0 : i32) : i32
%1 = llvm.mlir.constant(32768 : i64) : i64
%2 = llvm.mlir.constant(128 : i64) : i64
%3 = llvm.mlir.constant(256 : i64) : i64
%4 = llvm.mlir.constant(1 : i64) : i64
%5 = llvm.mlir.constant(63 : index) : i64
%6 = llvm.mlir.constant(256 : index) : i64
%7 = llvm.mlir.constant(32768 : index) : i64
%8 = llvm.mlir.constant(512 : index) : i64
%9 = llvm.mlir.constant(0 : i64) : i64
%10 = llvm.mlir.constant(dense<0> : vector<4xi32>) : vector<4xi32>
%11 = llvm.mlir.constant(0 : index) : i64
%12 = llvm.mlir.constant(dense<0> : vector<1xi32>) : vector<1xi32>
%13 = llvm.mlir.constant(128 : index) : i64
%14 = llvm.mlir.constant(4 : index) : i64
%15 = llvm.mlir.constant(1 : index) : i64
%16 = llvm.mlir.constant(32 : index) : i64
%17 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>>
%18 = llvm.extractvalue %17[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>
%19 = llvm.load %18 : !llvm.ptr<ptr<i8>>
%20 = llvm.bitcast %19 : !llvm.ptr<i8> to !llvm.ptr<i32>
%21 = llvm.ptrtoint %20 : !llvm.ptr<i32> to i64
%22 = llvm.and %21, %5 : i64
%23 = llvm.icmp "eq" %22, %11 : i64
"llvm.intr.assume"(%23) : (i1) -> ()
%24 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>>
%25 = llvm.extractvalue %24[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>
%26 = llvm.getelementptr %25[1] : (!llvm.ptr<ptr<i8>>) -> !llvm.ptr<ptr<i8>>
%27 = llvm.load %26 : !llvm.ptr<ptr<i8>>
%28 = llvm.bitcast %27 : !llvm.ptr<i8> to !llvm.ptr<i32>
%29 = llvm.ptrtoint %28 : !llvm.ptr<i32> to i64
%30 = llvm.and %29, %5 : i64
%31 = llvm.icmp "eq" %30, %11 : i64
"llvm.intr.assume"(%31) : (i1) -> ()
%32 = llvm.load %arg2 : !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>>
%33 = llvm.extractvalue %32[0] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>
%34 = llvm.zext %33 : i32 to i64
%35 = llvm.extractvalue %32[1] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>
%36 = llvm.zext %35 : i32 to i64
%37 = llvm.mul %36, %16 : i64
%38 = llvm.mul %34, %16 : i64
%39 = llvm.mlir.null : !llvm.ptr<i32>
%40 = llvm.getelementptr %39[4] : (!llvm.ptr<i32>) -> !llvm.ptr<i32>
%41 = llvm.ptrtoint %40 : !llvm.ptr<i32> to i64
%42 = llvm.alloca %41 x i32 {alignment = 128 : i64} : (i64) -> !llvm.ptr<i32>
%43 = llvm.mul %11, %14 : i64
%44 = llvm.add %43, %43 : i64
%45 = llvm.add %44, %11 : i64
%46 = llvm.getelementptr %42[%45] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32>
%47 = llvm.bitcast %46 : !llvm.ptr<i32> to !llvm.ptr<vector<4xi32>>
llvm.store %10, %47 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>>
%48 = llvm.bitcast %42 : !llvm.ptr<i32> to !llvm.ptr<vector<4xi32>>
%49 = llvm.load %48 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>>
%50 = llvm.mul %37, %6 : i64
%51 = llvm.add %50, %11 : i64
%52 = llvm.mul %38, %15 : i64
%53 = llvm.add %51, %52 : i64
%54 = llvm.mul %37, %7 : i64
%55 = llvm.add %54, %11 : i64
%56 = llvm.mul %38, %13 : i64
%57 = llvm.add %55, %56 : i64
%58 = llvm.mul %9, %15 : i64
%59 = llvm.add %57, %58 : i64
llvm.br ^bb1(%11 : i64)
^bb1(%60: i64): // 2 preds: ^bb0, ^bb10
%61 = llvm.icmp "slt" %60, %16 : i64
llvm.cond_br %61, ^bb2(%11 : i64), ^bb11
^bb2(%62: i64): // 2 preds: ^bb1, ^bb9
%63 = llvm.icmp "slt" %62, %16 : i64
llvm.cond_br %63, ^bb3, ^bb10
^bb3: // pred: ^bb2
%64 = llvm.mul %60, %1 : i64
%65 = llvm.add %59, %64 : i64
%66 = llvm.mul %62, %2 : i64
%67 = llvm.add %65, %66 : i64
%68 = llvm.mul %9, %4 : i64
%69 = llvm.add %67, %68 : i64
%70 = llvm.mul %60, %3 : i64
%71 = llvm.add %53, %70 : i64
%72 = llvm.mul %62, %4 : i64
%73 = llvm.add %71, %72 : i64
llvm.br ^bb4(%11 : i64)
^bb4(%74: i64): // 2 preds: ^bb3, ^bb8
%75 = llvm.icmp "slt" %74, %13 : i64
llvm.cond_br %75, ^bb5, ^bb9
^bb5: // pred: ^bb4
%76 = llvm.mul %9, %8 : i64
%77 = llvm.add %69, %76 : i64
%78 = llvm.mul %9, %13 : i64
%79 = llvm.add %77, %78 : i64
%80 = llvm.mul %74, %15 : i64
%81 = llvm.add %79, %80 : i64
llvm.br ^bb6(%11 : i64)
^bb6(%82: i64): // 2 preds: ^bb5, ^bb7
%83 = llvm.icmp "slt" %82, %14 : i64
llvm.cond_br %83, ^bb7, ^bb8
^bb7: // pred: ^bb6
%84 = llvm.add %81, %76 : i64
%85 = llvm.mul %82, %13 : i64
%86 = llvm.add %84, %85 : i64
%87 = llvm.add %86, %58 : i64
%88 = llvm.mul %9, %14 : i64
%89 = llvm.add %73, %88 : i64
%90 = llvm.mul %82, %15 : i64
%91 = llvm.add %89, %90 : i64
%92 = llvm.add %87, %43 : i64
%93 = llvm.add %92, %43 : i64
%94 = llvm.add %93, %43 : i64
%95 = llvm.add %94, %11 : i64
%96 = llvm.getelementptr %20[%95] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32>
%97 = llvm.bitcast %96 : !llvm.ptr<i32> to !llvm.ptr<vector<4xi32>>
%98 = llvm.load %97 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>>
%99 = llvm.add %98, %49 : vector<4xi32>
llvm.store %99, %48 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>>
%100 = llvm.add %43, %11 : i64
%101 = llvm.getelementptr %42[%100] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32>
%102 = llvm.bitcast %101 : !llvm.ptr<i32> to !llvm.ptr<vector<4xi32>>
%103 = llvm.load %102 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>>
%104 = llvm.getelementptr %28[%91] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32>
%105 = llvm.load %104 : !llvm.ptr<i32>
%106 = llvm.mlir.undef : vector<1xi32>
%107 = llvm.insertelement %105, %106[%0 : i32] : vector<1xi32>
%108 = llvm.extractelement %107[%11 : i64] : vector<1xi32>
%109 = "llvm.intr.vector.reduce.add"(%103) : (vector<4xi32>) -> i32
%110 = llvm.add %108, %109 : i32
%111 = llvm.insertelement %110, %12[%11 : i64] : vector<1xi32>
%112 = llvm.extractelement %111[%9 : i64] : vector<1xi32>
%113 = llvm.insertelement %112, %106[%0 : i32] : vector<1xi32>
%114 = llvm.extractelement %113[%11 : i64] : vector<1xi32>
llvm.store %114, %104 : !llvm.ptr<i32>
%115 = llvm.add %82, %15 : i64
llvm.br ^bb6(%115 : i64)
^bb8: // pred: ^bb6
%116 = llvm.add %74, %14 : i64
llvm.br ^bb4(%116 : i64)
^bb9: // pred: ^bb4
%117 = llvm.add %62, %14 : i64
llvm.br ^bb2(%117 : i64)
^bb10: // pred: ^bb2
%118 = llvm.add %60, %15 : i64
llvm.br ^bb1(%118 : i64)
^bb11: // pred: ^bb1
llvm.return %0 : i32
}
}
}
}
func.func private @_split_reduction_pass2() {
%c1048576 = arith.constant 1048576 : index
%c0 = arith.constant 0 : index
%c524288 = arith.constant 524288 : index
%c67108864 = arith.constant 67108864 : index
%c1_i32 = arith.constant 1 : i32
%c128_i32 = arith.constant 128 : i32
%c512 = arith.constant 512 : index
%c256 = arith.constant 256 : index
%c1 = arith.constant 1 : index
%device = hal.ex.shared_device : !hal.device
%allocator = hal.device.allocator<%device : !hal.device> : !hal.allocator
%buffer = hal.allocator.allocate<%allocator : !hal.allocator> type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c67108864}
%device_0 = hal.ex.shared_device : !hal.device
%cmd = hal.command_buffer.create device(%device_0 : !hal.device) mode("OneShot|AllowInlineExecution") categories(Transfer) : !hal.command_buffer
hal.command_buffer.fill_buffer<%cmd : !hal.command_buffer> target(%buffer : !hal.buffer)[%c0, %c67108864] pattern(%c1_i32 : i32)
hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None")
hal.command_buffer.finalize<%cmd : !hal.command_buffer>
%0 = util.null : !hal.fence
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%c-1_i64 = arith.constant -1 : i64
hal.device.queue.execute<%device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) commands([%cmd])
%c-1_i32 = arith.constant -1 : i32
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.status.check_ok %status, "failed to wait on timepoint"
%1 = util.do_not_optimize(%buffer) : !hal.buffer
%len = hal.buffer.length<%1 : !hal.buffer> : index
%device_1 = hal.ex.shared_device : !hal.device
%allocator_2 = hal.device.allocator<%device_1 : !hal.device> : !hal.allocator
%buffer_3 = hal.allocator.allocate<%allocator_2 : !hal.allocator> type("HostVisible|DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage|MappingScoped|MappingAccessRandom|Mapping") : !hal.buffer{%c1048576}
%device_4 = hal.ex.shared_device : !hal.device
%cmd_5 = hal.command_buffer.create device(%device_4 : !hal.device) mode("OneShot|AllowInlineExecution") categories("Transfer|Dispatch") : !hal.command_buffer
%2 = hal.command_buffer.device<%cmd_5 : !hal.command_buffer> : !hal.device
hal.device.switch<%2 : !hal.device>
#hal.device.match.executable.format<"embedded-elf-x86_64"> {
%pipeline_layout = hal.pipeline_layout.lookup device(%2 : !hal.device) layout(#pipeline_layout) : !hal.pipeline_layout
%c0_22 = arith.constant 0 : index
%c1_23 = arith.constant 1 : index
%c0_24 = arith.constant 0 : index
hal.command_buffer.push_descriptor_set<%cmd_5 : !hal.command_buffer> layout(%pipeline_layout : !hal.pipeline_layout)[%c0_24] bindings([
%c0_22 = (%1 : !hal.buffer)[%c0, %len],
%c1_23 = (%buffer_3 : !hal.buffer)[%c0, %c1048576]
])
%c1_25 = arith.constant 1 : index
%c16 = arith.constant 16 : index
%c8 = arith.constant 8 : index
hal.command_buffer.dispatch.symbol<%cmd_5 : !hal.command_buffer> target(@_split_reduction_pass2_dispatch_0::@embedded_elf_x86_64::@_split_reduction_pass2_dispatch_0_generic_512x256x128) workgroups([%c8, %c16, %c1_25])
hal.return
}
hal.command_buffer.fill_buffer<%cmd_5 : !hal.command_buffer> target(%buffer_3 : !hal.buffer)[%c524288, %c524288] pattern(%c128_i32 : i32)
hal.command_buffer.execution_barrier<%cmd_5 : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None")
hal.command_buffer.finalize<%cmd_5 : !hal.command_buffer>
%3 = util.null : !hal.fence
%fence_6 = hal.fence.create device(%device_4 : !hal.device) flags("None") : !hal.fence
%c-1_i64_7 = arith.constant -1 : i64
hal.device.queue.execute<%device_4 : !hal.device> affinity(%c-1_i64_7) wait(%3) signal(%fence_6) commands([%cmd_5])
%c-1_i32_8 = arith.constant -1 : i32
%status_9 = hal.fence.await until([%fence_6]) timeout_millis(%c-1_i32_8) : i32
util.status.check_ok %status_9, "failed to wait on timepoint"
%buffer_10 = hal.buffer.subspan<%buffer_3 : !hal.buffer>[%c0, %c524288] : !hal.buffer
%buffer_11 = hal.buffer.subspan<%buffer_3 : !hal.buffer>[%c524288, %c524288] : !hal.buffer
%c512_12 = arith.constant 512 : index
%c256_13 = arith.constant 256 : index
%c0_14 = arith.constant 0 : index
%c268435488_i32 = arith.constant 268435488 : i32
%c1_i32_15 = arith.constant 1 : i32
%view = hal.buffer_view.create buffer(%buffer_10 : !hal.buffer)[%c0_14, %c524288] shape([%c512_12, %c256_13]) type(%c268435488_i32) encoding(%c1_i32_15) : !hal.buffer_view
%c512_16 = arith.constant 512 : index
%c256_17 = arith.constant 256 : index
%c0_18 = arith.constant 0 : index
%c268435488_i32_19 = arith.constant 268435488 : i32
%c1_i32_20 = arith.constant 1 : i32
%view_21 = hal.buffer_view.create buffer(%buffer_11 : !hal.buffer)[%c0_18, %c524288] shape([%c512_16, %c256_17]) type(%c268435488_i32_19) encoding(%c1_i32_20) : !hal.buffer_view
check.expect_eq(%view, %view_21) : !hal.buffer_view
return
}
}
// -----// IR Dump After mlir::iree_compiler::IREE::HAL::FixupLegacySyncPass (iree-hal-fixup-legacy-sync) ('builtin.module' operation) //----- //
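// [annotation, not part of the compiler dump] Because the device target is marked legacy_sync, FixupLegacySyncPass forces synchronous behavior: note the extra hal.fence.await on the wait fence inserted immediately before each hal.device.queue.execute in @_split_reduction_pass2 below.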
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
func.func @split_reduction_pass2() attributes {iree.abi.stub} {
call @_split_reduction_pass2() : () -> ()
return
}
hal.executable private @_split_reduction_pass2_dispatch_0 {
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
hal.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 ordinal(0) layout(#pipeline_layout) attributes {translation_info = #translation} {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index):
%c1 = arith.constant 1 : index
%c16 = arith.constant 16 : index
%c8 = arith.constant 8 : index
hal.return %c8, %c16, %c1 : index, index, index
}
builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} {
llvm.func internal @_split_reduction_pass2_dispatch_0_generic_512x256x128(%arg0: !llvm.ptr<struct<"iree_hal_executable_environment_v0_t", (ptr<i32>, ptr<func<i32 (ptr<func<i32 (ptr<i8>)>>, ptr<i8>)>>, ptr<ptr<func<i32 (ptr<i8>)>>>, struct<"iree_hal_processor_v0_t", (array<8 x i64>)>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>> {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {sym_visibility = "private"} {
%0 = llvm.mlir.constant(0 : i32) : i32
%1 = llvm.mlir.constant(32768 : i64) : i64
%2 = llvm.mlir.constant(128 : i64) : i64
%3 = llvm.mlir.constant(256 : i64) : i64
%4 = llvm.mlir.constant(1 : i64) : i64
%5 = llvm.mlir.constant(63 : index) : i64
%6 = llvm.mlir.constant(256 : index) : i64
%7 = llvm.mlir.constant(32768 : index) : i64
%8 = llvm.mlir.constant(512 : index) : i64
%9 = llvm.mlir.constant(0 : i64) : i64
%10 = llvm.mlir.constant(dense<0> : vector<4xi32>) : vector<4xi32>
%11 = llvm.mlir.constant(0 : index) : i64
%12 = llvm.mlir.constant(dense<0> : vector<1xi32>) : vector<1xi32>
%13 = llvm.mlir.constant(128 : index) : i64
%14 = llvm.mlir.constant(4 : index) : i64
%15 = llvm.mlir.constant(1 : index) : i64
%16 = llvm.mlir.constant(32 : index) : i64
%17 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>>
%18 = llvm.extractvalue %17[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>
%19 = llvm.load %18 : !llvm.ptr<ptr<i8>>
%20 = llvm.bitcast %19 : !llvm.ptr<i8> to !llvm.ptr<i32>
%21 = llvm.ptrtoint %20 : !llvm.ptr<i32> to i64
%22 = llvm.and %21, %5 : i64
%23 = llvm.icmp "eq" %22, %11 : i64
"llvm.intr.assume"(%23) : (i1) -> ()
%24 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>>
%25 = llvm.extractvalue %24[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>
%26 = llvm.getelementptr %25[1] : (!llvm.ptr<ptr<i8>>) -> !llvm.ptr<ptr<i8>>
%27 = llvm.load %26 : !llvm.ptr<ptr<i8>>
%28 = llvm.bitcast %27 : !llvm.ptr<i8> to !llvm.ptr<i32>
%29 = llvm.ptrtoint %28 : !llvm.ptr<i32> to i64
%30 = llvm.and %29, %5 : i64
%31 = llvm.icmp "eq" %30, %11 : i64
"llvm.intr.assume"(%31) : (i1) -> ()
%32 = llvm.load %arg2 : !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>>
%33 = llvm.extractvalue %32[0] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>
%34 = llvm.zext %33 : i32 to i64
%35 = llvm.extractvalue %32[1] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>
%36 = llvm.zext %35 : i32 to i64
%37 = llvm.mul %36, %16 : i64
%38 = llvm.mul %34, %16 : i64
%39 = llvm.mlir.null : !llvm.ptr<i32>
%40 = llvm.getelementptr %39[4] : (!llvm.ptr<i32>) -> !llvm.ptr<i32>
%41 = llvm.ptrtoint %40 : !llvm.ptr<i32> to i64
%42 = llvm.alloca %41 x i32 {alignment = 128 : i64} : (i64) -> !llvm.ptr<i32>
%43 = llvm.mul %11, %14 : i64
%44 = llvm.add %43, %43 : i64
%45 = llvm.add %44, %11 : i64
%46 = llvm.getelementptr %42[%45] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32>
%47 = llvm.bitcast %46 : !llvm.ptr<i32> to !llvm.ptr<vector<4xi32>>
llvm.store %10, %47 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>>
%48 = llvm.bitcast %42 : !llvm.ptr<i32> to !llvm.ptr<vector<4xi32>>
%49 = llvm.load %48 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>>
%50 = llvm.mul %37, %6 : i64
%51 = llvm.add %50, %11 : i64
%52 = llvm.mul %38, %15 : i64
%53 = llvm.add %51, %52 : i64
%54 = llvm.mul %37, %7 : i64
%55 = llvm.add %54, %11 : i64
%56 = llvm.mul %38, %13 : i64
%57 = llvm.add %55, %56 : i64
%58 = llvm.mul %9, %15 : i64
%59 = llvm.add %57, %58 : i64
llvm.br ^bb1(%11 : i64)
^bb1(%60: i64): // 2 preds: ^bb0, ^bb10
%61 = llvm.icmp "slt" %60, %16 : i64
llvm.cond_br %61, ^bb2(%11 : i64), ^bb11
^bb2(%62: i64): // 2 preds: ^bb1, ^bb9
%63 = llvm.icmp "slt" %62, %16 : i64
llvm.cond_br %63, ^bb3, ^bb10
^bb3: // pred: ^bb2
%64 = llvm.mul %60, %1 : i64
%65 = llvm.add %59, %64 : i64
%66 = llvm.mul %62, %2 : i64
%67 = llvm.add %65, %66 : i64
%68 = llvm.mul %9, %4 : i64
%69 = llvm.add %67, %68 : i64
%70 = llvm.mul %60, %3 : i64
%71 = llvm.add %53, %70 : i64
%72 = llvm.mul %62, %4 : i64
%73 = llvm.add %71, %72 : i64
llvm.br ^bb4(%11 : i64)
^bb4(%74: i64): // 2 preds: ^bb3, ^bb8
%75 = llvm.icmp "slt" %74, %13 : i64
llvm.cond_br %75, ^bb5, ^bb9
^bb5: // pred: ^bb4
%76 = llvm.mul %9, %8 : i64
%77 = llvm.add %69, %76 : i64
%78 = llvm.mul %9, %13 : i64
%79 = llvm.add %77, %78 : i64
%80 = llvm.mul %74, %15 : i64
%81 = llvm.add %79, %80 : i64
llvm.br ^bb6(%11 : i64)
^bb6(%82: i64): // 2 preds: ^bb5, ^bb7
%83 = llvm.icmp "slt" %82, %14 : i64
llvm.cond_br %83, ^bb7, ^bb8
^bb7: // pred: ^bb6
%84 = llvm.add %81, %76 : i64
%85 = llvm.mul %82, %13 : i64
%86 = llvm.add %84, %85 : i64
%87 = llvm.add %86, %58 : i64
%88 = llvm.mul %9, %14 : i64
%89 = llvm.add %73, %88 : i64
%90 = llvm.mul %82, %15 : i64
%91 = llvm.add %89, %90 : i64
%92 = llvm.add %87, %43 : i64
%93 = llvm.add %92, %43 : i64
%94 = llvm.add %93, %43 : i64
%95 = llvm.add %94, %11 : i64
%96 = llvm.getelementptr %20[%95] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32>
%97 = llvm.bitcast %96 : !llvm.ptr<i32> to !llvm.ptr<vector<4xi32>>
%98 = llvm.load %97 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>>
%99 = llvm.add %98, %49 : vector<4xi32>
llvm.store %99, %48 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>>
%100 = llvm.add %43, %11 : i64
%101 = llvm.getelementptr %42[%100] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32>
%102 = llvm.bitcast %101 : !llvm.ptr<i32> to !llvm.ptr<vector<4xi32>>
%103 = llvm.load %102 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>>
%104 = llvm.getelementptr %28[%91] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32>
%105 = llvm.load %104 : !llvm.ptr<i32>
%106 = llvm.mlir.undef : vector<1xi32>
%107 = llvm.insertelement %105, %106[%0 : i32] : vector<1xi32>
%108 = llvm.extractelement %107[%11 : i64] : vector<1xi32>
%109 = "llvm.intr.vector.reduce.add"(%103) : (vector<4xi32>) -> i32
%110 = llvm.add %108, %109 : i32
%111 = llvm.insertelement %110, %12[%11 : i64] : vector<1xi32>
%112 = llvm.extractelement %111[%9 : i64] : vector<1xi32>
%113 = llvm.insertelement %112, %106[%0 : i32] : vector<1xi32>
%114 = llvm.extractelement %113[%11 : i64] : vector<1xi32>
llvm.store %114, %104 : !llvm.ptr<i32>
%115 = llvm.add %82, %15 : i64
llvm.br ^bb6(%115 : i64)
^bb8: // pred: ^bb6
%116 = llvm.add %74, %14 : i64
llvm.br ^bb4(%116 : i64)
^bb9: // pred: ^bb4
%117 = llvm.add %62, %14 : i64
llvm.br ^bb2(%117 : i64)
^bb10: // pred: ^bb2
%118 = llvm.add %60, %15 : i64
llvm.br ^bb1(%118 : i64)
^bb11: // pred: ^bb1
llvm.return %0 : i32
}
}
}
}
func.func private @_split_reduction_pass2() {
%c1048576 = arith.constant 1048576 : index
%c0 = arith.constant 0 : index
%c524288 = arith.constant 524288 : index
%c67108864 = arith.constant 67108864 : index
%c1_i32 = arith.constant 1 : i32
%c128_i32 = arith.constant 128 : i32
%c512 = arith.constant 512 : index
%c256 = arith.constant 256 : index
%c1 = arith.constant 1 : index
%device = hal.ex.shared_device : !hal.device
%allocator = hal.device.allocator<%device : !hal.device> : !hal.allocator
%buffer = hal.allocator.allocate<%allocator : !hal.allocator> type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c67108864}
%device_0 = hal.ex.shared_device : !hal.device
%cmd = hal.command_buffer.create device(%device_0 : !hal.device) mode("OneShot|AllowInlineExecution") categories(Transfer) : !hal.command_buffer
hal.command_buffer.fill_buffer<%cmd : !hal.command_buffer> target(%buffer : !hal.buffer)[%c0, %c67108864] pattern(%c1_i32 : i32)
hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None")
hal.command_buffer.finalize<%cmd : !hal.command_buffer>
%0 = util.null : !hal.fence
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%c-1_i64 = arith.constant -1 : i64
%c-1_i32 = arith.constant -1 : i32
%status = hal.fence.await until([%0]) timeout_millis(%c-1_i32) : i32
hal.device.queue.execute<%device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) commands([%cmd])
%c-1_i32_1 = arith.constant -1 : i32
%status_2 = hal.fence.await until([%fence]) timeout_millis(%c-1_i32_1) : i32
util.status.check_ok %status_2, "failed to wait on timepoint"
%1 = util.do_not_optimize(%buffer) : !hal.buffer
%len = hal.buffer.length<%1 : !hal.buffer> : index
%device_3 = hal.ex.shared_device : !hal.device
%allocator_4 = hal.device.allocator<%device_3 : !hal.device> : !hal.allocator
%buffer_5 = hal.allocator.allocate<%allocator_4 : !hal.allocator> type("HostVisible|DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage|MappingScoped|MappingAccessRandom|Mapping") : !hal.buffer{%c1048576}
%device_6 = hal.ex.shared_device : !hal.device
%cmd_7 = hal.command_buffer.create device(%device_6 : !hal.device) mode("OneShot|AllowInlineExecution") categories("Transfer|Dispatch") : !hal.command_buffer
%2 = hal.command_buffer.device<%cmd_7 : !hal.command_buffer> : !hal.device
hal.device.switch<%2 : !hal.device>
#hal.device.match.executable.format<"embedded-elf-x86_64"> {
%pipeline_layout = hal.pipeline_layout.lookup device(%2 : !hal.device) layout(#pipeline_layout) : !hal.pipeline_layout
%c0_26 = arith.constant 0 : index
%c1_27 = arith.constant 1 : index
%c0_28 = arith.constant 0 : index
hal.command_buffer.push_descriptor_set<%cmd_7 : !hal.command_buffer> layout(%pipeline_layout : !hal.pipeline_layout)[%c0_28] bindings([
%c0_26 = (%1 : !hal.buffer)[%c0, %len],
%c1_27 = (%buffer_5 : !hal.buffer)[%c0, %c1048576]
])
%c1_29 = arith.constant 1 : index
%c16 = arith.constant 16 : index
%c8 = arith.constant 8 : index
hal.command_buffer.dispatch.symbol<%cmd_7 : !hal.command_buffer> target(@_split_reduction_pass2_dispatch_0::@embedded_elf_x86_64::@_split_reduction_pass2_dispatch_0_generic_512x256x128) workgroups([%c8, %c16, %c1_29])
hal.return
}
hal.command_buffer.fill_buffer<%cmd_7 : !hal.command_buffer> target(%buffer_5 : !hal.buffer)[%c524288, %c524288] pattern(%c128_i32 : i32)
hal.command_buffer.execution_barrier<%cmd_7 : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None")
hal.command_buffer.finalize<%cmd_7 : !hal.command_buffer>
%3 = util.null : !hal.fence
%fence_8 = hal.fence.create device(%device_6 : !hal.device) flags("None") : !hal.fence
%c-1_i64_9 = arith.constant -1 : i64
%c-1_i32_10 = arith.constant -1 : i32
%status_11 = hal.fence.await until([%3]) timeout_millis(%c-1_i32_10) : i32
hal.device.queue.execute<%device_6 : !hal.device> affinity(%c-1_i64_9) wait(%3) signal(%fence_8) commands([%cmd_7])
%c-1_i32_12 = arith.constant -1 : i32
%status_13 = hal.fence.await until([%fence_8]) timeout_millis(%c-1_i32_12) : i32
util.status.check_ok %status_13, "failed to wait on timepoint"
%buffer_14 = hal.buffer.subspan<%buffer_5 : !hal.buffer>[%c0, %c524288] : !hal.buffer
%buffer_15 = hal.buffer.subspan<%buffer_5 : !hal.buffer>[%c524288, %c524288] : !hal.buffer
%c512_16 = arith.constant 512 : index
%c256_17 = arith.constant 256 : index
%c0_18 = arith.constant 0 : index
%c268435488_i32 = arith.constant 268435488 : i32
%c1_i32_19 = arith.constant 1 : i32
%view = hal.buffer_view.create buffer(%buffer_14 : !hal.buffer)[%c0_18, %c524288] shape([%c512_16, %c256_17]) type(%c268435488_i32) encoding(%c1_i32_19) : !hal.buffer_view
%c512_20 = arith.constant 512 : index
%c256_21 = arith.constant 256 : index
%c0_22 = arith.constant 0 : index
%c268435488_i32_23 = arith.constant 268435488 : i32
%c1_i32_24 = arith.constant 1 : i32
%view_25 = hal.buffer_view.create buffer(%buffer_15 : !hal.buffer)[%c0_22, %c524288] shape([%c512_20, %c256_21]) type(%c268435488_i32_23) encoding(%c1_i32_24) : !hal.buffer_view
check.expect_eq(%view, %view_25) : !hal.buffer_view
return
}
}
// -----// IR Dump After Canonicalizer (canonicalize) ('builtin.module' operation) //----- //
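// [annotation, not part of the compiler dump] Canonicalization cleans up the host function: duplicate arith.constant ops are merged and hoisted, the blocking awaits on the util.null fences are folded away, and hal.device.switch now keys directly off the creating device instead of re-querying it from the command buffer.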
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
func.func @split_reduction_pass2() attributes {iree.abi.stub} {
call @_split_reduction_pass2() : () -> ()
return
}
hal.executable private @_split_reduction_pass2_dispatch_0 {
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
hal.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 ordinal(0) layout(#pipeline_layout) attributes {translation_info = #translation} {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index):
%c1 = arith.constant 1 : index
%c16 = arith.constant 16 : index
%c8 = arith.constant 8 : index
hal.return %c8, %c16, %c1 : index, index, index
}
builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} {
llvm.func internal @_split_reduction_pass2_dispatch_0_generic_512x256x128(%arg0: !llvm.ptr<struct<"iree_hal_executable_environment_v0_t", (ptr<i32>, ptr<func<i32 (ptr<func<i32 (ptr<i8>)>>, ptr<i8>)>>, ptr<ptr<func<i32 (ptr<i8>)>>>, struct<"iree_hal_processor_v0_t", (array<8 x i64>)>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>> {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {sym_visibility = "private"} {
%0 = llvm.mlir.constant(0 : i32) : i32
%1 = llvm.mlir.constant(32768 : i64) : i64
%2 = llvm.mlir.constant(128 : i64) : i64
%3 = llvm.mlir.constant(256 : i64) : i64
%4 = llvm.mlir.constant(1 : i64) : i64
%5 = llvm.mlir.constant(63 : index) : i64
%6 = llvm.mlir.constant(256 : index) : i64
%7 = llvm.mlir.constant(32768 : index) : i64
%8 = llvm.mlir.constant(512 : index) : i64
%9 = llvm.mlir.constant(0 : i64) : i64
%10 = llvm.mlir.constant(dense<0> : vector<4xi32>) : vector<4xi32>
%11 = llvm.mlir.constant(0 : index) : i64
%12 = llvm.mlir.constant(dense<0> : vector<1xi32>) : vector<1xi32>
%13 = llvm.mlir.constant(128 : index) : i64
%14 = llvm.mlir.constant(4 : index) : i64
%15 = llvm.mlir.constant(1 : index) : i64
%16 = llvm.mlir.constant(32 : index) : i64
%17 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>>
%18 = llvm.extractvalue %17[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>
%19 = llvm.load %18 : !llvm.ptr<ptr<i8>>
%20 = llvm.bitcast %19 : !llvm.ptr<i8> to !llvm.ptr<i32>
%21 = llvm.ptrtoint %20 : !llvm.ptr<i32> to i64
%22 = llvm.and %21, %5 : i64
%23 = llvm.icmp "eq" %22, %11 : i64
"llvm.intr.assume"(%23) : (i1) -> ()
%24 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>>
%25 = llvm.extractvalue %24[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>
%26 = llvm.getelementptr %25[1] : (!llvm.ptr<ptr<i8>>) -> !llvm.ptr<ptr<i8>>
%27 = llvm.load %26 : !llvm.ptr<ptr<i8>>
%28 = llvm.bitcast %27 : !llvm.ptr<i8> to !llvm.ptr<i32>
%29 = llvm.ptrtoint %28 : !llvm.ptr<i32> to i64
%30 = llvm.and %29, %5 : i64
%31 = llvm.icmp "eq" %30, %11 : i64
"llvm.intr.assume"(%31) : (i1) -> ()
%32 = llvm.load %arg2 : !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>>
%33 = llvm.extractvalue %32[0] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>
%34 = llvm.zext %33 : i32 to i64
%35 = llvm.extractvalue %32[1] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>
%36 = llvm.zext %35 : i32 to i64
%37 = llvm.mul %36, %16 : i64
%38 = llvm.mul %34, %16 : i64
%39 = llvm.mlir.null : !llvm.ptr<i32>
%40 = llvm.getelementptr %39[4] : (!llvm.ptr<i32>) -> !llvm.ptr<i32>
%41 = llvm.ptrtoint %40 : !llvm.ptr<i32> to i64
%42 = llvm.alloca %41 x i32 {alignment = 128 : i64} : (i64) -> !llvm.ptr<i32>
%43 = llvm.mul %11, %14 : i64
%44 = llvm.add %43, %43 : i64
%45 = llvm.add %44, %11 : i64
%46 = llvm.getelementptr %42[%45] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32>
%47 = llvm.bitcast %46 : !llvm.ptr<i32> to !llvm.ptr<vector<4xi32>>
llvm.store %10, %47 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>>
%48 = llvm.bitcast %42 : !llvm.ptr<i32> to !llvm.ptr<vector<4xi32>>
%49 = llvm.load %48 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>>
%50 = llvm.mul %37, %6 : i64
%51 = llvm.add %50, %11 : i64
%52 = llvm.mul %38, %15 : i64
%53 = llvm.add %51, %52 : i64
%54 = llvm.mul %37, %7 : i64
%55 = llvm.add %54, %11 : i64
%56 = llvm.mul %38, %13 : i64
%57 = llvm.add %55, %56 : i64
%58 = llvm.mul %9, %15 : i64
%59 = llvm.add %57, %58 : i64
llvm.br ^bb1(%11 : i64)
^bb1(%60: i64): // 2 preds: ^bb0, ^bb10
%61 = llvm.icmp "slt" %60, %16 : i64
llvm.cond_br %61, ^bb2(%11 : i64), ^bb11
^bb2(%62: i64): // 2 preds: ^bb1, ^bb9
%63 = llvm.icmp "slt" %62, %16 : i64
llvm.cond_br %63, ^bb3, ^bb10
^bb3: // pred: ^bb2
%64 = llvm.mul %60, %1 : i64
%65 = llvm.add %59, %64 : i64
%66 = llvm.mul %62, %2 : i64
%67 = llvm.add %65, %66 : i64
%68 = llvm.mul %9, %4 : i64
%69 = llvm.add %67, %68 : i64
%70 = llvm.mul %60, %3 : i64
%71 = llvm.add %53, %70 : i64
%72 = llvm.mul %62, %4 : i64
%73 = llvm.add %71, %72 : i64
llvm.br ^bb4(%11 : i64)
^bb4(%74: i64): // 2 preds: ^bb3, ^bb8
%75 = llvm.icmp "slt" %74, %13 : i64
llvm.cond_br %75, ^bb5, ^bb9
^bb5: // pred: ^bb4
%76 = llvm.mul %9, %8 : i64
%77 = llvm.add %69, %76 : i64
%78 = llvm.mul %9, %13 : i64
%79 = llvm.add %77, %78 : i64
%80 = llvm.mul %74, %15 : i64
%81 = llvm.add %79, %80 : i64
llvm.br ^bb6(%11 : i64)
^bb6(%82: i64): // 2 preds: ^bb5, ^bb7
%83 = llvm.icmp "slt" %82, %14 : i64
llvm.cond_br %83, ^bb7, ^bb8
^bb7: // pred: ^bb6
%84 = llvm.add %81, %76 : i64
%85 = llvm.mul %82, %13 : i64
%86 = llvm.add %84, %85 : i64
%87 = llvm.add %86, %58 : i64
%88 = llvm.mul %9, %14 : i64
%89 = llvm.add %73, %88 : i64
%90 = llvm.mul %82, %15 : i64
%91 = llvm.add %89, %90 : i64
%92 = llvm.add %87, %43 : i64
%93 = llvm.add %92, %43 : i64
%94 = llvm.add %93, %43 : i64
%95 = llvm.add %94, %11 : i64
%96 = llvm.getelementptr %20[%95] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32>
%97 = llvm.bitcast %96 : !llvm.ptr<i32> to !llvm.ptr<vector<4xi32>>
%98 = llvm.load %97 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>>
%99 = llvm.add %98, %49 : vector<4xi32>
llvm.store %99, %48 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>>
%100 = llvm.add %43, %11 : i64
%101 = llvm.getelementptr %42[%100] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32>
%102 = llvm.bitcast %101 : !llvm.ptr<i32> to !llvm.ptr<vector<4xi32>>
%103 = llvm.load %102 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>>
%104 = llvm.getelementptr %28[%91] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32>
%105 = llvm.load %104 : !llvm.ptr<i32>
%106 = llvm.mlir.undef : vector<1xi32>
%107 = llvm.insertelement %105, %106[%0 : i32] : vector<1xi32>
%108 = llvm.extractelement %107[%11 : i64] : vector<1xi32>
%109 = "llvm.intr.vector.reduce.add"(%103) : (vector<4xi32>) -> i32
%110 = llvm.add %108, %109 : i32
%111 = llvm.insertelement %110, %12[%11 : i64] : vector<1xi32>
%112 = llvm.extractelement %111[%9 : i64] : vector<1xi32>
%113 = llvm.insertelement %112, %106[%0 : i32] : vector<1xi32>
%114 = llvm.extractelement %113[%11 : i64] : vector<1xi32>
llvm.store %114, %104 : !llvm.ptr<i32>
%115 = llvm.add %82, %15 : i64
llvm.br ^bb6(%115 : i64)
^bb8: // pred: ^bb6
%116 = llvm.add %74, %14 : i64
llvm.br ^bb4(%116 : i64)
^bb9: // pred: ^bb4
%117 = llvm.add %62, %14 : i64
llvm.br ^bb2(%117 : i64)
^bb10: // pred: ^bb2
%118 = llvm.add %60, %15 : i64
llvm.br ^bb1(%118 : i64)
^bb11: // pred: ^bb1
llvm.return %0 : i32
}
}
}
}
func.func private @_split_reduction_pass2() {
%c268435488_i32 = arith.constant 268435488 : i32
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c-1_i32 = arith.constant -1 : i32
%c-1_i64 = arith.constant -1 : i64
%c1048576 = arith.constant 1048576 : index
%c0 = arith.constant 0 : index
%c524288 = arith.constant 524288 : index
%c67108864 = arith.constant 67108864 : index
%c1_i32 = arith.constant 1 : i32
%c128_i32 = arith.constant 128 : i32
%c512 = arith.constant 512 : index
%c256 = arith.constant 256 : index
%c1 = arith.constant 1 : index
%device = hal.ex.shared_device : !hal.device
%allocator = hal.device.allocator<%device : !hal.device> : !hal.allocator
%buffer = hal.allocator.allocate<%allocator : !hal.allocator> type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c67108864}
%device_0 = hal.ex.shared_device : !hal.device
%cmd = hal.command_buffer.create device(%device_0 : !hal.device) mode("OneShot|AllowInlineExecution") categories(Transfer) : !hal.command_buffer
hal.command_buffer.fill_buffer<%cmd : !hal.command_buffer> target(%buffer : !hal.buffer)[%c0, %c67108864] pattern(%c1_i32 : i32)
hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None")
hal.command_buffer.finalize<%cmd : !hal.command_buffer>
%0 = util.null : !hal.fence
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
hal.device.queue.execute<%device_0 : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) commands([%cmd])
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.status.check_ok %status, "failed to wait on timepoint"
%1 = util.do_not_optimize(%buffer) : !hal.buffer
%len = hal.buffer.length<%1 : !hal.buffer> : index
%device_1 = hal.ex.shared_device : !hal.device
%allocator_2 = hal.device.allocator<%device_1 : !hal.device> : !hal.allocator
%buffer_3 = hal.allocator.allocate<%allocator_2 : !hal.allocator> type("HostVisible|DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage|MappingScoped|MappingAccessRandom|Mapping") : !hal.buffer{%c1048576}
%device_4 = hal.ex.shared_device : !hal.device
%cmd_5 = hal.command_buffer.create device(%device_4 : !hal.device) mode("OneShot|AllowInlineExecution") categories("Transfer|Dispatch") : !hal.command_buffer
hal.device.switch<%device_4 : !hal.device>
#hal.device.match.executable.format<"embedded-elf-x86_64"> {
%pipeline_layout = hal.pipeline_layout.lookup device(%device_4 : !hal.device) layout(#pipeline_layout) : !hal.pipeline_layout
hal.command_buffer.push_descriptor_set<%cmd_5 : !hal.command_buffer> layout(%pipeline_layout : !hal.pipeline_layout)[%c0] bindings([
%c0 = (%1 : !hal.buffer)[%c0, %len],
%c1 = (%buffer_3 : !hal.buffer)[%c0, %c1048576]
])
hal.command_buffer.dispatch.symbol<%cmd_5 : !hal.command_buffer> target(@_split_reduction_pass2_dispatch_0::@embedded_elf_x86_64::@_split_reduction_pass2_dispatch_0_generic_512x256x128) workgroups([%c8, %c16, %c1])
hal.return
}
hal.command_buffer.fill_buffer<%cmd_5 : !hal.command_buffer> target(%buffer_3 : !hal.buffer)[%c524288, %c524288] pattern(%c128_i32 : i32)
hal.command_buffer.execution_barrier<%cmd_5 : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None")
hal.command_buffer.finalize<%cmd_5 : !hal.command_buffer>
%2 = util.null : !hal.fence
%fence_6 = hal.fence.create device(%device_4 : !hal.device) flags("None") : !hal.fence
hal.device.queue.execute<%device_4 : !hal.device> affinity(%c-1_i64) wait(%2) signal(%fence_6) commands([%cmd_5])
%status_7 = hal.fence.await until([%fence_6]) timeout_millis(%c-1_i32) : i32
util.status.check_ok %status_7, "failed to wait on timepoint"
%view = hal.buffer_view.create buffer(%buffer_3 : !hal.buffer)[%c0, %c524288] shape([%c512, %c256]) type(%c268435488_i32) encoding(%c1_i32) : !hal.buffer_view
%view_8 = hal.buffer_view.create buffer(%buffer_3 : !hal.buffer)[%c524288, %c524288] shape([%c512, %c256]) type(%c268435488_i32) encoding(%c1_i32) : !hal.buffer_view
check.expect_eq(%view, %view_8) : !hal.buffer_view
return
}
}
// -----// IR Dump After CSE (cse) ('builtin.module' operation) //----- //
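// [annotation, not part of the compiler dump] CSE deduplicates identical value computations across the module; the llvm-dialect executable body below is reprinted unchanged from the previous dump.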
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
func.func @split_reduction_pass2() attributes {iree.abi.stub} {
call @_split_reduction_pass2() : () -> ()
return
}
hal.executable private @_split_reduction_pass2_dispatch_0 {
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
hal.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 ordinal(0) layout(#pipeline_layout) attributes {translation_info = #translation} {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index):
%c1 = arith.constant 1 : index
%c16 = arith.constant 16 : index
%c8 = arith.constant 8 : index
hal.return %c8, %c16, %c1 : index, index, index
}
builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} {
llvm.func internal @_split_reduction_pass2_dispatch_0_generic_512x256x128(%arg0: !llvm.ptr<struct<"iree_hal_executable_environment_v0_t", (ptr<i32>, ptr<func<i32 (ptr<func<i32 (ptr<i8>)>>, ptr<i8>)>>, ptr<ptr<func<i32 (ptr<i8>)>>>, struct<"iree_hal_processor_v0_t", (array<8 x i64>)>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>> {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {sym_visibility = "private"} {
%0 = llvm.mlir.constant(0 : i32) : i32
%1 = llvm.mlir.constant(32768 : i64) : i64
%2 = llvm.mlir.constant(128 : i64) : i64
%3 = llvm.mlir.constant(256 : i64) : i64
%4 = llvm.mlir.constant(1 : i64) : i64
%5 = llvm.mlir.constant(63 : index) : i64
%6 = llvm.mlir.constant(256 : index) : i64
%7 = llvm.mlir.constant(32768 : index) : i64
%8 = llvm.mlir.constant(512 : index) : i64
%9 = llvm.mlir.constant(0 : i64) : i64
%10 = llvm.mlir.constant(dense<0> : vector<4xi32>) : vector<4xi32>
%11 = llvm.mlir.constant(0 : index) : i64
%12 = llvm.mlir.constant(dense<0> : vector<1xi32>) : vector<1xi32>
%13 = llvm.mlir.constant(128 : index) : i64
%14 = llvm.mlir.constant(4 : index) : i64
%15 = llvm.mlir.constant(1 : index) : i64
%16 = llvm.mlir.constant(32 : index) : i64
%17 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>>
%18 = llvm.extractvalue %17[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>
%19 = llvm.load %18 : !llvm.ptr<ptr<i8>>
%20 = llvm.bitcast %19 : !llvm.ptr<i8> to !llvm.ptr<i32>
%21 = llvm.ptrtoint %20 : !llvm.ptr<i32> to i64
%22 = llvm.and %21, %5 : i64
%23 = llvm.icmp "eq" %22, %11 : i64
"llvm.intr.assume"(%23) : (i1) -> ()
%24 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>>
%25 = llvm.extractvalue %24[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>
%26 = llvm.getelementptr %25[1] : (!llvm.ptr<ptr<i8>>) -> !llvm.ptr<ptr<i8>>
%27 = llvm.load %26 : !llvm.ptr<ptr<i8>>
%28 = llvm.bitcast %27 : !llvm.ptr<i8> to !llvm.ptr<i32>
%29 = llvm.ptrtoint %28 : !llvm.ptr<i32> to i64
%30 = llvm.and %29, %5 : i64
%31 = llvm.icmp "eq" %30, %11 : i64
"llvm.intr.assume"(%31) : (i1) -> ()
%32 = llvm.load %arg2 : !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>>
%33 = llvm.extractvalue %32[0] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>
%34 = llvm.zext %33 : i32 to i64
%35 = llvm.extractvalue %32[1] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>
%36 = llvm.zext %35 : i32 to i64
%37 = llvm.mul %36, %16 : i64
%38 = llvm.mul %34, %16 : i64
%39 = llvm.mlir.null : !llvm.ptr<i32>
%40 = llvm.getelementptr %39[4] : (!llvm.ptr<i32>) -> !llvm.ptr<i32>
%41 = llvm.ptrtoint %40 : !llvm.ptr<i32> to i64
%42 = llvm.alloca %41 x i32 {alignment = 128 : i64} : (i64) -> !llvm.ptr<i32>
%43 = llvm.mul %11, %14 : i64
%44 = llvm.add %43, %43 : i64
%45 = llvm.add %44, %11 : i64
%46 = llvm.getelementptr %42[%45] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32>
%47 = llvm.bitcast %46 : !llvm.ptr<i32> to !llvm.ptr<vector<4xi32>>
llvm.store %10, %47 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>>
%48 = llvm.bitcast %42 : !llvm.ptr<i32> to !llvm.ptr<vector<4xi32>>
%49 = llvm.load %48 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>>
%50 = llvm.mul %37, %6 : i64
%51 = llvm.add %50, %11 : i64
%52 = llvm.mul %38, %15 : i64
%53 = llvm.add %51, %52 : i64
%54 = llvm.mul %37, %7 : i64
%55 = llvm.add %54, %11 : i64
%56 = llvm.mul %38, %13 : i64
%57 = llvm.add %55, %56 : i64
%58 = llvm.mul %9, %15 : i64
%59 = llvm.add %57, %58 : i64
llvm.br ^bb1(%11 : i64)
^bb1(%60: i64): // 2 preds: ^bb0, ^bb10
%61 = llvm.icmp "slt" %60, %16 : i64
llvm.cond_br %61, ^bb2(%11 : i64), ^bb11
^bb2(%62: i64): // 2 preds: ^bb1, ^bb9
%63 = llvm.icmp "slt" %62, %16 : i64
llvm.cond_br %63, ^bb3, ^bb10
^bb3: // pred: ^bb2
%64 = llvm.mul %60, %1 : i64
%65 = llvm.add %59, %64 : i64
%66 = llvm.mul %62, %2 : i64
%67 = llvm.add %65, %66 : i64
%68 = llvm.mul %9, %4 : i64
%69 = llvm.add %67, %68 : i64
%70 = llvm.mul %60, %3 : i64
%71 = llvm.add %53, %70 : i64
%72 = llvm.mul %62, %4 : i64
%73 = llvm.add %71, %72 : i64
llvm.br ^bb4(%11 : i64)
^bb4(%74: i64): // 2 preds: ^bb3, ^bb8
%75 = llvm.icmp "slt" %74, %13 : i64
llvm.cond_br %75, ^bb5, ^bb9
^bb5: // pred: ^bb4
%76 = llvm.mul %9, %8 : i64
%77 = llvm.add %69, %76 : i64
%78 = llvm.mul %9, %13 : i64
%79 = llvm.add %77, %78 : i64
%80 = llvm.mul %74, %15 : i64
%81 = llvm.add %79, %80 : i64
llvm.br ^bb6(%11 : i64)
^bb6(%82: i64): // 2 preds: ^bb5, ^bb7
%83 = llvm.icmp "slt" %82, %14 : i64
llvm.cond_br %83, ^bb7, ^bb8
^bb7: // pred: ^bb6
%84 = llvm.add %81, %76 : i64
%85 = llvm.mul %82, %13 : i64
%86 = llvm.add %84, %85 : i64
%87 = llvm.add %86, %58 : i64
%88 = llvm.mul %9, %14 : i64
%89 = llvm.add %73, %88 : i64
%90 = llvm.mul %82, %15 : i64
%91 = llvm.add %89, %90 : i64
%92 = llvm.add %87, %43 : i64
%93 = llvm.add %92, %43 : i64
%94 = llvm.add %93, %43 : i64
%95 = llvm.add %94, %11 : i64
%96 = llvm.getelementptr %20[%95] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32>
%97 = llvm.bitcast %96 : !llvm.ptr<i32> to !llvm.ptr<vector<4xi32>>
%98 = llvm.load %97 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>>
%99 = llvm.add %98, %49 : vector<4xi32>
llvm.store %99, %48 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>>
%100 = llvm.add %43, %11 : i64
%101 = llvm.getelementptr %42[%100] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32>
%102 = llvm.bitcast %101 : !llvm.ptr<i32> to !llvm.ptr<vector<4xi32>>
%103 = llvm.load %102 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>>
%104 = llvm.getelementptr %28[%91] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32>
%105 = llvm.load %104 : !llvm.ptr<i32>
%106 = llvm.mlir.undef : vector<1xi32>
%107 = llvm.insertelement %105, %106[%0 : i32] : vector<1xi32>
%108 = llvm.extractelement %107[%11 : i64] : vector<1xi32>
%109 = "llvm.intr.vector.reduce.add"(%103) : (vector<4xi32>) -> i32
%110 = llvm.add %108, %109 : i32
%111 = llvm.insertelement %110, %12[%11 : i64] : vector<1xi32>
%112 = llvm.extractelement %111[%9 : i64] : vector<1xi32>
%113 = llvm.insertelement %112, %106[%0 : i32] : vector<1xi32>
%114 = llvm.extractelement %113[%11 : i64] : vector<1xi32>
llvm.store %114, %104 : !llvm.ptr<i32>
%115 = llvm.add %82, %15 : i64
llvm.br ^bb6(%115 : i64)
^bb8: // pred: ^bb6
%116 = llvm.add %74, %14 : i64
llvm.br ^bb4(%116 : i64)
^bb9: // pred: ^bb4
%117 = llvm.add %62, %14 : i64
llvm.br ^bb2(%117 : i64)
^bb10: // pred: ^bb2
%118 = llvm.add %60, %15 : i64
llvm.br ^bb1(%118 : i64)
^bb11: // pred: ^bb1
llvm.return %0 : i32
}
}
}
}
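  // Note: the llvm.func above is the lowered split-reduction kernel. Reading the
  // workgroup-id offsets (each id scaled by 32) against the 8x16x1 count returned by
  // the export region suggests every workgroup covers a 32x32 tile of the 512x256
  // output, while the 128-element reduction dimension is consumed four lanes at a
  // time. A minimal excerpt of the per-element pattern from ^bb7 above (eliding the
  // round trip through the 128-byte-aligned stack temporary):
  //   %98  = llvm.load %97 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>>   // four contiguous inputs
  //   %109 = "llvm.intr.vector.reduce.add"(%103) : (vector<4xi32>) -> i32     // horizontal add
  //   %110 = llvm.add %108, %109 : i32                                        // accumulate into the output element
  //   llvm.store %114, %104 : !llvm.ptr<i32>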
func.func private @_split_reduction_pass2() {
%c268435488_i32 = arith.constant 268435488 : i32
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c-1_i32 = arith.constant -1 : i32
%c-1_i64 = arith.constant -1 : i64
%c1048576 = arith.constant 1048576 : index
%c0 = arith.constant 0 : index
%c524288 = arith.constant 524288 : index
%c67108864 = arith.constant 67108864 : index
%c1_i32 = arith.constant 1 : i32
%c128_i32 = arith.constant 128 : i32
%c512 = arith.constant 512 : index
%c256 = arith.constant 256 : index
%c1 = arith.constant 1 : index
%device = hal.ex.shared_device : !hal.device
%allocator = hal.device.allocator<%device : !hal.device> : !hal.allocator
%buffer = hal.allocator.allocate<%allocator : !hal.allocator> type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c67108864}
%cmd = hal.command_buffer.create device(%device : !hal.device) mode("OneShot|AllowInlineExecution") categories(Transfer) : !hal.command_buffer
hal.command_buffer.fill_buffer<%cmd : !hal.command_buffer> target(%buffer : !hal.buffer)[%c0, %c67108864] pattern(%c1_i32 : i32)
hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None")
hal.command_buffer.finalize<%cmd : !hal.command_buffer>
%0 = util.null : !hal.fence
%fence = hal.fence.create device(%device : !hal.device) flags("None") : !hal.fence
hal.device.queue.execute<%device : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) commands([%cmd])
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.status.check_ok %status, "failed to wait on timepoint"
%1 = util.do_not_optimize(%buffer) : !hal.buffer
%len = hal.buffer.length<%1 : !hal.buffer> : index
%buffer_0 = hal.allocator.allocate<%allocator : !hal.allocator> type("HostVisible|DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage|MappingScoped|MappingAccessRandom|Mapping") : !hal.buffer{%c1048576}
%cmd_1 = hal.command_buffer.create device(%device : !hal.device) mode("OneShot|AllowInlineExecution") categories("Transfer|Dispatch") : !hal.command_buffer
hal.device.switch<%device : !hal.device>
#hal.device.match.executable.format<"embedded-elf-x86_64"> {
%pipeline_layout = hal.pipeline_layout.lookup device(%device : !hal.device) layout(#pipeline_layout) : !hal.pipeline_layout
hal.command_buffer.push_descriptor_set<%cmd_1 : !hal.command_buffer> layout(%pipeline_layout : !hal.pipeline_layout)[%c0] bindings([
%c0 = (%1 : !hal.buffer)[%c0, %len],
%c1 = (%buffer_0 : !hal.buffer)[%c0, %c1048576]
])
hal.command_buffer.dispatch.symbol<%cmd_1 : !hal.command_buffer> target(@_split_reduction_pass2_dispatch_0::@embedded_elf_x86_64::@_split_reduction_pass2_dispatch_0_generic_512x256x128) workgroups([%c8, %c16, %c1])
hal.return
}
hal.command_buffer.fill_buffer<%cmd_1 : !hal.command_buffer> target(%buffer_0 : !hal.buffer)[%c524288, %c524288] pattern(%c128_i32 : i32)
hal.command_buffer.execution_barrier<%cmd_1 : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None")
hal.command_buffer.finalize<%cmd_1 : !hal.command_buffer>
%fence_2 = hal.fence.create device(%device : !hal.device) flags("None") : !hal.fence
hal.device.queue.execute<%device : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence_2) commands([%cmd_1])
%status_3 = hal.fence.await until([%fence_2]) timeout_millis(%c-1_i32) : i32
util.status.check_ok %status_3, "failed to wait on timepoint"
%view = hal.buffer_view.create buffer(%buffer_0 : !hal.buffer)[%c0, %c524288] shape([%c512, %c256]) type(%c268435488_i32) encoding(%c1_i32) : !hal.buffer_view
%view_4 = hal.buffer_view.create buffer(%buffer_0 : !hal.buffer)[%c524288, %c524288] shape([%c512, %c256]) type(%c268435488_i32) encoding(%c1_i32) : !hal.buffer_view
check.expect_eq(%view, %view_4) : !hal.buffer_view
return
}
}
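// Note: the dump that follows is the result of iree-hal-resolve-export-ordinals,
// which rewrites the symbolic dispatch recorded in @_split_reduction_pass2 into an
// executable lookup plus an ordinal-based dispatch. Condensed from the dumps on
// either side of this point, the change at the dispatch site is:
//   // before:
//   hal.command_buffer.dispatch.symbol<%cmd_1 : !hal.command_buffer> target(@_split_reduction_pass2_dispatch_0::@embedded_elf_x86_64::@_split_reduction_pass2_dispatch_0_generic_512x256x128) workgroups([%c8, %c16, %c1])
//   // after:
//   %exe = hal.executable.lookup device(%2 : !hal.device) executable(@_split_reduction_pass2_dispatch_0) : !hal.executable
//   hal.command_buffer.dispatch<%cmd_1 : !hal.command_buffer> target(%exe : !hal.executable)[0] workgroups([%c8, %c16, %c1])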
// -----// IR Dump After mlir::iree_compiler::IREE::HAL::ResolveExportOrdinalsPass (iree-hal-resolve-export-ordinals) ('builtin.module' operation) //----- //
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
func.func @split_reduction_pass2() attributes {iree.abi.stub} {
call @_split_reduction_pass2() : () -> ()
return
}
hal.executable private @_split_reduction_pass2_dispatch_0 {
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
hal.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 ordinal(0) layout(#pipeline_layout) attributes {translation_info = #translation} {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index):
%c1 = arith.constant 1 : index
%c16 = arith.constant 16 : index
%c8 = arith.constant 8 : index
hal.return %c8, %c16, %c1 : index, index, index
}
builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} {
llvm.func internal @_split_reduction_pass2_dispatch_0_generic_512x256x128(%arg0: !llvm.ptr<struct<"iree_hal_executable_environment_v0_t", (ptr<i32>, ptr<func<i32 (ptr<func<i32 (ptr<i8>)>>, ptr<i8>)>>, ptr<ptr<func<i32 (ptr<i8>)>>>, struct<"iree_hal_processor_v0_t", (array<8 x i64>)>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>> {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {sym_visibility = "private"} {
%0 = llvm.mlir.constant(0 : i32) : i32
%1 = llvm.mlir.constant(32768 : i64) : i64
%2 = llvm.mlir.constant(128 : i64) : i64
%3 = llvm.mlir.constant(256 : i64) : i64
%4 = llvm.mlir.constant(1 : i64) : i64
%5 = llvm.mlir.constant(63 : index) : i64
%6 = llvm.mlir.constant(256 : index) : i64
%7 = llvm.mlir.constant(32768 : index) : i64
%8 = llvm.mlir.constant(512 : index) : i64
%9 = llvm.mlir.constant(0 : i64) : i64
%10 = llvm.mlir.constant(dense<0> : vector<4xi32>) : vector<4xi32>
%11 = llvm.mlir.constant(0 : index) : i64
%12 = llvm.mlir.constant(dense<0> : vector<1xi32>) : vector<1xi32>
%13 = llvm.mlir.constant(128 : index) : i64
%14 = llvm.mlir.constant(4 : index) : i64
%15 = llvm.mlir.constant(1 : index) : i64
%16 = llvm.mlir.constant(32 : index) : i64
%17 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>>
%18 = llvm.extractvalue %17[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>
%19 = llvm.load %18 : !llvm.ptr<ptr<i8>>
%20 = llvm.bitcast %19 : !llvm.ptr<i8> to !llvm.ptr<i32>
%21 = llvm.ptrtoint %20 : !llvm.ptr<i32> to i64
%22 = llvm.and %21, %5 : i64
%23 = llvm.icmp "eq" %22, %11 : i64
"llvm.intr.assume"(%23) : (i1) -> ()
%24 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>>
%25 = llvm.extractvalue %24[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>
%26 = llvm.getelementptr %25[1] : (!llvm.ptr<ptr<i8>>) -> !llvm.ptr<ptr<i8>>
%27 = llvm.load %26 : !llvm.ptr<ptr<i8>>
%28 = llvm.bitcast %27 : !llvm.ptr<i8> to !llvm.ptr<i32>
%29 = llvm.ptrtoint %28 : !llvm.ptr<i32> to i64
%30 = llvm.and %29, %5 : i64
%31 = llvm.icmp "eq" %30, %11 : i64
"llvm.intr.assume"(%31) : (i1) -> ()
%32 = llvm.load %arg2 : !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>>
%33 = llvm.extractvalue %32[0] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>
%34 = llvm.zext %33 : i32 to i64
%35 = llvm.extractvalue %32[1] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>
%36 = llvm.zext %35 : i32 to i64
%37 = llvm.mul %36, %16 : i64
%38 = llvm.mul %34, %16 : i64
%39 = llvm.mlir.null : !llvm.ptr<i32>
%40 = llvm.getelementptr %39[4] : (!llvm.ptr<i32>) -> !llvm.ptr<i32>
%41 = llvm.ptrtoint %40 : !llvm.ptr<i32> to i64
%42 = llvm.alloca %41 x i32 {alignment = 128 : i64} : (i64) -> !llvm.ptr<i32>
%43 = llvm.mul %11, %14 : i64
%44 = llvm.add %43, %43 : i64
%45 = llvm.add %44, %11 : i64
%46 = llvm.getelementptr %42[%45] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32>
%47 = llvm.bitcast %46 : !llvm.ptr<i32> to !llvm.ptr<vector<4xi32>>
llvm.store %10, %47 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>>
%48 = llvm.bitcast %42 : !llvm.ptr<i32> to !llvm.ptr<vector<4xi32>>
%49 = llvm.load %48 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>>
%50 = llvm.mul %37, %6 : i64
%51 = llvm.add %50, %11 : i64
%52 = llvm.mul %38, %15 : i64
%53 = llvm.add %51, %52 : i64
%54 = llvm.mul %37, %7 : i64
%55 = llvm.add %54, %11 : i64
%56 = llvm.mul %38, %13 : i64
%57 = llvm.add %55, %56 : i64
%58 = llvm.mul %9, %15 : i64
%59 = llvm.add %57, %58 : i64
llvm.br ^bb1(%11 : i64)
^bb1(%60: i64): // 2 preds: ^bb0, ^bb10
%61 = llvm.icmp "slt" %60, %16 : i64
llvm.cond_br %61, ^bb2(%11 : i64), ^bb11
^bb2(%62: i64): // 2 preds: ^bb1, ^bb9
%63 = llvm.icmp "slt" %62, %16 : i64
llvm.cond_br %63, ^bb3, ^bb10
^bb3: // pred: ^bb2
%64 = llvm.mul %60, %1 : i64
%65 = llvm.add %59, %64 : i64
%66 = llvm.mul %62, %2 : i64
%67 = llvm.add %65, %66 : i64
%68 = llvm.mul %9, %4 : i64
%69 = llvm.add %67, %68 : i64
%70 = llvm.mul %60, %3 : i64
%71 = llvm.add %53, %70 : i64
%72 = llvm.mul %62, %4 : i64
%73 = llvm.add %71, %72 : i64
llvm.br ^bb4(%11 : i64)
^bb4(%74: i64): // 2 preds: ^bb3, ^bb8
%75 = llvm.icmp "slt" %74, %13 : i64
llvm.cond_br %75, ^bb5, ^bb9
^bb5: // pred: ^bb4
%76 = llvm.mul %9, %8 : i64
%77 = llvm.add %69, %76 : i64
%78 = llvm.mul %9, %13 : i64
%79 = llvm.add %77, %78 : i64
%80 = llvm.mul %74, %15 : i64
%81 = llvm.add %79, %80 : i64
llvm.br ^bb6(%11 : i64)
^bb6(%82: i64): // 2 preds: ^bb5, ^bb7
%83 = llvm.icmp "slt" %82, %14 : i64
llvm.cond_br %83, ^bb7, ^bb8
^bb7: // pred: ^bb6
%84 = llvm.add %81, %76 : i64
%85 = llvm.mul %82, %13 : i64
%86 = llvm.add %84, %85 : i64
%87 = llvm.add %86, %58 : i64
%88 = llvm.mul %9, %14 : i64
%89 = llvm.add %73, %88 : i64
%90 = llvm.mul %82, %15 : i64
%91 = llvm.add %89, %90 : i64
%92 = llvm.add %87, %43 : i64
%93 = llvm.add %92, %43 : i64
%94 = llvm.add %93, %43 : i64
%95 = llvm.add %94, %11 : i64
%96 = llvm.getelementptr %20[%95] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32>
%97 = llvm.bitcast %96 : !llvm.ptr<i32> to !llvm.ptr<vector<4xi32>>
%98 = llvm.load %97 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>>
%99 = llvm.add %98, %49 : vector<4xi32>
llvm.store %99, %48 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>>
%100 = llvm.add %43, %11 : i64
%101 = llvm.getelementptr %42[%100] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32>
%102 = llvm.bitcast %101 : !llvm.ptr<i32> to !llvm.ptr<vector<4xi32>>
%103 = llvm.load %102 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>>
%104 = llvm.getelementptr %28[%91] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32>
%105 = llvm.load %104 : !llvm.ptr<i32>
%106 = llvm.mlir.undef : vector<1xi32>
%107 = llvm.insertelement %105, %106[%0 : i32] : vector<1xi32>
%108 = llvm.extractelement %107[%11 : i64] : vector<1xi32>
%109 = "llvm.intr.vector.reduce.add"(%103) : (vector<4xi32>) -> i32
%110 = llvm.add %108, %109 : i32
%111 = llvm.insertelement %110, %12[%11 : i64] : vector<1xi32>
%112 = llvm.extractelement %111[%9 : i64] : vector<1xi32>
%113 = llvm.insertelement %112, %106[%0 : i32] : vector<1xi32>
%114 = llvm.extractelement %113[%11 : i64] : vector<1xi32>
llvm.store %114, %104 : !llvm.ptr<i32>
%115 = llvm.add %82, %15 : i64
llvm.br ^bb6(%115 : i64)
^bb8: // pred: ^bb6
%116 = llvm.add %74, %14 : i64
llvm.br ^bb4(%116 : i64)
^bb9: // pred: ^bb4
%117 = llvm.add %62, %14 : i64
llvm.br ^bb2(%117 : i64)
^bb10: // pred: ^bb2
%118 = llvm.add %60, %15 : i64
llvm.br ^bb1(%118 : i64)
^bb11: // pred: ^bb1
llvm.return %0 : i32
}
}
}
}
func.func private @_split_reduction_pass2() {
%c268435488_i32 = arith.constant 268435488 : i32
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c-1_i32 = arith.constant -1 : i32
%c-1_i64 = arith.constant -1 : i64
%c1048576 = arith.constant 1048576 : index
%c0 = arith.constant 0 : index
%c524288 = arith.constant 524288 : index
%c67108864 = arith.constant 67108864 : index
%c1_i32 = arith.constant 1 : i32
%c128_i32 = arith.constant 128 : i32
%c512 = arith.constant 512 : index
%c256 = arith.constant 256 : index
%c1 = arith.constant 1 : index
%device = hal.ex.shared_device : !hal.device
%allocator = hal.device.allocator<%device : !hal.device> : !hal.allocator
%buffer = hal.allocator.allocate<%allocator : !hal.allocator> type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c67108864}
%cmd = hal.command_buffer.create device(%device : !hal.device) mode("OneShot|AllowInlineExecution") categories(Transfer) : !hal.command_buffer
hal.command_buffer.fill_buffer<%cmd : !hal.command_buffer> target(%buffer : !hal.buffer)[%c0, %c67108864] pattern(%c1_i32 : i32)
hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None")
hal.command_buffer.finalize<%cmd : !hal.command_buffer>
%0 = util.null : !hal.fence
%fence = hal.fence.create device(%device : !hal.device) flags("None") : !hal.fence
hal.device.queue.execute<%device : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) commands([%cmd])
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.status.check_ok %status, "failed to wait on timepoint"
%1 = util.do_not_optimize(%buffer) : !hal.buffer
%len = hal.buffer.length<%1 : !hal.buffer> : index
%buffer_0 = hal.allocator.allocate<%allocator : !hal.allocator> type("HostVisible|DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage|MappingScoped|MappingAccessRandom|Mapping") : !hal.buffer{%c1048576}
%cmd_1 = hal.command_buffer.create device(%device : !hal.device) mode("OneShot|AllowInlineExecution") categories("Transfer|Dispatch") : !hal.command_buffer
hal.device.switch<%device : !hal.device>
#hal.device.match.executable.format<"embedded-elf-x86_64"> {
%pipeline_layout = hal.pipeline_layout.lookup device(%device : !hal.device) layout(#pipeline_layout) : !hal.pipeline_layout
hal.command_buffer.push_descriptor_set<%cmd_1 : !hal.command_buffer> layout(%pipeline_layout : !hal.pipeline_layout)[%c0] bindings([
%c0 = (%1 : !hal.buffer)[%c0, %len],
%c1 = (%buffer_0 : !hal.buffer)[%c0, %c1048576]
])
%2 = hal.command_buffer.device<%cmd_1 : !hal.command_buffer> : !hal.device
%exe = hal.executable.lookup device(%2 : !hal.device) executable(@_split_reduction_pass2_dispatch_0) : !hal.executable
hal.command_buffer.dispatch<%cmd_1 : !hal.command_buffer> target(%exe : !hal.executable)[0] workgroups([%c8, %c16, %c1])
hal.return
}
hal.command_buffer.fill_buffer<%cmd_1 : !hal.command_buffer> target(%buffer_0 : !hal.buffer)[%c524288, %c524288] pattern(%c128_i32 : i32)
hal.command_buffer.execution_barrier<%cmd_1 : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None")
hal.command_buffer.finalize<%cmd_1 : !hal.command_buffer>
%fence_2 = hal.fence.create device(%device : !hal.device) flags("None") : !hal.fence
hal.device.queue.execute<%device : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence_2) commands([%cmd_1])
%status_3 = hal.fence.await until([%fence_2]) timeout_millis(%c-1_i32) : i32
util.status.check_ok %status_3, "failed to wait on timepoint"
%view = hal.buffer_view.create buffer(%buffer_0 : !hal.buffer)[%c0, %c524288] shape([%c512, %c256]) type(%c268435488_i32) encoding(%c1_i32) : !hal.buffer_view
%view_4 = hal.buffer_view.create buffer(%buffer_0 : !hal.buffer)[%c524288, %c524288] shape([%c512, %c256]) type(%c268435488_i32) encoding(%c1_i32) : !hal.buffer_view
check.expect_eq(%view, %view_4) : !hal.buffer_view
return
}
}
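// Note: the dump that follows is the result of iree-hal-materialize-resource-caches,
// which hoists descriptor set layout, pipeline layout, and executable creation out of
// the dispatch path into util.global values populated once by util.initializer
// regions; the command-buffer recording then only loads the cached handles:
//   %_pipeline_layout_0 = util.global.load @_pipeline_layout_0 : !hal.pipeline_layout
//   %_executable__split_reduction_pass2_dispatch_0 = util.global.load @_executable__split_reduction_pass2_dispatch_0 : !hal.executable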
// -----// IR Dump After mlir::iree_compiler::IREE::HAL::MaterializeResourceCachesPass (iree-hal-materialize-resource-caches) ('builtin.module' operation) //----- //
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
util.global private @_descriptor_set_layout_0 : !hal.descriptor_set_layout
util.initializer {
%device = hal.ex.shared_device : !hal.device
%descriptor_set_layout = hal.descriptor_set_layout.create device(%device : !hal.device) flags("None") bindings([#hal.descriptor_set.binding<0, storage_buffer, ReadOnly>, #hal.descriptor_set.binding<1, storage_buffer>]) : !hal.descriptor_set_layout
util.global.store %descriptor_set_layout, @_descriptor_set_layout_0 : !hal.descriptor_set_layout
util.initializer.return
}
util.global private @_pipeline_layout_0 : !hal.pipeline_layout
util.initializer {
%_descriptor_set_layout_0 = util.global.load @_descriptor_set_layout_0 : !hal.descriptor_set_layout
%device = hal.ex.shared_device : !hal.device
%pipeline_layout = hal.pipeline_layout.create device(%device : !hal.device) push_constants(0) layouts([%_descriptor_set_layout_0]) : !hal.pipeline_layout
util.global.store %pipeline_layout, @_pipeline_layout_0 : !hal.pipeline_layout
util.initializer.return
}
util.global private @_executable__split_reduction_pass2_dispatch_0 : !hal.executable
util.initializer {
%device = hal.ex.shared_device : !hal.device
%0 = hal.device.switch<%device : !hal.device> -> !hal.executable
#hal.device.match.executable.format<"embedded-elf-x86_64"> {
%_pipeline_layout_0 = util.global.load @_pipeline_layout_0 : !hal.pipeline_layout
%exe = hal.executable.create device(%device : !hal.device) target(@_split_reduction_pass2_dispatch_0::@embedded_elf_x86_64) layouts([%_pipeline_layout_0]) : !hal.executable
hal.return %exe : !hal.executable
},
#hal.match.always {
%1 = util.null : !hal.executable
hal.return %1 : !hal.executable
}
util.global.store %0, @_executable__split_reduction_pass2_dispatch_0 : !hal.executable
util.initializer.return
}
func.func @split_reduction_pass2() attributes {iree.abi.stub} {
call @_split_reduction_pass2() : () -> ()
return
}
hal.executable private @_split_reduction_pass2_dispatch_0 {
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
hal.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 ordinal(0) layout(#pipeline_layout) attributes {translation_info = #translation} {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index):
%c1 = arith.constant 1 : index
%c16 = arith.constant 16 : index
%c8 = arith.constant 8 : index
hal.return %c8, %c16, %c1 : index, index, index
}
builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} {
llvm.func internal @_split_reduction_pass2_dispatch_0_generic_512x256x128(%arg0: !llvm.ptr<struct<"iree_hal_executable_environment_v0_t", (ptr<i32>, ptr<func<i32 (ptr<func<i32 (ptr<i8>)>>, ptr<i8>)>>, ptr<ptr<func<i32 (ptr<i8>)>>>, struct<"iree_hal_processor_v0_t", (array<8 x i64>)>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>> {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {sym_visibility = "private"} {
%0 = llvm.mlir.constant(0 : i32) : i32
%1 = llvm.mlir.constant(32768 : i64) : i64
%2 = llvm.mlir.constant(128 : i64) : i64
%3 = llvm.mlir.constant(256 : i64) : i64
%4 = llvm.mlir.constant(1 : i64) : i64
%5 = llvm.mlir.constant(63 : index) : i64
%6 = llvm.mlir.constant(256 : index) : i64
%7 = llvm.mlir.constant(32768 : index) : i64
%8 = llvm.mlir.constant(512 : index) : i64
%9 = llvm.mlir.constant(0 : i64) : i64
%10 = llvm.mlir.constant(dense<0> : vector<4xi32>) : vector<4xi32>
%11 = llvm.mlir.constant(0 : index) : i64
%12 = llvm.mlir.constant(dense<0> : vector<1xi32>) : vector<1xi32>
%13 = llvm.mlir.constant(128 : index) : i64
%14 = llvm.mlir.constant(4 : index) : i64
%15 = llvm.mlir.constant(1 : index) : i64
%16 = llvm.mlir.constant(32 : index) : i64
%17 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>>
%18 = llvm.extractvalue %17[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>
%19 = llvm.load %18 : !llvm.ptr<ptr<i8>>
%20 = llvm.bitcast %19 : !llvm.ptr<i8> to !llvm.ptr<i32>
%21 = llvm.ptrtoint %20 : !llvm.ptr<i32> to i64
%22 = llvm.and %21, %5 : i64
%23 = llvm.icmp "eq" %22, %11 : i64
"llvm.intr.assume"(%23) : (i1) -> ()
%24 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>>
%25 = llvm.extractvalue %24[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>
%26 = llvm.getelementptr %25[1] : (!llvm.ptr<ptr<i8>>) -> !llvm.ptr<ptr<i8>>
%27 = llvm.load %26 : !llvm.ptr<ptr<i8>>
%28 = llvm.bitcast %27 : !llvm.ptr<i8> to !llvm.ptr<i32>
%29 = llvm.ptrtoint %28 : !llvm.ptr<i32> to i64
%30 = llvm.and %29, %5 : i64
%31 = llvm.icmp "eq" %30, %11 : i64
"llvm.intr.assume"(%31) : (i1) -> ()
%32 = llvm.load %arg2 : !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>>
%33 = llvm.extractvalue %32[0] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>
%34 = llvm.zext %33 : i32 to i64
%35 = llvm.extractvalue %32[1] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>
%36 = llvm.zext %35 : i32 to i64
%37 = llvm.mul %36, %16 : i64
%38 = llvm.mul %34, %16 : i64
%39 = llvm.mlir.null : !llvm.ptr<i32>
%40 = llvm.getelementptr %39[4] : (!llvm.ptr<i32>) -> !llvm.ptr<i32>
%41 = llvm.ptrtoint %40 : !llvm.ptr<i32> to i64
%42 = llvm.alloca %41 x i32 {alignment = 128 : i64} : (i64) -> !llvm.ptr<i32>
%43 = llvm.mul %11, %14 : i64
%44 = llvm.add %43, %43 : i64
%45 = llvm.add %44, %11 : i64
%46 = llvm.getelementptr %42[%45] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32>
%47 = llvm.bitcast %46 : !llvm.ptr<i32> to !llvm.ptr<vector<4xi32>>
llvm.store %10, %47 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>>
%48 = llvm.bitcast %42 : !llvm.ptr<i32> to !llvm.ptr<vector<4xi32>>
%49 = llvm.load %48 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>>
%50 = llvm.mul %37, %6 : i64
%51 = llvm.add %50, %11 : i64
%52 = llvm.mul %38, %15 : i64
%53 = llvm.add %51, %52 : i64
%54 = llvm.mul %37, %7 : i64
%55 = llvm.add %54, %11 : i64
%56 = llvm.mul %38, %13 : i64
%57 = llvm.add %55, %56 : i64
%58 = llvm.mul %9, %15 : i64
%59 = llvm.add %57, %58 : i64
llvm.br ^bb1(%11 : i64)
^bb1(%60: i64): // 2 preds: ^bb0, ^bb10
%61 = llvm.icmp "slt" %60, %16 : i64
llvm.cond_br %61, ^bb2(%11 : i64), ^bb11
^bb2(%62: i64): // 2 preds: ^bb1, ^bb9
%63 = llvm.icmp "slt" %62, %16 : i64
llvm.cond_br %63, ^bb3, ^bb10
^bb3: // pred: ^bb2
%64 = llvm.mul %60, %1 : i64
%65 = llvm.add %59, %64 : i64
%66 = llvm.mul %62, %2 : i64
%67 = llvm.add %65, %66 : i64
%68 = llvm.mul %9, %4 : i64
%69 = llvm.add %67, %68 : i64
%70 = llvm.mul %60, %3 : i64
%71 = llvm.add %53, %70 : i64
%72 = llvm.mul %62, %4 : i64
%73 = llvm.add %71, %72 : i64
llvm.br ^bb4(%11 : i64)
^bb4(%74: i64): // 2 preds: ^bb3, ^bb8
%75 = llvm.icmp "slt" %74, %13 : i64
llvm.cond_br %75, ^bb5, ^bb9
^bb5: // pred: ^bb4
%76 = llvm.mul %9, %8 : i64
%77 = llvm.add %69, %76 : i64
%78 = llvm.mul %9, %13 : i64
%79 = llvm.add %77, %78 : i64
%80 = llvm.mul %74, %15 : i64
%81 = llvm.add %79, %80 : i64
llvm.br ^bb6(%11 : i64)
^bb6(%82: i64): // 2 preds: ^bb5, ^bb7
%83 = llvm.icmp "slt" %82, %14 : i64
llvm.cond_br %83, ^bb7, ^bb8
^bb7: // pred: ^bb6
%84 = llvm.add %81, %76 : i64
%85 = llvm.mul %82, %13 : i64
%86 = llvm.add %84, %85 : i64
%87 = llvm.add %86, %58 : i64
%88 = llvm.mul %9, %14 : i64
%89 = llvm.add %73, %88 : i64
%90 = llvm.mul %82, %15 : i64
%91 = llvm.add %89, %90 : i64
%92 = llvm.add %87, %43 : i64
%93 = llvm.add %92, %43 : i64
%94 = llvm.add %93, %43 : i64
%95 = llvm.add %94, %11 : i64
%96 = llvm.getelementptr %20[%95] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32>
%97 = llvm.bitcast %96 : !llvm.ptr<i32> to !llvm.ptr<vector<4xi32>>
%98 = llvm.load %97 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>>
%99 = llvm.add %98, %49 : vector<4xi32>
llvm.store %99, %48 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>>
%100 = llvm.add %43, %11 : i64
%101 = llvm.getelementptr %42[%100] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32>
%102 = llvm.bitcast %101 : !llvm.ptr<i32> to !llvm.ptr<vector<4xi32>>
%103 = llvm.load %102 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>>
%104 = llvm.getelementptr %28[%91] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32>
%105 = llvm.load %104 : !llvm.ptr<i32>
%106 = llvm.mlir.undef : vector<1xi32>
%107 = llvm.insertelement %105, %106[%0 : i32] : vector<1xi32>
%108 = llvm.extractelement %107[%11 : i64] : vector<1xi32>
%109 = "llvm.intr.vector.reduce.add"(%103) : (vector<4xi32>) -> i32
%110 = llvm.add %108, %109 : i32
%111 = llvm.insertelement %110, %12[%11 : i64] : vector<1xi32>
%112 = llvm.extractelement %111[%9 : i64] : vector<1xi32>
%113 = llvm.insertelement %112, %106[%0 : i32] : vector<1xi32>
%114 = llvm.extractelement %113[%11 : i64] : vector<1xi32>
llvm.store %114, %104 : !llvm.ptr<i32>
%115 = llvm.add %82, %15 : i64
llvm.br ^bb6(%115 : i64)
^bb8: // pred: ^bb6
%116 = llvm.add %74, %14 : i64
llvm.br ^bb4(%116 : i64)
^bb9: // pred: ^bb4
%117 = llvm.add %62, %14 : i64
llvm.br ^bb2(%117 : i64)
^bb10: // pred: ^bb2
%118 = llvm.add %60, %15 : i64
llvm.br ^bb1(%118 : i64)
^bb11: // pred: ^bb1
llvm.return %0 : i32
}
}
}
}
func.func private @_split_reduction_pass2() {
%c268435488_i32 = arith.constant 268435488 : i32
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c-1_i32 = arith.constant -1 : i32
%c-1_i64 = arith.constant -1 : i64
%c1048576 = arith.constant 1048576 : index
%c0 = arith.constant 0 : index
%c524288 = arith.constant 524288 : index
%c67108864 = arith.constant 67108864 : index
%c1_i32 = arith.constant 1 : i32
%c128_i32 = arith.constant 128 : i32
%c512 = arith.constant 512 : index
%c256 = arith.constant 256 : index
%c1 = arith.constant 1 : index
%device = hal.ex.shared_device : !hal.device
%allocator = hal.device.allocator<%device : !hal.device> : !hal.allocator
%buffer = hal.allocator.allocate<%allocator : !hal.allocator> type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c67108864}
%cmd = hal.command_buffer.create device(%device : !hal.device) mode("OneShot|AllowInlineExecution") categories(Transfer) : !hal.command_buffer
hal.command_buffer.fill_buffer<%cmd : !hal.command_buffer> target(%buffer : !hal.buffer)[%c0, %c67108864] pattern(%c1_i32 : i32)
hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None")
hal.command_buffer.finalize<%cmd : !hal.command_buffer>
%0 = util.null : !hal.fence
%fence = hal.fence.create device(%device : !hal.device) flags("None") : !hal.fence
hal.device.queue.execute<%device : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) commands([%cmd])
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.status.check_ok %status, "failed to wait on timepoint"
%1 = util.do_not_optimize(%buffer) : !hal.buffer
%len = hal.buffer.length<%1 : !hal.buffer> : index
%buffer_0 = hal.allocator.allocate<%allocator : !hal.allocator> type("HostVisible|DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage|MappingScoped|MappingAccessRandom|Mapping") : !hal.buffer{%c1048576}
%cmd_1 = hal.command_buffer.create device(%device : !hal.device) mode("OneShot|AllowInlineExecution") categories("Transfer|Dispatch") : !hal.command_buffer
hal.device.switch<%device : !hal.device>
#hal.device.match.executable.format<"embedded-elf-x86_64"> {
%_pipeline_layout_0 = util.global.load @_pipeline_layout_0 : !hal.pipeline_layout
hal.command_buffer.push_descriptor_set<%cmd_1 : !hal.command_buffer> layout(%_pipeline_layout_0 : !hal.pipeline_layout)[%c0] bindings([
%c0 = (%1 : !hal.buffer)[%c0, %len],
%c1 = (%buffer_0 : !hal.buffer)[%c0, %c1048576]
])
%2 = hal.command_buffer.device<%cmd_1 : !hal.command_buffer> : !hal.device
%_executable__split_reduction_pass2_dispatch_0 = util.global.load @_executable__split_reduction_pass2_dispatch_0 : !hal.executable
hal.command_buffer.dispatch<%cmd_1 : !hal.command_buffer> target(%_executable__split_reduction_pass2_dispatch_0 : !hal.executable)[0] workgroups([%c8, %c16, %c1])
hal.return
}
hal.command_buffer.fill_buffer<%cmd_1 : !hal.command_buffer> target(%buffer_0 : !hal.buffer)[%c524288, %c524288] pattern(%c128_i32 : i32)
hal.command_buffer.execution_barrier<%cmd_1 : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None")
hal.command_buffer.finalize<%cmd_1 : !hal.command_buffer>
%fence_2 = hal.fence.create device(%device : !hal.device) flags("None") : !hal.fence
hal.device.queue.execute<%device : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence_2) commands([%cmd_1])
%status_3 = hal.fence.await until([%fence_2]) timeout_millis(%c-1_i32) : i32
util.status.check_ok %status_3, "failed to wait on timepoint"
%view = hal.buffer_view.create buffer(%buffer_0 : !hal.buffer)[%c0, %c524288] shape([%c512, %c256]) type(%c268435488_i32) encoding(%c1_i32) : !hal.buffer_view
%view_4 = hal.buffer_view.create buffer(%buffer_0 : !hal.buffer)[%c524288, %c524288] shape([%c512, %c256]) type(%c268435488_i32) encoding(%c1_i32) : !hal.buffer_view
check.expect_eq(%view, %view_4) : !hal.buffer_view
return
}
}
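// Note: the dump that follows is the result of iree-hal-inline-device-switches on the
// executable-cache initializer: the hal.device.switch region is flattened into plain
// control flow that queries the device for a supported executable format, with a
// util.null fallback when nothing matches:
//   %ok, %value = hal.device.query<%device : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false
//   cf.cond_br %value, ^bb1, ^bb2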
// -----// IR Dump After mlir::iree_compiler::IREE::HAL::InlineDeviceSwitchesPass (iree-hal-inline-device-switches) ('util.initializer' operation) //----- //
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
util.global private @_descriptor_set_layout_0 : !hal.descriptor_set_layout
util.initializer {
%device = hal.ex.shared_device : !hal.device
%descriptor_set_layout = hal.descriptor_set_layout.create device(%device : !hal.device) flags("None") bindings([#hal.descriptor_set.binding<0, storage_buffer, ReadOnly>, #hal.descriptor_set.binding<1, storage_buffer>]) : !hal.descriptor_set_layout
util.global.store %descriptor_set_layout, @_descriptor_set_layout_0 : !hal.descriptor_set_layout
util.initializer.return
}
util.global private @_pipeline_layout_0 : !hal.pipeline_layout
util.initializer {
%_descriptor_set_layout_0 = util.global.load @_descriptor_set_layout_0 : !hal.descriptor_set_layout
%device = hal.ex.shared_device : !hal.device
%pipeline_layout = hal.pipeline_layout.create device(%device : !hal.device) push_constants(0) layouts([%_descriptor_set_layout_0]) : !hal.pipeline_layout
util.global.store %pipeline_layout, @_pipeline_layout_0 : !hal.pipeline_layout
util.initializer.return
}
util.global private @_executable__split_reduction_pass2_dispatch_0 : !hal.executable
util.initializer {
%device = hal.ex.shared_device : !hal.device
%ok, %value = hal.device.query<%device : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false
cf.cond_br %value, ^bb1, ^bb2
^bb1: // pred: ^bb0
%_pipeline_layout_0 = util.global.load @_pipeline_layout_0 : !hal.pipeline_layout
%exe = hal.executable.create device(%device : !hal.device) target(@_split_reduction_pass2_dispatch_0::@embedded_elf_x86_64) layouts([%_pipeline_layout_0]) : !hal.executable
cf.br ^bb5(%exe : !hal.executable)
^bb2: // pred: ^bb0
%true = arith.constant true
cf.cond_br %true, ^bb3, ^bb4
^bb3: // pred: ^bb2
%0 = util.null : !hal.executable
cf.br ^bb5(%0 : !hal.executable)
^bb4: // pred: ^bb2
util.unreachable "device not supported in the compiled configuration"
^bb5(%1: !hal.executable): // 2 preds: ^bb1, ^bb3
util.global.store %1, @_executable__split_reduction_pass2_dispatch_0 : !hal.executable
util.initializer.return
}
func.func @split_reduction_pass2() attributes {iree.abi.stub} {
call @_split_reduction_pass2() : () -> ()
return
}
hal.executable private @_split_reduction_pass2_dispatch_0 {
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
hal.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 ordinal(0) layout(#pipeline_layout) attributes {translation_info = #translation} {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index):
%c1 = arith.constant 1 : index
%c16 = arith.constant 16 : index
%c8 = arith.constant 8 : index
hal.return %c8, %c16, %c1 : index, index, index
}
builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} {
llvm.func internal @_split_reduction_pass2_dispatch_0_generic_512x256x128(%arg0: !llvm.ptr<struct<"iree_hal_executable_environment_v0_t", (ptr<i32>, ptr<func<i32 (ptr<func<i32 (ptr<i8>)>>, ptr<i8>)>>, ptr<ptr<func<i32 (ptr<i8>)>>>, struct<"iree_hal_processor_v0_t", (array<8 x i64>)>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>> {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {sym_visibility = "private"} {
%0 = llvm.mlir.constant(0 : i32) : i32
%1 = llvm.mlir.constant(32768 : i64) : i64
%2 = llvm.mlir.constant(128 : i64) : i64
%3 = llvm.mlir.constant(256 : i64) : i64
%4 = llvm.mlir.constant(1 : i64) : i64
%5 = llvm.mlir.constant(63 : index) : i64
%6 = llvm.mlir.constant(256 : index) : i64
%7 = llvm.mlir.constant(32768 : index) : i64
%8 = llvm.mlir.constant(512 : index) : i64
%9 = llvm.mlir.constant(0 : i64) : i64
%10 = llvm.mlir.constant(dense<0> : vector<4xi32>) : vector<4xi32>
%11 = llvm.mlir.constant(0 : index) : i64
%12 = llvm.mlir.constant(dense<0> : vector<1xi32>) : vector<1xi32>
%13 = llvm.mlir.constant(128 : index) : i64
%14 = llvm.mlir.constant(4 : index) : i64
%15 = llvm.mlir.constant(1 : index) : i64
%16 = llvm.mlir.constant(32 : index) : i64
%17 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>>
%18 = llvm.extractvalue %17[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>
%19 = llvm.load %18 : !llvm.ptr<ptr<i8>>
%20 = llvm.bitcast %19 : !llvm.ptr<i8> to !llvm.ptr<i32>
%21 = llvm.ptrtoint %20 : !llvm.ptr<i32> to i64
%22 = llvm.and %21, %5 : i64
%23 = llvm.icmp "eq" %22, %11 : i64
"llvm.intr.assume"(%23) : (i1) -> ()
%24 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>>
%25 = llvm.extractvalue %24[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>
%26 = llvm.getelementptr %25[1] : (!llvm.ptr<ptr<i8>>) -> !llvm.ptr<ptr<i8>>
%27 = llvm.load %26 : !llvm.ptr<ptr<i8>>
%28 = llvm.bitcast %27 : !llvm.ptr<i8> to !llvm.ptr<i32>
%29 = llvm.ptrtoint %28 : !llvm.ptr<i32> to i64
%30 = llvm.and %29, %5 : i64
%31 = llvm.icmp "eq" %30, %11 : i64
"llvm.intr.assume"(%31) : (i1) -> ()
%32 = llvm.load %arg2 : !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>>
%33 = llvm.extractvalue %32[0] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>
%34 = llvm.zext %33 : i32 to i64
%35 = llvm.extractvalue %32[1] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>
%36 = llvm.zext %35 : i32 to i64
%37 = llvm.mul %36, %16 : i64
%38 = llvm.mul %34, %16 : i64
%39 = llvm.mlir.null : !llvm.ptr<i32>
%40 = llvm.getelementptr %39[4] : (!llvm.ptr<i32>) -> !llvm.ptr<i32>
%41 = llvm.ptrtoint %40 : !llvm.ptr<i32> to i64
%42 = llvm.alloca %41 x i32 {alignment = 128 : i64} : (i64) -> !llvm.ptr<i32>
%43 = llvm.mul %11, %14 : i64
%44 = llvm.add %43, %43 : i64
%45 = llvm.add %44, %11 : i64
%46 = llvm.getelementptr %42[%45] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32>
%47 = llvm.bitcast %46 : !llvm.ptr<i32> to !llvm.ptr<vector<4xi32>>
llvm.store %10, %47 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>>
%48 = llvm.bitcast %42 : !llvm.ptr<i32> to !llvm.ptr<vector<4xi32>>
%49 = llvm.load %48 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>>
%50 = llvm.mul %37, %6 : i64
%51 = llvm.add %50, %11 : i64
%52 = llvm.mul %38, %15 : i64
%53 = llvm.add %51, %52 : i64
%54 = llvm.mul %37, %7 : i64
%55 = llvm.add %54, %11 : i64
%56 = llvm.mul %38, %13 : i64
%57 = llvm.add %55, %56 : i64
%58 = llvm.mul %9, %15 : i64
%59 = llvm.add %57, %58 : i64
llvm.br ^bb1(%11 : i64)
^bb1(%60: i64): // 2 preds: ^bb0, ^bb10
%61 = llvm.icmp "slt" %60, %16 : i64
llvm.cond_br %61, ^bb2(%11 : i64), ^bb11
^bb2(%62: i64): // 2 preds: ^bb1, ^bb9
%63 = llvm.icmp "slt" %62, %16 : i64
llvm.cond_br %63, ^bb3, ^bb10
^bb3: // pred: ^bb2
%64 = llvm.mul %60, %1 : i64
%65 = llvm.add %59, %64 : i64
%66 = llvm.mul %62, %2 : i64
%67 = llvm.add %65, %66 : i64
%68 = llvm.mul %9, %4 : i64
%69 = llvm.add %67, %68 : i64
%70 = llvm.mul %60, %3 : i64
%71 = llvm.add %53, %70 : i64
%72 = llvm.mul %62, %4 : i64
%73 = llvm.add %71, %72 : i64
llvm.br ^bb4(%11 : i64)
^bb4(%74: i64): // 2 preds: ^bb3, ^bb8
%75 = llvm.icmp "slt" %74, %13 : i64
llvm.cond_br %75, ^bb5, ^bb9
^bb5: // pred: ^bb4
%76 = llvm.mul %9, %8 : i64
%77 = llvm.add %69, %76 : i64
%78 = llvm.mul %9, %13 : i64
%79 = llvm.add %77, %78 : i64
%80 = llvm.mul %74, %15 : i64
%81 = llvm.add %79, %80 : i64
llvm.br ^bb6(%11 : i64)
^bb6(%82: i64): // 2 preds: ^bb5, ^bb7
%83 = llvm.icmp "slt" %82, %14 : i64
llvm.cond_br %83, ^bb7, ^bb8
^bb7: // pred: ^bb6
%84 = llvm.add %81, %76 : i64
%85 = llvm.mul %82, %13 : i64
%86 = llvm.add %84, %85 : i64
%87 = llvm.add %86, %58 : i64
%88 = llvm.mul %9, %14 : i64
%89 = llvm.add %73, %88 : i64
%90 = llvm.mul %82, %15 : i64
%91 = llvm.add %89, %90 : i64
%92 = llvm.add %87, %43 : i64
%93 = llvm.add %92, %43 : i64
%94 = llvm.add %93, %43 : i64
%95 = llvm.add %94, %11 : i64
%96 = llvm.getelementptr %20[%95] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32>
%97 = llvm.bitcast %96 : !llvm.ptr<i32> to !llvm.ptr<vector<4xi32>>
%98 = llvm.load %97 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>>
%99 = llvm.add %98, %49 : vector<4xi32>
llvm.store %99, %48 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>>
%100 = llvm.add %43, %11 : i64
%101 = llvm.getelementptr %42[%100] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32>
%102 = llvm.bitcast %101 : !llvm.ptr<i32> to !llvm.ptr<vector<4xi32>>
%103 = llvm.load %102 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>>
%104 = llvm.getelementptr %28[%91] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32>
%105 = llvm.load %104 : !llvm.ptr<i32>
%106 = llvm.mlir.undef : vector<1xi32>
%107 = llvm.insertelement %105, %106[%0 : i32] : vector<1xi32>
%108 = llvm.extractelement %107[%11 : i64] : vector<1xi32>
%109 = "llvm.intr.vector.reduce.add"(%103) : (vector<4xi32>) -> i32
%110 = llvm.add %108, %109 : i32
%111 = llvm.insertelement %110, %12[%11 : i64] : vector<1xi32>
%112 = llvm.extractelement %111[%9 : i64] : vector<1xi32>
%113 = llvm.insertelement %112, %106[%0 : i32] : vector<1xi32>
%114 = llvm.extractelement %113[%11 : i64] : vector<1xi32>
llvm.store %114, %104 : !llvm.ptr<i32>
%115 = llvm.add %82, %15 : i64
llvm.br ^bb6(%115 : i64)
^bb8: // pred: ^bb6
%116 = llvm.add %74, %14 : i64
llvm.br ^bb4(%116 : i64)
^bb9: // pred: ^bb4
%117 = llvm.add %62, %14 : i64
llvm.br ^bb2(%117 : i64)
^bb10: // pred: ^bb2
%118 = llvm.add %60, %15 : i64
llvm.br ^bb1(%118 : i64)
^bb11: // pred: ^bb1
llvm.return %0 : i32
}
}
}
}
func.func private @_split_reduction_pass2() {
%c268435488_i32 = arith.constant 268435488 : i32
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c-1_i32 = arith.constant -1 : i32
%c-1_i64 = arith.constant -1 : i64
%c1048576 = arith.constant 1048576 : index
%c0 = arith.constant 0 : index
%c524288 = arith.constant 524288 : index
%c67108864 = arith.constant 67108864 : index
%c1_i32 = arith.constant 1 : i32
%c128_i32 = arith.constant 128 : i32
%c512 = arith.constant 512 : index
%c256 = arith.constant 256 : index
%c1 = arith.constant 1 : index
%device = hal.ex.shared_device : !hal.device
%allocator = hal.device.allocator<%device : !hal.device> : !hal.allocator
%buffer = hal.allocator.allocate<%allocator : !hal.allocator> type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c67108864}
%cmd = hal.command_buffer.create device(%device : !hal.device) mode("OneShot|AllowInlineExecution") categories(Transfer) : !hal.command_buffer
hal.command_buffer.fill_buffer<%cmd : !hal.command_buffer> target(%buffer : !hal.buffer)[%c0, %c67108864] pattern(%c1_i32 : i32)
hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None")
hal.command_buffer.finalize<%cmd : !hal.command_buffer>
%0 = util.null : !hal.fence
%fence = hal.fence.create device(%device : !hal.device) flags("None") : !hal.fence
hal.device.queue.execute<%device : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) commands([%cmd])
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.status.check_ok %status, "failed to wait on timepoint"
%1 = util.do_not_optimize(%buffer) : !hal.buffer
%len = hal.buffer.length<%1 : !hal.buffer> : index
%buffer_0 = hal.allocator.allocate<%allocator : !hal.allocator> type("HostVisible|DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage|MappingScoped|MappingAccessRandom|Mapping") : !hal.buffer{%c1048576}
%cmd_1 = hal.command_buffer.create device(%device : !hal.device) mode("OneShot|AllowInlineExecution") categories("Transfer|Dispatch") : !hal.command_buffer
hal.device.switch<%device : !hal.device>
#hal.device.match.executable.format<"embedded-elf-x86_64"> {
%_pipeline_layout_0 = util.global.load @_pipeline_layout_0 : !hal.pipeline_layout
hal.command_buffer.push_descriptor_set<%cmd_1 : !hal.command_buffer> layout(%_pipeline_layout_0 : !hal.pipeline_layout)[%c0] bindings([
%c0 = (%1 : !hal.buffer)[%c0, %len],
%c1 = (%buffer_0 : !hal.buffer)[%c0, %c1048576]
])
%2 = hal.command_buffer.device<%cmd_1 : !hal.command_buffer> : !hal.device
%_executable__split_reduction_pass2_dispatch_0 = util.global.load @_executable__split_reduction_pass2_dispatch_0 : !hal.executable
hal.command_buffer.dispatch<%cmd_1 : !hal.command_buffer> target(%_executable__split_reduction_pass2_dispatch_0 : !hal.executable)[0] workgroups([%c8, %c16, %c1])
hal.return
}
hal.command_buffer.fill_buffer<%cmd_1 : !hal.command_buffer> target(%buffer_0 : !hal.buffer)[%c524288, %c524288] pattern(%c128_i32 : i32)
hal.command_buffer.execution_barrier<%cmd_1 : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None")
hal.command_buffer.finalize<%cmd_1 : !hal.command_buffer>
%fence_2 = hal.fence.create device(%device : !hal.device) flags("None") : !hal.fence
hal.device.queue.execute<%device : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence_2) commands([%cmd_1])
%status_3 = hal.fence.await until([%fence_2]) timeout_millis(%c-1_i32) : i32
util.status.check_ok %status_3, "failed to wait on timepoint"
%view = hal.buffer_view.create buffer(%buffer_0 : !hal.buffer)[%c0, %c524288] shape([%c512, %c256]) type(%c268435488_i32) encoding(%c1_i32) : !hal.buffer_view
%view_4 = hal.buffer_view.create buffer(%buffer_0 : !hal.buffer)[%c524288, %c524288] shape([%c512, %c256]) type(%c268435488_i32) encoding(%c1_i32) : !hal.buffer_view
check.expect_eq(%view, %view_4) : !hal.buffer_view
return
}
}
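// Note: the dump that follows applies the same switch inlining to the host function
// @_split_reduction_pass2. Assuming it mirrors the initializer lowering above (the
// block names here are illustrative only), the hal.device.switch guarding the
// push_descriptor_set/dispatch recording would become:
//   %ok, %value = hal.device.query<%device : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false
//   cf.cond_br %value, ^bb_record_dispatch, ^bb_skip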
// -----// IR Dump After mlir::iree_compiler::IREE::HAL::InlineDeviceSwitchesPass (iree-hal-inline-device-switches) ('func.func' operation: @_split_reduction_pass2) //----- //
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
util.global private @_descriptor_set_layout_0 : !hal.descriptor_set_layout
util.initializer {
%device = hal.ex.shared_device : !hal.device
%descriptor_set_layout = hal.descriptor_set_layout.create device(%device : !hal.device) flags("None") bindings([#hal.descriptor_set.binding<0, storage_buffer, ReadOnly>, #hal.descriptor_set.binding<1, storage_buffer>]) : !hal.descriptor_set_layout
util.global.store %descriptor_set_layout, @_descriptor_set_layout_0 : !hal.descriptor_set_layout
util.initializer.return
}
util.global private @_pipeline_layout_0 : !hal.pipeline_layout
util.initializer {
%_descriptor_set_layout_0 = util.global.load @_descriptor_set_layout_0 : !hal.descriptor_set_layout
%device = hal.ex.shared_device : !hal.device
%pipeline_layout = hal.pipeline_layout.create device(%device : !hal.device) push_constants(0) layouts([%_descriptor_set_layout_0]) : !hal.pipeline_layout
util.global.store %pipeline_layout, @_pipeline_layout_0 : !hal.pipeline_layout
util.initializer.return
}
util.global private @_executable__split_reduction_pass2_dispatch_0 : !hal.executable
util.initializer {
%device = hal.ex.shared_device : !hal.device
%ok, %value = hal.device.query<%device : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false
cf.cond_br %value, ^bb1, ^bb2
^bb1: // pred: ^bb0
%_pipeline_layout_0 = util.global.load @_pipeline_layout_0 : !hal.pipeline_layout
%exe = hal.executable.create device(%device : !hal.device) target(@_split_reduction_pass2_dispatch_0::@embedded_elf_x86_64) layouts([%_pipeline_layout_0]) : !hal.executable
cf.br ^bb5(%exe : !hal.executable)
^bb2: // pred: ^bb0
%true = arith.constant true
cf.cond_br %true, ^bb3, ^bb4
^bb3: // pred: ^bb2
%0 = util.null : !hal.executable
cf.br ^bb5(%0 : !hal.executable)
^bb4: // pred: ^bb2
util.unreachable "device not supported in the compiled configuration"
^bb5(%1: !hal.executable): // 2 preds: ^bb1, ^bb3
util.global.store %1, @_executable__split_reduction_pass2_dispatch_0 : !hal.executable
util.initializer.return
}
func.func @split_reduction_pass2() attributes {iree.abi.stub} {
call @_split_reduction_pass2() : () -> ()
return
}
hal.executable private @_split_reduction_pass2_dispatch_0 {
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
hal.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 ordinal(0) layout(#pipeline_layout) attributes {translation_info = #translation} {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index):
%c1 = arith.constant 1 : index
%c16 = arith.constant 16 : index
%c8 = arith.constant 8 : index
hal.return %c8, %c16, %c1 : index, index, index
}
builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} {
llvm.func internal @_split_reduction_pass2_dispatch_0_generic_512x256x128(%arg0: !llvm.ptr<struct<"iree_hal_executable_environment_v0_t", (ptr<i32>, ptr<func<i32 (ptr<func<i32 (ptr<i8>)>>, ptr<i8>)>>, ptr<ptr<func<i32 (ptr<i8>)>>>, struct<"iree_hal_processor_v0_t", (array<8 x i64>)>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>> {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {sym_visibility = "private"} {
%0 = llvm.mlir.constant(0 : i32) : i32
%1 = llvm.mlir.constant(32768 : i64) : i64
%2 = llvm.mlir.constant(128 : i64) : i64
%3 = llvm.mlir.constant(256 : i64) : i64
%4 = llvm.mlir.constant(1 : i64) : i64
%5 = llvm.mlir.constant(63 : index) : i64
%6 = llvm.mlir.constant(256 : index) : i64
%7 = llvm.mlir.constant(32768 : index) : i64
%8 = llvm.mlir.constant(512 : index) : i64
%9 = llvm.mlir.constant(0 : i64) : i64
%10 = llvm.mlir.constant(dense<0> : vector<4xi32>) : vector<4xi32>
%11 = llvm.mlir.constant(0 : index) : i64
%12 = llvm.mlir.constant(dense<0> : vector<1xi32>) : vector<1xi32>
%13 = llvm.mlir.constant(128 : index) : i64
%14 = llvm.mlir.constant(4 : index) : i64
%15 = llvm.mlir.constant(1 : index) : i64
%16 = llvm.mlir.constant(32 : index) : i64
%17 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>>
%18 = llvm.extractvalue %17[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>
%19 = llvm.load %18 : !llvm.ptr<ptr<i8>>
%20 = llvm.bitcast %19 : !llvm.ptr<i8> to !llvm.ptr<i32>
%21 = llvm.ptrtoint %20 : !llvm.ptr<i32> to i64
%22 = llvm.and %21, %5 : i64
%23 = llvm.icmp "eq" %22, %11 : i64
"llvm.intr.assume"(%23) : (i1) -> ()
%24 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>>
%25 = llvm.extractvalue %24[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>
%26 = llvm.getelementptr %25[1] : (!llvm.ptr<ptr<i8>>) -> !llvm.ptr<ptr<i8>>
%27 = llvm.load %26 : !llvm.ptr<ptr<i8>>
%28 = llvm.bitcast %27 : !llvm.ptr<i8> to !llvm.ptr<i32>
%29 = llvm.ptrtoint %28 : !llvm.ptr<i32> to i64
%30 = llvm.and %29, %5 : i64
%31 = llvm.icmp "eq" %30, %11 : i64
"llvm.intr.assume"(%31) : (i1) -> ()
%32 = llvm.load %arg2 : !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>>
%33 = llvm.extractvalue %32[0] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>
%34 = llvm.zext %33 : i32 to i64
%35 = llvm.extractvalue %32[1] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>
%36 = llvm.zext %35 : i32 to i64
%37 = llvm.mul %36, %16 : i64
%38 = llvm.mul %34, %16 : i64
%39 = llvm.mlir.null : !llvm.ptr<i32>
%40 = llvm.getelementptr %39[4] : (!llvm.ptr<i32>) -> !llvm.ptr<i32>
%41 = llvm.ptrtoint %40 : !llvm.ptr<i32> to i64
%42 = llvm.alloca %41 x i32 {alignment = 128 : i64} : (i64) -> !llvm.ptr<i32>
%43 = llvm.mul %11, %14 : i64
%44 = llvm.add %43, %43 : i64
%45 = llvm.add %44, %11 : i64
%46 = llvm.getelementptr %42[%45] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32>
%47 = llvm.bitcast %46 : !llvm.ptr<i32> to !llvm.ptr<vector<4xi32>>
llvm.store %10, %47 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>>
%48 = llvm.bitcast %42 : !llvm.ptr<i32> to !llvm.ptr<vector<4xi32>>
%49 = llvm.load %48 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>>
%50 = llvm.mul %37, %6 : i64
%51 = llvm.add %50, %11 : i64
%52 = llvm.mul %38, %15 : i64
%53 = llvm.add %51, %52 : i64
%54 = llvm.mul %37, %7 : i64
%55 = llvm.add %54, %11 : i64
%56 = llvm.mul %38, %13 : i64
%57 = llvm.add %55, %56 : i64
%58 = llvm.mul %9, %15 : i64
%59 = llvm.add %57, %58 : i64
llvm.br ^bb1(%11 : i64)
^bb1(%60: i64): // 2 preds: ^bb0, ^bb10
%61 = llvm.icmp "slt" %60, %16 : i64
llvm.cond_br %61, ^bb2(%11 : i64), ^bb11
^bb2(%62: i64): // 2 preds: ^bb1, ^bb9
%63 = llvm.icmp "slt" %62, %16 : i64
llvm.cond_br %63, ^bb3, ^bb10
^bb3: // pred: ^bb2
%64 = llvm.mul %60, %1 : i64
%65 = llvm.add %59, %64 : i64
%66 = llvm.mul %62, %2 : i64
%67 = llvm.add %65, %66 : i64
%68 = llvm.mul %9, %4 : i64
%69 = llvm.add %67, %68 : i64
%70 = llvm.mul %60, %3 : i64
%71 = llvm.add %53, %70 : i64
%72 = llvm.mul %62, %4 : i64
%73 = llvm.add %71, %72 : i64
llvm.br ^bb4(%11 : i64)
^bb4(%74: i64): // 2 preds: ^bb3, ^bb8
%75 = llvm.icmp "slt" %74, %13 : i64
llvm.cond_br %75, ^bb5, ^bb9
^bb5: // pred: ^bb4
%76 = llvm.mul %9, %8 : i64
%77 = llvm.add %69, %76 : i64
%78 = llvm.mul %9, %13 : i64
%79 = llvm.add %77, %78 : i64
%80 = llvm.mul %74, %15 : i64
%81 = llvm.add %79, %80 : i64
llvm.br ^bb6(%11 : i64)
^bb6(%82: i64): // 2 preds: ^bb5, ^bb7
%83 = llvm.icmp "slt" %82, %14 : i64
llvm.cond_br %83, ^bb7, ^bb8
^bb7: // pred: ^bb6
%84 = llvm.add %81, %76 : i64
%85 = llvm.mul %82, %13 : i64
%86 = llvm.add %84, %85 : i64
%87 = llvm.add %86, %58 : i64
%88 = llvm.mul %9, %14 : i64
%89 = llvm.add %73, %88 : i64
%90 = llvm.mul %82, %15 : i64
%91 = llvm.add %89, %90 : i64
%92 = llvm.add %87, %43 : i64
%93 = llvm.add %92, %43 : i64
%94 = llvm.add %93, %43 : i64
%95 = llvm.add %94, %11 : i64
%96 = llvm.getelementptr %20[%95] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32>
%97 = llvm.bitcast %96 : !llvm.ptr<i32> to !llvm.ptr<vector<4xi32>>
%98 = llvm.load %97 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>>
%99 = llvm.add %98, %49 : vector<4xi32>
llvm.store %99, %48 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>>
%100 = llvm.add %43, %11 : i64
%101 = llvm.getelementptr %42[%100] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32>
%102 = llvm.bitcast %101 : !llvm.ptr<i32> to !llvm.ptr<vector<4xi32>>
%103 = llvm.load %102 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>>
%104 = llvm.getelementptr %28[%91] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32>
%105 = llvm.load %104 : !llvm.ptr<i32>
%106 = llvm.mlir.undef : vector<1xi32>
%107 = llvm.insertelement %105, %106[%0 : i32] : vector<1xi32>
%108 = llvm.extractelement %107[%11 : i64] : vector<1xi32>
%109 = "llvm.intr.vector.reduce.add"(%103) : (vector<4xi32>) -> i32
%110 = llvm.add %108, %109 : i32
%111 = llvm.insertelement %110, %12[%11 : i64] : vector<1xi32>
%112 = llvm.extractelement %111[%9 : i64] : vector<1xi32>
%113 = llvm.insertelement %112, %106[%0 : i32] : vector<1xi32>
%114 = llvm.extractelement %113[%11 : i64] : vector<1xi32>
llvm.store %114, %104 : !llvm.ptr<i32>
%115 = llvm.add %82, %15 : i64
llvm.br ^bb6(%115 : i64)
^bb8: // pred: ^bb6
%116 = llvm.add %74, %14 : i64
llvm.br ^bb4(%116 : i64)
^bb9: // pred: ^bb4
%117 = llvm.add %62, %14 : i64
llvm.br ^bb2(%117 : i64)
^bb10: // pred: ^bb2
%118 = llvm.add %60, %15 : i64
llvm.br ^bb1(%118 : i64)
^bb11: // pred: ^bb1
llvm.return %0 : i32
}
}
}
}
func.func private @_split_reduction_pass2() {
%c268435488_i32 = arith.constant 268435488 : i32
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c-1_i32 = arith.constant -1 : i32
%c-1_i64 = arith.constant -1 : i64
%c1048576 = arith.constant 1048576 : index
%c0 = arith.constant 0 : index
%c524288 = arith.constant 524288 : index
%c67108864 = arith.constant 67108864 : index
%c1_i32 = arith.constant 1 : i32
%c128_i32 = arith.constant 128 : i32
%c512 = arith.constant 512 : index
%c256 = arith.constant 256 : index
%c1 = arith.constant 1 : index
%device = hal.ex.shared_device : !hal.device
%allocator = hal.device.allocator<%device : !hal.device> : !hal.allocator
%buffer = hal.allocator.allocate<%allocator : !hal.allocator> type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c67108864}
%cmd = hal.command_buffer.create device(%device : !hal.device) mode("OneShot|AllowInlineExecution") categories(Transfer) : !hal.command_buffer
hal.command_buffer.fill_buffer<%cmd : !hal.command_buffer> target(%buffer : !hal.buffer)[%c0, %c67108864] pattern(%c1_i32 : i32)
hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None")
hal.command_buffer.finalize<%cmd : !hal.command_buffer>
%0 = util.null : !hal.fence
%fence = hal.fence.create device(%device : !hal.device) flags("None") : !hal.fence
hal.device.queue.execute<%device : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) commands([%cmd])
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.status.check_ok %status, "failed to wait on timepoint"
%1 = util.do_not_optimize(%buffer) : !hal.buffer
%len = hal.buffer.length<%1 : !hal.buffer> : index
%buffer_0 = hal.allocator.allocate<%allocator : !hal.allocator> type("HostVisible|DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage|MappingScoped|MappingAccessRandom|Mapping") : !hal.buffer{%c1048576}
%cmd_1 = hal.command_buffer.create device(%device : !hal.device) mode("OneShot|AllowInlineExecution") categories("Transfer|Dispatch") : !hal.command_buffer
%ok, %value = hal.device.query<%device : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false
cf.cond_br %value, ^bb1, ^bb2
^bb1: // pred: ^bb0
%_pipeline_layout_0 = util.global.load @_pipeline_layout_0 : !hal.pipeline_layout
hal.command_buffer.push_descriptor_set<%cmd_1 : !hal.command_buffer> layout(%_pipeline_layout_0 : !hal.pipeline_layout)[%c0] bindings([
%c0 = (%1 : !hal.buffer)[%c0, %len],
%c1 = (%buffer_0 : !hal.buffer)[%c0, %c1048576]
])
%2 = hal.command_buffer.device<%cmd_1 : !hal.command_buffer> : !hal.device
%_executable__split_reduction_pass2_dispatch_0 = util.global.load @_executable__split_reduction_pass2_dispatch_0 : !hal.executable
hal.command_buffer.dispatch<%cmd_1 : !hal.command_buffer> target(%_executable__split_reduction_pass2_dispatch_0 : !hal.executable)[0] workgroups([%c8, %c16, %c1])
cf.br ^bb3
^bb2: // pred: ^bb0
util.unreachable "device not supported in the compiled configuration"
^bb3: // pred: ^bb1
hal.command_buffer.fill_buffer<%cmd_1 : !hal.command_buffer> target(%buffer_0 : !hal.buffer)[%c524288, %c524288] pattern(%c128_i32 : i32)
hal.command_buffer.execution_barrier<%cmd_1 : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None")
hal.command_buffer.finalize<%cmd_1 : !hal.command_buffer>
%fence_2 = hal.fence.create device(%device : !hal.device) flags("None") : !hal.fence
hal.device.queue.execute<%device : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence_2) commands([%cmd_1])
%status_3 = hal.fence.await until([%fence_2]) timeout_millis(%c-1_i32) : i32
util.status.check_ok %status_3, "failed to wait on timepoint"
%view = hal.buffer_view.create buffer(%buffer_0 : !hal.buffer)[%c0, %c524288] shape([%c512, %c256]) type(%c268435488_i32) encoding(%c1_i32) : !hal.buffer_view
%view_4 = hal.buffer_view.create buffer(%buffer_0 : !hal.buffer)[%c524288, %c524288] shape([%c512, %c256]) type(%c268435488_i32) encoding(%c1_i32) : !hal.buffer_view
check.expect_eq(%view, %view_4) : !hal.buffer_view
return
}
}
// -----// IR Dump After mlir::iree_compiler::IREE::HAL::MemoizeDeviceQueriesPass (iree-hal-memoize-device-queries) ('builtin.module' operation) //----- //
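// Note: this pass memoizes the repeated hal.device.query of
// ("hal.executable.format" :: "embedded-elf-x86_64") into the module-level
// globals @_device_query_0 / @_device_query_0_ok below, computed once in a
// util.initializer; the executable initializer and @_split_reduction_pass2
// now load those globals instead of re-querying the device.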
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
util.global private @_device_query_0 : i1
util.global private @_device_query_0_ok : i1
util.initializer {
%device = hal.ex.shared_device : !hal.device
%ok, %value = hal.device.query<%device : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false
util.global.store %ok, @_device_query_0_ok : i1
util.global.store %value, @_device_query_0 : i1
util.initializer.return
}
util.global private @_descriptor_set_layout_0 : !hal.descriptor_set_layout
util.initializer {
%device = hal.ex.shared_device : !hal.device
%descriptor_set_layout = hal.descriptor_set_layout.create device(%device : !hal.device) flags("None") bindings([#hal.descriptor_set.binding<0, storage_buffer, ReadOnly>, #hal.descriptor_set.binding<1, storage_buffer>]) : !hal.descriptor_set_layout
util.global.store %descriptor_set_layout, @_descriptor_set_layout_0 : !hal.descriptor_set_layout
util.initializer.return
}
util.global private @_pipeline_layout_0 : !hal.pipeline_layout
util.initializer {
%_descriptor_set_layout_0 = util.global.load @_descriptor_set_layout_0 : !hal.descriptor_set_layout
%device = hal.ex.shared_device : !hal.device
%pipeline_layout = hal.pipeline_layout.create device(%device : !hal.device) push_constants(0) layouts([%_descriptor_set_layout_0]) : !hal.pipeline_layout
util.global.store %pipeline_layout, @_pipeline_layout_0 : !hal.pipeline_layout
util.initializer.return
}
util.global private @_executable__split_reduction_pass2_dispatch_0 : !hal.executable
util.initializer {
%device = hal.ex.shared_device : !hal.device
%_device_query_0_ok = util.global.load @_device_query_0_ok : i1
%_device_query_0 = util.global.load @_device_query_0 : i1
cf.cond_br %_device_query_0, ^bb1, ^bb2
^bb1: // pred: ^bb0
%_pipeline_layout_0 = util.global.load @_pipeline_layout_0 : !hal.pipeline_layout
%exe = hal.executable.create device(%device : !hal.device) target(@_split_reduction_pass2_dispatch_0::@embedded_elf_x86_64) layouts([%_pipeline_layout_0]) : !hal.executable
cf.br ^bb5(%exe : !hal.executable)
^bb2: // pred: ^bb0
%true = arith.constant true
cf.cond_br %true, ^bb3, ^bb4
^bb3: // pred: ^bb2
%0 = util.null : !hal.executable
cf.br ^bb5(%0 : !hal.executable)
^bb4: // pred: ^bb2
util.unreachable "device not supported in the compiled configuration"
^bb5(%1: !hal.executable): // 2 preds: ^bb1, ^bb3
util.global.store %1, @_executable__split_reduction_pass2_dispatch_0 : !hal.executable
util.initializer.return
}
func.func @split_reduction_pass2() attributes {iree.abi.stub} {
call @_split_reduction_pass2() : () -> ()
return
}
hal.executable private @_split_reduction_pass2_dispatch_0 {
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
hal.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 ordinal(0) layout(#pipeline_layout) attributes {translation_info = #translation} {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index):
%c1 = arith.constant 1 : index
%c16 = arith.constant 16 : index
%c8 = arith.constant 8 : index
hal.return %c8, %c16, %c1 : index, index, index
}
builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} {
llvm.func internal @_split_reduction_pass2_dispatch_0_generic_512x256x128(%arg0: !llvm.ptr<struct<"iree_hal_executable_environment_v0_t", (ptr<i32>, ptr<func<i32 (ptr<func<i32 (ptr<i8>)>>, ptr<i8>)>>, ptr<ptr<func<i32 (ptr<i8>)>>>, struct<"iree_hal_processor_v0_t", (array<8 x i64>)>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>> {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {sym_visibility = "private"} {
%0 = llvm.mlir.constant(0 : i32) : i32
%1 = llvm.mlir.constant(32768 : i64) : i64
%2 = llvm.mlir.constant(128 : i64) : i64
%3 = llvm.mlir.constant(256 : i64) : i64
%4 = llvm.mlir.constant(1 : i64) : i64
%5 = llvm.mlir.constant(63 : index) : i64
%6 = llvm.mlir.constant(256 : index) : i64
%7 = llvm.mlir.constant(32768 : index) : i64
%8 = llvm.mlir.constant(512 : index) : i64
%9 = llvm.mlir.constant(0 : i64) : i64
%10 = llvm.mlir.constant(dense<0> : vector<4xi32>) : vector<4xi32>
%11 = llvm.mlir.constant(0 : index) : i64
%12 = llvm.mlir.constant(dense<0> : vector<1xi32>) : vector<1xi32>
%13 = llvm.mlir.constant(128 : index) : i64
%14 = llvm.mlir.constant(4 : index) : i64
%15 = llvm.mlir.constant(1 : index) : i64
%16 = llvm.mlir.constant(32 : index) : i64
%17 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>>
%18 = llvm.extractvalue %17[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>
%19 = llvm.load %18 : !llvm.ptr<ptr<i8>>
%20 = llvm.bitcast %19 : !llvm.ptr<i8> to !llvm.ptr<i32>
%21 = llvm.ptrtoint %20 : !llvm.ptr<i32> to i64
%22 = llvm.and %21, %5 : i64
%23 = llvm.icmp "eq" %22, %11 : i64
"llvm.intr.assume"(%23) : (i1) -> ()
%24 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>>
%25 = llvm.extractvalue %24[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>
%26 = llvm.getelementptr %25[1] : (!llvm.ptr<ptr<i8>>) -> !llvm.ptr<ptr<i8>>
%27 = llvm.load %26 : !llvm.ptr<ptr<i8>>
%28 = llvm.bitcast %27 : !llvm.ptr<i8> to !llvm.ptr<i32>
%29 = llvm.ptrtoint %28 : !llvm.ptr<i32> to i64
%30 = llvm.and %29, %5 : i64
%31 = llvm.icmp "eq" %30, %11 : i64
"llvm.intr.assume"(%31) : (i1) -> ()
%32 = llvm.load %arg2 : !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>>
%33 = llvm.extractvalue %32[0] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>
%34 = llvm.zext %33 : i32 to i64
%35 = llvm.extractvalue %32[1] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>
%36 = llvm.zext %35 : i32 to i64
%37 = llvm.mul %36, %16 : i64
%38 = llvm.mul %34, %16 : i64
%39 = llvm.mlir.null : !llvm.ptr<i32>
%40 = llvm.getelementptr %39[4] : (!llvm.ptr<i32>) -> !llvm.ptr<i32>
%41 = llvm.ptrtoint %40 : !llvm.ptr<i32> to i64
%42 = llvm.alloca %41 x i32 {alignment = 128 : i64} : (i64) -> !llvm.ptr<i32>
%43 = llvm.mul %11, %14 : i64
%44 = llvm.add %43, %43 : i64
%45 = llvm.add %44, %11 : i64
%46 = llvm.getelementptr %42[%45] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32>
%47 = llvm.bitcast %46 : !llvm.ptr<i32> to !llvm.ptr<vector<4xi32>>
llvm.store %10, %47 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>>
%48 = llvm.bitcast %42 : !llvm.ptr<i32> to !llvm.ptr<vector<4xi32>>
%49 = llvm.load %48 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>>
%50 = llvm.mul %37, %6 : i64
%51 = llvm.add %50, %11 : i64
%52 = llvm.mul %38, %15 : i64
%53 = llvm.add %51, %52 : i64
%54 = llvm.mul %37, %7 : i64
%55 = llvm.add %54, %11 : i64
%56 = llvm.mul %38, %13 : i64
%57 = llvm.add %55, %56 : i64
%58 = llvm.mul %9, %15 : i64
%59 = llvm.add %57, %58 : i64
llvm.br ^bb1(%11 : i64)
^bb1(%60: i64): // 2 preds: ^bb0, ^bb10
%61 = llvm.icmp "slt" %60, %16 : i64
llvm.cond_br %61, ^bb2(%11 : i64), ^bb11
^bb2(%62: i64): // 2 preds: ^bb1, ^bb9
%63 = llvm.icmp "slt" %62, %16 : i64
llvm.cond_br %63, ^bb3, ^bb10
^bb3: // pred: ^bb2
%64 = llvm.mul %60, %1 : i64
%65 = llvm.add %59, %64 : i64
%66 = llvm.mul %62, %2 : i64
%67 = llvm.add %65, %66 : i64
%68 = llvm.mul %9, %4 : i64
%69 = llvm.add %67, %68 : i64
%70 = llvm.mul %60, %3 : i64
%71 = llvm.add %53, %70 : i64
%72 = llvm.mul %62, %4 : i64
%73 = llvm.add %71, %72 : i64
llvm.br ^bb4(%11 : i64)
^bb4(%74: i64): // 2 preds: ^bb3, ^bb8
%75 = llvm.icmp "slt" %74, %13 : i64
llvm.cond_br %75, ^bb5, ^bb9
^bb5: // pred: ^bb4
%76 = llvm.mul %9, %8 : i64
%77 = llvm.add %69, %76 : i64
%78 = llvm.mul %9, %13 : i64
%79 = llvm.add %77, %78 : i64
%80 = llvm.mul %74, %15 : i64
%81 = llvm.add %79, %80 : i64
llvm.br ^bb6(%11 : i64)
^bb6(%82: i64): // 2 preds: ^bb5, ^bb7
%83 = llvm.icmp "slt" %82, %14 : i64
llvm.cond_br %83, ^bb7, ^bb8
^bb7: // pred: ^bb6
%84 = llvm.add %81, %76 : i64
%85 = llvm.mul %82, %13 : i64
%86 = llvm.add %84, %85 : i64
%87 = llvm.add %86, %58 : i64
%88 = llvm.mul %9, %14 : i64
%89 = llvm.add %73, %88 : i64
%90 = llvm.mul %82, %15 : i64
%91 = llvm.add %89, %90 : i64
%92 = llvm.add %87, %43 : i64
%93 = llvm.add %92, %43 : i64
%94 = llvm.add %93, %43 : i64
%95 = llvm.add %94, %11 : i64
%96 = llvm.getelementptr %20[%95] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32>
%97 = llvm.bitcast %96 : !llvm.ptr<i32> to !llvm.ptr<vector<4xi32>>
%98 = llvm.load %97 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>>
%99 = llvm.add %98, %49 : vector<4xi32>
llvm.store %99, %48 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>>
%100 = llvm.add %43, %11 : i64
%101 = llvm.getelementptr %42[%100] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32>
%102 = llvm.bitcast %101 : !llvm.ptr<i32> to !llvm.ptr<vector<4xi32>>
%103 = llvm.load %102 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>>
%104 = llvm.getelementptr %28[%91] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32>
%105 = llvm.load %104 : !llvm.ptr<i32>
%106 = llvm.mlir.undef : vector<1xi32>
%107 = llvm.insertelement %105, %106[%0 : i32] : vector<1xi32>
%108 = llvm.extractelement %107[%11 : i64] : vector<1xi32>
%109 = "llvm.intr.vector.reduce.add"(%103) : (vector<4xi32>) -> i32
%110 = llvm.add %108, %109 : i32
%111 = llvm.insertelement %110, %12[%11 : i64] : vector<1xi32>
%112 = llvm.extractelement %111[%9 : i64] : vector<1xi32>
%113 = llvm.insertelement %112, %106[%0 : i32] : vector<1xi32>
%114 = llvm.extractelement %113[%11 : i64] : vector<1xi32>
llvm.store %114, %104 : !llvm.ptr<i32>
%115 = llvm.add %82, %15 : i64
llvm.br ^bb6(%115 : i64)
^bb8: // pred: ^bb6
%116 = llvm.add %74, %14 : i64
llvm.br ^bb4(%116 : i64)
^bb9: // pred: ^bb4
%117 = llvm.add %62, %14 : i64
llvm.br ^bb2(%117 : i64)
^bb10: // pred: ^bb2
%118 = llvm.add %60, %15 : i64
llvm.br ^bb1(%118 : i64)
^bb11: // pred: ^bb1
llvm.return %0 : i32
}
}
}
}
func.func private @_split_reduction_pass2() {
%c268435488_i32 = arith.constant 268435488 : i32
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c-1_i32 = arith.constant -1 : i32
%c-1_i64 = arith.constant -1 : i64
%c1048576 = arith.constant 1048576 : index
%c0 = arith.constant 0 : index
%c524288 = arith.constant 524288 : index
%c67108864 = arith.constant 67108864 : index
%c1_i32 = arith.constant 1 : i32
%c128_i32 = arith.constant 128 : i32
%c512 = arith.constant 512 : index
%c256 = arith.constant 256 : index
%c1 = arith.constant 1 : index
%device = hal.ex.shared_device : !hal.device
%allocator = hal.device.allocator<%device : !hal.device> : !hal.allocator
%buffer = hal.allocator.allocate<%allocator : !hal.allocator> type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c67108864}
%cmd = hal.command_buffer.create device(%device : !hal.device) mode("OneShot|AllowInlineExecution") categories(Transfer) : !hal.command_buffer
hal.command_buffer.fill_buffer<%cmd : !hal.command_buffer> target(%buffer : !hal.buffer)[%c0, %c67108864] pattern(%c1_i32 : i32)
hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None")
hal.command_buffer.finalize<%cmd : !hal.command_buffer>
%0 = util.null : !hal.fence
%fence = hal.fence.create device(%device : !hal.device) flags("None") : !hal.fence
hal.device.queue.execute<%device : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) commands([%cmd])
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.status.check_ok %status, "failed to wait on timepoint"
%1 = util.do_not_optimize(%buffer) : !hal.buffer
%len = hal.buffer.length<%1 : !hal.buffer> : index
%buffer_0 = hal.allocator.allocate<%allocator : !hal.allocator> type("HostVisible|DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage|MappingScoped|MappingAccessRandom|Mapping") : !hal.buffer{%c1048576}
%cmd_1 = hal.command_buffer.create device(%device : !hal.device) mode("OneShot|AllowInlineExecution") categories("Transfer|Dispatch") : !hal.command_buffer
%_device_query_0_ok = util.global.load @_device_query_0_ok : i1
%_device_query_0 = util.global.load @_device_query_0 : i1
cf.cond_br %_device_query_0, ^bb1, ^bb2
^bb1: // pred: ^bb0
%_pipeline_layout_0 = util.global.load @_pipeline_layout_0 : !hal.pipeline_layout
hal.command_buffer.push_descriptor_set<%cmd_1 : !hal.command_buffer> layout(%_pipeline_layout_0 : !hal.pipeline_layout)[%c0] bindings([
%c0 = (%1 : !hal.buffer)[%c0, %len],
%c1 = (%buffer_0 : !hal.buffer)[%c0, %c1048576]
])
%2 = hal.command_buffer.device<%cmd_1 : !hal.command_buffer> : !hal.device
%_executable__split_reduction_pass2_dispatch_0 = util.global.load @_executable__split_reduction_pass2_dispatch_0 : !hal.executable
hal.command_buffer.dispatch<%cmd_1 : !hal.command_buffer> target(%_executable__split_reduction_pass2_dispatch_0 : !hal.executable)[0] workgroups([%c8, %c16, %c1])
cf.br ^bb3
^bb2: // pred: ^bb0
util.unreachable "device not supported in the compiled configuration"
^bb3: // pred: ^bb1
hal.command_buffer.fill_buffer<%cmd_1 : !hal.command_buffer> target(%buffer_0 : !hal.buffer)[%c524288, %c524288] pattern(%c128_i32 : i32)
hal.command_buffer.execution_barrier<%cmd_1 : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None")
hal.command_buffer.finalize<%cmd_1 : !hal.command_buffer>
%fence_2 = hal.fence.create device(%device : !hal.device) flags("None") : !hal.fence
hal.device.queue.execute<%device : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence_2) commands([%cmd_1])
%status_3 = hal.fence.await until([%fence_2]) timeout_millis(%c-1_i32) : i32
util.status.check_ok %status_3, "failed to wait on timepoint"
%view = hal.buffer_view.create buffer(%buffer_0 : !hal.buffer)[%c0, %c524288] shape([%c512, %c256]) type(%c268435488_i32) encoding(%c1_i32) : !hal.buffer_view
%view_4 = hal.buffer_view.create buffer(%buffer_0 : !hal.buffer)[%c524288, %c524288] shape([%c512, %c256]) type(%c268435488_i32) encoding(%c1_i32) : !hal.buffer_view
check.expect_eq(%view, %view_4) : !hal.buffer_view
return
}
}
// -----// IR Dump After Canonicalizer (canonicalize) ('builtin.module' operation) //----- //
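// Note: relative to the previous dump, canonicalization folds the
// constant-true cf.cond_br in the @_executable__split_reduction_pass2_dispatch_0
// initializer (so the unsupported-device path collapses to a util.null
// executable and the util.unreachable block disappears), drops the unused
// @_device_query_0_ok load, and in @_split_reduction_pass2 merges the
// dispatch-recording block with its fall-through block, leaving the
// util.unreachable case as the only other successor.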
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
util.global private @_device_query_0 : i1
util.global private @_device_query_0_ok : i1
util.initializer {
%device = hal.ex.shared_device : !hal.device
%ok, %value = hal.device.query<%device : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false
util.global.store %ok, @_device_query_0_ok : i1
util.global.store %value, @_device_query_0 : i1
util.initializer.return
}
util.global private @_descriptor_set_layout_0 : !hal.descriptor_set_layout
util.initializer {
%device = hal.ex.shared_device : !hal.device
%descriptor_set_layout = hal.descriptor_set_layout.create device(%device : !hal.device) flags("None") bindings([#hal.descriptor_set.binding<0, storage_buffer, ReadOnly>, #hal.descriptor_set.binding<1, storage_buffer>]) : !hal.descriptor_set_layout
util.global.store %descriptor_set_layout, @_descriptor_set_layout_0 : !hal.descriptor_set_layout
util.initializer.return
}
util.global private @_pipeline_layout_0 : !hal.pipeline_layout
util.initializer {
%_descriptor_set_layout_0 = util.global.load @_descriptor_set_layout_0 : !hal.descriptor_set_layout
%device = hal.ex.shared_device : !hal.device
%pipeline_layout = hal.pipeline_layout.create device(%device : !hal.device) push_constants(0) layouts([%_descriptor_set_layout_0]) : !hal.pipeline_layout
util.global.store %pipeline_layout, @_pipeline_layout_0 : !hal.pipeline_layout
util.initializer.return
}
util.global private @_executable__split_reduction_pass2_dispatch_0 : !hal.executable
util.initializer {
%device = hal.ex.shared_device : !hal.device
%_device_query_0 = util.global.load @_device_query_0 : i1
cf.cond_br %_device_query_0, ^bb1, ^bb2
^bb1: // pred: ^bb0
%_pipeline_layout_0 = util.global.load @_pipeline_layout_0 : !hal.pipeline_layout
%exe = hal.executable.create device(%device : !hal.device) target(@_split_reduction_pass2_dispatch_0::@embedded_elf_x86_64) layouts([%_pipeline_layout_0]) : !hal.executable
cf.br ^bb3(%exe : !hal.executable)
^bb2: // pred: ^bb0
%0 = util.null : !hal.executable
cf.br ^bb3(%0 : !hal.executable)
^bb3(%1: !hal.executable): // 2 preds: ^bb1, ^bb2
util.global.store %1, @_executable__split_reduction_pass2_dispatch_0 : !hal.executable
util.initializer.return
}
func.func @split_reduction_pass2() attributes {iree.abi.stub} {
call @_split_reduction_pass2() : () -> ()
return
}
hal.executable private @_split_reduction_pass2_dispatch_0 {
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
hal.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 ordinal(0) layout(#pipeline_layout) attributes {translation_info = #translation} {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index):
%c1 = arith.constant 1 : index
%c16 = arith.constant 16 : index
%c8 = arith.constant 8 : index
hal.return %c8, %c16, %c1 : index, index, index
}
builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} {
llvm.func internal @_split_reduction_pass2_dispatch_0_generic_512x256x128(%arg0: !llvm.ptr<struct<"iree_hal_executable_environment_v0_t", (ptr<i32>, ptr<func<i32 (ptr<func<i32 (ptr<i8>)>>, ptr<i8>)>>, ptr<ptr<func<i32 (ptr<i8>)>>>, struct<"iree_hal_processor_v0_t", (array<8 x i64>)>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>> {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {sym_visibility = "private"} {
%0 = llvm.mlir.constant(0 : i32) : i32
%1 = llvm.mlir.constant(32768 : i64) : i64
%2 = llvm.mlir.constant(128 : i64) : i64
%3 = llvm.mlir.constant(256 : i64) : i64
%4 = llvm.mlir.constant(1 : i64) : i64
%5 = llvm.mlir.constant(63 : index) : i64
%6 = llvm.mlir.constant(256 : index) : i64
%7 = llvm.mlir.constant(32768 : index) : i64
%8 = llvm.mlir.constant(512 : index) : i64
%9 = llvm.mlir.constant(0 : i64) : i64
%10 = llvm.mlir.constant(dense<0> : vector<4xi32>) : vector<4xi32>
%11 = llvm.mlir.constant(0 : index) : i64
%12 = llvm.mlir.constant(dense<0> : vector<1xi32>) : vector<1xi32>
%13 = llvm.mlir.constant(128 : index) : i64
%14 = llvm.mlir.constant(4 : index) : i64
%15 = llvm.mlir.constant(1 : index) : i64
%16 = llvm.mlir.constant(32 : index) : i64
%17 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>>
%18 = llvm.extractvalue %17[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>
%19 = llvm.load %18 : !llvm.ptr<ptr<i8>>
%20 = llvm.bitcast %19 : !llvm.ptr<i8> to !llvm.ptr<i32>
%21 = llvm.ptrtoint %20 : !llvm.ptr<i32> to i64
%22 = llvm.and %21, %5 : i64
%23 = llvm.icmp "eq" %22, %11 : i64
"llvm.intr.assume"(%23) : (i1) -> ()
%24 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>>
%25 = llvm.extractvalue %24[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>
%26 = llvm.getelementptr %25[1] : (!llvm.ptr<ptr<i8>>) -> !llvm.ptr<ptr<i8>>
%27 = llvm.load %26 : !llvm.ptr<ptr<i8>>
%28 = llvm.bitcast %27 : !llvm.ptr<i8> to !llvm.ptr<i32>
%29 = llvm.ptrtoint %28 : !llvm.ptr<i32> to i64
%30 = llvm.and %29, %5 : i64
%31 = llvm.icmp "eq" %30, %11 : i64
"llvm.intr.assume"(%31) : (i1) -> ()
%32 = llvm.load %arg2 : !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>>
%33 = llvm.extractvalue %32[0] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>
%34 = llvm.zext %33 : i32 to i64
%35 = llvm.extractvalue %32[1] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>
%36 = llvm.zext %35 : i32 to i64
%37 = llvm.mul %36, %16 : i64
%38 = llvm.mul %34, %16 : i64
%39 = llvm.mlir.null : !llvm.ptr<i32>
%40 = llvm.getelementptr %39[4] : (!llvm.ptr<i32>) -> !llvm.ptr<i32>
%41 = llvm.ptrtoint %40 : !llvm.ptr<i32> to i64
%42 = llvm.alloca %41 x i32 {alignment = 128 : i64} : (i64) -> !llvm.ptr<i32>
%43 = llvm.mul %11, %14 : i64
%44 = llvm.add %43, %43 : i64
%45 = llvm.add %44, %11 : i64
%46 = llvm.getelementptr %42[%45] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32>
%47 = llvm.bitcast %46 : !llvm.ptr<i32> to !llvm.ptr<vector<4xi32>>
llvm.store %10, %47 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>>
%48 = llvm.bitcast %42 : !llvm.ptr<i32> to !llvm.ptr<vector<4xi32>>
%49 = llvm.load %48 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>>
%50 = llvm.mul %37, %6 : i64
%51 = llvm.add %50, %11 : i64
%52 = llvm.mul %38, %15 : i64
%53 = llvm.add %51, %52 : i64
%54 = llvm.mul %37, %7 : i64
%55 = llvm.add %54, %11 : i64
%56 = llvm.mul %38, %13 : i64
%57 = llvm.add %55, %56 : i64
%58 = llvm.mul %9, %15 : i64
%59 = llvm.add %57, %58 : i64
llvm.br ^bb1(%11 : i64)
^bb1(%60: i64): // 2 preds: ^bb0, ^bb10
%61 = llvm.icmp "slt" %60, %16 : i64
llvm.cond_br %61, ^bb2(%11 : i64), ^bb11
^bb2(%62: i64): // 2 preds: ^bb1, ^bb9
%63 = llvm.icmp "slt" %62, %16 : i64
llvm.cond_br %63, ^bb3, ^bb10
^bb3: // pred: ^bb2
%64 = llvm.mul %60, %1 : i64
%65 = llvm.add %59, %64 : i64
%66 = llvm.mul %62, %2 : i64
%67 = llvm.add %65, %66 : i64
%68 = llvm.mul %9, %4 : i64
%69 = llvm.add %67, %68 : i64
%70 = llvm.mul %60, %3 : i64
%71 = llvm.add %53, %70 : i64
%72 = llvm.mul %62, %4 : i64
%73 = llvm.add %71, %72 : i64
llvm.br ^bb4(%11 : i64)
^bb4(%74: i64): // 2 preds: ^bb3, ^bb8
%75 = llvm.icmp "slt" %74, %13 : i64
llvm.cond_br %75, ^bb5, ^bb9
^bb5: // pred: ^bb4
%76 = llvm.mul %9, %8 : i64
%77 = llvm.add %69, %76 : i64
%78 = llvm.mul %9, %13 : i64
%79 = llvm.add %77, %78 : i64
%80 = llvm.mul %74, %15 : i64
%81 = llvm.add %79, %80 : i64
llvm.br ^bb6(%11 : i64)
^bb6(%82: i64): // 2 preds: ^bb5, ^bb7
%83 = llvm.icmp "slt" %82, %14 : i64
llvm.cond_br %83, ^bb7, ^bb8
^bb7: // pred: ^bb6
%84 = llvm.add %81, %76 : i64
%85 = llvm.mul %82, %13 : i64
%86 = llvm.add %84, %85 : i64
%87 = llvm.add %86, %58 : i64
%88 = llvm.mul %9, %14 : i64
%89 = llvm.add %73, %88 : i64
%90 = llvm.mul %82, %15 : i64
%91 = llvm.add %89, %90 : i64
%92 = llvm.add %87, %43 : i64
%93 = llvm.add %92, %43 : i64
%94 = llvm.add %93, %43 : i64
%95 = llvm.add %94, %11 : i64
%96 = llvm.getelementptr %20[%95] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32>
%97 = llvm.bitcast %96 : !llvm.ptr<i32> to !llvm.ptr<vector<4xi32>>
%98 = llvm.load %97 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>>
%99 = llvm.add %98, %49 : vector<4xi32>
llvm.store %99, %48 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>>
%100 = llvm.add %43, %11 : i64
%101 = llvm.getelementptr %42[%100] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32>
%102 = llvm.bitcast %101 : !llvm.ptr<i32> to !llvm.ptr<vector<4xi32>>
%103 = llvm.load %102 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>>
%104 = llvm.getelementptr %28[%91] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32>
%105 = llvm.load %104 : !llvm.ptr<i32>
%106 = llvm.mlir.undef : vector<1xi32>
%107 = llvm.insertelement %105, %106[%0 : i32] : vector<1xi32>
%108 = llvm.extractelement %107[%11 : i64] : vector<1xi32>
%109 = "llvm.intr.vector.reduce.add"(%103) : (vector<4xi32>) -> i32
%110 = llvm.add %108, %109 : i32
%111 = llvm.insertelement %110, %12[%11 : i64] : vector<1xi32>
%112 = llvm.extractelement %111[%9 : i64] : vector<1xi32>
%113 = llvm.insertelement %112, %106[%0 : i32] : vector<1xi32>
%114 = llvm.extractelement %113[%11 : i64] : vector<1xi32>
llvm.store %114, %104 : !llvm.ptr<i32>
%115 = llvm.add %82, %15 : i64
llvm.br ^bb6(%115 : i64)
^bb8: // pred: ^bb6
%116 = llvm.add %74, %14 : i64
llvm.br ^bb4(%116 : i64)
^bb9: // pred: ^bb4
%117 = llvm.add %62, %14 : i64
llvm.br ^bb2(%117 : i64)
^bb10: // pred: ^bb2
%118 = llvm.add %60, %15 : i64
llvm.br ^bb1(%118 : i64)
^bb11: // pred: ^bb1
llvm.return %0 : i32
}
}
}
}
func.func private @_split_reduction_pass2() {
%c268435488_i32 = arith.constant 268435488 : i32
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c-1_i32 = arith.constant -1 : i32
%c-1_i64 = arith.constant -1 : i64
%c1048576 = arith.constant 1048576 : index
%c0 = arith.constant 0 : index
%c524288 = arith.constant 524288 : index
%c67108864 = arith.constant 67108864 : index
%c1_i32 = arith.constant 1 : i32
%c128_i32 = arith.constant 128 : i32
%c512 = arith.constant 512 : index
%c256 = arith.constant 256 : index
%c1 = arith.constant 1 : index
%device = hal.ex.shared_device : !hal.device
%allocator = hal.device.allocator<%device : !hal.device> : !hal.allocator
%buffer = hal.allocator.allocate<%allocator : !hal.allocator> type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c67108864}
%cmd = hal.command_buffer.create device(%device : !hal.device) mode("OneShot|AllowInlineExecution") categories(Transfer) : !hal.command_buffer
hal.command_buffer.fill_buffer<%cmd : !hal.command_buffer> target(%buffer : !hal.buffer)[%c0, %c67108864] pattern(%c1_i32 : i32)
hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None")
hal.command_buffer.finalize<%cmd : !hal.command_buffer>
%0 = util.null : !hal.fence
%fence = hal.fence.create device(%device : !hal.device) flags("None") : !hal.fence
hal.device.queue.execute<%device : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) commands([%cmd])
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.status.check_ok %status, "failed to wait on timepoint"
%1 = util.do_not_optimize(%buffer) : !hal.buffer
%len = hal.buffer.length<%1 : !hal.buffer> : index
%buffer_0 = hal.allocator.allocate<%allocator : !hal.allocator> type("HostVisible|DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage|MappingScoped|MappingAccessRandom|Mapping") : !hal.buffer{%c1048576}
%cmd_1 = hal.command_buffer.create device(%device : !hal.device) mode("OneShot|AllowInlineExecution") categories("Transfer|Dispatch") : !hal.command_buffer
%_device_query_0 = util.global.load @_device_query_0 : i1
cf.cond_br %_device_query_0, ^bb1, ^bb2
^bb1: // pred: ^bb0
%_pipeline_layout_0 = util.global.load @_pipeline_layout_0 : !hal.pipeline_layout
hal.command_buffer.push_descriptor_set<%cmd_1 : !hal.command_buffer> layout(%_pipeline_layout_0 : !hal.pipeline_layout)[%c0] bindings([
%c0 = (%1 : !hal.buffer)[%c0, %len],
%c1 = (%buffer_0 : !hal.buffer)[%c0, %c1048576]
])
%_executable__split_reduction_pass2_dispatch_0 = util.global.load @_executable__split_reduction_pass2_dispatch_0 : !hal.executable
hal.command_buffer.dispatch<%cmd_1 : !hal.command_buffer> target(%_executable__split_reduction_pass2_dispatch_0 : !hal.executable)[0] workgroups([%c8, %c16, %c1])
hal.command_buffer.fill_buffer<%cmd_1 : !hal.command_buffer> target(%buffer_0 : !hal.buffer)[%c524288, %c524288] pattern(%c128_i32 : i32)
hal.command_buffer.execution_barrier<%cmd_1 : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None")
hal.command_buffer.finalize<%cmd_1 : !hal.command_buffer>
%fence_2 = hal.fence.create device(%device : !hal.device) flags("None") : !hal.fence
hal.device.queue.execute<%device : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence_2) commands([%cmd_1])
%status_3 = hal.fence.await until([%fence_2]) timeout_millis(%c-1_i32) : i32
util.status.check_ok %status_3, "failed to wait on timepoint"
%view = hal.buffer_view.create buffer(%buffer_0 : !hal.buffer)[%c0, %c524288] shape([%c512, %c256]) type(%c268435488_i32) encoding(%c1_i32) : !hal.buffer_view
%view_4 = hal.buffer_view.create buffer(%buffer_0 : !hal.buffer)[%c524288, %c524288] shape([%c512, %c256]) type(%c268435488_i32) encoding(%c1_i32) : !hal.buffer_view
check.expect_eq(%view, %view_4) : !hal.buffer_view
return
^bb2: // pred: ^bb0
util.unreachable "device not supported in the compiled configuration"
}
}
// -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) ('util.initializer' operation) //----- //
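// Note: iree-util-simplify-global-accesses runs per util.initializer and only
// reorders util.global.load/store ops on non-conflicting globals (e.g. the
// @_device_query_0 load is hoisted ahead of hal.ex.shared_device in the
// executable initializer of the following dump); the rest of the IR is
// unchanged from the canonicalized form above.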
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
util.global private @_device_query_0 : i1
util.global private @_device_query_0_ok : i1
util.initializer {
%device = hal.ex.shared_device : !hal.device
%ok, %value = hal.device.query<%device : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false
util.global.store %value, @_device_query_0 : i1
util.global.store %ok, @_device_query_0_ok : i1
util.initializer.return
}
util.global private @_descriptor_set_layout_0 : !hal.descriptor_set_layout
util.initializer {
%device = hal.ex.shared_device : !hal.device
%descriptor_set_layout = hal.descriptor_set_layout.create device(%device : !hal.device) flags("None") bindings([#hal.descriptor_set.binding<0, storage_buffer, ReadOnly>, #hal.descriptor_set.binding<1, storage_buffer>]) : !hal.descriptor_set_layout
util.global.store %descriptor_set_layout, @_descriptor_set_layout_0 : !hal.descriptor_set_layout
util.initializer.return
}
util.global private @_pipeline_layout_0 : !hal.pipeline_layout
util.initializer {
%_descriptor_set_layout_0 = util.global.load @_descriptor_set_layout_0 : !hal.descriptor_set_layout
%device = hal.ex.shared_device : !hal.device
%pipeline_layout = hal.pipeline_layout.create device(%device : !hal.device) push_constants(0) layouts([%_descriptor_set_layout_0]) : !hal.pipeline_layout
util.global.store %pipeline_layout, @_pipeline_layout_0 : !hal.pipeline_layout
util.initializer.return
}
util.global private @_executable__split_reduction_pass2_dispatch_0 : !hal.executable
util.initializer {
%device = hal.ex.shared_device : !hal.device
%_device_query_0 = util.global.load @_device_query_0 : i1
cf.cond_br %_device_query_0, ^bb1, ^bb2
^bb1: // pred: ^bb0
%_pipeline_layout_0 = util.global.load @_pipeline_layout_0 : !hal.pipeline_layout
%exe = hal.executable.create device(%device : !hal.device) target(@_split_reduction_pass2_dispatch_0::@embedded_elf_x86_64) layouts([%_pipeline_layout_0]) : !hal.executable
cf.br ^bb3(%exe : !hal.executable)
^bb2: // pred: ^bb0
%0 = util.null : !hal.executable
cf.br ^bb3(%0 : !hal.executable)
^bb3(%1: !hal.executable): // 2 preds: ^bb1, ^bb2
util.global.store %1, @_executable__split_reduction_pass2_dispatch_0 : !hal.executable
util.initializer.return
}
func.func @split_reduction_pass2() attributes {iree.abi.stub} {
call @_split_reduction_pass2() : () -> ()
return
}
hal.executable private @_split_reduction_pass2_dispatch_0 {
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
hal.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 ordinal(0) layout(#pipeline_layout) attributes {translation_info = #translation} {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index):
%c1 = arith.constant 1 : index
%c16 = arith.constant 16 : index
%c8 = arith.constant 8 : index
hal.return %c8, %c16, %c1 : index, index, index
}
builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} {
llvm.func internal @_split_reduction_pass2_dispatch_0_generic_512x256x128(%arg0: !llvm.ptr<struct<"iree_hal_executable_environment_v0_t", (ptr<i32>, ptr<func<i32 (ptr<func<i32 (ptr<i8>)>>, ptr<i8>)>>, ptr<ptr<func<i32 (ptr<i8>)>>>, struct<"iree_hal_processor_v0_t", (array<8 x i64>)>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>> {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {sym_visibility = "private"} {
%0 = llvm.mlir.constant(0 : i32) : i32
%1 = llvm.mlir.constant(32768 : i64) : i64
%2 = llvm.mlir.constant(128 : i64) : i64
%3 = llvm.mlir.constant(256 : i64) : i64
%4 = llvm.mlir.constant(1 : i64) : i64
%5 = llvm.mlir.constant(63 : index) : i64
%6 = llvm.mlir.constant(256 : index) : i64
%7 = llvm.mlir.constant(32768 : index) : i64
%8 = llvm.mlir.constant(512 : index) : i64
%9 = llvm.mlir.constant(0 : i64) : i64
%10 = llvm.mlir.constant(dense<0> : vector<4xi32>) : vector<4xi32>
%11 = llvm.mlir.constant(0 : index) : i64
%12 = llvm.mlir.constant(dense<0> : vector<1xi32>) : vector<1xi32>
%13 = llvm.mlir.constant(128 : index) : i64
%14 = llvm.mlir.constant(4 : index) : i64
%15 = llvm.mlir.constant(1 : index) : i64
%16 = llvm.mlir.constant(32 : index) : i64
%17 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>>
%18 = llvm.extractvalue %17[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>
%19 = llvm.load %18 : !llvm.ptr<ptr<i8>>
%20 = llvm.bitcast %19 : !llvm.ptr<i8> to !llvm.ptr<i32>
%21 = llvm.ptrtoint %20 : !llvm.ptr<i32> to i64
%22 = llvm.and %21, %5 : i64
%23 = llvm.icmp "eq" %22, %11 : i64
"llvm.intr.assume"(%23) : (i1) -> ()
%24 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>>
%25 = llvm.extractvalue %24[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>
%26 = llvm.getelementptr %25[1] : (!llvm.ptr<ptr<i8>>) -> !llvm.ptr<ptr<i8>>
%27 = llvm.load %26 : !llvm.ptr<ptr<i8>>
%28 = llvm.bitcast %27 : !llvm.ptr<i8> to !llvm.ptr<i32>
%29 = llvm.ptrtoint %28 : !llvm.ptr<i32> to i64
%30 = llvm.and %29, %5 : i64
%31 = llvm.icmp "eq" %30, %11 : i64
"llvm.intr.assume"(%31) : (i1) -> ()
%32 = llvm.load %arg2 : !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>>
%33 = llvm.extractvalue %32[0] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>
%34 = llvm.zext %33 : i32 to i64
%35 = llvm.extractvalue %32[1] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>
%36 = llvm.zext %35 : i32 to i64
%37 = llvm.mul %36, %16 : i64
%38 = llvm.mul %34, %16 : i64
%39 = llvm.mlir.null : !llvm.ptr<i32>
%40 = llvm.getelementptr %39[4] : (!llvm.ptr<i32>) -> !llvm.ptr<i32>
%41 = llvm.ptrtoint %40 : !llvm.ptr<i32> to i64
%42 = llvm.alloca %41 x i32 {alignment = 128 : i64} : (i64) -> !llvm.ptr<i32>
%43 = llvm.mul %11, %14 : i64
%44 = llvm.add %43, %43 : i64
%45 = llvm.add %44, %11 : i64
%46 = llvm.getelementptr %42[%45] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32>
%47 = llvm.bitcast %46 : !llvm.ptr<i32> to !llvm.ptr<vector<4xi32>>
llvm.store %10, %47 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>>
%48 = llvm.bitcast %42 : !llvm.ptr<i32> to !llvm.ptr<vector<4xi32>>
%49 = llvm.load %48 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>>
%50 = llvm.mul %37, %6 : i64
%51 = llvm.add %50, %11 : i64
%52 = llvm.mul %38, %15 : i64
%53 = llvm.add %51, %52 : i64
%54 = llvm.mul %37, %7 : i64
%55 = llvm.add %54, %11 : i64
%56 = llvm.mul %38, %13 : i64
%57 = llvm.add %55, %56 : i64
%58 = llvm.mul %9, %15 : i64
%59 = llvm.add %57, %58 : i64
llvm.br ^bb1(%11 : i64)
^bb1(%60: i64): // 2 preds: ^bb0, ^bb10
%61 = llvm.icmp "slt" %60, %16 : i64
llvm.cond_br %61, ^bb2(%11 : i64), ^bb11
^bb2(%62: i64): // 2 preds: ^bb1, ^bb9
%63 = llvm.icmp "slt" %62, %16 : i64
llvm.cond_br %63, ^bb3, ^bb10
^bb3: // pred: ^bb2
%64 = llvm.mul %60, %1 : i64
%65 = llvm.add %59, %64 : i64
%66 = llvm.mul %62, %2 : i64
%67 = llvm.add %65, %66 : i64
%68 = llvm.mul %9, %4 : i64
%69 = llvm.add %67, %68 : i64
%70 = llvm.mul %60, %3 : i64
%71 = llvm.add %53, %70 : i64
%72 = llvm.mul %62, %4 : i64
%73 = llvm.add %71, %72 : i64
llvm.br ^bb4(%11 : i64)
^bb4(%74: i64): // 2 preds: ^bb3, ^bb8
%75 = llvm.icmp "slt" %74, %13 : i64
llvm.cond_br %75, ^bb5, ^bb9
^bb5: // pred: ^bb4
%76 = llvm.mul %9, %8 : i64
%77 = llvm.add %69, %76 : i64
%78 = llvm.mul %9, %13 : i64
%79 = llvm.add %77, %78 : i64
%80 = llvm.mul %74, %15 : i64
%81 = llvm.add %79, %80 : i64
llvm.br ^bb6(%11 : i64)
^bb6(%82: i64): // 2 preds: ^bb5, ^bb7
%83 = llvm.icmp "slt" %82, %14 : i64
llvm.cond_br %83, ^bb7, ^bb8
^bb7: // pred: ^bb6
%84 = llvm.add %81, %76 : i64
%85 = llvm.mul %82, %13 : i64
%86 = llvm.add %84, %85 : i64
%87 = llvm.add %86, %58 : i64
%88 = llvm.mul %9, %14 : i64
%89 = llvm.add %73, %88 : i64
%90 = llvm.mul %82, %15 : i64
%91 = llvm.add %89, %90 : i64
%92 = llvm.add %87, %43 : i64
%93 = llvm.add %92, %43 : i64
%94 = llvm.add %93, %43 : i64
%95 = llvm.add %94, %11 : i64
%96 = llvm.getelementptr %20[%95] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32>
%97 = llvm.bitcast %96 : !llvm.ptr<i32> to !llvm.ptr<vector<4xi32>>
%98 = llvm.load %97 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>>
%99 = llvm.add %98, %49 : vector<4xi32>
llvm.store %99, %48 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>>
%100 = llvm.add %43, %11 : i64
%101 = llvm.getelementptr %42[%100] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32>
%102 = llvm.bitcast %101 : !llvm.ptr<i32> to !llvm.ptr<vector<4xi32>>
%103 = llvm.load %102 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>>
%104 = llvm.getelementptr %28[%91] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32>
%105 = llvm.load %104 : !llvm.ptr<i32>
%106 = llvm.mlir.undef : vector<1xi32>
%107 = llvm.insertelement %105, %106[%0 : i32] : vector<1xi32>
%108 = llvm.extractelement %107[%11 : i64] : vector<1xi32>
%109 = "llvm.intr.vector.reduce.add"(%103) : (vector<4xi32>) -> i32
%110 = llvm.add %108, %109 : i32
%111 = llvm.insertelement %110, %12[%11 : i64] : vector<1xi32>
%112 = llvm.extractelement %111[%9 : i64] : vector<1xi32>
%113 = llvm.insertelement %112, %106[%0 : i32] : vector<1xi32>
%114 = llvm.extractelement %113[%11 : i64] : vector<1xi32>
llvm.store %114, %104 : !llvm.ptr<i32>
%115 = llvm.add %82, %15 : i64
llvm.br ^bb6(%115 : i64)
^bb8: // pred: ^bb6
%116 = llvm.add %74, %14 : i64
llvm.br ^bb4(%116 : i64)
^bb9: // pred: ^bb4
%117 = llvm.add %62, %14 : i64
llvm.br ^bb2(%117 : i64)
^bb10: // pred: ^bb2
%118 = llvm.add %60, %15 : i64
llvm.br ^bb1(%118 : i64)
^bb11: // pred: ^bb1
llvm.return %0 : i32
}
}
}
}
func.func private @_split_reduction_pass2() {
%c268435488_i32 = arith.constant 268435488 : i32
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c-1_i32 = arith.constant -1 : i32
%c-1_i64 = arith.constant -1 : i64
%c1048576 = arith.constant 1048576 : index
%c0 = arith.constant 0 : index
%c524288 = arith.constant 524288 : index
%c67108864 = arith.constant 67108864 : index
%c1_i32 = arith.constant 1 : i32
%c128_i32 = arith.constant 128 : i32
%c512 = arith.constant 512 : index
%c256 = arith.constant 256 : index
%c1 = arith.constant 1 : index
%device = hal.ex.shared_device : !hal.device
%allocator = hal.device.allocator<%device : !hal.device> : !hal.allocator
%buffer = hal.allocator.allocate<%allocator : !hal.allocator> type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c67108864}
%cmd = hal.command_buffer.create device(%device : !hal.device) mode("OneShot|AllowInlineExecution") categories(Transfer) : !hal.command_buffer
hal.command_buffer.fill_buffer<%cmd : !hal.command_buffer> target(%buffer : !hal.buffer)[%c0, %c67108864] pattern(%c1_i32 : i32)
hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None")
hal.command_buffer.finalize<%cmd : !hal.command_buffer>
%0 = util.null : !hal.fence
%fence = hal.fence.create device(%device : !hal.device) flags("None") : !hal.fence
hal.device.queue.execute<%device : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) commands([%cmd])
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.status.check_ok %status, "failed to wait on timepoint"
%1 = util.do_not_optimize(%buffer) : !hal.buffer
%len = hal.buffer.length<%1 : !hal.buffer> : index
%buffer_0 = hal.allocator.allocate<%allocator : !hal.allocator> type("HostVisible|DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage|MappingScoped|MappingAccessRandom|Mapping") : !hal.buffer{%c1048576}
%cmd_1 = hal.command_buffer.create device(%device : !hal.device) mode("OneShot|AllowInlineExecution") categories("Transfer|Dispatch") : !hal.command_buffer
%_device_query_0 = util.global.load @_device_query_0 : i1
cf.cond_br %_device_query_0, ^bb1, ^bb2
^bb1: // pred: ^bb0
%_pipeline_layout_0 = util.global.load @_pipeline_layout_0 : !hal.pipeline_layout
hal.command_buffer.push_descriptor_set<%cmd_1 : !hal.command_buffer> layout(%_pipeline_layout_0 : !hal.pipeline_layout)[%c0] bindings([
%c0 = (%1 : !hal.buffer)[%c0, %len],
%c1 = (%buffer_0 : !hal.buffer)[%c0, %c1048576]
])
%_executable__split_reduction_pass2_dispatch_0 = util.global.load @_executable__split_reduction_pass2_dispatch_0 : !hal.executable
hal.command_buffer.dispatch<%cmd_1 : !hal.command_buffer> target(%_executable__split_reduction_pass2_dispatch_0 : !hal.executable)[0] workgroups([%c8, %c16, %c1])
hal.command_buffer.fill_buffer<%cmd_1 : !hal.command_buffer> target(%buffer_0 : !hal.buffer)[%c524288, %c524288] pattern(%c128_i32 : i32)
hal.command_buffer.execution_barrier<%cmd_1 : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None")
hal.command_buffer.finalize<%cmd_1 : !hal.command_buffer>
%fence_2 = hal.fence.create device(%device : !hal.device) flags("None") : !hal.fence
hal.device.queue.execute<%device : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence_2) commands([%cmd_1])
%status_3 = hal.fence.await until([%fence_2]) timeout_millis(%c-1_i32) : i32
util.status.check_ok %status_3, "failed to wait on timepoint"
%view = hal.buffer_view.create buffer(%buffer_0 : !hal.buffer)[%c0, %c524288] shape([%c512, %c256]) type(%c268435488_i32) encoding(%c1_i32) : !hal.buffer_view
%view_4 = hal.buffer_view.create buffer(%buffer_0 : !hal.buffer)[%c524288, %c524288] shape([%c512, %c256]) type(%c268435488_i32) encoding(%c1_i32) : !hal.buffer_view
check.expect_eq(%view, %view_4) : !hal.buffer_view
return
^bb2: // pred: ^bb0
util.unreachable "device not supported in the compiled configuration"
}
}
// -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) ('util.initializer' operation) //----- //
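// Note: iree-util-simplify-global-accesses batches and deduplicates util.global.load/store ops within each region; the initializer bodies below already access each global exactly once, so there is nothing for it to fold there.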
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
util.global private @_device_query_0 : i1
util.global private @_device_query_0_ok : i1
util.initializer {
%device = hal.ex.shared_device : !hal.device
%ok, %value = hal.device.query<%device : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false
util.global.store %value, @_device_query_0 : i1
util.global.store %ok, @_device_query_0_ok : i1
util.initializer.return
}
util.global private @_descriptor_set_layout_0 : !hal.descriptor_set_layout
util.initializer {
%device = hal.ex.shared_device : !hal.device
%descriptor_set_layout = hal.descriptor_set_layout.create device(%device : !hal.device) flags("None") bindings([#hal.descriptor_set.binding<0, storage_buffer, ReadOnly>, #hal.descriptor_set.binding<1, storage_buffer>]) : !hal.descriptor_set_layout
util.global.store %descriptor_set_layout, @_descriptor_set_layout_0 : !hal.descriptor_set_layout
util.initializer.return
}
util.global private @_pipeline_layout_0 : !hal.pipeline_layout
util.initializer {
%_descriptor_set_layout_0 = util.global.load @_descriptor_set_layout_0 : !hal.descriptor_set_layout
%device = hal.ex.shared_device : !hal.device
%pipeline_layout = hal.pipeline_layout.create device(%device : !hal.device) push_constants(0) layouts([%_descriptor_set_layout_0]) : !hal.pipeline_layout
util.global.store %pipeline_layout, @_pipeline_layout_0 : !hal.pipeline_layout
util.initializer.return
}
util.global private @_executable__split_reduction_pass2_dispatch_0 : !hal.executable
util.initializer {
%_device_query_0 = util.global.load @_device_query_0 : i1
%device = hal.ex.shared_device : !hal.device
cf.cond_br %_device_query_0, ^bb1, ^bb2
^bb1: // pred: ^bb0
%_pipeline_layout_0 = util.global.load @_pipeline_layout_0 : !hal.pipeline_layout
%exe = hal.executable.create device(%device : !hal.device) target(@_split_reduction_pass2_dispatch_0::@embedded_elf_x86_64) layouts([%_pipeline_layout_0]) : !hal.executable
cf.br ^bb3(%exe : !hal.executable)
^bb2: // pred: ^bb0
%0 = util.null : !hal.executable
cf.br ^bb3(%0 : !hal.executable)
^bb3(%1: !hal.executable): // 2 preds: ^bb1, ^bb2
util.global.store %1, @_executable__split_reduction_pass2_dispatch_0 : !hal.executable
util.initializer.return
}
func.func @split_reduction_pass2() attributes {iree.abi.stub} {
call @_split_reduction_pass2() : () -> ()
return
}
hal.executable private @_split_reduction_pass2_dispatch_0 {
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
hal.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 ordinal(0) layout(#pipeline_layout) attributes {translation_info = #translation} {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index):
%c1 = arith.constant 1 : index
%c16 = arith.constant 16 : index
%c8 = arith.constant 8 : index
hal.return %c8, %c16, %c1 : index, index, index
}
builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} {
llvm.func internal @_split_reduction_pass2_dispatch_0_generic_512x256x128(%arg0: !llvm.ptr<struct<"iree_hal_executable_environment_v0_t", (ptr<i32>, ptr<func<i32 (ptr<func<i32 (ptr<i8>)>>, ptr<i8>)>>, ptr<ptr<func<i32 (ptr<i8>)>>>, struct<"iree_hal_processor_v0_t", (array<8 x i64>)>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>> {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {sym_visibility = "private"} {
%0 = llvm.mlir.constant(0 : i32) : i32
%1 = llvm.mlir.constant(32768 : i64) : i64
%2 = llvm.mlir.constant(128 : i64) : i64
%3 = llvm.mlir.constant(256 : i64) : i64
%4 = llvm.mlir.constant(1 : i64) : i64
%5 = llvm.mlir.constant(63 : index) : i64
%6 = llvm.mlir.constant(256 : index) : i64
%7 = llvm.mlir.constant(32768 : index) : i64
%8 = llvm.mlir.constant(512 : index) : i64
%9 = llvm.mlir.constant(0 : i64) : i64
%10 = llvm.mlir.constant(dense<0> : vector<4xi32>) : vector<4xi32>
%11 = llvm.mlir.constant(0 : index) : i64
%12 = llvm.mlir.constant(dense<0> : vector<1xi32>) : vector<1xi32>
%13 = llvm.mlir.constant(128 : index) : i64
%14 = llvm.mlir.constant(4 : index) : i64
%15 = llvm.mlir.constant(1 : index) : i64
%16 = llvm.mlir.constant(32 : index) : i64
%17 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>>
%18 = llvm.extractvalue %17[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>
%19 = llvm.load %18 : !llvm.ptr<ptr<i8>>
%20 = llvm.bitcast %19 : !llvm.ptr<i8> to !llvm.ptr<i32>
%21 = llvm.ptrtoint %20 : !llvm.ptr<i32> to i64
%22 = llvm.and %21, %5 : i64
%23 = llvm.icmp "eq" %22, %11 : i64
"llvm.intr.assume"(%23) : (i1) -> ()
%24 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>>
%25 = llvm.extractvalue %24[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>
%26 = llvm.getelementptr %25[1] : (!llvm.ptr<ptr<i8>>) -> !llvm.ptr<ptr<i8>>
%27 = llvm.load %26 : !llvm.ptr<ptr<i8>>
%28 = llvm.bitcast %27 : !llvm.ptr<i8> to !llvm.ptr<i32>
%29 = llvm.ptrtoint %28 : !llvm.ptr<i32> to i64
%30 = llvm.and %29, %5 : i64
%31 = llvm.icmp "eq" %30, %11 : i64
"llvm.intr.assume"(%31) : (i1) -> ()
%32 = llvm.load %arg2 : !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>>
%33 = llvm.extractvalue %32[0] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>
%34 = llvm.zext %33 : i32 to i64
%35 = llvm.extractvalue %32[1] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>
%36 = llvm.zext %35 : i32 to i64
%37 = llvm.mul %36, %16 : i64
%38 = llvm.mul %34, %16 : i64
%39 = llvm.mlir.null : !llvm.ptr<i32>
%40 = llvm.getelementptr %39[4] : (!llvm.ptr<i32>) -> !llvm.ptr<i32>
%41 = llvm.ptrtoint %40 : !llvm.ptr<i32> to i64
%42 = llvm.alloca %41 x i32 {alignment = 128 : i64} : (i64) -> !llvm.ptr<i32>
%43 = llvm.mul %11, %14 : i64
%44 = llvm.add %43, %43 : i64
%45 = llvm.add %44, %11 : i64
%46 = llvm.getelementptr %42[%45] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32>
%47 = llvm.bitcast %46 : !llvm.ptr<i32> to !llvm.ptr<vector<4xi32>>
llvm.store %10, %47 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>>
%48 = llvm.bitcast %42 : !llvm.ptr<i32> to !llvm.ptr<vector<4xi32>>
%49 = llvm.load %48 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>>
%50 = llvm.mul %37, %6 : i64
%51 = llvm.add %50, %11 : i64
%52 = llvm.mul %38, %15 : i64
%53 = llvm.add %51, %52 : i64
%54 = llvm.mul %37, %7 : i64
%55 = llvm.add %54, %11 : i64
%56 = llvm.mul %38, %13 : i64
%57 = llvm.add %55, %56 : i64
%58 = llvm.mul %9, %15 : i64
%59 = llvm.add %57, %58 : i64
llvm.br ^bb1(%11 : i64)
^bb1(%60: i64): // 2 preds: ^bb0, ^bb10
%61 = llvm.icmp "slt" %60, %16 : i64
llvm.cond_br %61, ^bb2(%11 : i64), ^bb11
^bb2(%62: i64): // 2 preds: ^bb1, ^bb9
%63 = llvm.icmp "slt" %62, %16 : i64
llvm.cond_br %63, ^bb3, ^bb10
^bb3: // pred: ^bb2
%64 = llvm.mul %60, %1 : i64
%65 = llvm.add %59, %64 : i64
%66 = llvm.mul %62, %2 : i64
%67 = llvm.add %65, %66 : i64
%68 = llvm.mul %9, %4 : i64
%69 = llvm.add %67, %68 : i64
%70 = llvm.mul %60, %3 : i64
%71 = llvm.add %53, %70 : i64
%72 = llvm.mul %62, %4 : i64
%73 = llvm.add %71, %72 : i64
llvm.br ^bb4(%11 : i64)
^bb4(%74: i64): // 2 preds: ^bb3, ^bb8
%75 = llvm.icmp "slt" %74, %13 : i64
llvm.cond_br %75, ^bb5, ^bb9
^bb5: // pred: ^bb4
%76 = llvm.mul %9, %8 : i64
%77 = llvm.add %69, %76 : i64
%78 = llvm.mul %9, %13 : i64
%79 = llvm.add %77, %78 : i64
%80 = llvm.mul %74, %15 : i64
%81 = llvm.add %79, %80 : i64
llvm.br ^bb6(%11 : i64)
^bb6(%82: i64): // 2 preds: ^bb5, ^bb7
%83 = llvm.icmp "slt" %82, %14 : i64
llvm.cond_br %83, ^bb7, ^bb8
^bb7: // pred: ^bb6
%84 = llvm.add %81, %76 : i64
%85 = llvm.mul %82, %13 : i64
%86 = llvm.add %84, %85 : i64
%87 = llvm.add %86, %58 : i64
%88 = llvm.mul %9, %14 : i64
%89 = llvm.add %73, %88 : i64
%90 = llvm.mul %82, %15 : i64
%91 = llvm.add %89, %90 : i64
%92 = llvm.add %87, %43 : i64
%93 = llvm.add %92, %43 : i64
%94 = llvm.add %93, %43 : i64
%95 = llvm.add %94, %11 : i64
%96 = llvm.getelementptr %20[%95] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32>
%97 = llvm.bitcast %96 : !llvm.ptr<i32> to !llvm.ptr<vector<4xi32>>
%98 = llvm.load %97 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>>
%99 = llvm.add %98, %49 : vector<4xi32>
llvm.store %99, %48 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>>
%100 = llvm.add %43, %11 : i64
%101 = llvm.getelementptr %42[%100] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32>
%102 = llvm.bitcast %101 : !llvm.ptr<i32> to !llvm.ptr<vector<4xi32>>
%103 = llvm.load %102 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>>
%104 = llvm.getelementptr %28[%91] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32>
%105 = llvm.load %104 : !llvm.ptr<i32>
%106 = llvm.mlir.undef : vector<1xi32>
%107 = llvm.insertelement %105, %106[%0 : i32] : vector<1xi32>
%108 = llvm.extractelement %107[%11 : i64] : vector<1xi32>
%109 = "llvm.intr.vector.reduce.add"(%103) : (vector<4xi32>) -> i32
%110 = llvm.add %108, %109 : i32
%111 = llvm.insertelement %110, %12[%11 : i64] : vector<1xi32>
%112 = llvm.extractelement %111[%9 : i64] : vector<1xi32>
%113 = llvm.insertelement %112, %106[%0 : i32] : vector<1xi32>
%114 = llvm.extractelement %113[%11 : i64] : vector<1xi32>
llvm.store %114, %104 : !llvm.ptr<i32>
%115 = llvm.add %82, %15 : i64
llvm.br ^bb6(%115 : i64)
^bb8: // pred: ^bb6
%116 = llvm.add %74, %14 : i64
llvm.br ^bb4(%116 : i64)
^bb9: // pred: ^bb4
%117 = llvm.add %62, %14 : i64
llvm.br ^bb2(%117 : i64)
^bb10: // pred: ^bb2
%118 = llvm.add %60, %15 : i64
llvm.br ^bb1(%118 : i64)
^bb11: // pred: ^bb1
llvm.return %0 : i32
}
}
}
}
func.func private @_split_reduction_pass2() {
%c268435488_i32 = arith.constant 268435488 : i32
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c-1_i32 = arith.constant -1 : i32
%c-1_i64 = arith.constant -1 : i64
%c1048576 = arith.constant 1048576 : index
%c0 = arith.constant 0 : index
%c524288 = arith.constant 524288 : index
%c67108864 = arith.constant 67108864 : index
%c1_i32 = arith.constant 1 : i32
%c128_i32 = arith.constant 128 : i32
%c512 = arith.constant 512 : index
%c256 = arith.constant 256 : index
%c1 = arith.constant 1 : index
%device = hal.ex.shared_device : !hal.device
%allocator = hal.device.allocator<%device : !hal.device> : !hal.allocator
%buffer = hal.allocator.allocate<%allocator : !hal.allocator> type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c67108864}
%cmd = hal.command_buffer.create device(%device : !hal.device) mode("OneShot|AllowInlineExecution") categories(Transfer) : !hal.command_buffer
hal.command_buffer.fill_buffer<%cmd : !hal.command_buffer> target(%buffer : !hal.buffer)[%c0, %c67108864] pattern(%c1_i32 : i32)
hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None")
hal.command_buffer.finalize<%cmd : !hal.command_buffer>
%0 = util.null : !hal.fence
%fence = hal.fence.create device(%device : !hal.device) flags("None") : !hal.fence
hal.device.queue.execute<%device : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) commands([%cmd])
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.status.check_ok %status, "failed to wait on timepoint"
%1 = util.do_not_optimize(%buffer) : !hal.buffer
%len = hal.buffer.length<%1 : !hal.buffer> : index
%buffer_0 = hal.allocator.allocate<%allocator : !hal.allocator> type("HostVisible|DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage|MappingScoped|MappingAccessRandom|Mapping") : !hal.buffer{%c1048576}
%cmd_1 = hal.command_buffer.create device(%device : !hal.device) mode("OneShot|AllowInlineExecution") categories("Transfer|Dispatch") : !hal.command_buffer
%_device_query_0 = util.global.load @_device_query_0 : i1
cf.cond_br %_device_query_0, ^bb1, ^bb2
^bb1: // pred: ^bb0
%_pipeline_layout_0 = util.global.load @_pipeline_layout_0 : !hal.pipeline_layout
hal.command_buffer.push_descriptor_set<%cmd_1 : !hal.command_buffer> layout(%_pipeline_layout_0 : !hal.pipeline_layout)[%c0] bindings([
%c0 = (%1 : !hal.buffer)[%c0, %len],
%c1 = (%buffer_0 : !hal.buffer)[%c0, %c1048576]
])
%_executable__split_reduction_pass2_dispatch_0 = util.global.load @_executable__split_reduction_pass2_dispatch_0 : !hal.executable
hal.command_buffer.dispatch<%cmd_1 : !hal.command_buffer> target(%_executable__split_reduction_pass2_dispatch_0 : !hal.executable)[0] workgroups([%c8, %c16, %c1])
hal.command_buffer.fill_buffer<%cmd_1 : !hal.command_buffer> target(%buffer_0 : !hal.buffer)[%c524288, %c524288] pattern(%c128_i32 : i32)
hal.command_buffer.execution_barrier<%cmd_1 : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None")
hal.command_buffer.finalize<%cmd_1 : !hal.command_buffer>
%fence_2 = hal.fence.create device(%device : !hal.device) flags("None") : !hal.fence
hal.device.queue.execute<%device : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence_2) commands([%cmd_1])
%status_3 = hal.fence.await until([%fence_2]) timeout_millis(%c-1_i32) : i32
util.status.check_ok %status_3, "failed to wait on timepoint"
%view = hal.buffer_view.create buffer(%buffer_0 : !hal.buffer)[%c0, %c524288] shape([%c512, %c256]) type(%c268435488_i32) encoding(%c1_i32) : !hal.buffer_view
%view_4 = hal.buffer_view.create buffer(%buffer_0 : !hal.buffer)[%c524288, %c524288] shape([%c512, %c256]) type(%c268435488_i32) encoding(%c1_i32) : !hal.buffer_view
check.expect_eq(%view, %view_4) : !hal.buffer_view
return
^bb2: // pred: ^bb0
util.unreachable "device not supported in the compiled configuration"
}
}
// -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) ('func.func' operation: @_split_reduction_pass2) //----- //
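// Note: in this dump the util.global.load ops for @_device_query_0, @_pipeline_layout_0, and @_executable__split_reduction_pass2_dispatch_0 have been hoisted to the entry block of @_split_reduction_pass2, so ^bb1 no longer reloads them at their use sites (compare the previous dump above).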
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
util.global private @_device_query_0 : i1
util.global private @_device_query_0_ok : i1
util.initializer {
%device = hal.ex.shared_device : !hal.device
%ok, %value = hal.device.query<%device : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false
util.global.store %value, @_device_query_0 : i1
util.global.store %ok, @_device_query_0_ok : i1
util.initializer.return
}
util.global private @_descriptor_set_layout_0 : !hal.descriptor_set_layout
util.initializer {
%device = hal.ex.shared_device : !hal.device
%descriptor_set_layout = hal.descriptor_set_layout.create device(%device : !hal.device) flags("None") bindings([#hal.descriptor_set.binding<0, storage_buffer, ReadOnly>, #hal.descriptor_set.binding<1, storage_buffer>]) : !hal.descriptor_set_layout
util.global.store %descriptor_set_layout, @_descriptor_set_layout_0 : !hal.descriptor_set_layout
util.initializer.return
}
util.global private @_pipeline_layout_0 : !hal.pipeline_layout
util.initializer {
%_descriptor_set_layout_0 = util.global.load @_descriptor_set_layout_0 : !hal.descriptor_set_layout
%device = hal.ex.shared_device : !hal.device
%pipeline_layout = hal.pipeline_layout.create device(%device : !hal.device) push_constants(0) layouts([%_descriptor_set_layout_0]) : !hal.pipeline_layout
util.global.store %pipeline_layout, @_pipeline_layout_0 : !hal.pipeline_layout
util.initializer.return
}
util.global private @_executable__split_reduction_pass2_dispatch_0 : !hal.executable
util.initializer {
%_device_query_0 = util.global.load @_device_query_0 : i1
%device = hal.ex.shared_device : !hal.device
cf.cond_br %_device_query_0, ^bb1, ^bb2
^bb1: // pred: ^bb0
%_pipeline_layout_0 = util.global.load @_pipeline_layout_0 : !hal.pipeline_layout
%exe = hal.executable.create device(%device : !hal.device) target(@_split_reduction_pass2_dispatch_0::@embedded_elf_x86_64) layouts([%_pipeline_layout_0]) : !hal.executable
cf.br ^bb3(%exe : !hal.executable)
^bb2: // pred: ^bb0
%0 = util.null : !hal.executable
cf.br ^bb3(%0 : !hal.executable)
^bb3(%1: !hal.executable): // 2 preds: ^bb1, ^bb2
util.global.store %1, @_executable__split_reduction_pass2_dispatch_0 : !hal.executable
util.initializer.return
}
func.func @split_reduction_pass2() attributes {iree.abi.stub} {
call @_split_reduction_pass2() : () -> ()
return
}
hal.executable private @_split_reduction_pass2_dispatch_0 {
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
hal.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 ordinal(0) layout(#pipeline_layout) attributes {translation_info = #translation} {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index):
%c1 = arith.constant 1 : index
%c16 = arith.constant 16 : index
%c8 = arith.constant 8 : index
hal.return %c8, %c16, %c1 : index, index, index
}
builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} {
llvm.func internal @_split_reduction_pass2_dispatch_0_generic_512x256x128(%arg0: !llvm.ptr<struct<"iree_hal_executable_environment_v0_t", (ptr<i32>, ptr<func<i32 (ptr<func<i32 (ptr<i8>)>>, ptr<i8>)>>, ptr<ptr<func<i32 (ptr<i8>)>>>, struct<"iree_hal_processor_v0_t", (array<8 x i64>)>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>> {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {sym_visibility = "private"} {
%0 = llvm.mlir.constant(0 : i32) : i32
%1 = llvm.mlir.constant(32768 : i64) : i64
%2 = llvm.mlir.constant(128 : i64) : i64
%3 = llvm.mlir.constant(256 : i64) : i64
%4 = llvm.mlir.constant(1 : i64) : i64
%5 = llvm.mlir.constant(63 : index) : i64
%6 = llvm.mlir.constant(256 : index) : i64
%7 = llvm.mlir.constant(32768 : index) : i64
%8 = llvm.mlir.constant(512 : index) : i64
%9 = llvm.mlir.constant(0 : i64) : i64
%10 = llvm.mlir.constant(dense<0> : vector<4xi32>) : vector<4xi32>
%11 = llvm.mlir.constant(0 : index) : i64
%12 = llvm.mlir.constant(dense<0> : vector<1xi32>) : vector<1xi32>
%13 = llvm.mlir.constant(128 : index) : i64
%14 = llvm.mlir.constant(4 : index) : i64
%15 = llvm.mlir.constant(1 : index) : i64
%16 = llvm.mlir.constant(32 : index) : i64
%17 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>>
%18 = llvm.extractvalue %17[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>
%19 = llvm.load %18 : !llvm.ptr<ptr<i8>>
%20 = llvm.bitcast %19 : !llvm.ptr<i8> to !llvm.ptr<i32>
%21 = llvm.ptrtoint %20 : !llvm.ptr<i32> to i64
%22 = llvm.and %21, %5 : i64
%23 = llvm.icmp "eq" %22, %11 : i64
"llvm.intr.assume"(%23) : (i1) -> ()
%24 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>>
%25 = llvm.extractvalue %24[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>
%26 = llvm.getelementptr %25[1] : (!llvm.ptr<ptr<i8>>) -> !llvm.ptr<ptr<i8>>
%27 = llvm.load %26 : !llvm.ptr<ptr<i8>>
%28 = llvm.bitcast %27 : !llvm.ptr<i8> to !llvm.ptr<i32>
%29 = llvm.ptrtoint %28 : !llvm.ptr<i32> to i64
%30 = llvm.and %29, %5 : i64
%31 = llvm.icmp "eq" %30, %11 : i64
"llvm.intr.assume"(%31) : (i1) -> ()
%32 = llvm.load %arg2 : !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>>
%33 = llvm.extractvalue %32[0] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>
%34 = llvm.zext %33 : i32 to i64
%35 = llvm.extractvalue %32[1] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>
%36 = llvm.zext %35 : i32 to i64
%37 = llvm.mul %36, %16 : i64
%38 = llvm.mul %34, %16 : i64
%39 = llvm.mlir.null : !llvm.ptr<i32>
%40 = llvm.getelementptr %39[4] : (!llvm.ptr<i32>) -> !llvm.ptr<i32>
%41 = llvm.ptrtoint %40 : !llvm.ptr<i32> to i64
%42 = llvm.alloca %41 x i32 {alignment = 128 : i64} : (i64) -> !llvm.ptr<i32>
%43 = llvm.mul %11, %14 : i64
%44 = llvm.add %43, %43 : i64
%45 = llvm.add %44, %11 : i64
%46 = llvm.getelementptr %42[%45] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32>
%47 = llvm.bitcast %46 : !llvm.ptr<i32> to !llvm.ptr<vector<4xi32>>
llvm.store %10, %47 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>>
%48 = llvm.bitcast %42 : !llvm.ptr<i32> to !llvm.ptr<vector<4xi32>>
%49 = llvm.load %48 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>>
%50 = llvm.mul %37, %6 : i64
%51 = llvm.add %50, %11 : i64
%52 = llvm.mul %38, %15 : i64
%53 = llvm.add %51, %52 : i64
%54 = llvm.mul %37, %7 : i64
%55 = llvm.add %54, %11 : i64
%56 = llvm.mul %38, %13 : i64
%57 = llvm.add %55, %56 : i64
%58 = llvm.mul %9, %15 : i64
%59 = llvm.add %57, %58 : i64
llvm.br ^bb1(%11 : i64)
^bb1(%60: i64): // 2 preds: ^bb0, ^bb10
%61 = llvm.icmp "slt" %60, %16 : i64
llvm.cond_br %61, ^bb2(%11 : i64), ^bb11
^bb2(%62: i64): // 2 preds: ^bb1, ^bb9
%63 = llvm.icmp "slt" %62, %16 : i64
llvm.cond_br %63, ^bb3, ^bb10
^bb3: // pred: ^bb2
%64 = llvm.mul %60, %1 : i64
%65 = llvm.add %59, %64 : i64
%66 = llvm.mul %62, %2 : i64
%67 = llvm.add %65, %66 : i64
%68 = llvm.mul %9, %4 : i64
%69 = llvm.add %67, %68 : i64
%70 = llvm.mul %60, %3 : i64
%71 = llvm.add %53, %70 : i64
%72 = llvm.mul %62, %4 : i64
%73 = llvm.add %71, %72 : i64
llvm.br ^bb4(%11 : i64)
^bb4(%74: i64): // 2 preds: ^bb3, ^bb8
%75 = llvm.icmp "slt" %74, %13 : i64
llvm.cond_br %75, ^bb5, ^bb9
^bb5: // pred: ^bb4
%76 = llvm.mul %9, %8 : i64
%77 = llvm.add %69, %76 : i64
%78 = llvm.mul %9, %13 : i64
%79 = llvm.add %77, %78 : i64
%80 = llvm.mul %74, %15 : i64
%81 = llvm.add %79, %80 : i64
llvm.br ^bb6(%11 : i64)
^bb6(%82: i64): // 2 preds: ^bb5, ^bb7
%83 = llvm.icmp "slt" %82, %14 : i64
llvm.cond_br %83, ^bb7, ^bb8
^bb7: // pred: ^bb6
%84 = llvm.add %81, %76 : i64
%85 = llvm.mul %82, %13 : i64
%86 = llvm.add %84, %85 : i64
%87 = llvm.add %86, %58 : i64
%88 = llvm.mul %9, %14 : i64
%89 = llvm.add %73, %88 : i64
%90 = llvm.mul %82, %15 : i64
%91 = llvm.add %89, %90 : i64
%92 = llvm.add %87, %43 : i64
%93 = llvm.add %92, %43 : i64
%94 = llvm.add %93, %43 : i64
%95 = llvm.add %94, %11 : i64
%96 = llvm.getelementptr %20[%95] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32>
%97 = llvm.bitcast %96 : !llvm.ptr<i32> to !llvm.ptr<vector<4xi32>>
%98 = llvm.load %97 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>>
%99 = llvm.add %98, %49 : vector<4xi32>
llvm.store %99, %48 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>>
%100 = llvm.add %43, %11 : i64
%101 = llvm.getelementptr %42[%100] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32>
%102 = llvm.bitcast %101 : !llvm.ptr<i32> to !llvm.ptr<vector<4xi32>>
%103 = llvm.load %102 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>>
%104 = llvm.getelementptr %28[%91] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32>
%105 = llvm.load %104 : !llvm.ptr<i32>
%106 = llvm.mlir.undef : vector<1xi32>
%107 = llvm.insertelement %105, %106[%0 : i32] : vector<1xi32>
%108 = llvm.extractelement %107[%11 : i64] : vector<1xi32>
%109 = "llvm.intr.vector.reduce.add"(%103) : (vector<4xi32>) -> i32
%110 = llvm.add %108, %109 : i32
%111 = llvm.insertelement %110, %12[%11 : i64] : vector<1xi32>
%112 = llvm.extractelement %111[%9 : i64] : vector<1xi32>
%113 = llvm.insertelement %112, %106[%0 : i32] : vector<1xi32>
%114 = llvm.extractelement %113[%11 : i64] : vector<1xi32>
llvm.store %114, %104 : !llvm.ptr<i32>
%115 = llvm.add %82, %15 : i64
llvm.br ^bb6(%115 : i64)
^bb8: // pred: ^bb6
%116 = llvm.add %74, %14 : i64
llvm.br ^bb4(%116 : i64)
^bb9: // pred: ^bb4
%117 = llvm.add %62, %14 : i64
llvm.br ^bb2(%117 : i64)
^bb10: // pred: ^bb2
%118 = llvm.add %60, %15 : i64
llvm.br ^bb1(%118 : i64)
^bb11: // pred: ^bb1
llvm.return %0 : i32
}
}
}
}
func.func private @_split_reduction_pass2() {
%_device_query_0 = util.global.load @_device_query_0 : i1
%_pipeline_layout_0 = util.global.load @_pipeline_layout_0 : !hal.pipeline_layout
%_executable__split_reduction_pass2_dispatch_0 = util.global.load @_executable__split_reduction_pass2_dispatch_0 : !hal.executable
%c268435488_i32 = arith.constant 268435488 : i32
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c-1_i32 = arith.constant -1 : i32
%c-1_i64 = arith.constant -1 : i64
%c1048576 = arith.constant 1048576 : index
%c0 = arith.constant 0 : index
%c524288 = arith.constant 524288 : index
%c67108864 = arith.constant 67108864 : index
%c1_i32 = arith.constant 1 : i32
%c128_i32 = arith.constant 128 : i32
%c512 = arith.constant 512 : index
%c256 = arith.constant 256 : index
%c1 = arith.constant 1 : index
%device = hal.ex.shared_device : !hal.device
%allocator = hal.device.allocator<%device : !hal.device> : !hal.allocator
%buffer = hal.allocator.allocate<%allocator : !hal.allocator> type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c67108864}
%cmd = hal.command_buffer.create device(%device : !hal.device) mode("OneShot|AllowInlineExecution") categories(Transfer) : !hal.command_buffer
hal.command_buffer.fill_buffer<%cmd : !hal.command_buffer> target(%buffer : !hal.buffer)[%c0, %c67108864] pattern(%c1_i32 : i32)
hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None")
hal.command_buffer.finalize<%cmd : !hal.command_buffer>
%0 = util.null : !hal.fence
%fence = hal.fence.create device(%device : !hal.device) flags("None") : !hal.fence
hal.device.queue.execute<%device : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) commands([%cmd])
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.status.check_ok %status, "failed to wait on timepoint"
%1 = util.do_not_optimize(%buffer) : !hal.buffer
%len = hal.buffer.length<%1 : !hal.buffer> : index
%buffer_0 = hal.allocator.allocate<%allocator : !hal.allocator> type("HostVisible|DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage|MappingScoped|MappingAccessRandom|Mapping") : !hal.buffer{%c1048576}
%cmd_1 = hal.command_buffer.create device(%device : !hal.device) mode("OneShot|AllowInlineExecution") categories("Transfer|Dispatch") : !hal.command_buffer
cf.cond_br %_device_query_0, ^bb1, ^bb2
^bb1: // pred: ^bb0
hal.command_buffer.push_descriptor_set<%cmd_1 : !hal.command_buffer> layout(%_pipeline_layout_0 : !hal.pipeline_layout)[%c0] bindings([
%c0 = (%1 : !hal.buffer)[%c0, %len],
%c1 = (%buffer_0 : !hal.buffer)[%c0, %c1048576]
])
hal.command_buffer.dispatch<%cmd_1 : !hal.command_buffer> target(%_executable__split_reduction_pass2_dispatch_0 : !hal.executable)[0] workgroups([%c8, %c16, %c1])
hal.command_buffer.fill_buffer<%cmd_1 : !hal.command_buffer> target(%buffer_0 : !hal.buffer)[%c524288, %c524288] pattern(%c128_i32 : i32)
hal.command_buffer.execution_barrier<%cmd_1 : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None")
hal.command_buffer.finalize<%cmd_1 : !hal.command_buffer>
%fence_2 = hal.fence.create device(%device : !hal.device) flags("None") : !hal.fence
hal.device.queue.execute<%device : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence_2) commands([%cmd_1])
%status_3 = hal.fence.await until([%fence_2]) timeout_millis(%c-1_i32) : i32
util.status.check_ok %status_3, "failed to wait on timepoint"
%view = hal.buffer_view.create buffer(%buffer_0 : !hal.buffer)[%c0, %c524288] shape([%c512, %c256]) type(%c268435488_i32) encoding(%c1_i32) : !hal.buffer_view
%view_4 = hal.buffer_view.create buffer(%buffer_0 : !hal.buffer)[%c524288, %c524288] shape([%c512, %c256]) type(%c268435488_i32) encoding(%c1_i32) : !hal.buffer_view
check.expect_eq(%view, %view_4) : !hal.buffer_view
return
^bb2: // pred: ^bb0
util.unreachable "device not supported in the compiled configuration"
}
}
// -----// IR Dump After ApplyPatterns (iree-util-apply-patterns) ('builtin.module' operation) //----- //
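// Note: iree-util-apply-patterns runs module-wide folding/canonicalization patterns; the only visible effect in @_split_reduction_pass2 below is a re-sorting of the leading arith.constant and util.global.load ops, with no change to the command-buffer logic.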
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
util.global private @_device_query_0 : i1
util.global private @_device_query_0_ok : i1
util.initializer {
%device = hal.ex.shared_device : !hal.device
%ok, %value = hal.device.query<%device : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false
util.global.store %value, @_device_query_0 : i1
util.global.store %ok, @_device_query_0_ok : i1
util.initializer.return
}
util.global private @_descriptor_set_layout_0 : !hal.descriptor_set_layout
util.initializer {
%device = hal.ex.shared_device : !hal.device
%descriptor_set_layout = hal.descriptor_set_layout.create device(%device : !hal.device) flags("None") bindings([#hal.descriptor_set.binding<0, storage_buffer, ReadOnly>, #hal.descriptor_set.binding<1, storage_buffer>]) : !hal.descriptor_set_layout
util.global.store %descriptor_set_layout, @_descriptor_set_layout_0 : !hal.descriptor_set_layout
util.initializer.return
}
util.global private @_pipeline_layout_0 : !hal.pipeline_layout
util.initializer {
%_descriptor_set_layout_0 = util.global.load @_descriptor_set_layout_0 : !hal.descriptor_set_layout
%device = hal.ex.shared_device : !hal.device
%pipeline_layout = hal.pipeline_layout.create device(%device : !hal.device) push_constants(0) layouts([%_descriptor_set_layout_0]) : !hal.pipeline_layout
util.global.store %pipeline_layout, @_pipeline_layout_0 : !hal.pipeline_layout
util.initializer.return
}
util.global private @_executable__split_reduction_pass2_dispatch_0 : !hal.executable
util.initializer {
%_device_query_0 = util.global.load @_device_query_0 : i1
%device = hal.ex.shared_device : !hal.device
cf.cond_br %_device_query_0, ^bb1, ^bb2
^bb1: // pred: ^bb0
%_pipeline_layout_0 = util.global.load @_pipeline_layout_0 : !hal.pipeline_layout
%exe = hal.executable.create device(%device : !hal.device) target(@_split_reduction_pass2_dispatch_0::@embedded_elf_x86_64) layouts([%_pipeline_layout_0]) : !hal.executable
cf.br ^bb3(%exe : !hal.executable)
^bb2: // pred: ^bb0
%0 = util.null : !hal.executable
cf.br ^bb3(%0 : !hal.executable)
^bb3(%1: !hal.executable): // 2 preds: ^bb1, ^bb2
util.global.store %1, @_executable__split_reduction_pass2_dispatch_0 : !hal.executable
util.initializer.return
}
func.func @split_reduction_pass2() attributes {iree.abi.stub} {
call @_split_reduction_pass2() : () -> ()
return
}
hal.executable private @_split_reduction_pass2_dispatch_0 {
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
hal.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 ordinal(0) layout(#pipeline_layout) attributes {translation_info = #translation} {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index):
%c1 = arith.constant 1 : index
%c16 = arith.constant 16 : index
%c8 = arith.constant 8 : index
hal.return %c8, %c16, %c1 : index, index, index
}
builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} {
llvm.func internal @_split_reduction_pass2_dispatch_0_generic_512x256x128(%arg0: !llvm.ptr<struct<"iree_hal_executable_environment_v0_t", (ptr<i32>, ptr<func<i32 (ptr<func<i32 (ptr<i8>)>>, ptr<i8>)>>, ptr<ptr<func<i32 (ptr<i8>)>>>, struct<"iree_hal_processor_v0_t", (array<8 x i64>)>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>> {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {sym_visibility = "private"} {
%0 = llvm.mlir.constant(0 : i32) : i32
%1 = llvm.mlir.constant(32768 : i64) : i64
%2 = llvm.mlir.constant(128 : i64) : i64
%3 = llvm.mlir.constant(256 : i64) : i64
%4 = llvm.mlir.constant(1 : i64) : i64
%5 = llvm.mlir.constant(63 : index) : i64
%6 = llvm.mlir.constant(256 : index) : i64
%7 = llvm.mlir.constant(32768 : index) : i64
%8 = llvm.mlir.constant(512 : index) : i64
%9 = llvm.mlir.constant(0 : i64) : i64
%10 = llvm.mlir.constant(dense<0> : vector<4xi32>) : vector<4xi32>
%11 = llvm.mlir.constant(0 : index) : i64
%12 = llvm.mlir.constant(dense<0> : vector<1xi32>) : vector<1xi32>
%13 = llvm.mlir.constant(128 : index) : i64
%14 = llvm.mlir.constant(4 : index) : i64
%15 = llvm.mlir.constant(1 : index) : i64
%16 = llvm.mlir.constant(32 : index) : i64
%17 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>>
%18 = llvm.extractvalue %17[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>
%19 = llvm.load %18 : !llvm.ptr<ptr<i8>>
%20 = llvm.bitcast %19 : !llvm.ptr<i8> to !llvm.ptr<i32>
%21 = llvm.ptrtoint %20 : !llvm.ptr<i32> to i64
%22 = llvm.and %21, %5 : i64
%23 = llvm.icmp "eq" %22, %11 : i64
"llvm.intr.assume"(%23) : (i1) -> ()
%24 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>>
%25 = llvm.extractvalue %24[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>
%26 = llvm.getelementptr %25[1] : (!llvm.ptr<ptr<i8>>) -> !llvm.ptr<ptr<i8>>
%27 = llvm.load %26 : !llvm.ptr<ptr<i8>>
%28 = llvm.bitcast %27 : !llvm.ptr<i8> to !llvm.ptr<i32>
%29 = llvm.ptrtoint %28 : !llvm.ptr<i32> to i64
%30 = llvm.and %29, %5 : i64
%31 = llvm.icmp "eq" %30, %11 : i64
"llvm.intr.assume"(%31) : (i1) -> ()
%32 = llvm.load %arg2 : !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>>
%33 = llvm.extractvalue %32[0] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>
%34 = llvm.zext %33 : i32 to i64
%35 = llvm.extractvalue %32[1] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>
%36 = llvm.zext %35 : i32 to i64
%37 = llvm.mul %36, %16 : i64
%38 = llvm.mul %34, %16 : i64
%39 = llvm.mlir.null : !llvm.ptr<i32>
%40 = llvm.getelementptr %39[4] : (!llvm.ptr<i32>) -> !llvm.ptr<i32>
%41 = llvm.ptrtoint %40 : !llvm.ptr<i32> to i64
%42 = llvm.alloca %41 x i32 {alignment = 128 : i64} : (i64) -> !llvm.ptr<i32>
%43 = llvm.mul %11, %14 : i64
%44 = llvm.add %43, %43 : i64
%45 = llvm.add %44, %11 : i64
%46 = llvm.getelementptr %42[%45] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32>
%47 = llvm.bitcast %46 : !llvm.ptr<i32> to !llvm.ptr<vector<4xi32>>
llvm.store %10, %47 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>>
%48 = llvm.bitcast %42 : !llvm.ptr<i32> to !llvm.ptr<vector<4xi32>>
%49 = llvm.load %48 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>>
%50 = llvm.mul %37, %6 : i64
%51 = llvm.add %50, %11 : i64
%52 = llvm.mul %38, %15 : i64
%53 = llvm.add %51, %52 : i64
%54 = llvm.mul %37, %7 : i64
%55 = llvm.add %54, %11 : i64
%56 = llvm.mul %38, %13 : i64
%57 = llvm.add %55, %56 : i64
%58 = llvm.mul %9, %15 : i64
%59 = llvm.add %57, %58 : i64
llvm.br ^bb1(%11 : i64)
^bb1(%60: i64): // 2 preds: ^bb0, ^bb10
%61 = llvm.icmp "slt" %60, %16 : i64
llvm.cond_br %61, ^bb2(%11 : i64), ^bb11
^bb2(%62: i64): // 2 preds: ^bb1, ^bb9
%63 = llvm.icmp "slt" %62, %16 : i64
llvm.cond_br %63, ^bb3, ^bb10
^bb3: // pred: ^bb2
%64 = llvm.mul %60, %1 : i64
%65 = llvm.add %59, %64 : i64
%66 = llvm.mul %62, %2 : i64
%67 = llvm.add %65, %66 : i64
%68 = llvm.mul %9, %4 : i64
%69 = llvm.add %67, %68 : i64
%70 = llvm.mul %60, %3 : i64
%71 = llvm.add %53, %70 : i64
%72 = llvm.mul %62, %4 : i64
%73 = llvm.add %71, %72 : i64
llvm.br ^bb4(%11 : i64)
^bb4(%74: i64): // 2 preds: ^bb3, ^bb8
%75 = llvm.icmp "slt" %74, %13 : i64
llvm.cond_br %75, ^bb5, ^bb9
^bb5: // pred: ^bb4
%76 = llvm.mul %9, %8 : i64
%77 = llvm.add %69, %76 : i64
%78 = llvm.mul %9, %13 : i64
%79 = llvm.add %77, %78 : i64
%80 = llvm.mul %74, %15 : i64
%81 = llvm.add %79, %80 : i64
llvm.br ^bb6(%11 : i64)
^bb6(%82: i64): // 2 preds: ^bb5, ^bb7
%83 = llvm.icmp "slt" %82, %14 : i64
llvm.cond_br %83, ^bb7, ^bb8
^bb7: // pred: ^bb6
%84 = llvm.add %81, %76 : i64
%85 = llvm.mul %82, %13 : i64
%86 = llvm.add %84, %85 : i64
%87 = llvm.add %86, %58 : i64
%88 = llvm.mul %9, %14 : i64
%89 = llvm.add %73, %88 : i64
%90 = llvm.mul %82, %15 : i64
%91 = llvm.add %89, %90 : i64
%92 = llvm.add %87, %43 : i64
%93 = llvm.add %92, %43 : i64
%94 = llvm.add %93, %43 : i64
%95 = llvm.add %94, %11 : i64
%96 = llvm.getelementptr %20[%95] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32>
%97 = llvm.bitcast %96 : !llvm.ptr<i32> to !llvm.ptr<vector<4xi32>>
%98 = llvm.load %97 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>>
%99 = llvm.add %98, %49 : vector<4xi32>
llvm.store %99, %48 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>>
%100 = llvm.add %43, %11 : i64
%101 = llvm.getelementptr %42[%100] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32>
%102 = llvm.bitcast %101 : !llvm.ptr<i32> to !llvm.ptr<vector<4xi32>>
%103 = llvm.load %102 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>>
%104 = llvm.getelementptr %28[%91] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32>
%105 = llvm.load %104 : !llvm.ptr<i32>
%106 = llvm.mlir.undef : vector<1xi32>
%107 = llvm.insertelement %105, %106[%0 : i32] : vector<1xi32>
%108 = llvm.extractelement %107[%11 : i64] : vector<1xi32>
%109 = "llvm.intr.vector.reduce.add"(%103) : (vector<4xi32>) -> i32
%110 = llvm.add %108, %109 : i32
%111 = llvm.insertelement %110, %12[%11 : i64] : vector<1xi32>
%112 = llvm.extractelement %111[%9 : i64] : vector<1xi32>
%113 = llvm.insertelement %112, %106[%0 : i32] : vector<1xi32>
%114 = llvm.extractelement %113[%11 : i64] : vector<1xi32>
llvm.store %114, %104 : !llvm.ptr<i32>
%115 = llvm.add %82, %15 : i64
llvm.br ^bb6(%115 : i64)
^bb8: // pred: ^bb6
%116 = llvm.add %74, %14 : i64
llvm.br ^bb4(%116 : i64)
^bb9: // pred: ^bb4
%117 = llvm.add %62, %14 : i64
llvm.br ^bb2(%117 : i64)
^bb10: // pred: ^bb2
%118 = llvm.add %60, %15 : i64
llvm.br ^bb1(%118 : i64)
^bb11: // pred: ^bb1
llvm.return %0 : i32
}
}
}
}
func.func private @_split_reduction_pass2() {
%c1 = arith.constant 1 : index
%c256 = arith.constant 256 : index
%c512 = arith.constant 512 : index
%c128_i32 = arith.constant 128 : i32
%c1_i32 = arith.constant 1 : i32
%c67108864 = arith.constant 67108864 : index
%c524288 = arith.constant 524288 : index
%c0 = arith.constant 0 : index
%c1048576 = arith.constant 1048576 : index
%c-1_i64 = arith.constant -1 : i64
%c-1_i32 = arith.constant -1 : i32
%c16 = arith.constant 16 : index
%c8 = arith.constant 8 : index
%c268435488_i32 = arith.constant 268435488 : i32
%_device_query_0 = util.global.load @_device_query_0 : i1
%_pipeline_layout_0 = util.global.load @_pipeline_layout_0 : !hal.pipeline_layout
%_executable__split_reduction_pass2_dispatch_0 = util.global.load @_executable__split_reduction_pass2_dispatch_0 : !hal.executable
%device = hal.ex.shared_device : !hal.device
%allocator = hal.device.allocator<%device : !hal.device> : !hal.allocator
%buffer = hal.allocator.allocate<%allocator : !hal.allocator> type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c67108864}
%cmd = hal.command_buffer.create device(%device : !hal.device) mode("OneShot|AllowInlineExecution") categories(Transfer) : !hal.command_buffer
hal.command_buffer.fill_buffer<%cmd : !hal.command_buffer> target(%buffer : !hal.buffer)[%c0, %c67108864] pattern(%c1_i32 : i32)
hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None")
hal.command_buffer.finalize<%cmd : !hal.command_buffer>
%0 = util.null : !hal.fence
%fence = hal.fence.create device(%device : !hal.device) flags("None") : !hal.fence
hal.device.queue.execute<%device : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) commands([%cmd])
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.status.check_ok %status, "failed to wait on timepoint"
%1 = util.do_not_optimize(%buffer) : !hal.buffer
%len = hal.buffer.length<%1 : !hal.buffer> : index
%buffer_0 = hal.allocator.allocate<%allocator : !hal.allocator> type("HostVisible|DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage|MappingScoped|MappingAccessRandom|Mapping") : !hal.buffer{%c1048576}
%cmd_1 = hal.command_buffer.create device(%device : !hal.device) mode("OneShot|AllowInlineExecution") categories("Transfer|Dispatch") : !hal.command_buffer
cf.cond_br %_device_query_0, ^bb1, ^bb2
^bb1: // pred: ^bb0
hal.command_buffer.push_descriptor_set<%cmd_1 : !hal.command_buffer> layout(%_pipeline_layout_0 : !hal.pipeline_layout)[%c0] bindings([
%c0 = (%1 : !hal.buffer)[%c0, %len],
%c1 = (%buffer_0 : !hal.buffer)[%c0, %c1048576]
])
hal.command_buffer.dispatch<%cmd_1 : !hal.command_buffer> target(%_executable__split_reduction_pass2_dispatch_0 : !hal.executable)[0] workgroups([%c8, %c16, %c1])
hal.command_buffer.fill_buffer<%cmd_1 : !hal.command_buffer> target(%buffer_0 : !hal.buffer)[%c524288, %c524288] pattern(%c128_i32 : i32)
hal.command_buffer.execution_barrier<%cmd_1 : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None")
hal.command_buffer.finalize<%cmd_1 : !hal.command_buffer>
%fence_2 = hal.fence.create device(%device : !hal.device) flags("None") : !hal.fence
hal.device.queue.execute<%device : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence_2) commands([%cmd_1])
%status_3 = hal.fence.await until([%fence_2]) timeout_millis(%c-1_i32) : i32
util.status.check_ok %status_3, "failed to wait on timepoint"
%view = hal.buffer_view.create buffer(%buffer_0 : !hal.buffer)[%c0, %c524288] shape([%c512, %c256]) type(%c268435488_i32) encoding(%c1_i32) : !hal.buffer_view
%view_4 = hal.buffer_view.create buffer(%buffer_0 : !hal.buffer)[%c524288, %c524288] shape([%c512, %c256]) type(%c268435488_i32) encoding(%c1_i32) : !hal.buffer_view
check.expect_eq(%view, %view_4) : !hal.buffer_view
return
^bb2: // pred: ^bb0
util.unreachable "device not supported in the compiled configuration"
}
}
// -----// IR Dump After FoldGlobals (iree-util-fold-globals) ('builtin.module' operation) //----- //
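// Note: iree-util-fold-globals has dropped the @_device_query_0_ok global, which has no loads in this module, along with its store in the first initializer; the remaining globals and the executable body appear unchanged from the previous dump.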
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
util.global private @_device_query_0 : i1
util.initializer {
%device = hal.ex.shared_device : !hal.device
%ok, %value = hal.device.query<%device : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false
util.global.store %value, @_device_query_0 : i1
util.initializer.return
}
util.global private @_descriptor_set_layout_0 : !hal.descriptor_set_layout
util.initializer {
%device = hal.ex.shared_device : !hal.device
%descriptor_set_layout = hal.descriptor_set_layout.create device(%device : !hal.device) flags("None") bindings([#hal.descriptor_set.binding<0, storage_buffer, ReadOnly>, #hal.descriptor_set.binding<1, storage_buffer>]) : !hal.descriptor_set_layout
util.global.store %descriptor_set_layout, @_descriptor_set_layout_0 : !hal.descriptor_set_layout
util.initializer.return
}
util.global private @_pipeline_layout_0 : !hal.pipeline_layout
util.initializer {
%_descriptor_set_layout_0 = util.global.load @_descriptor_set_layout_0 : !hal.descriptor_set_layout
%device = hal.ex.shared_device : !hal.device
%pipeline_layout = hal.pipeline_layout.create device(%device : !hal.device) push_constants(0) layouts([%_descriptor_set_layout_0]) : !hal.pipeline_layout
util.global.store %pipeline_layout, @_pipeline_layout_0 : !hal.pipeline_layout
util.initializer.return
}
util.global private @_executable__split_reduction_pass2_dispatch_0 : !hal.executable
util.initializer {
%_device_query_0 = util.global.load @_device_query_0 : i1
%device = hal.ex.shared_device : !hal.device
cf.cond_br %_device_query_0, ^bb1, ^bb2
^bb1: // pred: ^bb0
%_pipeline_layout_0 = util.global.load @_pipeline_layout_0 : !hal.pipeline_layout
%exe = hal.executable.create device(%device : !hal.device) target(@_split_reduction_pass2_dispatch_0::@embedded_elf_x86_64) layouts([%_pipeline_layout_0]) : !hal.executable
cf.br ^bb3(%exe : !hal.executable)
^bb2: // pred: ^bb0
%0 = util.null : !hal.executable
cf.br ^bb3(%0 : !hal.executable)
^bb3(%1: !hal.executable): // 2 preds: ^bb1, ^bb2
util.global.store %1, @_executable__split_reduction_pass2_dispatch_0 : !hal.executable
util.initializer.return
}
func.func @split_reduction_pass2() attributes {iree.abi.stub} {
call @_split_reduction_pass2() : () -> ()
return
}
hal.executable private @_split_reduction_pass2_dispatch_0 {
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
hal.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 ordinal(0) layout(#pipeline_layout) attributes {translation_info = #translation} {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index):
%c1 = arith.constant 1 : index
%c16 = arith.constant 16 : index
%c8 = arith.constant 8 : index
hal.return %c8, %c16, %c1 : index, index, index
}
builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} {
llvm.func internal @_split_reduction_pass2_dispatch_0_generic_512x256x128(%arg0: !llvm.ptr<struct<"iree_hal_executable_environment_v0_t", (ptr<i32>, ptr<func<i32 (ptr<func<i32 (ptr<i8>)>>, ptr<i8>)>>, ptr<ptr<func<i32 (ptr<i8>)>>>, struct<"iree_hal_processor_v0_t", (array<8 x i64>)>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>> {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {sym_visibility = "private"} {
%0 = llvm.mlir.constant(0 : i32) : i32
%1 = llvm.mlir.constant(32768 : i64) : i64
%2 = llvm.mlir.constant(128 : i64) : i64
%3 = llvm.mlir.constant(256 : i64) : i64
%4 = llvm.mlir.constant(1 : i64) : i64
%5 = llvm.mlir.constant(63 : index) : i64
%6 = llvm.mlir.constant(256 : index) : i64
%7 = llvm.mlir.constant(32768 : index) : i64
%8 = llvm.mlir.constant(512 : index) : i64
%9 = llvm.mlir.constant(0 : i64) : i64
%10 = llvm.mlir.constant(dense<0> : vector<4xi32>) : vector<4xi32>
%11 = llvm.mlir.constant(0 : index) : i64
%12 = llvm.mlir.constant(dense<0> : vector<1xi32>) : vector<1xi32>
%13 = llvm.mlir.constant(128 : index) : i64
%14 = llvm.mlir.constant(4 : index) : i64
%15 = llvm.mlir.constant(1 : index) : i64
%16 = llvm.mlir.constant(32 : index) : i64
%17 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>>
%18 = llvm.extractvalue %17[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>
%19 = llvm.load %18 : !llvm.ptr<ptr<i8>>
%20 = llvm.bitcast %19 : !llvm.ptr<i8> to !llvm.ptr<i32>
%21 = llvm.ptrtoint %20 : !llvm.ptr<i32> to i64
%22 = llvm.and %21, %5 : i64
%23 = llvm.icmp "eq" %22, %11 : i64
"llvm.intr.assume"(%23) : (i1) -> ()
%24 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>>
%25 = llvm.extractvalue %24[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>
%26 = llvm.getelementptr %25[1] : (!llvm.ptr<ptr<i8>>) -> !llvm.ptr<ptr<i8>>
%27 = llvm.load %26 : !llvm.ptr<ptr<i8>>
%28 = llvm.bitcast %27 : !llvm.ptr<i8> to !llvm.ptr<i32>
%29 = llvm.ptrtoint %28 : !llvm.ptr<i32> to i64
%30 = llvm.and %29, %5 : i64
%31 = llvm.icmp "eq" %30, %11 : i64
"llvm.intr.assume"(%31) : (i1) -> ()
%32 = llvm.load %arg2 : !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>>
%33 = llvm.extractvalue %32[0] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>
%34 = llvm.zext %33 : i32 to i64
%35 = llvm.extractvalue %32[1] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>
%36 = llvm.zext %35 : i32 to i64
%37 = llvm.mul %36, %16 : i64
%38 = llvm.mul %34, %16 : i64
%39 = llvm.mlir.null : !llvm.ptr<i32>
%40 = llvm.getelementptr %39[4] : (!llvm.ptr<i32>) -> !llvm.ptr<i32>
%41 = llvm.ptrtoint %40 : !llvm.ptr<i32> to i64
%42 = llvm.alloca %41 x i32 {alignment = 128 : i64} : (i64) -> !llvm.ptr<i32>
%43 = llvm.mul %11, %14 : i64
%44 = llvm.add %43, %43 : i64
%45 = llvm.add %44, %11 : i64
%46 = llvm.getelementptr %42[%45] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32>
%47 = llvm.bitcast %46 : !llvm.ptr<i32> to !llvm.ptr<vector<4xi32>>
llvm.store %10, %47 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>>
%48 = llvm.bitcast %42 : !llvm.ptr<i32> to !llvm.ptr<vector<4xi32>>
%49 = llvm.load %48 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>>
%50 = llvm.mul %37, %6 : i64
%51 = llvm.add %50, %11 : i64
%52 = llvm.mul %38, %15 : i64
%53 = llvm.add %51, %52 : i64
%54 = llvm.mul %37, %7 : i64
%55 = llvm.add %54, %11 : i64
%56 = llvm.mul %38, %13 : i64
%57 = llvm.add %55, %56 : i64
%58 = llvm.mul %9, %15 : i64
%59 = llvm.add %57, %58 : i64
llvm.br ^bb1(%11 : i64)
^bb1(%60: i64): // 2 preds: ^bb0, ^bb10
%61 = llvm.icmp "slt" %60, %16 : i64
llvm.cond_br %61, ^bb2(%11 : i64), ^bb11
^bb2(%62: i64): // 2 preds: ^bb1, ^bb9
%63 = llvm.icmp "slt" %62, %16 : i64
llvm.cond_br %63, ^bb3, ^bb10
^bb3: // pred: ^bb2
%64 = llvm.mul %60, %1 : i64
%65 = llvm.add %59, %64 : i64
%66 = llvm.mul %62, %2 : i64
%67 = llvm.add %65, %66 : i64
%68 = llvm.mul %9, %4 : i64
%69 = llvm.add %67, %68 : i64
%70 = llvm.mul %60, %3 : i64
%71 = llvm.add %53, %70 : i64
%72 = llvm.mul %62, %4 : i64
%73 = llvm.add %71, %72 : i64
llvm.br ^bb4(%11 : i64)
^bb4(%74: i64): // 2 preds: ^bb3, ^bb8
%75 = llvm.icmp "slt" %74, %13 : i64
llvm.cond_br %75, ^bb5, ^bb9
^bb5: // pred: ^bb4
%76 = llvm.mul %9, %8 : i64
%77 = llvm.add %69, %76 : i64
%78 = llvm.mul %9, %13 : i64
%79 = llvm.add %77, %78 : i64
%80 = llvm.mul %74, %15 : i64
%81 = llvm.add %79, %80 : i64
llvm.br ^bb6(%11 : i64)
^bb6(%82: i64): // 2 preds: ^bb5, ^bb7
%83 = llvm.icmp "slt" %82, %14 : i64
llvm.cond_br %83, ^bb7, ^bb8
^bb7: // pred: ^bb6
%84 = llvm.add %81, %76 : i64
%85 = llvm.mul %82, %13 : i64
%86 = llvm.add %84, %85 : i64
%87 = llvm.add %86, %58 : i64
%88 = llvm.mul %9, %14 : i64
%89 = llvm.add %73, %88 : i64
%90 = llvm.mul %82, %15 : i64
%91 = llvm.add %89, %90 : i64
%92 = llvm.add %87, %43 : i64
%93 = llvm.add %92, %43 : i64
%94 = llvm.add %93, %43 : i64
%95 = llvm.add %94, %11 : i64
%96 = llvm.getelementptr %20[%95] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32>
%97 = llvm.bitcast %96 : !llvm.ptr<i32> to !llvm.ptr<vector<4xi32>>
%98 = llvm.load %97 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>>
%99 = llvm.add %98, %49 : vector<4xi32>
llvm.store %99, %48 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>>
%100 = llvm.add %43, %11 : i64
%101 = llvm.getelementptr %42[%100] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32>
%102 = llvm.bitcast %101 : !llvm.ptr<i32> to !llvm.ptr<vector<4xi32>>
%103 = llvm.load %102 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>>
%104 = llvm.getelementptr %28[%91] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32>
%105 = llvm.load %104 : !llvm.ptr<i32>
%106 = llvm.mlir.undef : vector<1xi32>
%107 = llvm.insertelement %105, %106[%0 : i32] : vector<1xi32>
%108 = llvm.extractelement %107[%11 : i64] : vector<1xi32>
%109 = "llvm.intr.vector.reduce.add"(%103) : (vector<4xi32>) -> i32
%110 = llvm.add %108, %109 : i32
%111 = llvm.insertelement %110, %12[%11 : i64] : vector<1xi32>
%112 = llvm.extractelement %111[%9 : i64] : vector<1xi32>
%113 = llvm.insertelement %112, %106[%0 : i32] : vector<1xi32>
%114 = llvm.extractelement %113[%11 : i64] : vector<1xi32>
llvm.store %114, %104 : !llvm.ptr<i32>
%115 = llvm.add %82, %15 : i64
llvm.br ^bb6(%115 : i64)
^bb8: // pred: ^bb6
%116 = llvm.add %74, %14 : i64
llvm.br ^bb4(%116 : i64)
^bb9: // pred: ^bb4
%117 = llvm.add %62, %14 : i64
llvm.br ^bb2(%117 : i64)
^bb10: // pred: ^bb2
%118 = llvm.add %60, %15 : i64
llvm.br ^bb1(%118 : i64)
^bb11: // pred: ^bb1
llvm.return %0 : i32
}
}
}
}
func.func private @_split_reduction_pass2() {
%c1 = arith.constant 1 : index
%c256 = arith.constant 256 : index
%c512 = arith.constant 512 : index
%c128_i32 = arith.constant 128 : i32
%c1_i32 = arith.constant 1 : i32
%c67108864 = arith.constant 67108864 : index
%c524288 = arith.constant 524288 : index
%c0 = arith.constant 0 : index
%c1048576 = arith.constant 1048576 : index
%c-1_i64 = arith.constant -1 : i64
%c-1_i32 = arith.constant -1 : i32
%c16 = arith.constant 16 : index
%c8 = arith.constant 8 : index
%c268435488_i32 = arith.constant 268435488 : i32
%_device_query_0 = util.global.load @_device_query_0 : i1
%_pipeline_layout_0 = util.global.load @_pipeline_layout_0 : !hal.pipeline_layout
%_executable__split_reduction_pass2_dispatch_0 = util.global.load @_executable__split_reduction_pass2_dispatch_0 : !hal.executable
%device = hal.ex.shared_device : !hal.device
%allocator = hal.device.allocator<%device : !hal.device> : !hal.allocator
%buffer = hal.allocator.allocate<%allocator : !hal.allocator> type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c67108864}
%cmd = hal.command_buffer.create device(%device : !hal.device) mode("OneShot|AllowInlineExecution") categories(Transfer) : !hal.command_buffer
hal.command_buffer.fill_buffer<%cmd : !hal.command_buffer> target(%buffer : !hal.buffer)[%c0, %c67108864] pattern(%c1_i32 : i32)
hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None")
hal.command_buffer.finalize<%cmd : !hal.command_buffer>
%0 = util.null : !hal.fence
%fence = hal.fence.create device(%device : !hal.device) flags("None") : !hal.fence
hal.device.queue.execute<%device : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) commands([%cmd])
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.status.check_ok %status, "failed to wait on timepoint"
%1 = util.do_not_optimize(%buffer) : !hal.buffer
%len = hal.buffer.length<%1 : !hal.buffer> : index
%buffer_0 = hal.allocator.allocate<%allocator : !hal.allocator> type("HostVisible|DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage|MappingScoped|MappingAccessRandom|Mapping") : !hal.buffer{%c1048576}
%cmd_1 = hal.command_buffer.create device(%device : !hal.device) mode("OneShot|AllowInlineExecution") categories("Transfer|Dispatch") : !hal.command_buffer
cf.cond_br %_device_query_0, ^bb1, ^bb2
^bb1: // pred: ^bb0
hal.command_buffer.push_descriptor_set<%cmd_1 : !hal.command_buffer> layout(%_pipeline_layout_0 : !hal.pipeline_layout)[%c0] bindings([
%c0 = (%1 : !hal.buffer)[%c0, %len],
%c1 = (%buffer_0 : !hal.buffer)[%c0, %c1048576]
])
hal.command_buffer.dispatch<%cmd_1 : !hal.command_buffer> target(%_executable__split_reduction_pass2_dispatch_0 : !hal.executable)[0] workgroups([%c8, %c16, %c1])
hal.command_buffer.fill_buffer<%cmd_1 : !hal.command_buffer> target(%buffer_0 : !hal.buffer)[%c524288, %c524288] pattern(%c128_i32 : i32)
hal.command_buffer.execution_barrier<%cmd_1 : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None")
hal.command_buffer.finalize<%cmd_1 : !hal.command_buffer>
%fence_2 = hal.fence.create device(%device : !hal.device) flags("None") : !hal.fence
hal.device.queue.execute<%device : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence_2) commands([%cmd_1])
%status_3 = hal.fence.await until([%fence_2]) timeout_millis(%c-1_i32) : i32
util.status.check_ok %status_3, "failed to wait on timepoint"
%view = hal.buffer_view.create buffer(%buffer_0 : !hal.buffer)[%c0, %c524288] shape([%c512, %c256]) type(%c268435488_i32) encoding(%c1_i32) : !hal.buffer_view
%view_4 = hal.buffer_view.create buffer(%buffer_0 : !hal.buffer)[%c524288, %c524288] shape([%c512, %c256]) type(%c268435488_i32) encoding(%c1_i32) : !hal.buffer_view
check.expect_eq(%view, %view_4) : !hal.buffer_view
return
^bb2: // pred: ^bb0
util.unreachable "device not supported in the compiled configuration"
}
}
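// The host function above fills a 64 MiB i32 buffer with the pattern 1, dispatches
// _split_reduction_pass2_dispatch_0 over an 8x16x1 workgroup grid, fills the second
// 512 KiB of the 1 MiB result buffer with the pattern 128, and then compares the first
// and second 512 KiB of that buffer as two 512x256xi32 views. A minimal NumPy sketch of
// the same check (illustrative only, not produced by the compiler; assumes NumPy is
// available):
import numpy as np

x = np.ones((512, 256, 128), dtype=np.int32)          # input buffer filled with 1s
computed = x.sum(axis=-1)                             # the 512x256x128 -> 512x256 reduction
expected = np.full((512, 256), 128, dtype=np.int32)   # reference region filled with 128
assert np.array_equal(computed, expected)             # mirrors check.expect_eq(%view, %view_4)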
// -----// IR Dump After CombineInitializers (iree-util-combine-initializers) ('builtin.module' operation) //----- //
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
util.global private @_device_query_0 : i1
util.global private @_descriptor_set_layout_0 : !hal.descriptor_set_layout
util.global private @_pipeline_layout_0 : !hal.pipeline_layout
util.global private @_executable__split_reduction_pass2_dispatch_0 : !hal.executable
util.initializer {
%device = hal.ex.shared_device : !hal.device
%ok, %value = hal.device.query<%device : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false
util.global.store %value, @_device_query_0 : i1
%device_0 = hal.ex.shared_device : !hal.device
%descriptor_set_layout = hal.descriptor_set_layout.create device(%device_0 : !hal.device) flags("None") bindings([#hal.descriptor_set.binding<0, storage_buffer, ReadOnly>, #hal.descriptor_set.binding<1, storage_buffer>]) : !hal.descriptor_set_layout
util.global.store %descriptor_set_layout, @_descriptor_set_layout_0 : !hal.descriptor_set_layout
%_descriptor_set_layout_0 = util.global.load @_descriptor_set_layout_0 : !hal.descriptor_set_layout
%device_1 = hal.ex.shared_device : !hal.device
%pipeline_layout = hal.pipeline_layout.create device(%device_1 : !hal.device) push_constants(0) layouts([%_descriptor_set_layout_0]) : !hal.pipeline_layout
util.global.store %pipeline_layout, @_pipeline_layout_0 : !hal.pipeline_layout
%_device_query_0 = util.global.load @_device_query_0 : i1
%device_2 = hal.ex.shared_device : !hal.device
cf.cond_br %_device_query_0, ^bb1, ^bb2
^bb1: // pred: ^bb0
%_pipeline_layout_0 = util.global.load @_pipeline_layout_0 : !hal.pipeline_layout
%exe = hal.executable.create device(%device_2 : !hal.device) target(@_split_reduction_pass2_dispatch_0::@embedded_elf_x86_64) layouts([%_pipeline_layout_0]) : !hal.executable
cf.br ^bb3(%exe : !hal.executable)
^bb2: // pred: ^bb0
%0 = util.null : !hal.executable
cf.br ^bb3(%0 : !hal.executable)
^bb3(%1: !hal.executable): // 2 preds: ^bb1, ^bb2
util.global.store %1, @_executable__split_reduction_pass2_dispatch_0 : !hal.executable
cf.br ^bb4
^bb4: // pred: ^bb3
util.initializer.return
}
func.func @split_reduction_pass2() attributes {iree.abi.stub} {
call @_split_reduction_pass2() : () -> ()
return
}
hal.executable private @_split_reduction_pass2_dispatch_0 {
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
hal.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 ordinal(0) layout(#pipeline_layout) attributes {translation_info = #translation} {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index):
%c1 = arith.constant 1 : index
%c16 = arith.constant 16 : index
%c8 = arith.constant 8 : index
hal.return %c8, %c16, %c1 : index, index, index
}
builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} {
llvm.func internal @_split_reduction_pass2_dispatch_0_generic_512x256x128(%arg0: !llvm.ptr<struct<"iree_hal_executable_environment_v0_t", (ptr<i32>, ptr<func<i32 (ptr<func<i32 (ptr<i8>)>>, ptr<i8>)>>, ptr<ptr<func<i32 (ptr<i8>)>>>, struct<"iree_hal_processor_v0_t", (array<8 x i64>)>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>> {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {sym_visibility = "private"} {
%0 = llvm.mlir.constant(0 : i32) : i32
%1 = llvm.mlir.constant(32768 : i64) : i64
%2 = llvm.mlir.constant(128 : i64) : i64
%3 = llvm.mlir.constant(256 : i64) : i64
%4 = llvm.mlir.constant(1 : i64) : i64
%5 = llvm.mlir.constant(63 : index) : i64
%6 = llvm.mlir.constant(256 : index) : i64
%7 = llvm.mlir.constant(32768 : index) : i64
%8 = llvm.mlir.constant(512 : index) : i64
%9 = llvm.mlir.constant(0 : i64) : i64
%10 = llvm.mlir.constant(dense<0> : vector<4xi32>) : vector<4xi32>
%11 = llvm.mlir.constant(0 : index) : i64
%12 = llvm.mlir.constant(dense<0> : vector<1xi32>) : vector<1xi32>
%13 = llvm.mlir.constant(128 : index) : i64
%14 = llvm.mlir.constant(4 : index) : i64
%15 = llvm.mlir.constant(1 : index) : i64
%16 = llvm.mlir.constant(32 : index) : i64
%17 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>>
%18 = llvm.extractvalue %17[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>
%19 = llvm.load %18 : !llvm.ptr<ptr<i8>>
%20 = llvm.bitcast %19 : !llvm.ptr<i8> to !llvm.ptr<i32>
%21 = llvm.ptrtoint %20 : !llvm.ptr<i32> to i64
%22 = llvm.and %21, %5 : i64
%23 = llvm.icmp "eq" %22, %11 : i64
"llvm.intr.assume"(%23) : (i1) -> ()
%24 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>>
%25 = llvm.extractvalue %24[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>
%26 = llvm.getelementptr %25[1] : (!llvm.ptr<ptr<i8>>) -> !llvm.ptr<ptr<i8>>
%27 = llvm.load %26 : !llvm.ptr<ptr<i8>>
%28 = llvm.bitcast %27 : !llvm.ptr<i8> to !llvm.ptr<i32>
%29 = llvm.ptrtoint %28 : !llvm.ptr<i32> to i64
%30 = llvm.and %29, %5 : i64
%31 = llvm.icmp "eq" %30, %11 : i64
"llvm.intr.assume"(%31) : (i1) -> ()
%32 = llvm.load %arg2 : !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>>
%33 = llvm.extractvalue %32[0] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>
%34 = llvm.zext %33 : i32 to i64
%35 = llvm.extractvalue %32[1] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>
%36 = llvm.zext %35 : i32 to i64
%37 = llvm.mul %36, %16 : i64
%38 = llvm.mul %34, %16 : i64
%39 = llvm.mlir.null : !llvm.ptr<i32>
%40 = llvm.getelementptr %39[4] : (!llvm.ptr<i32>) -> !llvm.ptr<i32>
%41 = llvm.ptrtoint %40 : !llvm.ptr<i32> to i64
%42 = llvm.alloca %41 x i32 {alignment = 128 : i64} : (i64) -> !llvm.ptr<i32>
%43 = llvm.mul %11, %14 : i64
%44 = llvm.add %43, %43 : i64
%45 = llvm.add %44, %11 : i64
%46 = llvm.getelementptr %42[%45] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32>
%47 = llvm.bitcast %46 : !llvm.ptr<i32> to !llvm.ptr<vector<4xi32>>
llvm.store %10, %47 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>>
%48 = llvm.bitcast %42 : !llvm.ptr<i32> to !llvm.ptr<vector<4xi32>>
%49 = llvm.load %48 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>>
%50 = llvm.mul %37, %6 : i64
%51 = llvm.add %50, %11 : i64
%52 = llvm.mul %38, %15 : i64
%53 = llvm.add %51, %52 : i64
%54 = llvm.mul %37, %7 : i64
%55 = llvm.add %54, %11 : i64
%56 = llvm.mul %38, %13 : i64
%57 = llvm.add %55, %56 : i64
%58 = llvm.mul %9, %15 : i64
%59 = llvm.add %57, %58 : i64
llvm.br ^bb1(%11 : i64)
^bb1(%60: i64): // 2 preds: ^bb0, ^bb10
%61 = llvm.icmp "slt" %60, %16 : i64
llvm.cond_br %61, ^bb2(%11 : i64), ^bb11
^bb2(%62: i64): // 2 preds: ^bb1, ^bb9
%63 = llvm.icmp "slt" %62, %16 : i64
llvm.cond_br %63, ^bb3, ^bb10
^bb3: // pred: ^bb2
%64 = llvm.mul %60, %1 : i64
%65 = llvm.add %59, %64 : i64
%66 = llvm.mul %62, %2 : i64
%67 = llvm.add %65, %66 : i64
%68 = llvm.mul %9, %4 : i64
%69 = llvm.add %67, %68 : i64
%70 = llvm.mul %60, %3 : i64
%71 = llvm.add %53, %70 : i64
%72 = llvm.mul %62, %4 : i64
%73 = llvm.add %71, %72 : i64
llvm.br ^bb4(%11 : i64)
^bb4(%74: i64): // 2 preds: ^bb3, ^bb8
%75 = llvm.icmp "slt" %74, %13 : i64
llvm.cond_br %75, ^bb5, ^bb9
^bb5: // pred: ^bb4
%76 = llvm.mul %9, %8 : i64
%77 = llvm.add %69, %76 : i64
%78 = llvm.mul %9, %13 : i64
%79 = llvm.add %77, %78 : i64
%80 = llvm.mul %74, %15 : i64
%81 = llvm.add %79, %80 : i64
llvm.br ^bb6(%11 : i64)
^bb6(%82: i64): // 2 preds: ^bb5, ^bb7
%83 = llvm.icmp "slt" %82, %14 : i64
llvm.cond_br %83, ^bb7, ^bb8
^bb7: // pred: ^bb6
%84 = llvm.add %81, %76 : i64
%85 = llvm.mul %82, %13 : i64
%86 = llvm.add %84, %85 : i64
%87 = llvm.add %86, %58 : i64
%88 = llvm.mul %9, %14 : i64
%89 = llvm.add %73, %88 : i64
%90 = llvm.mul %82, %15 : i64
%91 = llvm.add %89, %90 : i64
%92 = llvm.add %87, %43 : i64
%93 = llvm.add %92, %43 : i64
%94 = llvm.add %93, %43 : i64
%95 = llvm.add %94, %11 : i64
%96 = llvm.getelementptr %20[%95] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32>
%97 = llvm.bitcast %96 : !llvm.ptr<i32> to !llvm.ptr<vector<4xi32>>
%98 = llvm.load %97 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>>
%99 = llvm.add %98, %49 : vector<4xi32>
llvm.store %99, %48 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>>
%100 = llvm.add %43, %11 : i64
%101 = llvm.getelementptr %42[%100] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32>
%102 = llvm.bitcast %101 : !llvm.ptr<i32> to !llvm.ptr<vector<4xi32>>
%103 = llvm.load %102 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>>
%104 = llvm.getelementptr %28[%91] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32>
%105 = llvm.load %104 : !llvm.ptr<i32>
%106 = llvm.mlir.undef : vector<1xi32>
%107 = llvm.insertelement %105, %106[%0 : i32] : vector<1xi32>
%108 = llvm.extractelement %107[%11 : i64] : vector<1xi32>
%109 = "llvm.intr.vector.reduce.add"(%103) : (vector<4xi32>) -> i32
%110 = llvm.add %108, %109 : i32
%111 = llvm.insertelement %110, %12[%11 : i64] : vector<1xi32>
%112 = llvm.extractelement %111[%9 : i64] : vector<1xi32>
%113 = llvm.insertelement %112, %106[%0 : i32] : vector<1xi32>
%114 = llvm.extractelement %113[%11 : i64] : vector<1xi32>
llvm.store %114, %104 : !llvm.ptr<i32>
%115 = llvm.add %82, %15 : i64
llvm.br ^bb6(%115 : i64)
^bb8: // pred: ^bb6
%116 = llvm.add %74, %14 : i64
llvm.br ^bb4(%116 : i64)
^bb9: // pred: ^bb4
%117 = llvm.add %62, %14 : i64
llvm.br ^bb2(%117 : i64)
^bb10: // pred: ^bb2
%118 = llvm.add %60, %15 : i64
llvm.br ^bb1(%118 : i64)
^bb11: // pred: ^bb1
llvm.return %0 : i32
}
}
}
}
func.func private @_split_reduction_pass2() {
%c1 = arith.constant 1 : index
%c256 = arith.constant 256 : index
%c512 = arith.constant 512 : index
%c128_i32 = arith.constant 128 : i32
%c1_i32 = arith.constant 1 : i32
%c67108864 = arith.constant 67108864 : index
%c524288 = arith.constant 524288 : index
%c0 = arith.constant 0 : index
%c1048576 = arith.constant 1048576 : index
%c-1_i64 = arith.constant -1 : i64
%c-1_i32 = arith.constant -1 : i32
%c16 = arith.constant 16 : index
%c8 = arith.constant 8 : index
%c268435488_i32 = arith.constant 268435488 : i32
%_device_query_0 = util.global.load @_device_query_0 : i1
%_pipeline_layout_0 = util.global.load @_pipeline_layout_0 : !hal.pipeline_layout
%_executable__split_reduction_pass2_dispatch_0 = util.global.load @_executable__split_reduction_pass2_dispatch_0 : !hal.executable
%device = hal.ex.shared_device : !hal.device
%allocator = hal.device.allocator<%device : !hal.device> : !hal.allocator
%buffer = hal.allocator.allocate<%allocator : !hal.allocator> type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c67108864}
%cmd = hal.command_buffer.create device(%device : !hal.device) mode("OneShot|AllowInlineExecution") categories(Transfer) : !hal.command_buffer
hal.command_buffer.fill_buffer<%cmd : !hal.command_buffer> target(%buffer : !hal.buffer)[%c0, %c67108864] pattern(%c1_i32 : i32)
hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None")
hal.command_buffer.finalize<%cmd : !hal.command_buffer>
%0 = util.null : !hal.fence
%fence = hal.fence.create device(%device : !hal.device) flags("None") : !hal.fence
hal.device.queue.execute<%device : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) commands([%cmd])
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.status.check_ok %status, "failed to wait on timepoint"
%1 = util.do_not_optimize(%buffer) : !hal.buffer
%len = hal.buffer.length<%1 : !hal.buffer> : index
%buffer_0 = hal.allocator.allocate<%allocator : !hal.allocator> type("HostVisible|DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage|MappingScoped|MappingAccessRandom|Mapping") : !hal.buffer{%c1048576}
%cmd_1 = hal.command_buffer.create device(%device : !hal.device) mode("OneShot|AllowInlineExecution") categories("Transfer|Dispatch") : !hal.command_buffer
cf.cond_br %_device_query_0, ^bb1, ^bb2
^bb1: // pred: ^bb0
hal.command_buffer.push_descriptor_set<%cmd_1 : !hal.command_buffer> layout(%_pipeline_layout_0 : !hal.pipeline_layout)[%c0] bindings([
%c0 = (%1 : !hal.buffer)[%c0, %len],
%c1 = (%buffer_0 : !hal.buffer)[%c0, %c1048576]
])
hal.command_buffer.dispatch<%cmd_1 : !hal.command_buffer> target(%_executable__split_reduction_pass2_dispatch_0 : !hal.executable)[0] workgroups([%c8, %c16, %c1])
hal.command_buffer.fill_buffer<%cmd_1 : !hal.command_buffer> target(%buffer_0 : !hal.buffer)[%c524288, %c524288] pattern(%c128_i32 : i32)
hal.command_buffer.execution_barrier<%cmd_1 : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None")
hal.command_buffer.finalize<%cmd_1 : !hal.command_buffer>
%fence_2 = hal.fence.create device(%device : !hal.device) flags("None") : !hal.fence
hal.device.queue.execute<%device : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence_2) commands([%cmd_1])
%status_3 = hal.fence.await until([%fence_2]) timeout_millis(%c-1_i32) : i32
util.status.check_ok %status_3, "failed to wait on timepoint"
%view = hal.buffer_view.create buffer(%buffer_0 : !hal.buffer)[%c0, %c524288] shape([%c512, %c256]) type(%c268435488_i32) encoding(%c1_i32) : !hal.buffer_view
%view_4 = hal.buffer_view.create buffer(%buffer_0 : !hal.buffer)[%c524288, %c524288] shape([%c512, %c256]) type(%c268435488_i32) encoding(%c1_i32) : !hal.buffer_view
check.expect_eq(%view, %view_4) : !hal.buffer_view
return
^bb2: // pred: ^bb0
util.unreachable "device not supported in the compiled configuration"
}
}
// -----// IR Dump After Canonicalizer (canonicalize) ('builtin.module' operation) //----- //
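// Relative to the preceding dump, the canonicalizer's only visible change is in the
// util.initializer: the trailing unconditional branch into ^bb4 is folded away, so ^bb3
// now ends directly in util.initializer.return. The dispatch executable body is unchanged.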
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
util.global private @_device_query_0 : i1
util.global private @_descriptor_set_layout_0 : !hal.descriptor_set_layout
util.global private @_pipeline_layout_0 : !hal.pipeline_layout
util.global private @_executable__split_reduction_pass2_dispatch_0 : !hal.executable
util.initializer {
%device = hal.ex.shared_device : !hal.device
%ok, %value = hal.device.query<%device : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false
util.global.store %value, @_device_query_0 : i1
%device_0 = hal.ex.shared_device : !hal.device
%descriptor_set_layout = hal.descriptor_set_layout.create device(%device_0 : !hal.device) flags("None") bindings([#hal.descriptor_set.binding<0, storage_buffer, ReadOnly>, #hal.descriptor_set.binding<1, storage_buffer>]) : !hal.descriptor_set_layout
util.global.store %descriptor_set_layout, @_descriptor_set_layout_0 : !hal.descriptor_set_layout
%_descriptor_set_layout_0 = util.global.load @_descriptor_set_layout_0 : !hal.descriptor_set_layout
%device_1 = hal.ex.shared_device : !hal.device
%pipeline_layout = hal.pipeline_layout.create device(%device_1 : !hal.device) push_constants(0) layouts([%_descriptor_set_layout_0]) : !hal.pipeline_layout
util.global.store %pipeline_layout, @_pipeline_layout_0 : !hal.pipeline_layout
%_device_query_0 = util.global.load @_device_query_0 : i1
%device_2 = hal.ex.shared_device : !hal.device
cf.cond_br %_device_query_0, ^bb1, ^bb2
^bb1: // pred: ^bb0
%_pipeline_layout_0 = util.global.load @_pipeline_layout_0 : !hal.pipeline_layout
%exe = hal.executable.create device(%device_2 : !hal.device) target(@_split_reduction_pass2_dispatch_0::@embedded_elf_x86_64) layouts([%_pipeline_layout_0]) : !hal.executable
cf.br ^bb3(%exe : !hal.executable)
^bb2: // pred: ^bb0
%0 = util.null : !hal.executable
cf.br ^bb3(%0 : !hal.executable)
^bb3(%1: !hal.executable): // 2 preds: ^bb1, ^bb2
util.global.store %1, @_executable__split_reduction_pass2_dispatch_0 : !hal.executable
util.initializer.return
}
func.func @split_reduction_pass2() attributes {iree.abi.stub} {
call @_split_reduction_pass2() : () -> ()
return
}
hal.executable private @_split_reduction_pass2_dispatch_0 {
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
hal.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 ordinal(0) layout(#pipeline_layout) attributes {translation_info = #translation} {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index):
%c1 = arith.constant 1 : index
%c16 = arith.constant 16 : index
%c8 = arith.constant 8 : index
hal.return %c8, %c16, %c1 : index, index, index
}
builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} {
llvm.func internal @_split_reduction_pass2_dispatch_0_generic_512x256x128(%arg0: !llvm.ptr<struct<"iree_hal_executable_environment_v0_t", (ptr<i32>, ptr<func<i32 (ptr<func<i32 (ptr<i8>)>>, ptr<i8>)>>, ptr<ptr<func<i32 (ptr<i8>)>>>, struct<"iree_hal_processor_v0_t", (array<8 x i64>)>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>> {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {sym_visibility = "private"} {
%0 = llvm.mlir.constant(0 : i32) : i32
%1 = llvm.mlir.constant(32768 : i64) : i64
%2 = llvm.mlir.constant(128 : i64) : i64
%3 = llvm.mlir.constant(256 : i64) : i64
%4 = llvm.mlir.constant(1 : i64) : i64
%5 = llvm.mlir.constant(63 : index) : i64
%6 = llvm.mlir.constant(256 : index) : i64
%7 = llvm.mlir.constant(32768 : index) : i64
%8 = llvm.mlir.constant(512 : index) : i64
%9 = llvm.mlir.constant(0 : i64) : i64
%10 = llvm.mlir.constant(dense<0> : vector<4xi32>) : vector<4xi32>
%11 = llvm.mlir.constant(0 : index) : i64
%12 = llvm.mlir.constant(dense<0> : vector<1xi32>) : vector<1xi32>
%13 = llvm.mlir.constant(128 : index) : i64
%14 = llvm.mlir.constant(4 : index) : i64
%15 = llvm.mlir.constant(1 : index) : i64
%16 = llvm.mlir.constant(32 : index) : i64
%17 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>>
%18 = llvm.extractvalue %17[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>
%19 = llvm.load %18 : !llvm.ptr<ptr<i8>>
%20 = llvm.bitcast %19 : !llvm.ptr<i8> to !llvm.ptr<i32>
%21 = llvm.ptrtoint %20 : !llvm.ptr<i32> to i64
%22 = llvm.and %21, %5 : i64
%23 = llvm.icmp "eq" %22, %11 : i64
"llvm.intr.assume"(%23) : (i1) -> ()
%24 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>>
%25 = llvm.extractvalue %24[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>
%26 = llvm.getelementptr %25[1] : (!llvm.ptr<ptr<i8>>) -> !llvm.ptr<ptr<i8>>
%27 = llvm.load %26 : !llvm.ptr<ptr<i8>>
%28 = llvm.bitcast %27 : !llvm.ptr<i8> to !llvm.ptr<i32>
%29 = llvm.ptrtoint %28 : !llvm.ptr<i32> to i64
%30 = llvm.and %29, %5 : i64
%31 = llvm.icmp "eq" %30, %11 : i64
"llvm.intr.assume"(%31) : (i1) -> ()
%32 = llvm.load %arg2 : !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>>
%33 = llvm.extractvalue %32[0] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>
%34 = llvm.zext %33 : i32 to i64
%35 = llvm.extractvalue %32[1] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>
%36 = llvm.zext %35 : i32 to i64
%37 = llvm.mul %36, %16 : i64
%38 = llvm.mul %34, %16 : i64
%39 = llvm.mlir.null : !llvm.ptr<i32>
%40 = llvm.getelementptr %39[4] : (!llvm.ptr<i32>) -> !llvm.ptr<i32>
%41 = llvm.ptrtoint %40 : !llvm.ptr<i32> to i64
%42 = llvm.alloca %41 x i32 {alignment = 128 : i64} : (i64) -> !llvm.ptr<i32>
%43 = llvm.mul %11, %14 : i64
%44 = llvm.add %43, %43 : i64
%45 = llvm.add %44, %11 : i64
%46 = llvm.getelementptr %42[%45] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32>
%47 = llvm.bitcast %46 : !llvm.ptr<i32> to !llvm.ptr<vector<4xi32>>
llvm.store %10, %47 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>>
%48 = llvm.bitcast %42 : !llvm.ptr<i32> to !llvm.ptr<vector<4xi32>>
%49 = llvm.load %48 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>>
%50 = llvm.mul %37, %6 : i64
%51 = llvm.add %50, %11 : i64
%52 = llvm.mul %38, %15 : i64
%53 = llvm.add %51, %52 : i64
%54 = llvm.mul %37, %7 : i64
%55 = llvm.add %54, %11 : i64
%56 = llvm.mul %38, %13 : i64
%57 = llvm.add %55, %56 : i64
%58 = llvm.mul %9, %15 : i64
%59 = llvm.add %57, %58 : i64
llvm.br ^bb1(%11 : i64)
^bb1(%60: i64): // 2 preds: ^bb0, ^bb10
%61 = llvm.icmp "slt" %60, %16 : i64
llvm.cond_br %61, ^bb2(%11 : i64), ^bb11
^bb2(%62: i64): // 2 preds: ^bb1, ^bb9
%63 = llvm.icmp "slt" %62, %16 : i64
llvm.cond_br %63, ^bb3, ^bb10
^bb3: // pred: ^bb2
%64 = llvm.mul %60, %1 : i64
%65 = llvm.add %59, %64 : i64
%66 = llvm.mul %62, %2 : i64
%67 = llvm.add %65, %66 : i64
%68 = llvm.mul %9, %4 : i64
%69 = llvm.add %67, %68 : i64
%70 = llvm.mul %60, %3 : i64
%71 = llvm.add %53, %70 : i64
%72 = llvm.mul %62, %4 : i64
%73 = llvm.add %71, %72 : i64
llvm.br ^bb4(%11 : i64)
^bb4(%74: i64): // 2 preds: ^bb3, ^bb8
%75 = llvm.icmp "slt" %74, %13 : i64
llvm.cond_br %75, ^bb5, ^bb9
^bb5: // pred: ^bb4
%76 = llvm.mul %9, %8 : i64
%77 = llvm.add %69, %76 : i64
%78 = llvm.mul %9, %13 : i64
%79 = llvm.add %77, %78 : i64
%80 = llvm.mul %74, %15 : i64
%81 = llvm.add %79, %80 : i64
llvm.br ^bb6(%11 : i64)
^bb6(%82: i64): // 2 preds: ^bb5, ^bb7
%83 = llvm.icmp "slt" %82, %14 : i64
llvm.cond_br %83, ^bb7, ^bb8
^bb7: // pred: ^bb6
%84 = llvm.add %81, %76 : i64
%85 = llvm.mul %82, %13 : i64
%86 = llvm.add %84, %85 : i64
%87 = llvm.add %86, %58 : i64
%88 = llvm.mul %9, %14 : i64
%89 = llvm.add %73, %88 : i64
%90 = llvm.mul %82, %15 : i64
%91 = llvm.add %89, %90 : i64
%92 = llvm.add %87, %43 : i64
%93 = llvm.add %92, %43 : i64
%94 = llvm.add %93, %43 : i64
%95 = llvm.add %94, %11 : i64
%96 = llvm.getelementptr %20[%95] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32>
%97 = llvm.bitcast %96 : !llvm.ptr<i32> to !llvm.ptr<vector<4xi32>>
%98 = llvm.load %97 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>>
%99 = llvm.add %98, %49 : vector<4xi32>
llvm.store %99, %48 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>>
%100 = llvm.add %43, %11 : i64
%101 = llvm.getelementptr %42[%100] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32>
%102 = llvm.bitcast %101 : !llvm.ptr<i32> to !llvm.ptr<vector<4xi32>>
%103 = llvm.load %102 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>>
%104 = llvm.getelementptr %28[%91] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32>
%105 = llvm.load %104 : !llvm.ptr<i32>
%106 = llvm.mlir.undef : vector<1xi32>
%107 = llvm.insertelement %105, %106[%0 : i32] : vector<1xi32>
%108 = llvm.extractelement %107[%11 : i64] : vector<1xi32>
%109 = "llvm.intr.vector.reduce.add"(%103) : (vector<4xi32>) -> i32
%110 = llvm.add %108, %109 : i32
%111 = llvm.insertelement %110, %12[%11 : i64] : vector<1xi32>
%112 = llvm.extractelement %111[%9 : i64] : vector<1xi32>
%113 = llvm.insertelement %112, %106[%0 : i32] : vector<1xi32>
%114 = llvm.extractelement %113[%11 : i64] : vector<1xi32>
llvm.store %114, %104 : !llvm.ptr<i32>
%115 = llvm.add %82, %15 : i64
llvm.br ^bb6(%115 : i64)
^bb8: // pred: ^bb6
%116 = llvm.add %74, %14 : i64
llvm.br ^bb4(%116 : i64)
^bb9: // pred: ^bb4
%117 = llvm.add %62, %14 : i64
llvm.br ^bb2(%117 : i64)
^bb10: // pred: ^bb2
%118 = llvm.add %60, %15 : i64
llvm.br ^bb1(%118 : i64)
^bb11: // pred: ^bb1
llvm.return %0 : i32
}
}
}
}
func.func private @_split_reduction_pass2() {
%c1 = arith.constant 1 : index
%c256 = arith.constant 256 : index
%c512 = arith.constant 512 : index
%c128_i32 = arith.constant 128 : i32
%c1_i32 = arith.constant 1 : i32
%c67108864 = arith.constant 67108864 : index
%c524288 = arith.constant 524288 : index
%c0 = arith.constant 0 : index
%c1048576 = arith.constant 1048576 : index
%c-1_i64 = arith.constant -1 : i64
%c-1_i32 = arith.constant -1 : i32
%c16 = arith.constant 16 : index
%c8 = arith.constant 8 : index
%c268435488_i32 = arith.constant 268435488 : i32
%_device_query_0 = util.global.load @_device_query_0 : i1
%_pipeline_layout_0 = util.global.load @_pipeline_layout_0 : !hal.pipeline_layout
%_executable__split_reduction_pass2_dispatch_0 = util.global.load @_executable__split_reduction_pass2_dispatch_0 : !hal.executable
%device = hal.ex.shared_device : !hal.device
%allocator = hal.device.allocator<%device : !hal.device> : !hal.allocator
%buffer = hal.allocator.allocate<%allocator : !hal.allocator> type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c67108864}
%cmd = hal.command_buffer.create device(%device : !hal.device) mode("OneShot|AllowInlineExecution") categories(Transfer) : !hal.command_buffer
hal.command_buffer.fill_buffer<%cmd : !hal.command_buffer> target(%buffer : !hal.buffer)[%c0, %c67108864] pattern(%c1_i32 : i32)
hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None")
hal.command_buffer.finalize<%cmd : !hal.command_buffer>
%0 = util.null : !hal.fence
%fence = hal.fence.create device(%device : !hal.device) flags("None") : !hal.fence
hal.device.queue.execute<%device : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) commands([%cmd])
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.status.check_ok %status, "failed to wait on timepoint"
%1 = util.do_not_optimize(%buffer) : !hal.buffer
%len = hal.buffer.length<%1 : !hal.buffer> : index
%buffer_0 = hal.allocator.allocate<%allocator : !hal.allocator> type("HostVisible|DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage|MappingScoped|MappingAccessRandom|Mapping") : !hal.buffer{%c1048576}
%cmd_1 = hal.command_buffer.create device(%device : !hal.device) mode("OneShot|AllowInlineExecution") categories("Transfer|Dispatch") : !hal.command_buffer
cf.cond_br %_device_query_0, ^bb1, ^bb2
^bb1: // pred: ^bb0
hal.command_buffer.push_descriptor_set<%cmd_1 : !hal.command_buffer> layout(%_pipeline_layout_0 : !hal.pipeline_layout)[%c0] bindings([
%c0 = (%1 : !hal.buffer)[%c0, %len],
%c1 = (%buffer_0 : !hal.buffer)[%c0, %c1048576]
])
hal.command_buffer.dispatch<%cmd_1 : !hal.command_buffer> target(%_executable__split_reduction_pass2_dispatch_0 : !hal.executable)[0] workgroups([%c8, %c16, %c1])
hal.command_buffer.fill_buffer<%cmd_1 : !hal.command_buffer> target(%buffer_0 : !hal.buffer)[%c524288, %c524288] pattern(%c128_i32 : i32)
hal.command_buffer.execution_barrier<%cmd_1 : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None")
hal.command_buffer.finalize<%cmd_1 : !hal.command_buffer>
%fence_2 = hal.fence.create device(%device : !hal.device) flags("None") : !hal.fence
hal.device.queue.execute<%device : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence_2) commands([%cmd_1])
%status_3 = hal.fence.await until([%fence_2]) timeout_millis(%c-1_i32) : i32
util.status.check_ok %status_3, "failed to wait on timepoint"
%view = hal.buffer_view.create buffer(%buffer_0 : !hal.buffer)[%c0, %c524288] shape([%c512, %c256]) type(%c268435488_i32) encoding(%c1_i32) : !hal.buffer_view
%view_4 = hal.buffer_view.create buffer(%buffer_0 : !hal.buffer)[%c524288, %c524288] shape([%c512, %c256]) type(%c268435488_i32) encoding(%c1_i32) : !hal.buffer_view
check.expect_eq(%view, %view_4) : !hal.buffer_view
return
^bb2: // pred: ^bb0
util.unreachable "device not supported in the compiled configuration"
}
}
// -----// IR Dump After CSE (cse) ('builtin.module' operation) //----- //
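// In this dump, CSE has deduplicated the repeated hal.ex.shared_device calls from the
// previous dump (%device_0, %device_1, %device_2) into the single %device value, which now
// feeds descriptor-set-layout, pipeline-layout, and executable creation in the
// util.initializer. The dispatch executable body is unchanged.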
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
util.global private @_device_query_0 : i1
util.global private @_descriptor_set_layout_0 : !hal.descriptor_set_layout
util.global private @_pipeline_layout_0 : !hal.pipeline_layout
util.global private @_executable__split_reduction_pass2_dispatch_0 : !hal.executable
util.initializer {
%device = hal.ex.shared_device : !hal.device
%ok, %value = hal.device.query<%device : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false
util.global.store %value, @_device_query_0 : i1
%descriptor_set_layout = hal.descriptor_set_layout.create device(%device : !hal.device) flags("None") bindings([#hal.descriptor_set.binding<0, storage_buffer, ReadOnly>, #hal.descriptor_set.binding<1, storage_buffer>]) : !hal.descriptor_set_layout
util.global.store %descriptor_set_layout, @_descriptor_set_layout_0 : !hal.descriptor_set_layout
%_descriptor_set_layout_0 = util.global.load @_descriptor_set_layout_0 : !hal.descriptor_set_layout
%pipeline_layout = hal.pipeline_layout.create device(%device : !hal.device) push_constants(0) layouts([%_descriptor_set_layout_0]) : !hal.pipeline_layout
util.global.store %pipeline_layout, @_pipeline_layout_0 : !hal.pipeline_layout
%_device_query_0 = util.global.load @_device_query_0 : i1
cf.cond_br %_device_query_0, ^bb1, ^bb2
^bb1: // pred: ^bb0
%_pipeline_layout_0 = util.global.load @_pipeline_layout_0 : !hal.pipeline_layout
%exe = hal.executable.create device(%device : !hal.device) target(@_split_reduction_pass2_dispatch_0::@embedded_elf_x86_64) layouts([%_pipeline_layout_0]) : !hal.executable
cf.br ^bb3(%exe : !hal.executable)
^bb2: // pred: ^bb0
%0 = util.null : !hal.executable
cf.br ^bb3(%0 : !hal.executable)
^bb3(%1: !hal.executable): // 2 preds: ^bb1, ^bb2
util.global.store %1, @_executable__split_reduction_pass2_dispatch_0 : !hal.executable
util.initializer.return
}
func.func @split_reduction_pass2() attributes {iree.abi.stub} {
call @_split_reduction_pass2() : () -> ()
return
}
hal.executable private @_split_reduction_pass2_dispatch_0 {
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
hal.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 ordinal(0) layout(#pipeline_layout) attributes {translation_info = #translation} {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index):
%c1 = arith.constant 1 : index
%c16 = arith.constant 16 : index
%c8 = arith.constant 8 : index
hal.return %c8, %c16, %c1 : index, index, index
}
builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} {
llvm.func internal @_split_reduction_pass2_dispatch_0_generic_512x256x128(%arg0: !llvm.ptr<struct<"iree_hal_executable_environment_v0_t", (ptr<i32>, ptr<func<i32 (ptr<func<i32 (ptr<i8>)>>, ptr<i8>)>>, ptr<ptr<func<i32 (ptr<i8>)>>>, struct<"iree_hal_processor_v0_t", (array<8 x i64>)>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>> {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {sym_visibility = "private"} {
%0 = llvm.mlir.constant(0 : i32) : i32
%1 = llvm.mlir.constant(32768 : i64) : i64
%2 = llvm.mlir.constant(128 : i64) : i64
%3 = llvm.mlir.constant(256 : i64) : i64
%4 = llvm.mlir.constant(1 : i64) : i64
%5 = llvm.mlir.constant(63 : index) : i64
%6 = llvm.mlir.constant(256 : index) : i64
%7 = llvm.mlir.constant(32768 : index) : i64
%8 = llvm.mlir.constant(512 : index) : i64
%9 = llvm.mlir.constant(0 : i64) : i64
%10 = llvm.mlir.constant(dense<0> : vector<4xi32>) : vector<4xi32>
%11 = llvm.mlir.constant(0 : index) : i64
%12 = llvm.mlir.constant(dense<0> : vector<1xi32>) : vector<1xi32>
%13 = llvm.mlir.constant(128 : index) : i64
%14 = llvm.mlir.constant(4 : index) : i64
%15 = llvm.mlir.constant(1 : index) : i64
%16 = llvm.mlir.constant(32 : index) : i64
%17 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>>
%18 = llvm.extractvalue %17[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>
%19 = llvm.load %18 : !llvm.ptr<ptr<i8>>
%20 = llvm.bitcast %19 : !llvm.ptr<i8> to !llvm.ptr<i32>
%21 = llvm.ptrtoint %20 : !llvm.ptr<i32> to i64
%22 = llvm.and %21, %5 : i64
%23 = llvm.icmp "eq" %22, %11 : i64
"llvm.intr.assume"(%23) : (i1) -> ()
%24 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>>
%25 = llvm.extractvalue %24[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>
%26 = llvm.getelementptr %25[1] : (!llvm.ptr<ptr<i8>>) -> !llvm.ptr<ptr<i8>>
%27 = llvm.load %26 : !llvm.ptr<ptr<i8>>
%28 = llvm.bitcast %27 : !llvm.ptr<i8> to !llvm.ptr<i32>
%29 = llvm.ptrtoint %28 : !llvm.ptr<i32> to i64
%30 = llvm.and %29, %5 : i64
%31 = llvm.icmp "eq" %30, %11 : i64
"llvm.intr.assume"(%31) : (i1) -> ()
%32 = llvm.load %arg2 : !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>>
%33 = llvm.extractvalue %32[0] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>
%34 = llvm.zext %33 : i32 to i64
%35 = llvm.extractvalue %32[1] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>
%36 = llvm.zext %35 : i32 to i64
%37 = llvm.mul %36, %16 : i64
%38 = llvm.mul %34, %16 : i64
%39 = llvm.mlir.null : !llvm.ptr<i32>
%40 = llvm.getelementptr %39[4] : (!llvm.ptr<i32>) -> !llvm.ptr<i32>
%41 = llvm.ptrtoint %40 : !llvm.ptr<i32> to i64
%42 = llvm.alloca %41 x i32 {alignment = 128 : i64} : (i64) -> !llvm.ptr<i32>
%43 = llvm.mul %11, %14 : i64
%44 = llvm.add %43, %43 : i64
%45 = llvm.add %44, %11 : i64
%46 = llvm.getelementptr %42[%45] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32>
%47 = llvm.bitcast %46 : !llvm.ptr<i32> to !llvm.ptr<vector<4xi32>>
llvm.store %10, %47 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>>
%48 = llvm.bitcast %42 : !llvm.ptr<i32> to !llvm.ptr<vector<4xi32>>
%49 = llvm.load %48 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>>
%50 = llvm.mul %37, %6 : i64
%51 = llvm.add %50, %11 : i64
%52 = llvm.mul %38, %15 : i64
%53 = llvm.add %51, %52 : i64
%54 = llvm.mul %37, %7 : i64
%55 = llvm.add %54, %11 : i64
%56 = llvm.mul %38, %13 : i64
%57 = llvm.add %55, %56 : i64
%58 = llvm.mul %9, %15 : i64
%59 = llvm.add %57, %58 : i64
llvm.br ^bb1(%11 : i64)
^bb1(%60: i64): // 2 preds: ^bb0, ^bb10
%61 = llvm.icmp "slt" %60, %16 : i64
llvm.cond_br %61, ^bb2(%11 : i64), ^bb11
^bb2(%62: i64): // 2 preds: ^bb1, ^bb9
%63 = llvm.icmp "slt" %62, %16 : i64
llvm.cond_br %63, ^bb3, ^bb10
^bb3: // pred: ^bb2
%64 = llvm.mul %60, %1 : i64
%65 = llvm.add %59, %64 : i64
%66 = llvm.mul %62, %2 : i64
%67 = llvm.add %65, %66 : i64
%68 = llvm.mul %9, %4 : i64
%69 = llvm.add %67, %68 : i64
%70 = llvm.mul %60, %3 : i64
%71 = llvm.add %53, %70 : i64
%72 = llvm.mul %62, %4 : i64
%73 = llvm.add %71, %72 : i64
llvm.br ^bb4(%11 : i64)
^bb4(%74: i64): // 2 preds: ^bb3, ^bb8
%75 = llvm.icmp "slt" %74, %13 : i64
llvm.cond_br %75, ^bb5, ^bb9
^bb5: // pred: ^bb4
%76 = llvm.mul %9, %8 : i64
%77 = llvm.add %69, %76 : i64
%78 = llvm.mul %9, %13 : i64
%79 = llvm.add %77, %78 : i64
%80 = llvm.mul %74, %15 : i64
%81 = llvm.add %79, %80 : i64
llvm.br ^bb6(%11 : i64)
^bb6(%82: i64): // 2 preds: ^bb5, ^bb7
%83 = llvm.icmp "slt" %82, %14 : i64
llvm.cond_br %83, ^bb7, ^bb8
^bb7: // pred: ^bb6
%84 = llvm.add %81, %76 : i64
%85 = llvm.mul %82, %13 : i64
%86 = llvm.add %84, %85 : i64
%87 = llvm.add %86, %58 : i64
%88 = llvm.mul %9, %14 : i64
%89 = llvm.add %73, %88 : i64
%90 = llvm.mul %82, %15 : i64
%91 = llvm.add %89, %90 : i64
%92 = llvm.add %87, %43 : i64
%93 = llvm.add %92, %43 : i64
%94 = llvm.add %93, %43 : i64
%95 = llvm.add %94, %11 : i64
%96 = llvm.getelementptr %20[%95] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32>
%97 = llvm.bitcast %96 : !llvm.ptr<i32> to !llvm.ptr<vector<4xi32>>
%98 = llvm.load %97 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>>
%99 = llvm.add %98, %49 : vector<4xi32>
llvm.store %99, %48 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>>
%100 = llvm.add %43, %11 : i64
%101 = llvm.getelementptr %42[%100] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32>
%102 = llvm.bitcast %101 : !llvm.ptr<i32> to !llvm.ptr<vector<4xi32>>
%103 = llvm.load %102 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>>
%104 = llvm.getelementptr %28[%91] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32>
%105 = llvm.load %104 : !llvm.ptr<i32>
%106 = llvm.mlir.undef : vector<1xi32>
%107 = llvm.insertelement %105, %106[%0 : i32] : vector<1xi32>
%108 = llvm.extractelement %107[%11 : i64] : vector<1xi32>
%109 = "llvm.intr.vector.reduce.add"(%103) : (vector<4xi32>) -> i32
%110 = llvm.add %108, %109 : i32
%111 = llvm.insertelement %110, %12[%11 : i64] : vector<1xi32>
%112 = llvm.extractelement %111[%9 : i64] : vector<1xi32>
%113 = llvm.insertelement %112, %106[%0 : i32] : vector<1xi32>
%114 = llvm.extractelement %113[%11 : i64] : vector<1xi32>
llvm.store %114, %104 : !llvm.ptr<i32>
%115 = llvm.add %82, %15 : i64
llvm.br ^bb6(%115 : i64)
^bb8: // pred: ^bb6
%116 = llvm.add %74, %14 : i64
llvm.br ^bb4(%116 : i64)
^bb9: // pred: ^bb4
%117 = llvm.add %62, %14 : i64
llvm.br ^bb2(%117 : i64)
^bb10: // pred: ^bb2
%118 = llvm.add %60, %15 : i64
llvm.br ^bb1(%118 : i64)
^bb11: // pred: ^bb1
llvm.return %0 : i32
}
}
}
}
func.func private @_split_reduction_pass2() {
%c1 = arith.constant 1 : index
%c256 = arith.constant 256 : index
%c512 = arith.constant 512 : index
%c128_i32 = arith.constant 128 : i32
%c1_i32 = arith.constant 1 : i32
%c67108864 = arith.constant 67108864 : index
%c524288 = arith.constant 524288 : index
%c0 = arith.constant 0 : index
%c1048576 = arith.constant 1048576 : index
%c-1_i64 = arith.constant -1 : i64
%c-1_i32 = arith.constant -1 : i32
%c16 = arith.constant 16 : index
%c8 = arith.constant 8 : index
%c268435488_i32 = arith.constant 268435488 : i32
%_device_query_0 = util.global.load @_device_query_0 : i1
%_pipeline_layout_0 = util.global.load @_pipeline_layout_0 : !hal.pipeline_layout
%_executable__split_reduction_pass2_dispatch_0 = util.global.load @_executable__split_reduction_pass2_dispatch_0 : !hal.executable
%device = hal.ex.shared_device : !hal.device
%allocator = hal.device.allocator<%device : !hal.device> : !hal.allocator
%buffer = hal.allocator.allocate<%allocator : !hal.allocator> type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c67108864}
%cmd = hal.command_buffer.create device(%device : !hal.device) mode("OneShot|AllowInlineExecution") categories(Transfer) : !hal.command_buffer
hal.command_buffer.fill_buffer<%cmd : !hal.command_buffer> target(%buffer : !hal.buffer)[%c0, %c67108864] pattern(%c1_i32 : i32)
hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None")
hal.command_buffer.finalize<%cmd : !hal.command_buffer>
%0 = util.null : !hal.fence
%fence = hal.fence.create device(%device : !hal.device) flags("None") : !hal.fence
hal.device.queue.execute<%device : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) commands([%cmd])
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.status.check_ok %status, "failed to wait on timepoint"
%1 = util.do_not_optimize(%buffer) : !hal.buffer
%len = hal.buffer.length<%1 : !hal.buffer> : index
%buffer_0 = hal.allocator.allocate<%allocator : !hal.allocator> type("HostVisible|DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage|MappingScoped|MappingAccessRandom|Mapping") : !hal.buffer{%c1048576}
%cmd_1 = hal.command_buffer.create device(%device : !hal.device) mode("OneShot|AllowInlineExecution") categories("Transfer|Dispatch") : !hal.command_buffer
cf.cond_br %_device_query_0, ^bb1, ^bb2
^bb1: // pred: ^bb0
hal.command_buffer.push_descriptor_set<%cmd_1 : !hal.command_buffer> layout(%_pipeline_layout_0 : !hal.pipeline_layout)[%c0] bindings([
%c0 = (%1 : !hal.buffer)[%c0, %len],
%c1 = (%buffer_0 : !hal.buffer)[%c0, %c1048576]
])
hal.command_buffer.dispatch<%cmd_1 : !hal.command_buffer> target(%_executable__split_reduction_pass2_dispatch_0 : !hal.executable)[0] workgroups([%c8, %c16, %c1])
hal.command_buffer.fill_buffer<%cmd_1 : !hal.command_buffer> target(%buffer_0 : !hal.buffer)[%c524288, %c524288] pattern(%c128_i32 : i32)
hal.command_buffer.execution_barrier<%cmd_1 : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None")
hal.command_buffer.finalize<%cmd_1 : !hal.command_buffer>
%fence_2 = hal.fence.create device(%device : !hal.device) flags("None") : !hal.fence
hal.device.queue.execute<%device : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence_2) commands([%cmd_1])
%status_3 = hal.fence.await until([%fence_2]) timeout_millis(%c-1_i32) : i32
util.status.check_ok %status_3, "failed to wait on timepoint"
%view = hal.buffer_view.create buffer(%buffer_0 : !hal.buffer)[%c0, %c524288] shape([%c512, %c256]) type(%c268435488_i32) encoding(%c1_i32) : !hal.buffer_view
%view_4 = hal.buffer_view.create buffer(%buffer_0 : !hal.buffer)[%c524288, %c524288] shape([%c512, %c256]) type(%c268435488_i32) encoding(%c1_i32) : !hal.buffer_view
check.expect_eq(%view, %view_4) : !hal.buffer_view
return
^bb2: // pred: ^bb0
util.unreachable "device not supported in the compiled configuration"
}
}
// -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) ('util.initializer' operation) //----- //
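// Here iree-util-simplify-global-accesses forwards the values just stored to
// @_device_query_0 and @_descriptor_set_layout_0 to their uses in the entry block of the
// util.initializer (cf.cond_br now branches on %value, and pipeline_layout.create consumes
// %descriptor_set_layout directly) and groups the global stores after the create ops; the
// load of @_pipeline_layout_0 in ^bb1 remains.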
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
util.global private @_device_query_0 : i1
util.global private @_descriptor_set_layout_0 : !hal.descriptor_set_layout
util.global private @_pipeline_layout_0 : !hal.pipeline_layout
util.global private @_executable__split_reduction_pass2_dispatch_0 : !hal.executable
util.initializer {
%device = hal.ex.shared_device : !hal.device
%ok, %value = hal.device.query<%device : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false
%descriptor_set_layout = hal.descriptor_set_layout.create device(%device : !hal.device) flags("None") bindings([#hal.descriptor_set.binding<0, storage_buffer, ReadOnly>, #hal.descriptor_set.binding<1, storage_buffer>]) : !hal.descriptor_set_layout
%pipeline_layout = hal.pipeline_layout.create device(%device : !hal.device) push_constants(0) layouts([%descriptor_set_layout]) : !hal.pipeline_layout
util.global.store %descriptor_set_layout, @_descriptor_set_layout_0 : !hal.descriptor_set_layout
util.global.store %value, @_device_query_0 : i1
util.global.store %pipeline_layout, @_pipeline_layout_0 : !hal.pipeline_layout
cf.cond_br %value, ^bb1, ^bb2
^bb1: // pred: ^bb0
%_pipeline_layout_0 = util.global.load @_pipeline_layout_0 : !hal.pipeline_layout
%exe = hal.executable.create device(%device : !hal.device) target(@_split_reduction_pass2_dispatch_0::@embedded_elf_x86_64) layouts([%_pipeline_layout_0]) : !hal.executable
cf.br ^bb3(%exe : !hal.executable)
^bb2: // pred: ^bb0
%0 = util.null : !hal.executable
cf.br ^bb3(%0 : !hal.executable)
^bb3(%1: !hal.executable): // 2 preds: ^bb1, ^bb2
util.global.store %1, @_executable__split_reduction_pass2_dispatch_0 : !hal.executable
util.initializer.return
}
func.func @split_reduction_pass2() attributes {iree.abi.stub} {
call @_split_reduction_pass2() : () -> ()
return
}
hal.executable private @_split_reduction_pass2_dispatch_0 {
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
hal.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 ordinal(0) layout(#pipeline_layout) attributes {translation_info = #translation} {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index):
%c1 = arith.constant 1 : index
%c16 = arith.constant 16 : index
%c8 = arith.constant 8 : index
hal.return %c8, %c16, %c1 : index, index, index
}
builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} {
llvm.func internal @_split_reduction_pass2_dispatch_0_generic_512x256x128(%arg0: !llvm.ptr<struct<"iree_hal_executable_environment_v0_t", (ptr<i32>, ptr<func<i32 (ptr<func<i32 (ptr<i8>)>>, ptr<i8>)>>, ptr<ptr<func<i32 (ptr<i8>)>>>, struct<"iree_hal_processor_v0_t", (array<8 x i64>)>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>> {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {sym_visibility = "private"} {
%0 = llvm.mlir.constant(0 : i32) : i32
%1 = llvm.mlir.constant(32768 : i64) : i64
%2 = llvm.mlir.constant(128 : i64) : i64
%3 = llvm.mlir.constant(256 : i64) : i64
%4 = llvm.mlir.constant(1 : i64) : i64
%5 = llvm.mlir.constant(63 : index) : i64
%6 = llvm.mlir.constant(256 : index) : i64
%7 = llvm.mlir.constant(32768 : index) : i64
%8 = llvm.mlir.constant(512 : index) : i64
%9 = llvm.mlir.constant(0 : i64) : i64
%10 = llvm.mlir.constant(dense<0> : vector<4xi32>) : vector<4xi32>
%11 = llvm.mlir.constant(0 : index) : i64
%12 = llvm.mlir.constant(dense<0> : vector<1xi32>) : vector<1xi32>
%13 = llvm.mlir.constant(128 : index) : i64
%14 = llvm.mlir.constant(4 : index) : i64
%15 = llvm.mlir.constant(1 : index) : i64
%16 = llvm.mlir.constant(32 : index) : i64
%17 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>>
%18 = llvm.extractvalue %17[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>
%19 = llvm.load %18 : !llvm.ptr<ptr<i8>>
%20 = llvm.bitcast %19 : !llvm.ptr<i8> to !llvm.ptr<i32>
%21 = llvm.ptrtoint %20 : !llvm.ptr<i32> to i64
%22 = llvm.and %21, %5 : i64
%23 = llvm.icmp "eq" %22, %11 : i64
"llvm.intr.assume"(%23) : (i1) -> ()
%24 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>>
%25 = llvm.extractvalue %24[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>
%26 = llvm.getelementptr %25[1] : (!llvm.ptr<ptr<i8>>) -> !llvm.ptr<ptr<i8>>
%27 = llvm.load %26 : !llvm.ptr<ptr<i8>>
%28 = llvm.bitcast %27 : !llvm.ptr<i8> to !llvm.ptr<i32>
%29 = llvm.ptrtoint %28 : !llvm.ptr<i32> to i64
%30 = llvm.and %29, %5 : i64
%31 = llvm.icmp "eq" %30, %11 : i64
"llvm.intr.assume"(%31) : (i1) -> ()
%32 = llvm.load %arg2 : !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>>
%33 = llvm.extractvalue %32[0] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>
%34 = llvm.zext %33 : i32 to i64
%35 = llvm.extractvalue %32[1] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>
%36 = llvm.zext %35 : i32 to i64
%37 = llvm.mul %36, %16 : i64
%38 = llvm.mul %34, %16 : i64
%39 = llvm.mlir.null : !llvm.ptr<i32>
%40 = llvm.getelementptr %39[4] : (!llvm.ptr<i32>) -> !llvm.ptr<i32>
%41 = llvm.ptrtoint %40 : !llvm.ptr<i32> to i64
%42 = llvm.alloca %41 x i32 {alignment = 128 : i64} : (i64) -> !llvm.ptr<i32>
%43 = llvm.mul %11, %14 : i64
%44 = llvm.add %43, %43 : i64
%45 = llvm.add %44, %11 : i64
%46 = llvm.getelementptr %42[%45] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32>
%47 = llvm.bitcast %46 : !llvm.ptr<i32> to !llvm.ptr<vector<4xi32>>
llvm.store %10, %47 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>>
%48 = llvm.bitcast %42 : !llvm.ptr<i32> to !llvm.ptr<vector<4xi32>>
%49 = llvm.load %48 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>>
%50 = llvm.mul %37, %6 : i64
%51 = llvm.add %50, %11 : i64
%52 = llvm.mul %38, %15 : i64
%53 = llvm.add %51, %52 : i64
%54 = llvm.mul %37, %7 : i64
%55 = llvm.add %54, %11 : i64
%56 = llvm.mul %38, %13 : i64
%57 = llvm.add %55, %56 : i64
%58 = llvm.mul %9, %15 : i64
%59 = llvm.add %57, %58 : i64
llvm.br ^bb1(%11 : i64)
^bb1(%60: i64): // 2 preds: ^bb0, ^bb10
%61 = llvm.icmp "slt" %60, %16 : i64
llvm.cond_br %61, ^bb2(%11 : i64), ^bb11
^bb2(%62: i64): // 2 preds: ^bb1, ^bb9
%63 = llvm.icmp "slt" %62, %16 : i64
llvm.cond_br %63, ^bb3, ^bb10
^bb3: // pred: ^bb2
%64 = llvm.mul %60, %1 : i64
%65 = llvm.add %59, %64 : i64
%66 = llvm.mul %62, %2 : i64
%67 = llvm.add %65, %66 : i64
%68 = llvm.mul %9, %4 : i64
%69 = llvm.add %67, %68 : i64
%70 = llvm.mul %60, %3 : i64
%71 = llvm.add %53, %70 : i64
%72 = llvm.mul %62, %4 : i64
%73 = llvm.add %71, %72 : i64
llvm.br ^bb4(%11 : i64)
^bb4(%74: i64): // 2 preds: ^bb3, ^bb8
%75 = llvm.icmp "slt" %74, %13 : i64
llvm.cond_br %75, ^bb5, ^bb9
^bb5: // pred: ^bb4
%76 = llvm.mul %9, %8 : i64
%77 = llvm.add %69, %76 : i64
%78 = llvm.mul %9, %13 : i64
%79 = llvm.add %77, %78 : i64
%80 = llvm.mul %74, %15 : i64
%81 = llvm.add %79, %80 : i64
llvm.br ^bb6(%11 : i64)
^bb6(%82: i64): // 2 preds: ^bb5, ^bb7
%83 = llvm.icmp "slt" %82, %14 : i64
llvm.cond_br %83, ^bb7, ^bb8
^bb7: // pred: ^bb6
%84 = llvm.add %81, %76 : i64
%85 = llvm.mul %82, %13 : i64
%86 = llvm.add %84, %85 : i64
%87 = llvm.add %86, %58 : i64
%88 = llvm.mul %9, %14 : i64
%89 = llvm.add %73, %88 : i64
%90 = llvm.mul %82, %15 : i64
%91 = llvm.add %89, %90 : i64
%92 = llvm.add %87, %43 : i64
%93 = llvm.add %92, %43 : i64
%94 = llvm.add %93, %43 : i64
%95 = llvm.add %94, %11 : i64
%96 = llvm.getelementptr %20[%95] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32>
%97 = llvm.bitcast %96 : !llvm.ptr<i32> to !llvm.ptr<vector<4xi32>>
%98 = llvm.load %97 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>>
%99 = llvm.add %98, %49 : vector<4xi32>
llvm.store %99, %48 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>>
%100 = llvm.add %43, %11 : i64
%101 = llvm.getelementptr %42[%100] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32>
%102 = llvm.bitcast %101 : !llvm.ptr<i32> to !llvm.ptr<vector<4xi32>>
%103 = llvm.load %102 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>>
%104 = llvm.getelementptr %28[%91] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32>
%105 = llvm.load %104 : !llvm.ptr<i32>
%106 = llvm.mlir.undef : vector<1xi32>
%107 = llvm.insertelement %105, %106[%0 : i32] : vector<1xi32>
%108 = llvm.extractelement %107[%11 : i64] : vector<1xi32>
%109 = "llvm.intr.vector.reduce.add"(%103) : (vector<4xi32>) -> i32
%110 = llvm.add %108, %109 : i32
%111 = llvm.insertelement %110, %12[%11 : i64] : vector<1xi32>
%112 = llvm.extractelement %111[%9 : i64] : vector<1xi32>
%113 = llvm.insertelement %112, %106[%0 : i32] : vector<1xi32>
%114 = llvm.extractelement %113[%11 : i64] : vector<1xi32>
llvm.store %114, %104 : !llvm.ptr<i32>
%115 = llvm.add %82, %15 : i64
llvm.br ^bb6(%115 : i64)
^bb8: // pred: ^bb6
%116 = llvm.add %74, %14 : i64
llvm.br ^bb4(%116 : i64)
^bb9: // pred: ^bb4
%117 = llvm.add %62, %14 : i64
llvm.br ^bb2(%117 : i64)
^bb10: // pred: ^bb2
%118 = llvm.add %60, %15 : i64
llvm.br ^bb1(%118 : i64)
^bb11: // pred: ^bb1
llvm.return %0 : i32
}
}
}
}
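  // Host side: the function below loads the cached device-query, pipeline-layout, and
  // executable globals, allocates a 64 MiB device buffer and fills it with 1s (the splat
  // input), records a dispatch over the 8x16x1 workgroup grid plus a fill of the expected
  // value 128 into the upper half of a 1 MiB result buffer, waits on the signal fence, and
  // finally compares the two 512x256xi32 buffer views with check.expect_eq.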
func.func private @_split_reduction_pass2() {
%c1 = arith.constant 1 : index
%c256 = arith.constant 256 : index
%c512 = arith.constant 512 : index
%c128_i32 = arith.constant 128 : i32
%c1_i32 = arith.constant 1 : i32
%c67108864 = arith.constant 67108864 : index
%c524288 = arith.constant 524288 : index
%c0 = arith.constant 0 : index
%c1048576 = arith.constant 1048576 : index
%c-1_i64 = arith.constant -1 : i64
%c-1_i32 = arith.constant -1 : i32
%c16 = arith.constant 16 : index
%c8 = arith.constant 8 : index
%c268435488_i32 = arith.constant 268435488 : i32
%_device_query_0 = util.global.load @_device_query_0 : i1
%_pipeline_layout_0 = util.global.load @_pipeline_layout_0 : !hal.pipeline_layout
%_executable__split_reduction_pass2_dispatch_0 = util.global.load @_executable__split_reduction_pass2_dispatch_0 : !hal.executable
%device = hal.ex.shared_device : !hal.device
%allocator = hal.device.allocator<%device : !hal.device> : !hal.allocator
%buffer = hal.allocator.allocate<%allocator : !hal.allocator> type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c67108864}
%cmd = hal.command_buffer.create device(%device : !hal.device) mode("OneShot|AllowInlineExecution") categories(Transfer) : !hal.command_buffer
hal.command_buffer.fill_buffer<%cmd : !hal.command_buffer> target(%buffer : !hal.buffer)[%c0, %c67108864] pattern(%c1_i32 : i32)
hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None")
hal.command_buffer.finalize<%cmd : !hal.command_buffer>
%0 = util.null : !hal.fence
%fence = hal.fence.create device(%device : !hal.device) flags("None") : !hal.fence
hal.device.queue.execute<%device : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence) commands([%cmd])
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.status.check_ok %status, "failed to wait on timepoint"
%1 = util.do_not_optimize(%buffer) : !hal.buffer
%len = hal.buffer.length<%1 : !hal.buffer> : index
%buffer_0 = hal.allocator.allocate<%allocator : !hal.allocator> type("HostVisible|DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage|MappingScoped|MappingAccessRandom|Mapping") : !hal.buffer{%c1048576}
%cmd_1 = hal.command_buffer.create device(%device : !hal.device) mode("OneShot|AllowInlineExecution") categories("Transfer|Dispatch") : !hal.command_buffer
cf.cond_br %_device_query_0, ^bb1, ^bb2
^bb1: // pred: ^bb0
hal.command_buffer.push_descriptor_set<%cmd_1 : !hal.command_buffer> layout(%_pipeline_layout_0 : !hal.pipeline_layout)[%c0] bindings([
%c0 = (%1 : !hal.buffer)[%c0, %len],
%c1 = (%buffer_0 : !hal.buffer)[%c0, %c1048576]
])
hal.command_buffer.dispatch<%cmd_1 : !hal.command_buffer> target(%_executable__split_reduction_pass2_dispatch_0 : !hal.executable)[0] workgroups([%c8, %c16, %c1])
hal.command_buffer.fill_buffer<%cmd_1 : !hal.command_buffer> target(%buffer_0 : !hal.buffer)[%c524288, %c524288] pattern(%c128_i32 : i32)
hal.command_buffer.execution_barrier<%cmd_1 : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None")
hal.command_buffer.finalize<%cmd_1 : !hal.command_buffer>
%fence_2 = hal.fence.create device(%device : !hal.device) flags("None") : !hal.fence
hal.device.queue.execute<%device : !hal.device> affinity(%c-1_i64) wait(%0) signal(%fence_2) commands([%cmd_1])
%status_3 = hal.fence.await until([%fence_2]) timeout_millis(%c-1_i32) : i32
util.status.check_ok %status_3, "failed to wait on timepoint"
%view = hal.buffer_view.create buffer(%buffer_0 : !hal.buffer)[%c0, %c524288] shape([%c512, %c256]) type(%c268435488_i32) encoding(%c1_i32) : !hal.buffer_view
%view_4 = hal.buffer_view.create buffer(%buffer_0 : !hal.buffer)[%c524288, %c524288] shape([%c512, %c256]) type(%c268435488_i32) encoding(%c1_i32) : !hal.buffer_view
check.expect_eq(%view, %view_4) : !hal.buffer_view
return
^bb2: // pred: ^bb0
util.unreachable "device not supported in the compiled configuration"
}
}
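// The llvm.func in the dump above is the fully lowered dispatch kernel for the
// 512x256x128 -> 512x256 i32 sum reduction. The C sketch below is a rough, hypothetical
// rendering of what one workgroup computes: the names (reduce_tile, src, dst, wg_x, wg_y)
// are illustrative only, the loop order is simplified relative to the generated ^bb1..^bb7
// nest (which walks the output columns in groups of four), and it assumes the destination
// tile already holds its running value.

#include <stdint.h>

enum { D0 = 512, D1 = 256, D2 = 128, TILE = 32, VEC = 4 };

/* src: row-major 512x256x128 i32 input; dst: row-major 512x256 i32 output.
   wg_x / wg_y stand in for the workgroup ids read from
   iree_hal_executable_workgroup_state_v0_t; the hal.executable.export region above
   requests an 8 x 16 x 1 grid, i.e. 256/32 x 512/32 tiles of 32x32 outputs. */
void reduce_tile(const int32_t *src, int32_t *dst, int wg_x, int wg_y) {
  int row0 = wg_y * TILE;
  int col0 = wg_x * TILE;
  for (int i = 0; i < TILE; ++i) {
    for (int j = 0; j < TILE; ++j) {
      int32_t acc = dst[(row0 + i) * D1 + (col0 + j)];
      /* The reduction axis is consumed four elements at a time; the generated code keeps
         each group in a <4 x i32> register and finishes the group with
         llvm.intr.vector.reduce.add before accumulating into the output element. */
      for (int k = 0; k < D2; k += VEC) {
        int32_t partial = 0;
        for (int v = 0; v < VEC; ++v)
          partial += src[((row0 + i) * D1 + (col0 + j)) * D2 + k + v];
        acc += partial;
      }
      dst[(row0 + i) * D1 + (col0 + j)] = acc;
    }
  }
}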
// -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) ('func.func' operation: @_split_reduction_pass2) //----- //
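// Relative to the previous dump, the visible change is confined to @_split_reduction_pass2:
// the three util.global.load ops (@_device_query_0, @_pipeline_layout_0,
// @_executable__split_reduction_pass2_dispatch_0) are hoisted to the top of the function,
// ahead of the arith.constant ops; the executable body is unchanged.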
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
util.global private @_device_query_0 : i1
util.global private @_descriptor_set_layout_0 : !hal.descriptor_set_layout
util.global private @_pipeline_layout_0 : !hal.pipeline_layout
util.global private @_executable__split_reduction_pass2_dispatch_0 : !hal.executable
util.initializer {
%device = hal.ex.shared_device : !hal.device
%ok, %value = hal.device.query<%device : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false
%descriptor_set_layout = hal.descriptor_set_layout.create device(%device : !hal.device) flags("None") bindings([#hal.descriptor_set.binding<0, storage_buffer, ReadOnly>, #hal.descriptor_set.binding<1, storage_buffer>]) : !hal.descriptor_set_layout
%pipeline_layout = hal.pipeline_layout.create device(%device : !hal.device) push_constants(0) layouts([%descriptor_set_layout]) : !hal.pipeline_layout
util.global.store %descriptor_set_layout, @_descriptor_set_layout_0 : !hal.descriptor_set_layout
util.global.store %value, @_device_query_0 : i1
util.global.store %pipeline_layout, @_pipeline_layout_0 : !hal.pipeline_layout
cf.cond_br %value, ^bb1, ^bb2
^bb1: // pred: ^bb0
%_pipeline_layout_0 = util.global.load @_pipeline_layout_0 : !hal.pipeline_layout
%exe = hal.executable.create device(%device : !hal.device) target(@_split_reduction_pass2_dispatch_0::@embedded_elf_x86_64) layouts([%_pipeline_layout_0]) : !hal.executable
cf.br ^bb3(%exe : !hal.executable)
^bb2: // pred: ^bb0
%0 = util.null : !hal.executable
cf.br ^bb3(%0 : !hal.executable)
^bb3(%1: !hal.executable): // 2 preds: ^bb1, ^bb2
util.global.store %1, @_executable__split_reduction_pass2_dispatch_0 : !hal.executable
util.initializer.return
}
func.func @split_reduction_pass2() attributes {iree.abi.stub} {
call @_split_reduction_pass2() : () -> ()
return
}
hal.executable private @_split_reduction_pass2_dispatch_0 {
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
hal.executable.export public @_split_reduction_pass2_dispatch_0_generic_512x256x128 ordinal(0) layout(#pipeline_layout) attributes {translation_info = #translation} {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index):
%c1 = arith.constant 1 : index
%c16 = arith.constant 16 : index
%c8 = arith.constant 8 : index
hal.return %c8, %c16, %c1 : index, index, index
}
builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} {
llvm.func internal @_split_reduction_pass2_dispatch_0_generic_512x256x128(%arg0: !llvm.ptr<struct<"iree_hal_executable_environment_v0_t", (ptr<i32>, ptr<func<i32 (ptr<func<i32 (ptr<i8>)>>, ptr<i8>)>>, ptr<ptr<func<i32 (ptr<i8>)>>>, struct<"iree_hal_processor_v0_t", (array<8 x i64>)>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>> {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {sym_visibility = "private"} {
%0 = llvm.mlir.constant(0 : i32) : i32
%1 = llvm.mlir.constant(32768 : i64) : i64
%2 = llvm.mlir.constant(128 : i64) : i64
%3 = llvm.mlir.constant(256 : i64) : i64
%4 = llvm.mlir.constant(1 : i64) : i64
%5 = llvm.mlir.constant(63 : index) : i64
%6 = llvm.mlir.constant(256 : index) : i64
%7 = llvm.mlir.constant(32768 : index) : i64
%8 = llvm.mlir.constant(512 : index) : i64
%9 = llvm.mlir.constant(0 : i64) : i64
%10 = llvm.mlir.constant(dense<0> : vector<4xi32>) : vector<4xi32>
%11 = llvm.mlir.constant(0 : index) : i64
%12 = llvm.mlir.constant(dense<0> : vector<1xi32>) : vector<1xi32>
%13 = llvm.mlir.constant(128 : index) : i64
%14 = llvm.mlir.constant(4 : index) : i64
%15 = llvm.mlir.constant(1 : index) : i64
%16 = llvm.mlir.constant(32 : index) : i64
%17 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>>
%18 = llvm.extractvalue %17[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>
%19 = llvm.load %18 : !llvm.ptr<ptr<i8>>
%20 = llvm.bitcast %19 : !llvm.ptr<i8> to !llvm.ptr<i32>
%21 = llvm.ptrtoint %20 : !llvm.ptr<i32> to i64
%22 = llvm.and %21, %5 : i64
%23 = llvm.icmp "eq" %22, %11 : i64
"llvm.intr.assume"(%23) : (i1) -> ()
%24 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>>
%25 = llvm.extractvalue %24[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>
%26 = llvm.getelementptr %25[1] : (!llvm.ptr<ptr<i8>>) -> !llvm.ptr<ptr<i8>>
%27 = llvm.load %26 : !llvm.ptr<ptr<i8>>
%28 = llvm.bitcast %27 : !llvm.ptr<i8> to !llvm.ptr<i32>
%29 = llvm.ptrtoint %28 : !llvm.ptr<i32> to i64
%30 = llvm.and %29, %5 : i64
%31 = llvm.icmp "eq" %30, %11 : i64
"llvm.intr.assume"(%31) : (i1) -> ()
%32 = llvm.load %arg2 : !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>>
%33 = llvm.extractvalue %32[0] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>
%34 = llvm.zext %33 : i32 to i64
%35 = llvm.extractvalue %32[1] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>
%36 = llvm.zext %35 : i32 to i64
%37 = llvm.mul %36, %16 : i64
%38 = llvm.mul %34, %16 : i64
%39 = llvm.mlir.null : !llvm.ptr<i32>
%40 = llvm.getelementptr %39[4] : (!llvm.ptr<i32>) -> !llvm.ptr<i32>
%41 = llvm.ptrtoint %40 : !llvm.ptr<i32> to i64
%42 = llvm.alloca %41 x i32 {alignment = 128 : i64} : (i64) -> !llvm.ptr<i32>
%43 = llvm.mul %11, %14 : i64
%44 = llvm.add %43, %43 : i64
%45 = llvm.add %44, %11 : i64
%46 = llvm.getelementptr %42[%45] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32>
%47 = llvm.bitcast %46 : !llvm.ptr<i32> to !llvm.ptr<vector<4xi32>>
llvm.store %10, %47 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>>
%48 = llvm.bitcast %42 : !llvm.ptr<i32> to !llvm.ptr<vector<4xi32>>
%49 = llvm.load %48 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>>
%50 = llvm.mul %37, %6 : i64
%51 = llvm.add %50, %11 : i64
%52 = llvm.mul %38, %15 : i64
%53 = llvm.add %51, %52 : i64
%54 = llvm.mul %37, %7 : i64
%55 = llvm.add %54, %11 : i64
%56 = llvm.mul %38, %13 : i64
%57 = llvm.add %55, %56 : i64
%58 = llvm.mul %9, %15 : i64
%59 = llvm.add %57, %58 : i64
llvm.br ^bb1(%11 : i64)
^bb1(%60: i64): // 2 preds: ^bb0, ^bb10
%61 = llvm.icmp "slt" %60, %16 : i64
llvm.cond_br %61, ^bb2(%11 : i64), ^bb11
^bb2(%62: i64): // 2 preds: ^bb1, ^bb9
%63 = llvm.icmp "slt" %62, %16 : i64
llvm.cond_br %63, ^bb3, ^bb10
^bb3: // pred: ^bb2
%64 = llvm.mul %60, %1 : i64
%65 = llvm.add %59, %64 : i64
%66 = llvm.mul %62, %2 : i64
%67 = llvm.add %65, %66 : i64
%68 = llvm.mul %9, %4 : i64
%69 = llvm.add %67, %68 : i64
%70 = llvm.mul %60, %3 : i64
%71 = llvm.add %53, %70 : i64
%72 = llvm.mul %62, %4 : i64
%73 = llvm.add %71, %72 : i64
llvm.br ^bb4(%11 : i64)
^bb4(%74: i64): // 2 preds: ^bb3, ^bb8
%75 = llvm.icmp "slt" %74, %13 : i64
llvm.cond_br %75, ^bb5, ^bb9
^bb5: // pred: ^bb4
%76 = llvm.mul %9, %8 : i64
%77 = llvm.add %69, %76 : i64
%78 = llvm.mul %9, %13 : i64
%79 = llvm.add %77, %78 : i64
%80 = llvm.mul %74, %15 : i64
%81 = llvm.add %79, %80 : i64
llvm.br ^bb6(%11 : i64)
^bb6(%82: i64): // 2 preds: ^bb5, ^bb7
%83 = llvm.icmp "slt" %82, %14 : i64
llvm.cond_br %83, ^bb7, ^bb8
^bb7: // pred: ^bb6
%84 = llvm.add %81, %76 : i64
%85 = llvm.mul %82, %13 : i64
%86 = llvm.add %84, %85 : i64
%87 = llvm.add %86, %58 : i64
%88 = llvm.mul %9, %14 : i64
%89 = llvm.add %73, %88 : i64
%90 = llvm.mul %82, %15 : i64
%91 = llvm.add %89, %90 : i64
%92 = llvm.add %87, %43 : i64
%93 = llvm.add %92, %43 : i64
%94 = llvm.add %93, %43 : i64
%95 = llvm.add %94, %11 : i64
%96 = llvm.getelementptr %20[%95] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32>
%97 = llvm.bitcast %96 : !llvm.ptr<i32> to !llvm.ptr<vector<4xi32>>
%98 = llvm.load %97 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>>
%99 = llvm.add %98, %49 : vector<4xi32>
llvm.store %99, %48 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>>
%100 = llvm.add %43, %11 : i64
%101 = llvm.getelementptr %42[%100] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32>
%102 = llvm.bitcast %101 : !llvm.ptr<i32> to !llvm.ptr<vector<4xi32>>
%103 = llvm.load %102 {alignment = 4 : i64} : !llvm.ptr<vector<4xi32>>
%104 = llvm.getelementptr %28[%91] : (!llvm.ptr<i32>, i64) -> !llvm.ptr<i32>
%105 = llvm.load %104 : !llvm.ptr<i32>
%106 = llvm.mlir.undef : vector<1xi32>
%107 = llvm.insertelement %105, %106[%0 : i32] : vector<1xi32>
%108 = llvm.extractelement %107[%11 : i64] : vector<1xi32>
%109 = "llvm.intr.vector.reduce.add"(%103) : (vector<4xi32>) -> i32
%110 = llvm.add %108, %109 : i32
%111 = llvm.insertelement %110, %12[%11 : i64] : vector<1xi32>
%112 = llvm.extractelement %111[%9 : i64] : vector<1xi32>
%113 = llvm.insertelement %112, %106[%0 : i32] : vector<1xi32>
%114 = llvm.extractelement %113[%11 : i64] : vector<1xi32>
llvm.store %114, %104 : !llvm.ptr<i32>
%115 = llvm.add %82, %15 : i64
llvm.br ^bb6(%115 : i64)
^bb8: // pred: ^bb6
%116 = llvm.add %74, %14 : i64
llvm.br ^bb4(%116 : i64)
^bb9: // pred: ^bb4
%117 = llvm.add %62, %14 : i64
llvm.br ^bb2(%117 : i64)
^bb10: // pred: ^bb2
%118 = llvm.add %60, %15 : i64
llvm.br ^bb1(%118 : i64)
^bb11: // pred: ^bb1
llvm.return %0 : i32
}
}
}
}
func.func private @_split_reduction_pass2() {
%_device_query_0 = util.global.load @_device_query_0 : i1
%_pipeline_layout_0 = util.global.load @_pipeline_layout_0 : !hal.pipeline_layout
%_executable__split_reduction_pass2_dispatch_0 = util.global.load @_executable__split_reduction_pass2_dispatch_0 : !hal.executable
%c1 = arith.constant 1 : index
%c256 = arith.constant 256 : index
%c512 = arith.constant 512 : index
%c128_i32 = arith.constant 128 : i32
%c1_i32 = arith.constant 1 : i32
%c67108864 = arith.constant 67108864 : index
%c524288 = arith.constant 524288 : index
%c0 = arith.constant 0 : index
%c1048576 = arith.constant 1048576 : index
%c-1_i64 = arith.constant -1 : i64
%c-1_i32 = arith.constant -1 : i32
%c16 = arith.constant 16 : index
%c8 = arith.constant 8 : index
%c268435488_i32 = arith.constant 268435488 : i32
%device = hal.ex.shared_device : !hal.device
%allocator = hal.device.allocator<%device : !hal.device> : !hal.allocator
%buffer = hal.allocator.allocate<%allocator : !hal.allocator> type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c67108864}
%cmd = hal.command_buffer.create device(%device : !hal.device) mode("OneShot|AllowInlineExecution") categories(Transfer) : !hal.command_buffer
hal.command_buffer.fill_buffer<%cmd : !hal.command_buffer> target(%buffer : !hal.buffer)[%c0, %c67108864] pattern(%c1_i32 : i32)
hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None")
hal.command_buffer.finalize<%cmd : !hal.command_buffer>
%0 = util.null : !hal.fence