jaredhoberock/reduce.hpp

## reduce.hpp
// Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
//  * Redistributions of source code must retain the above copyright
//    notice, this list of conditions and the following disclaimer.
//  * Redistributions in binary form must reproduce the above copyright
//    notice, this list of conditions and the following disclaimer in the
//    documentation and/or other materials provided with the distribution.
//  * Neither the name of NVIDIA CORPORATION nor the names of its
//    contributors may be used to endorse or promote products derived
//    from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

#include <experimental/execution>
#include <algorithm>
#include <cstddef>
#include <execution>
#include <iterator>
#include <numeric>
#include <tuple>
#include <vector>

// Sequential overload of reduce: folds [first, last) into init from left to
// right on the calling thread.
//
// The unnamed std::execution::sequenced_policy parameter is used purely to
// select this overload; it carries no state.
//
// first, last - the range of elements to combine
// init        - the starting value of the fold
// binary_op   - invoked as binary_op(accumulated, element) for each element in order
//
// Returns init unchanged when the range is empty.
template<class Iterator, class T, class BinaryOperation>
T reduce(std::execution::sequenced_policy, Iterator first, Iterator last, T init, BinaryOperation binary_op)
{
  // explicit left fold; this is the same evaluation order std::accumulate uses
  for(; first != last; ++first)
  {
    init = binary_op(init, *first);
  }

  return init;
}

template<class ExecutionPolicy, class RandomAccessIterator, class T, class BinaryOperation>
T reduce(ExecutorPolicy&& policy, RandomAccessIterator first, RandomAccessIterator last, T init, BinaryOperation binary_op)
{
  using namespace std::experimental;

  // create a view of the input
  //auto input = make_iterator_range(first, last);

  // divide the input into a number of tiles approximately equal to the executor's unit_shape
  //auto tiles = tile_evenly(input, execution::query(execution::unit_shape, policy.executor()));

  // XXX ideally, we'd partition the input into a number of tiles proportional to the "unit_shape" of the executor
  //     the idea behind this property is somewhat analogous to what std::thread::hardware_concurrency() reports
  //     for example, a thread pool executor would probably return the number of threads in the pool
  //     since we don't have such a property, arbitrarily choose 16

  size_t desired_num_tiles = 16;
  size_t tile_size = (std::distance(first, last) + desired_num_tiles - 1) / desired_num_tiles;
  size_t num_tiles = (std::distance(first, last) + tile_size - 1) / tile_size;

  // XXX could use the executor's associated allocator for this vector
  using partial_sums_type = std::vector<T>;

  auto ex = execution::require(policy.executor(), execution::bulk, execution::twoway);

  partial_sums_type partial_sums = ex.bulk_twoway_execute(
    [=](size_t i, partial_sums_type& partial_sums, auto&) {
      //// get this agent's tile
      //auto this_tile = tiles[i];

      //// compute the sum of this tile
      //partial_sums[i] = std::reduce(execution::seq, this_tile.begin() + 1, this_tile.end(), this_tile[0], binary_op);

      // get this agent's tile
      auto my_first = first + tile_size * i;
      auto my_last = std::min(my_first + tile_size, last);

      // compute the sum of this tile
      partial_sums[i] = std::reduce(std::execution::seq, my_first + 1, my_last, *my_first, binary_op);
    },
    num_partial_sums,
    [=]{ return partial_sums_type(num_tiles); }, // the result factory creates a vector of partial sums
    []{ return std::ignore; }                    // the shared factory creates nothing interesting
  ).get();

  // XXX another option would be to put this sequenced reduction inside a .then_execute and wait on its resulting future

  // return the sum of partial sums, execute in this thread
  return std::reduce(std::execution::seq, partial_sums.begin(), partial_sums.end(), init, binary_op);
}
	// Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
	//
	// Redistribution and use in source and binary forms, with or without
	// modification, are permitted provided that the following conditions
	// are met:
	// * Redistributions of source code must retain the above copyright
	// notice, this list of conditions and the following disclaimer.
	// * Redistributions in binary form must reproduce the above copyright
	// notice, this list of conditions and the following disclaimer in the
	// documentation and/or other materials provided with the distribution.
	// * Neither the name of NVIDIA CORPORATION nor the names of its
	// contributors may be used to endorse or promote products derived
	// from this software without specific prior written permission.
	//
	// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
	// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
	// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
	// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
	// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
	// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
	// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
	// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
	// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
	// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
	// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

	#include <experimental/execution>
	#include <execution>
	#include <numeric>
	#include <algorithm>
	#include <tuple>
	#include <iterator>

	// Sequential overload of reduce: folds [first, last) into init from left to
	// right on the calling thread.
	//
	// The unnamed std::execution::sequenced_policy parameter is used purely to
	// select this overload; it carries no state.
	//
	// first, last - the range of elements to combine
	// init        - the starting value of the fold
	// binary_op   - invoked as binary_op(accumulated, element) for each element in order
	//
	// Returns init unchanged when the range is empty.
	template<class Iterator, class T, class BinaryOperation>
	T reduce(std::execution::sequenced_policy, Iterator first, Iterator last, T init, BinaryOperation binary_op)
	{
	  // explicit left fold; this is the same evaluation order std::accumulate uses
	  for(; first != last; ++first)
	  {
	    init = binary_op(init, *first);
	  }

	  return init;
	}

	template<class ExecutionPolicy, class RandomAccessIterator, class T, class BinaryOperation>
	T reduce(ExecutorPolicy&& policy, RandomAccessIterator first, RandomAccessIterator last, T init, BinaryOperation binary_op)
	{
	using namespace std::experimental;

	// create a view of the input
	//auto input = make_iterator_range(first, last);

	// divide the input into a number of tiles approximately equal to the executor's unit_shape
	//auto tiles = tile_evenly(input, execution::query(execution::unit_shape, policy.executor()));

	// XXX ideally, we'd partition the input into a number of tiles proportional to the "unit_shape" of the executor
	// the idea behind this property is somewhat analogous to what std::thread::hardware_concurrency() reports
	// for example, a thread pool executor would probably return the number of threads in the pool
	// since we don't have such a property, arbitrarily choose 16

	size_t desired_num_tiles = 16;
	size_t tile_size = (std::distance(first, last) + desired_num_tiles - 1) / desired_num_tiles;
	size_t num_tiles = (std::distance(first, last) + tile_size - 1) / tile_size;

	// XXX could use the executor's associated allocator for this vector
	using partial_sums_type = std::vector<T>;

	auto ex = execution::require(policy.executor(), execution::bulk, execution::twoway);

	partial_sums_type partial_sums = ex.bulk_twoway_execute(
	[=](size_t i, partial_sums_type& partial_sums, auto&) {
	//// get this agent's tile
	//auto this_tile = tiles[i];

	//// compute the sum of this tile
	//partial_sums[i] = std::reduce(execution::seq, this_tile.begin() + 1, this_tile.end(), this_tile[0], binary_op);

	// get this agent's tile
	auto my_first = first + tile_size * i;
	auto my_last = std::min(my_first + tile_size, last);

	// compute the sum of this tile
	partial_sums[i] = std::reduce(std::execution::seq, my_first + 1, my_last, *my_first, binary_op);
	},
	num_partial_sums,
	[=]{ return partial_sums_type(num_tiles); }, // the result factory creates a vector of partial sums
	[]{ return std::ignore; } // the shared factory creates nothing interesting
	).get();

	// XXX another option would be to put this sequenced reduction inside a .then_execute and wait on its resulting future

	// return the sum of partial sums, execute in this thread
	return std::reduce(std::execution::seq, partial_sums.begin(), partial_sums.end(), init, binary_op);
	}