Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
Possible implementation of std::reduce via executors
// Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include <algorithm>
#include <cstddef>
#include <execution>
#include <iterator>
#include <numeric>
#include <tuple>
#include <vector>

#include <experimental/execution>
// Sequential overload, selected when the first argument is a
// std::execution::sequenced_policy (the policy object itself is unused).
//
// Folds [first, last) from left to right on the calling thread, starting
// from `init`:
//
//   binary_op(...binary_op(binary_op(init, *first), *(first + 1))..., *(last - 1))
//
// Returns `init` unchanged when the range is empty.
template<class Iterator, class T, class BinaryOperation>
T reduce(std::execution::sequenced_policy, Iterator first, Iterator last, T init, BinaryOperation binary_op)
{
  // strict left fold: the running total is always the left operand
  for (; first != last; ++first)
  {
    init = binary_op(init, *first);
  }

  return init;
}
template<class ExecutionPolicy, class RandomAccessIterator, class T, class BinaryOperation>
T reduce(ExecutorPolicy&& policy, RandomAccessIterator first, RandomAccessIterator last, T init, BinaryOperation binary_op)
{
using namespace std::experimental;
// create a view of the input
//auto input = make_iterator_range(first, last);
// divide the input into a number of tiles approximately equal to the executor's unit_shape
//auto tiles = tile_evenly(input, execution::query(execution::unit_shape, policy.executor()));
// XXX ideally, we'd partition the input into a number of tiles proportional to the "unit_shape" of the executor
// the idea behind this property is somewhat analogous to what std::thread::hardware_concurrency() reports
// for example, a thread pool executor would probably return the number of threads in the pool
// since we don't have such a property, arbitrarily choose 16
size_t desired_num_tiles = 16;
size_t tile_size = (std::distance(first, last) + desired_num_tiles - 1) / desired_num_tiles;
size_t num_tiles = (std::distance(first, last) + tile_size - 1) / tile_size;
// XXX could use the executor's associated allocator for this vector
using partial_sums_type = std::vector<T>;
auto ex = execution::require(policy.executor(), execution::bulk, execution::twoway);
partial_sums_type partial_sums = ex.bulk_twoway_execute(
[=](size_t i, partial_sums_type& partial_sums, auto&) {
//// get this agent's tile
//auto this_tile = tiles[i];
//// compute the sum of this tile
//partial_sums[i] = std::reduce(execution::seq, this_tile.begin() + 1, this_tile.end(), this_tile[0], binary_op);
// get this agent's tile
auto my_first = first + tile_size * i;
auto my_last = std::min(my_first + tile_size, last);
// compute the sum of this tile
partial_sums[i] = std::reduce(std::execution::seq, my_first + 1, my_last, *my_first, binary_op);
},
num_partial_sums,
[=]{ return partial_sums_type(num_tiles); }, // the result factory creates a vector of partial sums
[]{ return std::ignore; } // the shared factory creates nothing interesting
).get();
// XXX another option would be to put this sequenced reduction inside a .then_execute and wait on its resulting future
// return the sum of partial sums, execute in this thread
return std::reduce(std::execution::seq, partial_sums.begin(), partial_sums.end(), init, binary_op);
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.