@RiccardoRossi
Created February 9, 2021 06:57

#include <vector>
#include <array>
#include <iostream>
#include <chrono>
#include <cassert>
#include <cmath>
#include <algorithm>   // std::min
#include <type_traits> // std::is_copy_constructible
#include <omp.h>

template<
    class TContainerType,
    class TIteratorType=typename TContainerType::iterator,
    int TMaxThreads=128
    >
class BlockPartition
{
public:
    /** @param it_begin - iterator pointing at the beginning of the container
     *  @param it_end - iterator pointing to the end of the container
     *  @param Nchunks - number of chunks the range is split into, one per thread by default (must be lower than TMaxThreads)
     */
    BlockPartition(TIteratorType it_begin,
                   TIteratorType it_end,
                   int Nchunks = omp_get_max_threads())
    {
        //KRATOS_ERROR_IF(Nchunks < 1) << "Number of chunks must be > 0 (and not " << Nchunks << ")" << std::endl;
        const std::ptrdiff_t size_container = it_end-it_begin;
        if (size_container == 0) {
            mNchunks = Nchunks;
        } else {
            // in case the container is smaller than the number of chunks
            mNchunks = std::min(static_cast<int>(size_container), Nchunks);
        }
        const std::ptrdiff_t block_partition_size = size_container / mNchunks;
        mBlockPartition[0] = it_begin;
        mBlockPartition[mNchunks] = it_end;
        for (int i=1; i<mNchunks; i++) {
            mBlockPartition[i] = mBlockPartition[i-1] + block_partition_size;
        }
    }

    /** @param rData - the container to be iterated upon
     *  @param Nchunks - number of chunks the container is split into, one per thread by default (must be lower than TMaxThreads)
     */
    BlockPartition(TContainerType& rData,
                   int Nchunks = omp_get_max_threads())
        : BlockPartition(rData.begin(), rData.end(), Nchunks)
    {}

    virtual ~BlockPartition() = default;

    /** @brief simple iteration loop. f called on every entry in rData
     *  @param f - must be a unary function accepting as input TContainerType::value_type&
     */
    template <class TUnaryFunction>
    inline void for_each(TUnaryFunction&& f)
    {
        #pragma omp parallel for
        for (int i=0; i<mNchunks; ++i) {
            for (auto it = mBlockPartition[i]; it != mBlockPartition[i+1]; ++it) {
                f(*it); // note that we pass the value to the function, not the iterator
            }
        }
    }

    /** @brief loop allowing reductions. f called on every entry in rData
     *  the function f needs to return the values to be used by the reducer
     *  @param TReducer template parameter specifying the reduction operation to be done
     *  @param f - must be a unary function accepting as input TContainerType::value_type&
     */
    template <class TReducer, class TUnaryFunction>
    inline typename TReducer::value_type for_each(TUnaryFunction &&f)
    {
        TReducer global_reducer;
        #pragma omp parallel for
        for (int i=0; i<mNchunks; ++i) {
            TReducer local_reducer;
            for (auto it = mBlockPartition[i]; it != mBlockPartition[i+1]; ++it) {
                local_reducer.LocalReduce(f(*it));
            }
            global_reducer.ThreadSafeReduce(local_reducer);
        }
        return global_reducer.GetValue();
    }

    /** @brief loop with thread local storage (TLS). f called on every entry in rData
     *  @param TThreadLocalStorage template parameter specifying the thread local storage
     *  @param f - must be a function accepting as input TContainerType::value_type& and the thread local storage
     */
    template <class TThreadLocalStorage, class TFunction>
    inline void for_each(const TThreadLocalStorage& rThreadLocalStoragePrototype, TFunction &&f)
    {
        static_assert(std::is_copy_constructible<TThreadLocalStorage>::value, "TThreadLocalStorage must be copy constructible!");
        #pragma omp parallel
        {
            // copy the prototype to create the thread local storage
            TThreadLocalStorage thread_local_storage(rThreadLocalStoragePrototype);
            #pragma omp for
            for (int i=0; i<mNchunks; ++i) {
                for (auto it = mBlockPartition[i]; it != mBlockPartition[i+1]; ++it) {
                    f(*it, thread_local_storage); // note that we pass the value to the function, not the iterator
                }
            }
        }
    }

    /** @brief loop with thread local storage (TLS) allowing reductions. f called on every entry in rData
     *  the function f needs to return the values to be used by the reducer
     *  @param TReducer template parameter specifying the reduction operation to be done
     *  @param TThreadLocalStorage template parameter specifying the thread local storage
     *  @param f - must be a function accepting as input TContainerType::value_type& and the thread local storage
     */
    template <class TReducer, class TThreadLocalStorage, class TFunction>
    inline typename TReducer::value_type for_each(const TThreadLocalStorage& rThreadLocalStoragePrototype, TFunction &&f)
    {
        static_assert(std::is_copy_constructible<TThreadLocalStorage>::value, "TThreadLocalStorage must be copy constructible!");
        TReducer global_reducer;
        #pragma omp parallel
        {
            // copy the prototype to create the thread local storage
            TThreadLocalStorage thread_local_storage(rThreadLocalStoragePrototype);
            #pragma omp for
            for (int i=0; i<mNchunks; ++i) {
                TReducer local_reducer;
                for (auto it = mBlockPartition[i]; it != mBlockPartition[i+1]; ++it) {
                    local_reducer.LocalReduce(f(*it, thread_local_storage));
                }
                global_reducer.ThreadSafeReduce(local_reducer);
            }
        }
        return global_reducer.GetValue();
    }

private:
    int mNchunks;
    std::array<TIteratorType, TMaxThreads> mBlockPartition;
};
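
// ExampleSumReducer is a hypothetical, minimal sketch added here for illustration:
// it satisfies the interface the reduction overloads of for_each expect
// (value_type, LocalReduce, ThreadSafeReduce, GetValue) by accumulating a plain sum.
class ExampleSumReducer
{
public:
    typedef double value_type;

    // value accumulated so far by this reducer instance
    value_type GetValue() const
    {
        return mValue;
    }

    // called once per entry, within a single thread: no synchronization needed
    void LocalReduce(double value)
    {
        mValue += value;
    }

    // merges a per-thread partial result into the global reducer; must be thread safe
    void ThreadSafeReduce(const ExampleSumReducer& rOther)
    {
        #pragma omp atomic
        mValue += rOther.mValue;
    }

private:
    double mValue = 0.0;
};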

/** @brief simplified version of the basic loop (without reduction) to enable template type deduction
 *  @param v - container to be looped upon
 *  @param func - must be a unary function accepting as input TContainerType::value_type&
 */
template <class TContainerType, class TFunctionType>
void block_for_each(TContainerType &&v, TFunctionType &&func)
{
    BlockPartition<typename std::decay<TContainerType>::type>(std::forward<TContainerType>(v)).for_each(std::forward<TFunctionType>(func));
}

int main() {
    std::vector<double> a{1.0,2.0,3.0};
    const auto& const_a = a;

    block_for_each(a, [&](double& item)
    {
        std::cout << item << std::endl;
    });
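
    // Illustrative usage of the reduction overload with the hypothetical
    // ExampleSumReducer defined above: sums all entries of a in parallel.
    const double sum = BlockPartition<std::vector<double>>(a).for_each<ExampleSumReducer>(
        [](double& item) { return item; });
    std::cout << "sum = " << sum << std::endl;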

    // note: the call below does not compile as written, since std::decay strips the
    // const and the BlockPartition constructor then expects a non-const reference
    // to the container
    // block_for_each(const_a, [&](const double& item)
    // {
    //     std::cout << item << std::endl;
    // });
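
    // Illustrative usage of the thread-local-storage overload: each thread gets its
    // own copy of the prototype buffer and can use it as scratch space without
    // synchronization.
    std::vector<double> tls_prototype(3, 0.0);
    BlockPartition<std::vector<double>>(a).for_each(tls_prototype,
        [](double& item, std::vector<double>& rTLS)
        {
            rTLS[0] = 2.0*item; // work on the thread-private buffer
        });

    // Illustrative usage of the combined TLS + reduction overload, reusing the
    // hypothetical ExampleSumReducer: sums 2*item while giving each thread a
    // private scratch buffer.
    const double doubled_sum = BlockPartition<std::vector<double>>(a).for_each<ExampleSumReducer>(
        tls_prototype,
        [](double& item, std::vector<double>& rTLS)
        {
            rTLS[0] = 2.0*item;
            return rTLS[0];
        });
    std::cout << "doubled_sum = " << doubled_sum << std::endl;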

    return 0;
}