@RiccardoRossi
Created February 9, 2021 06:57

#include <vector>
#include <array>
#include <iostream>
#include <chrono>
#include <cassert>
#include <cmath>
#include <algorithm>   // std::min
#include <type_traits> // std::is_copy_constructible
#include <omp.h>

template<
    class TContainerType,
    class TIteratorType=typename TContainerType::iterator,
    int TMaxThreads=128
    >
class BlockPartition
{
public:
    /** @param it_begin - iterator pointing at the beginning of the container
     *  @param it_end - iterator pointing to the end of the container
     *  @param Nchunks - number of chunks the range is split into, one per thread by default (must be lower than TMaxThreads)
     */
    BlockPartition(TIteratorType it_begin,
                   TIteratorType it_end,
                   int Nchunks = omp_get_max_threads())
    {
        //KRATOS_ERROR_IF(Nchunks < 1) << "Number of chunks must be > 0 (and not " << Nchunks << ")" << std::endl;
        const std::ptrdiff_t size_container = it_end-it_begin;
        if (size_container == 0) {
            mNchunks = Nchunks;
        } else {
            // in case the container is smaller than the number of chunks
            mNchunks = std::min(static_cast<int>(size_container), Nchunks);
        }
        const std::ptrdiff_t block_partition_size = size_container / mNchunks;
        mBlockPartition[0] = it_begin;
        mBlockPartition[mNchunks] = it_end;
        for (int i=1; i<mNchunks; i++) {
            mBlockPartition[i] = mBlockPartition[i-1] + block_partition_size;
        }
    }

    /** @param rData - the container to be iterated upon
     *  @param Nchunks - number of chunks the container is split into, one per thread by default (must be lower than TMaxThreads)
     */
    BlockPartition(TContainerType& rData,
                   int Nchunks = omp_get_max_threads())
        : BlockPartition(rData.begin(), rData.end(), Nchunks)
    {}

    virtual ~BlockPartition() = default;

    /** @brief simple iteration loop. f called on every entry in rData
     *  @param f - must be a unary function accepting as input TContainerType::value_type&
     */
    template <class TUnaryFunction>
    inline void for_each(TUnaryFunction&& f)
    {
        #pragma omp parallel for
        for (int i=0; i<mNchunks; ++i) {
            for (auto it = mBlockPartition[i]; it != mBlockPartition[i+1]; ++it) {
                f(*it); // note that we pass the value to the function, not the iterator
            }
        }
    }

    /** @brief loop allowing reductions. f called on every entry in rData
     *  the function f needs to return the values to be used by the reducer
     *  @param TReducer template parameter specifying the reduction operation to be done
     *  @param f - must be a unary function accepting as input TContainerType::value_type&
     */
    template <class TReducer, class TUnaryFunction>
    inline typename TReducer::value_type for_each(TUnaryFunction &&f)
    {
        TReducer global_reducer;
        #pragma omp parallel for
        for (int i=0; i<mNchunks; ++i) {
            TReducer local_reducer;
            for (auto it = mBlockPartition[i]; it != mBlockPartition[i+1]; ++it) {
                local_reducer.LocalReduce(f(*it));
            }
            global_reducer.ThreadSafeReduce(local_reducer);
        }
        return global_reducer.GetValue();
    }

    /** @brief loop with thread local storage (TLS). f called on every entry in rData
     *  @param TThreadLocalStorage template parameter specifying the thread local storage
     *  @param f - must be a function accepting as input TContainerType::value_type& and the thread local storage
     */
    template <class TThreadLocalStorage, class TFunction>
    inline void for_each(const TThreadLocalStorage& rThreadLocalStoragePrototype, TFunction &&f)
    {
        static_assert(std::is_copy_constructible<TThreadLocalStorage>::value, "TThreadLocalStorage must be copy constructible!");
        #pragma omp parallel
        {
            // copy the prototype to create the thread local storage
            TThreadLocalStorage thread_local_storage(rThreadLocalStoragePrototype);
            #pragma omp for
            for (int i=0; i<mNchunks; ++i) {
                for (auto it = mBlockPartition[i]; it != mBlockPartition[i+1]; ++it) {
                    f(*it, thread_local_storage); // note that we pass the value to the function, not the iterator
                }
            }
        }
    }

    /** @brief loop with thread local storage (TLS) allowing reductions. f called on every entry in rData
     *  the function f needs to return the values to be used by the reducer
     *  @param TReducer template parameter specifying the reduction operation to be done
     *  @param TThreadLocalStorage template parameter specifying the thread local storage
     *  @param f - must be a function accepting as input TContainerType::value_type& and the thread local storage
     */
    template <class TReducer, class TThreadLocalStorage, class TFunction>
    inline typename TReducer::value_type for_each(const TThreadLocalStorage& rThreadLocalStoragePrototype, TFunction &&f)
    {
        static_assert(std::is_copy_constructible<TThreadLocalStorage>::value, "TThreadLocalStorage must be copy constructible!");
        TReducer global_reducer;
        #pragma omp parallel
        {
            // copy the prototype to create the thread local storage
            TThreadLocalStorage thread_local_storage(rThreadLocalStoragePrototype);
            #pragma omp for
            for (int i=0; i<mNchunks; ++i) {
                TReducer local_reducer;
                for (auto it = mBlockPartition[i]; it != mBlockPartition[i+1]; ++it) {
                    local_reducer.LocalReduce(f(*it, thread_local_storage));
                }
                global_reducer.ThreadSafeReduce(local_reducer);
            }
        }
        return global_reducer.GetValue();
    }

private:
    int mNchunks;
    std::array<TIteratorType, TMaxThreads> mBlockPartition;
};
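
// ExampleSumReducer is a hypothetical, minimal sketch added here for illustration:
// it satisfies the interface the reduction overloads of for_each expect
// (value_type, LocalReduce, ThreadSafeReduce, GetValue) by accumulating a plain sum.
class ExampleSumReducer
{
public:
    typedef double value_type;

    // value accumulated so far by this reducer instance
    value_type GetValue() const
    {
        return mValue;
    }

    // called once per entry, within a single thread: no synchronization needed
    void LocalReduce(double value)
    {
        mValue += value;
    }

    // merges a per-thread partial result into the global reducer; must be thread safe
    void ThreadSafeReduce(const ExampleSumReducer& rOther)
    {
        #pragma omp atomic
        mValue += rOther.mValue;
    }

private:
    double mValue = 0.0;
};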

/** @brief simplified version of the basic loop (without reduction) to enable template type deduction
 *  @param v - container to be looped upon
 *  @param func - must be a unary function accepting as input TContainerType::value_type&
 */
template <class TContainerType, class TFunctionType>
void block_for_each(TContainerType &&v, TFunctionType &&func)
{
    BlockPartition<typename std::decay<TContainerType>::type>(std::forward<TContainerType>(v)).for_each(std::forward<TFunctionType>(func));
}

int main() {
    std::vector<double> a{1.0,2.0,3.0};
    const auto& const_a = a;

    block_for_each(a, [&](double& item)
    {
        std::cout << item << std::endl;
    });
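
    // Illustrative usage of the reduction overload with the hypothetical
    // ExampleSumReducer defined above: sums all entries of a in parallel.
    const double sum = BlockPartition<std::vector<double>>(a).for_each<ExampleSumReducer>(
        [](double& item) { return item; });
    std::cout << "sum = " << sum << std::endl;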

    // note: the call below does not compile as written, since std::decay strips the
    // const and the BlockPartition constructor then expects a non-const reference
    // to the container
    // block_for_each(const_a, [&](const double& item)
    // {
    //     std::cout << item << std::endl;
    // });
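
    // Illustrative usage of the thread-local-storage overload: each thread gets its
    // own copy of the prototype buffer and can use it as scratch space without
    // synchronization.
    std::vector<double> tls_prototype(3, 0.0);
    BlockPartition<std::vector<double>>(a).for_each(tls_prototype,
        [](double& item, std::vector<double>& rTLS)
        {
            rTLS[0] = 2.0*item; // work on the thread-private buffer
        });

    // Illustrative usage of the combined TLS + reduction overload, reusing the
    // hypothetical ExampleSumReducer: sums 2*item while giving each thread a
    // private scratch buffer.
    const double doubled_sum = BlockPartition<std::vector<double>>(a).for_each<ExampleSumReducer>(
        tls_prototype,
        [](double& item, std::vector<double>& rTLS)
        {
            rTLS[0] = 2.0*item;
            return rTLS[0];
        });
    std::cout << "doubled_sum = " << doubled_sum << std::endl;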

    return 0;
}