yc0/parallel_accumulate.cpp

## parallel_accumulate.cpp
/**
MacOS : clang++ -Wall -std=c++17 parallel_accumulate.cpp -O3 -o out
----------------------------
num of cores : (8) w/o task switching
1166029 (ns)
the anwser : 45000497
2564850 (ns)
the anwser : 45000497
----------------------------
**/
#include <time.h>

#include <algorithm>
#include <chrono>
#include <cstddef>
#include <cstdlib>
#include <functional>
#include <iostream>
#include <iterator>
#include <numeric>
#include <thread>
#include <vector>

#define MIN_BLOCK_SIZE 1000

/**
 * Worker routine: adds the sum of [start, end) onto `ref`.
 *
 * The fold is seeded with a value-initialised T instead of the int literal 0,
 * so the summation is carried out in T and element types wider than int
 * (double, long long, ...) are not truncated.
 *
 * @param start first element of the block
 * @param end   one past the last element of the block
 * @param ref   accumulator that receives the block's sum (modified in place)
 */
template <typename iterator, typename T>
void accumulate(iterator start, iterator end, T &ref)
{
    ref += std::accumulate(start, end, T{});
}

/**
 * Sums [start, end) using several threads and returns `ref` plus that sum.
 *
 * The thread count is the smaller of (element count / MIN_BLOCK_SIZE) and
 * std::thread::hardware_concurrency(). When that comes out below two
 * (short or empty range, or hardware_concurrency() reporting 0), the range
 * is summed on the calling thread; otherwise the calling thread handles the
 * final block while the other blocks run on std::thread workers.
 *
 * The range is traversed more than once, so `iterator` must be at least a
 * forward iterator.
 *
 * @param start first element of the range
 * @param end   one past the last element of the range
 * @param ref   initial value of the sum; it is read but not modified
 * @return      ref + sum of all elements in [start, end)
 */
template <typename iterator, typename T>
T parallel_accumulate(iterator start, iterator end, T &ref)
{
    using diff_t = typename std::iterator_traits<iterator>::difference_type;

    const diff_t sz = std::distance(start, end);
    const diff_t by_elements = sz / MIN_BLOCK_SIZE;
    // hardware_concurrency() is only a hint and is allowed to return 0.
    const diff_t by_hardware = static_cast<diff_t>(std::thread::hardware_concurrency());
    const diff_t num_threads = std::min(by_elements, by_hardware);

    // Fewer than two threads: nothing to split (and dividing by a zero
    // thread count must be avoided), so sum on the calling thread.
    if (num_threads < 2)
        return std::accumulate(start, end, ref);

    // Every spawned worker gets block_size elements; the calling thread
    // takes the last block together with the division remainder.
    const diff_t block_size = sz / num_threads;
    const std::size_t workers = static_cast<std::size_t>(num_threads - 1);

    std::vector<T> rst(workers + 1); // one value-initialised partial sum per block
    std::vector<std::thread> threads;
    threads.reserve(workers);

    try {
        for (std::size_t i = 0; i < workers; ++i) {
            iterator last = start;
            std::advance(last, block_size);
            threads.emplace_back(accumulate<iterator, T>, start, last, std::ref(rst[i]));
            start = last;
        }
        rst[workers] = std::accumulate(start, end, T{});
    } catch (...) {
        // Destroying a joinable std::thread calls std::terminate, so the
        // workers that did start are joined before the error propagates.
        for (std::thread &worker : threads)
            worker.join();
        throw;
    }

    for (std::thread &worker : threads)
        worker.join();
    return std::accumulate(std::begin(rst), std::end(rst), ref);
}


int main() {
    srand(time(NULL));

    const int sz = 10000000;
    int *data = new int[sz];

    for(int i=0; i < sz; i++)
        data[i] = rand() %10;

    std::cout << "num of cores : (" << std::thread::hardware_concurrency()
        << ") w/o task switching" << std::endl;

    int ans = 0,
        ref  = 0;
    auto start = std::chrono::high_resolution_clock::now();
    ans = parallel_accumulate<int*, int>((int *) data, reinterpret_cast<int *>(data + sz), ref);
    auto finish = std::chrono::high_resolution_clock::now();
    std::cout << std::chrono::duration_cast<std::chrono::nanoseconds>(finish-start).count() << " (ns)\n";
    std::cout << "the anwser : " << ans << std::endl;


    ref = 0;
    ans = 0;
    start = std::chrono::high_resolution_clock::now();
    ans = std::accumulate((int *) data, reinterpret_cast<int *>(data + sz), ref);
    finish = std::chrono::high_resolution_clock::now();
    std::cout << std::chrono::duration_cast<std::chrono::nanoseconds>(finish-start).count() << " (ns)\n";
    std::cout << "the anwser : " << ans << std::endl;

    delete [] data;
}
	/**
	MacOS : clang++ -Wall -std=c++17 parallel_accumulate.cpp -O3 -o out
	----------------------------
	num of cores : (8) w/o task switching
	1166029 (ns)
	the anwser : 45000497
	2564850 (ns)
	the anwser : 45000497
	----------------------------
	**/
	#include <iostream>
	#include <thread>
	#include <numeric>
	#include <vector>
	#include <time.h>

	#define MIN_BLOCK_SIZE 1000

	/**
	 * Worker routine: adds the sum of [start, end) onto `ref`.
	 *
	 * The fold is seeded with a value-initialised T instead of the int literal 0,
	 * so the summation is carried out in T and element types wider than int
	 * (double, long long, ...) are not truncated.
	 *
	 * @param start first element of the block
	 * @param end   one past the last element of the block
	 * @param ref   accumulator that receives the block's sum (modified in place)
	 */
	template <typename iterator, typename T>
	void accumulate(iterator start, iterator end, T &ref)
	{
	    ref += std::accumulate(start, end, T{});
	}

	/**
	 * Sums [start, end) using several threads and returns `ref` plus that sum.
	 *
	 * The thread count is the smaller of (element count / MIN_BLOCK_SIZE) and
	 * std::thread::hardware_concurrency(). When that comes out below two
	 * (short or empty range, or hardware_concurrency() reporting 0), the range
	 * is summed on the calling thread; otherwise the calling thread handles the
	 * final block while the other blocks run on std::thread workers.
	 *
	 * The range is traversed more than once, so `iterator` must be at least a
	 * forward iterator.
	 *
	 * @param start first element of the range
	 * @param end   one past the last element of the range
	 * @param ref   initial value of the sum; it is read but not modified
	 * @return      ref + sum of all elements in [start, end)
	 */
	template <typename iterator, typename T>
	T parallel_accumulate(iterator start, iterator end, T &ref)
	{
	    using diff_t = typename std::iterator_traits<iterator>::difference_type;

	    const diff_t sz = std::distance(start, end);
	    const diff_t by_elements = sz / MIN_BLOCK_SIZE;
	    // hardware_concurrency() is only a hint and is allowed to return 0.
	    const diff_t by_hardware = static_cast<diff_t>(std::thread::hardware_concurrency());
	    const diff_t num_threads = std::min(by_elements, by_hardware);

	    // Fewer than two threads: nothing to split (and dividing by a zero
	    // thread count must be avoided), so sum on the calling thread.
	    if (num_threads < 2)
	        return std::accumulate(start, end, ref);

	    // Every spawned worker gets block_size elements; the calling thread
	    // takes the last block together with the division remainder.
	    const diff_t block_size = sz / num_threads;
	    const std::size_t workers = static_cast<std::size_t>(num_threads - 1);

	    std::vector<T> rst(workers + 1); // one value-initialised partial sum per block
	    std::vector<std::thread> threads;
	    threads.reserve(workers);

	    try {
	        for (std::size_t i = 0; i < workers; ++i) {
	            iterator last = start;
	            std::advance(last, block_size);
	            threads.emplace_back(accumulate<iterator, T>, start, last, std::ref(rst[i]));
	            start = last;
	        }
	        rst[workers] = std::accumulate(start, end, T{});
	    } catch (...) {
	        // Destroying a joinable std::thread calls std::terminate, so the
	        // workers that did start are joined before the error propagates.
	        for (std::thread &worker : threads)
	            worker.join();
	        throw;
	    }

	    for (std::thread &worker : threads)
	        worker.join();
	    return std::accumulate(std::begin(rst), std::end(rst), ref);
	}



	int main() {
	srand(time(NULL));

	const int sz = 10000000;
	int *data = new int[sz];

	for(int i=0; i < sz; i++)
	data[i] = rand() %10;

	std::cout << "num of cores : (" << std::thread::hardware_concurrency()
	<< ") w/o task switching" << std::endl;

	int ans = 0,
	ref = 0;
	auto start = std::chrono::high_resolution_clock::now();
	ans = parallel_accumulate<int, int>((int ) data, reinterpret_cast<int *>(data + sz), ref);
	auto finish = std::chrono::high_resolution_clock::now();
	std::cout << std::chrono::duration_cast<std::chrono::nanoseconds>(finish-start).count() << " (ns)\n";
	std::cout << "the anwser : " << ans << std::endl;


	ref = 0;
	ans = 0;
	start = std::chrono::high_resolution_clock::now();
	ans = std::accumulate((int ) data, reinterpret_cast<int >(data + sz), ref);
	finish = std::chrono::high_resolution_clock::now();
	std::cout << std::chrono::duration_cast<std::chrono::nanoseconds>(finish-start).count() << " (ns)\n";
	std::cout << "the anwser : " << ans << std::endl;

	delete [] data;
	}