Last active
September 11, 2019 08:08
-
-
Save yc0/a94a057ee75ef7279be32ae699014a97 to your computer and use it in GitHub Desktop.
Modern C++ Concurrency in Depth
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/**
MacOS : clang++ -Wall -std=c++17 parallel_accumulate.cpp -O3 -o out
----------------------------
num of cores : (8) w/o task switching
1166029 (ns)
the anwser : 45000497
2564850 (ns)
the anwser : 45000497
----------------------------
**/
#include <time.h>

#include <algorithm>
#include <chrono>
#include <cstdlib>
#include <functional>
#include <iostream>
#include <iterator>
#include <numeric>
#include <system_error>
#include <thread>
#include <vector>

#define MIN_BLOCK_SIZE 1000
/// Adds the sum of the elements in [start, end) to `ref`.
///
/// The partial sum is seeded with a value-initialised `T` rather than the
/// `int` literal 0, so it is computed in `T`. With an `int` seed,
/// std::accumulate would do the arithmetic in `int`, truncating
/// floating-point elements and overflowing for wider integer types.
///
/// @param start  first element of the range
/// @param end    one past the last element of the range
/// @param ref    running total; the range's sum is added to it in place
template <typename iterator, typename T>
void accumulate(iterator start, iterator end, T &ref)
{
    ref += std::accumulate(start, end, T{});
}
// Fallback so this template still compiles when the including code has not
// chosen a threshold: minimum number of elements handed to one worker.
#ifndef MIN_BLOCK_SIZE
#define MIN_BLOCK_SIZE 1000
#endif

/// Sums the range [start, end) using several threads and returns
/// `ref + sum`.
///
/// The number of workers is the smaller of
///   - the number of MIN_BLOCK_SIZE-sized blocks in the range, and
///   - std::thread::hardware_concurrency(),
/// clamped to at least 1. The range is cut into equal blocks; all but the
/// last are summed on freshly spawned threads, while the calling thread sums
/// the last block together with any remainder.
///
/// @param start  first element of the range
/// @param end    one past the last element of the range
/// @param ref    initial value of the total (read only, never written)
/// @return       `ref` plus the sum of every element in the range
template <typename iterator, typename T>
T parallel_accumulate(iterator start, iterator end, T &ref)
{
    using diff_t = typename std::iterator_traits<iterator>::difference_type;

    const diff_t sz = std::distance(start, end);
    if (sz <= 0)
        return ref;  // nothing to add

    const diff_t allowed_by_elements = sz / static_cast<diff_t>(MIN_BLOCK_SIZE);
    const diff_t allowed_by_hardware =
        static_cast<diff_t>(std::thread::hardware_concurrency());
    // Both limits can be 0 (short input, or hardware_concurrency() unknown);
    // clamp so there is always one worker and no division by zero below.
    const diff_t num_threads =
        std::max<diff_t>(1, std::min(allowed_by_elements, allowed_by_hardware));
    const diff_t block_size = sz / num_threads;

    // One partial-sum slot per worker, each starting at T{}.
    std::vector<T> rst(static_cast<std::size_t>(num_threads), T{});
    std::vector<std::thread> threads;
    threads.reserve(static_cast<std::size_t>(num_threads - 1));

    for (diff_t i = 0; i + 1 < num_threads; ++i) {
        iterator last = start;
        std::advance(last, block_size);
        T &slot = rst[static_cast<std::size_t>(i)];
        try {
            threads.emplace_back(
                [start, last, &slot] { slot = std::accumulate(start, last, T{}); });
        } catch (const std::system_error &) {
            // Could not spawn another thread: stop here. `start` still points
            // at the first unassigned element, so the calling thread picks up
            // everything that is left.
            break;
        }
        start = last;
    }

    // The calling thread handles the final block plus any remainder.
    rst.back() = std::accumulate(start, end, T{});

    for (std::thread &worker : threads)
        worker.join();

    return std::accumulate(rst.begin(), rst.end(), ref);
}
/// Benchmark driver: fills a buffer with random digits (0-9), then times
/// parallel_accumulate against a plain std::accumulate over the same data and
/// prints the elapsed nanoseconds and the sum for each.
int main() {
    std::srand(static_cast<unsigned>(time(nullptr)));

    const int sz = 10000000;
    // The vector owns the buffer, so it is released on every exit path.
    std::vector<int> data(sz);
    for (int &value : data)
        value = std::rand() % 10;

    // Flush before the timed section so the banner is visible even if the
    // measured code aborts.
    std::cout << "num of cores : (" << std::thread::hardware_concurrency()
              << ") w/o task switching" << std::endl;

    int *const first = data.data();
    int *const last = first + sz;
    int ref = 0;

    // steady_clock is monotonic, which is what interval timing needs.
    auto start = std::chrono::steady_clock::now();
    int ans = parallel_accumulate<int*, int>(first, last, ref);
    auto finish = std::chrono::steady_clock::now();
    std::cout << std::chrono::duration_cast<std::chrono::nanoseconds>(finish - start).count() << " (ns)\n";
    std::cout << "the anwser : " << ans << '\n';

    // Same data, single-threaded baseline.
    ref = 0;
    start = std::chrono::steady_clock::now();
    ans = std::accumulate(first, last, ref);
    finish = std::chrono::steady_clock::now();
    std::cout << std::chrono::duration_cast<std::chrono::nanoseconds>(finish - start).count() << " (ns)\n";
    std::cout << "the anwser : " << ans << '\n';
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment