Created
July 2, 2018 00:39
-
-
Save k163377/803eae80fcbdf4f337760bfea8dc9ed6 to your computer and use it in GitHub Desktop.
Microsoft PPL/並列STL(C++17)/OpenMPを少しだけ比較+AVX2利用
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <chrono> //時間計測その他諸々 | |
#include <cstdio> | |
#include <iostream> | |
#include <random> | |
#include <ppl.h> //parallel_for_each用 | |
#include <algorithm> //for_each用 | |
#include <execution> //algorithmの並列実行ポリシー | |
#include <omp.h> //OpenMP用 | |
#include <immintrin.h> //AVX2用 | |
#define num 1000000000 | |
#define threadnum 8; | |
inline double sqr(double d) { return d * d; } | |
inline bool check(double x, double y) { return (sqr(x) + sqr(y) > 1.0); } | |
double Single() { //単一スレッド | |
using namespace std; | |
mt19937 m = mt19937((int)time(NULL)); | |
constexpr int tnum = threadnum; | |
mt19937_64 mts[tnum]; | |
for (mt19937_64 &mt : mts) { mt = mt19937_64(m()); } | |
uniform_real_distribution<> uniform = uniform_real_distribution<>(0.0, 1.0); | |
constexpr long L = num / tnum; | |
long count = 0; | |
for (int i = 0; i < tnum; i++) { | |
for (long j = 0; j < L; j++) { | |
if (check(uniform(mts[i]), uniform(mts[i]))) { count++; } | |
} | |
} | |
return (double)(num - count) / (double)(num >> 2); | |
} | |
double PPL() { //concurrency::parallel_for_each | |
using namespace std; | |
mt19937 m = mt19937((int)time(NULL)); | |
constexpr int tnum = threadnum; | |
mt19937_64 mts[tnum]; | |
for (mt19937_64 &mt : mts) { mt = mt19937_64(m()); } | |
uniform_real_distribution<> uniform = uniform_real_distribution<>(0.0, 1.0); | |
constexpr long L = num / threadnum; | |
concurrency::combinable<long> count; | |
concurrency::parallel_for_each(begin(mts), end(mts), [&count, uniform, L](mt19937_64 mt) { | |
//concurrency::parallel_for(0, tnum, [&count, &mts, uniform, L](int i) { //こっちのが4~5秒ぐらい遅い | |
long c = 0; | |
for (long j = 0; j < L; j++) { | |
if (check(uniform(mt), uniform(mt))) { c++; } | |
} | |
count.local() += c; | |
}); | |
return ((double)(num - count.combine(std::plus<long>())) / (double)(num >> 2)); | |
} | |
double PPL_Simd() { //concurrency::parallel_for_each | |
using namespace std; | |
mt19937 m = mt19937((int)time(NULL)); | |
constexpr int tnum = threadnum; | |
mt19937_64 mts[tnum]; | |
for (mt19937_64 &mt : mts) { mt = mt19937_64(m()); } | |
constexpr long L = (num >> 2) / threadnum; | |
const __m256d one = _mm256_set1_pd(1.0); | |
concurrency::combinable<double> count; | |
concurrency::parallel_for_each(begin(mts), end(mts), [&count, L, one](mt19937_64 mt) { | |
int t; | |
double d[4]; | |
__m256d vx, vy, vnorm, vans; | |
vans = _mm256_setzero_pd(); | |
double* ans = reinterpret_cast<double*>(&vans); | |
uniform_real_distribution<> uniform = uniform_real_distribution<>(0.0, 1.0); | |
for (long j = 0; j < L; j++) { | |
//乱数で取って | |
for (t = 0; t < 4; t++) d[t] = uniform(mt); | |
vx = _mm256_load_pd(d); | |
for (t = 0; t < 4; t++) d[t] = uniform(mt); | |
vy = _mm256_load_pd(d); | |
//ノルム | |
vnorm = _mm256_add_pd(_mm256_mul_pd(vx, vx), _mm256_mul_pd(vy, vy)); | |
//比較してマスクを取り、これと1のandを取ることで外側を0にして、加算する | |
vans = _mm256_add_pd(vans, _mm256_and_pd(_mm256_cmp_pd(vnorm, one, _CMP_LE_OS), one)); | |
} | |
count.local() += ans[0] + ans[1] + ans[2] + ans[3]; | |
}); | |
return (count.combine(std::plus<double>()) / (double)(num >> 2)); | |
} | |
double STL() { //c++17の並列for_each | |
using namespace std; | |
mt19937 m = mt19937((int)time(NULL)); | |
constexpr int tnum = threadnum; | |
mt19937_64 mts[tnum]; | |
for (mt19937_64 &mt : mts) { mt = mt19937_64(m()); } | |
uniform_real_distribution<> uniform = uniform_real_distribution<>(0.0, 1.0); | |
constexpr long L = num / threadnum; | |
atomic_long count = 0; | |
for_each(execution::par_unseq, begin(mts), end(mts), [&count, uniform, L](mt19937_64 mt) { | |
long c = 0; | |
for (long j = 0; j < L; j++) { | |
if (check(uniform(mt), uniform(mt))) { c++; } | |
} | |
count += c; | |
}); | |
return ((double)(num - count) / (double)(num >> 2)); | |
} | |
double OMP() { //OpenMP | |
using namespace std; | |
mt19937 m = mt19937((int)time(NULL)); | |
constexpr int tnum = threadnum; | |
mt19937_64 mts[tnum]; | |
for (mt19937_64 &mt : mts) { mt = mt19937_64(m()); } | |
uniform_real_distribution<> uniform = uniform_real_distribution<>(0.0, 1.0); | |
constexpr long L = num / tnum; | |
long count = 0; | |
#pragma omp parallel for private(uniform) | |
for (int i = 0; i < tnum; i++) { | |
long c = 0; | |
for (long j = 0; j < L; j++) { | |
if (check(uniform(mts[i]), uniform(mts[i]))) { | |
c++; | |
} | |
} | |
#pragma omp atomic | |
count += c; | |
} | |
return (double)(num - count) / (double)(num >> 2); | |
} | |
double OMP_Simd() { //OpenMP + AVX2 | |
using namespace std; | |
mt19937 m = mt19937((int)time(NULL)); | |
constexpr int tnum = threadnum; | |
mt19937_64 mts[tnum]; | |
for (mt19937_64 &mt : mts) { mt = mt19937_64(m()); } | |
constexpr long L = (num/tnum) >> 2; | |
double count = 0.0; | |
const __m256d one = _mm256_set1_pd(1.0); | |
#pragma omp parallel for | |
for (int i = 0; i < tnum; i++) { | |
int t; | |
double d[4]; | |
__m256d vx, vy, vnorm, vans; | |
vans = _mm256_setzero_pd(); | |
double* ans = reinterpret_cast<double*>(&vans); | |
uniform_real_distribution<> uniform = uniform_real_distribution<>(0.0, 1.0); | |
for (long j = 0; j < L; j++) { | |
//乱数で取って | |
for (t = 0; t < 4; t++) d[t] = uniform(mts[i]); | |
vx = _mm256_load_pd(d); | |
for (t = 0; t < 4; t++) d[t] = uniform(mts[i]); | |
vy = _mm256_load_pd(d); | |
//ノルム | |
vnorm = _mm256_add_pd(_mm256_mul_pd(vx, vx), _mm256_mul_pd(vy, vy)); | |
//比較してマスクを取り、これと1のandを取ることで外側を0にして、加算する | |
vans = _mm256_add_pd(vans, _mm256_and_pd(_mm256_cmp_pd(vnorm, one, _CMP_LE_OS), one)); | |
} | |
#pragma omp atomic | |
count += ans[0] + ans[1] + ans[2] + ans[3]; | |
} | |
return count / (double)(num >> 2); | |
} | |
int main() { | |
using namespace std; | |
clock_t s, e; | |
double pi; | |
s = clock(); | |
pi = Single(); | |
e = clock(); | |
printf("StNs\t: %f sec.\tpi = %f\n", (float)(e - s) / CLOCKS_PER_SEC, pi); | |
s = clock(); | |
pi = PPL(); | |
e = clock(); | |
printf("PPL\t: %f sec.\tpi = %f\n", (float)(e - s) / CLOCKS_PER_SEC, pi); | |
/*s = clock(); | |
pi = PPL_Simd(); | |
e = clock(); | |
printf("PPL_sim\t: %f sec.\tpi = %f\n", (float)(e - s) / CLOCKS_PER_SEC, pi);*/ | |
s = clock(); | |
pi = STL(); | |
e = clock(); | |
printf("STL\t: %f sec.\tpi = %f\n", (float)(e - s) / CLOCKS_PER_SEC, pi); | |
s = clock(); | |
pi = OMP(); | |
e = clock(); | |
printf("OMP\t: %f sec.\tpi = %f\n", (float)(e - s) / CLOCKS_PER_SEC, pi); | |
/*s = clock(); | |
pi = OMP_Simd(); | |
e = clock(); | |
printf("OMP_sim\t: %f sec.\tpi = %f\n", (float)(e - s) / CLOCKS_PER_SEC, pi);*/ | |
cin >> pi; | |
return 0; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
以下の記事の内容
http://wrongwrong163377.hatenablog.com/entry/2018/07/02/003000