Navigation Menu

Skip to content

Instantly share code, notes, and snippets.

@k163377
Created July 2, 2018 00:39
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save k163377/803eae80fcbdf4f337760bfea8dc9ed6 to your computer and use it in GitHub Desktop.
Save k163377/803eae80fcbdf4f337760bfea8dc9ed6 to your computer and use it in GitHub Desktop.
Microsoft PPL/並列STL(C++17)/OpenMPを少しだけ比較+AVX2利用
#include <chrono> //時間計測その他諸々
#include <cstdio>
#include <iostream>
#include <random>
#include <ppl.h> //parallel_for_each用
#include <algorithm> //for_each用
#include <execution> //algorithmの並列実行ポリシー
#include <omp.h> //OpenMP用
#include <immintrin.h> //AVX2用
#define num 1000000000
#define threadnum 8;
inline double sqr(double d) { return d * d; }
inline bool check(double x, double y) { return (sqr(x) + sqr(y) > 1.0); }
double Single() { //単一スレッド
using namespace std;
mt19937 m = mt19937((int)time(NULL));
constexpr int tnum = threadnum;
mt19937_64 mts[tnum];
for (mt19937_64 &mt : mts) { mt = mt19937_64(m()); }
uniform_real_distribution<> uniform = uniform_real_distribution<>(0.0, 1.0);
constexpr long L = num / tnum;
long count = 0;
for (int i = 0; i < tnum; i++) {
for (long j = 0; j < L; j++) {
if (check(uniform(mts[i]), uniform(mts[i]))) { count++; }
}
}
return (double)(num - count) / (double)(num >> 2);
}
double PPL() { //concurrency::parallel_for_each
using namespace std;
mt19937 m = mt19937((int)time(NULL));
constexpr int tnum = threadnum;
mt19937_64 mts[tnum];
for (mt19937_64 &mt : mts) { mt = mt19937_64(m()); }
uniform_real_distribution<> uniform = uniform_real_distribution<>(0.0, 1.0);
constexpr long L = num / threadnum;
concurrency::combinable<long> count;
concurrency::parallel_for_each(begin(mts), end(mts), [&count, uniform, L](mt19937_64 mt) {
//concurrency::parallel_for(0, tnum, [&count, &mts, uniform, L](int i) { //こっちのが4~5秒ぐらい遅い
long c = 0;
for (long j = 0; j < L; j++) {
if (check(uniform(mt), uniform(mt))) { c++; }
}
count.local() += c;
});
return ((double)(num - count.combine(std::plus<long>())) / (double)(num >> 2));
}
double PPL_Simd() { //concurrency::parallel_for_each
using namespace std;
mt19937 m = mt19937((int)time(NULL));
constexpr int tnum = threadnum;
mt19937_64 mts[tnum];
for (mt19937_64 &mt : mts) { mt = mt19937_64(m()); }
constexpr long L = (num >> 2) / threadnum;
const __m256d one = _mm256_set1_pd(1.0);
concurrency::combinable<double> count;
concurrency::parallel_for_each(begin(mts), end(mts), [&count, L, one](mt19937_64 mt) {
int t;
double d[4];
__m256d vx, vy, vnorm, vans;
vans = _mm256_setzero_pd();
double* ans = reinterpret_cast<double*>(&vans);
uniform_real_distribution<> uniform = uniform_real_distribution<>(0.0, 1.0);
for (long j = 0; j < L; j++) {
//乱数で取って
for (t = 0; t < 4; t++) d[t] = uniform(mt);
vx = _mm256_load_pd(d);
for (t = 0; t < 4; t++) d[t] = uniform(mt);
vy = _mm256_load_pd(d);
//ノルム
vnorm = _mm256_add_pd(_mm256_mul_pd(vx, vx), _mm256_mul_pd(vy, vy));
//比較してマスクを取り、これと1のandを取ることで外側を0にして、加算する
vans = _mm256_add_pd(vans, _mm256_and_pd(_mm256_cmp_pd(vnorm, one, _CMP_LE_OS), one));
}
count.local() += ans[0] + ans[1] + ans[2] + ans[3];
});
return (count.combine(std::plus<double>()) / (double)(num >> 2));
}
double STL() { //c++17の並列for_each
using namespace std;
mt19937 m = mt19937((int)time(NULL));
constexpr int tnum = threadnum;
mt19937_64 mts[tnum];
for (mt19937_64 &mt : mts) { mt = mt19937_64(m()); }
uniform_real_distribution<> uniform = uniform_real_distribution<>(0.0, 1.0);
constexpr long L = num / threadnum;
atomic_long count = 0;
for_each(execution::par_unseq, begin(mts), end(mts), [&count, uniform, L](mt19937_64 mt) {
long c = 0;
for (long j = 0; j < L; j++) {
if (check(uniform(mt), uniform(mt))) { c++; }
}
count += c;
});
return ((double)(num - count) / (double)(num >> 2));
}
double OMP() { //OpenMP
using namespace std;
mt19937 m = mt19937((int)time(NULL));
constexpr int tnum = threadnum;
mt19937_64 mts[tnum];
for (mt19937_64 &mt : mts) { mt = mt19937_64(m()); }
uniform_real_distribution<> uniform = uniform_real_distribution<>(0.0, 1.0);
constexpr long L = num / tnum;
long count = 0;
#pragma omp parallel for private(uniform)
for (int i = 0; i < tnum; i++) {
long c = 0;
for (long j = 0; j < L; j++) {
if (check(uniform(mts[i]), uniform(mts[i]))) {
c++;
}
}
#pragma omp atomic
count += c;
}
return (double)(num - count) / (double)(num >> 2);
}
double OMP_Simd() { //OpenMP + AVX2
using namespace std;
mt19937 m = mt19937((int)time(NULL));
constexpr int tnum = threadnum;
mt19937_64 mts[tnum];
for (mt19937_64 &mt : mts) { mt = mt19937_64(m()); }
constexpr long L = (num/tnum) >> 2;
double count = 0.0;
const __m256d one = _mm256_set1_pd(1.0);
#pragma omp parallel for
for (int i = 0; i < tnum; i++) {
int t;
double d[4];
__m256d vx, vy, vnorm, vans;
vans = _mm256_setzero_pd();
double* ans = reinterpret_cast<double*>(&vans);
uniform_real_distribution<> uniform = uniform_real_distribution<>(0.0, 1.0);
for (long j = 0; j < L; j++) {
//乱数で取って
for (t = 0; t < 4; t++) d[t] = uniform(mts[i]);
vx = _mm256_load_pd(d);
for (t = 0; t < 4; t++) d[t] = uniform(mts[i]);
vy = _mm256_load_pd(d);
//ノルム
vnorm = _mm256_add_pd(_mm256_mul_pd(vx, vx), _mm256_mul_pd(vy, vy));
//比較してマスクを取り、これと1のandを取ることで外側を0にして、加算する
vans = _mm256_add_pd(vans, _mm256_and_pd(_mm256_cmp_pd(vnorm, one, _CMP_LE_OS), one));
}
#pragma omp atomic
count += ans[0] + ans[1] + ans[2] + ans[3];
}
return count / (double)(num >> 2);
}
int main() {
using namespace std;
clock_t s, e;
double pi;
s = clock();
pi = Single();
e = clock();
printf("StNs\t: %f sec.\tpi = %f\n", (float)(e - s) / CLOCKS_PER_SEC, pi);
s = clock();
pi = PPL();
e = clock();
printf("PPL\t: %f sec.\tpi = %f\n", (float)(e - s) / CLOCKS_PER_SEC, pi);
/*s = clock();
pi = PPL_Simd();
e = clock();
printf("PPL_sim\t: %f sec.\tpi = %f\n", (float)(e - s) / CLOCKS_PER_SEC, pi);*/
s = clock();
pi = STL();
e = clock();
printf("STL\t: %f sec.\tpi = %f\n", (float)(e - s) / CLOCKS_PER_SEC, pi);
s = clock();
pi = OMP();
e = clock();
printf("OMP\t: %f sec.\tpi = %f\n", (float)(e - s) / CLOCKS_PER_SEC, pi);
/*s = clock();
pi = OMP_Simd();
e = clock();
printf("OMP_sim\t: %f sec.\tpi = %f\n", (float)(e - s) / CLOCKS_PER_SEC, pi);*/
cin >> pi;
return 0;
}
@k163377
Copy link
Author

k163377 commented Jul 2, 2018

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment