k163377/main.cpp

## main.cpp
#include <chrono> //時間計測その他諸々
#include <cstdio>
#include <iostream>
#include <random>

#include <ppl.h> //parallel_for_each用

#include <algorithm> //for_each用
#include <execution> //algorithmの並列実行ポリシー

#include <omp.h> //OpenMP用

#include <immintrin.h> //AVX2用

#define num 1000000000
#define threadnum 8;

inline double sqr(double d) { return d * d; }
inline bool check(double x, double y) { return (sqr(x) + sqr(y) > 1.0); }

double Single() { //単一スレッド
	using namespace std;

	mt19937 m = mt19937((int)time(NULL));
	constexpr int tnum = threadnum;
	mt19937_64 mts[tnum];
	for (mt19937_64 &mt : mts) { mt = mt19937_64(m()); }
	uniform_real_distribution<> uniform = uniform_real_distribution<>(0.0, 1.0);

	constexpr long L = num / tnum;
	long count = 0;
	for (int i = 0; i < tnum; i++) {
		for (long j = 0; j < L; j++) {
			if (check(uniform(mts[i]), uniform(mts[i]))) { count++; }
		}
	}

	return (double)(num - count) / (double)(num >> 2);
}

double PPL() { //concurrency::parallel_for_each
	using namespace std;

	mt19937 m = mt19937((int)time(NULL));
	constexpr int tnum = threadnum;
	mt19937_64 mts[tnum];
	for (mt19937_64 &mt : mts) { mt = mt19937_64(m()); }
	uniform_real_distribution<> uniform = uniform_real_distribution<>(0.0, 1.0);

	constexpr long L = num / threadnum;
	concurrency::combinable<long> count;
	concurrency::parallel_for_each(begin(mts), end(mts), [&count, uniform, L](mt19937_64 mt) {
	//concurrency::parallel_for(0, tnum, [&count, &mts, uniform, L](int i) { //こっちのが4～5秒ぐらい遅い
		long c = 0;
		for (long j = 0; j < L; j++) {
			if (check(uniform(mt), uniform(mt))) { c++; }
		}
		count.local() += c;
	});

	return ((double)(num - count.combine(std::plus<long>())) / (double)(num >> 2));
}

double PPL_Simd() { //concurrency::parallel_for_each
	using namespace std;

	mt19937 m = mt19937((int)time(NULL));
	constexpr int tnum = threadnum;
	mt19937_64 mts[tnum];
	for (mt19937_64 &mt : mts) { mt = mt19937_64(m()); }

	constexpr long L = (num >> 2) / threadnum;
	const __m256d one = _mm256_set1_pd(1.0);
	concurrency::combinable<double> count;

	concurrency::parallel_for_each(begin(mts), end(mts), [&count, L, one](mt19937_64 mt) {
		int t;
		double d[4];
		__m256d vx, vy, vnorm, vans;
		vans = _mm256_setzero_pd();
		double* ans = reinterpret_cast<double*>(&vans);
		uniform_real_distribution<> uniform = uniform_real_distribution<>(0.0, 1.0);

		for (long j = 0; j < L; j++) {
			//乱数で取って
			for (t = 0; t < 4; t++) d[t] = uniform(mt);
			vx = _mm256_load_pd(d);
			for (t = 0; t < 4; t++) d[t] = uniform(mt);
			vy = _mm256_load_pd(d);
			//ノルム
			vnorm = _mm256_add_pd(_mm256_mul_pd(vx, vx), _mm256_mul_pd(vy, vy));
			//比較してマスクを取り、これと1のandを取ることで外側を0にして、加算する
			vans = _mm256_add_pd(vans, _mm256_and_pd(_mm256_cmp_pd(vnorm, one, _CMP_LE_OS), one));
		}
		count.local() += ans[0] + ans[1] + ans[2] + ans[3];
	});

	return (count.combine(std::plus<double>()) / (double)(num >> 2));
}

double STL() { //c++17の並列for_each
	using namespace std;

	mt19937 m = mt19937((int)time(NULL));
	constexpr int tnum = threadnum;
	mt19937_64 mts[tnum];
	for (mt19937_64 &mt : mts) { mt = mt19937_64(m()); }
	uniform_real_distribution<> uniform = uniform_real_distribution<>(0.0, 1.0);

	constexpr long L = num / threadnum;
	atomic_long count = 0;
	for_each(execution::par_unseq, begin(mts), end(mts), [&count, uniform, L](mt19937_64 mt) {
		long c = 0;
		for (long j = 0; j < L; j++) {
			if (check(uniform(mt), uniform(mt))) { c++; }
		}
		count += c;
	});

	return ((double)(num - count) / (double)(num >> 2));
}

double OMP() { //OpenMP
	using namespace std;

	mt19937 m = mt19937((int)time(NULL));
	constexpr int tnum = threadnum;
	mt19937_64 mts[tnum];
	for (mt19937_64 &mt : mts) { mt = mt19937_64(m()); }
	uniform_real_distribution<> uniform = uniform_real_distribution<>(0.0, 1.0);

	constexpr long L = num / tnum;
	long count = 0;

	#pragma omp parallel for private(uniform)
	for (int i = 0; i < tnum; i++) {
		long c = 0;
		for (long j = 0; j < L; j++) {
			if (check(uniform(mts[i]), uniform(mts[i]))) {
				c++;
			}
		}
		#pragma omp atomic
		count += c;
	}

	return (double)(num - count) / (double)(num >> 2);
}

double OMP_Simd() { //OpenMP + AVX2
	using namespace std;

	mt19937 m = mt19937((int)time(NULL));
	constexpr int tnum = threadnum;
	mt19937_64 mts[tnum];
	for (mt19937_64 &mt : mts) { mt = mt19937_64(m()); }

	constexpr long L = (num/tnum) >> 2;
	double count = 0.0;
	const __m256d one = _mm256_set1_pd(1.0);

	#pragma omp parallel for
	for (int i = 0; i < tnum; i++) {
		int t;
		double d[4];
		__m256d vx, vy, vnorm, vans;
		vans = _mm256_setzero_pd();
		double* ans = reinterpret_cast<double*>(&vans);
		uniform_real_distribution<> uniform = uniform_real_distribution<>(0.0, 1.0);

		for (long j = 0; j < L; j++) {
			//乱数で取って
			for (t = 0; t < 4; t++) d[t] = uniform(mts[i]);
			vx = _mm256_load_pd(d);
			for (t = 0; t < 4; t++) d[t] = uniform(mts[i]);
			vy = _mm256_load_pd(d);
			//ノルム
			vnorm = _mm256_add_pd(_mm256_mul_pd(vx, vx), _mm256_mul_pd(vy, vy));
			//比較してマスクを取り、これと1のandを取ることで外側を0にして、加算する
			vans = _mm256_add_pd(vans, _mm256_and_pd(_mm256_cmp_pd(vnorm, one, _CMP_LE_OS), one));
		}

		#pragma omp atomic
		count += ans[0] + ans[1] + ans[2] + ans[3];
	}

	return count / (double)(num >> 2);
}

int main() {
	using namespace std;

	clock_t s, e;
	double pi;

	s = clock();
	pi = Single();
	e = clock();
	printf("StNs\t: %f sec.\tpi = %f\n", (float)(e - s) / CLOCKS_PER_SEC, pi);

	s = clock();
	pi = PPL();
	e = clock();
	printf("PPL\t: %f sec.\tpi = %f\n", (float)(e - s) / CLOCKS_PER_SEC, pi);

	/*s = clock();
	pi = PPL_Simd();
	e = clock();
	printf("PPL_sim\t: %f sec.\tpi = %f\n", (float)(e - s) / CLOCKS_PER_SEC, pi);*/

	s = clock();
	pi = STL();
	e = clock();
	printf("STL\t: %f sec.\tpi = %f\n", (float)(e - s) / CLOCKS_PER_SEC, pi);

	s = clock();
	pi = OMP();
	e = clock();
	printf("OMP\t: %f sec.\tpi = %f\n", (float)(e - s) / CLOCKS_PER_SEC, pi);

	/*s = clock();
	pi = OMP_Simd();
	e = clock();
	printf("OMP_sim\t: %f sec.\tpi = %f\n", (float)(e - s) / CLOCKS_PER_SEC, pi);*/

	cin >> pi;

	return 0;
}
	#include <chrono> //時間計測その他諸々
	#include <cstdio>
	#include <iostream>
	#include <random>

	#include <ppl.h> //parallel_for_each用

	#include <algorithm> //for_each用
	#include <execution> //algorithmの並列実行ポリシー

	#include <omp.h> //OpenMP用

	#include <immintrin.h> //AVX2用

	#define num 1000000000
	#define threadnum 8;

	inline double sqr(double d) { return d * d; }
	inline bool check(double x, double y) { return (sqr(x) + sqr(y) > 1.0); }

	double Single() { //単一スレッド
	using namespace std;

	mt19937 m = mt19937((int)time(NULL));
	constexpr int tnum = threadnum;
	mt19937_64 mts[tnum];
	for (mt19937_64 &mt : mts) { mt = mt19937_64(m()); }
	uniform_real_distribution<> uniform = uniform_real_distribution<>(0.0, 1.0);

	constexpr long L = num / tnum;
	long count = 0;
	for (int i = 0; i < tnum; i++) {
	for (long j = 0; j < L; j++) {
	if (check(uniform(mts[i]), uniform(mts[i]))) { count++; }
	}
	}

	return (double)(num - count) / (double)(num >> 2);
	}

	double PPL() { //concurrency::parallel_for_each
	using namespace std;

	mt19937 m = mt19937((int)time(NULL));
	constexpr int tnum = threadnum;
	mt19937_64 mts[tnum];
	for (mt19937_64 &mt : mts) { mt = mt19937_64(m()); }
	uniform_real_distribution<> uniform = uniform_real_distribution<>(0.0, 1.0);

	constexpr long L = num / threadnum;
	concurrency::combinable<long> count;
	concurrency::parallel_for_each(begin(mts), end(mts), [&count, uniform, L](mt19937_64 mt) {
	//concurrency::parallel_for(0, tnum, [&count, &mts, uniform, L](int i) { //こっちのが4～5秒ぐらい遅い
	long c = 0;
	for (long j = 0; j < L; j++) {
	if (check(uniform(mt), uniform(mt))) { c++; }
	}
	count.local() += c;
	});

	return ((double)(num - count.combine(std::plus<long>())) / (double)(num >> 2));
	}

	double PPL_Simd() { //concurrency::parallel_for_each
	using namespace std;

	mt19937 m = mt19937((int)time(NULL));
	constexpr int tnum = threadnum;
	mt19937_64 mts[tnum];
	for (mt19937_64 &mt : mts) { mt = mt19937_64(m()); }

	constexpr long L = (num >> 2) / threadnum;
	const __m256d one = _mm256_set1_pd(1.0);
	concurrency::combinable<double> count;

	concurrency::parallel_for_each(begin(mts), end(mts), [&count, L, one](mt19937_64 mt) {
	int t;
	double d[4];
	__m256d vx, vy, vnorm, vans;
	vans = _mm256_setzero_pd();
	double* ans = reinterpret_cast<double*>(&vans);
	uniform_real_distribution<> uniform = uniform_real_distribution<>(0.0, 1.0);

	for (long j = 0; j < L; j++) {
	//乱数で取って
	for (t = 0; t < 4; t++) d[t] = uniform(mt);
	vx = _mm256_load_pd(d);
	for (t = 0; t < 4; t++) d[t] = uniform(mt);
	vy = _mm256_load_pd(d);
	//ノルム
	vnorm = _mm256_add_pd(_mm256_mul_pd(vx, vx), _mm256_mul_pd(vy, vy));
	//比較してマスクを取り、これと1のandを取ることで外側を0にして、加算する
	vans = _mm256_add_pd(vans, _mm256_and_pd(_mm256_cmp_pd(vnorm, one, _CMP_LE_OS), one));
	}
	count.local() += ans[0] + ans[1] + ans[2] + ans[3];
	});

	return (count.combine(std::plus<double>()) / (double)(num >> 2));
	}

	double STL() { //c++17の並列for_each
	using namespace std;

	mt19937 m = mt19937((int)time(NULL));
	constexpr int tnum = threadnum;
	mt19937_64 mts[tnum];
	for (mt19937_64 &mt : mts) { mt = mt19937_64(m()); }
	uniform_real_distribution<> uniform = uniform_real_distribution<>(0.0, 1.0);

	constexpr long L = num / threadnum;
	atomic_long count = 0;
	for_each(execution::par_unseq, begin(mts), end(mts), [&count, uniform, L](mt19937_64 mt) {
	long c = 0;
	for (long j = 0; j < L; j++) {
	if (check(uniform(mt), uniform(mt))) { c++; }
	}
	count += c;
	});

	return ((double)(num - count) / (double)(num >> 2));
	}

	double OMP() { //OpenMP
	using namespace std;

	mt19937 m = mt19937((int)time(NULL));
	constexpr int tnum = threadnum;
	mt19937_64 mts[tnum];
	for (mt19937_64 &mt : mts) { mt = mt19937_64(m()); }
	uniform_real_distribution<> uniform = uniform_real_distribution<>(0.0, 1.0);

	constexpr long L = num / tnum;
	long count = 0;

	#pragma omp parallel for private(uniform)
	for (int i = 0; i < tnum; i++) {
	long c = 0;
	for (long j = 0; j < L; j++) {
	if (check(uniform(mts[i]), uniform(mts[i]))) {
	c++;
	}
	}
	#pragma omp atomic
	count += c;
	}

	return (double)(num - count) / (double)(num >> 2);
	}

	double OMP_Simd() { //OpenMP + AVX2
	using namespace std;

	mt19937 m = mt19937((int)time(NULL));
	constexpr int tnum = threadnum;
	mt19937_64 mts[tnum];
	for (mt19937_64 &mt : mts) { mt = mt19937_64(m()); }

	constexpr long L = (num/tnum) >> 2;
	double count = 0.0;
	const __m256d one = _mm256_set1_pd(1.0);

	#pragma omp parallel for
	for (int i = 0; i < tnum; i++) {
	int t;
	double d[4];
	__m256d vx, vy, vnorm, vans;
	vans = _mm256_setzero_pd();
	double* ans = reinterpret_cast<double*>(&vans);
	uniform_real_distribution<> uniform = uniform_real_distribution<>(0.0, 1.0);

	for (long j = 0; j < L; j++) {
	//乱数で取って
	for (t = 0; t < 4; t++) d[t] = uniform(mts[i]);
	vx = _mm256_load_pd(d);
	for (t = 0; t < 4; t++) d[t] = uniform(mts[i]);
	vy = _mm256_load_pd(d);
	//ノルム
	vnorm = _mm256_add_pd(_mm256_mul_pd(vx, vx), _mm256_mul_pd(vy, vy));
	//比較してマスクを取り、これと1のandを取ることで外側を0にして、加算する
	vans = _mm256_add_pd(vans, _mm256_and_pd(_mm256_cmp_pd(vnorm, one, _CMP_LE_OS), one));
	}

	#pragma omp atomic
	count += ans[0] + ans[1] + ans[2] + ans[3];
	}

	return count / (double)(num >> 2);
	}

	int main() {
	using namespace std;

	clock_t s, e;
	double pi;

	s = clock();
	pi = Single();
	e = clock();
	printf("StNs\t: %f sec.\tpi = %f\n", (float)(e - s) / CLOCKS_PER_SEC, pi);

	s = clock();
	pi = PPL();
	e = clock();
	printf("PPL\t: %f sec.\tpi = %f\n", (float)(e - s) / CLOCKS_PER_SEC, pi);

	/*s = clock();
	pi = PPL_Simd();
	e = clock();
	printf("PPL_sim\t: %f sec.\tpi = %f\n", (float)(e - s) / CLOCKS_PER_SEC, pi);*/

	s = clock();
	pi = STL();
	e = clock();
	printf("STL\t: %f sec.\tpi = %f\n", (float)(e - s) / CLOCKS_PER_SEC, pi);

	s = clock();
	pi = OMP();
	e = clock();
	printf("OMP\t: %f sec.\tpi = %f\n", (float)(e - s) / CLOCKS_PER_SEC, pi);

	/*s = clock();
	pi = OMP_Simd();
	e = clock();
	printf("OMP_sim\t: %f sec.\tpi = %f\n", (float)(e - s) / CLOCKS_PER_SEC, pi);*/

	cin >> pi;

	return 0;
	}