Skip to content

Instantly share code, notes, and snippets.

@dc1394
Last active January 4, 2024 09:19
Show Gist options
  • Save dc1394/6984fa72e03c02ffb28e1a37c88c3d29 to your computer and use it in GitHub Desktop.
Save dc1394/6984fa72e03c02ffb28e1a37c88c3d29 to your computer and use it in GitHub Desktop.
Twitterのモンテカルロ法のC++版の速度比較コード(dSFMT-AVX512使用+手動でSIMD (AVX-512)ベクトル化)
#include <array> // for std::array
#include <iomanip> // for std::setprecision
#include <ios> // for std::ios::fixed, std::ios::floatfield
#include <iostream> // for std::cout, std::endl
#define HAVE_SSE2
#define HAVE_AVX2
#include "dSFMT.h"
#include "dSFMT-2203-avx512.h"
namespace {
static auto constexpr ARRAYSIZE = 64;
static auto constexpr RANDSIZE = ARRAYSIZE / 2;
inline double mcpi();
}
int main()
{
std::cout.setf(std::ios::fixed, std::ios::floatfield);
std::cout << "pi = "
<< std::setprecision(16)
<< mcpi()
<< std::endl;
return 0;
}
namespace {
double mcpi()
{
auto constexpr seed = 20231226;
auto constexpr num_points = 1000000000;
auto num_inside = 0;
dsfmt_t dsfmt;
dsfmt_init_gen_rand(&dsfmt, seed);
alignas(64) std::array<double, ARRAYSIZE> randarray;
auto const loopnum = num_points / (RANDSIZE / 2);
for (auto i = 0; i < loopnum; i++) {
dsfmt_fill_array_close_open(&dsfmt, randarray.data(), RANDSIZE);
auto mask = _mm512_set1_epi32(0);
auto const x_values = _mm512_load_pd(randarray.data());
auto const y_values = _mm512_load_pd(&randarray[8]);
auto const x_squared = _mm512_mul_pd(x_values, x_values);
auto const y_squared = _mm512_mul_pd(y_values, y_values);
auto const inside_mask = _mm512_cmp_pd_mask(_mm512_add_pd(x_squared, y_squared), _mm512_set1_pd(1.0), _MM_CMPINT_LT);
// Update mask with the result
mask = _mm512_mask_add_epi32(mask, inside_mask, mask, _mm512_set1_epi32(1));
// Sum the counts in the mask
num_inside += _mm512_reduce_add_epi32(mask);
auto mask2 = _mm512_set1_epi32(0);
auto const x_values2 = _mm512_load_pd(&randarray[16]);
auto const y_values2 = _mm512_load_pd(&randarray[24]);
auto const x_squared2 = _mm512_mul_pd(x_values2, x_values2);
auto const y_squared2 = _mm512_mul_pd(y_values2, y_values2);
auto const inside_mask2 = _mm512_cmp_pd_mask(_mm512_add_pd(x_squared2, y_squared2), _mm512_set1_pd(1.0), _MM_CMPINT_LT);
// Update mask with the result
mask2 = _mm512_mask_add_epi32(mask2, inside_mask2, mask2, _mm512_set1_epi32(1));
// Sum the counts in the mask
num_inside += _mm512_reduce_add_epi32(mask2);
}
return 4.0 * static_cast<double>(num_inside) / static_cast<double>(num_points);
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment