Skip to content

Instantly share code, notes, and snippets.

@dc1394
Created December 31, 2023 13:25
Show Gist options
  • Save dc1394/9dcbd93253a363a80da49d148718b425 to your computer and use it in GitHub Desktop.
Save dc1394/9dcbd93253a363a80da49d148718b425 to your computer and use it in GitHub Desktop.
Twitterのモンテカルロ法のC++版の速度比較コード(Xoshiro256PlusSIMD使用+手動でSIMD (AVX2)ベクトル化)
#include <cstdint> // for std::uint32_t
#include <iomanip> // for std::setprecision
#include <ios> // for std::ios::fixed, std::ios::floatfield
#include <iostream> // for std::cout, std::endl
#define __AVX2_AVAILABLE__
#include "SIMDInstructionSet.h"
#include "Xoshiro256Plus.h"
namespace {
inline double mcpi();
}
int main()
{
std::cout.setf(std::ios::fixed, std::ios::floatfield);
std::cout << "pi = "
<< std::setprecision(16)
<< mcpi()
<< std::endl;
}
namespace {
double mcpi()
{
using Xoshiro256PlusAVX2 = SEFUtility::RNG::Xoshiro256Plus<SIMDInstructionSet::AVX2>;
auto constexpr seed = 20231226;
auto constexpr num_points = 1000000000;
auto num_inside = 0;
Xoshiro256PlusAVX2 avx_rng(seed);
auto const loopnum = num_points / 4;
for (auto i = 0; i < loopnum; i++) {
// Load 8 random numbers (4 for x and 4 for y) using AVX
auto const next4_avx1 = avx_rng.dnext4();
auto const next4_avx2 = avx_rng.dnext4();
// Separate x and y components
auto const x = _mm256_permute_pd(next4_avx1, 0b0000);
auto const y = _mm256_permute_pd(next4_avx2, 0b1111);
// Calculate r2 for the first 4 points
auto const r2 = _mm256_add_pd(_mm256_mul_pd(x, x), _mm256_mul_pd(y, y));
// Compare r2 with 1.0 and increment num_inside accordingly
auto const mask = _mm256_cmp_pd(r2, _mm256_set1_pd(1.0), _CMP_LT_OS);
std::uint32_t const mask_int = _mm256_movemask_pd(mask);
num_inside += _mm_popcnt_u32(mask_int);
}
return 4.0 * static_cast<double>(num_inside) / static_cast<double>(num_points);
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment