Skip to content

Instantly share code, notes, and snippets.

@sakamoto-poteko
Created December 11, 2015 03:14
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save sakamoto-poteko/9d45efde2a1f728a6f6c to your computer and use it in GitHub Desktop.
Save sakamoto-poteko/9d45efde2a1f728a6f6c to your computer and use it in GitHub Desktop.
#include <stdlib.h>
#include <stdio.h>
#include <immintrin.h>
const static __m256i SEED_MUL = _mm256_set1_epi32(214013);
const static __m256i SEED_ADDI = _mm256_set1_epi32(2531011);
const static __m256i SEED_MASK = _mm256_set1_epi32(0x3F800000);
const static __m256 FLOAT_1 = _mm256_set1_ps(1.f);
const static __m256i INT32_1 = _mm256_set1_epi32(1);
inline __m256 rand_avx2(__m256i &seed)
{
__m256i vr0 = _mm256_mullo_epi32(seed, SEED_MUL);
__m256i vr1 = _mm256_add_epi32(vr0, SEED_ADDI);
__m256i vr2 = _mm256_srli_epi32(vr1, 9);
__m256i vr3 = _mm256_or_si256(vr2, SEED_MASK);
__m256 vr4 = _mm256_castsi256_ps(vr3);
__m256 result = _mm256_sub_ps(vr4, FLOAT_1);
seed = vr1;
return result;
}
double simulate(__m256i seedA, __m256i seedB)
{
const unsigned simulate_total = 250000000;
__m256i inside_count = _mm256_set1_epi32(0);
__m256i inside_count_opp = _mm256_set1_epi32(0);
for (unsigned i = 1; i < simulate_total / 8; i++) {
__m256 vecA = rand_avx2(seedA);
__m256 vecB = rand_avx2(seedB);
__m256 squareA = _mm256_mul_ps(vecA, vecA);
__m256 len = _mm256_fmadd_ps(vecB, vecB, squareA); // vecB * vecB + squareA
__m256 compMask = _mm256_cmp_ps(len, FLOAT_1, _CMP_LT_OS);
__m256i result = _mm256_and_si256(_mm256_castps_si256(compMask), INT32_1);
inside_count = _mm256_add_epi32(inside_count, result);
__m256 vecA_opp = _mm256_sub_ps(FLOAT_1, vecA);
__m256 vecB_opp = _mm256_sub_ps(FLOAT_1, vecB);
__m256 squareA_opp = _mm256_mul_ps(vecA_opp, vecA_opp);
__m256 len_opp = _mm256_fmadd_ps(vecB_opp, vecB_opp, squareA_opp); // vecB * vecB + squareA
__m256 compMask_opp = _mm256_cmp_ps(len_opp, FLOAT_1, _CMP_LT_OS);
__m256i result_opp = _mm256_and_si256(_mm256_castps_si256(compMask_opp), INT32_1);
inside_count_opp = _mm256_add_epi32(inside_count_opp, result_opp);
}
unsigned __int64 total = 0;
for (int i = 0; i < 8; ++i) {
total += inside_count.m256i_u32[i] + inside_count_opp.m256i_u32[i];
}
return total / double(simulate_total) * 2;
}
int main()
{
__m256i a;
_rdrand64_step(a.m256i_u64);
_rdrand64_step(a.m256i_u64 + 1);
_rdrand64_step(a.m256i_u64 + 2);
_rdrand64_step(a.m256i_u64 + 3);
__m256i b;
_rdrand64_step(b.m256i_u64);
_rdrand64_step(b.m256i_u64 + 1);
_rdrand64_step(b.m256i_u64 + 2);
_rdrand64_step(b.m256i_u64 + 3);
double result = simulate(a, b);
fprintf(stderr, "%f\n", result);
return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment