Created
December 11, 2015 03:14
-
-
Save sakamoto-poteko/9d45efde2a1f728a6f6c to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <stdlib.h> | |
#include <stdio.h> | |
#include <immintrin.h> | |
const static __m256i SEED_MUL = _mm256_set1_epi32(214013); | |
const static __m256i SEED_ADDI = _mm256_set1_epi32(2531011); | |
const static __m256i SEED_MASK = _mm256_set1_epi32(0x3F800000); | |
const static __m256 FLOAT_1 = _mm256_set1_ps(1.f); | |
const static __m256i INT32_1 = _mm256_set1_epi32(1); | |
inline __m256 rand_avx2(__m256i &seed) | |
{ | |
__m256i vr0 = _mm256_mullo_epi32(seed, SEED_MUL); | |
__m256i vr1 = _mm256_add_epi32(vr0, SEED_ADDI); | |
__m256i vr2 = _mm256_srli_epi32(vr1, 9); | |
__m256i vr3 = _mm256_or_si256(vr2, SEED_MASK); | |
__m256 vr4 = _mm256_castsi256_ps(vr3); | |
__m256 result = _mm256_sub_ps(vr4, FLOAT_1); | |
seed = vr1; | |
return result; | |
} | |
double simulate(__m256i seedA, __m256i seedB) | |
{ | |
const unsigned simulate_total = 250000000; | |
__m256i inside_count = _mm256_set1_epi32(0); | |
__m256i inside_count_opp = _mm256_set1_epi32(0); | |
for (unsigned i = 1; i < simulate_total / 8; i++) { | |
__m256 vecA = rand_avx2(seedA); | |
__m256 vecB = rand_avx2(seedB); | |
__m256 squareA = _mm256_mul_ps(vecA, vecA); | |
__m256 len = _mm256_fmadd_ps(vecB, vecB, squareA); // vecB * vecB + squareA | |
__m256 compMask = _mm256_cmp_ps(len, FLOAT_1, _CMP_LT_OS); | |
__m256i result = _mm256_and_si256(_mm256_castps_si256(compMask), INT32_1); | |
inside_count = _mm256_add_epi32(inside_count, result); | |
__m256 vecA_opp = _mm256_sub_ps(FLOAT_1, vecA); | |
__m256 vecB_opp = _mm256_sub_ps(FLOAT_1, vecB); | |
__m256 squareA_opp = _mm256_mul_ps(vecA_opp, vecA_opp); | |
__m256 len_opp = _mm256_fmadd_ps(vecB_opp, vecB_opp, squareA_opp); // vecB * vecB + squareA | |
__m256 compMask_opp = _mm256_cmp_ps(len_opp, FLOAT_1, _CMP_LT_OS); | |
__m256i result_opp = _mm256_and_si256(_mm256_castps_si256(compMask_opp), INT32_1); | |
inside_count_opp = _mm256_add_epi32(inside_count_opp, result_opp); | |
} | |
unsigned __int64 total = 0; | |
for (int i = 0; i < 8; ++i) { | |
total += inside_count.m256i_u32[i] + inside_count_opp.m256i_u32[i]; | |
} | |
return total / double(simulate_total) * 2; | |
} | |
int main() | |
{ | |
__m256i a; | |
_rdrand64_step(a.m256i_u64); | |
_rdrand64_step(a.m256i_u64 + 1); | |
_rdrand64_step(a.m256i_u64 + 2); | |
_rdrand64_step(a.m256i_u64 + 3); | |
__m256i b; | |
_rdrand64_step(b.m256i_u64); | |
_rdrand64_step(b.m256i_u64 + 1); | |
_rdrand64_step(b.m256i_u64 + 2); | |
_rdrand64_step(b.m256i_u64 + 3); | |
double result = simulate(a, b); | |
fprintf(stderr, "%f\n", result); | |
return 0; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment