-
-
Save voidbert/72f437466fea9228f0ae58e05ec931a7 to your computer and use it in GitHub Desktop.
Calculate Pi with Monte Carlo and SIMD
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/** | |
* I tried to calculate pi using Monte Carlo's method. It proved to be a very inefficient way to do it, but I loved | |
* the experience of writing SIMD intrinsics for the first time! | |
* | |
* This method is not ideal for various reasons: | |
* | |
* - Monte Carlo's method is, though technically not brute-force, slow. Trying a very large sample of points is a | |
* lengthier process than calculating a series up to a given number of terms, which will likely converge to pi | |
* faster than this method. | |
* | |
* - Using a linear congruential pseudo-random generator with a 32-bit seed limits the maximum number of samples to | |
* 2^32. In actuality, this value will be lower, as not all generated bits are used for generating the mantissa of | |
* the floating-point numbers actually used in samples. Increasing the generator's seed width / floating point | |
* precision would decrease the performance at least by half. | |
* | |
* How to compile (your architecture must support AVX): gcc -fopenmp -march=native -O3 -g pi.c | |
* | |
* Some benchmarks (on my i3-7100): | |
* | |
* - This program: 1.27s - 5 correct decimal places | |
* - ycruncher: 2.61s - 25 000 000 correct decimal places | |
* | |
* I'd say my program's digit rate doesn't seem very good. | |
*/ | |
#include <inttypes.h> | |
#include <stdio.h> | |
#include <stdlib.h> | |
#include <immintrin.h> | |
#include <omp.h> | |
/* Linear congruential generator: generate a set of random integers from the previously generated vetor */ | |
inline __m256i random_next(__m256i current, /* Vector of 32-bit seeds */ | |
__m256i multiplier, /* Generator's multiplicative constant, broadcasted to all elements */ | |
__m256i increment, /* Generator's incremental constant, broadcasted to all elements */ | |
__m256i mask) /* Mask to be ANDed with the final result, before being returned */ | |
__attribute__((always_inline)); | |
inline __m256i random_next(__m256i current, __m256i multiplier, __m256i increment, __m256i mask) { | |
__m256i ret = current; | |
ret = _mm256_mullo_epi32(ret, multiplier); | |
ret = _mm256_add_epi32(ret, increment); | |
ret = _mm256_and_si256(ret, mask); | |
return ret; | |
} | |
/* Transform a vector of random integers into a vector of random floats from 0 to 1 */ | |
inline __m256 floatize(__m256i random, /* Random vector of 32-bit integers (from random_next) */ | |
__m256i mantissa_mask, /* A float with 1 in all mantissa digits, broadcasted to all vector elements */ | |
__m256i exponent, /* The bits of the exponent of numbers from 1 to 2, broadcasted to all vector elements */ | |
__m256 minus_one) /* -1.0f, broadcasted to all vector elements */ | |
__attribute__((always_inline)); | |
inline __m256 floatize(__m256i random, __m256i mantissa_mask, __m256i exponent, __m256 minus_one) { | |
__m256i ret = random; | |
ret = _mm256_and_si256(ret, mantissa_mask); | |
ret = _mm256_or_si256(ret, exponent); | |
return (__m256) _mm256_add_ps((__m256) ret, minus_one); | |
} | |
/* Calculates pi using the Monte Carlo Method. */ | |
double pi(size_t iter) { | |
/* Values from linear congruential generator in glibc */ | |
const __m256i random_multiplier = _mm256_set1_epi32(1103515245); | |
const __m256i random_increment = _mm256_set1_epi32(12345); | |
const __m256i random_mask = _mm256_set1_epi32(0x7fffffff); | |
/* Pre-computed vectors required by floatize */ | |
const __m256i floatize_mantissa_mask = _mm256_set1_epi32(0x007fffff); | |
const __m256i floatize_exponent = _mm256_set1_epi32(0x3f800000); | |
const __m256 floatize_minus_one = _mm256_set1_ps(-1.0f); | |
/* Needed for later comparisons */ | |
const __m256 one = _mm256_set1_ps(1.0f); | |
uint32_t compare_results[8] __attribute__((aligned(32))) = {0}; | |
/* Number of points inside the circle and iterator, respectively */ | |
uint64_t hits = 0; | |
#pragma omp parallel private(compare_results) reduction(+:hits) | |
{ | |
uint32_t initial_seeds[8] __attribute__((aligned(32))) = {1, 2, 3, 4, 5, 6, 7, 8}; | |
for (int i = 0; i < 8; ++i) | |
initial_seeds[i] += omp_get_thread_num(); | |
/* Initialize random integers values of x and y */ | |
__m256i randomx_epi32, randomy_epi32; | |
randomx_epi32 = _mm256_load_si256((__m256i const *) initial_seeds); | |
randomx_epi32 = random_next(randomx_epi32, random_multiplier, random_increment, random_mask); | |
randomy_epi32 = random_next(randomx_epi32, random_multiplier, random_increment, random_mask); | |
#pragma omp for | |
for (uint64_t i = 0; i < iter; ++i) { | |
/* Create the next vector of random numbers */ | |
randomx_epi32 = random_next(randomx_epi32, random_multiplier, random_increment, random_mask); | |
randomy_epi32 = random_next(randomx_epi32, random_multiplier, random_increment, random_mask); | |
/* Floating point values generated from the random integer vectors */ | |
__m256 randomx_sp, randomy_sp; | |
randomx_sp = floatize(randomx_epi32, floatize_mantissa_mask, floatize_exponent, floatize_minus_one); | |
randomy_sp = floatize(randomy_epi32, floatize_mantissa_mask, floatize_exponent, floatize_minus_one); | |
/* Calculate squared distances from origin (dist = x^2 + y^2) */ | |
__m256 dist; | |
dist = _mm256_mul_ps(randomx_sp, randomx_sp); | |
dist = _mm256_fmadd_ps(randomy_sp, randomy_sp, dist); | |
/* Compare distances to check if points are inside the unit circle. Count points sequentially, as vector POPCNT | |
* requires AVX-512, which I don't have (sad, I know) */ | |
dist = _mm256_cmp_ps(dist, one, _CMP_LE_OQ); | |
_mm256_store_ps((float *) compare_results, dist); | |
uint64_t loop_hits = 0; | |
#pragma GCC unroll 2 /* Tested and proved to be better unrolling */ | |
for (size_t j = 0; j < 8; ++j) | |
loop_hits = compare_results[j] ? (loop_hits + 1) : loop_hits; | |
hits += loop_hits; | |
} | |
} | |
return ((double) hits / ((double) iter * 8)) * 4.0; /* Note: 8 - points per iteration (in vector) */ | |
} | |
#define ITER_COUNT 0xfffffff /* The limits of the linear congruential generator being used (UINT_MAX / vector size). */ | |
int main(void) { | |
printf("%.*lf\n", 10, pi(ITER_COUNT)); | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment