Skip to content

Instantly share code, notes, and snippets.

@voidbert
Last active November 27, 2023 11:50
Show Gist options
  • Save voidbert/72f437466fea9228f0ae58e05ec931a7 to your computer and use it in GitHub Desktop.
Save voidbert/72f437466fea9228f0ae58e05ec931a7 to your computer and use it in GitHub Desktop.
Calculate Pi with Monte Carlo and SIMD
/**
* I tried to calculate pi using Monte Carlo's method. It proved to be a very inefficient way to do it, but I loved
* the experience of writing SIMD intrinsics for the first time!
*
* This method is not ideal for various reasons:
*
* - Monte Carlo's method is, though technically not brute-force, slow. Trying a very large sample of points is a
* lengthier process than calculating a series up to a given number of terms, which will likely converge to pi
* faster than this method.
*
* - Using a linear congruential pseudo-random generator with a 32-bit seed limits the maximum number of samples to
* 2^32. In actuality, this value will be lower, as not all generated bits are used for generating the mantissa of
* the floating-point numbers actually used in samples. Increasing the generator's seed width / floating point
* precision would decrease the performance at least by half.
*
* How to compile (your architecture must support AVX): gcc -fopenmp -march=native -O3 -g pi.c
*
* Some benchmarks (on my i3-7100):
*
* - This program: 1.27s - 5 correct decimal places
* - ycruncher: 2.61s - 25 000 000 correct decimal places
*
* I'd say my program's digit rate doesn't seem very good.
*/
#include <inttypes.h>
#include <stdio.h>
#include <stdlib.h>
#include <immintrin.h>
#include <omp.h>
/* Linear congruential generator: generate a set of random integers from the previously generated vetor */
inline __m256i random_next(__m256i current, /* Vector of 32-bit seeds */
__m256i multiplier, /* Generator's multiplicative constant, broadcasted to all elements */
__m256i increment, /* Generator's incremental constant, broadcasted to all elements */
__m256i mask) /* Mask to be ANDed with the final result, before being returned */
__attribute__((always_inline));
inline __m256i random_next(__m256i current, __m256i multiplier, __m256i increment, __m256i mask) {
__m256i ret = current;
ret = _mm256_mullo_epi32(ret, multiplier);
ret = _mm256_add_epi32(ret, increment);
ret = _mm256_and_si256(ret, mask);
return ret;
}
/* Transform a vector of random integers into a vector of random floats from 0 to 1 */
inline __m256 floatize(__m256i random, /* Random vector of 32-bit integers (from random_next) */
__m256i mantissa_mask, /* A float with 1 in all mantissa digits, broadcasted to all vector elements */
__m256i exponent, /* The bits of the exponent of numbers from 1 to 2, broadcasted to all vector elements */
__m256 minus_one) /* -1.0f, broadcasted to all vector elements */
__attribute__((always_inline));
inline __m256 floatize(__m256i random, __m256i mantissa_mask, __m256i exponent, __m256 minus_one) {
__m256i ret = random;
ret = _mm256_and_si256(ret, mantissa_mask);
ret = _mm256_or_si256(ret, exponent);
return (__m256) _mm256_add_ps((__m256) ret, minus_one);
}
/* Calculates pi using the Monte Carlo Method. */
double pi(size_t iter) {
/* Values from linear congruential generator in glibc */
const __m256i random_multiplier = _mm256_set1_epi32(1103515245);
const __m256i random_increment = _mm256_set1_epi32(12345);
const __m256i random_mask = _mm256_set1_epi32(0x7fffffff);
/* Pre-computed vectors required by floatize */
const __m256i floatize_mantissa_mask = _mm256_set1_epi32(0x007fffff);
const __m256i floatize_exponent = _mm256_set1_epi32(0x3f800000);
const __m256 floatize_minus_one = _mm256_set1_ps(-1.0f);
/* Needed for later comparisons */
const __m256 one = _mm256_set1_ps(1.0f);
uint32_t compare_results[8] __attribute__((aligned(32))) = {0};
/* Number of points inside the circle and iterator, respectively */
uint64_t hits = 0;
#pragma omp parallel private(compare_results) reduction(+:hits)
{
uint32_t initial_seeds[8] __attribute__((aligned(32))) = {1, 2, 3, 4, 5, 6, 7, 8};
for (int i = 0; i < 8; ++i)
initial_seeds[i] += omp_get_thread_num();
/* Initialize random integers values of x and y */
__m256i randomx_epi32, randomy_epi32;
randomx_epi32 = _mm256_load_si256((__m256i const *) initial_seeds);
randomx_epi32 = random_next(randomx_epi32, random_multiplier, random_increment, random_mask);
randomy_epi32 = random_next(randomx_epi32, random_multiplier, random_increment, random_mask);
#pragma omp for
for (uint64_t i = 0; i < iter; ++i) {
/* Create the next vector of random numbers */
randomx_epi32 = random_next(randomx_epi32, random_multiplier, random_increment, random_mask);
randomy_epi32 = random_next(randomx_epi32, random_multiplier, random_increment, random_mask);
/* Floating point values generated from the random integer vectors */
__m256 randomx_sp, randomy_sp;
randomx_sp = floatize(randomx_epi32, floatize_mantissa_mask, floatize_exponent, floatize_minus_one);
randomy_sp = floatize(randomy_epi32, floatize_mantissa_mask, floatize_exponent, floatize_minus_one);
/* Calculate squared distances from origin (dist = x^2 + y^2) */
__m256 dist;
dist = _mm256_mul_ps(randomx_sp, randomx_sp);
dist = _mm256_fmadd_ps(randomy_sp, randomy_sp, dist);
/* Compare distances to check if points are inside the unit circle. Count points sequentially, as vector POPCNT
* requires AVX-512, which I don't have (sad, I know) */
dist = _mm256_cmp_ps(dist, one, _CMP_LE_OQ);
_mm256_store_ps((float *) compare_results, dist);
uint64_t loop_hits = 0;
#pragma GCC unroll 2 /* Tested and proved to be better unrolling */
for (size_t j = 0; j < 8; ++j)
loop_hits = compare_results[j] ? (loop_hits + 1) : loop_hits;
hits += loop_hits;
}
}
return ((double) hits / ((double) iter * 8)) * 4.0; /* Note: 8 - points per iteration (in vector) */
}
#define ITER_COUNT 0xfffffff /* The limits of the linear congruential generator being used (UINT_MAX / vector size). */
int main(void) {
printf("%.*lf\n", 10, pi(ITER_COUNT));
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment