voidbert/pi.c

## pi.c
/**
 * I tried to calculate pi using Monte Carlo's method. It proved to be a very inefficient way to do it, but I loved
 * the experience of writing SIMD intrinsics for the first time!
 *
 * This method is not ideal for various reasons:
 *
 *  - Monte Carlo's method is, though technically not brute-force, slow. Trying a very large sample of points is a
 *    lengthier process than calculating a series up to a given number of terms, which will likely converge to pi
 *    faster than this method.
 *
 *  - Using a linear congruential pseudo-random generator with a 32-bit seed limits the maximum number of samples to
 *    2^32. In actuality, this value will be lower, as not all generated bits are used for generating the mantissa of
 *    the floating-point numbers actually used in samples. Increasing the generator's seed width / floating point
 *    precision would decrease the performance at least by half.
 *
 * How to compile (your architecture must support AVX): gcc -fopenmp -march=native -O3 -g pi.c
 *
 * Some benchmarks (on my i3-7100):
 *
 *  - This program: 1.27s - 5 correct decimal places
 *  - ycruncher:    2.61s - 25 000 000 correct decimal places
 *
 * I'd say my program's digit rate doesn't seem very good.
 */

#include <inttypes.h>
#include <stdio.h>
#include <stdlib.h>

#include <immintrin.h>
#include <omp.h>

/* Linear congruential generator: generate a set of random integers from the previously generated vetor */
inline __m256i random_next(__m256i current,    /* Vector of 32-bit seeds */
                           __m256i multiplier, /* Generator's multiplicative constant, broadcasted to all elements */
                           __m256i increment,  /* Generator's incremental constant, broadcasted to all elements */
                           __m256i mask)       /* Mask to be ANDed with the final result, before being returned */
                           __attribute__((always_inline));

inline __m256i random_next(__m256i current, __m256i multiplier, __m256i increment, __m256i mask) {
    __m256i ret = current;

    ret = _mm256_mullo_epi32(ret, multiplier);
    ret = _mm256_add_epi32(ret, increment);
    ret = _mm256_and_si256(ret, mask);

    return ret;
}

/* Transform a vector of random integers into a vector of random floats from 0 to 1 */
inline __m256 floatize(__m256i random,        /* Random vector of 32-bit integers (from random_next) */
                       __m256i mantissa_mask, /* A float with 1 in all mantissa digits, broadcasted to all vector elements */
                       __m256i exponent,      /* The bits of the exponent of numbers from 1 to 2, broadcasted to all vector elements */
                       __m256 minus_one)      /* -1.0f, broadcasted to all vector elements */
                       __attribute__((always_inline));

inline __m256 floatize(__m256i random, __m256i mantissa_mask, __m256i exponent, __m256 minus_one) {
    __m256i ret = random;

    ret = _mm256_and_si256(ret, mantissa_mask);
    ret = _mm256_or_si256(ret, exponent);

    return (__m256) _mm256_add_ps((__m256) ret, minus_one);
}

/* Calculates pi using the Monte Carlo Method. */
double pi(size_t iter) {
    /* Values from linear congruential generator in glibc */
    const __m256i random_multiplier = _mm256_set1_epi32(1103515245);
    const __m256i random_increment  = _mm256_set1_epi32(12345);
    const __m256i random_mask       = _mm256_set1_epi32(0x7fffffff);

    /* Pre-computed vectors required by floatize */
    const __m256i floatize_mantissa_mask  = _mm256_set1_epi32(0x007fffff);
    const __m256i floatize_exponent       = _mm256_set1_epi32(0x3f800000);
    const __m256  floatize_minus_one      = _mm256_set1_ps(-1.0f);

    /* Needed for later comparisons */
    const __m256 one = _mm256_set1_ps(1.0f);
    uint32_t compare_results[8] __attribute__((aligned(32))) = {0};

    /* Number of points inside the circle and iterator, respectively */
    uint64_t hits = 0;

    #pragma omp parallel private(compare_results) reduction(+:hits)
    {
        uint32_t initial_seeds[8] __attribute__((aligned(32))) = {1, 2, 3, 4, 5, 6, 7, 8};
        for (int i = 0; i < 8; ++i)
                initial_seeds[i] += omp_get_thread_num();

        /* Initialize random integers values of x and y */
        __m256i randomx_epi32, randomy_epi32;
        randomx_epi32 = _mm256_load_si256((__m256i const *) initial_seeds);
        randomx_epi32 = random_next(randomx_epi32, random_multiplier, random_increment, random_mask);
        randomy_epi32 = random_next(randomx_epi32, random_multiplier, random_increment, random_mask);

        #pragma omp for
        for (uint64_t i = 0; i < iter; ++i) {
            /* Create the next vector of random numbers */
            randomx_epi32 = random_next(randomx_epi32, random_multiplier, random_increment, random_mask);
            randomy_epi32 = random_next(randomx_epi32, random_multiplier, random_increment, random_mask);

            /* Floating point values generated from the random integer vectors */
            __m256 randomx_sp, randomy_sp;
            randomx_sp = floatize(randomx_epi32, floatize_mantissa_mask, floatize_exponent, floatize_minus_one);
            randomy_sp = floatize(randomy_epi32, floatize_mantissa_mask, floatize_exponent, floatize_minus_one);

            /* Calculate squared distances from origin (dist = x^2 + y^2) */
            __m256 dist;
            dist = _mm256_mul_ps(randomx_sp, randomx_sp);
            dist = _mm256_fmadd_ps(randomy_sp, randomy_sp, dist);

            /* Compare distances to check if points are inside the unit circle. Count points sequentially, as vector POPCNT
             * requires AVX-512, which I don't have (sad, I know) */
            dist = _mm256_cmp_ps(dist, one, _CMP_LE_OQ);
            _mm256_store_ps((float *) compare_results, dist);

            uint64_t loop_hits = 0;
            #pragma GCC unroll 2 /* Tested and proved to be better unrolling */
            for (size_t j = 0; j < 8; ++j)
                loop_hits = compare_results[j] ? (loop_hits + 1) : loop_hits;
            hits += loop_hits;
        }
    }

    return ((double) hits / ((double) iter * 8)) * 4.0; /* Note: 8 - points per iteration (in vector) */
}

#define ITER_COUNT 0xfffffff /* The limits of the linear congruential generator being used (UINT_MAX / vector size). */

int main(void) {
    printf("%.*lf\n", 10, pi(ITER_COUNT));
}
	/**
	* I tried to calculate pi using Monte Carlo's method. It proved to be a very inefficient way to do it, but I loved
	* the experience of writing SIMD intrinsics for the first time!
	*
	* This method is not ideal for various reasons:
	*
	* - Monte Carlo's method is, though technically not brute-force, slow. Trying a very large sample of points is a
	* lengthier process than calculating a series up to a given number of terms, which will likely converge to pi
	* faster than this method.
	*
	* - Using a linear congruential pseudo-random generator with a 32-bit seed limits the maximum number of samples to
	* 2^32. In actuality, this value will be lower, as not all generated bits are used for generating the mantissa of
	* the floating-point numbers actually used in samples. Increasing the generator's seed width / floating point
	* precision would decrease the performance at least by half.
	*
	* How to compile (your architecture must support AVX): gcc -fopenmp -march=native -O3 -g pi.c
	*
	* Some benchmarks (on my i3-7100):
	*
	* - This program: 1.27s - 5 correct decimal places
	* - ycruncher: 2.61s - 25 000 000 correct decimal places
	*
	* I'd say my program's digit rate doesn't seem very good.
	*/

	#include <inttypes.h>
	#include <stdio.h>
	#include <stdlib.h>

	#include <immintrin.h>
	#include <omp.h>

	/* Linear congruential generator: generate a set of random integers from the previously generated vetor */
	inline __m256i random_next(__m256i current, /* Vector of 32-bit seeds */
	__m256i multiplier, /* Generator's multiplicative constant, broadcasted to all elements */
	__m256i increment, /* Generator's incremental constant, broadcasted to all elements */
	__m256i mask) /* Mask to be ANDed with the final result, before being returned */
	__attribute__((always_inline));

	inline __m256i random_next(__m256i current, __m256i multiplier, __m256i increment, __m256i mask) {
	__m256i ret = current;

	ret = _mm256_mullo_epi32(ret, multiplier);
	ret = _mm256_add_epi32(ret, increment);
	ret = _mm256_and_si256(ret, mask);

	return ret;
	}

	/* Transform a vector of random integers into a vector of random floats from 0 to 1 */
	inline __m256 floatize(__m256i random, /* Random vector of 32-bit integers (from random_next) */
	__m256i mantissa_mask, /* A float with 1 in all mantissa digits, broadcasted to all vector elements */
	__m256i exponent, /* The bits of the exponent of numbers from 1 to 2, broadcasted to all vector elements */
	__m256 minus_one) /* -1.0f, broadcasted to all vector elements */
	__attribute__((always_inline));

	inline __m256 floatize(__m256i random, __m256i mantissa_mask, __m256i exponent, __m256 minus_one) {
	__m256i ret = random;

	ret = _mm256_and_si256(ret, mantissa_mask);
	ret = _mm256_or_si256(ret, exponent);

	return (__m256) _mm256_add_ps((__m256) ret, minus_one);
	}

	/* Calculates pi using the Monte Carlo Method. */
	double pi(size_t iter) {
	/* Values from linear congruential generator in glibc */
	const __m256i random_multiplier = _mm256_set1_epi32(1103515245);
	const __m256i random_increment = _mm256_set1_epi32(12345);
	const __m256i random_mask = _mm256_set1_epi32(0x7fffffff);

	/* Pre-computed vectors required by floatize */
	const __m256i floatize_mantissa_mask = _mm256_set1_epi32(0x007fffff);
	const __m256i floatize_exponent = _mm256_set1_epi32(0x3f800000);
	const __m256 floatize_minus_one = _mm256_set1_ps(-1.0f);

	/* Needed for later comparisons */
	const __m256 one = _mm256_set1_ps(1.0f);
	uint32_t compare_results[8] __attribute__((aligned(32))) = {0};

	/* Number of points inside the circle and iterator, respectively */
	uint64_t hits = 0;

	#pragma omp parallel private(compare_results) reduction(+:hits)
	{
	uint32_t initial_seeds[8] __attribute__((aligned(32))) = {1, 2, 3, 4, 5, 6, 7, 8};
	for (int i = 0; i < 8; ++i)
	initial_seeds[i] += omp_get_thread_num();

	/* Initialize random integers values of x and y */
	__m256i randomx_epi32, randomy_epi32;
	randomx_epi32 = _mm256_load_si256((__m256i const *) initial_seeds);
	randomx_epi32 = random_next(randomx_epi32, random_multiplier, random_increment, random_mask);
	randomy_epi32 = random_next(randomx_epi32, random_multiplier, random_increment, random_mask);

	#pragma omp for
	for (uint64_t i = 0; i < iter; ++i) {
	/* Create the next vector of random numbers */
	randomx_epi32 = random_next(randomx_epi32, random_multiplier, random_increment, random_mask);
	randomy_epi32 = random_next(randomx_epi32, random_multiplier, random_increment, random_mask);

	/* Floating point values generated from the random integer vectors */
	__m256 randomx_sp, randomy_sp;
	randomx_sp = floatize(randomx_epi32, floatize_mantissa_mask, floatize_exponent, floatize_minus_one);
	randomy_sp = floatize(randomy_epi32, floatize_mantissa_mask, floatize_exponent, floatize_minus_one);

	/* Calculate squared distances from origin (dist = x^2 + y^2) */
	__m256 dist;
	dist = _mm256_mul_ps(randomx_sp, randomx_sp);
	dist = _mm256_fmadd_ps(randomy_sp, randomy_sp, dist);

	/* Compare distances to check if points are inside the unit circle. Count points sequentially, as vector POPCNT
	* requires AVX-512, which I don't have (sad, I know) */
	dist = _mm256_cmp_ps(dist, one, _CMP_LE_OQ);
	_mm256_store_ps((float *) compare_results, dist);

	uint64_t loop_hits = 0;
	#pragma GCC unroll 2 /* Tested and proved to be better unrolling */
	for (size_t j = 0; j < 8; ++j)
	loop_hits = compare_results[j] ? (loop_hits + 1) : loop_hits;
	hits += loop_hits;
	}
	}

	return ((double) hits / ((double) iter * 8)) * 4.0; /* Note: 8 - points per iteration (in vector) */
	}

	#define ITER_COUNT 0xfffffff /* The limits of the linear congruential generator being used (UINT_MAX / vector size). */

	int main(void) {
	printf("%.*lf\n", 10, pi(ITER_COUNT));
	}