bisqwit/simdmandel.hh

## simdmandel.hh
/**
 * Mandelbrot fractal portable SIMD calculation core
 * Copyright (c) 2017 Joel Yliluoma (http://iki.fi/bisqwit/)
 */
#include <array>
#include <cmath>
/*
 * SIMDMandelCalculator template:
 *
 * T is the underlying (real) numeric type, with which the calculations
 * are peformed. It is ideally float or double.
 * This type directly controls the maximum zoom depth in the fractal.
 * Ideally we would use fixed-point, because only the number of bits in the mantissa
 * matter, and with 64-bit integers we can have almost 63 bits of mantissa,
 * while with a 64-bit IEEE double we only get 52 bits of mantissa.
 * However, currently this engine only supports floating pointer numbers.
 *
 * N is the number of mandelbrot coordinates to calculate in parallel (vector size).
 * Ideally N should be a small integer multiple of the largest native vector size for T.
 * Some options:
 *                MMX SSE SSE2 AVX AVX512 Altivec Neon
 *   ------------ --- --- ---- --- ------ ------- ----
 *   float16      *   *   *    *   *      *       8
 *   float        2   4   4    8   16     4       4
 *   double       1   1   2    4   8      1       2
 *   long double  1   1   1    1   1      1       1
 *
 * If you use GCC or Clang, compilation with -march=native
 * and either -O3, -Ofast or -ftree-vectorize is suggested.
 * -fopenmp may also help (#pragma omp simd tells the compiler to try harder at vectorizing).
 *
 * If do_smooth is enabled, points that are outside the set are given
 * a smooth (floating point) iteration count rather than a discrete
 * (integral) iteration count. This incurs a slight performance penalty.
 * If you use this, you should also set iter_mul to e.g. 4096.
 *
 * If do_periodicity_checking is enabled, light-weight periodicity checking
 * is enabled to improve the performance significantly for points that are
 * inside the set. It has a slight performance penalty for points that are outside the set.
 *
 * If do_logarithmic_iteration_counts is enabled, a log() function is applied
 * to the returned iteration counts. You should also set iter_mul to e.g. 4096.
 *
 * The returned iteration counts (float) are multiplied by iter_mul before
 * they are converted into integers and returned (fixed-point).
 */
template<typename T, unsigned N,
         bool do_smooth = true                        /* S */,
         bool do_periodicity_checking = true          /* P */,
         bool do_logarithmic_iteration_counts = false /* L */,
         unsigned iter_mul = 1                        /* I */>
struct SIMDMandelCalculator
{
    static constexpr T color_scale           = 8;
    static constexpr T escape_radius_squared = do_smooth ? 6*6 : 4;

    typedef std::array<T,   N> dvec;
    typedef std::array<int, N> bvec;

    dvec x,y;           // The "c", complex coordinate we are iterating
    dvec r,i;           // The "z", temporary complex number
    dvec dist;          // Last recorded squared distance before iteration was stopped
    //                  // If do_smooth is false, it continues being updated
    //                  // even then, for a moderate improvement in performance.
    dvec perr,peri;     // A recorded sample of "z" used in periodicity checking
    bvec escaped;       // Flag used to stop iterating (indicates whether an escape happened)
    bvec iter;          // The iteration count
    bvec maxiter;       // Maximum iteration count

    /* Constructor: Initialize calculation for the specified number of maximum iterations, at given coordinates */
    SIMDMandelCalculator(unsigned MaxIter, const T* cr, const T* ci)
    {
        #pragma omp simd
        for(unsigned n=0; n<N; ++n)
        {
            x[n]       = cr[n];
            y[n]       = ci[n];
            r[n]       = cr[n];
            i[n]       = ci[n];
            maxiter[n] = MaxIter;
            iter[n]    = MaxIter;
            perr[n]    = 0;
            peri[n]    = 0;
            dist[n]    = 0;
            escaped[n] = 0;
        }
    }

    /* End(): Returns true if all N coordinates have escaped or MaxIter iterations has been reached */
    bool End() const
    {
        typename bvec::value_type n_escaped = 0;
        #pragma omp simd
        for(unsigned n=0; n<N; ++n) n_escaped += escaped[n];
        return n_escaped >= typename bvec::value_type(N);
    }

    /* Round(): Performs one iteration for all N coordinates */
    void Round()
    {
        dvec r2, i2, ri;
        /* Note: GCC generates slightly better performing code, when this code is
         * in a single loop than when the statements are each in separate loops.
         * However, if one of the statements fails to be vectorized, the entire loop
         * will be de-vectorized. If that happens (inspect the code with gcc -S!),
         * try splitting the loop into separate consecutive loops.
         */
        #pragma omp simd
        for(unsigned n=0; n<N; ++n) { r2[n] = (r[n] * r[n]);
                                      i2[n] = (i[n] * i[n]);
                                      if(!do_smooth || !escaped[n]) { dist[n] = r2[n] + i2[n]; }
                                      escaped[n] = escaped[n] || !iter[n] || (dist[n] >= escape_radius_squared);
                                      iter[n] -= !escaped[n];
                                      ri[n] = (r[n] * i[n]);
                                      i[n] = y[n] + (ri[n] * 2);
                                      r[n] = x[n] + (r2[n] - i2[n]); }

        /* Periodicity checking produces a significant speedup for coordinates
         * that are inside the set (especially if they go into a cycle early on),
         * but at a cost of a slight performance penalty for coordinates that are not.
         */
        if(do_periodicity_checking)
        {
            bvec moment;
            /* When the number of remaining iterations is a power of two,
             * save the current complex number (r & i).
             * Otherwise, compare it to the saved number. If they match, we are inside the set,
             * and can set the iteration count to max immediately.
             */
            #pragma omp simd
            for(unsigned n=0; n<N; ++n) { int tmp = iter[n]; moment[n] = tmp & (tmp-1); }
            #pragma omp simd
            for(unsigned n=0; n<N; ++n) { if(!moment[n])                              { perr[n] = r[n]; peri[n] = i[n]; }
                                          else if(r[n] == perr[n] && i[n] == peri[n]) { iter[n] = 0; } }
        }
    }

    /* Get(): Get the translated iteration counts for all N coordinates according
     *        to the current progress (call after End()==true, but can be called before) */
    std::array<int,N> Get() const
    {
        std::array<int,N> out;
        if(!do_logarithmic_iteration_counts && iter_mul == 1)
        {
            #pragma omp simd
            for(unsigned n=0; n<N; ++n) out[n] = maxiter[n] - iter[n];
            return out;
        }
        dvec iter_f;
        #pragma omp simd
        for(unsigned n=0; n<N; ++n) iter_f[n] = maxiter[n] - iter[n];
        #pragma omp simd
        for(unsigned n=0; n<N; ++n)
            if(do_smooth && iter[n] != 0)
                iter_f[n] += 1 - std::log2(std::log2(dist[n]) / T(2));
        #pragma omp simd
        for(unsigned n=0; n<N; ++n) iter_f[n] = (do_logarithmic_iteration_counts ? std::log(iter_f[n]) : iter_f[n]) * color_scale;
        #pragma omp simd
        for(unsigned n=0; n<N; ++n) out[n] = iter_f[n] * iter_mul;
        return out;
    }
};

/* Utility example: Calculate for N coordinates, return iteration counts as array */
template<unsigned N, bool S,bool P=true,bool L=true,unsigned I, typename T>
std::array<int,N> Iterate(unsigned MaxIter, const T* x, const T* y)
{
    SIMDMandelCalculator<T,N,  S,P,L,I> calc(MaxIter, x,y);
    while(!calc.End()) { calc.Round(); }
    return calc.Get();
}

/* Utility example: Calculate for N coordinates passed as std::array, return iteration counts as array */
template<bool S,bool P=true,bool L=true,unsigned I, typename T,std::size_t N>
std::array<int,N> Iterate(unsigned MaxIter, const std::array<T,N>& x, const std::array<T,N>& y)
{
    return Iterate<N, S,P,L,I>(MaxIter, &x[0], &y[0]);
}

/* Utility example: Calculate for N coordinates where N is not a compile-time constant */
template<bool S,bool P=true,bool L=true,unsigned I, typename T>
void Iterate(unsigned MaxIter, const T* x, const T* y, int* out, unsigned N)
{
    while(N >= 16)
    {
        auto tmp = Iterate<16, S,P,L,I>(MaxIter, x,y); for(unsigned n=0; n<16; ++n) out[n] = tmp[n];
        x += 16; y += 16; out += 16; N -= 16;
    }
    if(N == 8)
    {
        auto tmp = Iterate< 8, S,P,L,I>(MaxIter, x,y); for(unsigned n=0; n< 8; ++n) out[n] = tmp[n];
        return;
    }
    while(N >= 4)
    {
        auto tmp = Iterate< 4, S,P,L,I>(MaxIter, x,y); for(unsigned n=0; n< 4; ++n) out[n] = tmp[n];
        x += 4; y += 4; out += 4; N -= 4;
    }
    while(N >= 1)
    {
        auto tmp = Iterate< 1, S,P,L,I>(MaxIter, x,y); for(unsigned n=0; n< 1; ++n) out[n] = tmp[n];
        x += 1; y += 1; out += 1; N -= 1;
    }
}
	/**
	* Mandelbrot fractal portable SIMD calculation core
	* Copyright (c) 2017 Joel Yliluoma (http://iki.fi/bisqwit/)
	*/
	#include <array>
	#include <cmath>
	/*
	* SIMDMandelCalculator template:
	*
	* T is the underlying (real) numeric type, with which the calculations
	* are peformed. It is ideally float or double.
	* This type directly controls the maximum zoom depth in the fractal.
	* Ideally we would use fixed-point, because only the number of bits in the mantissa
	* matter, and with 64-bit integers we can have almost 63 bits of mantissa,
	* while with a 64-bit IEEE double we only get 52 bits of mantissa.
	* However, currently this engine only supports floating pointer numbers.
	*
	* N is the number of mandelbrot coordinates to calculate in parallel (vector size).
	* Ideally N should be a small integer multiple of the largest native vector size for T.
	* Some options:
	* MMX SSE SSE2 AVX AVX512 Altivec Neon
	* ------------ --- --- ---- --- ------ ------- ----
	* float16 * * * * * * 8
	* float 2 4 4 8 16 4 4
	* double 1 1 2 4 8 1 2
	* long double 1 1 1 1 1 1 1
	*
	* If you use GCC or Clang, compilation with -march=native
	* and either -O3, -Ofast or -ftree-vectorize is suggested.
	* -fopenmp may also help (#pragma omp simd tells the compiler to try harder at vectorizing).
	*
	* If do_smooth is enabled, points that are outside the set are given
	* a smooth (floating point) iteration count rather than a discrete
	* (integral) iteration count. This incurs a slight performance penalty.
	* If you use this, you should also set iter_mul to e.g. 4096.
	*
	* If do_periodicity_checking is enabled, light-weight periodicity checking
	* is enabled to improve the performance significantly for points that are
	* inside the set. It has a slight performance penalty for points that are outside the set.
	*
	* If do_logarithmic_iteration_counts is enabled, a log() function is applied
	* to the returned iteration counts. You should also set iter_mul to e.g. 4096.
	*
	* The returned iteration counts (float) are multiplied by iter_mul before
	* they are converted into integers and returned (fixed-point).
	*/
	template<typename T, unsigned N,
	bool do_smooth = true /* S */,
	bool do_periodicity_checking = true /* P */,
	bool do_logarithmic_iteration_counts = false /* L */,
	unsigned iter_mul = 1 /* I */>
	struct SIMDMandelCalculator
	{
	static constexpr T color_scale = 8;
	static constexpr T escape_radius_squared = do_smooth ? 6*6 : 4;

	typedef std::array<T, N> dvec;
	typedef std::array<int, N> bvec;

	dvec x,y; // The "c", complex coordinate we are iterating
	dvec r,i; // The "z", temporary complex number
	dvec dist; // Last recorded squared distance before iteration was stopped
	// // If do_smooth is false, it continues being updated
	// // even then, for a moderate improvement in performance.
	dvec perr,peri; // A recorded sample of "z" used in periodicity checking
	bvec escaped; // Flag used to stop iterating (indicates whether an escape happened)
	bvec iter; // The iteration count
	bvec maxiter; // Maximum iteration count

	/* Constructor: Initialize calculation for the specified number of maximum iterations, at given coordinates */
	SIMDMandelCalculator(unsigned MaxIter, const T* cr, const T* ci)
	{
	#pragma omp simd
	for(unsigned n=0; n<N; ++n)
	{
	x[n] = cr[n];
	y[n] = ci[n];
	r[n] = cr[n];
	i[n] = ci[n];
	maxiter[n] = MaxIter;
	iter[n] = MaxIter;
	perr[n] = 0;
	peri[n] = 0;
	dist[n] = 0;
	escaped[n] = 0;
	}
	}

	/* End(): Returns true if all N coordinates have escaped or MaxIter iterations has been reached */
	bool End() const
	{
	typename bvec::value_type n_escaped = 0;
	#pragma omp simd
	for(unsigned n=0; n<N; ++n) n_escaped += escaped[n];
	return n_escaped >= typename bvec::value_type(N);
	}

	/* Round(): Performs one iteration for all N coordinates */
	void Round()
	{
	dvec r2, i2, ri;
	/* Note: GCC generates slightly better performing code, when this code is
	* in a single loop than when the statements are each in separate loops.
	* However, if one of the statements fails to be vectorized, the entire loop
	* will be de-vectorized. If that happens (inspect the code with gcc -S!),
	* try splitting the loop into separate consecutive loops.
	*/
	#pragma omp simd
	for(unsigned n=0; n<N; ++n) { r2[n] = (r[n] * r[n]);
	i2[n] = (i[n] * i[n]);
	if(!do_smooth \|\| !escaped[n]) { dist[n] = r2[n] + i2[n]; }
	escaped[n] = escaped[n] \|\| !iter[n] \|\| (dist[n] >= escape_radius_squared);
	iter[n] -= !escaped[n];
	ri[n] = (r[n] * i[n]);
	i[n] = y[n] + (ri[n] * 2);
	r[n] = x[n] + (r2[n] - i2[n]); }

	/* Periodicity checking produces a significant speedup for coordinates
	* that are inside the set (especially if they go into a cycle early on),
	* but at a cost of a slight performance penalty for coordinates that are not.
	*/
	if(do_periodicity_checking)
	{
	bvec moment;
	/* When the number of remaining iterations is a power of two,
	* save the current complex number (r & i).
	* Otherwise, compare it to the saved number. If they match, we are inside the set,
	* and can set the iteration count to max immediately.
	*/
	#pragma omp simd
	for(unsigned n=0; n<N; ++n) { int tmp = iter[n]; moment[n] = tmp & (tmp-1); }
	#pragma omp simd
	for(unsigned n=0; n<N; ++n) { if(!moment[n]) { perr[n] = r[n]; peri[n] = i[n]; }
	else if(r[n] == perr[n] && i[n] == peri[n]) { iter[n] = 0; } }
	}
	}

	/* Get(): Get the translated iteration counts for all N coordinates according
	* to the current progress (call after End()==true, but can be called before) */
	std::array<int,N> Get() const
	{
	std::array<int,N> out;
	if(!do_logarithmic_iteration_counts && iter_mul == 1)
	{
	#pragma omp simd
	for(unsigned n=0; n<N; ++n) out[n] = maxiter[n] - iter[n];
	return out;
	}
	dvec iter_f;
	#pragma omp simd
	for(unsigned n=0; n<N; ++n) iter_f[n] = maxiter[n] - iter[n];
	#pragma omp simd
	for(unsigned n=0; n<N; ++n)
	if(do_smooth && iter[n] != 0)
	iter_f[n] += 1 - std::log2(std::log2(dist[n]) / T(2));
	#pragma omp simd
	for(unsigned n=0; n<N; ++n) iter_f[n] = (do_logarithmic_iteration_counts ? std::log(iter_f[n]) : iter_f[n]) * color_scale;
	#pragma omp simd
	for(unsigned n=0; n<N; ++n) out[n] = iter_f[n] * iter_mul;
	return out;
	}
	};

	/* Utility example: Calculate for N coordinates, return iteration counts as array */
	template<unsigned N, bool S,bool P=true,bool L=true,unsigned I, typename T>
	std::array<int,N> Iterate(unsigned MaxIter, const T* x, const T* y)
	{
	SIMDMandelCalculator<T,N, S,P,L,I> calc(MaxIter, x,y);
	while(!calc.End()) { calc.Round(); }
	return calc.Get();
	}

	/* Utility example: Calculate for N coordinates passed as std::array, return iteration counts as array */
	template<bool S,bool P=true,bool L=true,unsigned I, typename T,std::size_t N>
	std::array<int,N> Iterate(unsigned MaxIter, const std::array<T,N>& x, const std::array<T,N>& y)
	{
	return Iterate<N, S,P,L,I>(MaxIter, &x[0], &y[0]);
	}

	/* Utility example: Calculate for N coordinates where N is not a compile-time constant */
	template<bool S,bool P=true,bool L=true,unsigned I, typename T>
	void Iterate(unsigned MaxIter, const T* x, const T* y, int* out, unsigned N)
	{
	while(N >= 16)
	{
	auto tmp = Iterate<16, S,P,L,I>(MaxIter, x,y); for(unsigned n=0; n<16; ++n) out[n] = tmp[n];
	x += 16; y += 16; out += 16; N -= 16;
	}
	if(N == 8)
	{
	auto tmp = Iterate< 8, S,P,L,I>(MaxIter, x,y); for(unsigned n=0; n< 8; ++n) out[n] = tmp[n];
	return;
	}
	while(N >= 4)
	{
	auto tmp = Iterate< 4, S,P,L,I>(MaxIter, x,y); for(unsigned n=0; n< 4; ++n) out[n] = tmp[n];
	x += 4; y += 4; out += 4; N -= 4;
	}
	while(N >= 1)
	{
	auto tmp = Iterate< 1, S,P,L,I>(MaxIter, x,y); for(unsigned n=0; n< 1; ++n) out[n] = tmp[n];
	x += 1; y += 1; out += 1; N -= 1;
	}
	}