Alexis-D/Makefile

## README.md

      
    Raw
  

              README.md
            
          
    Mandelbrot Set

Parallel generation of the Mandelbrot set with OpenMP, SSE and various small/ugly/hacky optimizations.
Perfs?

It's pretty fast on a 64 cores machines ~1sec for 40 700x700 frames. It could be faster with mathematical optimizations but the goal here was just to get "code" optimizations...
Compile

I'm not the author of Screen.{cpp,h} and there's no LICENSE file so... See here for getting these file (you'll need the SDL too), then just make && ./mandelbrot.

  
## main.cpp
#include <cmath>
#include <iostream>
#include <omp.h>
#include <xmmintrin.h>

#define TIMING

#ifdef TIMING
#include <sys/time.h>
#endif

#include "Screen.h"

//Max Iterations before we assume the posize_t will not escape
const size_t MAX_ITS = 1000;
// horizontal resolution
const size_t HXRES = 700;
// vertical resolution
const size_t HYRES = 700;
// max depth of zoom
const size_t MAX_DEPTH = 40;
// zoom between each frame
const float ZOOM_FACTOR = 1.02;

// Change these to zoom size_to different parts of the image
// Centre posize_t we'll zoom on - Real component
const float PX = -0.702295281061;
// Imaginary component
const float PY = +0.350220783400;

// thanks to Vim macros for this long and awful list
unsigned char pal[] = {
    0, 0, 0, 0, 0, 1, 1, 0, 2, 1, 0, 3, 2, 0, 4, 2, 0, 5, 3, 0, 6, 3, 0, 7, 4, 0, 8, 4, 0, 9, 5, 0, 10, 5, 0, 11, 6, 0, 12, 6, 0, 13, 7, 0, 14, 7, 0, 15, 8, 0, 16, 8, 0, 17, 9, 0, 18, 9, 0, 19, 10, 0, 20, 10, 0, 21, 11, 0, 22, 11, 0, 23, 12, 0, 24, 12, 0, 25, 13, 0, 26, 13, 0, 27, 14, 0, 28, 14, 0, 29, 15, 0, 30, 15, 0, 31, 16, 0, 32, 16, 0, 33, 17, 0, 34, 17, 0, 35, 18, 0, 36, 18, 0, 37, 19, 0, 38, 19, 0, 39, 20, 0, 40, 20, 0, 41, 21, 0, 42, 21, 0, 43, 22, 0, 44, 22, 0, 45, 23, 0, 46, 23, 0, 47, 24, 0, 48, 24, 0, 49, 25, 0, 50, 25, 0, 51, 26, 0, 52, 26, 0, 53, 27, 0, 54, 27, 0, 55, 28, 0, 56, 28, 0, 57, 29, 0, 58, 29, 0, 59, 30, 0, 60, 30, 0, 61, 31, 0, 62, 31, 0, 63, 32, 0, 64, 32, 0, 65, 33, 0, 66, 33, 0, 67, 34, 0, 68, 34, 0, 69, 35, 0, 70, 35, 0, 71, 36, 0, 72, 36, 0, 73, 37, 0, 74, 37, 0, 75, 38, 0, 76, 38, 0, 77, 39, 0, 78, 39, 0, 79, 40, 0, 80, 40, 0, 81, 41, 0, 82, 41, 0, 83, 42, 0, 84, 42, 0, 85, 43, 0, 86, 43, 0, 87, 44, 0, 88, 44, 0, 89, 45, 0, 90, 45, 0, 91, 46, 0, 92, 46, 0, 93, 47, 0, 94, 47, 0, 95, 48, 0, 96, 48, 0, 97, 49, 0, 98, 49, 0, 99, 50, 0, 100, 50, 0, 101, 51, 0, 102, 51, 0, 103, 52, 0, 104, 52, 0, 105, 53, 0, 106, 53, 0, 107, 54, 0, 108, 54, 0, 109, 55, 0, 110, 55, 0, 111, 56, 0, 112, 56, 0, 113, 57, 0, 114, 57, 0, 115, 58, 0, 116, 58, 0, 117, 59, 0, 118, 59, 0, 119, 60, 0, 120, 60, 0, 121, 61, 0, 122, 61, 0, 123, 62, 0, 124, 62, 0, 125, 63, 0, 126, 63, 0, 127, 64, 0, 128, 64, 0, 129, 65, 0, 130, 65, 0, 131, 66, 0, 132, 66, 0, 133, 67, 0, 134, 67, 0, 135, 68, 0, 136, 68, 0, 137, 69, 0, 138, 69, 0, 139, 70, 0, 140, 70, 0, 141, 71, 0, 142, 71, 0, 143, 72, 0, 144, 72, 0, 145, 73, 0, 146, 73, 0, 147, 74, 0, 148, 74, 0, 149, 75, 0, 150, 75, 0, 151, 76, 0, 152, 76, 0, 153, 77, 0, 154, 77, 0, 155, 78, 0, 156, 78, 0, 157, 79, 0, 158, 79, 0, 159, 80, 0, 160, 80, 0, 161, 81, 0, 162, 81, 0, 163, 82, 0, 164, 82, 0, 165, 83, 0, 166, 83, 0, 167, 84, 0, 168, 84, 0, 169, 85, 0, 170, 85, 0, 171, 86, 0, 172, 86, 0, 173, 87, 0, 174, 87, 0, 175, 88, 0, 176, 88, 0, 177, 89, 0, 178, 89, 0, 179, 90, 0, 180, 90, 0, 181, 91, 0, 182, 91, 0, 183, 92, 0, 184, 92, 0, 185, 93, 0, 186, 93, 0, 187, 94, 0, 188, 94, 0, 189, 95, 0, 190, 95, 0, 191, 96, 0, 192, 96, 0, 193, 97, 0, 194, 97, 0, 195, 98, 0, 196, 98, 0, 197, 99, 0, 198, 99, 0, 199, 100, 0, 200, 100, 0, 201, 101, 0, 202, 101, 0, 203, 102, 0, 204, 102, 0, 205, 103, 0, 206, 103, 0, 207, 104, 0, 208, 104, 0, 209, 105, 0, 210, 105, 0, 211, 106, 0, 212, 106, 0, 213, 107, 0, 214, 107, 0, 215, 108, 0, 216, 108, 0, 217, 109, 0, 218, 109, 0, 219, 110, 0, 220, 110, 0, 221, 111, 0, 222, 111, 0, 223, 112, 0, 224, 112, 0, 225, 113, 0, 226, 113, 0, 227, 114, 0, 228, 114, 0, 229, 115, 0, 230, 115, 0, 231, 116, 0, 232, 116, 0, 233, 117, 0, 234, 117, 0, 235, 118, 0, 236, 118, 0, 237, 119, 0, 238, 119, 0, 239, 120, 0, 240, 120, 0, 241, 121, 0, 242, 121, 0, 243, 122, 0, 244, 122, 0, 245, 123, 0, 246, 123, 0, 247, 124, 0, 248, 124, 0, 249, 125, 0, 250, 125, 0, 251, 126, 0, 252, 126, 0, 253, 127, 0, 254, 127, 0, 255
};
const size_t PAL_SIZE = 256;

// if defined disable screen.flip()
#define BENCH

// number of threads to use
#define NTHREADS 64

// lots of macros to make code easier to read
// (it even give at LISP-ish feeling with a lots
// of parens and prefix notation :D)
#define ADD(a, b) (_mm_add_ps((a), (b)))
#define AND(a, b) (_mm_and_ps((a), (b)))
#define ANDNOT(a, b) (_mm_andnot_ps((a), (b)))
#define LT(a, b) (_mm_cmplt_ps((a), (b)))
#define MUL(a, b) (_mm_mul_ps((a), (b)))
#define MASK(a) (_mm_movemask_ps((a)))
#define OR(a, b) (_mm_or_ps((a), (b)))
#define SET1(a) (_mm_set1_ps((a)))
#define SETR(a, b, c, d) (_mm_setr_ps((a), (b), (c), (d)))
#define STORE(a, b) (_mm_store_ps((a), (b)))
#define SUB(a, b) (_mm_sub_ps((a), (b)))
#define ZERO (_mm_setzero_ps())

// divisions are expensive so we compute only one time
// and the use multiplications
const __m128 INVERT_HXRES = SET1(1. / HXRES);
const float INVERT_HYRES = 1. / HYRES;

inline __m128 member(__m128 cx, __m128 cy) {
    const __m128 two = SET1(2.),
          four = SET1(4.);

    // more or less the same than the bool member(float, float, int&)
    __m128 x = cx,
           y = cy,
           iterations = SET1(1.),
           x2,
           y2,
           lt;

    // this is a bit "ugly", but it avoid us to recompute
    // x**2, y**2 and lt so it's needed
    for(register size_t it = 0;
            MASK(lt = LT(
                    ADD(
                        (x2 = MUL(x, x)),
                        (y2 = MUL(y, y))
                       ),
                    four
                    )
                ) && it < MAX_ITS;) {
        y = ADD(MUL(two, MUL(x, y)), cy);
        x = ADD(SUB(x2, y2), cx);

        // if we're < 4 we set the number iteration to it+1
        // otherwise we keep the old count
        iterations = OR(
                AND(lt, SET1(++it)),
                ANDNOT(lt, iterations)
                );
    }

    return iterations;
}

int main(int argc, char *argv[]) {
    size_t hx, hy;
    float m = 1.;

    // we declare vars here to avoid redeclaring them in loops
    float quarter_m, four_over_m;
    float yoffset, unpack[4];
    __m128 cx, cy, xoffset, mm128_four_over_m;

    size_t j, hxc, b;

#ifdef TIMING
    struct timeval start_time;
    struct timeval stop_time;
    long long total_time = 0;
#endif

    // Create a screen to render to
    Screen screen = Screen(HXRES, HYRES);

    for(register size_t depth = 0; depth < MAX_DEPTH;
            ++depth, m *= ZOOM_FACTOR) {
#ifdef TIMING
        /* record starting time */
        gettimeofday(&start_time, NULL);
#endif

        // compute the most thing in the outer loops
        four_over_m = 4. / m;
        mm128_four_over_m = SET1(four_over_m);
        quarter_m = m / 4.;
        xoffset = SET1(-0.5 + (PX * quarter_m));
        yoffset = -0.5 + (PY * quarter_m);

        // static schedule allows us to split "fairly" the rows between threads
#pragma omp parallel for private(hx, cx, cy, unpack, j, hxc, b)\
        num_threads(NTHREADS) schedule(static, 1)
        for(hy = 0; hy < HYRES; ++hy) {
            cy = SET1((INVERT_HYRES * hy + yoffset) * four_over_m);

            for(hx = 0; hx < HXRES; hx += 4) {
                cx = MUL(
                        ADD(
                            MUL(
                                INVERT_HXRES,
                                SETR(hx, hx + 1, hx + 2, hx + 3)
                               ),
                            xoffset
                           ),
                        mm128_four_over_m
                        );

                STORE(unpack, member(cx, cy));

                for(j = 0, hxc = hx; j < 4 && hxc < HXRES; ++hxc, ++j) {
                    if(unpack[j] != MAX_ITS) {
                        b = 3 * ((size_t)unpack[j] % PAL_SIZE);

                        screen.putpixel(hx + j,
                                hy,
                                pal[b],
                                pal[b + 1],
                                pal[b + 2]);
                    }

                    else {
                        screen.putpixel(hx + j, hy, 0, 0, 0);
                    }
                }
            }
        }

#ifdef TIMING
        gettimeofday(&stop_time, NULL);
        total_time += (stop_time.tv_sec - start_time.tv_sec) * 1000000L
            + (stop_time.tv_usec - start_time.tv_usec);
        // Show the rendered image on the screen
#endif
        screen.flip();
        // endl flush the buffer: this is slooow
        std::cout << "Render done " << depth << " " << m << "\n";
    }

#ifdef TIMING
    std::cout << "Total executing time " << total_time << " microseconds\n";
#endif
    sleep(10);
    std::cout << "Clean Exit"<< std::endl;

    return 0;
}

## Makefile
CC=g++
CFLAGS=-c -pedantic -Wall -std=c++0x -O3 -fopenmp
LDFLAGS=-fopenmp
SOURCES=main.cpp Screen.cpp
OBJECTS=$(SOURCES:.cpp=.o)
EXECUTABLE=mandelbrot
LIBS=-lSDL

all: $(SOURCES) $(EXECUTABLE)

$(EXECUTABLE): $(OBJECTS)
	$(CC) $(LDFLAGS) $(OBJECTS) $(LIBS) -o $@

.cpp.o:
	$(CC) $(CFLAGS) $< -o $@

clean:
	rm -f *.o

mrproper: clean
	$(EXECUTABLE)

run: all
	./$(EXECUTABLE)
	#include <cmath>
	#include <iostream>
	#include <omp.h>
	#include <xmmintrin.h>

	#define TIMING

	#ifdef TIMING
	#include <sys/time.h>
	#endif

	#include "Screen.h"

	//Max Iterations before we assume the posize_t will not escape
	const size_t MAX_ITS = 1000;
	// horizontal resolution
	const size_t HXRES = 700;
	// vertical resolution
	const size_t HYRES = 700;
	// max depth of zoom
	const size_t MAX_DEPTH = 40;
	// zoom between each frame
	const float ZOOM_FACTOR = 1.02;

	// Change these to zoom size_to different parts of the image
	// Centre posize_t we'll zoom on - Real component
	const float PX = -0.702295281061;
	// Imaginary component
	const float PY = +0.350220783400;

	// thanks to Vim macros for this long and awful list
	unsigned char pal[] = {
	0, 0, 0, 0, 0, 1, 1, 0, 2, 1, 0, 3, 2, 0, 4, 2, 0, 5, 3, 0, 6, 3, 0, 7, 4, 0, 8, 4, 0, 9, 5, 0, 10, 5, 0, 11, 6, 0, 12, 6, 0, 13, 7, 0, 14, 7, 0, 15, 8, 0, 16, 8, 0, 17, 9, 0, 18, 9, 0, 19, 10, 0, 20, 10, 0, 21, 11, 0, 22, 11, 0, 23, 12, 0, 24, 12, 0, 25, 13, 0, 26, 13, 0, 27, 14, 0, 28, 14, 0, 29, 15, 0, 30, 15, 0, 31, 16, 0, 32, 16, 0, 33, 17, 0, 34, 17, 0, 35, 18, 0, 36, 18, 0, 37, 19, 0, 38, 19, 0, 39, 20, 0, 40, 20, 0, 41, 21, 0, 42, 21, 0, 43, 22, 0, 44, 22, 0, 45, 23, 0, 46, 23, 0, 47, 24, 0, 48, 24, 0, 49, 25, 0, 50, 25, 0, 51, 26, 0, 52, 26, 0, 53, 27, 0, 54, 27, 0, 55, 28, 0, 56, 28, 0, 57, 29, 0, 58, 29, 0, 59, 30, 0, 60, 30, 0, 61, 31, 0, 62, 31, 0, 63, 32, 0, 64, 32, 0, 65, 33, 0, 66, 33, 0, 67, 34, 0, 68, 34, 0, 69, 35, 0, 70, 35, 0, 71, 36, 0, 72, 36, 0, 73, 37, 0, 74, 37, 0, 75, 38, 0, 76, 38, 0, 77, 39, 0, 78, 39, 0, 79, 40, 0, 80, 40, 0, 81, 41, 0, 82, 41, 0, 83, 42, 0, 84, 42, 0, 85, 43, 0, 86, 43, 0, 87, 44, 0, 88, 44, 0, 89, 45, 0, 90, 45, 0, 91, 46, 0, 92, 46, 0, 93, 47, 0, 94, 47, 0, 95, 48, 0, 96, 48, 0, 97, 49, 0, 98, 49, 0, 99, 50, 0, 100, 50, 0, 101, 51, 0, 102, 51, 0, 103, 52, 0, 104, 52, 0, 105, 53, 0, 106, 53, 0, 107, 54, 0, 108, 54, 0, 109, 55, 0, 110, 55, 0, 111, 56, 0, 112, 56, 0, 113, 57, 0, 114, 57, 0, 115, 58, 0, 116, 58, 0, 117, 59, 0, 118, 59, 0, 119, 60, 0, 120, 60, 0, 121, 61, 0, 122, 61, 0, 123, 62, 0, 124, 62, 0, 125, 63, 0, 126, 63, 0, 127, 64, 0, 128, 64, 0, 129, 65, 0, 130, 65, 0, 131, 66, 0, 132, 66, 0, 133, 67, 0, 134, 67, 0, 135, 68, 0, 136, 68, 0, 137, 69, 0, 138, 69, 0, 139, 70, 0, 140, 70, 0, 141, 71, 0, 142, 71, 0, 143, 72, 0, 144, 72, 0, 145, 73, 0, 146, 73, 0, 147, 74, 0, 148, 74, 0, 149, 75, 0, 150, 75, 0, 151, 76, 0, 152, 76, 0, 153, 77, 0, 154, 77, 0, 155, 78, 0, 156, 78, 0, 157, 79, 0, 158, 79, 0, 159, 80, 0, 160, 80, 0, 161, 81, 0, 162, 81, 0, 163, 82, 0, 164, 82, 0, 165, 83, 0, 166, 83, 0, 167, 84, 0, 168, 84, 0, 169, 85, 0, 170, 85, 0, 171, 86, 0, 172, 86, 0, 173, 87, 0, 174, 87, 0, 175, 88, 0, 176, 88, 0, 177, 89, 0, 178, 89, 0, 179, 90, 0, 180, 90, 0, 181, 91, 0, 182, 91, 0, 183, 92, 0, 184, 92, 0, 185, 93, 0, 186, 93, 0, 187, 94, 0, 188, 94, 0, 189, 95, 0, 190, 95, 0, 191, 96, 0, 192, 96, 0, 193, 97, 0, 194, 97, 0, 195, 98, 0, 196, 98, 0, 197, 99, 0, 198, 99, 0, 199, 100, 0, 200, 100, 0, 201, 101, 0, 202, 101, 0, 203, 102, 0, 204, 102, 0, 205, 103, 0, 206, 103, 0, 207, 104, 0, 208, 104, 0, 209, 105, 0, 210, 105, 0, 211, 106, 0, 212, 106, 0, 213, 107, 0, 214, 107, 0, 215, 108, 0, 216, 108, 0, 217, 109, 0, 218, 109, 0, 219, 110, 0, 220, 110, 0, 221, 111, 0, 222, 111, 0, 223, 112, 0, 224, 112, 0, 225, 113, 0, 226, 113, 0, 227, 114, 0, 228, 114, 0, 229, 115, 0, 230, 115, 0, 231, 116, 0, 232, 116, 0, 233, 117, 0, 234, 117, 0, 235, 118, 0, 236, 118, 0, 237, 119, 0, 238, 119, 0, 239, 120, 0, 240, 120, 0, 241, 121, 0, 242, 121, 0, 243, 122, 0, 244, 122, 0, 245, 123, 0, 246, 123, 0, 247, 124, 0, 248, 124, 0, 249, 125, 0, 250, 125, 0, 251, 126, 0, 252, 126, 0, 253, 127, 0, 254, 127, 0, 255
	};
	const size_t PAL_SIZE = 256;

	// if defined disable screen.flip()
	#define BENCH

	// number of threads to use
	#define NTHREADS 64

	// lots of macros to make code easier to read
	// (it even give at LISP-ish feeling with a lots
	// of parens and prefix notation :D)
	#define ADD(a, b) (_mm_add_ps((a), (b)))
	#define AND(a, b) (_mm_and_ps((a), (b)))
	#define ANDNOT(a, b) (_mm_andnot_ps((a), (b)))
	#define LT(a, b) (_mm_cmplt_ps((a), (b)))
	#define MUL(a, b) (_mm_mul_ps((a), (b)))
	#define MASK(a) (_mm_movemask_ps((a)))
	#define OR(a, b) (_mm_or_ps((a), (b)))
	#define SET1(a) (_mm_set1_ps((a)))
	#define SETR(a, b, c, d) (_mm_setr_ps((a), (b), (c), (d)))
	#define STORE(a, b) (_mm_store_ps((a), (b)))
	#define SUB(a, b) (_mm_sub_ps((a), (b)))
	#define ZERO (_mm_setzero_ps())

	// divisions are expensive so we compute only one time
	// and the use multiplications
	const __m128 INVERT_HXRES = SET1(1. / HXRES);
	const float INVERT_HYRES = 1. / HYRES;

	inline __m128 member(__m128 cx, __m128 cy) {
	const __m128 two = SET1(2.),
	four = SET1(4.);

	// more or less the same than the bool member(float, float, int&)
	__m128 x = cx,
	y = cy,
	iterations = SET1(1.),
	x2,
	y2,
	lt;

	// this is a bit "ugly", but it avoid us to recompute
	// x2, y2 and lt so it's needed
	for(register size_t it = 0;
	MASK(lt = LT(
	ADD(
	(x2 = MUL(x, x)),
	(y2 = MUL(y, y))
	),
	four
	)
	) && it < MAX_ITS;) {
	y = ADD(MUL(two, MUL(x, y)), cy);
	x = ADD(SUB(x2, y2), cx);

	// if we're < 4 we set the number iteration to it+1
	// otherwise we keep the old count
	iterations = OR(
	AND(lt, SET1(++it)),
	ANDNOT(lt, iterations)
	);
	}

	return iterations;
	}

	int main(int argc, char *argv[]) {
	size_t hx, hy;
	float m = 1.;

	// we declare vars here to avoid redeclaring them in loops
	float quarter_m, four_over_m;
	float yoffset, unpack[4];
	__m128 cx, cy, xoffset, mm128_four_over_m;

	size_t j, hxc, b;

	#ifdef TIMING
	struct timeval start_time;
	struct timeval stop_time;
	long long total_time = 0;
	#endif

	// Create a screen to render to
	Screen screen = Screen(HXRES, HYRES);

	for(register size_t depth = 0; depth < MAX_DEPTH;
	++depth, m *= ZOOM_FACTOR) {
	#ifdef TIMING
	/* record starting time */
	gettimeofday(&start_time, NULL);
	#endif

	// compute the most thing in the outer loops
	four_over_m = 4. / m;
	mm128_four_over_m = SET1(four_over_m);
	quarter_m = m / 4.;
	xoffset = SET1(-0.5 + (PX * quarter_m));
	yoffset = -0.5 + (PY * quarter_m);

	// static schedule allows us to split "fairly" the rows between threads
	#pragma omp parallel for private(hx, cx, cy, unpack, j, hxc, b)\
	num_threads(NTHREADS) schedule(static, 1)
	for(hy = 0; hy < HYRES; ++hy) {
	cy = SET1((INVERT_HYRES * hy + yoffset) * four_over_m);

	for(hx = 0; hx < HXRES; hx += 4) {
	cx = MUL(
	ADD(
	MUL(
	INVERT_HXRES,
	SETR(hx, hx + 1, hx + 2, hx + 3)
	),
	xoffset
	),
	mm128_four_over_m
	);

	STORE(unpack, member(cx, cy));

	for(j = 0, hxc = hx; j < 4 && hxc < HXRES; ++hxc, ++j) {
	if(unpack[j] != MAX_ITS) {
	b = 3 * ((size_t)unpack[j] % PAL_SIZE);

	screen.putpixel(hx + j,
	hy,
	pal[b],
	pal[b + 1],
	pal[b + 2]);
	}

	else {
	screen.putpixel(hx + j, hy, 0, 0, 0);
	}
	}
	}
	}

	#ifdef TIMING
	gettimeofday(&stop_time, NULL);
	total_time += (stop_time.tv_sec - start_time.tv_sec) * 1000000L
	+ (stop_time.tv_usec - start_time.tv_usec);
	// Show the rendered image on the screen
	#endif
	screen.flip();
	// endl flush the buffer: this is slooow
	std::cout << "Render done " << depth << " " << m << "\n";
	}

	#ifdef TIMING
	std::cout << "Total executing time " << total_time << " microseconds\n";
	#endif
	sleep(10);
	std::cout << "Clean Exit"<< std::endl;

	return 0;
	}
	CC=g++
	CFLAGS=-c -pedantic -Wall -std=c++0x -O3 -fopenmp
	LDFLAGS=-fopenmp
	SOURCES=main.cpp Screen.cpp
	OBJECTS=$(SOURCES:.cpp=.o)
	EXECUTABLE=mandelbrot
	LIBS=-lSDL

	all: $(SOURCES) $(EXECUTABLE)

	$(EXECUTABLE): $(OBJECTS)
	$(CC) $(LDFLAGS) $(OBJECTS) $(LIBS) -o $@

	.cpp.o:
	$(CC) $(CFLAGS) $< -o $@

	clean:
	rm -f *.o

	mrproper: clean
	$(EXECUTABLE)

	run: all
	./$(EXECUTABLE)