Skip to content

Instantly share code, notes, and snippets.

@Alexis-D
Created March 24, 2012 12:51
Show Gist options
  • Save Alexis-D/2182352 to your computer and use it in GitHub Desktop.
Save Alexis-D/2182352 to your computer and use it in GitHub Desktop.
Mandelbrot Set with OpenMP and SSE

Mandelbrot Set

Parallel generation of the Mandelbrot set with OpenMP, SSE and various small/ugly/hacky optimizations.

Perfs?

It's pretty fast on a 64 cores machines ~1sec for 40 700x700 frames. It could be faster with mathematical optimizations but the goal here was just to get "code" optimizations...

Compile

I'm not the author of Screen.{cpp,h} and there's no LICENSE file so... See here for getting these file (you'll need the SDL too), then just make && ./mandelbrot.

#include <cmath>
#include <iostream>
#include <omp.h>
#include <xmmintrin.h>
#define TIMING
#ifdef TIMING
#include <sys/time.h>
#endif
#include "Screen.h"
//Max Iterations before we assume the posize_t will not escape
const size_t MAX_ITS = 1000;
// horizontal resolution
const size_t HXRES = 700;
// vertical resolution
const size_t HYRES = 700;
// max depth of zoom
const size_t MAX_DEPTH = 40;
// zoom between each frame
const float ZOOM_FACTOR = 1.02;
// Change these to zoom size_to different parts of the image
// Centre posize_t we'll zoom on - Real component
const float PX = -0.702295281061;
// Imaginary component
const float PY = +0.350220783400;
// thanks to Vim macros for this long and awful list
unsigned char pal[] = {
0, 0, 0, 0, 0, 1, 1, 0, 2, 1, 0, 3, 2, 0, 4, 2, 0, 5, 3, 0, 6, 3, 0, 7, 4, 0, 8, 4, 0, 9, 5, 0, 10, 5, 0, 11, 6, 0, 12, 6, 0, 13, 7, 0, 14, 7, 0, 15, 8, 0, 16, 8, 0, 17, 9, 0, 18, 9, 0, 19, 10, 0, 20, 10, 0, 21, 11, 0, 22, 11, 0, 23, 12, 0, 24, 12, 0, 25, 13, 0, 26, 13, 0, 27, 14, 0, 28, 14, 0, 29, 15, 0, 30, 15, 0, 31, 16, 0, 32, 16, 0, 33, 17, 0, 34, 17, 0, 35, 18, 0, 36, 18, 0, 37, 19, 0, 38, 19, 0, 39, 20, 0, 40, 20, 0, 41, 21, 0, 42, 21, 0, 43, 22, 0, 44, 22, 0, 45, 23, 0, 46, 23, 0, 47, 24, 0, 48, 24, 0, 49, 25, 0, 50, 25, 0, 51, 26, 0, 52, 26, 0, 53, 27, 0, 54, 27, 0, 55, 28, 0, 56, 28, 0, 57, 29, 0, 58, 29, 0, 59, 30, 0, 60, 30, 0, 61, 31, 0, 62, 31, 0, 63, 32, 0, 64, 32, 0, 65, 33, 0, 66, 33, 0, 67, 34, 0, 68, 34, 0, 69, 35, 0, 70, 35, 0, 71, 36, 0, 72, 36, 0, 73, 37, 0, 74, 37, 0, 75, 38, 0, 76, 38, 0, 77, 39, 0, 78, 39, 0, 79, 40, 0, 80, 40, 0, 81, 41, 0, 82, 41, 0, 83, 42, 0, 84, 42, 0, 85, 43, 0, 86, 43, 0, 87, 44, 0, 88, 44, 0, 89, 45, 0, 90, 45, 0, 91, 46, 0, 92, 46, 0, 93, 47, 0, 94, 47, 0, 95, 48, 0, 96, 48, 0, 97, 49, 0, 98, 49, 0, 99, 50, 0, 100, 50, 0, 101, 51, 0, 102, 51, 0, 103, 52, 0, 104, 52, 0, 105, 53, 0, 106, 53, 0, 107, 54, 0, 108, 54, 0, 109, 55, 0, 110, 55, 0, 111, 56, 0, 112, 56, 0, 113, 57, 0, 114, 57, 0, 115, 58, 0, 116, 58, 0, 117, 59, 0, 118, 59, 0, 119, 60, 0, 120, 60, 0, 121, 61, 0, 122, 61, 0, 123, 62, 0, 124, 62, 0, 125, 63, 0, 126, 63, 0, 127, 64, 0, 128, 64, 0, 129, 65, 0, 130, 65, 0, 131, 66, 0, 132, 66, 0, 133, 67, 0, 134, 67, 0, 135, 68, 0, 136, 68, 0, 137, 69, 0, 138, 69, 0, 139, 70, 0, 140, 70, 0, 141, 71, 0, 142, 71, 0, 143, 72, 0, 144, 72, 0, 145, 73, 0, 146, 73, 0, 147, 74, 0, 148, 74, 0, 149, 75, 0, 150, 75, 0, 151, 76, 0, 152, 76, 0, 153, 77, 0, 154, 77, 0, 155, 78, 0, 156, 78, 0, 157, 79, 0, 158, 79, 0, 159, 80, 0, 160, 80, 0, 161, 81, 0, 162, 81, 0, 163, 82, 0, 164, 82, 0, 165, 83, 0, 166, 83, 0, 167, 84, 0, 168, 84, 0, 169, 85, 0, 170, 85, 0, 171, 86, 0, 172, 86, 0, 173, 87, 0, 174, 87, 0, 175, 88, 0, 176, 88, 0, 177, 89, 0, 178, 89, 0, 179, 90, 0, 180, 90, 0, 181, 91, 0, 182, 91, 0, 183, 92, 0, 184, 92, 0, 185, 93, 0, 186, 93, 0, 187, 94, 0, 188, 94, 0, 189, 95, 0, 190, 95, 0, 191, 96, 0, 192, 96, 0, 193, 97, 0, 194, 97, 0, 195, 98, 0, 196, 98, 0, 197, 99, 0, 198, 99, 0, 199, 100, 0, 200, 100, 0, 201, 101, 0, 202, 101, 0, 203, 102, 0, 204, 102, 0, 205, 103, 0, 206, 103, 0, 207, 104, 0, 208, 104, 0, 209, 105, 0, 210, 105, 0, 211, 106, 0, 212, 106, 0, 213, 107, 0, 214, 107, 0, 215, 108, 0, 216, 108, 0, 217, 109, 0, 218, 109, 0, 219, 110, 0, 220, 110, 0, 221, 111, 0, 222, 111, 0, 223, 112, 0, 224, 112, 0, 225, 113, 0, 226, 113, 0, 227, 114, 0, 228, 114, 0, 229, 115, 0, 230, 115, 0, 231, 116, 0, 232, 116, 0, 233, 117, 0, 234, 117, 0, 235, 118, 0, 236, 118, 0, 237, 119, 0, 238, 119, 0, 239, 120, 0, 240, 120, 0, 241, 121, 0, 242, 121, 0, 243, 122, 0, 244, 122, 0, 245, 123, 0, 246, 123, 0, 247, 124, 0, 248, 124, 0, 249, 125, 0, 250, 125, 0, 251, 126, 0, 252, 126, 0, 253, 127, 0, 254, 127, 0, 255
};
const size_t PAL_SIZE = 256;
// if defined disable screen.flip()
#define BENCH
// number of threads to use
#define NTHREADS 64
// lots of macros to make code easier to read
// (it even give at LISP-ish feeling with a lots
// of parens and prefix notation :D)
#define ADD(a, b) (_mm_add_ps((a), (b)))
#define AND(a, b) (_mm_and_ps((a), (b)))
#define ANDNOT(a, b) (_mm_andnot_ps((a), (b)))
#define LT(a, b) (_mm_cmplt_ps((a), (b)))
#define MUL(a, b) (_mm_mul_ps((a), (b)))
#define MASK(a) (_mm_movemask_ps((a)))
#define OR(a, b) (_mm_or_ps((a), (b)))
#define SET1(a) (_mm_set1_ps((a)))
#define SETR(a, b, c, d) (_mm_setr_ps((a), (b), (c), (d)))
#define STORE(a, b) (_mm_store_ps((a), (b)))
#define SUB(a, b) (_mm_sub_ps((a), (b)))
#define ZERO (_mm_setzero_ps())
// divisions are expensive so we compute only one time
// and the use multiplications
const __m128 INVERT_HXRES = SET1(1. / HXRES);
const float INVERT_HYRES = 1. / HYRES;
inline __m128 member(__m128 cx, __m128 cy) {
const __m128 two = SET1(2.),
four = SET1(4.);
// more or less the same than the bool member(float, float, int&)
__m128 x = cx,
y = cy,
iterations = SET1(1.),
x2,
y2,
lt;
// this is a bit "ugly", but it avoid us to recompute
// x**2, y**2 and lt so it's needed
for(register size_t it = 0;
MASK(lt = LT(
ADD(
(x2 = MUL(x, x)),
(y2 = MUL(y, y))
),
four
)
) && it < MAX_ITS;) {
y = ADD(MUL(two, MUL(x, y)), cy);
x = ADD(SUB(x2, y2), cx);
// if we're < 4 we set the number iteration to it+1
// otherwise we keep the old count
iterations = OR(
AND(lt, SET1(++it)),
ANDNOT(lt, iterations)
);
}
return iterations;
}
int main(int argc, char *argv[]) {
size_t hx, hy;
float m = 1.;
// we declare vars here to avoid redeclaring them in loops
float quarter_m, four_over_m;
float yoffset, unpack[4];
__m128 cx, cy, xoffset, mm128_four_over_m;
size_t j, hxc, b;
#ifdef TIMING
struct timeval start_time;
struct timeval stop_time;
long long total_time = 0;
#endif
// Create a screen to render to
Screen screen = Screen(HXRES, HYRES);
for(register size_t depth = 0; depth < MAX_DEPTH;
++depth, m *= ZOOM_FACTOR) {
#ifdef TIMING
/* record starting time */
gettimeofday(&start_time, NULL);
#endif
// compute the most thing in the outer loops
four_over_m = 4. / m;
mm128_four_over_m = SET1(four_over_m);
quarter_m = m / 4.;
xoffset = SET1(-0.5 + (PX * quarter_m));
yoffset = -0.5 + (PY * quarter_m);
// static schedule allows us to split "fairly" the rows between threads
#pragma omp parallel for private(hx, cx, cy, unpack, j, hxc, b)\
num_threads(NTHREADS) schedule(static, 1)
for(hy = 0; hy < HYRES; ++hy) {
cy = SET1((INVERT_HYRES * hy + yoffset) * four_over_m);
for(hx = 0; hx < HXRES; hx += 4) {
cx = MUL(
ADD(
MUL(
INVERT_HXRES,
SETR(hx, hx + 1, hx + 2, hx + 3)
),
xoffset
),
mm128_four_over_m
);
STORE(unpack, member(cx, cy));
for(j = 0, hxc = hx; j < 4 && hxc < HXRES; ++hxc, ++j) {
if(unpack[j] != MAX_ITS) {
b = 3 * ((size_t)unpack[j] % PAL_SIZE);
screen.putpixel(hx + j,
hy,
pal[b],
pal[b + 1],
pal[b + 2]);
}
else {
screen.putpixel(hx + j, hy, 0, 0, 0);
}
}
}
}
#ifdef TIMING
gettimeofday(&stop_time, NULL);
total_time += (stop_time.tv_sec - start_time.tv_sec) * 1000000L
+ (stop_time.tv_usec - start_time.tv_usec);
// Show the rendered image on the screen
#endif
screen.flip();
// endl flush the buffer: this is slooow
std::cout << "Render done " << depth << " " << m << "\n";
}
#ifdef TIMING
std::cout << "Total executing time " << total_time << " microseconds\n";
#endif
sleep(10);
std::cout << "Clean Exit"<< std::endl;
return 0;
}
CC=g++
CFLAGS=-c -pedantic -Wall -std=c++0x -O3 -fopenmp
LDFLAGS=-fopenmp
SOURCES=main.cpp Screen.cpp
OBJECTS=$(SOURCES:.cpp=.o)
EXECUTABLE=mandelbrot
LIBS=-lSDL
all: $(SOURCES) $(EXECUTABLE)
$(EXECUTABLE): $(OBJECTS)
$(CC) $(LDFLAGS) $(OBJECTS) $(LIBS) -o $@
.cpp.o:
$(CC) $(CFLAGS) $< -o $@
clean:
rm -f *.o
mrproper: clean
$(EXECUTABLE)
run: all
./$(EXECUTABLE)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment