Volcanoscar/stackblur.cpp

## stackblur.cpp
/*
 * "stackblur"
 * originally, Mario Klingemann, mario@quasimondo.com
 * this implementation, Benjamin Yates (benjamin@rqdq.com)
 * http://incubator.quasimondo.com/processing/stackblur.pde
 *
 * this blur routine seems to be quite popular.
 *
 * unfortunately, mario didn't comment anything.
 * but, it's easy to see it's just a flavor of a two-pass
 * sliding box kernel.
 *
 * this version is vectorized for float32 r/g/b/a using sse
 *
 * vec4() is just a class wrapping _mm_zzz_ps() family of SSE intrinsics
 * ( if you need one, start here:
 *  - http://fastcpp.blogspot.com/2011/12/simple-vector3-class-with-sse-support.html )
 *
 * radius is in (whole) pixels
 *
 */

void stackblur( vec4* colorbuffer, const int w, const int h, const int radius )
{
	if ( radius < 1 ) return;	// nothing to do

	vec4* const pix = reinterpret_cast<vec4*>(image.c);

	// some values for convenience
	const int w = image.width;
	const int h = image.height;
	const int wm = w-1;
	const int hm = h-1;
	const int wh = w*h;
	const int r1 = radius + 1;

	// number of divisions in the kernel
	// D(-r), D(-r+1), ... D(0), ... D(r-1), D(r)
	const int div = (radius * 2) + 1;

	// temporary output space for first pass.
	vec4* tsurface = new vec4[wh];

	// lookup table for clamping pixel offsets
	// as the kernel passes the right (or lower) edge
	// of the input data
	int* const vmin = new int[ max(w,h) ];

	// calculate divisor for pulling an output from the kernel
	//   the kernel is pyramid shaped.
	//   to understand this divisor, imagine a cross-section
	//   of the center of the pyramid.
	//   (or ask mario to document it better)
	//
	// I store the reciprocal in the vector to
	// mul_ps later instead of div_ps
	int divsumi = ( div + 1 ) >> 1;
	divsumi *= divsumi;
	const vec4 divsum( 1.0f / (float)divsumi );

	// kernel pixel data "stack"
	// which works more like a ring-buffer,
	// where the pointer is advanced each iteration, modulo #divisions
	vec4* stack = new vec4[ div ];
	int stackpointer;
	int stackstart;


	// input/output offsets.
	// they are discrete because the source can be non-square
	int yw = 0;	// current read offset in source
	int yi = 0; // current write offset in destination


	// blur horizontally, output to temporary buffer
	for ( int y=0; y<h; y++ ) {

		vec4 insum(0);
		vec4 outsum(0);
		vec4 sum(0);

		// preload the kernel(stack)
		// pixels before the left edge of the image are
		// samples of [0] (max()).  min() handles
		// images which are smaller than the kernel.
		for ( int i=-radius; i<=radius; i++ ) {

			// calcualte address of source pixel
			const vec4& p = pix[ yi + min(wm,max(i,0)) ];

			// put pixel in the stack
			vec4& sir = stack[ i + radius ];
			sir = p;

			// rbs is a weight from (1)...(radius+1)...(1)
			const int rbs = r1 - abs(i);

			// add pixel to accumulators
			sum += sir * rbs;
			if ( i > 0 ) { insum  += sir; }
			        else { outsum += sir; }
		}


		// now that the kernel is preloaded
		// stackpointer is the index of the center of the kernel
		stackpointer = radius;


		// now start outputing pixels
		for ( int x=0; x<w; x++ ) {

			// output a pixel
			tsurface[yi] = sum * divsum;


			// calculate address of the next pixel
			// to add to the kernel
			//
			// on first pass, make a lut to handle access
			// past the right edge of the width image.
			// min() will cause the last pixel to repeat.
			if ( y == 0 ) vmin[x] = min( x+radius+1, wm );
			vec4& p = pix[ yw + vmin[x] ];


			// remove "past" pixels from the sum
			sum -= outsum;


			// remove "left" side of stack from outsum
			stackstart = stackpointer - radius + div;
			vec4& sir = stack[ stackstart % div ];
			outsum -= sir;

			// now this (same) stack entry is the "right" side
			// add new pixel to the stack, and update accumulators
			sir = p;
			insum += sir;
			sum += insum;


			// slide kernel one pixel ahead (right),
			// update accumulators again
			stackpointer = (stackpointer+1)%div;
			const vec4& sirnext = stack[ stackpointer ];
			outsum += sirnext;
			insum -= sirnext;

			yi++;	// advance output offset to next pixel
		}
		yw += w;	// advance input offset to next line
	}//y loop


	// now blur vertically from the temporary
	// buffer, using the original image buffer
	// as the output
	for ( int x=0; x<w; x++ ) {

		vec4 insum(0);
		vec4 outsum(0);
		vec4 sum(0);

		//preload the stack
		int yp = -radius * w;
		for ( int i = -radius; i<=radius; i++ ) {

			vec4& sir = stack[ i + radius ];

			yi = max(0,yp)+x;
			const vec4& p = tsurface[ yi ];

			sir = p;

			const int rbs = r1 - abs(i);
			sum += sir * rbs;
			if ( i > 0 ) { insum  += sir; }
			        else { outsum += sir; }

			// only advance to the next row if there
			// are still more rows of image left.
			// otherwise, we keep sampling the same row
			// as if the bottom line was duplicated
			if ( i < hm ) yp += w;

		}//preload

		stackpointer = radius;

		// this loop is the same as the y-loop,
		// except the src/dest offsets are calcuated differently
		yi = x;	// set starting output offset by column
		for ( int y=0; y<h; y++ ) {

			pix[yi] = sum * divsum;

			stackstart = stackpointer - radius + div;
			vec4& sir = stack[ stackstart % div ];
			sum -= outsum;
			outsum -= sir;

			if ( x == 0 ) vmin[y] = min(y+r1,hm)*w;
			sir = tsurface[ x + vmin[y] ];

			insum += sir;
			sum += insum;

			stackpointer = (stackpointer+1)%div;
			const vec4& sirnext = stack[ stackpointer ];
			outsum += sirnext;
			insum -= sirnext;

			yi += w; // advance output offset to next line
		}
	}//x loop

	delete [] vmin;
	delete [] stack;
	delete [] tsurface;

}
	/*
	* "stackblur"
	* originally, Mario Klingemann, mario@quasimondo.com
	* this implementation, Benjamin Yates (benjamin@rqdq.com)
	* http://incubator.quasimondo.com/processing/stackblur.pde
	*
	* this blur routine seems to be quite popular.
	*
	* unfortunately, mario didn't comment anything.
	* but, it's easy to see it's just a flavor of a two-pass
	* sliding box kernel.
	*
	* this version is vectorized for float32 r/g/b/a using sse
	*
	* vec4() is just a class wrapping _mm_zzz_ps() family of SSE intrinsics
	* ( if you need one, start here:
	* - http://fastcpp.blogspot.com/2011/12/simple-vector3-class-with-sse-support.html )
	*
	* radius is in (whole) pixels
	*
	*/

	void stackblur( vec4* colorbuffer, const int w, const int h, const int radius )
	{
	if ( radius < 1 ) return; // nothing to do

	vec4* const pix = reinterpret_cast<vec4*>(image.c);

	// some values for convenience
	const int w = image.width;
	const int h = image.height;
	const int wm = w-1;
	const int hm = h-1;
	const int wh = w*h;
	const int r1 = radius + 1;

	// number of divisions in the kernel
	// D(-r), D(-r+1), ... D(0), ... D(r-1), D(r)
	const int div = (radius * 2) + 1;

	// temporary output space for first pass.
	vec4* tsurface = new vec4[wh];

	// lookup table for clamping pixel offsets
	// as the kernel passes the right (or lower) edge
	// of the input data
	int* const vmin = new int[ max(w,h) ];

	// calculate divisor for pulling an output from the kernel
	// the kernel is pyramid shaped.
	// to understand this divisor, imagine a cross-section
	// of the center of the pyramid.
	// (or ask mario to document it better)
	//
	// I store the reciprocal in the vector to
	// mul_ps later instead of div_ps
	int divsumi = ( div + 1 ) >> 1;
	divsumi *= divsumi;
	const vec4 divsum( 1.0f / (float)divsumi );

	// kernel pixel data "stack"
	// which works more like a ring-buffer,
	// where the pointer is advanced each iteration, modulo #divisions
	vec4* stack = new vec4[ div ];
	int stackpointer;
	int stackstart;



	// input/output offsets.
	// they are discrete because the source can be non-square
	int yw = 0; // current read offset in source
	int yi = 0; // current write offset in destination


	// blur horizontally, output to temporary buffer
	for ( int y=0; y<h; y++ ) {

	vec4 insum(0);
	vec4 outsum(0);
	vec4 sum(0);

	// preload the kernel(stack)
	// pixels before the left edge of the image are
	// samples of [0] (max()). min() handles
	// images which are smaller than the kernel.
	for ( int i=-radius; i<=radius; i++ ) {

	// calcualte address of source pixel
	const vec4& p = pix[ yi + min(wm,max(i,0)) ];

	// put pixel in the stack
	vec4& sir = stack[ i + radius ];
	sir = p;

	// rbs is a weight from (1)...(radius+1)...(1)
	const int rbs = r1 - abs(i);

	// add pixel to accumulators
	sum += sir * rbs;
	if ( i > 0 ) { insum += sir; }
	else { outsum += sir; }
	}


	// now that the kernel is preloaded
	// stackpointer is the index of the center of the kernel
	stackpointer = radius;


	// now start outputing pixels
	for ( int x=0; x<w; x++ ) {

	// output a pixel
	tsurface[yi] = sum * divsum;


	// calculate address of the next pixel
	// to add to the kernel
	//
	// on first pass, make a lut to handle access
	// past the right edge of the width image.
	// min() will cause the last pixel to repeat.
	if ( y == 0 ) vmin[x] = min( x+radius+1, wm );
	vec4& p = pix[ yw + vmin[x] ];


	// remove "past" pixels from the sum
	sum -= outsum;


	// remove "left" side of stack from outsum
	stackstart = stackpointer - radius + div;
	vec4& sir = stack[ stackstart % div ];
	outsum -= sir;

	// now this (same) stack entry is the "right" side
	// add new pixel to the stack, and update accumulators
	sir = p;
	insum += sir;
	sum += insum;


	// slide kernel one pixel ahead (right),
	// update accumulators again
	stackpointer = (stackpointer+1)%div;
	const vec4& sirnext = stack[ stackpointer ];
	outsum += sirnext;
	insum -= sirnext;

	yi++; // advance output offset to next pixel
	}
	yw += w; // advance input offset to next line
	}//y loop



	// now blur vertically from the temporary
	// buffer, using the original image buffer
	// as the output
	for ( int x=0; x<w; x++ ) {

	vec4 insum(0);
	vec4 outsum(0);
	vec4 sum(0);

	//preload the stack
	int yp = -radius * w;
	for ( int i = -radius; i<=radius; i++ ) {

	vec4& sir = stack[ i + radius ];

	yi = max(0,yp)+x;
	const vec4& p = tsurface[ yi ];

	sir = p;

	const int rbs = r1 - abs(i);
	sum += sir * rbs;
	if ( i > 0 ) { insum += sir; }
	else { outsum += sir; }

	// only advance to the next row if there
	// are still more rows of image left.
	// otherwise, we keep sampling the same row
	// as if the bottom line was duplicated
	if ( i < hm ) yp += w;

	}//preload

	stackpointer = radius;

	// this loop is the same as the y-loop,
	// except the src/dest offsets are calcuated differently
	yi = x; // set starting output offset by column
	for ( int y=0; y<h; y++ ) {

	pix[yi] = sum * divsum;

	stackstart = stackpointer - radius + div;
	vec4& sir = stack[ stackstart % div ];
	sum -= outsum;
	outsum -= sir;

	if ( x == 0 ) vmin[y] = min(y+r1,hm)*w;
	sir = tsurface[ x + vmin[y] ];

	insum += sir;
	sum += insum;

	stackpointer = (stackpointer+1)%div;
	const vec4& sirnext = stack[ stackpointer ];
	outsum += sirnext;
	insum -= sirnext;

	yi += w; // advance output offset to next line
	}
	}//x loop

	delete [] vmin;
	delete [] stack;
	delete [] tsurface;

	}