jrk/norm_xcorr.cpp

## norm_xcorr.cpp
// This algorithm is meant to be equivalent to CV_TM_CCOEFF_NORMED in OpenCV, and normxcorr2 in MATLAB:
// via http://stackoverflow.com/q/31060974/3815

#include <Halide.h>

typedef uint8_t pixel_t;

void normxcorr( Halide::ImageParam input,
                Halide::ImageParam kernel,
                Halide::Param<pixel_t> kernel_mean,
                Halide::Param<pixel_t> kernel_var,
                Halide::Func& output )
{
    Halide::Var x, y;
    Halide::RDom rk( kernel );

    // reduction domain for cumulative sums
    Halide::RDom ri( 1, input.width() - kernel.width() - 1,
                     1, input.height() - kernel.height() - 1 );

    Halide::Func input_32,
                 bounded_input,
                 kernel_32,
                 knorm,
                 conv,
                 normxcorr_internal,
                 sq_sum_x,
                 sq_sum_x_local,
                 sq_sum_y,
                 sq_sum_y_local,
                 sum_x,
                 sum_x_local,
                 sum_y,
                 sum_y_local,
                 win_var,
                 win_mean;

    Halide::Expr ksize = kernel.width() * kernel.height();

    // accessing outside the input image always returns 0
    bounded_input( x, y ) = Halide::BoundaryConditions::constant_exterior( input, 0 )( x, y );

    // cast to 32-bit to make room for multiplication
    input_32( x, y ) = Halide::cast<int32_t>( bounded_input( x, y ) );
    kernel_32( x, y ) = Halide::cast<int32_t>( kernel( x, y ) );

    // cumulative sum along each row
    sum_x( x, y ) = input_32( x, y );
    sum_x( ri.x, ri.y ) += sum_x( ri.x - 1, ri.y );

    // sum of 1 x W strips
    // (W is the width of the kernel)
    sum_x_local( x, y ) = sum_x( x + kernel.width() - 1, y ) - sum_x( x - 1, y );

    // cumulative sums of the 1 x W strips along each column
    sum_y( x, y ) = sum_x_local( x, y );
    sum_y( ri.x, ri.y ) += sum_y( ri.x, ri.y - 1);

    // sums up H strips (as above) to get the sum of an H x W rectangle
    // (H is the height of the kernel)
    sum_y_local( x, y ) = sum_y( x, y + kernel.height() - 1 ) - sum_y( x, y - 1 );

    // same as above, just with squared image values
    sq_sum_x( x, y ) = input_32( x, y ) * input_32( x, y );
    sq_sum_x( ri.x, ri.y ) += sq_sum_x( ri.x - 1, ri.y );

    sq_sum_x_local( x, y ) = sq_sum_x( x + kernel.width() - 1, y ) - sq_sum_x( x - 1, y );

    sq_sum_y( x, y ) = sq_sum_x_local( x, y );
    sq_sum_y( ri.x, ri.y ) += sq_sum_y( ri.x, ri.y - 1);

    sq_sum_y_local( x, y ) = sq_sum_y( x, y + kernel.height() - 1 ) - sq_sum_y( x, y - 1 );

    // the mean value of each window
    win_mean( x, y ) = sum_y_local( x, y ) / ksize;

    // the variance of each window
    win_var( x, y ) =  sq_sum_y_local( x, y ) / ksize;
    win_var( x, y) -= win_mean( x, y ) * win_mean( x, y );

    // partially normalize the kernel
    // (we'll divide by std. dev. at the end)
    knorm( x, y ) = kernel_32( x, y ) - kernel_mean;

    // convolve kernel and the input
    conv( x, y ) = Halide::sum( knorm( rk.x, rk.y ) * input_32( x + rk.x, y + rk.y ) );

    // calculate normxcorr, except scaled to 0 to 254 (for an 8-bit image)
    normxcorr_internal( x, y ) = conv( x, y ) * 127 / Halide::sqrt( kernel_var * win_var( x, y ) ) + 127;

    // after scaling pixel values, it's safe to cast down to 8-bit
    output( x, y ) = Halide::cast<pixel_t>( normxcorr_internal( x, y ) );

    knorm.compute_root();
    input_32.compute_root();
    sum_x.compute_root();
    sum_y.compute_root();
    sq_sum_x.compute_root();
    sq_sum_y.compute_root();

#if 0
    sum_x.trace_realizations();
    sum_y.trace_realizations();
    sq_sum_x.trace_realizations();
    sq_sum_y.trace_realizations();
#endif

#if 0
    sum_x.vectorize(x, 32).parallel(y);
    sum_y.vectorize(x, 32).parallel(y);
    sq_sum_x.vectorize(x, 32).parallel(y);
    sq_sum_y.vectorize(x, 32).parallel(y);
#endif
}

int main (int argc, char const *argv[])
{
    Halide::Func gen;
    Halide::ImageParam in(Halide::UInt(8), 2), k(Halide::UInt(8), 2);
    Halide::Param<pixel_t> kMean, kVar;

    normxcorr(in, k, kMean, kVar, gen);

    gen.compile_to_file("normxcorr", {in, k, kMean, kVar});

    return 0;
}
	// This algorithm is meant to be equivalent to CV_TM_CCOEFF_NORMED in OpenCV, and normxcorr2 in MATLAB:
	// via http://stackoverflow.com/q/31060974/3815

	#include <Halide.h>

	typedef uint8_t pixel_t;

	void normxcorr( Halide::ImageParam input,
	Halide::ImageParam kernel,
	Halide::Param<pixel_t> kernel_mean,
	Halide::Param<pixel_t> kernel_var,
	Halide::Func& output )
	{
	Halide::Var x, y;
	Halide::RDom rk( kernel );

	// reduction domain for cumulative sums
	Halide::RDom ri( 1, input.width() - kernel.width() - 1,
	1, input.height() - kernel.height() - 1 );

	Halide::Func input_32,
	bounded_input,
	kernel_32,
	knorm,
	conv,
	normxcorr_internal,
	sq_sum_x,
	sq_sum_x_local,
	sq_sum_y,
	sq_sum_y_local,
	sum_x,
	sum_x_local,
	sum_y,
	sum_y_local,
	win_var,
	win_mean;

	Halide::Expr ksize = kernel.width() * kernel.height();

	// accessing outside the input image always returns 0
	bounded_input( x, y ) = Halide::BoundaryConditions::constant_exterior( input, 0 )( x, y );

	// cast to 32-bit to make room for multiplication
	input_32( x, y ) = Halide::cast<int32_t>( bounded_input( x, y ) );
	kernel_32( x, y ) = Halide::cast<int32_t>( kernel( x, y ) );

	// cumulative sum along each row
	sum_x( x, y ) = input_32( x, y );
	sum_x( ri.x, ri.y ) += sum_x( ri.x - 1, ri.y );

	// sum of 1 x W strips
	// (W is the width of the kernel)
	sum_x_local( x, y ) = sum_x( x + kernel.width() - 1, y ) - sum_x( x - 1, y );

	// cumulative sums of the 1 x W strips along each column
	sum_y( x, y ) = sum_x_local( x, y );
	sum_y( ri.x, ri.y ) += sum_y( ri.x, ri.y - 1);

	// sums up H strips (as above) to get the sum of an H x W rectangle
	// (H is the height of the kernel)
	sum_y_local( x, y ) = sum_y( x, y + kernel.height() - 1 ) - sum_y( x, y - 1 );

	// same as above, just with squared image values
	sq_sum_x( x, y ) = input_32( x, y ) * input_32( x, y );
	sq_sum_x( ri.x, ri.y ) += sq_sum_x( ri.x - 1, ri.y );

	sq_sum_x_local( x, y ) = sq_sum_x( x + kernel.width() - 1, y ) - sq_sum_x( x - 1, y );

	sq_sum_y( x, y ) = sq_sum_x_local( x, y );
	sq_sum_y( ri.x, ri.y ) += sq_sum_y( ri.x, ri.y - 1);

	sq_sum_y_local( x, y ) = sq_sum_y( x, y + kernel.height() - 1 ) - sq_sum_y( x, y - 1 );

	// the mean value of each window
	win_mean( x, y ) = sum_y_local( x, y ) / ksize;

	// the variance of each window
	win_var( x, y ) = sq_sum_y_local( x, y ) / ksize;
	win_var( x, y) -= win_mean( x, y ) * win_mean( x, y );

	// partially normalize the kernel
	// (we'll divide by std. dev. at the end)
	knorm( x, y ) = kernel_32( x, y ) - kernel_mean;

	// convolve kernel and the input
	conv( x, y ) = Halide::sum( knorm( rk.x, rk.y ) * input_32( x + rk.x, y + rk.y ) );

	// calculate normxcorr, except scaled to 0 to 254 (for an 8-bit image)
	normxcorr_internal( x, y ) = conv( x, y ) * 127 / Halide::sqrt( kernel_var * win_var( x, y ) ) + 127;

	// after scaling pixel values, it's safe to cast down to 8-bit
	output( x, y ) = Halide::cast<pixel_t>( normxcorr_internal( x, y ) );

	knorm.compute_root();
	input_32.compute_root();
	sum_x.compute_root();
	sum_y.compute_root();
	sq_sum_x.compute_root();
	sq_sum_y.compute_root();

	#if 0
	sum_x.trace_realizations();
	sum_y.trace_realizations();
	sq_sum_x.trace_realizations();
	sq_sum_y.trace_realizations();
	#endif

	#if 0
	sum_x.vectorize(x, 32).parallel(y);
	sum_y.vectorize(x, 32).parallel(y);
	sq_sum_x.vectorize(x, 32).parallel(y);
	sq_sum_y.vectorize(x, 32).parallel(y);
	#endif
	}

	int main (int argc, char const *argv[])
	{
	Halide::Func gen;
	Halide::ImageParam in(Halide::UInt(8), 2), k(Halide::UInt(8), 2);
	Halide::Param<pixel_t> kMean, kVar;

	normxcorr(in, k, kMean, kVar, gen);

	gen.compile_to_file("normxcorr", {in, k, kMean, kVar});

	return 0;
	}