x42/peak_calc.cc

## peak_calc.cc
// -- Linux / Intel --
// g++ -o peak_calc peak_calc.cc -Wall -mavx -lm -O3 -fopt-info && ./peak_calc
// g++ -o peak_calc peak_calc.cc -Wall -msse2 -lm -O3 -fopt-info && ./peak_calc
//
// -- Linux / ARM --
// g++ -o peak_calc peak_calc.cc -Wall -lm -O3 && ./peak_calc
// g++ -o peak_calc peak_calc.cc -Wall -mfpu=neon-vfpv4 -lm -O3 && ./peak_calc
//
// -- macOS --
// g++ -o peak_calc peak_calc.cc -Wall -lm -O3 -framework Accelerate

#include <math.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/time.h>
#include <time.h>

float
fallback_compute_peak (const float* const buf, uint32_t n_samples, float current)
{
	for (uint32_t i = 0; i < n_samples; ++i) {
		const float x = fabsf (buf[i]);
		if (x > current) {
			current = x;
		}
	}
	return current;
}

#ifdef __APPLE__
#include <Accelerate/Accelerate.h>
#define _HAVE_DSP_COMPUTE_PEAK

static float
dsp_compute_peak (const float* buf, uint32_t n_samples, float current)
{
	float tmp = 0.0f;
	vDSP_maxmgv (buf, (vDSP_Stride)1, &tmp, n_samples);
	return fmaxf (current, tmp);
}

#elif (defined __aarch64__) || (defined __arm__)

#include <arm_acle.h>
#include <arm_neon.h>

#define IS_ALIGNED_TO(ptr, bytes) (((uintptr_t)ptr) % (bytes) == 0)
#define _HAVE_DSP_COMPUTE_PEAK

float
dsp_compute_peak (const float* src, uint32_t nframes, float current)
{
	float32x4_t vc0;

	// Broadcast single value to all elements of the register
	vc0 = vdupq_n_f32 (current);

	// While pointer is not aligned, process one sample at a time
	while (!IS_ALIGNED_TO (src, sizeof (float32x4_t)) && (nframes > 0)) {
		float32x4_t x0;

		x0  = vld1q_dup_f32 (src);
		x0  = vabsq_f32 (x0);
		vc0 = vmaxq_f32 (vc0, x0);

		++src;
		--nframes;
	}

	// SIMD portion with aligned buffers
	do {
		while (nframes >= 8) {
			float32x4_t x0, x1;

			x0 = vld1q_f32 (src + 0);
			x1 = vld1q_f32 (src + 4);

			x0 = vabsq_f32 (x0);
			x1 = vabsq_f32 (x1);

			vc0 = vmaxq_f32 (vc0, x0);
			vc0 = vmaxq_f32 (vc0, x1);

			src += 8;
			nframes -= 8;
		}

		while (nframes >= 4) {
			float32x4_t x0;

			x0 = vld1q_f32 (src);

			x0  = vabsq_f32 (x0);
			vc0 = vmaxq_f32 (vc0, x0);

			src += 4;
			nframes -= 4;
		}

		while (nframes >= 2) {
			float32x2_t x0;
			float32x4_t y0;

			x0 = vld1_f32 (src);        // Load two elements
			x0 = vabs_f32 (x0);         // Compute ABS value
			y0 = vcombine_f32 (x0, x0); // Combine two vectors

			vc0 = vmaxq_f32 (vc0, y0);

			src += 2;
			nframes -= 2;
		}
	} while (0);

	// Do remaining samples one frame at a time
	while (nframes > 0) {
		float32x4_t x0;

		x0  = vld1q_dup_f32 (src);
		x0  = vabsq_f32 (x0);
		vc0 = vmaxq_f32 (vc0, x0);

		++src;
		--nframes;
	}

	// Compute the max in register
	do {
		float32x2_t vlo  = vget_low_f32 (vc0);
		float32x2_t vhi  = vget_high_f32 (vc0);
		float32x2_t max0 = vpmax_f32 (vlo, vhi);
		float32x2_t max1 = vpmax_f32 (max0, max0); // Max is now at max1[0]
		current          = vget_lane_f32 (max1, 0);
	} while (0);

	return current;
}

#undef IS_ALIGNED_TO

#elif (defined __x86_64__) || (defined __i386__) || (defined _M_X64) || (defined _M_IX86) // ARCH_X86

#include <immintrin.h>
#include <xmmintrin.h>

#ifdef __AVX__ // TODO runtime detect AVX
#define _HAVE_DSP_COMPUTE_PEAK

#warning using AVX, this limits available architectures

static float
dsp_compute_peak (const float* src, uint32_t n_samples, float current)
{
	const __m256 ABS_MASK = _mm256_set1_ps (-0.0F);

	// Broadcast the current max value to all elements of the YMM register
	__m256 vmax = _mm256_broadcast_ss (&current);

	// Compute single min/max of unaligned portion until alignment is reached
	while ((((intptr_t)src) % 32 != 0) && n_samples > 0) {
		__m256 vsrc;

		vsrc = _mm256_setzero_ps ();
		vsrc = _mm256_castps128_ps256 (_mm_load_ss (src));
		vsrc = _mm256_andnot_ps (ABS_MASK, vsrc);
		vmax = _mm256_max_ps (vmax, vsrc);

		++src;
		--n_samples;
	}

	// Process the aligned portion 16 samples at a time
	while (n_samples >= 16) {
#ifdef _WIN32
		_mm_prefetch (((char*)src + (16 * sizeof (float))), _mm_hint (0));
#else
		__builtin_prefetch (src + (16 * sizeof (float)), 0, 0);
#endif
		__m256 vsrc1, vsrc2;
		vsrc1 = _mm256_load_ps (src + 0);
		vsrc2 = _mm256_load_ps (src + 8);

		vsrc1 = _mm256_andnot_ps (ABS_MASK, vsrc1);
		vsrc2 = _mm256_andnot_ps (ABS_MASK, vsrc2);

		vmax = _mm256_max_ps (vmax, vsrc1);
		vmax = _mm256_max_ps (vmax, vsrc2);

		src += 16;
		n_samples -= 16;
	}

	// Process the remaining samples 8 at a time
	while (n_samples >= 8) {
		__m256 vsrc;

		vsrc = _mm256_load_ps (src);
		vsrc = _mm256_andnot_ps (ABS_MASK, vsrc);
		vmax = _mm256_max_ps (vmax, vsrc);

		src += 8;
		n_samples -= 8;
	}

	// If there are still some left 4 to 8 samples, process them below
	while (n_samples > 0) {
		__m256 vsrc;

		vsrc = _mm256_setzero_ps ();
		vsrc = _mm256_castps128_ps256 (_mm_load_ss (src));
		vsrc = _mm256_andnot_ps (ABS_MASK, vsrc);
		vmax = _mm256_max_ps (vmax, vsrc);

		++src;
		--n_samples;
	}

	__m256 tmp;
	tmp  = _mm256_shuffle_ps (vmax, vmax, _MM_SHUFFLE (2, 3, 0, 1));
	vmax = _mm256_max_ps (tmp, vmax);
	tmp  = _mm256_shuffle_ps (vmax, vmax, _MM_SHUFFLE (1, 0, 3, 2));
	vmax = _mm256_max_ps (tmp, vmax);
	tmp  = _mm256_permute2f128_ps (vmax, vmax, 1);
	vmax = _mm256_max_ps (tmp, vmax);

	// zero upper 128 bit of 256 bit ymm register to avoid penalties using non-AVX instructions
	_mm256_zeroupper ();

#if defined(__GNUC__) && (__GNUC__ < 5)
	return *((float*)&vmax);
#elif defined(__GNUC__) && (__GNUC__ < 8)
	return vmax[0];
#else
	return _mm256_cvtss_f32 (vmax);
#endif
}

#elif defined __SSE2__
#define _HAVE_DSP_COMPUTE_PEAK

static float
dsp_compute_peak (const float* src, uint32_t n_samples, float current)
{
	const __m128 ABS_MASK = _mm_set1_ps (-0.0F);

	__m128 vmax;
	__m128 temp;

	vmax = _mm_set1_ps (current);

	// Compute single max of unaligned portion until alignment is reached
	while (((intptr_t)src) % 16 != 0 && n_samples > 0) {
		temp = _mm_set1_ps (*src);
		temp = _mm_andnot_ps (ABS_MASK, temp);
		vmax = _mm_max_ps (vmax, temp);
		++src;
		--n_samples;
	}

	// use 64 byte prefetch for quadruple quads
	while (n_samples >= 16) {
#ifdef _WIN32
		_mm_prefetch (((char*)src + (16 * sizeof (float))), _mm_hint (0));
#else
		__builtin_prefetch (src + (16 * sizeof (float)), 0, 0);
#endif
		temp = _mm_load_ps (src);
		temp = _mm_andnot_ps (ABS_MASK, temp);
		vmax = _mm_max_ps (vmax, temp);
		src += 4;
		temp = _mm_load_ps (src);
		temp = _mm_andnot_ps (ABS_MASK, temp);
		vmax = _mm_max_ps (vmax, temp);
		src += 4;
		temp = _mm_load_ps (src);
		temp = _mm_andnot_ps (ABS_MASK, temp);
		vmax = _mm_max_ps (vmax, temp);
		src += 4;
		temp = _mm_load_ps (src);
		temp = _mm_andnot_ps (ABS_MASK, temp);
		vmax = _mm_max_ps (vmax, temp);
		src += 4;
		n_samples -= 16;
	}

	// temp through aligned buffers
	while (n_samples >= 4) {
		temp = _mm_load_ps (src);
		temp = _mm_andnot_ps (ABS_MASK, temp);
		vmax = _mm_max_ps (vmax, temp);
		src += 4;
		n_samples -= 4;
	}

	// temp through the rest < 4 samples
	while (n_samples > 0) {
		temp = _mm_set1_ps (*src);
		temp = _mm_andnot_ps (ABS_MASK, temp);
		vmax = _mm_max_ps (vmax, temp);
		++src;
		--n_samples;
	}

	temp = _mm_shuffle_ps (vmax, vmax, _MM_SHUFFLE (2, 3, 0, 1));
	vmax = _mm_max_ps (temp, vmax);
	temp = _mm_shuffle_ps (vmax, vmax, _MM_SHUFFLE (1, 0, 3, 2));
	vmax = _mm_max_ps (temp, vmax);

#if defined(__GNUC__) && (__GNUC__ < 5)
	return *((float*)&vmax);
#else
	return vmax[0];
#endif
}
#endif // SSE

#endif

#ifndef _HAVE_DSP_COMPUTE_PEAK
#warning dsp_compute_peak is not accelerated on this architecture

static float
dsp_compute_peak (const float* const buf, uint32_t n_samples, float current)
{
	for (uint32_t i = 0; i < n_samples; ++i) {
		const float x = fabsf (buf[i]);
		if (x > current) {
			current = x;
		}
	}
	return current;
}
#endif


#if (defined(__x86_64__) || defined(_M_X64))
static const int CPU_CACHE_ALIGN = 64;
#elif (defined __aarch64__) || (defined __arm__)
static const int CPU_CACHE_ALIGN = 128; // sizeof(float32x4_t)
#else
static const int CPU_CACHE_ALIGN = 16;
#endif

int
main ()
{
	float  pk;
	float* buf = NULL;
	int    n_samples = 1024;

	posix_memalign ((void**)&buf, CPU_CACHE_ALIGN, n_samples * sizeof (float));

#if 1
	srand (time (NULL));
	srand (5);

	for (int i = 0; i < n_samples; ++i) {
		buf[i] = (2.f * rand () / (float)RAND_MAX) - 1.f;
	}
#else
	memset (buf, 0, sizeof (float) * n_samples);
	buf[10] = .5;
#endif

	//buf = &buf[1]; --n_samples;

	struct timeval tv0, tv1;

#if 1
	pk = 0;
	gettimeofday (&tv0, NULL);
	for (int i = 0; i < 1000000; ++i) {
		pk = fallback_compute_peak (buf, n_samples, pk);
	}
	gettimeofday (&tv1, NULL);
	printf ("SIM %8.3f ms peak: %f\n", ((tv1.tv_sec * 1000000 + tv1.tv_usec) - (tv0.tv_sec * 1000000 + tv0.tv_usec)) / 1000.f, pk);
#endif

#if 1
	pk = 0;
	gettimeofday (&tv0, NULL);
	for (int i = 0; i < 1000000; ++i) {
		pk = dsp_compute_peak (buf, n_samples, pk);
	}
	gettimeofday (&tv1, NULL);
	printf ("OPT %8.3f ms peak: %f\n", ((tv1.tv_sec * 1000000 + tv1.tv_usec) - (tv0.tv_sec * 1000000 + tv0.tv_usec)) / 1000.f, pk);
#endif

	return 0;
}
	// -- Linux / Intel --
	// g++ -o peak_calc peak_calc.cc -Wall -mavx -lm -O3 -fopt-info && ./peak_calc
	// g++ -o peak_calc peak_calc.cc -Wall -msse2 -lm -O3 -fopt-info && ./peak_calc
	//
	// -- Linux / ARM --
	// g++ -o peak_calc peak_calc.cc -Wall -lm -O3 && ./peak_calc
	// g++ -o peak_calc peak_calc.cc -Wall -mfpu=neon-vfpv4 -lm -O3 && ./peak_calc
	//
	// -- macOS --
	// g++ -o peak_calc peak_calc.cc -Wall -lm -O3 -framework Accelerate

	#include <math.h>
	#include <stdint.h>
	#include <stdio.h>
	#include <stdlib.h>
	#include <string.h>
	#include <sys/time.h>
	#include <time.h>

	float
	fallback_compute_peak (const float* const buf, uint32_t n_samples, float current)
	{
	for (uint32_t i = 0; i < n_samples; ++i) {
	const float x = fabsf (buf[i]);
	if (x > current) {
	current = x;
	}
	}
	return current;
	}

	#ifdef __APPLE__
	#include <Accelerate/Accelerate.h>
	#define _HAVE_DSP_COMPUTE_PEAK

	static float
	dsp_compute_peak (const float* buf, uint32_t n_samples, float current)
	{
	float tmp = 0.0f;
	vDSP_maxmgv (buf, (vDSP_Stride)1, &tmp, n_samples);
	return fmaxf (current, tmp);
	}

	#elif (defined __aarch64__) \|\| (defined __arm__)

	#include <arm_acle.h>
	#include <arm_neon.h>

	#define IS_ALIGNED_TO(ptr, bytes) (((uintptr_t)ptr) % (bytes) == 0)
	#define _HAVE_DSP_COMPUTE_PEAK

	float
	dsp_compute_peak (const float* src, uint32_t nframes, float current)
	{
	float32x4_t vc0;

	// Broadcast single value to all elements of the register
	vc0 = vdupq_n_f32 (current);

	// While pointer is not aligned, process one sample at a time
	while (!IS_ALIGNED_TO (src, sizeof (float32x4_t)) && (nframes > 0)) {
	float32x4_t x0;

	x0 = vld1q_dup_f32 (src);
	x0 = vabsq_f32 (x0);
	vc0 = vmaxq_f32 (vc0, x0);

	++src;
	--nframes;
	}

	// SIMD portion with aligned buffers
	do {
	while (nframes >= 8) {
	float32x4_t x0, x1;

	x0 = vld1q_f32 (src + 0);
	x1 = vld1q_f32 (src + 4);

	x0 = vabsq_f32 (x0);
	x1 = vabsq_f32 (x1);

	vc0 = vmaxq_f32 (vc0, x0);
	vc0 = vmaxq_f32 (vc0, x1);

	src += 8;
	nframes -= 8;
	}

	while (nframes >= 4) {
	float32x4_t x0;

	x0 = vld1q_f32 (src);

	x0 = vabsq_f32 (x0);
	vc0 = vmaxq_f32 (vc0, x0);

	src += 4;
	nframes -= 4;
	}

	while (nframes >= 2) {
	float32x2_t x0;
	float32x4_t y0;

	x0 = vld1_f32 (src); // Load two elements
	x0 = vabs_f32 (x0); // Compute ABS value
	y0 = vcombine_f32 (x0, x0); // Combine two vectors

	vc0 = vmaxq_f32 (vc0, y0);

	src += 2;
	nframes -= 2;
	}
	} while (0);

	// Do remaining samples one frame at a time
	while (nframes > 0) {
	float32x4_t x0;

	x0 = vld1q_dup_f32 (src);
	x0 = vabsq_f32 (x0);
	vc0 = vmaxq_f32 (vc0, x0);

	++src;
	--nframes;
	}

	// Compute the max in register
	do {
	float32x2_t vlo = vget_low_f32 (vc0);
	float32x2_t vhi = vget_high_f32 (vc0);
	float32x2_t max0 = vpmax_f32 (vlo, vhi);
	float32x2_t max1 = vpmax_f32 (max0, max0); // Max is now at max1[0]
	current = vget_lane_f32 (max1, 0);
	} while (0);

	return current;
	}

	#undef IS_ALIGNED_TO

	#elif (defined __x86_64__) \|\| (defined __i386__) \|\| (defined _M_X64) \|\| (defined _M_IX86) // ARCH_X86

	#include <immintrin.h>
	#include <xmmintrin.h>

	#ifdef __AVX__ // TODO runtime detect AVX
	#define _HAVE_DSP_COMPUTE_PEAK

	#warning using AVX, this limits available architectures

	static float
	dsp_compute_peak (const float* src, uint32_t n_samples, float current)
	{
	const __m256 ABS_MASK = _mm256_set1_ps (-0.0F);

	// Broadcast the current max value to all elements of the YMM register
	__m256 vmax = _mm256_broadcast_ss (&current);

	// Compute single min/max of unaligned portion until alignment is reached
	while ((((intptr_t)src) % 32 != 0) && n_samples > 0) {
	__m256 vsrc;

	vsrc = _mm256_setzero_ps ();
	vsrc = _mm256_castps128_ps256 (_mm_load_ss (src));
	vsrc = _mm256_andnot_ps (ABS_MASK, vsrc);
	vmax = _mm256_max_ps (vmax, vsrc);

	++src;
	--n_samples;
	}

	// Process the aligned portion 16 samples at a time
	while (n_samples >= 16) {
	#ifdef _WIN32
	_mm_prefetch (((char)src + (16 sizeof (float))), _mm_hint (0));
	#else
	__builtin_prefetch (src + (16 * sizeof (float)), 0, 0);
	#endif
	__m256 vsrc1, vsrc2;
	vsrc1 = _mm256_load_ps (src + 0);
	vsrc2 = _mm256_load_ps (src + 8);

	vsrc1 = _mm256_andnot_ps (ABS_MASK, vsrc1);
	vsrc2 = _mm256_andnot_ps (ABS_MASK, vsrc2);

	vmax = _mm256_max_ps (vmax, vsrc1);
	vmax = _mm256_max_ps (vmax, vsrc2);

	src += 16;
	n_samples -= 16;
	}

	// Process the remaining samples 8 at a time
	while (n_samples >= 8) {
	__m256 vsrc;

	vsrc = _mm256_load_ps (src);
	vsrc = _mm256_andnot_ps (ABS_MASK, vsrc);
	vmax = _mm256_max_ps (vmax, vsrc);

	src += 8;
	n_samples -= 8;
	}

	// If there are still some left 4 to 8 samples, process them below
	while (n_samples > 0) {
	__m256 vsrc;

	vsrc = _mm256_setzero_ps ();
	vsrc = _mm256_castps128_ps256 (_mm_load_ss (src));
	vsrc = _mm256_andnot_ps (ABS_MASK, vsrc);
	vmax = _mm256_max_ps (vmax, vsrc);

	++src;
	--n_samples;
	}

	__m256 tmp;
	tmp = _mm256_shuffle_ps (vmax, vmax, _MM_SHUFFLE (2, 3, 0, 1));
	vmax = _mm256_max_ps (tmp, vmax);
	tmp = _mm256_shuffle_ps (vmax, vmax, _MM_SHUFFLE (1, 0, 3, 2));
	vmax = _mm256_max_ps (tmp, vmax);
	tmp = _mm256_permute2f128_ps (vmax, vmax, 1);
	vmax = _mm256_max_ps (tmp, vmax);

	// zero upper 128 bit of 256 bit ymm register to avoid penalties using non-AVX instructions
	_mm256_zeroupper ();

	#if defined(__GNUC__) && (__GNUC__ < 5)
	return ((float)&vmax);
	#elif defined(__GNUC__) && (__GNUC__ < 8)
	return vmax[0];
	#else
	return _mm256_cvtss_f32 (vmax);
	#endif
	}

	#elif defined __SSE2__
	#define _HAVE_DSP_COMPUTE_PEAK

	static float
	dsp_compute_peak (const float* src, uint32_t n_samples, float current)
	{
	const __m128 ABS_MASK = _mm_set1_ps (-0.0F);

	__m128 vmax;
	__m128 temp;

	vmax = _mm_set1_ps (current);

	// Compute single max of unaligned portion until alignment is reached
	while (((intptr_t)src) % 16 != 0 && n_samples > 0) {
	temp = _mm_set1_ps (*src);
	temp = _mm_andnot_ps (ABS_MASK, temp);
	vmax = _mm_max_ps (vmax, temp);
	++src;
	--n_samples;
	}

	// use 64 byte prefetch for quadruple quads
	while (n_samples >= 16) {
	#ifdef _WIN32
	_mm_prefetch (((char)src + (16 sizeof (float))), _mm_hint (0));
	#else
	__builtin_prefetch (src + (16 * sizeof (float)), 0, 0);
	#endif
	temp = _mm_load_ps (src);
	temp = _mm_andnot_ps (ABS_MASK, temp);
	vmax = _mm_max_ps (vmax, temp);
	src += 4;
	temp = _mm_load_ps (src);
	temp = _mm_andnot_ps (ABS_MASK, temp);
	vmax = _mm_max_ps (vmax, temp);
	src += 4;
	temp = _mm_load_ps (src);
	temp = _mm_andnot_ps (ABS_MASK, temp);
	vmax = _mm_max_ps (vmax, temp);
	src += 4;
	temp = _mm_load_ps (src);
	temp = _mm_andnot_ps (ABS_MASK, temp);
	vmax = _mm_max_ps (vmax, temp);
	src += 4;
	n_samples -= 16;
	}

	// temp through aligned buffers
	while (n_samples >= 4) {
	temp = _mm_load_ps (src);
	temp = _mm_andnot_ps (ABS_MASK, temp);
	vmax = _mm_max_ps (vmax, temp);
	src += 4;
	n_samples -= 4;
	}

	// temp through the rest < 4 samples
	while (n_samples > 0) {
	temp = _mm_set1_ps (*src);
	temp = _mm_andnot_ps (ABS_MASK, temp);
	vmax = _mm_max_ps (vmax, temp);
	++src;
	--n_samples;
	}

	temp = _mm_shuffle_ps (vmax, vmax, _MM_SHUFFLE (2, 3, 0, 1));
	vmax = _mm_max_ps (temp, vmax);
	temp = _mm_shuffle_ps (vmax, vmax, _MM_SHUFFLE (1, 0, 3, 2));
	vmax = _mm_max_ps (temp, vmax);

	#if defined(__GNUC__) && (__GNUC__ < 5)
	return ((float)&vmax);
	#else
	return vmax[0];
	#endif
	}
	#endif // SSE

	#endif

	#ifndef _HAVE_DSP_COMPUTE_PEAK
	#warning dsp_compute_peak is not accelerated on this architecture

	static float
	dsp_compute_peak (const float* const buf, uint32_t n_samples, float current)
	{
	for (uint32_t i = 0; i < n_samples; ++i) {
	const float x = fabsf (buf[i]);
	if (x > current) {
	current = x;
	}
	}
	return current;
	}
	#endif


	#if (defined(__x86_64__) \|\| defined(_M_X64))
	static const int CPU_CACHE_ALIGN = 64;
	#elif (defined __aarch64__) \|\| (defined __arm__)
	static const int CPU_CACHE_ALIGN = 128; // sizeof(float32x4_t)
	#else
	static const int CPU_CACHE_ALIGN = 16;
	#endif

	int
	main ()
	{
	float pk;
	float* buf = NULL;
	int n_samples = 1024;

	posix_memalign ((void*)&buf, CPU_CACHE_ALIGN, n_samples sizeof (float));

	#if 1
	srand (time (NULL));
	srand (5);

	for (int i = 0; i < n_samples; ++i) {
	buf[i] = (2.f * rand () / (float)RAND_MAX) - 1.f;
	}
	#else
	memset (buf, 0, sizeof (float) * n_samples);
	buf[10] = .5;
	#endif

	//buf = &buf[1]; --n_samples;

	struct timeval tv0, tv1;

	#if 1
	pk = 0;
	gettimeofday (&tv0, NULL);
	for (int i = 0; i < 1000000; ++i) {
	pk = fallback_compute_peak (buf, n_samples, pk);
	}
	gettimeofday (&tv1, NULL);
	printf ("SIM %8.3f ms peak: %f\n", ((tv1.tv_sec * 1000000 + tv1.tv_usec) - (tv0.tv_sec * 1000000 + tv0.tv_usec)) / 1000.f, pk);
	#endif

	#if 1
	pk = 0;
	gettimeofday (&tv0, NULL);
	for (int i = 0; i < 1000000; ++i) {
	pk = dsp_compute_peak (buf, n_samples, pk);
	}
	gettimeofday (&tv1, NULL);
	printf ("OPT %8.3f ms peak: %f\n", ((tv1.tv_sec * 1000000 + tv1.tv_usec) - (tv0.tv_sec * 1000000 + tv0.tv_usec)) / 1000.f, pk);
	#endif

	return 0;
	}