@mmalex
Created March 4, 2020 14:55
quick timing test of different ways of writing a chain of serial allpasses
#include "stdafx.h"
#include <Windows.h> // for performance counter
#include <stdio.h>   // for printf (likely already pulled in via stdafx.h)
// quick test of the theory in https://gist.github.com/mmalex/3a538aaba60f0ca21eac868269525452
// we try running a simple impulse train (click every 4096 samples) through 6 allpasses with random lengths
// we time how long it takes to process 1 million samples, structuring the loop 3 ways:
// - a sample at a time with self contained allpass structures,
// - a sample at a time with a single big buffer
// - a block at a time using self contained allpass structures, operating in place on a 256 sample buffer.
// using this naive code, on a single core of my AMD threadripper, with default release compile settings on visual studio 2015,
// I see
// 8.1ms sample-at-a-time, 7.1ms bigbuf sample-at-a-time, 17.2ms block-at-a-time
// on a small MCU (cortex m4 without a cache), the difference is even bigger (tho I haven't run this exact code on it)
// I'm pleasantly surprised that, unless I f*ed something up (likely!), the simplest solution - sample at a time in a big buffer - goes quickest.
// obviously you could make all of these techniques faster with more complex code. I just wanted to measure the rough ballpark of how they
// stack up against each other with similar levels of complexity & time lavished on them (ie not very much).
// notably missing from my comparison is big-buffer-block-at-a-time, which as Sean noted, needs a little care so that you don't stomp on
// adjacent allpasses' state; you can avoid that by spacing them out a bit in the big buffer, and once you do that, you can
// pull the & out of the loop. however I think it uses more memory (bad on MCU), is more complex (bad on my brain), and I am still not convinced
// that buffer at a time is ever better for this kind of simple chain-of-allpasses DSP. (a rough, untimed sketch of that variant is included after DoReverbBlock below.)
template <int N> struct AllPass { // allpass with delay N and coefficient 0.5
    float buf[N]={};
    int i=0;
    inline float doit(float x) {
        float delayed=buf[i];          // v[n-N]
        buf[i] = x -= delayed * 0.5f;  // v[n] = x[n] - 0.5*v[n-N]
        if (++i == N) i=0;             // circular index
        return x * 0.5f + delayed;     // y[n] = 0.5*v[n] + v[n-N]
    }
};
// lengths of 6 allpasses
#define AP1 123
#define AP2 272
#define AP3 313
#define AP4 2040
#define AP5 4313
#define AP6 5916
AllPass<AP1> a1;
AllPass<AP2> a2;
AllPass<AP3> a3;
AllPass<AP4> a4;
AllPass<AP5> a5;
AllPass<AP6> a6;
#define MASK 16383
static_assert(AP1+AP2+AP3+AP4+AP5+AP6<=MASK,"the allpasses must fit in the big buffer. double MASK please");
static_assert(AP1+AP2+AP3+AP4+AP5+AP6>MASK/2,"the allpasses are too small; halve MASK please");
float buf[MASK+1]; // one shared circular buffer holding all six allpass delay lines, packed end to end
int delaypos;      // shared write head, decremented once per sample
// same allpass update as AllPass::doit, but on a window of the shared buffer: write the new state at i,
// read the state written N samples ago at (i+N)&MASK, then hand i+N on as the next allpass's write position
#define DoAllPass(N) { int j=(i+N)&MASK; float delayed=buf[j]; buf[i]=x-=delayed*0.5f; x=x*0.5f+delayed; i=j; }
inline float DoReverb1sampOneBuf(float x) {
    int i=(delaypos--)&MASK; // this is the only index maintenance we need!
    DoAllPass(AP1);
    DoAllPass(AP2);
    DoAllPass(AP3);
    DoAllPass(AP4);
    DoAllPass(AP5);
    DoAllPass(AP6);
    return x;
}
inline float DoReverb1samp(float x) {
    x=a1.doit(x);
    x=a2.doit(x);
    x=a3.doit(x);
    x=a4.doit(x);
    x=a5.doit(x);
    x=a6.doit(x);
    return x;
}
const static int blocksize=256;
inline void DoReverbBlock(float *buf) {
    for (int i=0;i<blocksize;++i) buf[i]=a1.doit(buf[i]);
    for (int i=0;i<blocksize;++i) buf[i]=a2.doit(buf[i]);
    for (int i=0;i<blocksize;++i) buf[i]=a3.doit(buf[i]);
    for (int i=0;i<blocksize;++i) buf[i]=a4.doit(buf[i]);
    for (int i=0;i<blocksize;++i) buf[i]=a5.doit(buf[i]);
    for (int i=0;i<blocksize;++i) buf[i]=a6.doit(buf[i]);
}
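// ---------------------------------------------------------------------------
// rough, untimed sketch of the big-buffer-block-at-a-time variant mentioned in
// the header comment. this is NOT part of the original comparison, and the names
// here (PAD, bigbuf2, delaypos2, DoAllPassBlk, DoReverbBlockOneBuf) are
// illustrative only. the idea it assumes: give each allpass its own window of a
// shared buffer, spaced out by an extra PAD=blocksize samples, so that a later
// allpass's writes from one block can never land on state an earlier allpass has
// yet to read in the next block. the &MASK is kept per sample for simplicity;
// with the spacing in place you could split each inner loop at the wrap point
// and hoist the &, as suggested above.
#define PAD blocksize
static_assert(AP1+AP2+AP3+AP4+AP5+AP6+6*PAD<=MASK,"padded allpasses must still fit in the big buffer");
float bigbuf2[MASK+1]; // separate shared buffer so this sketch doesn't disturb the timed code paths
int delaypos2;         // its own write head, decremented by blocksize per block
// same allpass update as DoAllPass, but run over a whole block for one allpass
// at a time; OFF is where this allpass's window starts relative to the write head
#define DoAllPassBlk(N,OFF) for (int s=0;s<blocksize;++s) { \
    int i=(delaypos2-s+(OFF))&MASK, j=(delaypos2-s+(OFF)+(N))&MASK; \
    float delayed=bigbuf2[j]; \
    float v=block[s]-delayed*0.5f; \
    bigbuf2[i]=v; \
    block[s]=v*0.5f+delayed; }
inline void DoReverbBlockOneBuf(float *block) { // processes blocksize samples in place
    DoAllPassBlk(AP1, 0);
    DoAllPassBlk(AP2, AP1+PAD);
    DoAllPassBlk(AP3, AP1+AP2+2*PAD);
    DoAllPassBlk(AP4, AP1+AP2+AP3+3*PAD);
    DoAllPassBlk(AP5, AP1+AP2+AP3+AP4+4*PAD);
    DoAllPassBlk(AP6, AP1+AP2+AP3+AP4+AP5+5*PAD);
    delaypos2 -= blocksize; // advance the write head by one block
}
// ---------------------------------------------------------------------------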
inline float GetInput(int i) { // a click every 4096 samples
    return (i&4095) ? 0.f : 1.f;
}
int main()
{
    while (1) {
        LARGE_INTEGER freq,t0,t1;
        __int64 time_1samp=0;
        __int64 time_block=0;
        __int64 time_1buf=0;
        QueryPerformanceFrequency(&freq);
        double toms=1000.0/(double)freq.QuadPart;
        /////////////////////////////////////// sample at a time, separate allpass structures
        float tot=0.f; // sum up a total so the optimiser doesn't throw away the work
        QueryPerformanceCounter(&t0);
        for (int i=0;i<1024*1024;++i) tot+=DoReverb1samp(GetInput(i));
        QueryPerformanceCounter(&t1);
        time_1samp+=t1.QuadPart-t0.QuadPart;
        /////////////////////////////////////// sample at a time, one big buffer
        float tot2=0.f;
        QueryPerformanceCounter(&t0);
        for (int i=0;i<1024*1024;++i) tot2+=DoReverb1sampOneBuf(GetInput(i));
        QueryPerformanceCounter(&t1);
        time_1buf+=t1.QuadPart-t0.QuadPart;
        /////////////////////////////////////// block at a time, separate allpass structures
        static float buf[blocksize];
        float tot3=0.f;
        QueryPerformanceCounter(&t0);
        for (int i=0;i<1024*1024;i+=blocksize) {
            for (int j=0;j<blocksize;++j) buf[j]=GetInput(i+j);
            DoReverbBlock(buf);
            for (int j=0;j<blocksize;++j) tot3+=buf[j];
        }
        QueryPerformanceCounter(&t1);
        time_block+=t1.QuadPart-t0.QuadPart;
        printf("%0.1fms sample-at-a-time, %0.1fms bigbuf sample-at-a-time, %0.1fms block-at-a-time\n", time_1samp*toms, time_1buf*toms, time_block*toms);
    }
    return 0;
}