@mmalex
Created March 4, 2020 14:55
quick timing test of different ways of writing a chain of serial allpasses
#include "stdafx.h"
#include <Windows.h> // for performance counter
#include <stdio.h>   // for printf (likely already pulled in via stdafx.h)
// quick test of the theory in https://gist.github.com/mmalex/3a538aaba60f0ca21eac868269525452
// we try running a simple impulse train (click every 4096 samples) through 6 allpasses with random lengths
// we time how long it takes to process 1 million samples, structuring the loop 3 ways:
// - a sample at a time with self contained allpass structures,
// - a sample at a time with a single big buffer
// - a block at a time using self contained allpass structures, operating in place on a 256 sample buffer.
// using this naive code, on a single core of my AMD threadripper, with default release compile settings on visual studio 2015,
// I see
// 8.1ms sample-at-a-time, 7.1ms bigbuf sample-at-a-time, 17.2ms block-at-a-time
// on a small MCU (cortex m4 without a cache), the difference is even bigger (tho I haven't run this exact code on it)
// I'm pleasantly surprised that, unless I f*ed something up (likely!), the simplest solution - sample at a time in a big buffer - goes quickest.
// obviously you could make all of these techniques faster with more complex code. I just wanted to measure the rough ballpark of how they
// stack up against each other with similar levels of complexity & time lavished on them (ie not very much).
// notably missing from my comparison is big-buffer-block-at-a-time, which as Sean noted, needs a little care so that you don't stomp on
// adjacent allpasses' state; you can avoid that by spacing them out a bit in the big buffer, and once you do that, you can
// pull the & out of the loop. however I think it uses more memory (bad on MCU), is more complex (bad on my brain), and I am still not convinced
// that buffer at a time is ever better for this kind of simple chain-of-allpasses DSP. (a rough, untimed sketch of that variant is included after DoReverbBlock below.)
template <int N> struct AllPass { // allpass with delay N and coefficient 0.5
    float buf[N]={};
    int i=0;
    inline float doit(float x) {
        float delayed=buf[i];          // v[n-N]
        buf[i] = x -= delayed * 0.5f;  // v[n] = x[n] - 0.5*v[n-N]
        if (++i == N) i=0;             // circular index
        return x * 0.5f + delayed;     // y[n] = 0.5*v[n] + v[n-N]
    }
};
// lengths of 6 allpasses
#define AP1 123
#define AP2 272
#define AP3 313
#define AP4 2040
#define AP5 4313
#define AP6 5916
AllPass<AP1> a1;
AllPass<AP2> a2;
AllPass<AP3> a3;
AllPass<AP4> a4;
AllPass<AP5> a5;
AllPass<AP6> a6;
#define MASK 16383
static_assert(AP1+AP2+AP3+AP4+AP5+AP6<=MASK,"the allpasses must fit in the big buffer. double MASK please");
static_assert(AP1+AP2+AP3+AP4+AP5+AP6>MASK/2,"the allpasses are too small; halve MASK please");
float buf[MASK+1]; // one shared circular buffer holding all six allpass delay lines, packed end to end
int delaypos;      // shared write head, decremented once per sample
// same allpass update as AllPass::doit, but on a window of the shared buffer: write the new state at i,
// read the state written N samples ago at (i+N)&MASK, then hand i+N on as the next allpass's write position
#define DoAllPass(N) { int j=(i+N)&MASK; float delayed=buf[j]; buf[i]=x-=delayed*0.5f; x=x*0.5f+delayed; i=j; }
inline float DoReverb1sampOneBuf(float x) {
    int i=(delaypos--)&MASK; // this is the only index maintenance we need!
    DoAllPass(AP1);
    DoAllPass(AP2);
    DoAllPass(AP3);
    DoAllPass(AP4);
    DoAllPass(AP5);
    DoAllPass(AP6);
    return x;
}
inline float DoReverb1samp(float x) {
    x=a1.doit(x);
    x=a2.doit(x);
    x=a3.doit(x);
    x=a4.doit(x);
    x=a5.doit(x);
    x=a6.doit(x);
    return x;
}
const static int blocksize=256;
inline void DoReverbBlock(float *buf) {
    for (int i=0;i<blocksize;++i) buf[i]=a1.doit(buf[i]);
    for (int i=0;i<blocksize;++i) buf[i]=a2.doit(buf[i]);
    for (int i=0;i<blocksize;++i) buf[i]=a3.doit(buf[i]);
    for (int i=0;i<blocksize;++i) buf[i]=a4.doit(buf[i]);
    for (int i=0;i<blocksize;++i) buf[i]=a5.doit(buf[i]);
    for (int i=0;i<blocksize;++i) buf[i]=a6.doit(buf[i]);
}
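// ---------------------------------------------------------------------------
// rough, untimed sketch of the big-buffer-block-at-a-time variant mentioned in
// the header comment. this is NOT part of the original comparison, and the names
// here (PAD, bigbuf2, delaypos2, DoAllPassBlk, DoReverbBlockOneBuf) are
// illustrative only. the idea it assumes: give each allpass its own window of a
// shared buffer, spaced out by an extra PAD=blocksize samples, so that a later
// allpass's writes from one block can never land on state an earlier allpass has
// yet to read in the next block. the &MASK is kept per sample for simplicity;
// with the spacing in place you could split each inner loop at the wrap point
// and hoist the &, as suggested above.
#define PAD blocksize
static_assert(AP1+AP2+AP3+AP4+AP5+AP6+6*PAD<=MASK,"padded allpasses must still fit in the big buffer");
float bigbuf2[MASK+1]; // separate shared buffer so this sketch doesn't disturb the timed code paths
int delaypos2;         // its own write head, decremented by blocksize per block
// same allpass update as DoAllPass, but run over a whole block for one allpass
// at a time; OFF is where this allpass's window starts relative to the write head
#define DoAllPassBlk(N,OFF) for (int s=0;s<blocksize;++s) { \
    int i=(delaypos2-s+(OFF))&MASK, j=(delaypos2-s+(OFF)+(N))&MASK; \
    float delayed=bigbuf2[j]; \
    float v=block[s]-delayed*0.5f; \
    bigbuf2[i]=v; \
    block[s]=v*0.5f+delayed; }
inline void DoReverbBlockOneBuf(float *block) { // processes blocksize samples in place
    DoAllPassBlk(AP1, 0);
    DoAllPassBlk(AP2, AP1+PAD);
    DoAllPassBlk(AP3, AP1+AP2+2*PAD);
    DoAllPassBlk(AP4, AP1+AP2+AP3+3*PAD);
    DoAllPassBlk(AP5, AP1+AP2+AP3+AP4+4*PAD);
    DoAllPassBlk(AP6, AP1+AP2+AP3+AP4+AP5+5*PAD);
    delaypos2 -= blocksize; // advance the write head by one block
}
// ---------------------------------------------------------------------------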
inline float GetInput(int i) { // a click every 4096 samples
    return (i&4095) ? 0.f : 1.f;
}
int main()
{
    while (1) {
        LARGE_INTEGER freq,t0,t1;
        __int64 time_1samp=0;
        __int64 time_block=0;
        __int64 time_1buf=0;
        QueryPerformanceFrequency(&freq);
        double toms=1000.0/(double)freq.QuadPart;
        /////////////////////////////////////// sample at a time, separate allpass structures
        float tot=0.f; // sum up a total so the optimiser doesn't throw away the work
        QueryPerformanceCounter(&t0);
        for (int i=0;i<1024*1024;++i) tot+=DoReverb1samp(GetInput(i));
        QueryPerformanceCounter(&t1);
        time_1samp+=t1.QuadPart-t0.QuadPart;
        /////////////////////////////////////// sample at a time, one big buffer
        float tot2=0.f;
        QueryPerformanceCounter(&t0);
        for (int i=0;i<1024*1024;++i) tot2+=DoReverb1sampOneBuf(GetInput(i));
        QueryPerformanceCounter(&t1);
        time_1buf+=t1.QuadPart-t0.QuadPart;
        /////////////////////////////////////// block at a time, separate allpass structures
        static float buf[blocksize];
        float tot3=0.f;
        QueryPerformanceCounter(&t0);
        for (int i=0;i<1024*1024;i+=blocksize) {
            for (int j=0;j<blocksize;++j) buf[j]=GetInput(i+j);
            DoReverbBlock(buf);
            for (int j=0;j<blocksize;++j) tot3+=buf[j];
        }
        QueryPerformanceCounter(&t1);
        time_block+=t1.QuadPart-t0.QuadPart;
        printf("%0.1fms sample-at-a-time, %0.1fms bigbuf sample-at-a-time, %0.1fms block-at-a-time\n", time_1samp*toms, time_1buf*toms, time_block*toms);
    }
    return 0;
}