Skip to content

Instantly share code, notes, and snippets.

@cyanreg
Created March 9, 2021 03:17
Show Gist options
  • Save cyanreg/3df2c0b272d47c3ee0d62124542c828c to your computer and use it in GitHub Desktop.
Save cyanreg/3df2c0b272d47c3ee0d62124542c828c to your computer and use it in GitHub Desktop.
8-point FFT: scalar C reference plus a disabled x86 SIMD macro (two transforms at a time in AVX mode)
/*
 * In-place 8-point FFT over z[0..7], written as straight-line scalar code.
 *
 * All sixteen input components are read into locals before the first store,
 * so the results are written back over the inputs.
 *
 * s and temp are accepted but not used by this implementation.
 *
 * The trailing letter tags (q1, k3, r4, ...) are the short names the
 * accompanying SIMD macro comments use for the same intermediates.
 * "SIMD estimate" remarks carry over the author's per-stage instruction
 * counts for a vectorized version.
 *
 * NOTE(review): the element ordering expected in z (natural vs. permuted)
 * cannot be determined from this function alone - confirm with the caller.
 */
static void fft8(void *s, FFTComplex *z, FFTComplex *temp)
{
    (void)s;
    (void)temp;

    /* Sums of elements four apart: z[n] + z[n + 4]. */
    const FFTSample sum04_re = z[0].re + z[4].re; /* q1 */
    const FFTSample sum04_im = z[0].im + z[4].im; /* q2 */
    const FFTSample sum15_re = z[1].re + z[5].re; /* q3 */
    const FFTSample sum15_im = z[1].im + z[5].im; /* q4 */
    const FFTSample sum26_re = z[2].re + z[6].re; /* k1 */
    const FFTSample sum26_im = z[2].im + z[6].im; /* k2 */
    const FFTSample sum37_re = z[3].re + z[7].re; /* k3 */
    const FFTSample sum37_im = z[3].im + z[7].im; /* k4 */

    /* Differences of elements four apart: z[n] - z[n + 4]. */
    const FFTSample dif04_re = z[0].re - z[4].re; /* r1 */
    const FFTSample dif04_im = z[0].im - z[4].im; /* r2 */
    const FFTSample dif15_re = z[1].re - z[5].re; /* r3 */
    const FFTSample dif15_im = z[1].im - z[5].im; /* r4 */
    const FFTSample dif26_re = z[2].re - z[6].re; /* j1 */
    const FFTSample dif26_im = z[2].im - z[6].im; /* j2 */
    const FFTSample dif37_re = z[3].re - z[7].re; /* j3 */
    const FFTSample dif37_im = z[3].im - z[7].im; /* j4 */
    /* SIMD estimate for the loads above: 2 add + 2 sub = 4. */

    /*
     * Outputs 0, 2, 4, 6 are built from the sums only.
     * SIMD estimate: 2 shuffles + 1 add + 1 sub = 4.
     */
    const FFTSample top_add_re = sum04_re + sum15_re; /* s1 */
    const FFTSample top_add_im = sum04_im + sum15_im; /* s2 */
    const FFTSample top_sub_re = sum04_re - sum15_re; /* s3 */
    const FFTSample top_sub_im = sum04_im - sum15_im; /* s4 */
    const FFTSample bot_add_re = sum37_re + sum26_re; /* g1 */
    const FFTSample bot_add_im = sum26_im + sum37_im; /* g2 */
    /* The "cross" pair feeds imaginary inputs into a real output and
     * real inputs into an imaginary output. */
    const FFTSample bot_cross_re = sum26_im - sum37_im; /* g3 */
    const FFTSample bot_cross_im = sum37_re - sum26_re; /* g4 */

    /* SIMD estimate: 1 unpack + 1 shuffle, then 1 add (w) and 1 sub (h);
     * author's running total for this half: 12. */
    z[0].re = top_add_re + bot_add_re;     /* w1 */
    z[0].im = top_add_im + bot_add_im;     /* w2 */
    z[2].re = top_sub_re + bot_cross_re;   /* w3 */
    z[2].im = top_sub_im + bot_cross_im;   /* w4 */
    z[4].re = top_add_re - bot_add_re;     /* h1 */
    z[4].im = top_add_im - bot_add_im;     /* h2 */
    z[6].re = top_sub_re - bot_cross_re;   /* h3 */
    z[6].im = top_sub_im - bot_cross_im;   /* h4 */

    /*
     * Outputs 1, 3, 5, 7 are built from the differences only.
     * SIMD estimate: 2 shuffles + 1 xor + 1 addsub.
     */
    const FFTSample base_a_re = dif04_re + dif15_im; /* z1 */
    const FFTSample base_a_im = dif04_im - dif15_re; /* z2 */
    const FFTSample base_b_re = dif04_re - dif15_im; /* z3 */
    const FFTSample base_b_im = dif04_im + dif15_re; /* z4 */

    /* Scale the 2/6 and 3/7 differences by +-sqrt(1/2). SIMD estimate: 1 mul. */
    const FFTSample sc26_re = dif26_re * M_SQRT1_2;    /* j1 */
    const FFTSample sc26_im = dif26_im * (-M_SQRT1_2); /* j2 */
    const FFTSample sc37_re = dif37_re * (-M_SQRT1_2); /* j3 */
    const FFTSample sc37_im = dif37_im * M_SQRT1_2;    /* j4 */

    /* SIMD estimate: 1 shuffle + 1 addsub. */
    const FFTSample mix26_sub = sc26_re - sc26_im; /* l2 */
    const FFTSample mix26_add = sc26_im + sc26_re; /* l1 */
    const FFTSample mix37_sub = sc37_re - sc37_im; /* l4 */
    const FFTSample mix37_add = sc37_im + sc37_re; /* l3 */

    /* SIMD estimate: 1 shuffle + 1 addsub. */
    const FFTSample rot_a_re = mix37_add - mix26_sub; /* t1 */
    const FFTSample rot_a_im = mix37_sub + mix26_add; /* t2 */
    const FFTSample rot_b_re = mix26_add - mix37_sub; /* t3 */
    const FFTSample rot_b_im = mix26_sub + mix37_add; /* t4 */

    /* SIMD estimate: 1 sub (u) and 1 add (o); author's running total
     * for this half: 11. */
    z[1].re = base_a_re - rot_a_re; /* u1 */
    z[1].im = base_a_im - rot_a_im; /* u2 */
    z[3].re = base_b_re - rot_b_re; /* u3 */
    z[3].im = base_b_im - rot_b_im; /* u4 */
    z[5].re = base_a_re + rot_a_re; /* o1 */
    z[5].im = base_a_im + rot_a_im; /* o2 */
    z[7].re = base_b_re + rot_b_re; /* o3 */
    z[7].im = base_b_im + rot_b_im; /* o4 */
}
#if 0
; NOTE: everything up to the matching #endif is excluded from compilation by
; the #if 0 above. It is x86 SIMD assembly in NASM-style macro syntax, kept
; next to the scalar fft8() for reference.
; NOTE(review): the three-operand forms (e.g. "addps %5, %1, %3") presumably
; rely on an assembler macro layer or AVX encodings - confirm before enabling.
;
; Single 8-point in-place complex FFT (will do 2 transforms in [AVX] mode)
; %1 - even coefficients (a0.reim, a2.reim, [b0.reim, b2.reim])
; %2 - even coefficients (a4.reim, a6.reim, [b4.reim, b6.reim])
; %3 - odd coefficients (a1.reim, a3.reim, [b1.reim, b3.reim])
; %4 - odd coefficients (a5.reim, a7.reim, [b5.reim, b7.reim])
; %5 - temporary
; %6 - temporary
;
; Going by the per-instruction notes below, on exit:
;   %1 = w1234, %2 = h1234, %3 = o1234, %4 = u1234
; and %5/%6 have been overwritten.
; The lane names in the notes (q, k, r, j, s, g, z, l, t, u, o, w, h)
; correspond to the lettered intermediates of the scalar fft8() above.
; mask_pmmppmmp and d8_mult_odd are memory constants defined outside this
; excerpt.
%macro FFT8 6
addps %5, %1, %3 ; q1-8
addps %6, %2, %4 ; k1-8
subps %1, %3 ; r1-8
subps %2, %4 ; j1-8
shufps %4, %1, %1, q2323 ; r4343
shufps %3, %5, %6, q3032 ; q34, k14
shufps %1, %1, q1010 ; r1212
shufps %5, %6, q1210 ; q12, k32
xorps %4, [mask_pmmppmmp] ; r4343 * pmmp (sign flip via xor mask)
addps %6, %5, %3 ; s12, g12
mulps %2, [d8_mult_odd] ; j1-8 * d8_mult_odd (%2 holds the j lanes here, not r)
subps %5, %3 ; s34, g43
addps %3, %1, %4 ; z1234
unpcklpd %1, %6, %5 ; s1234
shufps %4, %2, %2, q2301 ; j2143
shufps %6, %5, q2332 ; g1234
addsubps %2, %4 ; l2143
shufps %5, %2, %2, q0123 ; l3412 awful 4-instruction dep chain
addsubps %5, %2 ; t1234
subps %2, %1, %6 ; h1234
subps %4, %3, %5 ; u1234
addps %1, %6 ; w1234
addps %3, %5 ; o1234
%endmacro
#endif
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment