Skip to content

Instantly share code, notes, and snippets.

@cyanreg
Last active January 27, 2021 02:01
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save cyanreg/bbf25c8a8dfb910ed3b9ae7663983ca6 to your computer and use it in GitHub Desktop.
Save cyanreg/bbf25c8a8dfb910ed3b9ae7663983ca6 to your computer and use it in GitHub Desktop.
8-point FFT (permuted input)
static void fft8(FFTComplex *z)
{
FFTSample r1 = z[0].re - z[4].re;
FFTSample r2 = z[0].im - z[4].im;
FFTSample r3 = z[1].re - z[5].re;
FFTSample r4 = z[1].im - z[5].im;
FFTSample r5 = z[2].re - z[6].re;
FFTSample r6 = z[2].im - z[6].im;
FFTSample r7 = z[3].re - z[7].re;
FFTSample r8 = z[3].im - z[7].im;
FFTSample q1 = z[0].re + z[4].re;
FFTSample q2 = z[0].im + z[4].im;
FFTSample q3 = z[1].re + z[5].re;
FFTSample q4 = z[1].im + z[5].im;
FFTSample q5 = z[2].re + z[6].re;
FFTSample q6 = z[2].im + z[6].im;
FFTSample q7 = z[3].re + z[7].re;
FFTSample q8 = z[3].im + z[7].im;
FFTSample s3 = q1 - q3;
FFTSample s1 = q1 + q3;
FFTSample s4 = q2 - q4;
FFTSample s2 = q2 + q4;
FFTSample s7 = q5 - q7;
FFTSample s5 = q5 + q7;
FFTSample s8 = q6 - q8;
FFTSample s6 = q6 + q8;
FFTSample e1 = s1 * -1;
FFTSample e2 = s2 * -1;
FFTSample e3 = s3 * -1;
FFTSample e4 = s4 * -1;
FFTSample e5 = s5 * 1;
FFTSample e6 = s6 * 1;
FFTSample e7 = s7 * -1;
FFTSample e8 = s8 * 1;
FFTSample w1 = e5 - e1;
FFTSample w2 = e6 - e2;
FFTSample w3 = e8 - e3;
FFTSample w4 = e7 - e4;
FFTSample w5 = s1 - e5;
FFTSample w6 = s2 - e6;
FFTSample w7 = s3 - e8;
FFTSample w8 = s4 - e7;
z[0].re = w1;
z[0].im = w2;
z[2].re = w3;
z[2].im = w4;
z[4].re = w5;
z[4].im = w6;
z[6].re = w7;
z[6].im = w8;
FFTSample z1 = r1 - r4;
FFTSample z2 = r1 + r4;
FFTSample z3 = r3 - r2;
FFTSample z4 = r3 + r2;
FFTSample z5 = r5 - r6;
FFTSample z6 = r5 + r6;
FFTSample z7 = r7 - r8;
FFTSample z8 = r7 + r8;
z3 *= -1;
z5 *= -M_SQRT1_2;
z6 *= -M_SQRT1_2;
z7 *= M_SQRT1_2;
z8 *= M_SQRT1_2;
FFTSample t5 = z7 - z6;
FFTSample t6 = z8 + z5;
FFTSample t7 = z8 - z5;
FFTSample t8 = z7 + z6;
FFTSample u1 = z2 + t5;
FFTSample u2 = z3 + t6;
FFTSample u3 = z1 - t7;
FFTSample u4 = z4 + t8;
FFTSample u5 = z2 - t5;
FFTSample u6 = z3 - t6;
FFTSample u7 = z1 + t7;
FFTSample u8 = z4 - t8;
z[1].re = u1;
z[1].im = u2;
z[3].re = u3;
z[3].im = u4;
z[5].re = u5;
z[5].im = u6;
z[7].re = u7;
z[7].im = u8;
/* 11 instructions = 20 instructions total */
}
#if 0
; 8-point in-place complex FFT
; %1 - even coefficients
; %2 - odd coefficients
; %3 - temporary
; %4 - temporary
%macro FFT8_AVX 4
subps %3, %1, %2 ; r1, r2, r3, r4, r5, r6, r7, r8
addps %1, %2 ; q1, q2, q3, q4, q5, q6, q7, q8
vpermilps %2, %3, [perm8_odd1] ; r4, r4, r2, r2, r6, r6, r8, r8
shufps %4, %1, %1, q3322 ; q1, q1, q2, q2, q5, q5, q6, q6
movsldup %3, %3 ; r1, r1, r3, r3, r5, r5, r7, r7
shufps %1, %1, q1100 ; q3, q3, q4, q4, q7, q7, q8, q8
addsubps %3, %2 ; z1, z2, z3, z4, z5, z6, z7, z8
addsubps %1, %4 ; s3, s1, s4, s2, s7, s5, s8, s6
mulps %3, [mult8_odd] ; z * mult8_odd
vpermilps %1, [perm8_even] ; s1, s2, s3, s4, s5, s6, s8, s7
shufps %2, %3, %3, q2332 ; c, r, a, p, z7, z8, z8, z7
xorps %4, %1, [mask8_mmmmpppm] ; e1, e2, e3, e4, e5, e6, e8, e7
vpermilps %3, %3, [perm8_odd2] ; z2, z3, z1, z4, z6, z5, z5, z6
vperm2f128 %1, %4, q0003 ; e5, e6, e8, e7, s1, s2, s3, s4
addsubps %2, %3 ; c, r, a, p, t5, t6, t7, t8
subps %1, %4 ; w1, w2, w3, w4, w5, w6, w7, w8
vperm2f128 %2, %2, q0101 ; t5, t6, t7, t8, t5, t6, t7, t8
vperm2f128 %3, %3, q0000 ; z2, z3, z1, z4, z2, z3, z1, z4
xorps %2, [mask8_ppmpmmpm] ; t5, t6, -t7, t8, -t5, -t6, t7, -t8
addps %2, %3, %2 ; u1, u2, u3, u4, u5, u6, u7, u8
%endmacro
#endif
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment