rygorous/gist:c6831e60f5366569d2e9

## gistfile1.txt
// Conjugate split-radix FFT inner loop

// Both of these compiled with VC++ 2012, 32-bit, "/O2 /fp:fast".
// NOTE: I also tried clang-cl and it seems to be primarily a VC++
// problem. (Which doesn't help me much.)
//
// NOTE 2: argh, did the Clang test wrong. It's still primarily a
// VC++ problem, but Clang has a notable slowdown too. Anyway, new
// results generated automatically from a simpler standalone test
// where I can toggle between versions using a single commandline
// switch to prevent further mistakes.

// Compiler flags used:
// VC++ = VC++ 2012  /fp:fast /O2 /D_HAS_EXCEPTIONS=0 (no exceptions to be fair since clang_cl currently doesn't support them)
// Clang = clang-cl 3.5.0 /O3 /D_HAS_EXCEPTIONS=0
//
// This is computing FFTs on an input vector that is just 512
// 1s. Boring but an easy test.
//
// Code that VC++ 2012 outputs is here: https://gist.github.com/rygorous/a603c36d5b5288c96fb1

// ----- Variant 1: this uses
//
//   #include <complex>
//   typedef std::complex<float> complexf;

    for (size_t k = 0; k < N1; k++)
    {
        complexf Uk    = out0[k];
        complexf Uk_N1 = out1[k];
        complexf w     = twiddle[k];

        // Twiddle Zk, Z'k then butterfly
        complexf Zk    = w * out2[k];
        complexf Zpk   = std::conj(w) * out3[k];

        complexf Zsum = Zk + Zpk;
        complexf Zdif = complexf(0.0f, -1.0f) * (Zk - Zpk);

        out0[k] = Uk    + Zsum;
        out1[k] = Uk_N1 + Zdif;
        out2[k] = Uk    - Zsum;
        out3[k] = Uk_N1 - Zdif;
    }

// results for FFT: (stats over 1 million runs)

---- VC++ std::complex
Complex N=512, shortest=33477 cycles, avg 34521.76
Real    N=512, shortest=18073 cycles, avg 18424.41
---- Clang std::complex
Complex N=512, shortest=19988 cycles, avg 20576.95
Real    N=512, shortest=11027 cycles, avg 11682.85

// ----- Variant 2: this one just has a struct.
//
//   struct complexf { float re, im; };

    for (size_t k = 0; k < N1; k++)
    {
        complexf const &w = twiddle[k];
        complexf const &in2 = out2[k];
        complexf const &in3 = out3[k];

        float Zkr  = w.re*in2.re - w.im*in2.im;
        float Zki  = w.re*in2.im + w.im*in2.re;
        float Zpkr = w.re*in3.re + w.im*in3.im;
        float Zpki = w.re*in3.im - w.im*in3.re;

        float Zsumr = Zkr + Zpkr;
        float Zsumi = Zki + Zpki;
        float Zdifr = Zki - Zpki;
        float Zdifi = Zpkr - Zkr;

        out2[k].re = out0[k].re - Zsumr;
        out2[k].im = out0[k].im - Zsumi;
        out0[k].re += Zsumr;
        out0[k].im += Zsumi;
        out3[k].re = out1[k].re - Zdifr;
        out3[k].im = out1[k].im - Zdifi;
        out1[k].re += Zdifr;
        out1[k].im += Zdifi;
    }

// result for FFT: (stats over 1 million runs)

---- VC++ complexf
Complex N=512, shortest=12999 cycles, avg 13779.46
Real    N=512, shortest=8528 cycles, avg 9113.15
---- Clang complexf
Complex N=512, shortest=12101 cycles, avg 12782.80
Real    N=512, shortest=7234 cycles, avg 7652.84
	// Conjugate split-radix FFT inner loop

	// Both of these compiled with VC++ 2012, 32-bit, "/O2 /fp:fast".
	// NOTE: I also tried clang-cl and it seems to be primarily a VC++
	// problem. (Which doesn't help me much.)
	//
	// NOTE 2: argh, did the Clang test wrong. It's still primarily a
	// VC++ problem, but Clang has a notable slowdown too. Anyway, new
	// results generated automatically from a simpler standalone test
	// where I can toggle between versions using a single commandline
	// switch to prevent further mistakes.

	// Compiler flags used:
	// VC++ = VC++ 2012 /fp:fast /O2 /D_HAS_EXCEPTIONS=0 (no exceptions to be fair since clang_cl currently doesn't support them)
	// Clang = clang-cl 3.5.0 /O3 /D_HAS_EXCEPTIONS=0
	//
	// This is computing FFTs on an input vector that is just 512
	// 1s. Boring but an easy test.
	//
	// Code that VC++ 2012 outputs is here: https://gist.github.com/rygorous/a603c36d5b5288c96fb1

	// ----- Variant 1: this uses
	//
	// #include <complex>
	// typedef std::complex<float> complexf;

	for (size_t k = 0; k < N1; k++)
	{
	complexf Uk = out0[k];
	complexf Uk_N1 = out1[k];
	complexf w = twiddle[k];

	// Twiddle Zk, Z'k then butterfly
	complexf Zk = w * out2[k];
	complexf Zpk = std::conj(w) * out3[k];

	complexf Zsum = Zk + Zpk;
	complexf Zdif = complexf(0.0f, -1.0f) * (Zk - Zpk);

	out0[k] = Uk + Zsum;
	out1[k] = Uk_N1 + Zdif;
	out2[k] = Uk - Zsum;
	out3[k] = Uk_N1 - Zdif;
	}

	// results for FFT: (stats over 1 million runs)

	---- VC++ std::complex
	Complex N=512, shortest=33477 cycles, avg 34521.76
	Real N=512, shortest=18073 cycles, avg 18424.41
	---- Clang std::complex
	Complex N=512, shortest=19988 cycles, avg 20576.95
	Real N=512, shortest=11027 cycles, avg 11682.85

	// ----- Variant 2: this one just has a struct.
	//
	// struct complexf { float re, im; };

	for (size_t k = 0; k < N1; k++)
	{
	complexf const &w = twiddle[k];
	complexf const &in2 = out2[k];
	complexf const &in3 = out3[k];

	float Zkr = w.rein2.re - w.imin2.im;
	float Zki = w.rein2.im + w.imin2.re;
	float Zpkr = w.rein3.re + w.imin3.im;
	float Zpki = w.rein3.im - w.imin3.re;

	float Zsumr = Zkr + Zpkr;
	float Zsumi = Zki + Zpki;
	float Zdifr = Zki - Zpki;
	float Zdifi = Zpkr - Zkr;

	out2[k].re = out0[k].re - Zsumr;
	out2[k].im = out0[k].im - Zsumi;
	out0[k].re += Zsumr;
	out0[k].im += Zsumi;
	out3[k].re = out1[k].re - Zdifr;
	out3[k].im = out1[k].im - Zdifi;
	out1[k].re += Zdifr;
	out1[k].im += Zdifi;
	}

	// result for FFT: (stats over 1 million runs)

	---- VC++ complexf
	Complex N=512, shortest=12999 cycles, avg 13779.46
	Real N=512, shortest=8528 cycles, avg 9113.15
	---- Clang complexf
	Complex N=512, shortest=12101 cycles, avg 12782.80
	Real N=512, shortest=7234 cycles, avg 7652.84