Code output by VC++ 2012 for both versions (std::complex and plain struct) of the loop in https://gist.github.com/rygorous/c6831e60f5366569d2e9
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| ; ---- std::complex version | |
| ; 301 : | |
| ; 302 : for (size_t k = 0; k < N1; k++) | |
; Loop guard: the whole loop is skipped when edi is zero.
; NOTE(review): edi presumably holds N1 here - its origin is outside this excerpt.
| test edi, edi | |
| je $LN1@ffts_rec | |
; tv1343 = edx - eax, captured before either register is rebased below.
; The loop later adds it to (_out3$1$ + ecx) to form an address.
| mov edi, edx | |
| sub edi, eax | |
| mov DWORD PTR tv1343[esp+112], edi | |
; esi, edi, edx and eax are each turned into an offset relative to ecx and
; spilled to the stack; the loop addresses them as [slot + ecx], so advancing
; ecx alone moves all of them. ecx is the pointer the loop reads [ecx] and
; [ecx+4] from under source line 305 (out0[k]).
; NOTE(review): the slot names suggest esi/edi/edx/eax are the twiddle, out1,
; out2 and out3 pointers - confirm against the code preceding this excerpt.
| mov edi, DWORD PTR _out1$1$[esp+108] | |
| sub esi, ecx | |
| sub edi, ecx | |
| sub edx, ecx | |
| sub eax, ecx | |
| mov DWORD PTR _twiddle$1$[esp+112], esi | |
| mov DWORD PTR _out1$1$[esp+108], edi | |
| mov DWORD PTR _out2$1$[esp+112], edx | |
| mov DWORD PTR _out3$1$[esp+112], eax | |
; Loop body, std::complex build. Each pass handles one 8-byte element: ecx is
; advanced by 8 and the counter in the _N$ stack slot is decremented near the
; bottom of the loop.
; Throughout the body, values are copied through stack temporaries (_w$5,
; __Tmp$N) with integer moves and then reloaded with movss.
| $LL3@ffts_rec: | |
| ; 303 : { | |
| ; 304 : #if 1 | |
| ; 305 : complexf Uk = out0[k]; | |
; Second dword of the element at ecx; spilled to _Uk$2$ a few lines below.
| mov edx, DWORD PTR [ecx+4] | |
| ; 306 : complexf Uk_N1 = out1[k]; | |
| mov eax, DWORD PTR [edi+ecx] | |
; The same dword is written to two distinct stack slots
; (_Uk_N1$2$ and _Uk_N1$2 are different symbols).
| mov DWORD PTR _Uk_N1$2$[esp+112], eax | |
| mov DWORD PTR _Uk_N1$2[esp+112], eax | |
| mov DWORD PTR _Uk$2$[esp+112], edx | |
| ; 307 : complexf w = twiddle[k]; | |
; edx = offset kept in _twiddle$1$; the 8 bytes at [edx+ecx] are copied to _w$5.
; esi = first dword of the element at ecx; it is spilled to _Uk$1 further down.
| mov edx, DWORD PTR _twiddle$1$[esp+112] | |
| mov esi, DWORD PTR [ecx] | |
| mov eax, DWORD PTR [edx+ecx] | |
| mov DWORD PTR _w$5[esp+112], eax | |
| mov eax, DWORD PTR [edx+ecx+4] | |
| mov DWORD PTR _w$5[esp+116], eax | |
| ; 308 : | |
| ; 309 : // Twiddle Zk, Z'k then butterfly | |
| ; 310 : complexf Zk = w * out2[k]; | |
; The same two dwords at [edx+ecx] / [edx+ecx+4] are loaded a second time and
; copied into __Tmp$10, in addition to the _w$5 copy made above.
| mov eax, DWORD PTR [edx+ecx] | |
| ; 311 : complexf Zpk = std::conj(w) * out3[k]; | |
; xmm5 = second float of _w$5 XORed with a constant whose name spells
; 0x80000000 in every lane, i.e. its sign bit is flipped.
| movss xmm5, DWORD PTR _w$5[esp+116] | |
| xorps xmm5, DWORD PTR __xmm@80000000800000008000000080000000 | |
| mov DWORD PTR __Tmp$10[esp+112], eax | |
| mov eax, DWORD PTR [edx+ecx+4] | |
| movss xmm3, DWORD PTR __Tmp$10[esp+112] | |
; edx = tv1343 (edx - eax as captured before the loop rebased both by ecx), so
; [eax+edx+4] below, with eax = _out3$1$ + ecx, resolves to the dword 4 bytes
; past [_out2$1$ + ecx].
| mov edx, DWORD PTR tv1343[esp+112] | |
| mov DWORD PTR __Tmp$10[esp+116], eax | |
| mov eax, DWORD PTR _out2$1$[esp+112] | |
| movss xmm1, DWORD PTR __Tmp$10[esp+116] | |
| movss xmm4, DWORD PTR [eax+ecx] | |
| mov eax, DWORD PTR _out3$1$[esp+112] | |
| add eax, ecx | |
| movaps xmm0, xmm1 | |
| mulss xmm0, DWORD PTR [eax+edx+4] | |
| mulss xmm1, xmm4 | |
| movaps xmm7, xmm3 | |
| mulss xmm3, DWORD PTR [eax+edx+4] | |
| mov edx, DWORD PTR _out3$1$[esp+112] | |
| mulss xmm7, xmm4 | |
| addss xmm3, xmm1 | |
| movss xmm1, DWORD PTR _w$5[esp+112] | |
| subss xmm7, xmm0 | |
; edi is reused for the second dword of [edi+ecx]; the offset it held is
; reloaded from _out1$1$ later in the body.
| mov edi, DWORD PTR [edi+ecx+4] | |
| movaps xmm4, xmm1 | |
| mulss xmm4, DWORD PTR [edx+ecx] | |
| mulss xmm1, DWORD PTR [eax+4] | |
| movaps xmm0, xmm5 | |
| mulss xmm0, DWORD PTR [eax+4] | |
| mulss xmm5, DWORD PTR [edx+ecx] | |
| subss xmm4, xmm0 | |
; The two products are stored back to __Tmp$10 and then copied, dword by
; dword through eax/ecx, into __Tmp$3 and __Tmp$4.
| movss DWORD PTR __Tmp$10[esp+116], xmm3 | |
| ; 312 : | |
| ; 313 : complexf Zsum = Zk + Zpk; | |
| mov eax, DWORD PTR __Tmp$10[esp+116] | |
| addss xmm5, xmm1 | |
| movss DWORD PTR __Tmp$10[esp+112], xmm7 | |
; ecx is overwritten here; the element pointer is reloaded from _out$ below.
| mov ecx, DWORD PTR __Tmp$10[esp+112] | |
| ; 314 : complexf Zdif = complexf(0.0f, -1.0f) * (Zk - Zpk); | |
| mov DWORD PTR __Tmp$3[esp+116], eax | |
; xmm1 is zeroed and later used as a multiplicand
; (mulss xmm2, xmm1 / mulss xmm4, xmm1).
| xorps xmm1, xmm1 | |
| mov DWORD PTR __Tmp$4[esp+112], ecx | |
| mov DWORD PTR __Tmp$4[esp+116], eax | |
| movss xmm6, DWORD PTR __Tmp$4[esp+116] | |
| movaps xmm3, xmm4 | |
| addss xmm3, xmm7 | |
| subss xmm7, xmm4 | |
| movss xmm4, DWORD PTR __Tmp$3[esp+116] | |
| subss xmm4, xmm5 | |
| mov DWORD PTR __Tmp$3[esp+112], ecx | |
| ; 315 : | |
| ; 316 : out0[k] = Uk + Zsum; | |
; Reload the element pointer from _out$ and re-read [ecx] / [ecx+4] from
; memory into __Tmp$9, although both dwords were already loaded at the top of
; the body.
| mov ecx, DWORD PTR _out$[esp+108] | |
| movaps xmm2, xmm7 | |
| mov eax, DWORD PTR [ecx] | |
; Multiplications by __real@3f800000 and __real@bf800000 (names match the bit
; patterns of 1.0f and -1.0f) and by the zeroed xmm1: the product with
; complexf(0.0f, -1.0f) from source line 314 is evaluated as a general
; four-multiply complex product.
| mulss xmm7, DWORD PTR __real@3f800000 | |
| movaps xmm0, xmm4 | |
| mulss xmm0, DWORD PTR __real@bf800000 | |
| mulss xmm2, xmm1 | |
| mov DWORD PTR __Tmp$9[esp+112], eax | |
| mov eax, DWORD PTR [ecx+4] | |
| subss xmm2, xmm0 | |
| movss xmm0, DWORD PTR __Tmp$9[esp+112] | |
| addss xmm0, xmm3 | |
| addss xmm6, xmm5 | |
| mov DWORD PTR __Tmp$9[esp+116], eax | |
| mulss xmm4, xmm1 | |
| movss DWORD PTR __Tmp$9[esp+112], xmm0 | |
| movss xmm0, DWORD PTR __Tmp$9[esp+116] | |
| mov eax, DWORD PTR __Tmp$9[esp+112] | |
| addss xmm0, xmm6 | |
| mov DWORD PTR _Uk$1[esp+112], esi | |
| subss xmm4, xmm7 | |
; Each result dword goes xmm -> stack temporary -> eax -> destination memory.
| mov DWORD PTR [ecx], eax | |
| movss DWORD PTR __Tmp$9[esp+116], xmm0 | |
| ; 317 : out1[k] = Uk_N1 + Zdif; | |
| movss xmm1, DWORD PTR _Uk_N1$2[esp+112] | |
| mov eax, DWORD PTR __Tmp$9[esp+116] | |
| mov edx, DWORD PTR _out1$1$[esp+108] | |
| mov DWORD PTR [ecx+4], eax | |
| mov ecx, DWORD PTR _Uk_N1$2$[esp+112] | |
| movaps xmm0, xmm1 | |
| addss xmm0, xmm2 | |
| mov DWORD PTR __Tmp$8[esp+112], ecx | |
| mov ecx, DWORD PTR _out$[esp+108] | |
| mov DWORD PTR __Tmp$8[esp+116], edi | |
| ; 318 : out2[k] = Uk - Zsum; | |
| mov DWORD PTR __Tmp$7[esp+112], esi | |
| movss DWORD PTR __Tmp$8[esp+112], xmm0 | |
| movss xmm0, DWORD PTR __Tmp$8[esp+116] | |
| mov eax, DWORD PTR __Tmp$8[esp+112] | |
| addss xmm0, xmm4 | |
| mov DWORD PTR [edx+ecx], eax | |
| ; 319 : out3[k] = Uk_N1 - Zdif; | |
| mov DWORD PTR __Tmp$6[esp+116], edi | |
; edi gets its offset back from _out1$1$ for the next iteration.
| mov edi, DWORD PTR _out1$1$[esp+108] | |
| subss xmm1, xmm2 | |
| movss DWORD PTR __Tmp$8[esp+116], xmm0 | |
| movss xmm0, DWORD PTR _Uk$1[esp+112] | |
| mov eax, DWORD PTR __Tmp$8[esp+116] | |
| mov DWORD PTR [edx+ecx+4], eax | |
| mov edx, DWORD PTR _Uk$2$[esp+112] | |
| subss xmm0, xmm3 | |
| mov DWORD PTR __Tmp$7[esp+116], edx | |
; The element pointer is parked in edx while ecx carries the _out2$1$ offset;
; it is moved back into ecx further down.
| mov edx, ecx | |
| mov ecx, DWORD PTR _out2$1$[esp+112] | |
| movss DWORD PTR __Tmp$7[esp+112], xmm0 | |
| movss xmm0, DWORD PTR __Tmp$7[esp+116] | |
| mov eax, DWORD PTR __Tmp$7[esp+112] | |
| subss xmm0, xmm6 | |
| mov DWORD PTR [ecx+edx], eax | |
| movss DWORD PTR __Tmp$7[esp+116], xmm0 | |
| mov eax, DWORD PTR __Tmp$7[esp+116] | |
| movss xmm0, DWORD PTR __Tmp$6[esp+116] | |
| mov DWORD PTR [ecx+edx+4], eax | |
| mov ecx, DWORD PTR _Uk_N1$2$[esp+112] | |
; This store to __Tmp$6 is overwritten by the movss from xmm1 three
; instructions later, before the slot is read.
| mov DWORD PTR __Tmp$6[esp+112], ecx | |
| mov ecx, edx | |
| mov edx, DWORD PTR _out3$1$[esp+112] | |
| subss xmm0, xmm4 | |
| movss DWORD PTR __Tmp$6[esp+112], xmm1 | |
| mov eax, DWORD PTR __Tmp$6[esp+112] | |
| mov DWORD PTR [edx+ecx], eax | |
; Advance to the next element. The final store uses [edx+ecx-4] because ecx
; has already been bumped by 8.
| add ecx, 8 | |
; dec sets ZF for the jne at the loop bottom; only mov/movss, which leave the
; flags untouched, sit in between.
| dec DWORD PTR _N$[esp+108] | |
| movss DWORD PTR __Tmp$6[esp+116], xmm0 | |
| mov eax, DWORD PTR __Tmp$6[esp+116] | |
| mov DWORD PTR [edx+ecx-4], eax | |
| mov DWORD PTR _out$[esp+108], ecx | |
| jne $LL3@ffts_rec | |
; Loop exit. Only the first two register restores are shown; the rest of the
; epilogue lies outside this excerpt.
| $LN1@ffts_rec: | |
| pop edi | |
| pop esi | |
| ; --------------------- | |
| ; ---- struct version: | |
| ; 301 : | |
| ; 302 : for (size_t k = 0; k < N1; k++) | |
; Loop guard: the whole loop is skipped when edi is zero.
; NOTE(review): edi presumably holds N1 here - its origin is outside this excerpt.
| test edi, edi | |
| je $LN1@ffts_rec | |
; Four address deltas relative to eax are computed and spilled to the stack:
;   tv840 = ebx - eax, tv838 = _out$ - eax, tv836 = ebp - eax, tv834 = edx - eax.
; esi is the scratch register for each subtraction.
| mov esi, ebx | |
| sub esi, eax | |
| mov DWORD PTR tv840[esp+28], esi | |
| mov esi, DWORD PTR _out$[esp+24] | |
| mov DWORD PTR tv838[esp+24], esi | |
| sub DWORD PTR tv838[esp+24], eax | |
| mov esi, ebp | |
| sub esi, eax | |
| mov DWORD PTR tv836[esp+28], esi | |
| mov esi, edx | |
| sub esi, eax | |
| mov DWORD PTR tv834[esp+28], esi | |
; esi = _out$; ebx, ebp and edx are then rebased against it, so the loop can
; address them as [reg + esi].
| mov esi, DWORD PTR _out$[esp+24] | |
| sub ebx, esi | |
| sub ebp, esi | |
| sub edx, esi | |
; ecx = eax + 4. The loop reads [ecx-4] and [ecx] under the w.re / w.im source
; lines.
; NOTE(review): eax therefore presumably holds the twiddle pointer - confirm
; against the code preceding this excerpt.
| lea ecx, DWORD PTR [eax+4] | |
| mov eax, DWORD PTR tv838[esp+24] | |
| mov DWORD PTR _out1$1$[esp+24], edx | |
; npad 13 emits 13 bytes of padding ahead of the loop label (presumably to
; align the loop entry).
| npad 13 | |
; Loop body, struct build. All floating-point values stay in xmm0-xmm7 for the
; whole iteration. The only stack accesses are reloads of address deltas
; (tv840, tv836, _out1$1$, tv834, tv838) and the decrement of the _N$ counter.
; Two running pointers, ecx and esi, each advance by 8 per iteration.
| $LL3@ffts_rec: | |
| ; 303 : { | |
| ; 304 : #if 0 | |
| ; 305 : complexf Uk = out0[k]; | |
| ; 306 : complexf Uk_N1 = out1[k]; | |
| ; 307 : complexf w = twiddle[k]; | |
| ; 308 : | |
| ; 309 : // Twiddle Zk, Z'k then butterfly | |
| ; 310 : complexf Zk = w * out2[k]; | |
| ; 311 : complexf Zpk = std::conj(w) * out3[k]; | |
| ; 312 : | |
| ; 313 : complexf Zsum = Zk + Zpk; | |
| ; 314 : complexf Zdif = complexf(0.0f, -1.0f) * (Zk - Zpk); | |
| ; 315 : | |
| ; 316 : out0[k] = Uk + Zsum; | |
| ; 317 : out1[k] = Uk_N1 + Zdif; | |
| ; 318 : out2[k] = Uk - Zsum; | |
| ; 319 : out3[k] = Uk_N1 - Zdif; | |
| ; 320 : #else | |
| ; 321 : complexf const &w = twiddle[k]; | |
| ; 322 : complexf const &in2 = out2[k]; | |
| ; 323 : complexf const &in3 = out3[k]; | |
| ; 324 : | |
| ; 325 : float Zkr = w.re*in2.re - w.im*in2.im; | |
; xmm5 = [ecx-4], xmm3 = [ecx], xmm2 = [ebx+esi].
; NOTE(review): presumably w.re, w.im and in2.re respectively, judging by the
; source lines - confirm.
| movss xmm5, DWORD PTR [ecx-4] | |
| movss xmm2, DWORD PTR [ebx+esi] | |
| movss xmm3, DWORD PTR [ecx] | |
| mov edx, DWORD PTR tv840[esp+28] | |
| ; 326 : float Zki = w.re*in2.im + w.im*in2.re; | |
| ; 327 : float Zpkr = w.re*in3.re + w.im*in3.im; | |
| mov edi, DWORD PTR tv836[esp+28] | |
; The instructions for source lines 325-328 are interleaved; xmm4 and xmm7
; end up holding the difference and sum of products formed from [ebx+esi] and
; [edx+ecx], xmm6 and xmm5 those formed from [esi+ebp] and [edi+ecx].
| movss xmm0, DWORD PTR [edx+ecx] | |
| mulss xmm0, xmm3 | |
| movaps xmm4, xmm2 | |
| mulss xmm2, xmm3 | |
| movaps xmm7, xmm5 | |
| mulss xmm7, DWORD PTR [edx+ecx] | |
| mulss xmm4, xmm5 | |
| addss xmm7, xmm2 | |
| movss xmm2, DWORD PTR [esi+ebp] | |
| subss xmm4, xmm0 | |
| movss xmm0, DWORD PTR [edi+ecx] | |
| mulss xmm0, xmm3 | |
| movaps xmm6, xmm2 | |
| mulss xmm6, xmm5 | |
| ; 328 : float Zpki = w.re*in3.im - w.im*in3.re; | |
| mulss xmm5, DWORD PTR [edi+ecx] | |
| addss xmm6, xmm0 | |
| ; 329 : | |
| ; 330 : float Zsumr = Zkr + Zpkr; | |
| ; 331 : float Zsumi = Zki + Zpki; | |
| ; 332 : float Zdifr = Zki - Zpki; | |
| ; 333 : float Zdifi = Zpkr - Zkr; | |
| ; 334 : | |
| ; 335 : out2[k].re = out0[k].re - Zsumr; | |
| movss xmm0, DWORD PTR [esi] | |
| mulss xmm2, xmm3 | |
; ecx and esi are bumped by 8 in the middle of the body; every memory access
; after this point in the iteration carries a -8 displacement to compensate.
| add ecx, 8 | |
| movaps xmm1, xmm6 | |
| subss xmm5, xmm2 | |
| addss xmm1, xmm4 | |
| subss xmm6, xmm4 | |
| add esi, 8 | |
; dec sets ZF for the jne at the bottom; the instructions in between
; (movss / movaps / addss / subss / mov) do not modify the flags.
| dec DWORD PTR _N$[esp+24] | |
| subss xmm0, xmm1 | |
| movss DWORD PTR [ebx+esi-8], xmm0 | |
| ; 336 : out2[k].im = out0[k].im - Zsumi; | |
| movss xmm0, DWORD PTR [eax+ecx-8] | |
| movaps xmm2, xmm5 | |
| addss xmm2, xmm7 | |
| subss xmm7, xmm5 | |
| subss xmm0, xmm2 | |
| movss DWORD PTR [edx+ecx-8], xmm0 | |
| ; 337 : out0[k].re += Zsumr; | |
; Results go straight from xmm registers to destination memory; no stack
; temporaries are involved.
| movss xmm0, DWORD PTR [esi-8] | |
| ; 338 : out0[k].im += Zsumi; | |
| addss xmm2, DWORD PTR [eax+ecx-8] | |
| ; 339 : out3[k].re = out1[k].re - Zdifr; | |
| mov edx, DWORD PTR _out1$1$[esp+24] | |
| addss xmm0, xmm1 | |
| movss DWORD PTR [eax+ecx-8], xmm2 | |
| ; 340 : out3[k].im = out1[k].im - Zdifi; | |
; eax is repointed from tv838 to tv834 for the remaining accesses.
| mov eax, DWORD PTR tv834[esp+28] | |
| movss DWORD PTR [esi-8], xmm0 | |
| movss xmm0, DWORD PTR [edx+esi-8] | |
| subss xmm0, xmm7 | |
| movss DWORD PTR [esi+ebp-8], xmm0 | |
| movss xmm0, DWORD PTR [eax+ecx-8] | |
| subss xmm0, xmm6 | |
| movss DWORD PTR [edi+ecx-8], xmm0 | |
| ; 341 : out1[k].re += Zdifr; | |
| ; 342 : out1[k].im += Zdifi; | |
; [eax+ecx-8] and [edx+esi-8] are read from memory a second time here, after
; the stores above.
| movss xmm0, DWORD PTR [eax+ecx-8] | |
| addss xmm7, DWORD PTR [edx+esi-8] | |
| addss xmm0, xmm6 | |
| movss DWORD PTR [edx+esi-8], xmm7 | |
| movss DWORD PTR [eax+ecx-8], xmm0 | |
; Restore eax = tv838 for the next iteration.
| mov eax, DWORD PTR tv838[esp+24] | |
| jne $LL3@ffts_rec | |
; Loop exit followed by restores of edi, esi, ebp and ebx; the rest of the
; epilogue is not part of this excerpt.
| $LN1@ffts_rec: | |
| pop edi | |
| pop esi | |
| pop ebp | |
| pop ebx |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment