Skip to content

Instantly share code, notes, and snippets.

@blueskythlikesclouds
Last active October 31, 2022 16:14
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save blueskythlikesclouds/fc50fb882dd5698510869e1a3ad9d84d to your computer and use it in GitHub Desktop.
Save blueskythlikesclouds/fc50fb882dd5698510869e1a3ad9d84d to your computer and use it in GitHub Desktop.
Vectorization & unrolling test among compilers through Compiler Explorer
; -O3 -mavx2 -ffast-math
VectorizationTest(Vector<float, 3> const&, Vector<float, 3> const&): # @VectorizationTest(Vector<float, 3> const&, Vector<float, 3> const&)
vmovaps xmm0, xmmword ptr [rdi]
vmovaps xmm1, xmmword ptr [rsi]
vrcpps xmm2, xmm1
vmulps xmm3, xmm0, xmm2
vmulps xmm4, xmm1, xmm3
vsubps xmm4, xmm0, xmm4
vmulps xmm2, xmm2, xmm4
vaddps xmm2, xmm3, xmm2
vaddps xmm8, xmm1, xmm0
vsubps xmm4, xmm0, xmm1
vrcpps xmm5, xmm4
vmulps xmm6, xmm8, xmm8
vmulps xmm7, xmm6, xmm5
vmulps xmm3, xmm4, xmm7
vsubps xmm3, xmm3, xmm6
vmulps xmm3, xmm5, xmm3
vsubps xmm3, xmm3, xmm7
vmulps xmm0, xmm1, xmm0
vrcpps xmm1, xmm0
vmulps xmm5, xmm4, xmm4
vmulps xmm6, xmm5, xmm1
vmulps xmm7, xmm0, xmm6
vsubps xmm5, xmm7, xmm5
vmulps xmm1, xmm1, xmm5
vsubps xmm1, xmm1, xmm6
vaddps xmm1, xmm1, xmm3
vrcpps xmm3, xmm2
vmulps xmm5, xmm0, xmm0
vmulps xmm6, xmm5, xmm3
vmulps xmm7, xmm2, xmm6
vsubps xmm5, xmm7, xmm5
vmulps xmm3, xmm3, xmm5
vsubps xmm3, xmm3, xmm6
vaddps xmm5, xmm8, xmm8
vaddps xmm0, xmm0, xmm4
vaddps xmm0, xmm0, xmm5
vaddps xmm0, xmm2, xmm0
vmulps xmm2, xmm2, xmm2
vsubps xmm0, xmm0, xmm2
vaddps xmm0, xmm0, xmm1
vaddps xmm0, xmm3, xmm0
vpermilpd xmm1, xmm0, 1 # xmm1 = xmm0[1,0]
ret
; -O3 -mavx2 -ffast-math
VectorizationTest(Vector<float, 3> const&, Vector<float, 3> const&):
vmovaps xmm1, XMMWORD PTR [rsi]
vmovaps xmm2, XMMWORD PTR [rdi]
vrcpps xmm0, xmm1
vsubps xmm4, xmm2, xmm1
vrcpps xmm5, xmm4
vmulps xmm3, xmm0, xmm1
vmulps xmm6, xmm5, xmm4
vmulps xmm3, xmm0, xmm3
vaddps xmm0, xmm0, xmm0
vmulps xmm6, xmm5, xmm6
vaddps xmm5, xmm5, xmm5
vsubps xmm0, xmm0, xmm3
vmulps xmm3, xmm2, xmm1
vmulps xmm0, xmm2, xmm0
vaddps xmm2, xmm2, xmm1
vsubps xmm5, xmm5, xmm6
vaddps xmm1, xmm2, xmm2
vmulps xmm2, xmm2, xmm2
vmulps xmm2, xmm2, xmm5
vsubps xmm1, xmm1, xmm2
vrcpps xmm2, xmm3
vaddps xmm1, xmm1, xmm4
vmulps xmm4, xmm4, xmm4
vmulps xmm5, xmm2, xmm3
vmulps xmm5, xmm2, xmm5
vaddps xmm2, xmm2, xmm2
vsubps xmm2, xmm2, xmm5
vmulps xmm4, xmm4, xmm2
vrcpps xmm2, xmm0
vsubps xmm1, xmm1, xmm4
vmulps xmm4, xmm2, xmm0
vaddps xmm1, xmm1, xmm3
vmulps xmm3, xmm3, xmm3
vmulps xmm4, xmm2, xmm4
vaddps xmm2, xmm2, xmm2
vsubps xmm2, xmm2, xmm4
vmulps xmm3, xmm3, xmm2
vsubps xmm1, xmm1, xmm3
vaddps xmm1, xmm1, xmm0
vmulps xmm0, xmm0, xmm0
vsubps xmm7, xmm1, xmm0
vmovaps XMMWORD PTR [rsp-24], xmm7
mov rax, QWORD PTR [rsp-16]
vmovq xmm0, QWORD PTR [rsp-24]
vmovq xmm1, rax
ret
; /O2 /arch:AVX2 /fp:fast
Vector<float,3> VectorizationTest(Vector<float,3> const &,Vector<float,3> const &) PROC ; VectorizationTest, COMDAT
$LN407:
mov QWORD PTR [rsp+8], rbx
mov QWORD PTR [rsp+16], rdi
push rbp
lea rbp, QWORD PTR [rsp-87]
sub rsp, 176 ; 000000b0H
mov rax, QWORD PTR __security_cookie
xor rax, rsp
mov QWORD PTR __$ArrayPad$[rbp-89], rax
mov r10, r8
lea r11, QWORD PTR $T21[rbp-89]
xor eax, eax
mov r9, r10
sub r9, rdx
mov rdi, rcx
sub r11, rdx
mov r8d, eax
mov rbx, rdx
$LL6@Vectorizat:
vrcpps xmm0, XMMWORD PTR [r9+rbx]
vaddps xmm2, xmm0, xmm0
vmulps xmm0, xmm0, xmm0
vfnmadd231ps xmm2, xmm0, XMMWORD PTR [r9+rbx]
vmulps xmm1, xmm2, XMMWORD PTR [rbx]
add r8d, 4
vmovups XMMWORD PTR [r11+rbx], xmm1
lea rbx, QWORD PTR [rbx+16]
cmp r8d, 4
jb SHORT $LL6@Vectorizat
movsxd rcx, r8d
cmp rcx, 4
jae SHORT $LN5@Vectorizat
lea rcx, QWORD PTR [rdx+rcx*4]
npad 7
$LL220@Vectorizat:
vmovss xmm0, DWORD PTR [rcx]
vdivss xmm1, xmm0, DWORD PTR [rcx+r9]
inc r8d
vmovss DWORD PTR [rcx+r11], xmm1
lea rcx, QWORD PTR [rcx+4]
cmp r8d, 4
jb SHORT $LL220@Vectorizat
$LN5@Vectorizat:
lea r11, QWORD PTR $T8[rbp-89]
mov r8d, eax
sub r11, rdx
mov rcx, rdx
npad 6
$LL11@Vectorizat:
vrcpps xmm0, XMMWORD PTR [r9+rcx]
vaddps xmm2, xmm0, xmm0
vmulps xmm0, xmm0, xmm0
vfnmadd231ps xmm2, xmm0, XMMWORD PTR [r9+rcx]
vmulps xmm1, xmm2, XMMWORD PTR [rcx]
add r8d, 4
vmovups XMMWORD PTR [r11+rcx], xmm1
lea rcx, QWORD PTR [rcx+16]
cmp r8d, 4
jb SHORT $LL11@Vectorizat
movsxd rcx, r8d
cmp rcx, 4
jae SHORT $LN10@Vectorizat
lea rcx, QWORD PTR [rdx+rcx*4]
npad 7
$LL222@Vectorizat:
vmovss xmm0, DWORD PTR [rcx]
vdivss xmm1, xmm0, DWORD PTR [r9+rcx]
inc r8d
vmovss DWORD PTR [r11+rcx], xmm1
lea rcx, QWORD PTR [rcx+4]
cmp r8d, 4
jb SHORT $LL222@Vectorizat
$LN10@Vectorizat:
mov ecx, eax
mov r8, rax
$LL16@Vectorizat:
vmovups xmm1, XMMWORD PTR $T21[rbp+r8-89]
vmulps xmm1, xmm1, XMMWORD PTR $T8[rbp+r8-89]
add ecx, 4
vmovups XMMWORD PTR $T29[rbp+r8-89], xmm1
lea r8, QWORD PTR [r8+16]
cmp ecx, 4
jb SHORT $LL16@Vectorizat
movsxd r8, ecx
cmp r8, 4
jae SHORT $LN15@Vectorizat
lea r8, QWORD PTR [r8*4]
npad 12
$LL224@Vectorizat:
vmovss xmm0, DWORD PTR $T21[rbp+r8-89]
vmulss xmm1, xmm0, DWORD PTR $T8[rbp+r8-89]
inc ecx
vmovss DWORD PTR $T29[rbp+r8-89], xmm1
lea r8, QWORD PTR [r8+4]
cmp ecx, 4
jb SHORT $LL224@Vectorizat
$LN15@Vectorizat:
lea r11, QWORD PTR $T28[rbp-89]
mov r8d, eax
sub r11, rdx
mov rcx, rdx
npad 3
$LL21@Vectorizat:
vrcpps xmm0, XMMWORD PTR [r9+rcx]
vaddps xmm2, xmm0, xmm0
vmulps xmm0, xmm0, xmm0
vfnmadd231ps xmm2, xmm0, XMMWORD PTR [r9+rcx]
vmulps xmm1, xmm2, XMMWORD PTR [rcx]
add r8d, 4
vmovups XMMWORD PTR [r11+rcx], xmm1
lea rcx, QWORD PTR [rcx+16]
cmp r8d, 4
jb SHORT $LL21@Vectorizat
movsxd rcx, r8d
cmp rcx, 4
jae SHORT $LN20@Vectorizat
lea rcx, QWORD PTR [rdx+rcx*4]
npad 7
$LL226@Vectorizat:
vmovss xmm0, DWORD PTR [rcx]
vdivss xmm1, xmm0, DWORD PTR [r9+rcx]
inc r8d
vmovss DWORD PTR [r11+rcx], xmm1
lea rcx, QWORD PTR [rcx+4]
cmp r8d, 4
jb SHORT $LL226@Vectorizat
$LN20@Vectorizat:
lea r11, QWORD PTR $T2[rbp-89]
mov r8d, eax
sub r11, rdx
mov rcx, rdx
npad 6
$LL26@Vectorizat:
vrcpps xmm0, XMMWORD PTR [r9+rcx]
vaddps xmm2, xmm0, xmm0
vmulps xmm0, xmm0, xmm0
vfnmadd231ps xmm2, xmm0, XMMWORD PTR [r9+rcx]
vmulps xmm1, xmm2, XMMWORD PTR [rcx]
add r8d, 4
vmovups XMMWORD PTR [r11+rcx], xmm1
lea rcx, QWORD PTR [rcx+16]
cmp r8d, 4
jb SHORT $LL26@Vectorizat
movsxd rcx, r8d
cmp rcx, 4
jae SHORT $LN25@Vectorizat
lea rcx, QWORD PTR [rdx+rcx*4]
npad 7
$LL228@Vectorizat:
vmovss xmm0, DWORD PTR [rcx]
vdivss xmm1, xmm0, DWORD PTR [r9+rcx]
inc r8d
vmovss DWORD PTR [r11+rcx], xmm1
lea rcx, QWORD PTR [rcx+4]
cmp r8d, 4
jb SHORT $LL228@Vectorizat
$LN25@Vectorizat:
sub rdx, r10
lea r9, QWORD PTR $T24[rbp-89]
sub r9, r10
mov ecx, eax
mov r8, r10
npad 4
$LL31@Vectorizat:
vmovups xmm1, XMMWORD PTR [rdx+r8]
vmulps xmm1, xmm1, XMMWORD PTR [r8]
add ecx, 4
vmovups XMMWORD PTR [r9+r8], xmm1
lea r8, QWORD PTR [r8+16]
cmp ecx, 4
jb SHORT $LL31@Vectorizat
movsxd r8, ecx
cmp r8, 4
jae SHORT $LN30@Vectorizat
lea r8, QWORD PTR [r10+r8*4]
npad 6
$LL230@Vectorizat:
vmovss xmm0, DWORD PTR [rdx+r8]
vmulss xmm1, xmm0, DWORD PTR [r8]
inc ecx
vmovss DWORD PTR [r9+r8], xmm1
lea r8, QWORD PTR [r8+4]
cmp ecx, 4
jb SHORT $LL230@Vectorizat
$LN30@Vectorizat:
lea r9, QWORD PTR $T9[rbp-89]
mov ecx, eax
sub r9, r10
mov r8, r10
npad 8
$LL36@Vectorizat:
vmovups xmm1, XMMWORD PTR [rdx+r8]
vmulps xmm1, xmm1, XMMWORD PTR [r8]
add ecx, 4
vmovups XMMWORD PTR [r9+r8], xmm1
lea r8, QWORD PTR [r8+16]
cmp ecx, 4
jb SHORT $LL36@Vectorizat
movsxd r8, ecx
cmp r8, 4
jae SHORT $LN35@Vectorizat
lea r8, QWORD PTR [r10+r8*4]
npad 6
$LL232@Vectorizat:
vmovss xmm0, DWORD PTR [r8+rdx]
vmulss xmm1, xmm0, DWORD PTR [r8]
inc ecx
vmovss DWORD PTR [r8+r9], xmm1
lea r8, QWORD PTR [r8+4]
cmp ecx, 4
jb SHORT $LL232@Vectorizat
$LN35@Vectorizat:
mov ecx, eax
mov r8, rax
$LL41@Vectorizat:
vmovups xmm1, XMMWORD PTR $T24[rbp+r8-89]
vmulps xmm1, xmm1, XMMWORD PTR $T9[rbp+r8-89]
add ecx, 4
vmovups XMMWORD PTR $T13[rbp+r8-89], xmm1
lea r8, QWORD PTR [r8+16]
cmp ecx, 4
jb SHORT $LL41@Vectorizat
movsxd r8, ecx
cmp r8, 4
jae SHORT $LN40@Vectorizat
lea r8, QWORD PTR [r8*4]
npad 13
$LL234@Vectorizat:
vmovss xmm0, DWORD PTR $T24[rbp+r8-89]
vmulss xmm1, xmm0, DWORD PTR $T9[rbp+r8-89]
inc ecx
vmovss DWORD PTR $T13[rbp+r8-89], xmm1
lea r8, QWORD PTR [r8+4]
cmp ecx, 4
jb SHORT $LL234@Vectorizat
$LN40@Vectorizat:
mov r8d, eax
mov rcx, rax
npad 10
$LL46@Vectorizat:
vmovups xmm1, XMMWORD PTR $T2[rbp+rcx-89]
vrcpps xmm0, xmm1
vaddps xmm2, xmm0, xmm0
vmulps xmm0, xmm0, xmm0
vfnmadd231ps xmm2, xmm0, xmm1
vmulps xmm1, xmm2, XMMWORD PTR $T13[rbp+rcx-89]
add r8d, 4
vmovups XMMWORD PTR $T23[rbp+rcx-89], xmm1
lea rcx, QWORD PTR [rcx+16]
cmp r8d, 4
jb SHORT $LL46@Vectorizat
movsxd rcx, r8d
cmp rcx, 4
jae SHORT $LN45@Vectorizat
lea rcx, QWORD PTR [rcx*4]
npad 14
$LL236@Vectorizat:
vmovss xmm0, DWORD PTR $T13[rbp+rcx-89]
vdivss xmm1, xmm0, DWORD PTR $T2[rbp+rcx-89]
inc r8d
vmovss DWORD PTR $T23[rbp+rcx-89], xmm1
lea rcx, QWORD PTR [rcx+4]
cmp r8d, 4
jb SHORT $LL236@Vectorizat
$LN45@Vectorizat:
lea r9, QWORD PTR $T31[rbp-89]
mov ecx, eax
sub r9, r10
mov r8, r10
npad 5
$LL51@Vectorizat:
vmovups xmm1, XMMWORD PTR [r8+rdx]
vmulps xmm1, xmm1, XMMWORD PTR [r8]
add ecx, 4
vmovups XMMWORD PTR [r9+r8], xmm1
lea r8, QWORD PTR [r8+16]
cmp ecx, 4
jb SHORT $LL51@Vectorizat
movsxd r8, ecx
cmp r8, 4
jae SHORT $LN50@Vectorizat
lea r8, QWORD PTR [r10+r8*4]
npad 6
$LL238@Vectorizat:
vmovss xmm0, DWORD PTR [r8+rdx]
vmulss xmm1, xmm0, DWORD PTR [r8]
inc ecx
vmovss DWORD PTR [r8+r9], xmm1
lea r8, QWORD PTR [r8+4]
cmp ecx, 4
jb SHORT $LL238@Vectorizat
$LN50@Vectorizat:
lea r9, QWORD PTR $T17[rbp-89]
mov ecx, eax
sub r9, r10
mov r8, r10
npad 8
$LL56@Vectorizat:
vmovups xmm1, XMMWORD PTR [r8+rdx]
vmulps xmm1, xmm1, XMMWORD PTR [r8]
add ecx, 4
vmovups XMMWORD PTR [r9+r8], xmm1
lea r8, QWORD PTR [r8+16]
cmp ecx, 4
jb SHORT $LL56@Vectorizat
movsxd r8, ecx
cmp r8, 4
jae SHORT $LN55@Vectorizat
lea r8, QWORD PTR [r10+r8*4]
npad 6
$LL240@Vectorizat:
vmovss xmm0, DWORD PTR [rdx+r8]
vmulss xmm1, xmm0, DWORD PTR [r8]
inc ecx
vmovss DWORD PTR [r9+r8], xmm1
lea r8, QWORD PTR [r8+4]
cmp ecx, 4
jb SHORT $LL240@Vectorizat
$LN55@Vectorizat:
lea r9, QWORD PTR $T12[rbp-89]
mov ecx, eax
sub r9, r10
mov r8, r10
npad 8
$LL61@Vectorizat:
vmovups xmm1, XMMWORD PTR [rdx+r8]
vsubps xmm1, xmm1, XMMWORD PTR [r8]
add ecx, 4
vmovups XMMWORD PTR [r9+r8], xmm1
lea r8, QWORD PTR [r8+16]
cmp ecx, 4
jb SHORT $LL61@Vectorizat
movsxd r8, ecx
cmp r8, 4
jae SHORT $LN60@Vectorizat
lea r8, QWORD PTR [r10+r8*4]
npad 6
$LL242@Vectorizat:
vmovss xmm0, DWORD PTR [rdx+r8]
vsubss xmm1, xmm0, DWORD PTR [r8]
inc ecx
vmovss DWORD PTR [r9+r8], xmm1
lea r8, QWORD PTR [r8+4]
cmp ecx, 4
jb SHORT $LL242@Vectorizat
$LN60@Vectorizat:
lea r9, QWORD PTR $T4[rbp-89]
mov ecx, eax
sub r9, r10
mov r8, r10
npad 8
$LL66@Vectorizat:
vmovups xmm1, XMMWORD PTR [rdx+r8]
vsubps xmm1, xmm1, XMMWORD PTR [r8]
add ecx, 4
vmovups XMMWORD PTR [r9+r8], xmm1
lea r8, QWORD PTR [r8+16]
cmp ecx, 4
jb SHORT $LL66@Vectorizat
movsxd r8, ecx
cmp r8, 4
jae SHORT $LN65@Vectorizat
lea r8, QWORD PTR [r10+r8*4]
npad 6
$LL244@Vectorizat:
vmovss xmm0, DWORD PTR [rdx+r8]
vsubss xmm1, xmm0, DWORD PTR [r8]
inc ecx
vmovss DWORD PTR [r9+r8], xmm1
lea r8, QWORD PTR [r8+4]
cmp ecx, 4
jb SHORT $LL244@Vectorizat
$LN65@Vectorizat:
mov ecx, eax
mov r8, rax
$LL71@Vectorizat:
vmovups xmm1, XMMWORD PTR $T12[rbp+r8-89]
vmulps xmm1, xmm1, XMMWORD PTR $T4[rbp+r8-89]
add ecx, 4
vmovups XMMWORD PTR $T14[rbp+r8-89], xmm1
lea r8, QWORD PTR [r8+16]
cmp ecx, 4
jb SHORT $LL71@Vectorizat
movsxd r8, ecx
cmp r8, 4
jae SHORT $LN70@Vectorizat
lea r8, QWORD PTR [r8*4]
npad 13
$LL246@Vectorizat:
vmovss xmm0, DWORD PTR $T12[rbp+r8-89]
vmulss xmm1, xmm0, DWORD PTR $T4[rbp+r8-89]
inc ecx
vmovss DWORD PTR $T14[rbp+r8-89], xmm1
lea r8, QWORD PTR [r8+4]
cmp ecx, 4
jb SHORT $LL246@Vectorizat
$LN70@Vectorizat:
mov r8d, eax
mov rcx, rax
npad 10
$LL76@Vectorizat:
vmovups xmm1, XMMWORD PTR $T17[rbp+rcx-89]
vrcpps xmm0, xmm1
vaddps xmm2, xmm0, xmm0
vmulps xmm0, xmm0, xmm0
vfnmadd231ps xmm2, xmm0, xmm1
vmulps xmm1, xmm2, XMMWORD PTR $T14[rbp+rcx-89]
add r8d, 4
vmovups XMMWORD PTR $T11[rbp+rcx-89], xmm1
lea rcx, QWORD PTR [rcx+16]
cmp r8d, 4
jb SHORT $LL76@Vectorizat
movsxd rcx, r8d
cmp rcx, 4
jae SHORT $LN75@Vectorizat
lea rcx, QWORD PTR [rcx*4]
npad 14
$LL248@Vectorizat:
vmovss xmm0, DWORD PTR $T14[rbp+rcx-89]
vdivss xmm1, xmm0, DWORD PTR $T17[rbp+rcx-89]
inc r8d
vmovss DWORD PTR $T11[rbp+rcx-89], xmm1
lea rcx, QWORD PTR [rcx+4]
cmp r8d, 4
jb SHORT $LL248@Vectorizat
$LN75@Vectorizat:
lea r9, QWORD PTR $T30[rbp-89]
mov ecx, eax
sub r9, r10
mov r8, r10
npad 5
$LL81@Vectorizat:
vmovups xmm1, XMMWORD PTR [rdx+r8]
vsubps xmm1, xmm1, XMMWORD PTR [r8]
add ecx, 4
vmovups XMMWORD PTR [r9+r8], xmm1
lea r8, QWORD PTR [r8+16]
cmp ecx, 4
jb SHORT $LL81@Vectorizat
movsxd r8, ecx
cmp r8, 4
jae SHORT $LN80@Vectorizat
lea r8, QWORD PTR [r10+r8*4]
npad 6
$LL250@Vectorizat:
vmovss xmm0, DWORD PTR [rdx+r8]
vsubss xmm1, xmm0, DWORD PTR [r8]
inc ecx
vmovss DWORD PTR [r9+r8], xmm1
lea r8, QWORD PTR [r8+4]
cmp ecx, 4
jb SHORT $LL250@Vectorizat
$LN80@Vectorizat:
lea r9, QWORD PTR $T25[rbp-89]
mov ecx, eax
sub r9, r10
mov r8, r10
npad 8
$LL86@Vectorizat:
vmovups xmm1, XMMWORD PTR [rdx+r8]
vsubps xmm1, xmm1, XMMWORD PTR [r8]
add ecx, 4
vmovups XMMWORD PTR [r9+r8], xmm1
lea r8, QWORD PTR [r8+16]
cmp ecx, 4
jb SHORT $LL86@Vectorizat
movsxd r8, ecx
cmp r8, 4
jae SHORT $LN85@Vectorizat
lea r8, QWORD PTR [r10+r8*4]
npad 6
$LL252@Vectorizat:
vmovss xmm0, DWORD PTR [r8+rdx]
vsubss xmm1, xmm0, DWORD PTR [r8]
inc ecx
vmovss DWORD PTR [r9+r8], xmm1
lea r8, QWORD PTR [r8+4]
cmp ecx, 4
jb SHORT $LL252@Vectorizat
$LN85@Vectorizat:
lea r9, QWORD PTR $T6[rbp-89]
mov ecx, eax
sub r9, r10
mov r8, r10
npad 8
$LL91@Vectorizat:
vmovups xmm1, XMMWORD PTR [r8+rdx]
vaddps xmm1, xmm1, XMMWORD PTR [r8]
add ecx, 4
vmovups XMMWORD PTR [r9+r8], xmm1
lea r8, QWORD PTR [r8+16]
cmp ecx, 4
jb SHORT $LL91@Vectorizat
movsxd r8, ecx
cmp r8, 4
jae SHORT $LN90@Vectorizat
lea r8, QWORD PTR [r10+r8*4]
npad 6
$LL254@Vectorizat:
vmovss xmm0, DWORD PTR [r8+rdx]
vaddss xmm1, xmm0, DWORD PTR [r8]
inc ecx
vmovss DWORD PTR [r8+r9], xmm1
lea r8, QWORD PTR [r8+4]
cmp ecx, 4
jb SHORT $LL254@Vectorizat
$LN90@Vectorizat:
lea r9, QWORD PTR $T20[rbp-89]
mov ecx, eax
sub r9, r10
mov r8, r10
npad 8
$LL96@Vectorizat:
vmovups xmm1, XMMWORD PTR [r8+rdx]
vaddps xmm1, xmm1, XMMWORD PTR [r8]
add ecx, 4
vmovups XMMWORD PTR [r9+r8], xmm1
lea r8, QWORD PTR [r8+16]
cmp ecx, 4
jb SHORT $LL96@Vectorizat
movsxd r8, ecx
cmp r8, 4
jae SHORT $LN95@Vectorizat
lea r8, QWORD PTR [r10+r8*4]
npad 6
$LL256@Vectorizat:
vmovss xmm0, DWORD PTR [r8+rdx]
vaddss xmm1, xmm0, DWORD PTR [r8]
inc ecx
vmovss DWORD PTR [r8+r9], xmm1
lea r8, QWORD PTR [r8+4]
cmp ecx, 4
jb SHORT $LL256@Vectorizat
$LN95@Vectorizat:
mov ecx, eax
mov r8, rax
$LL101@Vectorizat:
vmovups xmm1, XMMWORD PTR $T6[rbp+r8-89]
vmulps xmm1, xmm1, XMMWORD PTR $T20[rbp+r8-89]
add ecx, 4
vmovups XMMWORD PTR $T15[rbp+r8-89], xmm1
lea r8, QWORD PTR [r8+16]
cmp ecx, 4
jb SHORT $LL101@Vectorizat
movsxd r8, ecx
cmp r8, 4
jae SHORT $LN100@Vectorizat
lea r8, QWORD PTR [r8*4]
npad 13
$LL258@Vectorizat:
vmovss xmm0, DWORD PTR $T6[rbp+r8-89]
vmulss xmm1, xmm0, DWORD PTR $T20[rbp+r8-89]
inc ecx
vmovss DWORD PTR $T15[rbp+r8-89], xmm1
lea r8, QWORD PTR [r8+4]
cmp ecx, 4
jb SHORT $LL258@Vectorizat
$LN100@Vectorizat:
mov r8d, eax
mov rcx, rax
npad 10
$LL106@Vectorizat:
vmovups xmm1, XMMWORD PTR $T25[rbp+rcx-89]
vrcpps xmm0, xmm1
vaddps xmm2, xmm0, xmm0
vmulps xmm0, xmm0, xmm0
vfnmadd231ps xmm2, xmm0, xmm1
vmulps xmm1, xmm2, XMMWORD PTR $T15[rbp+rcx-89]
add r8d, 4
vmovups XMMWORD PTR $T19[rbp+rcx-89], xmm1
lea rcx, QWORD PTR [rcx+16]
cmp r8d, 4
jb SHORT $LL106@Vectorizat
movsxd rcx, r8d
cmp rcx, 4
jae SHORT $LN105@Vectorizat
lea rcx, QWORD PTR [rcx*4]
npad 14
$LL260@Vectorizat:
vmovss xmm0, DWORD PTR $T15[rbp+rcx-89]
vdivss xmm1, xmm0, DWORD PTR $T25[rbp+rcx-89]
inc r8d
vmovss DWORD PTR $T19[rbp+rcx-89], xmm1
lea rcx, QWORD PTR [rcx+4]
cmp r8d, 4
jb SHORT $LL260@Vectorizat
$LN105@Vectorizat:
lea r9, QWORD PTR $T26[rbp-89]
mov ecx, eax
sub r9, r10
mov r8, r10
npad 5
$LL111@Vectorizat:
vmovups xmm1, XMMWORD PTR [rdx+r8]
vaddps xmm1, xmm1, XMMWORD PTR [r8]
add ecx, 4
vmovups XMMWORD PTR [r9+r8], xmm1
lea r8, QWORD PTR [r8+16]
cmp ecx, 4
jb SHORT $LL111@Vectorizat
movsxd r8, ecx
cmp r8, 4
jae SHORT $LN110@Vectorizat
lea r8, QWORD PTR [r10+r8*4]
npad 6
$LL262@Vectorizat:
vmovss xmm0, DWORD PTR [rdx+r8]
vaddss xmm1, xmm0, DWORD PTR [r8]
inc ecx
vmovss DWORD PTR [r9+r8], xmm1
lea r8, QWORD PTR [r8+4]
cmp ecx, 4
jb SHORT $LL262@Vectorizat
$LN110@Vectorizat:
lea r9, QWORD PTR $T7[rbp-89]
mov ecx, eax
sub r9, r10
mov r8, r10
npad 8
$LL116@Vectorizat:
vmovups xmm1, XMMWORD PTR [rdx+r8]
vaddps xmm1, xmm1, XMMWORD PTR [r8]
add ecx, 4
vmovups XMMWORD PTR [r9+r8], xmm1
lea r8, QWORD PTR [r8+16]
cmp ecx, 4
jb SHORT $LL116@Vectorizat
movsxd r8, ecx
cmp r8, 4
jae SHORT $LN115@Vectorizat
lea r8, QWORD PTR [r10+r8*4]
npad 6
$LL264@Vectorizat:
vmovss xmm0, DWORD PTR [rdx+r8]
vaddss xmm1, xmm0, DWORD PTR [r8]
inc ecx
vmovss DWORD PTR [r9+r8], xmm1
lea r8, QWORD PTR [r8+4]
cmp ecx, 4
jb SHORT $LL264@Vectorizat
$LN115@Vectorizat:
mov ecx, eax
mov rdx, rax
$LL121@Vectorizat:
vmovups xmm1, XMMWORD PTR $T26[rbp+rdx-89]
vaddps xmm1, xmm1, XMMWORD PTR $T7[rbp+rdx-89]
add ecx, 4
vmovups XMMWORD PTR $T16[rbp+rdx-89], xmm1
lea rdx, QWORD PTR [rdx+16]
cmp ecx, 4
jb SHORT $LL121@Vectorizat
movsxd rdx, ecx
cmp rdx, 4
jae SHORT $LN120@Vectorizat
lea rdx, QWORD PTR [rdx*4]
$LL266@Vectorizat:
vmovss xmm0, DWORD PTR $T26[rbp+rdx-89]
vaddss xmm1, xmm0, DWORD PTR $T7[rbp+rdx-89]
inc ecx
vmovss DWORD PTR $T16[rbp+rdx-89], xmm1
lea rdx, QWORD PTR [rdx+4]
cmp ecx, 4
jb SHORT $LL266@Vectorizat
$LN120@Vectorizat:
mov ecx, eax
mov rdx, rax
npad 14
$LL126@Vectorizat:
vmovups xmm0, XMMWORD PTR $T16[rbp+rdx-89]
vsubps xmm1, xmm0, XMMWORD PTR $T19[rbp+rdx-89]
add ecx, 4
vmovups XMMWORD PTR $T5[rbp+rdx-89], xmm1
lea rdx, QWORD PTR [rdx+16]
cmp ecx, 4
jb SHORT $LL126@Vectorizat
movsxd rdx, ecx
cmp rdx, 4
jae SHORT $LN125@Vectorizat
lea rdx, QWORD PTR [rdx*4]
npad 1
$LL268@Vectorizat:
vmovss xmm0, DWORD PTR $T16[rbp+rdx-89]
vsubss xmm1, xmm0, DWORD PTR $T19[rbp+rdx-89]
inc ecx
vmovss DWORD PTR $T5[rbp+rdx-89], xmm1
lea rdx, QWORD PTR [rdx+4]
cmp ecx, 4
jb SHORT $LL268@Vectorizat
$LN125@Vectorizat:
mov ecx, eax
mov rdx, rax
npad 14
$LL131@Vectorizat:
vmovups xmm1, XMMWORD PTR $T30[rbp+rdx-89]
vaddps xmm1, xmm1, XMMWORD PTR $T5[rbp+rdx-89]
add ecx, 4
vmovups XMMWORD PTR $T18[rbp+rdx-89], xmm1
lea rdx, QWORD PTR [rdx+16]
cmp ecx, 4
jb SHORT $LL131@Vectorizat
movsxd rdx, ecx
cmp rdx, 4
jae SHORT $LN130@Vectorizat
lea rdx, QWORD PTR [rdx*4]
npad 1
$LL270@Vectorizat:
vmovss xmm0, DWORD PTR $T30[rbp+rdx-89]
vaddss xmm1, xmm0, DWORD PTR $T5[rbp+rdx-89]
inc ecx
vmovss DWORD PTR $T18[rbp+rdx-89], xmm1
lea rdx, QWORD PTR [rdx+4]
cmp ecx, 4
jb SHORT $LL270@Vectorizat
$LN130@Vectorizat:
mov ecx, eax
mov rdx, rax
npad 14
$LL136@Vectorizat:
vmovups xmm0, XMMWORD PTR $T18[rbp+rdx-89]
vsubps xmm1, xmm0, XMMWORD PTR $T11[rbp+rdx-89]
add ecx, 4
vmovups XMMWORD PTR $T3[rbp+rdx-89], xmm1
lea rdx, QWORD PTR [rdx+16]
cmp ecx, 4
jb SHORT $LL136@Vectorizat
movsxd rdx, ecx
cmp rdx, 4
jae SHORT $LN135@Vectorizat
lea rdx, QWORD PTR [rdx*4]
npad 1
$LL272@Vectorizat:
vmovss xmm0, DWORD PTR $T18[rbp+rdx-89]
vsubss xmm1, xmm0, DWORD PTR $T11[rbp+rdx-89]
inc ecx
vmovss DWORD PTR $T3[rbp+rdx-89], xmm1
lea rdx, QWORD PTR [rdx+4]
cmp ecx, 4
jb SHORT $LL272@Vectorizat
$LN135@Vectorizat:
mov ecx, eax
mov rdx, rax
npad 14
$LL141@Vectorizat:
vmovups xmm1, XMMWORD PTR $T31[rbp+rdx-89]
vaddps xmm1, xmm1, XMMWORD PTR $T3[rbp+rdx-89]
add ecx, 4
vmovups XMMWORD PTR $T10[rbp+rdx-89], xmm1
lea rdx, QWORD PTR [rdx+16]
cmp ecx, 4
jb SHORT $LL141@Vectorizat
movsxd rdx, ecx
cmp rdx, 4
jae SHORT $LN140@Vectorizat
lea rdx, QWORD PTR [rdx*4]
npad 1
$LL274@Vectorizat:
vmovss xmm0, DWORD PTR $T31[rbp+rdx-89]
vaddss xmm1, xmm0, DWORD PTR $T3[rbp+rdx-89]
inc ecx
vmovss DWORD PTR $T10[rbp+rdx-89], xmm1
lea rdx, QWORD PTR [rdx+4]
cmp ecx, 4
jb SHORT $LL274@Vectorizat
$LN140@Vectorizat:
mov ecx, eax
mov rdx, rax
npad 14
$LL146@Vectorizat:
vmovups xmm0, XMMWORD PTR $T10[rbp+rdx-89]
vsubps xmm1, xmm0, XMMWORD PTR $T23[rbp+rdx-89]
add ecx, 4
vmovups XMMWORD PTR $T1[rbp+rdx-89], xmm1
lea rdx, QWORD PTR [rdx+16]
cmp ecx, 4
jb SHORT $LL146@Vectorizat
movsxd rdx, ecx
cmp rdx, 4
jae SHORT $LN145@Vectorizat
lea rdx, QWORD PTR [rdx*4]
npad 1
$LL276@Vectorizat:
vmovss xmm0, DWORD PTR $T10[rbp+rdx-89]
vsubss xmm1, xmm0, DWORD PTR $T23[rbp+rdx-89]
inc ecx
vmovss DWORD PTR $T1[rbp+rdx-89], xmm1
lea rdx, QWORD PTR [rdx+4]
cmp ecx, 4
jb SHORT $LL276@Vectorizat
$LN145@Vectorizat:
mov ecx, eax
mov rdx, rax
npad 14
$LL151@Vectorizat:
vmovups xmm1, XMMWORD PTR $T28[rbp+rdx-89]
vaddps xmm1, xmm1, XMMWORD PTR $T1[rbp+rdx-89]
add ecx, 4
vmovups XMMWORD PTR $T22[rbp+rdx-89], xmm1
lea rdx, QWORD PTR [rdx+16]
cmp ecx, 4
jb SHORT $LL151@Vectorizat
movsxd rdx, ecx
cmp rdx, 4
jae SHORT $LN150@Vectorizat
lea rdx, QWORD PTR [rdx*4]
npad 1
$LL278@Vectorizat:
vmovss xmm0, DWORD PTR $T28[rbp+rdx-89]
vaddss xmm1, xmm0, DWORD PTR $T1[rbp+rdx-89]
inc ecx
vmovss DWORD PTR $T22[rbp+rdx-89], xmm1
lea rdx, QWORD PTR [rdx+4]
cmp ecx, 4
jb SHORT $LL278@Vectorizat
$LN150@Vectorizat:
mov rcx, rax
$LL156@Vectorizat:
vmovups xmm0, XMMWORD PTR $T22[rbp+rcx-89]
vsubps xmm1, xmm0, XMMWORD PTR $T29[rbp+rcx-89]
add eax, 4
vmovups XMMWORD PTR $T27[rbp+rcx-89], xmm1
lea rcx, QWORD PTR [rcx+16]
cmp eax, 4
jb SHORT $LL156@Vectorizat
movsxd rcx, eax
cmp rcx, 4
jae SHORT $LN155@Vectorizat
lea rcx, QWORD PTR [rcx*4]
npad 1
$LL280@Vectorizat:
vmovss xmm0, DWORD PTR $T22[rbp+rcx-89]
vsubss xmm1, xmm0, DWORD PTR $T29[rbp+rcx-89]
inc eax
vmovss DWORD PTR $T27[rbp+rcx-89], xmm1
lea rcx, QWORD PTR [rcx+4]
cmp eax, 4
jb SHORT $LL280@Vectorizat
$LN155@Vectorizat:
vmovups xmm0, XMMWORD PTR $T27[rbp-89]
vmovups XMMWORD PTR [rdi], xmm0
mov rax, rdi
mov rcx, QWORD PTR __$ArrayPad$[rbp-89]
xor rcx, rsp
call __security_check_cookie
lea r11, QWORD PTR [rsp+176]
mov rbx, QWORD PTR [r11+16]
mov rdi, QWORD PTR [r11+24]
mov rsp, r11
pop rbp
ret 0
#include <cmath>
#include <cstdio>
template<typename T, int N>
struct alignas((sizeof(T) * N + 7) & ~7) Vector
{
T data[(N + 3) & ~3];
#define MAKE_VECTOR_OPERATOR(type) \
Vector operator type(const Vector& other) const \
{ \
Vector result; \
for (int i = 0; i < sizeof(data) / sizeof(T); i++) \
{ \
result.data[i] = data[i] type other.data[i]; \
} \
return result; \
} \
void operator type##=(const Vector& other) \
{ \
for (int i = 0; i < sizeof(data) / sizeof(T); i++) \
{ \
data[i] type##= other.data[i]; \
} \
} \
Vector operator type(const T other) const \
{ \
Vector result; \
for (int i = 0; i < sizeof(data) / sizeof(T); i++) \
{ \
result.data[i] = data[i] type other; \
} \
return result; \
} \
\
void operator type##=(const T other) \
{ \
for (int i = 0; i < sizeof(data) / sizeof(T); i++) \
{ \
data[i] type##= other; \
} \
}
MAKE_VECTOR_OPERATOR(+)
MAKE_VECTOR_OPERATOR(-)
MAKE_VECTOR_OPERATOR(*)
MAKE_VECTOR_OPERATOR(/)
};
using Vector2 = Vector<float, 2>;
using Vector3 = Vector<float, 3>;
using Vector4 = Vector<float, 4>;
Vector3 VectorizationTest(const Vector3& a, const Vector3& b)
{
return
(a + b) +
(a + b) -
(a + b) *
(a + b) /
(a - b) +
(a - b) -
(a - b) *
(a - b) /
(a * b) +
(a * b) -
(a * b) *
(a * b) /
(a / b) +
(a / b) -
(a / b) *
(a / b);
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment