Because of the new APX stuff I got to thinking about the different changes to x86 over time and how they affect code: general compiler-generated code, not just the manual-intrinsics case...
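VEX was introduced as part of AVX and brought non-destructive three-operand ("ternary") forms, which usually shortens the encoding because the extra register copy disappears.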
; pre-VEX: legacy SSE arithmetic is destructive (the destination is also the first source)
movaps xmm1, xmm2 ; copy the left operand first so xmm2 is not overwritten
subss xmm1, xmm3 ; low float of xmm1 = low float of xmm1 - low float of xmm3
; VEX: non-destructive three-operand form
vsubss xmm1, xmm2, xmm3 ; xmm1 = xmm2 - xmm3 in one instruction; usually fewer bytes than the pair above
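POPCNT (usually lumped in with the BMI bit-manipulation extensions, although it has its own CPUID feature bit) added popcount as an instruction, so we no longer need to emulate it: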
; pre-BMI: software popcount of edi by parallel bit-summing, result in eax
mov eax, edi
shr eax, 1
and eax, 0x55555555 ; upper bit of every 2-bit pair, moved down one place
sub edi, eax ; every 2-bit field now holds its own bit count (0..2)
mov eax, edi
and eax, 0x33333333 ; low 2-bit field of each nibble
shr edi, 2
and edi, 0x33333333 ; high 2-bit field of each nibble
add edi, eax ; every nibble now holds its own bit count (0..4)
mov eax, edi
shr eax, 4
add eax, edi ; add neighbouring nibbles together
and eax, 0x0F0F0F0F ; every byte now holds its own bit count (0..8)
imul eax, eax, 0x01010101 ; top byte becomes the sum of all four byte counts
shr eax, 24 ; bring that sum down to the low byte: eax = popcount
ret
; BMI: a single instruction does the whole count
popcnt eax, edi ; eax = number of set bits in edi
ret
BMI-2 introduced new shift ops (SHLX/SHRX/SARX), which means variable shifts no longer have to take their count in CL (the shift-by-immediate case never used CL, but the variable-count forms did):
; pre BMI-2: rax = [rdi] << rsi
mov ecx, esi ; the variable shift count must be in CL; copy the full 32-bit register to avoid a partial-register write
mov rax, [rdi] ; value to shift
shl rax, cl ; forced use of CL, clobbers FLAGS
; BMI-2: rax = [rdi] << rsi, with the count taken from any register and the source read straight from memory
shlx rax, [rdi], rsi ; ternary and allowed to use other registers, doesn't update FLAGS
With APX we can now do conditional loads which don't fault when the condition is false, so even though rdi
would have a NULL pointer in it, the segfault is suppressed.
return a ? *a : 0
; pre-APX: branch around the load so a NULL pointer is never dereferenced
test rdi, rdi
je .damn ; a == NULL -> return 0
mov eax, dword ptr [rdi] ; eax = *a
ret
.damn:
xor eax, eax ; eax = 0
ret
; APX: branchless version; the load is only performed when a != NULL
; NOTE(review): the load form of CFCMOVcc may itself zero the destination when the
; condition is false, which would make the xor redundant - confirm against the APX spec.
xor eax, eax ; default result 0; has to come before test because xor rewrites FLAGS
test rdi, rdi
cfcmovne eax, dword ptr [rdi] ; a != NULL: eax = *a (dword, matching the pre-APX version); otherwise the load and its fault are suppressed
ret
This would've been cool, but APX doesn't support byte cmov stores; it could've made for nice conditional card marking in concurrent GCs:
Obj* rdi = ...;
if (card[rdi >> 9] != 0) card[rdi >> 9] = 0;
rdi->some_field = rcx
; r11 = address of the card byte for the object in rdi, i.e. &card[rdi >> 9]
mov r11, CARD_TABLE
mov r9, rdi
shr r9, 9 ; r9 = card index (rdi >> 9)
add r11, r9
+ xor r9, r9 ; r9 expired, we can reuse it (as the zero to store)
cmp byte [r11], 0 ; is the card already 0?
+ cfcmovne byte [r11], r9 ; not possible... :( (wished-for: store 0 only when the card is non-zero)
- je skip ; card already 0: skip the store
- ; don't write the card if it's already written to avoid contention
- mov byte [r11], 0
- skip:
; pointer.some_field = rcx;
mov [rdi + 16], rcx