Skip to content

Instantly share code, notes, and snippets.

@rygorous
Created March 21, 2012 04:37
Show Gist options
  • Save rygorous/2144419 to your computer and use it in GitHub Desktop.
Save rygorous/2144419 to your computer and use it in GitHub Desktop.
half->float using SSE2
; input: 4x F16 in XMM0 (low words of each DWord)
; original idea+implementation by Dean Macri
; WARNING: copy & pasted together from other code, this ver is untested!!
; though the original version was definitely correct.
bits 32

; ----------------------------------------------------------------------
; Constants for f16tof32.
;
; Every item here is read as a 128-bit memory operand by legacy SSE
; instructions (paddd / pand / pcmpeqd / movaps / cmpleps ...). Those
; fault with #GP unless the address is 16-byte aligned, so the section is
; explicitly given 16-byte alignment. Each item is a multiple of 16 bytes
; long, so aligning the first label keeps all the following ones aligned.
; ----------------------------------------------------------------------
section .data align=16
align 16, db 0

; F32 masks: everything but the sign / only the sign.
FP32_no_sign                times 4 dd 0x7FFFFFFF
FP32_sign_bit               times 4 dd 0x80000000

; After "pslld 13" the F16 sign sits in bit 28. Adding 0x70000000 carries
; a set bit 28 into bit 31; the mask then clears bits 28..30.
FP16_FP32_sgn_adj           times 4 dd 0x70000000
FP16_FP32_sgn_adj2          times 4 dd 0x8FFFFFFF

; 0x38000000 is the float 2^-15 (biased exponent 112 = 127 - 15).
FP16_FP32_denorm            times 4 dd 0x38000000
; One step of the F32 exponent field (adding it doubles a float).
FP16_FP32_denorm_adj        times 4 dd 0x00800000
; Exponent rebias from F16 (bias 15) to F32 (bias 127): 112 << 23.
FP16_FP32_exp_adj           times 4 dd 0x38000000

; F16 exponent field (5 bits) after the shift by 13: bits 23..27.
FP16_exp_shifted            times 4 dd 0x0F800000
; F32 exponent field; OR-ed in to force an all-ones exponent (Inf/NaN) ...
FP16_exp_adjust_for_NaN     times 4 dd 0x7F800000
; ... and used as a mask to isolate the exponent for the zero/denorm test.
FP16_exp_adjust_for_Zero    times 4 dd 0x7F800000

; Lookup table indexed by (movmskps result * 16). Entry k holds 2^-15 in
; every lane, negated (0xB8000000) in lane i when bit i of k is set, i.e.
; when lane i of the value being converted is negative.
FP16_FP32_exp_adj_for_Zero:
        dd 0x38000000, 0x38000000, 0x38000000, 0x38000000   ; 0000b
        dd 0xB8000000, 0x38000000, 0x38000000, 0x38000000   ; 0001b
        dd 0x38000000, 0xB8000000, 0x38000000, 0x38000000   ; 0010b
        dd 0xB8000000, 0xB8000000, 0x38000000, 0x38000000   ; 0011b
        dd 0x38000000, 0x38000000, 0xB8000000, 0x38000000   ; 0100b
        dd 0xB8000000, 0x38000000, 0xB8000000, 0x38000000   ; 0101b
        dd 0x38000000, 0xB8000000, 0xB8000000, 0x38000000   ; 0110b
        dd 0xB8000000, 0xB8000000, 0xB8000000, 0x38000000   ; 0111b
        dd 0x38000000, 0x38000000, 0x38000000, 0xB8000000   ; 1000b
        dd 0xB8000000, 0x38000000, 0x38000000, 0xB8000000   ; 1001b
        dd 0x38000000, 0xB8000000, 0x38000000, 0xB8000000   ; 1010b
        dd 0xB8000000, 0xB8000000, 0x38000000, 0xB8000000   ; 1011b
        dd 0x38000000, 0x38000000, 0xB8000000, 0xB8000000   ; 1100b
        dd 0xB8000000, 0x38000000, 0xB8000000, 0xB8000000   ; 1101b
        dd 0x38000000, 0xB8000000, 0xB8000000, 0xB8000000   ; 1110b
        dd 0xB8000000, 0xB8000000, 0xB8000000, 0xB8000000   ; 1111b
section .text

;-----------------------------------------------------------------------
; f16tof32 -- convert four half-precision values to single precision
;             using SSE2 only (32-bit code).
;
; In:    xmm0 = 4 x F16, one in the low word of each dword. The high
;               word of each dword is assumed to be zero: the sign
;               relocation below relies on bits 29..31 being clear after
;               the shift.
; Out:   xmm0 = 4 x F32
; Clobb: eax, edx, xmm1, xmm2, flags
; Notes: all constants are used as 128-bit SSE memory operands and so
;        must be located on 16-byte boundaries.
;-----------------------------------------------------------------------
f16tof32:
        ; Shift the F16 fields into F32 position:
        ; mantissa -> bits 13..22, exponent -> bits 23..27, sign -> bit 28.
        pslld   xmm0, 13

        ; Move the sign from bit 28 to bit 31: the add carries a set
        ; bit 28 up into bit 31, the mask then clears bits 28..30.
        paddd   xmm0, [FP16_FP32_sgn_adj]
        pand    xmm0, [FP16_FP32_sgn_adj2]

        ; Keep a copy with the un-rebiased exponent, then rebias
        ; the exponent from 15 to 127.
        movdqa  xmm1, xmm0
        paddd   xmm0, [FP16_FP32_exp_adj]

        ; Inf/NaN: lanes whose F16 exponent was all ones get an
        ; all-ones F32 exponent.
        pand    xmm1, [FP16_exp_shifted]
        pcmpeqd xmm1, [FP16_exp_shifted]
        pand    xmm1, [FP16_exp_adjust_for_NaN]
        por     xmm0, xmm1

        ; Zeros/denormals. These are the lanes whose F16 biased exponent
        ; was zero, i.e. whose exponent field now equals the rebias value.
        ; For them the rebias created an implicit leading one that the
        ; original value did not have, so +/-2^-15 (sign matching the
        ; lane) is subtracted again. The signed constant comes from a
        ; 16-entry table indexed by the four lane sign bits.
        lea     edx, [FP16_FP32_exp_adj_for_Zero]
        movmskps eax, xmm0                      ; eax = lane sign bits (0..15)
        movdqa  xmm1, xmm0
        shl     eax, 4                          ; 16 bytes per table entry
        pand    xmm1, [FP16_exp_adjust_for_Zero]
        pcmpeqd xmm1, [FP16_FP32_exp_adj]
        pand    xmm1, [edx + eax]               ; +/-2^-15 in zero/denorm lanes, else 0

        ; Remove the implicit one (denormals) / collapse to zero (zeros).
        ; This loses the sign of a negative zero; it is restored at the end.
        subps   xmm0, xmm1

        ; Lanes with |value| <= 2^-15 are the former denormals (and zeros).
        ; They are still a factor of two too small, so bump their exponent.
        movaps  xmm2, [FP32_no_sign]
        andps   xmm2, xmm0
        cmpleps xmm2, [FP16_FP32_denorm]
        andps   xmm2, [FP16_FP32_denorm_adj]
        paddd   xmm0, xmm2

        ; Get the zeros back: a true zero now has magnitude bits equal to
        ; the exponent bump alone. The sign is masked off before the
        ; compare because x - x yields -0 instead of +0 when MXCSR is set
        ; to round toward -infinity; without the mask such a lane would
        ; not be recognised and a zero input would come out as +/-2^-126.
        movdqa  xmm2, xmm0
        pand    xmm0, [FP32_no_sign]
        pcmpeqd xmm0, [FP16_FP32_denorm_adj]
        andnps  xmm0, xmm2                      ; clear the lanes that were zero

        ; Put the sign bits back (xmm1 still carries the sign of every
        ; zero/denormal lane).
        pand    xmm1, [FP32_sign_bit]
        por     xmm0, xmm1

        ret
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment