rygorous/fp16_to_32.asm

## fp16_to_32.asm
; input: 4x F16 in XMM0 (low words of each DWord)
; original idea+implementation by Dean Macri

; WARNING: this version was copied and pasted together from other code and is
; untested, although the original version was definitely correct.

bits 32

section .data

    ; Constant pool for f16tof32.  Every entry is read as a 128-bit memory
    ; operand of a legacy (non-VEX) SSE instruction, and those fault when
    ; the address is not 16-byte aligned.  Each entry is a multiple of
    ; 16 bytes long, so aligning the first one aligns all of them.
    align 16, db 0

    FP32_no_sign                times 4 dd 0x7FFFFFFF   ; all bits except the sign
    FP32_sign_bit               times 4 dd 0x80000000   ; the sign bit alone

    FP16_FP32_sgn_adj           times 4 dd 0x70000000   ; +7 in bits 28..31: carries a sign at bit 28 into bit 31
    FP16_FP32_sgn_adj2          times 4 dd 0x8FFFFFFF   ; clears bits 28..30 after that add
    FP16_FP32_denorm            times 4 dd 0x38000000   ; 2^-15 as an F32 (threshold for the denormal fix-up)
    FP16_FP32_denorm_adj        times 4 dd 0x00800000   ; one step of the F32 exponent field (1 << 23)
    FP16_FP32_exp_adj           times 4 dd 0x38000000   ; exponent rebias, (127 - 15) << 23
    FP16_exp_shifted            times 4 dd 0x0F800000   ; F16 exponent field once shifted left by 13 (bits 23..27)
    FP16_exp_adjust_for_NaN     times 4 dd 0x7F800000   ; F32 exponent field, all ones
    FP16_exp_adjust_for_Zero    times 4 dd 0x7F800000   ; F32 exponent field mask

    ; 16 rows of 16 bytes.  The row number is the 4-bit lane sign mask that
    ; movmskps produces; within a row, lane j holds -2^-15 (0xB8000000) when
    ; bit j of the row number is set and +2^-15 (0x38000000) otherwise.
    FP16_FP32_exp_adj_for_Zero  dd 0x38000000, 0x38000000, 0x38000000, 0x38000000
                                dd 0xB8000000, 0x38000000, 0x38000000, 0x38000000
                                dd 0x38000000, 0xB8000000, 0x38000000, 0x38000000
                                dd 0xB8000000, 0xB8000000, 0x38000000, 0x38000000
                                dd 0x38000000, 0x38000000, 0xB8000000, 0x38000000
                                dd 0xB8000000, 0x38000000, 0xB8000000, 0x38000000
                                dd 0x38000000, 0xB8000000, 0xB8000000, 0x38000000
                                dd 0xB8000000, 0xB8000000, 0xB8000000, 0x38000000
                                dd 0x38000000, 0x38000000, 0x38000000, 0xB8000000
                                dd 0xB8000000, 0x38000000, 0x38000000, 0xB8000000
                                dd 0x38000000, 0xB8000000, 0x38000000, 0xB8000000
                                dd 0xB8000000, 0xB8000000, 0x38000000, 0xB8000000
                                dd 0x38000000, 0x38000000, 0xB8000000, 0xB8000000
                                dd 0xB8000000, 0x38000000, 0xB8000000, 0xB8000000
                                dd 0x38000000, 0xB8000000, 0xB8000000, 0xB8000000
                                dd 0xB8000000, 0xB8000000, 0xB8000000, 0xB8000000


section .text

;-----------------------------------------------------------------------
; f16tof32 -- widen four half-precision floats to single precision.
;
; In:    xmm0 = four dwords, one F16 in bits 0..15 of each.  Bits 16..18
;               of every dword must be clear; bits 19..31 are shifted out.
; Out:   xmm0 = the same four values encoded as F32.
; Clobb: eax, edx, xmm1, xmm2, flags.
;-----------------------------------------------------------------------
f16tof32:
    ; Line the F16 fields up with the F32 layout: mantissa in bits 13..22,
    ; exponent in bits 23..27, sign (for now) in bit 28.
    pslld     xmm0, 13

    ; Carry the sign from bit 28 up to bit 31.  Adding 7 to the top nibble
    ; gives 8 when the sign is set and 7 when it is clear; masking off
    ; bits 28..30 then leaves only the sign, in its final position.
    paddd     xmm0, [FP16_FP32_sgn_adj]
    pand      xmm0, [FP16_FP32_sgn_adj2]

    ; Inf/NaN: xmm1 becomes the all-ones F32 exponent in every lane whose
    ; five F16 exponent bits are all set, and zero in the other lanes.
    movdqa    xmm1, xmm0
    pand      xmm1, [FP16_exp_shifted]
    pcmpeqd   xmm1, [FP16_exp_shifted]
    pand      xmm1, [FP16_exp_adjust_for_NaN]

    ; Rebias the exponent (F16 bias 15 -> F32 bias 127, i.e. +112), then
    ; force the Inf/NaN lanes to exponent 255.
    paddd     xmm0, [FP16_FP32_exp_adj]
    por       xmm0, xmm1

    ; Zero/denormal detection.  After the rebias, a lane's exponent field
    ; is 112 exactly when its F16 exponent was 0.  In those lanes xmm1 gets
    ; 2^-15 carrying the lane's own sign; every other lane gets 0.  The
    ; signed constants come from a 16-row table selected by the four sign
    ; bits (rows are 16 bytes apart, hence the shift by 4).
    movmskps  eax, xmm0
    shl       eax, 4
    movdqa    xmm1, xmm0
    pand      xmm1, [FP16_exp_adjust_for_Zero]
    pcmpeqd   xmm1, [FP16_FP32_exp_adj]
    mov       edx, FP16_FP32_exp_adj_for_Zero
    pand      xmm1, [edx + eax]

    ; Floating-point subtract: (1 + m/1024) * 2^-15 - 2^-15 = m/1024 * 2^-15.
    ; This removes the implicit leading one that the rebias invented for
    ; denormals and turns zero lanes into 0.0.  x - x yields +0 under the
    ; default rounding mode, so a negative zero loses its sign here; it is
    ; repaired at the end.
    subps     xmm0, xmm1

    ; Denormal lanes are now half their true value (an F16 denormal is
    ; m/1024 * 2^-14).  Each lane with |x| <= 2^-15 has 1 added to its
    ; exponent field, doubling it; normal lanes are at least 2^-14 and
    ; are left alone.
    movaps    xmm2, xmm0
    andps     xmm2, [FP32_no_sign]
    cmpleps   xmm2, [FP16_FP32_denorm]
    andps     xmm2, [FP16_FP32_denorm_adj]
    paddd     xmm0, xmm2

    ; The exponent bump also turned exact zeros into 0x00800000; clear
    ; every lane that holds precisely that pattern.
    movdqa    xmm2, xmm0
    pcmpeqd   xmm0, [FP16_FP32_denorm_adj]
    pandn     xmm0, xmm2

    ; xmm1 still holds the signed 2^-15 for the zero/denormal lanes; OR its
    ; sign bit back in so that -0.0 comes out negative.
    pand      xmm1, [FP32_sign_bit]
    por       xmm0, xmm1

    ret
	; input: 4x F16 in XMM0 (low words of each DWord)
	; original idea+implementation by Dean Macri

	; WARNING: this version was copied and pasted together from other code and is
	; untested, although the original version was definitely correct.

	bits 32

	section .data

	; Constant pool for f16tof32.  Every entry is read as a 128-bit memory
	; operand of a legacy (non-VEX) SSE instruction, and those fault when
	; the address is not 16-byte aligned.  Each entry is a multiple of
	; 16 bytes long, so aligning the first one aligns all of them.
	align 16, db 0

	FP32_no_sign times 4 dd 0x7FFFFFFF              ; all bits except the sign
	FP32_sign_bit times 4 dd 0x80000000             ; the sign bit alone

	FP16_FP32_sgn_adj times 4 dd 0x70000000         ; +7 in bits 28..31: carries a sign at bit 28 into bit 31
	FP16_FP32_sgn_adj2 times 4 dd 0x8FFFFFFF        ; clears bits 28..30 after that add
	FP16_FP32_denorm times 4 dd 0x38000000          ; 2^-15 as an F32 (threshold for the denormal fix-up)
	FP16_FP32_denorm_adj times 4 dd 0x00800000      ; one step of the F32 exponent field (1 << 23)
	FP16_FP32_exp_adj times 4 dd 0x38000000         ; exponent rebias, (127 - 15) << 23
	FP16_exp_shifted times 4 dd 0x0F800000          ; F16 exponent field once shifted left by 13 (bits 23..27)
	FP16_exp_adjust_for_NaN times 4 dd 0x7F800000   ; F32 exponent field, all ones
	FP16_exp_adjust_for_Zero times 4 dd 0x7F800000  ; F32 exponent field mask

	; 16 rows of 16 bytes.  The row number is the 4-bit lane sign mask that
	; movmskps produces; within a row, lane j holds -2^-15 (0xB8000000) when
	; bit j of the row number is set and +2^-15 (0x38000000) otherwise.
	FP16_FP32_exp_adj_for_Zero dd 0x38000000, 0x38000000, 0x38000000, 0x38000000
	dd 0xB8000000, 0x38000000, 0x38000000, 0x38000000
	dd 0x38000000, 0xB8000000, 0x38000000, 0x38000000
	dd 0xB8000000, 0xB8000000, 0x38000000, 0x38000000
	dd 0x38000000, 0x38000000, 0xB8000000, 0x38000000
	dd 0xB8000000, 0x38000000, 0xB8000000, 0x38000000
	dd 0x38000000, 0xB8000000, 0xB8000000, 0x38000000
	dd 0xB8000000, 0xB8000000, 0xB8000000, 0x38000000
	dd 0x38000000, 0x38000000, 0x38000000, 0xB8000000
	dd 0xB8000000, 0x38000000, 0x38000000, 0xB8000000
	dd 0x38000000, 0xB8000000, 0x38000000, 0xB8000000
	dd 0xB8000000, 0xB8000000, 0x38000000, 0xB8000000
	dd 0x38000000, 0x38000000, 0xB8000000, 0xB8000000
	dd 0xB8000000, 0x38000000, 0xB8000000, 0xB8000000
	dd 0x38000000, 0xB8000000, 0xB8000000, 0xB8000000
	dd 0xB8000000, 0xB8000000, 0xB8000000, 0xB8000000


	section .text

	;-----------------------------------------------------------------------
	; f16tof32 -- widen four half-precision floats to single precision.
	;
	; In:    xmm0 = four dwords, one F16 in bits 0..15 of each.  Bits 16..18
	;               of every dword must be clear; bits 19..31 are shifted out.
	; Out:   xmm0 = the same four values encoded as F32.
	; Clobb: eax, edx, xmm1, xmm2, flags.
	;-----------------------------------------------------------------------
	f16tof32:
	; Line the F16 fields up with the F32 layout: mantissa in bits 13..22,
	; exponent in bits 23..27, sign (for now) in bit 28.
	pslld xmm0, 13

	; Carry the sign from bit 28 up to bit 31.  Adding 7 to the top nibble
	; gives 8 when the sign is set and 7 when it is clear; masking off
	; bits 28..30 then leaves only the sign, in its final position.
	paddd xmm0, [FP16_FP32_sgn_adj]
	pand xmm0, [FP16_FP32_sgn_adj2]

	; Inf/NaN: xmm1 becomes the all-ones F32 exponent in every lane whose
	; five F16 exponent bits are all set, and zero in the other lanes.
	movdqa xmm1, xmm0
	pand xmm1, [FP16_exp_shifted]
	pcmpeqd xmm1, [FP16_exp_shifted]
	pand xmm1, [FP16_exp_adjust_for_NaN]

	; Rebias the exponent (F16 bias 15 -> F32 bias 127, i.e. +112), then
	; force the Inf/NaN lanes to exponent 255.
	paddd xmm0, [FP16_FP32_exp_adj]
	por xmm0, xmm1

	; Zero/denormal detection.  After the rebias, a lane's exponent field
	; is 112 exactly when its F16 exponent was 0.  In those lanes xmm1 gets
	; 2^-15 carrying the lane's own sign; every other lane gets 0.  The
	; signed constants come from a 16-row table selected by the four sign
	; bits (rows are 16 bytes apart, hence the shift by 4).
	movmskps eax, xmm0
	shl eax, 4
	movdqa xmm1, xmm0
	pand xmm1, [FP16_exp_adjust_for_Zero]
	pcmpeqd xmm1, [FP16_FP32_exp_adj]
	mov edx, FP16_FP32_exp_adj_for_Zero
	pand xmm1, [edx + eax]

	; Floating-point subtract: (1 + m/1024) * 2^-15 - 2^-15 = m/1024 * 2^-15.
	; This removes the implicit leading one that the rebias invented for
	; denormals and turns zero lanes into 0.0.  x - x yields +0 under the
	; default rounding mode, so a negative zero loses its sign here; it is
	; repaired at the end.
	subps xmm0, xmm1

	; Denormal lanes are now half their true value (an F16 denormal is
	; m/1024 * 2^-14).  Each lane with |x| <= 2^-15 has 1 added to its
	; exponent field, doubling it; normal lanes are at least 2^-14 and
	; are left alone.
	movaps xmm2, xmm0
	andps xmm2, [FP32_no_sign]
	cmpleps xmm2, [FP16_FP32_denorm]
	andps xmm2, [FP16_FP32_denorm_adj]
	paddd xmm0, xmm2

	; The exponent bump also turned exact zeros into 0x00800000; clear
	; every lane that holds precisely that pattern.
	movdqa xmm2, xmm0
	pcmpeqd xmm0, [FP16_FP32_denorm_adj]
	pandn xmm0, xmm2

	; xmm1 still holds the signed 2^-15 for the zero/denormal lanes; OR its
	; sign bit back in so that -0.0 comes out negative.
	pand xmm1, [FP32_sign_bit]
	por xmm0, xmm1

	ret