// SSE round/floor (untested)
// Round each of the four float lanes of x to the nearest integer value and
// return the results as floats. Ties go to the even integer (so 0.5f -> 0.0f,
// 1.5f -> 2.0f), assuming the default round-to-nearest MXCSR rounding mode.
//
// Essentially the same algorithm as
// https://github.com/ispc/ispc/blob/master/builtins/target-sse2-common.ll#L153
// but with a slightly better implementation (orps on the magic constant
// instead of two xorps).
//
// How it works: adding a sign-matched 2^23 moves |x| into [2^23, 2^24], where
// consecutive floats are exactly 1 apart, so the addition itself performs the
// rounding; subtracting the same constant then recovers the integer.
//
// The 2^23 bias is applied only to lanes with |x| < 2^23. Larger magnitudes
// are already integers, and biasing them would push the sum past 2^24 where
// odd integers are not representable (8388609.0f used to come back as
// 8388608.0f). Infinities and NaNs fail the compare and pass through the
// +/-0 bias.
//
// NOTE: not extensively tested.
__m128 round_to_int(__m128 x)
{
    const __m128 sign_mask = _mm_set1_ps(-0.0f);     // only bit 31 set
    const __m128 two_pow_23 = _mm_set1_ps(8388608.0f);

    __m128 sign = _mm_and_ps(x, sign_mask);
    __m128 xabs = _mm_andnot_ps(sign_mask, x);

    // 2^23 where the lane is small enough to need rounding, 0.0f elsewhere.
    __m128 bias = _mm_and_ps(_mm_cmplt_ps(xabs, two_pow_23), two_pow_23);

    // Give the bias the sign of x so the sum grows in magnitude.
    __m128 magic = _mm_or_ps(sign, bias);
    __m128 round1 = _mm_add_ps(x, magic);
    __m128 round2 = _mm_sub_ps(round1, magic);

    // round2 is either correctly signed or +0.0f; OR-ing the sign back in
    // keeps negative inputs that round to zero as -0.0f.
    return _mm_or_ps(round2, sign);
}
__m128 floor(__m128 x) | |
{ | |
__m128 y = round_to_int(x); | |
__m128 cmp = _mm_cmplt_ps(x, y); // x < round(x)? | |
__m128 fudge = _mm_and_ps(cmp, _mm_set1_ps(1.0f)); // 1.0f if x < round(x) | |
__m128 result = _mm_sub_ps(y, fudge); | |
} | |
// Round-to-nearest-integer variant that also handles large magnitudes.
//
// Works on |x| and re-applies the sign at the end. The 2^23 bias is added and
// subtracted only in lanes where |x| < 2^23; everywhere else the bias is 0.0f
// and the lane passes through unchanged.
//
// Probably more work than converting through ints.
// (Still completely untested, mind.)
__m128 round_to_int_fixed(__m128 x)
{
    const __m128 two_pow_23 = _mm_set1_ps(8388608.0f);
    const __m128 abs_mask = _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff));

    // Split the input into magnitude and sign bit.
    __m128 magnitude = _mm_and_ps(x, abs_mask);
    __m128 sign_bit = _mm_andnot_ps(abs_mask, x);

    // 2^23 for lanes that still carry fraction bits, 0.0f otherwise.
    __m128 offset = _mm_and_ps(_mm_cmplt_ps(magnitude, two_pow_23), two_pow_23);

    // Adding then removing the offset discards the fraction bits.
    __m128 snapped = _mm_sub_ps(_mm_add_ps(magnitude, offset), offset);

    return _mm_xor_ps(snapped, sign_bit);
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
This comment has been minimized.
round_to_int(0.5f) and round_to_int_fixed(0.5f) return 0.0f - it seems this code has issues. Tested in MSVC 2013.