Created
February 4, 2020 07:16
-
-
Save IJzerbaard/78d680151f47b72903a91dabe87633e5 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
__m128i divu16(__m128i x, __m128i y, __m128i &remainder) | |
{ | |
__m128i bitrev1 = _mm_setr_epi8(0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15); | |
__m128i bitrev2 = _mm_slli_epi16(bitrev1, 4); | |
__m128i lownib = _mm_set1_epi8(15); | |
// swap adjacent bytes | |
__m128i bswap = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14); | |
__m128i yr = _mm_shuffle_epi8(y, bswap); | |
// swap adjacent nibbles and bit-reverse nibbles | |
yr = _mm_or_si128( | |
_mm_shuffle_epi8(bitrev1, _mm_and_si128(_mm_srli_epi16(yr, 4), lownib)), | |
_mm_shuffle_epi8(bitrev2, _mm_and_si128(yr, lownib))); | |
// isolate rightmost set bit | |
__m128i z = _mm_and_si128(yr, _mm_sub_epi16(_mm_setzero_si128(), yr)); | |
// accurate to slightly less than 1 bit, need 4 Newton iterations | |
__m128i my =_mm_sub_epi16(_mm_setzero_si128(), y); | |
z = _mm_add_epi16(z, _mm_mulhi_epu16(z, _mm_mullo_epi16(z, my))); | |
z = _mm_add_epi16(z, _mm_mulhi_epu16(z, _mm_mullo_epi16(z, my))); | |
z = _mm_add_epi16(z, _mm_mulhi_epu16(z, _mm_mullo_epi16(z, my))); | |
z = _mm_add_epi16(z, _mm_mulhi_epu16(z, _mm_mullo_epi16(z, my))); | |
__m128i q = _mm_mulhi_epu16(x, z); | |
__m128i r = _mm_sub_epi16(x, _mm_mullo_epi16(y, q)); | |
// adjust for residual inaccuracy | |
__m128i m; | |
m = _mm_cmpeq_epi16(r, _mm_max_epu16(r, y)); | |
r = _mm_sub_epi16(r, _mm_and_si128(y, m)); | |
q = _mm_sub_epi16(q, m); | |
m = _mm_cmpeq_epi16(r, _mm_max_epu16(r, y)); | |
r = _mm_sub_epi16(r, _mm_and_si128(y, m)); | |
q = _mm_sub_epi16(q, m); | |
remainder = r; | |
return q; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment