Skip to content

Instantly share code, notes, and snippets.

@IJzerbaard
Created February 4, 2020 07:16
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save IJzerbaard/78d680151f47b72903a91dabe87633e5 to your computer and use it in GitHub Desktop.
Save IJzerbaard/78d680151f47b72903a91dabe87633e5 to your computer and use it in GitHub Desktop.
/**
 * Lane-wise unsigned division of eight packed 16-bit integers.
 *
 * The quotient is obtained without a divide instruction: a crude estimate
 * of 2^16 / y is derived from the position of the highest set bit of y,
 * refined with Newton iterations, multiplied by x, and finally corrected
 * with two conditional subtractions.
 *
 * Uses _mm_shuffle_epi8 (SSSE3) and _mm_max_epu16 (SSE4.1).
 *
 * @param x          dividends, eight unsigned 16-bit lanes
 * @param y          divisors, eight unsigned 16-bit lanes; lanes equal to
 *                   zero are not special-cased and do not yield a
 *                   meaningful quotient
 * @param remainder  receives x - y * quotient for every lane
 * @return           the eight quotients
 */
__m128i divu16(__m128i x, __m128i y, __m128i &remainder)
{
    const __m128i zero = _mm_setzero_si128();
    const __m128i nibble_mask = _mm_set1_epi8(15);

    // Table lookup: entry i is the 4-bit reversal of i, delivered in the low nibble.
    const __m128i rev_into_low = _mm_setr_epi8(0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15);
    // Same table, with the reversed value delivered in the high nibble instead.
    const __m128i rev_into_high = _mm_slli_epi16(rev_into_low, 4);
    // Shuffle control that exchanges the two bytes of every 16-bit lane.
    const __m128i swap_bytes = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);

    // Reverse the bit order of each 16-bit divisor: swap the bytes, then
    // within each byte swap the nibbles while bit-reversing each of them.
    const __m128i swapped = _mm_shuffle_epi8(y, swap_bytes);
    const __m128i high_nibbles = _mm_and_si128(_mm_srli_epi16(swapped, 4), nibble_mask);
    const __m128i low_nibbles = _mm_and_si128(swapped, nibble_mask);
    const __m128i reversed = _mm_or_si128(
        _mm_shuffle_epi8(rev_into_low, high_nibbles),
        _mm_shuffle_epi8(rev_into_high, low_nibbles));

    // The lowest set bit of the reversed divisor mirrors the highest set bit
    // of y, which gives a power-of-two starting estimate of 2^16 / y.
    __m128i recip = _mm_and_si128(reversed, _mm_sub_epi16(zero, reversed));

    // The seed is good to a little under one bit, so four Newton steps are
    // applied. Each step is recip += hi16(recip * lo16(recip * -y)), where
    // lo16(recip * -y) equals (2^16 - recip * y) modulo 2^16.
    const __m128i neg_y = _mm_sub_epi16(zero, y);
    for (int step = 0; step < 4; ++step) {
        const __m128i error = _mm_mullo_epi16(recip, neg_y);
        recip = _mm_add_epi16(recip, _mm_mulhi_epu16(recip, error));
    }

    // Candidate quotient and the remainder that goes with it.
    __m128i quot = _mm_mulhi_epu16(x, recip);
    __m128i rem = _mm_sub_epi16(x, _mm_mullo_epi16(y, quot));

    // Two fix-up rounds for the leftover inaccuracy: wherever rem >= y
    // (unsigned, expressed as rem == max(rem, y)) subtract y from the
    // remainder and bump the quotient. The mask is all-ones (-1) in such
    // lanes, so subtracting it adds one.
    for (int round = 0; round < 2; ++round) {
        const __m128i too_big = _mm_cmpeq_epi16(rem, _mm_max_epu16(rem, y));
        rem = _mm_sub_epi16(rem, _mm_and_si128(y, too_big));
        quot = _mm_sub_epi16(quot, too_big);
    }

    remainder = rem;
    return quot;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment