Last active
January 15, 2024 01:20
-
-
Save pps83/3210a2f980fd02bb2ba2e5a1fc4a2ef0 to your computer and use it in GitHub Desktop.
__builtin_ctz (ctzl, ctzll) and __builtin_clz (clzl, clzll) for Visual Studio
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/*
 * MSVC shims for the GCC/Clang bit-scan builtins.
 *
 * Maps __builtin_ctz/ctzl/ctzll and __builtin_clz/clzl/clzll onto the
 * TZCNT/LZCNT intrinsics from <intrin.h>.
 *
 * NOTE(review): _tzcnt_u32/_lzcnt_u32 require BMI1/LZCNT-capable CPUs;
 * on older hardware they silently execute as BSF/BSR with different
 * zero-input behavior.  As with the real builtins, the result for
 * x == 0 should be treated as unspecified.
 */
#ifdef _MSC_VER
#include <intrin.h>

/* Count trailing zero bits of a 32-bit value. */
static inline int __builtin_ctz(unsigned x)
{
    return (int)_tzcnt_u32(x);
}

/* Count trailing zero bits of a 64-bit value. */
static inline int __builtin_ctzll(unsigned long long x)
{
#ifdef _WIN64
    return (int)_tzcnt_u64(x);
#else
    /* 32-bit build: scan the low half first; if it is all zero, the
       answer is 32 plus the scan of the high half.  Casts are spelled
       C-style: the original used the C++-only form `unsigned(x)`,
       which does not compile as C. */
    return ((unsigned)x != 0u) ? __builtin_ctz((unsigned)x)
                               : 32 + __builtin_ctz((unsigned)(x >> 32));
#endif
}

/* unsigned long is 32-bit on Windows (LLP64); the sizeof check keeps
   the dispatch correct if this header is ever used elsewhere. */
static inline int __builtin_ctzl(unsigned long x)
{
    return sizeof(x) == 8 ? __builtin_ctzll(x) : __builtin_ctz((unsigned)x);
}

/* Count leading zero bits of a 32-bit value. */
static inline int __builtin_clz(unsigned x)
{
    return (int)_lzcnt_u32(x);
}

/* Count leading zero bits of a 64-bit value. */
static inline int __builtin_clzll(unsigned long long x)
{
#ifdef _WIN64
    return (int)_lzcnt_u64(x);
#else
    /* 32-bit build: scan the high half first; if it is all zero, the
       answer is 32 plus the scan of the low half. */
    return ((unsigned)(x >> 32) != 0u) ? __builtin_clz((unsigned)(x >> 32))
                                       : 32 + __builtin_clz((unsigned)x);
#endif
}

static inline int __builtin_clzl(unsigned long x)
{
    return sizeof(x) == 8 ? __builtin_clzll(x) : __builtin_clz((unsigned)x);
}
#endif /* _MSC_VER */
@pps83 thanks see my updates too, still digging up between targets ; that's funny or not, this simple thing is @$##^&&% to get right ; just for getting a bit_width
, all I want that's a portable bit_width
operator jeez ; soft emulation takes no ROM ; maybe with a small lookup table I could shrink down the number of ops... whatever just want bit_width, can I get that easy... it seems not. ICC have _lzcnt_u64, _lzcnt_u32 and IBM XL __cntlz4, __cntlz8, but they have now a clang front-end so ; clang forever. ```update ARMCC has __rbit() and __clz() however that's a clang fork so... just logging for posterity.
For a soft-emulation fallback, the following is the best performer
according to my CPU benchmarks:
__mu0_static_inline__
int __mu0_cntlz_i__(const unsigned int __x)
{
	/* Count leading zeros of a 32-bit unsigned int in software: smear the
	 * highest set bit downward so x becomes 2^(msb+1) - 1, then locate the
	 * msb with a de Bruijn multiply (table and constant 0x07C4ACDD are the
	 * standard pair from "Bit Twiddling Hacks").  Returns the full digit
	 * count for __x == 0.
	 * NOTE(review): the 16-bit smear and the 32-entry table assume
	 * __mu0_bit_digits_i__() == 32 -- confirm for exotic targets. */
	__mu0_static__
	const unsigned char s_table[32] =
	{
		  0,  9,  1, 10, 13, 21,  2, 29
		, 11, 14, 16, 18, 22, 25,  3, 30
		,  8, 12, 20, 28, 15, 17, 24,  7
		, 19, 27, 23,  6, 26,  5,  4, 31
	};
	const unsigned int d = __mu0_bit_digits_i__();
	unsigned int x = __x;
	if (x) {
		x = x | (x >>  1U);
		x = x | (x >>  2U);
		x = x | (x >>  4U);
		x = x | (x >>  8U);
		x = x | (x >> 16U);
		/* (x * K) >> 27 already lies in [0, 31]; the original's extra
		 * `% d` forced a genuine runtime division, since `d` is the
		 * result of a call and not a compile-time constant. */
		return (int)((d - 1U) - s_table[(x * 0x07C4ACDDU) >> 27U]);
	}
	return (int)d;
}
# if 0
/* Disabled alternative implementation kept for reference: a branchy
 * binary search for the most significant set bit.  Each step tests the
 * upper half of the remaining width (16/8/4/2/1 bits), narrowing `x`
 * and subtracting from the digit count `d` accordingly; the final step
 * resolves the last two candidate positions via `d - 2U` / `d - x`.
 * Returns the full digit count for __x == 0, matching the active
 * version.  Compiled out with `#if 0` -- presumably because it
 * benchmarked worse than the de Bruijn table version (see the
 * surrounding comment about CPU benchmarks); confirm before re-enabling. */
__mu0_static_inline__
int __mu0_cntlz_i__(const unsigned int __x)
{
unsigned int x, d, y;
if (__x) {
x = __x;
d = __mu0_bit_digits_i__();
y = x >> 16U; if (y != 0U) { d = d - 16U; x = y; }
y = x >> 8U; if (y != 0U) { d = d - 8U; x = y; }
y = x >> 4U; if (y != 0U) { d = d - 4U; x = y; }
y = x >> 2U; if (y != 0U) { d = d - 2U; x = y; }
y = x >> 1U;
return __mu0_cast__(int, ((y != 0U) ? d - 2U : d - x));
}
return __mu0_bit_digits_i__();
}
# endif
__mu0_static_inline__
int __mu0_cntlz_ll__(const unsigned long long __x)
{
	/* Count leading zeros of a 64-bit value by splitting it into two
	 * 32-bit halves: if any bit of the high half is set, the answer is
	 * the clz of the high half; otherwise it is the digit count (32)
	 * plus the clz of the low half.
	 * Testing `__x >> d` instead of masking with 0xFFFFFFFF00000000
	 * removes the 64-bit literal entirely -- the original's pre-C99
	 * branch spelled it without the ULL suffix, which pre-C99 cannot
	 * represent -- so no C99/C++11 dispatch is needed any more. */
	const unsigned int d  = __mu0_bit_digits_i__();
	const unsigned int hi = (unsigned int)(__x >> d);
	if (hi != 0U) {
		return __mu0_cntlz_i__(hi);
	}
	return (int)d + __mu0_cntlz_i__((unsigned int)__x);
}
__mu0_static_inline__
int __mu0_cntlz_l__(const unsigned long __x)
{
	/* Dispatch on the width of `unsigned long`: 64-bit on LP64 ABIs
	 * (Unix-like 64-bit targets), 32-bit otherwise (ILP32 and
	 * LLP64/Windows).  The compiler-predefined LP64 macros are
	 * `__LP64__` and `_LP64`; the original tested the misspelled
	 * `__LP64`, which no compiler defines, so the 64-bit path could
	 * never be selected via that half of the condition. */
# if defined(__LP64__) || defined(_LP64)
	return __mu0_cntlz_ll__(__x);
# else
	return __mu0_cntlz_i__((unsigned int)__x);
# endif
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
I updated the code to support 32/64-bit builds. It requires BMI-capable CPUs,
though (it doesn't try to fall back to
`bsr`): https://godbolt.org/z/G766KEM6j