-
-
Save pps83/3210a2f980fd02bb2ba2e5a1fc4a2ef0 to your computer and use it in GitHub Desktop.
#ifdef _MSC_VER
/*
 * MSVC shim: provide the GCC/Clang __builtin_ctz* / __builtin_clz* bit-scan
 * builtins on top of the TZCNT/LZCNT intrinsics.
 *
 * NOTE(review): _tzcnt_u32/_lzcnt_u32 require BMI1/LZCNT-capable CPUs; on
 * older hardware they execute as BSF/BSR, whose result for a zero input
 * differs.  As with the real builtins, the result for x == 0 is undefined,
 * so callers must never pass 0.
 */
#include <intrin.h>

/* Count trailing zero bits of a nonzero 32-bit value. */
static inline int __builtin_ctz(unsigned x)
{
    return (int)_tzcnt_u32(x);
}

/* Count trailing zero bits of a nonzero 64-bit value. */
static inline int __builtin_ctzll(unsigned long long x)
{
#ifdef _WIN64
    return (int)_tzcnt_u64(x);
#else
    /* 32-bit build: no 64-bit tzcnt, so split into two 32-bit scans.
     * (Fixed: the original used C++ cast syntax `unsigned(x)`, invalid in C.) */
    return ((unsigned)x != 0u) ? __builtin_ctz((unsigned)x)
                               : 32 + __builtin_ctz((unsigned)(x >> 32));
#endif
}

/* unsigned long is 32-bit on Windows; the sizeof dispatch folds at compile time. */
static inline int __builtin_ctzl(unsigned long x)
{
    return sizeof(x) == 8 ? __builtin_ctzll(x) : __builtin_ctz((unsigned)x);
}

/* Count leading zero bits of a nonzero 32-bit value. */
static inline int __builtin_clz(unsigned x)
{
    return (int)_lzcnt_u32(x);
}

/* Count leading zero bits of a nonzero 64-bit value. */
static inline int __builtin_clzll(unsigned long long x)
{
#ifdef _WIN64
    return (int)_lzcnt_u64(x);
#else
    /* 32-bit build: scan the high word first, else the high word
     * contributes 32 leading zeros and the low word is scanned. */
    return ((unsigned)(x >> 32) != 0u) ? __builtin_clz((unsigned)(x >> 32))
                                       : 32 + __builtin_clz((unsigned)x);
#endif
}

static inline int __builtin_clzl(unsigned long x)
{
    return sizeof(x) == 8 ? __builtin_clzll(x) : __builtin_clz((unsigned)x);
}
#endif /* _MSC_VER */
@gvanem yes something like that (sorry for my code with highly decorated indirections)
edit (better to see in context)
https://github.com/mu578/mu0/blob/master/mu0/mu0_definition/mu0_bitwiseop.h
I updated the code to support 32/64-bit builds. It's BMI+ only, though (it doesn't try to fall back to `bsr`).
@pps83 thanks, see my updates too — still digging through target differences. Funny or not, this simple thing is @$##^&&% to get right, just to obtain a bit_width. All I want is a portable bit_width operator, jeez; soft emulation takes no ROM, and maybe with a small lookup table I could shrink the op count. ICC has `_lzcnt_u64`/`_lzcnt_u32`, and IBM XL has `__cntlz4`/`__cntlz8`, but XL now ships a clang front-end, so: clang forever. Update: ARMCC has `__rbit()` and `__clz()`, but that is a clang fork too — just logging for posterity.
For a soft-emulation fallback, the following is the best performer (according to my CPU benchmarks):
__mu0_static_inline__
int __mu0_cntlz_i__(const unsigned int __x)
{
	/* Portable count-leading-zeros fallback (no hardware bit-scan).
	 * Smears the highest set bit downward, then locates it with a
	 * de Bruijn-style multiply + 32-entry lookup (Bit Twiddling Hacks
	 * "log base 2 with multiply and lookup" table).  A zero input
	 * yields the full digit count. */
	__mu0_static__
	const unsigned char s_table[32] =
	{
		  0,  9,  1, 10, 13, 21,  2, 29
		, 11, 14, 16, 18, 22, 25,  3, 30
		,  8, 12, 20, 28, 15, 17, 24,  7
		, 19, 27, 23,  6, 26,  5,  4, 31
	};
	const unsigned int d = __mu0_bit_digits_i__();
	unsigned int v       = __x;
	unsigned int msb;

	if (v == 0U) {
		return d;
	}
	/* Propagate the top set bit into every lower position. */
	v |= v >> 1U;
	v |= v >> 2U;
	v |= v >> 4U;
	v |= v >> 8U;
	v |= v >> 16U;
	/* Hash the smeared value to the index of its highest set bit. */
	msb = s_table[((v * 0x07C4ACDD) >> 27U) % d];
	return (d - 1) - msb;
}
# if 0
/* Disabled alternative: branchy binary-search clz, kept for reference.
 * It narrows the window by halving four times, then resolves the final
 * two candidate bits; benchmarked slower than the table version. */
__mu0_static_inline__
int __mu0_cntlz_i__(const unsigned int __x)
{
unsigned int x, d, y;
if (__x) {
x = __x;
d = __mu0_bit_digits_i__();
/* Each step: if the upper half of the window is nonzero, the highest
 * set bit lives there — shrink the leading-zero budget and recurse
 * into that half. */
y = x >> 16U; if (y != 0U) { d = d - 16U; x = y; }
y = x >> 8U; if (y != 0U) { d = d - 8U; x = y; }
y = x >> 4U; if (y != 0U) { d = d - 4U; x = y; }
y = x >> 2U; if (y != 0U) { d = d - 2U; x = y; }
y = x >> 1U;
/* x is now in 1..3: y != 0 means x >= 2 (clz = d - 2), else clz = d - x. */
return __mu0_cast__(int, ((y != 0U) ? d - 2U : d - x));
}
/* Zero input: every bit is a leading zero. */
return __mu0_bit_digits_i__();
}
# endif
__mu0_static_inline__
int __mu0_cntlz_ll__(const unsigned long long __x)
{
	/* 64-bit clz composed from two 32-bit scans: if any bit is set in
	 * the high word, count there; otherwise the high word contributes
	 * d leading zeros and the low word is scanned. */
	const unsigned int d = __mu0_bit_digits_i__();
# if MU0_HAVE_C99 || MU0_HAVE_CPP11
	if (__x & 0xFFFFFFFF00000000ULL) {
		return __mu0_cntlz_i__(__x >> d);
	}
	return d + __mu0_cntlz_i__(__x & 0xFFFFFFFFULL);
# else
	if (__x & 0xFFFFFFFF00000000U) {
		return __mu0_cntlz_i__(__x >> d);
	}
	return d + __mu0_cntlz_i__(__x & 0xFFFFFFFFU);
# endif
}
__mu0_static_inline__
int __mu0_cntlz_l__(const unsigned long __x)
{
	/* Dispatch on the real width of `unsigned long` rather than trusting
	 * LP64 macros alone: the original tested `__LP64` (likely a typo for
	 * `_LP64`), and a 64-bit-long target defining neither macro would
	 * silently drop the high 32 bits.  The sizeof test is a compile-time
	 * constant, so the dead branch folds away — same technique as the
	 * ctzl/clzl dispatch. */
# if defined(__LP64__) || defined(__LP64) || defined(_LP64)
	return __mu0_cntlz_ll__(__x);
# else
	return (sizeof(__x) == 8)
		? __mu0_cntlz_ll__(__x)
		: __mu0_cntlz_i__((unsigned int)__x);
# endif
}
There are no `__lzcnt64()` or `_BitScanForward64()` intrinsics for 32-bit x86 AFAICS. How about the approach above — is this correct?