Skip to content

Instantly share code, notes, and snippets.

@pps83
Last active January 15, 2024 01:20
Show Gist options
  • Star 12 You must be signed in to star a gist
  • Fork 7 You must be signed in to fork a gist
  • Save pps83/3210a2f980fd02bb2ba2e5a1fc4a2ef0 to your computer and use it in GitHub Desktop.
Save pps83/3210a2f980fd02bb2ba2e5a1fc4a2ef0 to your computer and use it in GitHub Desktop.
__builtin_ctz (ctzl, ctzll) and __builtin_clz (clzl, clzll) for Visual Studio
#ifdef _MSC_VER
#include <intrin.h>
/*
 * GCC/Clang __builtin_ctz / __builtin_clz family shims for MSVC.
 *
 * NOTE: _tzcnt_u32/_tzcnt_u64 and _lzcnt_u32/_lzcnt_u64 encode as
 * TZCNT/LZCNT, which require BMI1 / LZCNT CPU support.  On older CPUs
 * the encodings silently execute as BSF/BSR, whose result for a zero
 * input differs — assumes the target has BMI1/LZCNT (TODO: confirm,
 * or fall back to _BitScanForward/_BitScanReverse).
 *
 * As with the real GCC builtins, callers should not rely on the
 * result for x == 0 (GCC documents it as undefined).
 */

/* Count trailing zero bits of a 32-bit value. */
static inline int __builtin_ctz(unsigned x)
{
    return (int)_tzcnt_u32(x);
}

/* Count trailing zero bits of a 64-bit value. */
static inline int __builtin_ctzll(unsigned long long x)
{
#ifdef _WIN64
    return (int)_tzcnt_u64(x);
#else
    /* 32-bit build: no 64-bit tzcnt instruction; combine two 32-bit
     * scans.  `(unsigned)x != 0` replaces the original `!!unsigned(x)`,
     * a C++-only functional cast that does not compile as C. */
    return (unsigned)x != 0 ? __builtin_ctz((unsigned)x)
                            : 32 + __builtin_ctz((unsigned)(x >> 32));
#endif
}

/* unsigned long is 4 bytes on Windows; the sizeof check keeps this
 * correct even on an LP64 toolchain. */
static inline int __builtin_ctzl(unsigned long x)
{
    return sizeof(x) == 8 ? __builtin_ctzll(x) : __builtin_ctz((unsigned)x);
}

/* Count leading zero bits of a 32-bit value. */
static inline int __builtin_clz(unsigned x)
{
    return (int)_lzcnt_u32(x);
}

/* Count leading zero bits of a 64-bit value. */
static inline int __builtin_clzll(unsigned long long x)
{
#ifdef _WIN64
    return (int)_lzcnt_u64(x);
#else
    /* 32-bit build: scan the high word first; only when it is zero do
     * the low word's leading zeros matter (offset by 32). */
    return (unsigned)(x >> 32) != 0 ? __builtin_clz((unsigned)(x >> 32))
                                    : 32 + __builtin_clz((unsigned)x);
#endif
}

static inline int __builtin_clzl(unsigned long x)
{
    return sizeof(x) == 8 ? __builtin_clzll(x) : __builtin_clz((unsigned)x);
}
#endif /* _MSC_VER */
@gvanem
Copy link

gvanem commented Oct 19, 2021

There are no __lzcnt64() and _BitScanForward64() functions for x86 AFAICS. How about:

--- a/ctz_clz.cpp 2021-10-16 11:52:42
+++ b/ctz_clz.cpp 2021-10-19 09:15:54
@@ -14,7 +14,15 @@

 int __builtin_ctzll(unsigned long long x) {
     unsigned long ret;
+
+#ifdef _WIN64
     _BitScanForward64(&ret, x);
+#else
+    if (_BitScanForward(&ret, (uint32_t)x) == 0) { // low 32 bits are all zero.
+      _BitScanForward(&ret, (uint32_t)(x >> 32));
+      ret += 32;
+    }
+#endif
     return (int)ret;
 }

@@ -30,10 +38,17 @@
 }

 int __builtin_clzll(unsigned long long x) {
-    //unsigned long ret;
-    //_BitScanReverse64(&ret, x);
-    //return (int)(63 ^ ret);
+#ifdef _WIN64
     return (int)__lzcnt64(x);
+#else
+    unsigned long ret;
+    // Scan the high 32 bits.
+    if (_BitScanReverse(&ret, (uint32_t)(x >> 32)))
+       return (63 ^ (ret + 32));
+    // Scan the low 32 bits.
+    _BitScanReverse(&ret, (uint32_t)x);
+    return (63 ^ (int)ret);
+#endif
 }

 int __builtin_clzl(unsigned long x) {

Is this correct?

@mu578
Copy link

mu578 commented Feb 19, 2023

@gvanem yes something like that (sorry for my code with highly decorated indirections)

edit (better to see in context)

https://github.com/mu578/mu0/blob/master/mu0/mu0_definition/mu0_bitwiseop.h

@pps83
Copy link
Author

pps83 commented Feb 19, 2023

I updated the code to support 32/64 bit builds. It's BMI+ though (doesn't try to use bsr)

https://godbolt.org/z/G766KEM6j

@mu578
Copy link

mu578 commented Feb 19, 2023

@pps83 thanks — see my updates too; still digging through the differences between targets. Funny or not, this simple thing is hard to get right. All I want is a portable bit_width operator; soft emulation takes no ROM, and maybe with a small lookup table I could shrink the op count. ICC has _lzcnt_u64 and _lzcnt_u32, and IBM XL has __cntlz4/__cntlz8, but XL now uses a clang front-end, so: clang forever. Update: ARMCC has __rbit() and __clz(), but that's a clang fork too — just logging for posterity.

For a soft emulation fallback (according to my CPU benchmarks) the following is the best
player:

/* Portable count-leading-zeros fallback for unsigned int.
 * Smears the highest set bit downward so that x becomes 2^k - 1 for the
 * bit position k of the MSB, then recovers k via a de-Bruijn-style
 * multiply-and-table lookup (multiplier 0x07C4ACDD, 32-entry table).
 * Returns the full bit width when __x == 0.
 * NOTE(review): assumes __mu0_bit_digits_i__() yields the bit width of
 * unsigned int (32 for the 32-entry table to be valid) — confirm. */
__mu0_static_inline__
int __mu0_cntlz_i__(const unsigned int __x)
{
	__mu0_static__
	const unsigned char s_table[32] =
	{
		   0,  9,  1, 10, 13, 21,  2, 29
		, 11, 14, 16, 18, 22, 25,  3, 30
		,  8, 12, 20, 28, 15, 17, 24,  7
		, 19, 27, 23,  6, 26,  5,  4, 31
	};

	const unsigned int d = __mu0_bit_digits_i__();
	      unsigned int x = __x;
	if (x) {
		/* Propagate the most significant set bit into every lower bit. */
		x = x | (x >>  1U);
		x = x | (x >>  2U);
		x = x | (x >>  4U);
		x = x | (x >>  8U);
		x = x | (x >> 16U);
		/* table gives the MSB index; leading zeros = (d-1) - index.
		 * The `% d` is presumably redundant when d == 32 since
		 * (x * 0x07C4ACDD) >> 27 is already in [0, 31] — verify. */
		return (d - 1) - s_table[((x * 0x07C4ACDD) >> 27U) % d];
	}
	return d;
}

/* Disabled (#if 0) alternative implementation: binary search by
 * successive halving of the shift width instead of the table lookup
 * above.  Kept for reference only; not compiled. */
#	if 0

__mu0_static_inline__
int __mu0_cntlz_i__(const unsigned int __x)
{
	unsigned int x, d, y;
	if (__x) {
		x = __x;
		d = __mu0_bit_digits_i__();
		/* Each step: if the upper half is nonzero, the MSB lies there;
		 * narrow onto it and reduce the remaining zero budget. */
		y = x >> 16U; if (y != 0U) { d = d - 16U; x = y; }
		y = x >>  8U; if (y != 0U) { d = d -  8U; x = y; }
		y = x >>  4U; if (y != 0U) { d = d -  4U; x = y; }
		y = x >>  2U; if (y != 0U) { d = d -  2U; x = y; }
		y = x >>  1U;
		return __mu0_cast__(int, ((y != 0U) ? d - 2U : d - x));
	}
	return __mu0_bit_digits_i__();
}

#	endif

/* Count leading zeros of an unsigned long long by splitting it into two
 * 32-bit halves: scan the high half when it is nonzero, otherwise add
 * the width of the high half and scan the low half.
 * NOTE(review): assumes __mu0_bit_digits_i__() == 32 so that `>> d`
 * isolates the high half — confirm. */
__mu0_static_inline__
int __mu0_cntlz_ll__(const unsigned long long __x)
{
	const unsigned int d = __mu0_bit_digits_i__();
#	if MU0_HAVE_C99 || MU0_HAVE_CPP11
	return ((__x & 0xFFFFFFFF00000000ULL)
		?     __mu0_cntlz_i__(__x >> d)
		: d + __mu0_cntlz_i__(__x & 0xFFFFFFFFULL)
	);
#	else
	/* Pre-C99 path: no ULL suffix available; relies on the constant
	 * being representable in the promoted unsigned type. */
	return ((__x & 0xFFFFFFFF00000000U)
		?     __mu0_cntlz_i__(__x >> d)
		: d + __mu0_cntlz_i__(__x & 0xFFFFFFFFU)
	);
#	endif
}

/* Count leading zeros of an unsigned long, dispatching on the data
 * model: LP64 targets have a 64-bit long (use the long long path),
 * otherwise long is 32 bits (use the int path). */
__mu0_static_inline__
int __mu0_cntlz_l__(const unsigned long __x)
{
#	if defined(__LP64__) || defined(__LP64)
	return __mu0_cntlz_ll__(__x);
#	else
	return __mu0_cntlz_i__(__x);
#	endif
}

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment