@pps83
Last active June 17, 2024 10:45
__builtin_ctz (ctzl, ctzll) and __builtin_clz (clzl, clzll) for Visual Studio
// Note: bsf/bsr are used by default.
// Compile with /arch:AVX2 to get lzcnt/tzcnt instead.
#if defined(_MSC_VER) && !defined(__clang__)
#include <intrin.h>

static __forceinline int __builtin_ctz(unsigned x)
{
#if defined(_M_ARM) || defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || defined(_M_ARM64EC)
    return (int)_CountTrailingZeros(x);
#elif defined(__AVX2__) || defined(__BMI__)
    return (int)_tzcnt_u32(x);
#else
    unsigned long r;
    _BitScanForward(&r, x);
    return (int)r;
#endif
}

static __forceinline int __builtin_ctzll(unsigned long long x)
{
#if defined(_M_ARM) || defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || defined(_M_ARM64EC)
    return (int)_CountTrailingZeros64(x);
#elif defined(_WIN64)
#if defined(__AVX2__) || defined(__BMI__)
    return (int)_tzcnt_u64(x);
#else
    unsigned long r;
    _BitScanForward64(&r, x);
    return (int)r;
#endif
#else
    // 32-bit target: scan each half, then select (both scans run; the unused result is discarded).
    int l = __builtin_ctz((unsigned)x);
    int h = __builtin_ctz((unsigned)(x >> 32)) + 32;
    return ((unsigned)x) ? l : h;
#endif
}

static __forceinline int __builtin_ctzl(unsigned long x)
{
    // long is 32-bit on Windows; the sizeof check keeps this correct on other ABIs.
    return sizeof(x) == 8 ? __builtin_ctzll(x) : __builtin_ctz((unsigned)x);
}

static __forceinline int __builtin_clz(unsigned x)
{
#if defined(_M_ARM) || defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || defined(_M_ARM64EC)
    return (int)_CountLeadingZeros(x);
#elif defined(__AVX2__) || defined(__LZCNT__)
    return (int)_lzcnt_u32(x);
#else
    unsigned long r;
    _BitScanReverse(&r, x);
    return (int)(r ^ 31); // bsr yields the bit index; XOR with 31 converts it to a leading-zero count
#endif
}

static __forceinline int __builtin_clzll(unsigned long long x)
{
#if defined(_M_ARM) || defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || defined(_M_ARM64EC)
    return (int)_CountLeadingZeros64(x);
#elif defined(_WIN64)
#if defined(__AVX2__) || defined(__LZCNT__)
    return (int)_lzcnt_u64(x);
#else
    unsigned long r;
    _BitScanReverse64(&r, x);
    return (int)(r ^ 63);
#endif
#else
    // 32-bit target: the high half wins if it has any set bit, otherwise the low half + 32.
    int l = __builtin_clz((unsigned)x) + 32;
    int h = __builtin_clz((unsigned)(x >> 32));
    return ((unsigned)(x >> 32)) ? h : l;
#endif
}

static __forceinline int __builtin_clzl(unsigned long x)
{
    return sizeof(x) == 8 ? __builtin_clzll(x) : __builtin_clz((unsigned)x);
}
#endif // defined(_MSC_VER) && !defined(__clang__)
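
A minimal smoke test for the shims above might look like this (illustrative only, not part of the gist; values chosen so the expected results are easy to verify by hand):

#include <stdio.h>

int main(void)
{
    // ctz = index of the lowest set bit; clz = zero bits above the highest set bit.
    printf("%d\n", __builtin_ctz(0x10u));        // 4
    printf("%d\n", __builtin_ctzll(1ull << 40)); // 40
    printf("%d\n", __builtin_clz(1u));           // 31
    printf("%d\n", __builtin_clzll(1ull << 40)); // 23 (= 63 - 40)
    // As with the real gcc/clang builtins, the result for x == 0 is undefined.
    return 0;
}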
@gvanem commented Oct 19, 2021

There are no __lzcnt64() and _BitScanForward64() functions for 32-bit x86, AFAICS. How about:

--- a/ctz_clz.cpp 2021-10-16 11:52:42
+++ b/ctz_clz.cpp 2021-10-19 09:15:54
@@ -14,7 +14,15 @@

 int __builtin_ctzll(unsigned long long x) {
     unsigned long ret;
+
+#ifdef _WIN64
     _BitScanForward64(&ret, x);
+#else
+    if (_BitScanForward(&ret, (uint32_t)x) == 0) { // low 32 bits are all zero.
+      _BitScanForward(&ret, (uint32_t)(x >> 32));
+      ret += 32;
+    }
+#endif
     return (int)ret;
 }

@@ -30,10 +38,17 @@
 }

 int __builtin_clzll(unsigned long long x) {
-    //unsigned long ret;
-    //_BitScanReverse64(&ret, x);
-    //return (int)(63 ^ ret);
+#ifdef _WIN64
     return (int)__lzcnt64(x);
+#else
+    unsigned long ret;
+    // Scan the high 32 bits.
+    if (_BitScanReverse(&ret, (uint32_t)(x >> 32)))
+       return (63 ^ (ret + 32));
+    // Scan the low 32 bits.
+    _BitScanReverse(&ret, (uint32_t)x);
+    return (63 ^ (int)ret);
+#endif
 }

 int __builtin_clzl(unsigned long x) {

Is this correct?
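
For reference, one way to sanity-check the 32-bit fallback paths is to compare them against naive bit loops (an illustrative harness; naive_ctzll/naive_clzll are hypothetical helpers, not part of the patch):

#include <assert.h>

static int naive_ctzll(unsigned long long x) // x != 0, like the builtin
{
    int n = 0;
    while (!(x & 1ull)) { x >>= 1; ++n; }
    return n;
}

static int naive_clzll(unsigned long long x) // x != 0
{
    int n = 0;
    while (!(x & 0x8000000000000000ull)) { x <<= 1; ++n; }
    return n;
}

static void check_builtins(void)
{
    for (int i = 0; i < 64; ++i) {
        unsigned long long v = 1ull << i;
        assert(__builtin_ctzll(v) == naive_ctzll(v));
        assert(__builtin_clzll(v) == naive_clzll(v));
        // lowest set bit is unchanged by setting the top bit
        assert(__builtin_ctzll(v | 0x8000000000000000ull) == i);
    }
}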

@mu578 commented Feb 19, 2023

@gvanem Yes, something like that (sorry for my code's highly decorated indirections).

Edit: it's better seen in context:

https://github.com/mu578/mu0/blob/master/mu0/mu0_definition/mu0_bitwiseop.h

@pps83 (author) commented Feb 19, 2023

I updated the code to support 32/64-bit builds. It's BMI+ only, though (it doesn't try to use bsr).

https://godbolt.org/z/G766KEM6j

@mu578 commented Feb 19, 2023

@pps83 Thanks, see my updates too; I'm still digging through targets. Funny or not, this simple thing is @$##^&&% to get right. All I want is a portable bit_width operator; soft emulation takes no ROM, and maybe with a small lookup table I could shrink the number of ops. Whatever, I just want bit_width, can I get that easily? It seems not. ICC has _lzcnt_u64 and _lzcnt_u32, and IBM XL has __cntlz4 and __cntlz8, but both now have a clang front-end, so: clang forever. Update: ARMCC has __rbit() and __clz(), but that's a clang fork, so... just logging for posterity.

For a soft-emulation fallback, the following is the best performer according to my CPU benchmarks:

__mu0_static_inline__
int __mu0_cntlz_i__(const unsigned int __x)
{
	__mu0_static__
	const unsigned char s_table[32] =
	{
		   0,  9,  1, 10, 13, 21,  2, 29
		, 11, 14, 16, 18, 22, 25,  3, 30
		,  8, 12, 20, 28, 15, 17, 24,  7
		, 19, 27, 23,  6, 26,  5,  4, 31
	};

	const unsigned int d = __mu0_bit_digits_i__();
	      unsigned int x = __x;
	if (x) {
		x = x | (x >>  1U);
		x = x | (x >>  2U);
		x = x | (x >>  4U);
		x = x | (x >>  8U);
		x = x | (x >> 16U);
		return (d - 1) - s_table[((x * 0x07C4ACDD) >> 27U) % d];
	}
	return d;
}

#	if 0

__mu0_static_inline__
int __mu0_cntlz_i__(const unsigned int __x)
{
	unsigned int x, d, y;
	if (__x) {
		x = __x;
		d = __mu0_bit_digits_i__();
		y = x >> 16U; if (y != 0U) { d = d - 16U; x = y; }
		y = x >>  8U; if (y != 0U) { d = d -  8U; x = y; }
		y = x >>  4U; if (y != 0U) { d = d -  4U; x = y; }
		y = x >>  2U; if (y != 0U) { d = d -  2U; x = y; }
		y = x >>  1U;
		return __mu0_cast__(int, ((y != 0U) ? d - 2U : d - x));
	}
	return __mu0_bit_digits_i__();
}

#	endif

__mu0_static_inline__
int __mu0_cntlz_ll__(const unsigned long long __x)
{
	const unsigned int d = __mu0_bit_digits_i__();
#	if MU0_HAVE_C99 || MU0_HAVE_CPP11
	return ((__x & 0xFFFFFFFF00000000ULL)
		?     __mu0_cntlz_i__(__x >> d)
		: d + __mu0_cntlz_i__(__x & 0xFFFFFFFFULL)
	);
#	else
	return ((__x & 0xFFFFFFFF00000000U)
		?     __mu0_cntlz_i__(__x >> d)
		: d + __mu0_cntlz_i__(__x & 0xFFFFFFFFU)
	);
#	endif
}

__mu0_static_inline__
int __mu0_cntlz_l__(const unsigned long __x)
{
#	if defined(__LP64__) || defined(__LP64)
	return __mu0_cntlz_ll__(__x);
#	else
	return __mu0_cntlz_i__(__x);
#	endif
}
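
For context, the table-based fallback above is the classic multiply-and-lookup log2 trick from Sean Eron Anderson's Bit Twiddling Hacks: the shift-or cascade smears the highest set bit downward so x becomes 2^(k+1)-1 with k = floor(log2(x)), and multiplying by the de Bruijn-style constant 0x07C4ACDD leaves a unique 5-bit pattern in the top bits, which the 32-entry table maps back to k. A standalone plain-C restatement (my sketch; same constant and table, assuming 32-bit unsigned):

static int clz32_soft(unsigned x)
{
    static const unsigned char tab[32] = {
         0,  9,  1, 10, 13, 21,  2, 29,
        11, 14, 16, 18, 22, 25,  3, 30,
         8, 12, 20, 28, 15, 17, 24,  7,
        19, 27, 23,  6, 26,  5,  4, 31
    };
    if (!x)
        return 32;
    x |= x >> 1;  // smear the highest set bit into every lower position,
    x |= x >> 2;  // turning x into 2^(k+1) - 1
    x |= x >> 4;
    x |= x >> 8;
    x |= x >> 16;
    return 31 - tab[(x * 0x07C4ACDDu) >> 27]; // table maps the top 5 bits back to k
}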

@nickpelling commented May 6, 2024

Hi Pavel,

This is really useful! I'd like to include this as a support file in my C container library.
https://github.com/nickpelling/C_STD

Is there a copyright or licence notice you'd like this published under? I use the MIT license because it's most permissive (I think):
https://github.com/nickpelling/C_STD?tab=MIT-1-ov-file

@pps83 (author) commented May 6, 2024

> Hi Pavel,
>
> This is really useful! I'd like to include this as a support file in my C container library. https://github.com/nickpelling/C_STD
>
> Is there a copyright or licence notice you'd like this published under? I use the MIT license because it's most permissive (I think): https://github.com/nickpelling/C_STD?tab=MIT-1-ov-file

OK to take it as is. Hopefully it's useful for users of C_STD.
Note that the code uses _tzcnt_u32, _tzcnt_u64, _lzcnt_u32, and _lzcnt_u64. AFAIK these require BMI/ABM support, so it might be a good idea to use the bsf/bsr variants instead for generic targets.
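
If compile-time flags are too coarse, runtime dispatch is also possible; an illustrative sketch using MSVC's __cpuid/__cpuidex (x86/x64 only; a full version should first verify the maximum supported CPUID leaves):

#include <intrin.h>

static int have_tzcnt(void) // BMI1: CPUID.(EAX=7,ECX=0):EBX bit 3
{
    int r[4];
    __cpuidex(r, 7, 0);
    return (r[1] >> 3) & 1;
}

static int have_lzcnt(void) // ABM/LZCNT: CPUID.(EAX=0x80000001):ECX bit 5
{
    int r[4];
    __cpuid(r, 0x80000001);
    return (r[2] >> 5) & 1;
}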

@nickpelling commented May 6, 2024

Thanks very much, Pavel, that's perfect, you're a star! :-)

https://github.com/nickpelling/C_STD/blob/develop/THANKS.md

@pps83 (author) commented May 7, 2024

> Thanks very much, Pavel, that's perfect, you're a star! :-)

Note: I updated it to:

  • check for __AVX2__ builds to use lzcnt/tzcnt instead of bsf/bsr
  • add code to handle ARM/ARM64
  • add a guard for clang-cl

The code was tested to verify that it is bit-exact with what clang/gcc emit for these builtins.
