// Gist by @Const-me, last active July 7, 2023
#include <stdint.h>
#include <immintrin.h>
#include <intrin.h>
#include <stdio.h>
// Count of set bits in `plus` minus count of set bits in `minus`
// The result is in the [ -32 .. +32 ] interval
inline int popCntDiff( uint32_t plus, uint32_t minus )
{
    plus = __popcnt( plus );
    minus = __popcnt( minus );
    return (int)plus - (int)minus;
}
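// For illustration ( not in the original gist ): popCntDiff( 0b1011, 0b0001 ) == 3 - 1 == 2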
// Horizontal sum of all 4 int64_t elements in the AVX2 vector
inline int64_t hadd_epi64( __m256i v32 )
{
    // Add the upper 128-bit half of the vector to the lower one
    __m128i v = _mm256_extracti128_si256( v32, 1 );
    v = _mm_add_epi64( v, _mm256_castsi256_si128( v32 ) );
    // Add the two remaining 64-bit lanes
    const int64_t high = _mm_extract_epi64( v, 1 );
    const int64_t low = _mm_cvtsi128_si64( v );
    return high + low;
}
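// For illustration ( not in the original gist ): with 64-bit lanes { 1, 2, 3, 4 }, the function returns 10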
// AVX2 implementation of that algorithm
int computeWithAvx2( const char* input )
{
    // Create a few constants
    const __m256i zero = _mm256_setzero_si256();
    const __m256i s = _mm256_set1_epi8( 's' );
    const __m256i p = _mm256_set1_epi8( 'p' );
    __m256i counter;
    // Prologue, make sure the pointer is aligned by 32 bytes
    const size_t rem = ( (size_t)input ) % 32;
    if( 0 == rem )
    {
        // The input is already aligned, initialize the accumulator with 0
        counter = _mm256_setzero_si256();
    }
    else
    {
        // Load an aligned vector from the address slightly before the input buffer
        // It's in the same VMEM page as the first byte of the input, so no access violations
        input -= rem;
        const __m256i v = _mm256_load_si256( ( const __m256i* )input );
        uint32_t bmpZero = (uint32_t)_mm256_movemask_epi8( _mm256_cmpeq_epi8( v, zero ) );
        uint32_t bmpPlus = (uint32_t)_mm256_movemask_epi8( _mm256_cmpeq_epi8( v, s ) );
        uint32_t bmpMinus = (uint32_t)_mm256_movemask_epi8( _mm256_cmpeq_epi8( v, p ) );
        // Discard the lower bits which resulted from loading before the start of the buffer
        bmpZero >>= (uint32_t)rem;
        bmpPlus >>= (uint32_t)rem;
        bmpMinus >>= (uint32_t)rem;
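        // For illustration ( not in the original gist ): if input == 0x10007, then rem == 7;
        // the aligned load happens at 0x10000, and the 7 lowest bits of each bitmap describe
        // bytes before the buffer, so shifting right by 7 discards exactly those bits.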
        if( 0 == bmpZero )
        {
            // No `\0` encountered in the initial bytes of the input
            // Compute the initial value for the accumulator vector
            // Scale by 0xFF to match the main loop, which accumulates 0xFF per matching byte
            __m128i iv = _mm_cvtsi64_si128( popCntDiff( bmpPlus, bmpMinus ) * 0xFF );
            counter = _mm256_blend_epi32( zero, _mm256_castsi128_si256( iv ), 0b00000011 );
            input += 32;
        }
        else
        {
            // The input was tiny, we already found the `\0`
            // Clear the higher bits of the two bitmaps which were after the first `\0`
            const uint32_t len = _tzcnt_u32( bmpZero );
            bmpPlus = _bzhi_u32( bmpPlus, len );
            bmpMinus = _bzhi_u32( bmpMinus, len );
            // Compute the result
            return popCntDiff( bmpPlus, bmpMinus );
        }
    }
    // The pointer is now aligned by 32 bytes, which serves two purposes: we can use aligned loads,
    // and most importantly, a 32-byte aligned load is guaranteed to not cross a page boundary.
    // VMEM permissions are defined for aligned 4kb pages, so we can technically load within a page
    // without access violations, even though the language standard says it's UB.
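    // For illustration ( not in the original gist ): an aligned 32-byte load at address A
    // touches bytes [ A .. A + 31 ]. Since 4096 is a multiple of 32, A / 4096 == ( A + 31 ) / 4096,
    // i.e. both ends of the load land in the same 4kb page.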
    while( true )
    {
        // Load 32 bytes from the pointer
        const __m256i v = _mm256_load_si256( ( const __m256i* )input );
        // Compare bytes for v == '\0'
        const __m256i z = _mm256_cmpeq_epi8( v, zero );
        // Compare bytes for equality with the two other markers
        __m256i cmpPlus = _mm256_cmpeq_epi8( v, s );
        __m256i cmpMinus = _mm256_cmpeq_epi8( v, p );
        const uint32_t bmpZero = (uint32_t)_mm256_movemask_epi8( z );
        if( 0 != bmpZero )
        {
            // At least one byte of the 32 was zero
            // Divide by 0xFF because each matching byte contributed 0xFF to the counter
            const int res = (int)( hadd_epi64( counter ) / 0xFF );
            uint32_t bmpPlus = (uint32_t)_mm256_movemask_epi8( cmpPlus );
            uint32_t bmpMinus = (uint32_t)_mm256_movemask_epi8( cmpMinus );
            // Clear the higher bits of the two bitmaps which were after the first found `\0`
            const uint32_t len = _tzcnt_u32( bmpZero );
            bmpPlus = _bzhi_u32( bmpPlus, len );
            bmpMinus = _bzhi_u32( bmpMinus, len );
            // Produce the result
            return res + popCntDiff( bmpPlus, bmpMinus );
        }
        // Increment the source pointer
        input += 32;
        // Compute horizontal sums of the comparison bytes within 8-byte lanes;
        // each 0xFF match byte contributes 0xFF to the corresponding int64_t lane
        cmpPlus = _mm256_sad_epu8( cmpPlus, zero );
        cmpMinus = _mm256_sad_epu8( cmpMinus, zero );
        cmpPlus = _mm256_sub_epi64( cmpPlus, cmpMinus );
        // Update the counter
        counter = _mm256_add_epi64( counter, cmpPlus );
    }
}
#include <vector>
#include <random>
std::vector<char> nullTerminatedRandom( size_t length )
{
    std::vector<char> result;
    result.resize( length );
    // Deliberately seeding the RNG with 0, to generate the same output every time
    std::mt19937 gen( 0 );
    // Note the upper bound is inclusive: 0..3 indexes the 4-element pattern
    std::uniform_int_distribution<size_t> distrib( 0, 3 );
    const char pattern[ 4 ]{ 's', 'p', '0', '1' };
    for( char& c : result )
        c = pattern[ distrib( gen ) ];
    // Write the terminating `\0` into the last element
    result[ length - 1 ] = '\0';
    return result;
}
int computeWithSwitches( const char* input )
{
    int res = 0;
    while( true )
    {
        char c = *input++;
        switch( c )
        {
        case '\0':
            return res;
        case 's':
            res += 1;
            break;
        case 'p':
            res -= 1;
            break;
        default:
            break;
        }
    }
}
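// A minimal sanity check, not part of the original gist: compare the two implementations
// on a few short inputs, including ones that exercise the unaligned prologue.
// The helper name and the test strings are made up for illustration.
// Note: the AVX2 version may read a few bytes past the terminating `\0` ( within the same page ),
// which tools like AddressSanitizer would flag even though it's safe in practice.
#include <cassert>
#include <cstring>
void verifyImplementations()
{
    const char* const tests[] = { "", "s", "p", "sp", "ssppp", "s0p1s0p1ssss" };
    for( const char* t : tests )
    {
        // Copy into a heap buffer; the prologue path taken depends on the allocation's alignment
        std::vector<char> buf( t, t + std::strlen( t ) + 1 );
        assert( computeWithAvx2( buf.data() ) == computeWithSwitches( buf.data() ) );
    }
}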
int main()
{
    constexpr bool useAvx = true;
    // Using an odd length slightly over 1GB, just for lulz
    const size_t len = 1024 * 1024 * 1024 + 17;
    const auto data = nullTerminatedRandom( len );
    const char* const rsi = data.data();
    // Compute the result using either of the two methods, measuring the time in TSC ticks
    const int64_t tscStart = __rdtsc();
    const int sum = useAvx ? computeWithAvx2( rsi ) : computeWithSwitches( rsi );
    const int64_t tscElapsed = __rdtsc() - tscStart;
    printf( "%i; elapsed time: %lli\n", sum, tscElapsed );
    return 0;
}