benanil/MemCpyMemSet.cpp

## MemCpyMemSet.cpp
/*
 * AXGLOBALCONST is the declaration prefix used for the read-only tables in
 * this file: `extern const` plus __declspec(selectany) on MSVC, or plus
 * __attribute__((weak)) on GCC-compatible compilers other than MinGW.
 * It can be overridden by defining AXGLOBALCONST before this point.
 *
 * NOTE(review): in the final fallback the macro expands to nothing, so a table
 * declared with it is neither const nor given any special linkage - confirm
 * that this is what MinGW / other compilers are meant to get.
 */
#ifndef AXGLOBALCONST
#   if defined(_MSC_VER)
#       define AXGLOBALCONST extern const __declspec(selectany)
#   elif defined(__GNUC__) && !defined(__MINGW32__)
#       define AXGLOBALCONST extern const __attribute__((weak))
#   else
#       define AXGLOBALCONST
#   endif
#endif

/* AX_COMPILER_HAS_BUILTIN(x): __has_builtin(x) where available, otherwise 0. */
#if defined(__has_builtin)
#   define AX_COMPILER_HAS_BUILTIN(x) __has_builtin(x)
#else
#   define AX_COMPILER_HAS_BUILTIN(x) 0
#endif

/*
 * AX_PREFETCH(x): prefetch hint for the address x.
 * - The MSVC form casts the argument because _mm_prefetch takes a
 *   `const char*`; without the cast any other pointer type is rejected.
 * - The fallback is a void expression so the macro can be used wherever the
 *   other two forms can.
 */
#if AX_COMPILER_HAS_BUILTIN(__builtin_prefetch)
#   define AX_PREFETCH(x) __builtin_prefetch(x)
#elif defined(_MSC_VER)
#   define AX_PREFETCH(x) _mm_prefetch((const char*)(x), _MM_HINT_NTA)
#else
#   define AX_PREFETCH(x) ((void)0)
#endif

/* Architecture detection (detection code taken from miniaudio). */
#if defined(__x86_64__) || defined(_M_X64)
#   define AX_X64
#elif defined(__i386) || defined(_M_IX86)
#   define AX_X86
#elif defined(__arm__) || defined(_M_ARM) || defined(__arm64) || defined(__arm64__) || defined(__aarch64__) || defined(_M_ARM64)
#   define AX_ARM
#endif

/*
 * Intrinsics support.
 * Define AX_NO_SSE2 and/or AX_NO_AVX2 before this point to disable the
 * corresponding extension.
 */
#if (defined(AX_X64) || defined(AX_X86)) && !defined(__COSMOPOLITAN__)
#   if defined(_MSC_VER) && !defined(__clang__)
        /* MSVC proper: support is decided from the compiler version alone. */
#       if _MSC_VER >= 1400 && !defined(AX_NO_SSE2)   /* 2005 */
#           define AX_SUPPORT_SSE2
#       endif
#       if _MSC_VER >= 1700 && !defined(AX_NO_AVX2)   /* 2012 */
#           define AX_SUPPORT_AVX2
#       endif
#   else
        /* Everyone else: trust the target-feature macros set by the compiler flags. */
#       if defined(__SSE2__) && !defined(AX_NO_SSE2)
#           define AX_SUPPORT_SSE2
#       endif
#       if defined(__AVX2__) && !defined(AX_NO_AVX2)
#           define AX_SUPPORT_AVX2
#       endif
#   endif

    /* If compiler support is still undetermined, fall back to __has_include. */
#   if !defined(__GNUC__) && !defined(__clang__) && defined(__has_include)
#       if !defined(AX_SUPPORT_SSE2) && !defined(AX_NO_SSE2) && __has_include(<emmintrin.h>)
#           define AX_SUPPORT_SSE2
#       endif
#       if !defined(AX_SUPPORT_AVX2) && !defined(AX_NO_AVX2) && __has_include(<immintrin.h>)
#           define AX_SUPPORT_AVX2
#       endif
#   endif

    /* AX_SUPPORT_AVX is not defined in this section; it is honoured if supplied externally. */
#   if defined(AX_SUPPORT_AVX2) || defined(AX_SUPPORT_AVX)
#       include <immintrin.h>
#   elif defined(AX_SUPPORT_SSE2)
#       include <emmintrin.h>
#   endif
#endif

#include <stdint.h>

/*
 * Mask source tables: the first half of each table is all ones, the second
 * half all zeros. Exactly one is emitted - the 64-byte g_256MemMask when
 * AX_SUPPORT_AVX2 is defined, otherwise the 32-byte g_128MemMask when
 * AX_SUPPORT_SSE2 is defined.
 */
#ifdef AX_SUPPORT_AVX2
AXGLOBALCONST uint64_t g_256MemMask[8] = { ~0ull, ~0ull, ~0ull, ~0ull, 0ull, 0ull, 0ull, 0ull };
#elif defined(AX_SUPPORT_SSE2) /* the macro has no value, so it must be tested with defined() */
AXGLOBALCONST uint32_t g_128MemMask[8] = { ~0u, ~0u, ~0u, ~0u, 0u, 0u, 0u, 0u };
#endif

/*
 * MemSet - store the byte `val` into the first `sizeInBytes` bytes at `dst`.
 *
 *   dst          destination buffer, writable for sizeInBytes bytes.
 *   val          fill value; converted to unsigned char first so a negative
 *                char still replicates as one clean byte pattern.
 *   sizeInBytes  number of bytes to fill; 0 is a no-op.
 *
 * Only bytes inside [dst, dst + sizeInBytes) are touched.
 *
 * Strategy: if the destination is already vector aligned, whole 64-byte blocks
 * are written with non-temporal stores (AVX when AX_SUPPORT_AVX2 is defined,
 * SSE2 when AX_SUPPORT_SSE2 is defined). Whatever remains is finished by a byte
 * loop up to 8-byte alignment, a 32-byte-per-iteration word loop, and a final
 * byte loop. Small sizes deliberately use the scalar loops: a full-width vector
 * load/blend/store would access memory past dst + sizeInBytes.
 *
 * `static` gives every translation unit its own linkable definition.
 */
static inline void MemSet(void* dst, char val, uint64_t sizeInBytes)
{
    unsigned char* out = (unsigned char*)dst;
    const unsigned char fill = (unsigned char)val;

#ifdef AX_SUPPORT_AVX2
    if (sizeInBytes >= 2 * sizeof(__m256i) && ((uintptr_t)out & 31u) == 0) /* 32-byte aligned? */
    {
        const __m256i pattern = _mm256_set1_epi8(val);
        __m256i* vp = (__m256i*)(void*)out;

        while (sizeInBytes >= 2 * sizeof(__m256i))
        {
            _mm256_stream_si256(vp++, pattern);
            _mm256_stream_si256(vp++, pattern);
            sizeInBytes -= 2 * sizeof(__m256i);
        }
        _mm_sfence(); /* order the streaming stores before any later store */
        out = (unsigned char*)vp; /* the scalar tail continues after the vector blocks */
    }
#endif
#ifdef AX_SUPPORT_SSE2
    /* Not taken after the AVX loop above: fewer than 64 bytes remain by then. */
    if (sizeInBytes >= 4 * sizeof(__m128i) && ((uintptr_t)out & 15u) == 0) /* 16-byte aligned? */
    {
        const __m128i pattern = _mm_set1_epi8(val);
        __m128i* vp = (__m128i*)(void*)out;

        while (sizeInBytes >= 4 * sizeof(__m128i))
        {
            _mm_stream_si128(vp++, pattern);
            _mm_stream_si128(vp++, pattern);
            _mm_stream_si128(vp++, pattern);
            _mm_stream_si128(vp++, pattern);
            sizeInBytes -= 4 * sizeof(__m128i);
        }
        _mm_sfence(); /* order the streaming stores before any later store */
        out = (unsigned char*)vp;
    }
#endif

    /* Head: single bytes until the cursor is 8-byte aligned, so the word stores are aligned. */
    while (sizeInBytes != 0 && ((uintptr_t)out & 7u) != 0)
    {
        *out++ = fill;
        sizeInBytes--;
    }

    /* Body: four 64-bit words per iteration. */
    if (sizeInBytes >= 4 * sizeof(uint64_t))
    {
        const uint64_t word = fill * 0x0101010101010101ull; /* fill byte in all eight lanes */
        uint64_t* wp = (uint64_t*)(void*)out;

        while (sizeInBytes >= 4 * sizeof(uint64_t))
        {
            wp[0] = word;
            wp[1] = word;
            wp[2] = word;
            wp[3] = word;
            wp += 4;
            sizeInBytes -= 4 * sizeof(uint64_t);
        }
        out = (unsigned char*)wp;
    }

    /* Tail: remaining bytes, written right after everything stored so far. */
    while (sizeInBytes != 0)
    {
        *out++ = fill;
        sizeInBytes--;
    }
}

/*
 * MemCpy - copy `sizeInBytes` bytes from `src` to `dst`.
 *
 *   dst          destination buffer, writable for sizeInBytes bytes.
 *   src          source buffer, readable for sizeInBytes bytes.
 *   sizeInBytes  number of bytes to copy; 0 is a no-op.
 *
 * Only bytes inside [src, src + sizeInBytes) are read and only bytes inside
 * [dst, dst + sizeInBytes) are written. The copy runs front to back in chunks,
 * so overlapping regions where dst lies inside the source range are not
 * supported.
 *
 * Strategy: if both pointers are already vector aligned, whole 64-byte blocks
 * are moved with aligned loads and non-temporal stores (AVX when
 * AX_SUPPORT_AVX2 is defined, SSE2 when AX_SUPPORT_SSE2 is defined). The rest
 * is copied with aligned 64-bit words when src and dst share the same offset
 * within an 8-byte word, and byte by byte otherwise. Small sizes deliberately
 * use the scalar loops: a full-width vector load/blend/store would access
 * memory past the end of both buffers.
 *
 * `static` gives every translation unit its own linkable definition.
 */
static inline void MemCpy(void* dst, const void* src, uint64_t sizeInBytes)
{
    unsigned char* out = (unsigned char*)dst;
    const unsigned char* in = (const unsigned char*)src;

#ifdef AX_SUPPORT_AVX2
    if (sizeInBytes >= 2 * sizeof(__m256i)
        && (((uintptr_t)out | (uintptr_t)in) & 31u) == 0) /* both 32-byte aligned? */
    {
        __m256i* vd = (__m256i*)(void*)out;
        const __m256i* vs = (const __m256i*)(const void*)in;

        while (sizeInBytes >= 2 * sizeof(__m256i))
        {
            _mm256_stream_si256(vd++, _mm256_load_si256(vs++));
            _mm256_stream_si256(vd++, _mm256_load_si256(vs++));
            sizeInBytes -= 2 * sizeof(__m256i);
        }
        _mm_sfence(); /* order the streaming stores before any later store */
        out = (unsigned char*)vd; /* both cursors move on so the tail copies the right bytes */
        in = (const unsigned char*)vs;
    }
#endif
#ifdef AX_SUPPORT_SSE2
    /* Not taken after the AVX loop above: fewer than 64 bytes remain by then. */
    if (sizeInBytes >= 4 * sizeof(__m128i)
        && (((uintptr_t)out | (uintptr_t)in) & 15u) == 0) /* both 16-byte aligned? */
    {
        __m128i* vd = (__m128i*)(void*)out;
        const __m128i* vs = (const __m128i*)(const void*)in;

        while (sizeInBytes >= 4 * sizeof(__m128i))
        {
            _mm_stream_si128(vd++, _mm_load_si128(vs++));
            _mm_stream_si128(vd++, _mm_load_si128(vs++));
            _mm_stream_si128(vd++, _mm_load_si128(vs++));
            _mm_stream_si128(vd++, _mm_load_si128(vs++));
            sizeInBytes -= 4 * sizeof(__m128i);
        }
        _mm_sfence(); /* order the streaming stores before any later store */
        out = (unsigned char*)vd;
        in = (const unsigned char*)vs;
    }
#endif

    /* Word copy is only possible when one byte head can align both cursors at once. */
    if ((((uintptr_t)out ^ (uintptr_t)in) & 7u) == 0)
    {
        /* Head: single bytes until both cursors are 8-byte aligned. */
        while (sizeInBytes != 0 && ((uintptr_t)out & 7u) != 0)
        {
            *out++ = *in++;
            sizeInBytes--;
        }

        /* Body: four 64-bit words per iteration. */
        if (sizeInBytes >= 4 * sizeof(uint64_t))
        {
            uint64_t* wd = (uint64_t*)(void*)out;
            const uint64_t* ws = (const uint64_t*)(const void*)in;

            while (sizeInBytes >= 4 * sizeof(uint64_t))
            {
                wd[0] = ws[0];
                wd[1] = ws[1];
                wd[2] = ws[2];
                wd[3] = ws[3];
                wd += 4;
                ws += 4;
                sizeInBytes -= 4 * sizeof(uint64_t);
            }
            out = (unsigned char*)wd;
            in = (const unsigned char*)ws;
        }
    }

    /* Tail (or the whole copy when the alignments differ): one byte at a time. */
    while (sizeInBytes != 0)
    {
        *out++ = *in++;
        sizeInBytes--;
    }
}
	/*
	 * AXGLOBALCONST is the declaration prefix used for the read-only tables in
	 * this file: `extern const` plus __declspec(selectany) on MSVC, or plus
	 * __attribute__((weak)) on GCC-compatible compilers other than MinGW.
	 * It can be overridden by defining AXGLOBALCONST before this point.
	 *
	 * NOTE(review): in the final fallback the macro expands to nothing, so a table
	 * declared with it is neither const nor given any special linkage - confirm
	 * that this is what MinGW / other compilers are meant to get.
	 */
	#ifndef AXGLOBALCONST
	#   if defined(_MSC_VER)
	#       define AXGLOBALCONST extern const __declspec(selectany)
	#   elif defined(__GNUC__) && !defined(__MINGW32__)
	#       define AXGLOBALCONST extern const __attribute__((weak))
	#   else
	#       define AXGLOBALCONST
	#   endif
	#endif

	/* AX_COMPILER_HAS_BUILTIN(x): __has_builtin(x) where available, otherwise 0. */
	#if defined(__has_builtin)
	#   define AX_COMPILER_HAS_BUILTIN(x) __has_builtin(x)
	#else
	#   define AX_COMPILER_HAS_BUILTIN(x) 0
	#endif

	/*
	 * AX_PREFETCH(x): prefetch hint for the address x.
	 * - The MSVC form casts the argument because _mm_prefetch takes a
	 *   `const char*`; without the cast any other pointer type is rejected.
	 * - The fallback is a void expression so the macro can be used wherever the
	 *   other two forms can.
	 */
	#if AX_COMPILER_HAS_BUILTIN(__builtin_prefetch)
	#   define AX_PREFETCH(x) __builtin_prefetch(x)
	#elif defined(_MSC_VER)
	#   define AX_PREFETCH(x) _mm_prefetch((const char*)(x), _MM_HINT_NTA)
	#else
	#   define AX_PREFETCH(x) ((void)0)
	#endif

	/* Architecture detection (detection code taken from miniaudio). */
	#if defined(__x86_64__) || defined(_M_X64)
	#   define AX_X64
	#elif defined(__i386) || defined(_M_IX86)
	#   define AX_X86
	#elif defined(__arm__) || defined(_M_ARM) || defined(__arm64) || defined(__arm64__) || defined(__aarch64__) || defined(_M_ARM64)
	#   define AX_ARM
	#endif

	/*
	 * Intrinsics support.
	 * Define AX_NO_SSE2 and/or AX_NO_AVX2 before this point to disable the
	 * corresponding extension.
	 */
	#if (defined(AX_X64) || defined(AX_X86)) && !defined(__COSMOPOLITAN__)
	#   if defined(_MSC_VER) && !defined(__clang__)
	        /* MSVC proper: support is decided from the compiler version alone. */
	#       if _MSC_VER >= 1400 && !defined(AX_NO_SSE2)   /* 2005 */
	#           define AX_SUPPORT_SSE2
	#       endif
	#       if _MSC_VER >= 1700 && !defined(AX_NO_AVX2)   /* 2012 */
	#           define AX_SUPPORT_AVX2
	#       endif
	#   else
	        /* Everyone else: trust the target-feature macros set by the compiler flags. */
	#       if defined(__SSE2__) && !defined(AX_NO_SSE2)
	#           define AX_SUPPORT_SSE2
	#       endif
	#       if defined(__AVX2__) && !defined(AX_NO_AVX2)
	#           define AX_SUPPORT_AVX2
	#       endif
	#   endif

	    /* If compiler support is still undetermined, fall back to __has_include. */
	#   if !defined(__GNUC__) && !defined(__clang__) && defined(__has_include)
	#       if !defined(AX_SUPPORT_SSE2) && !defined(AX_NO_SSE2) && __has_include(<emmintrin.h>)
	#           define AX_SUPPORT_SSE2
	#       endif
	#       if !defined(AX_SUPPORT_AVX2) && !defined(AX_NO_AVX2) && __has_include(<immintrin.h>)
	#           define AX_SUPPORT_AVX2
	#       endif
	#   endif

	    /* AX_SUPPORT_AVX is not defined in this section; it is honoured if supplied externally. */
	#   if defined(AX_SUPPORT_AVX2) || defined(AX_SUPPORT_AVX)
	#       include <immintrin.h>
	#   elif defined(AX_SUPPORT_SSE2)
	#       include <emmintrin.h>
	#   endif
	#endif

	#include <stdint.h>

	/*
	 * Mask source tables: the first half of each table is all ones, the second
	 * half all zeros. Exactly one is emitted - the 64-byte g_256MemMask when
	 * AX_SUPPORT_AVX2 is defined, otherwise the 32-byte g_128MemMask when
	 * AX_SUPPORT_SSE2 is defined.
	 */
	#ifdef AX_SUPPORT_AVX2
	AXGLOBALCONST uint64_t g_256MemMask[8] = { ~0ull, ~0ull, ~0ull, ~0ull, 0ull, 0ull, 0ull, 0ull };
	#elif defined(AX_SUPPORT_SSE2) /* the macro has no value, so it must be tested with defined() */
	AXGLOBALCONST uint32_t g_128MemMask[8] = { ~0u, ~0u, ~0u, ~0u, 0u, 0u, 0u, 0u };
	#endif

	/*
	 * MemSet - store the byte `val` into the first `sizeInBytes` bytes at `dst`.
	 *
	 *   dst          destination buffer, writable for sizeInBytes bytes.
	 *   val          fill value; converted to unsigned char first so a negative
	 *                char still replicates as one clean byte pattern.
	 *   sizeInBytes  number of bytes to fill; 0 is a no-op.
	 *
	 * Only bytes inside [dst, dst + sizeInBytes) are touched.
	 *
	 * Strategy: if the destination is already vector aligned, whole 64-byte blocks
	 * are written with non-temporal stores (AVX when AX_SUPPORT_AVX2 is defined,
	 * SSE2 when AX_SUPPORT_SSE2 is defined). Whatever remains is finished by a byte
	 * loop up to 8-byte alignment, a 32-byte-per-iteration word loop, and a final
	 * byte loop. Small sizes deliberately use the scalar loops: a full-width vector
	 * load/blend/store would access memory past dst + sizeInBytes.
	 *
	 * `static` gives every translation unit its own linkable definition.
	 */
	static inline void MemSet(void* dst, char val, uint64_t sizeInBytes)
	{
	    unsigned char* out = (unsigned char*)dst;
	    const unsigned char fill = (unsigned char)val;

	#ifdef AX_SUPPORT_AVX2
	    if (sizeInBytes >= 2 * sizeof(__m256i) && ((uintptr_t)out & 31u) == 0) /* 32-byte aligned? */
	    {
	        const __m256i pattern = _mm256_set1_epi8(val);
	        __m256i* vp = (__m256i*)(void*)out;

	        while (sizeInBytes >= 2 * sizeof(__m256i))
	        {
	            _mm256_stream_si256(vp++, pattern);
	            _mm256_stream_si256(vp++, pattern);
	            sizeInBytes -= 2 * sizeof(__m256i);
	        }
	        _mm_sfence(); /* order the streaming stores before any later store */
	        out = (unsigned char*)vp; /* the scalar tail continues after the vector blocks */
	    }
	#endif
	#ifdef AX_SUPPORT_SSE2
	    /* Not taken after the AVX loop above: fewer than 64 bytes remain by then. */
	    if (sizeInBytes >= 4 * sizeof(__m128i) && ((uintptr_t)out & 15u) == 0) /* 16-byte aligned? */
	    {
	        const __m128i pattern = _mm_set1_epi8(val);
	        __m128i* vp = (__m128i*)(void*)out;

	        while (sizeInBytes >= 4 * sizeof(__m128i))
	        {
	            _mm_stream_si128(vp++, pattern);
	            _mm_stream_si128(vp++, pattern);
	            _mm_stream_si128(vp++, pattern);
	            _mm_stream_si128(vp++, pattern);
	            sizeInBytes -= 4 * sizeof(__m128i);
	        }
	        _mm_sfence(); /* order the streaming stores before any later store */
	        out = (unsigned char*)vp;
	    }
	#endif

	    /* Head: single bytes until the cursor is 8-byte aligned, so the word stores are aligned. */
	    while (sizeInBytes != 0 && ((uintptr_t)out & 7u) != 0)
	    {
	        *out++ = fill;
	        sizeInBytes--;
	    }

	    /* Body: four 64-bit words per iteration. */
	    if (sizeInBytes >= 4 * sizeof(uint64_t))
	    {
	        const uint64_t word = fill * 0x0101010101010101ull; /* fill byte in all eight lanes */
	        uint64_t* wp = (uint64_t*)(void*)out;

	        while (sizeInBytes >= 4 * sizeof(uint64_t))
	        {
	            wp[0] = word;
	            wp[1] = word;
	            wp[2] = word;
	            wp[3] = word;
	            wp += 4;
	            sizeInBytes -= 4 * sizeof(uint64_t);
	        }
	        out = (unsigned char*)wp;
	    }

	    /* Tail: remaining bytes, written right after everything stored so far. */
	    while (sizeInBytes != 0)
	    {
	        *out++ = fill;
	        sizeInBytes--;
	    }
	}

	/*
	 * MemCpy - copy `sizeInBytes` bytes from `src` to `dst`.
	 *
	 *   dst          destination buffer, writable for sizeInBytes bytes.
	 *   src          source buffer, readable for sizeInBytes bytes.
	 *   sizeInBytes  number of bytes to copy; 0 is a no-op.
	 *
	 * Only bytes inside [src, src + sizeInBytes) are read and only bytes inside
	 * [dst, dst + sizeInBytes) are written. The copy runs front to back in chunks,
	 * so overlapping regions where dst lies inside the source range are not
	 * supported.
	 *
	 * Strategy: if both pointers are already vector aligned, whole 64-byte blocks
	 * are moved with aligned loads and non-temporal stores (AVX when
	 * AX_SUPPORT_AVX2 is defined, SSE2 when AX_SUPPORT_SSE2 is defined). The rest
	 * is copied with aligned 64-bit words when src and dst share the same offset
	 * within an 8-byte word, and byte by byte otherwise. Small sizes deliberately
	 * use the scalar loops: a full-width vector load/blend/store would access
	 * memory past the end of both buffers.
	 *
	 * `static` gives every translation unit its own linkable definition.
	 */
	static inline void MemCpy(void* dst, const void* src, uint64_t sizeInBytes)
	{
	    unsigned char* out = (unsigned char*)dst;
	    const unsigned char* in = (const unsigned char*)src;

	#ifdef AX_SUPPORT_AVX2
	    if (sizeInBytes >= 2 * sizeof(__m256i)
	        && (((uintptr_t)out | (uintptr_t)in) & 31u) == 0) /* both 32-byte aligned? */
	    {
	        __m256i* vd = (__m256i*)(void*)out;
	        const __m256i* vs = (const __m256i*)(const void*)in;

	        while (sizeInBytes >= 2 * sizeof(__m256i))
	        {
	            _mm256_stream_si256(vd++, _mm256_load_si256(vs++));
	            _mm256_stream_si256(vd++, _mm256_load_si256(vs++));
	            sizeInBytes -= 2 * sizeof(__m256i);
	        }
	        _mm_sfence(); /* order the streaming stores before any later store */
	        out = (unsigned char*)vd; /* both cursors move on so the tail copies the right bytes */
	        in = (const unsigned char*)vs;
	    }
	#endif
	#ifdef AX_SUPPORT_SSE2
	    /* Not taken after the AVX loop above: fewer than 64 bytes remain by then. */
	    if (sizeInBytes >= 4 * sizeof(__m128i)
	        && (((uintptr_t)out | (uintptr_t)in) & 15u) == 0) /* both 16-byte aligned? */
	    {
	        __m128i* vd = (__m128i*)(void*)out;
	        const __m128i* vs = (const __m128i*)(const void*)in;

	        while (sizeInBytes >= 4 * sizeof(__m128i))
	        {
	            _mm_stream_si128(vd++, _mm_load_si128(vs++));
	            _mm_stream_si128(vd++, _mm_load_si128(vs++));
	            _mm_stream_si128(vd++, _mm_load_si128(vs++));
	            _mm_stream_si128(vd++, _mm_load_si128(vs++));
	            sizeInBytes -= 4 * sizeof(__m128i);
	        }
	        _mm_sfence(); /* order the streaming stores before any later store */
	        out = (unsigned char*)vd;
	        in = (const unsigned char*)vs;
	    }
	#endif

	    /* Word copy is only possible when one byte head can align both cursors at once. */
	    if ((((uintptr_t)out ^ (uintptr_t)in) & 7u) == 0)
	    {
	        /* Head: single bytes until both cursors are 8-byte aligned. */
	        while (sizeInBytes != 0 && ((uintptr_t)out & 7u) != 0)
	        {
	            *out++ = *in++;
	            sizeInBytes--;
	        }

	        /* Body: four 64-bit words per iteration. */
	        if (sizeInBytes >= 4 * sizeof(uint64_t))
	        {
	            uint64_t* wd = (uint64_t*)(void*)out;
	            const uint64_t* ws = (const uint64_t*)(const void*)in;

	            while (sizeInBytes >= 4 * sizeof(uint64_t))
	            {
	                wd[0] = ws[0];
	                wd[1] = ws[1];
	                wd[2] = ws[2];
	                wd[3] = ws[3];
	                wd += 4;
	                ws += 4;
	                sizeInBytes -= 4 * sizeof(uint64_t);
	            }
	            out = (unsigned char*)wd;
	            in = (const unsigned char*)ws;
	        }
	    }

	    /* Tail (or the whole copy when the alignments differ): one byte at a time. */
	    while (sizeInBytes != 0)
	    {
	        *out++ = *in++;
	        sizeInBytes--;
	    }
	}