Skip to content

Instantly share code, notes, and snippets.

@benanil
Last active June 12, 2023 15:51
Show Gist options
  • Save benanil/78ad3600f5e10b9a3f6173afc8565352 to your computer and use it in GitHub Desktop.
Save benanil/78ad3600f5e10b9a3f6173afc8565352 to your computer and use it in GitHub Desktop.
SIMD-optimized memcpy and memset
// AXGLOBALCONST: qualifier prefix for constant tables defined at file scope.
//   - MSVC:                     extern const __declspec(selectany)
//   - GCC-compatible, non-MinGW: extern const __attribute__((weak))
//   - anything else:            expands to nothing
// It can be pre-defined by the includer, in which case this block is skipped.
// NOTE(review): with the empty fallback, a definition written with this macro
// is a plain, non-const global with external linkage - confirm that is intended.
#ifndef AXGLOBALCONST
# if _MSC_VER
# define AXGLOBALCONST extern const __declspec(selectany)
# elif defined(__GNUC__) && !defined(__MINGW32__)
# define AXGLOBALCONST extern const __attribute__((weak))
# else
# define AXGLOBALCONST
# endif
#endif
// AX_COMPILER_HAS_BUILTIN(x): forwards to __has_builtin(x) when the compiler
// provides it, otherwise always evaluates to 0.
#if defined(__has_builtin)
# define AX_COMPILER_HAS_BUILTIN(x) __has_builtin(x)
#else
# define AX_COMPILER_HAS_BUILTIN(x) 0
#endif
// AX_PREFETCH(x): prefetch hint for address x.
//   - __builtin_prefetch(x) when that builtin is reported as available,
//   - _mm_prefetch(x, _MM_HINT_NTA) on MSVC,
//   - otherwise expands to nothing.
#if AX_COMPILER_HAS_BUILTIN(__builtin_prefetch)
# define AX_PREFETCH(x) __builtin_prefetch(x)
#elif defined(_MSC_VER)
# define AX_PREFETCH(x) _mm_prefetch(x, _MM_HINT_NTA)
#else
# define AX_PREFETCH(x)
#endif
/* Architecture Detection */
// Detection code taken from miniaudio.
// Define AX_NO_SSE2 or AX_NO_AVX2 before this point to disable those extensions.
// Exactly one of AX_X64 / AX_X86 / AX_ARM is defined when the target is
// recognised; none of them is defined for any other architecture.
#if defined(__x86_64__) || defined(_M_X64)
# define AX_X64
#elif defined(__i386) || defined(_M_IX86)
# define AX_X86
#elif defined(__arm__) || defined(_M_ARM) || defined(__arm64) || defined(__arm64__) || defined(__aarch64__) || defined(_M_ARM64)
# define AX_ARM
#endif
/* Intrinsics Support */
// Only x86 / x64 targets are considered, and Cosmopolitan builds are excluded.
// Result: AX_SUPPORT_SSE2 and/or AX_SUPPORT_AVX2 are defined (as empty macros)
// and the matching intrinsics header is included.
#if (defined(AX_X64) || defined(AX_X86)) && !defined(__COSMOPOLITAN__)
// MSVC (not clang-cl): support is decided from the compiler version alone.
// NOTE(review): this branch does not look at any target-CPU/arch option.
#if defined(_MSC_VER) && !defined(__clang__)
#if _MSC_VER >= 1400 && !defined(AX_NO_SSE2) /* 2005 */
#define AX_SUPPORT_SSE2
#endif
#if _MSC_VER >= 1700 && !defined(AX_NO_AVX2) /* 2012 */
#define AX_SUPPORT_AVX2
#endif
#else
// Other compilers: rely on the __SSE2__ / __AVX2__ predefined macros.
#if defined(__SSE2__) && !defined(AX_NO_SSE2)
#define AX_SUPPORT_SSE2
#endif
#if defined(__AVX2__) && !defined(AX_NO_AVX2)
#define AX_SUPPORT_AVX2
#endif
#endif
/* If at this point we still haven't determined compiler support for the intrinsics, fall back to __has_include. */
// This fallback is skipped for GCC and clang; for other compilers the mere
// presence of the header enables the corresponding AX_SUPPORT_* macro.
#if !defined(__GNUC__) && !defined(__clang__) && defined(__has_include)
#if !defined(AX_SUPPORT_SSE2) && !defined(AX_NO_SSE2) && __has_include(<emmintrin.h>)
#define AX_SUPPORT_SSE2
#endif
#if !defined(AX_SUPPORT_AVX2) && !defined(AX_NO_AVX2) && __has_include(<immintrin.h>)
#define AX_SUPPORT_AVX2
#endif
#endif
// Pull in the widest intrinsics header that was enabled above.
// NOTE(review): AX_SUPPORT_AVX is not defined anywhere in the visible part of
// this file; it only has an effect if the includer defines it.
#if defined(AX_SUPPORT_AVX2) || defined(AX_SUPPORT_AVX)
#include <immintrin.h>
#elif defined(AX_SUPPORT_SSE2)
#include <emmintrin.h>
#endif
#endif
#include <stdint.h>
// Byte-mask lookup tables: a run of 0xFF bytes followed by an equally long run
// of 0x00 bytes (32 + 32 bytes for the 256-bit table, 16 + 16 bytes for the
// 128-bit one). Reading one vector from the table at a byte offset yields a
// mask whose leading bytes are set and whose trailing bytes are clear.
//
// AX_SUPPORT_* are defined as *empty* macros, so they must be tested with
// defined(); a bare `#elif AX_SUPPORT_SSE2` expands to `#elif` with no
// expression and fails to preprocess on SSE2-only builds.
#if defined(AX_SUPPORT_AVX2)
AXGLOBALCONST uint64_t g_256MemMask[8] = { ~0ull, ~0ull, ~0ull, ~0ull, 0ull, 0ull, 0ull, 0ull };
#elif defined(AX_SUPPORT_SSE2)
AXGLOBALCONST uint32_t g_128MemMask[8] = { ~0u, ~0u, ~0u, ~0u, 0u, 0u, 0u, 0u };
#endif
// Fills the first `sizeInBytes` bytes starting at `dst` with the byte `val`.
//
//   dst         - destination buffer; must be writable for sizeInBytes bytes.
//   val         - fill value; it is converted to unsigned char first, so a
//                 negative char produces the same byte pattern memset would.
//   sizeInBytes - number of bytes to write; 0 is a no-op.
//
// Strategy (each stage consumes what it can and hands the rest to the next):
//   1. AVX2 build, dst 32-byte aligned, >= 64 bytes: non-temporal 256-bit stores.
//   2. SSE2 build, dst 16-byte aligned, >= 64 bytes: non-temporal 128-bit stores.
//   3. Scalar: byte stores up to an 8-byte boundary, then 4 x 64-bit stores per
//      iteration, then byte stores for the tail.
// Nothing is ever read from or written outside [dst, dst + sizeInBytes).
//
// `static inline` (instead of plain `inline`) gives every translation unit its
// own definition, so a non-inlined call still links when compiled as C.
static inline void MemSet(void* dst, char val, uint64_t sizeInBytes)
{
    unsigned char* out = (unsigned char*)dst;
    const unsigned char fillByte = (unsigned char)val;

#if defined(AX_SUPPORT_AVX2)
    if (sizeInBytes >= 64 && ((uintptr_t)out & 31) == 0) // is 32 byte aligned?
    {
        const __m256i fill256 = _mm256_set1_epi8(val);
        do
        {
            _mm256_stream_si256((__m256i*)out, fill256);
            _mm256_stream_si256((__m256i*)(out + 32), fill256);
            out += 64;
            sizeInBytes -= 64;
        } while (sizeInBytes >= 64);
        // Non-temporal stores are weakly ordered; fence so they are ordered
        // ahead of any store the caller issues after we return.
        _mm_sfence();
    }
#endif
#if defined(AX_SUPPORT_SSE2)
    // After the AVX2 stage fewer than 64 bytes remain, so this stage only runs
    // when the AVX2 stage was not compiled in or dst was not 32-byte aligned.
    if (sizeInBytes >= 64 && ((uintptr_t)out & 15) == 0) // is 16 byte aligned?
    {
        const __m128i fill128 = _mm_set1_epi8(val);
        do
        {
            _mm_stream_si128((__m128i*)out, fill128);
            _mm_stream_si128((__m128i*)(out + 16), fill128);
            _mm_stream_si128((__m128i*)(out + 32), fill128);
            _mm_stream_si128((__m128i*)(out + 48), fill128);
            out += 64;
            sizeInBytes -= 64;
        } while (sizeInBytes >= 64);
        _mm_sfence();
    }
#endif

    // Head: byte stores until `out` sits on an 8-byte boundary, so the word
    // stores below are never misaligned.
    while (sizeInBytes != 0 && ((uintptr_t)out & 7) != 0)
    {
        *out++ = fillByte;
        sizeInBytes--;
    }

    // Body: 32 bytes per iteration using 64-bit stores.
    if (sizeInBytes >= 32)
    {
        // Replicate the byte into all eight lanes of a 64-bit word.
        const uint64_t fillWord = (uint64_t)fillByte * 0x0101010101010101ull;
        uint64_t* words = (uint64_t*)out;
        do
        {
            words[0] = fillWord;
            words[1] = fillWord;
            words[2] = fillWord;
            words[3] = fillWord;
            words += 4;
            sizeInBytes -= 32;
        } while (sizeInBytes >= 32);
        out = (unsigned char*)words; // keep the byte cursor in sync for the tail
    }

    // Tail: whatever is left (< 32 bytes).
    while (sizeInBytes != 0)
    {
        *out++ = fillByte;
        sizeInBytes--;
    }
}
// Copies `sizeInBytes` bytes from `src` to `dst`.
//
//   dst         - destination buffer; must be writable for sizeInBytes bytes.
//   src         - source buffer; must be readable for sizeInBytes bytes.
//   sizeInBytes - number of bytes to copy; 0 is a no-op.
//
// The copy runs front to back and the SIMD stages load a whole 64-byte batch
// before storing it, so the two ranges must not overlap (same contract as
// memcpy).
//
// Strategy (each stage consumes what it can and hands the rest to the next):
//   1. AVX2 build, both pointers 32-byte aligned, >= 64 bytes:
//      aligned 256-bit loads + non-temporal 256-bit stores.
//   2. SSE2 build, both pointers 16-byte aligned, >= 64 bytes:
//      aligned 128-bit loads + non-temporal 128-bit stores.
//   3. Scalar: if both pointers share the same offset within an 8-byte word,
//      copy bytes up to the boundary and then 4 x 64-bit words per iteration.
//   4. Byte copy for the tail (and for mutually misaligned buffers).
// Nothing is ever read or written outside the two sizeInBytes-long ranges.
//
// `static inline` (instead of plain `inline`) gives every translation unit its
// own definition, so a non-inlined call still links when compiled as C.
static inline void MemCpy(void* dst, const void* src, uint64_t sizeInBytes)
{
    unsigned char* out = (unsigned char*)dst;
    const unsigned char* in = (const unsigned char*)src;

#if defined(AX_SUPPORT_AVX2)
    if (sizeInBytes >= 64 && (((uintptr_t)out | (uintptr_t)in) & 31) == 0) // is 32 byte aligned?
    {
        do
        {
            const __m256i lo = _mm256_load_si256((const __m256i*)in);
            const __m256i hi = _mm256_load_si256((const __m256i*)(in + 32));
            _mm256_stream_si256((__m256i*)out, lo);
            _mm256_stream_si256((__m256i*)(out + 32), hi);
            in += 64;
            out += 64;
            sizeInBytes -= 64;
        } while (sizeInBytes >= 64);
        // Non-temporal stores are weakly ordered; fence so they are ordered
        // ahead of any store the caller issues after we return.
        _mm_sfence();
    }
#endif
#if defined(AX_SUPPORT_SSE2)
    // After the AVX2 stage fewer than 64 bytes remain, so this stage only runs
    // when the AVX2 stage was not compiled in or the pointers were not
    // 32-byte aligned.
    if (sizeInBytes >= 64 && (((uintptr_t)out | (uintptr_t)in) & 15) == 0) // is 16 byte aligned?
    {
        do
        {
            const __m128i v0 = _mm_load_si128((const __m128i*)in);
            const __m128i v1 = _mm_load_si128((const __m128i*)(in + 16));
            const __m128i v2 = _mm_load_si128((const __m128i*)(in + 32));
            const __m128i v3 = _mm_load_si128((const __m128i*)(in + 48));
            _mm_stream_si128((__m128i*)out, v0);
            _mm_stream_si128((__m128i*)(out + 16), v1);
            _mm_stream_si128((__m128i*)(out + 32), v2);
            _mm_stream_si128((__m128i*)(out + 48), v3);
            in += 64;
            out += 64;
            sizeInBytes -= 64;
        } while (sizeInBytes >= 64);
        _mm_sfence();
    }
#endif

    // Word copies are only legal when both cursors can be 8-byte aligned at the
    // same time, i.e. when they have the same offset inside an 8-byte word.
    if ((((uintptr_t)out ^ (uintptr_t)in) & 7) == 0)
    {
        // Head: byte copies up to the shared 8-byte boundary.
        while (sizeInBytes != 0 && ((uintptr_t)out & 7) != 0)
        {
            *out++ = *in++;
            sizeInBytes--;
        }

        // Body: 32 bytes per iteration using 64-bit loads and stores.
        if (sizeInBytes >= 32)
        {
            uint64_t* dstWords = (uint64_t*)out;
            const uint64_t* srcWords = (const uint64_t*)in;
            do
            {
                dstWords[0] = srcWords[0];
                dstWords[1] = srcWords[1];
                dstWords[2] = srcWords[2];
                dstWords[3] = srcWords[3];
                dstWords += 4;
                srcWords += 4;
                sizeInBytes -= 32;
            } while (sizeInBytes >= 32);
            // Keep BOTH byte cursors in sync for the tail copy.
            out = (unsigned char*)dstWords;
            in = (const unsigned char*)srcWords;
        }
    }

    // Tail, or the whole copy when the buffers are mutually misaligned.
    while (sizeInBytes != 0)
    {
        *out++ = *in++;
        sizeInBytes--;
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment