SIMD-optimized memcpy and memset
#ifndef AXGLOBALCONST
# if _MSC_VER
#  define AXGLOBALCONST extern const __declspec(selectany)
# elif defined(__GNUC__) && !defined(__MINGW32__)
#  define AXGLOBALCONST extern const __attribute__((weak))
# else
#  define AXGLOBALCONST
# endif
#endif

#if defined(__has_builtin)
# define AX_COMPILER_HAS_BUILTIN(x) __has_builtin(x)
#else
# define AX_COMPILER_HAS_BUILTIN(x) 0
#endif

#if AX_COMPILER_HAS_BUILTIN(__builtin_prefetch)
# define AX_PREFETCH(x) __builtin_prefetch(x)
#elif defined(_MSC_VER)
# define AX_PREFETCH(x) _mm_prefetch(x, _MM_HINT_NTA)
#else
# define AX_PREFETCH(x)
#endif
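
/* Example (hypothetical call site, not used below): prefetch a line ahead inside
   a hot loop, e.g. AX_PREFETCH((const char*)src + 256); the macro expands to
   nothing on compilers without prefetch support. */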
/* Architecture Detection */
// detection code adapted from miniaudio
// define AX_NO_SSE2 or AX_NO_AVX2 to disable these extensions
#if defined(__x86_64__) || defined(_M_X64)
# define AX_X64
#elif defined(__i386) || defined(_M_IX86)
# define AX_X86
#elif defined(__arm__) || defined(_M_ARM) || defined(__arm64) || defined(__arm64__) || defined(__aarch64__) || defined(_M_ARM64)
# define AX_ARM
#endif
/* Intrinsics Support */
#if (defined(AX_X64) || defined(AX_X86)) && !defined(__COSMOPOLITAN__)
    #if defined(_MSC_VER) && !defined(__clang__)
        #if _MSC_VER >= 1400 && !defined(AX_NO_SSE2)   /* 2005 */
            #define AX_SUPPORT_SSE2
        #endif
        #if _MSC_VER >= 1700 && !defined(AX_NO_AVX2)   /* 2012 */
            #define AX_SUPPORT_AVX2
        #endif
    #else
        #if defined(__SSE2__) && !defined(AX_NO_SSE2)
            #define AX_SUPPORT_SSE2
        #endif
        #if defined(__AVX2__) && !defined(AX_NO_AVX2)
            #define AX_SUPPORT_AVX2
        #endif
    #endif

    /* If at this point we still haven't determined compiler support for the intrinsics, just fall back to __has_include. */
    #if !defined(__GNUC__) && !defined(__clang__) && defined(__has_include)
        #if !defined(AX_SUPPORT_SSE2) && !defined(AX_NO_SSE2) && __has_include(<emmintrin.h>)
            #define AX_SUPPORT_SSE2
        #endif
        #if !defined(AX_SUPPORT_AVX2) && !defined(AX_NO_AVX2) && __has_include(<immintrin.h>)
            #define AX_SUPPORT_AVX2
        #endif
    #endif

    #if defined(AX_SUPPORT_AVX2) || defined(AX_SUPPORT_AVX)
        #include <immintrin.h>
    #elif defined(AX_SUPPORT_SSE2)
        #include <emmintrin.h>
    #endif
#endif

#include <stdint.h>
#ifdef AX_SUPPORT_AVX2
// first 32 bytes are all ones, last 32 bytes are zero
AXGLOBALCONST uint64_t g_256MemMask[8] = { ~0ull, ~0ull, ~0ull, ~0ull, 0ull, 0ull, 0ull, 0ull };
#elif defined(AX_SUPPORT_SSE2)
// first 16 bytes are all ones, last 16 bytes are zero
AXGLOBALCONST uint32_t g_128MemMask[8] = { ~0u, ~0u, ~0u, ~0u, 0u, 0u, 0u, 0u };
#endif
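
/* How the mask tables are used (added explanation): each table is all ones in
   its first half and all zeros in its second half. Loading a full vector at byte
   offset (vectorWidth - n) yields a mask whose first n bytes are 0xFF, so a byte
   blend through it copies exactly n bytes and re-stores the rest of the window
   unchanged.
   e.g. n = 5 with the 32-byte table: load at offset 27 -> FF FF FF FF FF 00 ... 00 */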
inline void MemSet(void* dst, char val, uint64_t sizeInBytes)
{
    uint64_t* dp = (uint64_t*)dst;
    // broadcast the byte into all 8 lanes of a 64-bit word for the scalar fallback
    uint64_t d4 = (uint64_t)(unsigned char)val * 0x0101010101010101ull;
#ifdef AX_SUPPORT_SSE2
    const __m128i v4 = _mm_set1_epi8(val);
#endif
#ifdef AX_SUPPORT_AVX2
    const __m256i v8 = _mm256_set1_epi8(val);

    if (sizeInBytes <= 32)
    {
        // blend val into the current contents through a shifted byte mask.
        // note: this reads and writes a full 32-byte window at dst. (todo: consider __stosb here)
        const __m256i mask = _mm256_loadu_si256((const __m256i*)((const char*)g_256MemMask + (32 - sizeInBytes)));
        const __m256i cur  = _mm256_loadu_si256((const __m256i*)dst);
        _mm256_storeu_si256((__m256i*)dst, _mm256_blendv_epi8(cur, v8, mask));
        return;
    }
    if (!((uint64_t)dst & 31)) // is dst 32 byte aligned?
    {
        __m256i* dv = (__m256i*)dp;
        dp += (sizeInBytes / (sizeof(__m256i) * 2)) * 8; // skip dp past the bytes the stream loop covers
        while (sizeInBytes >= (sizeof(__m256i) * 2))
        {
            _mm256_stream_si256(dv++, v8);
            _mm256_stream_si256(dv++, v8);
            sizeInBytes -= sizeof(__m256i) * 2;
        }
        _mm_sfence(); // order the non-temporal stores before returning
    }
    else
#endif
#if defined(AX_SUPPORT_SSE2)
#ifndef AX_SUPPORT_AVX2
    if (sizeInBytes <= 16)
    {
        // SSE2 has no blendv, so emulate it with and/andnot/or.
        // note: this reads and writes a full 16-byte window at dst. (todo: consider __stosb here)
        const __m128i mask = _mm_loadu_si128((const __m128i*)((const char*)g_128MemMask + (16 - sizeInBytes)));
        const __m128i cur  = _mm_loadu_si128((const __m128i*)dst);
        _mm_storeu_si128((__m128i*)dst, _mm_or_si128(_mm_and_si128(mask, v4), _mm_andnot_si128(mask, cur)));
        return;
    }
#endif
    if (!((uint64_t)dst & 15)) // is dst 16 byte aligned?
    {
        __m128i* dv = (__m128i*)dp;
        dp += (sizeInBytes / (sizeof(__m128i) * 4)) * 8;
        while (sizeInBytes >= (sizeof(__m128i) * 4))
        {
            _mm_stream_si128(dv++, v4);
            _mm_stream_si128(dv++, v4);
            _mm_stream_si128(dv++, v4);
            _mm_stream_si128(dv++, v4);
            sizeInBytes -= sizeof(__m128i) * 4;
        }
        _mm_sfence();
    }
    else
#endif
    while (sizeInBytes >= (sizeof(uint64_t) * 4))
    {
        dp[0] = dp[1] = dp[2] = dp[3] = d4;
        dp += 4;
        sizeInBytes -= sizeof(uint64_t) * 4;
    }
    char* dcp = (char*)dp; // dp already points at the unwritten tail
    while (sizeInBytes)
    {
        *dcp++ = val;
        sizeInBytes--;
    }
}
inline void MemCpy(void* dst, const void* src, uint64_t sizeInBytes)
{
    uint64_t* dp = (uint64_t*)dst;
    const uint64_t* sp = (const uint64_t*)src;
#ifdef AX_SUPPORT_AVX2
    if (sizeInBytes <= 32) // if the data is smaller than 32 bytes we copy it without a loop (simd)
    {
        // blend src into dst through a shifted byte mask; the first sizeInBytes mask bytes are set.
        // note: this reads and writes a full 32-byte window at both pointers. (todo: consider __movsb here)
        const __m256i mask = _mm256_loadu_si256((const __m256i*)((const char*)g_256MemMask + (32 - sizeInBytes)));
        const __m256i s = _mm256_loadu_si256((const __m256i*)src);
        const __m256i d = _mm256_loadu_si256((const __m256i*)dst);
        _mm256_storeu_si256((__m256i*)dst, _mm256_blendv_epi8(d, s, mask));
        return;
    }
    if (!((uint64_t)src & 31) && !((uint64_t)dst & 31)) // are both 32 byte aligned?
    {
        __m256i* dv = (__m256i*)dp;
        const __m256i* sv = (const __m256i*)sp;
        dp += (sizeInBytes / (sizeof(__m256i) * 2)) * 8; // skip both past the bytes the stream loop covers
        sp += (sizeInBytes / (sizeof(__m256i) * 2)) * 8;
        while (sizeInBytes >= (sizeof(__m256i) * 2))
        {
            _mm256_stream_si256(dv++, _mm256_load_si256(sv++));
            _mm256_stream_si256(dv++, _mm256_load_si256(sv++));
            sizeInBytes -= sizeof(__m256i) * 2;
        }
        _mm_sfence(); // order the non-temporal stores before returning
    }
    else
#endif
#if defined(AX_SUPPORT_SSE2)
#ifndef AX_SUPPORT_AVX2
    if (sizeInBytes <= 16)
    {
        // SSE2 blendv emulation, same masking idea as above.
        // note: this reads and writes a full 16-byte window at both pointers. (todo: consider __movsb here)
        const __m128i mask = _mm_loadu_si128((const __m128i*)((const char*)g_128MemMask + (16 - sizeInBytes)));
        const __m128i s = _mm_loadu_si128((const __m128i*)src);
        const __m128i d = _mm_loadu_si128((const __m128i*)dst);
        _mm_storeu_si128((__m128i*)dst, _mm_or_si128(_mm_and_si128(mask, s), _mm_andnot_si128(mask, d)));
        return;
    }
#endif
    if (!((uint64_t)src & 15) && !((uint64_t)dst & 15)) // are both 16 byte aligned?
    {
        __m128i* dv = (__m128i*)dst;
        const __m128i* sv = (const __m128i*)src;
        dp += (sizeInBytes / (sizeof(__m128i) * 4)) * 8;
        sp += (sizeInBytes / (sizeof(__m128i) * 4)) * 8;
        while (sizeInBytes >= (sizeof(__m128i) * 4))
        {
            _mm_stream_si128(dv++, _mm_load_si128(sv++));
            _mm_stream_si128(dv++, _mm_load_si128(sv++));
            _mm_stream_si128(dv++, _mm_load_si128(sv++));
            _mm_stream_si128(dv++, _mm_load_si128(sv++));
            sizeInBytes -= sizeof(__m128i) * 4;
        }
        _mm_sfence();
    }
    else
#endif
    while (sizeInBytes >= (sizeof(uint64_t) * 4))
    {
        dp[0] = sp[0];
        dp[1] = sp[1];
        dp[2] = sp[2];
        dp[3] = sp[3];
        dp += 4; sp += 4;
        sizeInBytes -= sizeof(uint64_t) * 4;
    }
    char* dcp = (char*)dp;
    const char* scp = (const char*)sp;
    while (sizeInBytes)
    {
        *dcp++ = *scp++;
        sizeInBytes--;
    }
}
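
A minimal sanity harness for both routines, as a sketch: it assumes the code above is in the same translation unit (or saved as a header and included), and the buffers carry extra slack because the small-size paths read and write a full 16/32-byte window.

#include <string.h>
#include <stdio.h>
#include <stdlib.h>

int main(void)
{
    // 64 bytes of slack so the masked vector windows stay in bounds
    char a[256 + 64], b[256 + 64], c[256 + 64];
    for (uint64_t n = 0; n <= 256; ++n)
    {
        // seed both destinations identically so untouched bytes must match
        for (int i = 0; i < 256 + 64; ++i) a[i] = b[i] = (char)rand();
        MemSet(a, 0x7A, n);
        memset(b, 0x7A, n);
        if (memcmp(a, b, sizeof(a)) != 0) { printf("MemSet mismatch at n=%llu\n", (unsigned long long)n); return 1; }

        for (int i = 0; i < 256 + 64; ++i) c[i] = (char)i;
        MemCpy(a, c, n);
        memcpy(b, c, n);
        if (memcmp(a, b, sizeof(a)) != 0) { printf("MemCpy mismatch at n=%llu\n", (unsigned long long)n); return 1; }
    }
    puts("ok");
    return 0;
}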