A fast memory copy with a slew of optimizations
// The following is an extremely optimized memcpy with several optimizations. | |
// | |
// * SSE2 for 16-byte copies, unrolled up to 128 bytes. Blocks larger than
// 128 bytes are processed 128 bytes at a time. Copies larger than the L2
// cache size use streaming stores to avoid polluting the cache. Prefetch
// hints are used for large copies, prefetching 256 bytes ahead.
// | |
// * AVX for 32-byte copies, unrolled up to 256 bytes. Blocks larger than
// 256 bytes are processed 256 bytes at a time. Copies larger than the L2
// cache size use streaming stores to avoid polluting the cache. Prefetch
// hints are used for large copies, prefetching 512 bytes ahead.
// | |
// * Scalar loads and stores of up to 4 bytes on 32-bit and 8 bytes on 64-bit
// systems are used for general non-vector copies and remainder bytes,
// unrolled to 16 bytes at a time on 64-bit systems and 8 bytes at a time on
// 32-bit systems.
// | |
// * Where possible, aligned loads and stores are used by both the scalar and
// vector routes for every possible copy size, leading to a combinatorial
// explosion of code size as everything is heavily specialized by both source
// and destination alignment, from as high as 256-byte alignment (for AVX)
// down to 2-byte alignment (Uint16 scalar).
// | |
// * Where possible, on systems that allow unaligned loads and stores of
// words, the unaligned routines here use them directly, avoiding the need to
// assemble words with shifts and masks. Shifts and masks are generally only
// faster on systems with dedicated barrel-shifter hardware, which these
// targets lack.
// | |
// * Inlining is forced aggressively to get serial sleds of move instructions,
// avoiding jump instructions as much as possible, especially within the
// massive dispatch switch tables. Similarly, the large kernel functions which
// exhaust all vector registers are explicitly marked to prevent inlining so
// that call instructions are always generated instead, as inlining them leads
// to a lot of register spilling, ruining the kernel's performance.
// | |
// * Tables for function dispatch are used instead of a bunch of conditional
// branches, both to make use of conditional move instructions on systems that
// support them and because it keeps a lot of cold code out of the dispatch.
// | |
// * An unaligned destination in the scalar code is handled by performing a
// very small unaligned byte copy until alignment is reached, tracking the
// unaligned byte(s) in an appropriately-sized word for the word copies and
// interleaving shifts and ors for those bytes. This optimization turns out to
// pipeline well on scalar out-of-order processors, but applying the same
// optimization to vector code is worse as it involves a lot of pack and
// unpack instructions which penalize vector pipelining, so an unaligned
// destination is handled with unaligned vector instructions instead, with the
// 15-byte remainder done with scalar copies.
// | |
// * The alignment of the source and destination pointers is calculated by
// counting the number of trailing zero bits in the pointer; this count is
// log2(alignment) and is used to index the alignment dispatch tables.
// Computing it takes a single native bit-counting instruction where
// available; on systems that lack one it becomes a very fast constant-time
// lookup within a De Bruijn sequence table with a constant multiplier, rather
// than using conditional branches or moves.
// | |
// * On modern x86 the use of "rep movsb" is faster for small to medium-sized
// copies. For large copies it tends to trash the L2 cache and is slower than
// SSE2 or AVX with streaming stores. The unrolled scalar 16-byte => ... =>
// 2-byte copy optimizes to "rep movsb" in GCC and Clang at all optimization
// levels (including debug) when conditionally guarded for N <= 32, which it
// is, so the optimization is not explicitly done here.
// | |
// * A lot of strict aliasing is violated here to allow for word copies. This
// strictly invokes undefined behavior, however with the use of compiler
// intrinsics it's possible to make it implementation-defined behavior. This
// is done aggressively and carefully to avoid strict-aliasing optimizations
// from miscompiling this code. Look for RX_HINT_MAY_ALIAS.
// | |
// * Since copies cannot overlap (that's what "memmove" is for), restrict | |
// pointers are used aggressively as well to avoid unnecessary generation of | |
// additional load instructions in the scalar code. | |
#include "rx/core/assert.h" // RX_ASSERT | |
#include "rx/core/config.h" // RX_{ARCHITECTURE,COMPILLER}_* | |
#include "rx/core/hints/force_inline.h" // RX_HINT_FORCE_INLINE | |
#include "rx/core/hints/no_inline.h" // RX_HINT_NO_INLINE | |
#include "rx/core/hints/may_alias.h" // RX_HINT_MAY_ALIAS | |
#include "rx/core/hints/restrict.h" // RX_HINT_RESTRICT | |
#include "rx/core/utility/bit.h" // bit_search_lsb | |
#include "rx/core/algorithm/min.h" // Algorithm::min | |
// We do not support x86 without SSE2 support. | |
#if defined(RX_ARCHITECTURE_AMD64) || defined(RX_ARCHITECTURE_X86) | |
#define USE_SSE2 | |
// #define USE_AVX | |
#define ALLOW_UNALIGNED_SCALAR_LOAD | |
#define ALLOW_UNALIGNED_SCALAR_STORE | |
#elif defined(RX_ARCHITECTURE_AARCH64) | |
#define ALLOW_UNALIGNED_SCALAR_LOAD | |
#define ALLOW_UNALIGNED_SCALAR_STORE | |
#elif defined(RX_ARCHITECTURE_WASM32) | |
// Emscripten supports SSE2 on WASM32, but WASM32 is extremely slow when you
// use unaligned scalar loads and stores, since memory is implemented in
// browsers with a typed array and several typed-array views, which causes
// tier-up to generate a ton of shifts and masking, so try to avoid that.
#define USE_SSE2 | |
#endif | |
#if defined(USE_SSE2) | |
#include <emmintrin.h> | |
#endif // defined(USE_SSE2) | |
#if defined(USE_AVX) | |
#include <immintrin.h> | |
#endif // defined(USE_AVX) | |
namespace Rx::Memory { | |
// The unrolled small copy size to use depending on what features are selected. | |
#if defined(USE_AVX) | |
static inline constexpr const Size SMALL_SIZE = 256; | |
#elif defined(USE_SSE2) | |
static inline constexpr const Size SMALL_SIZE = 128; | |
#else | |
static inline constexpr const Size SMALL_SIZE = 64; | |
#endif | |
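// Helper functions to load aligned quantities.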
RX_HINT_FORCE_INLINE static Uint16 loada16(const Byte *RX_HINT_RESTRICT _src) { | |
typedef Uint16 RX_HINT_MAY_ALIAS Word; | |
return *reinterpret_cast<const Word*>(_src); | |
} | |
RX_HINT_FORCE_INLINE static Uint32 loada32(const Byte* RX_HINT_RESTRICT _src) { | |
typedef Uint32 RX_HINT_MAY_ALIAS Word; | |
return *reinterpret_cast<const Word*>(_src); | |
} | |
RX_HINT_FORCE_INLINE static Uint64 loada64(const Byte* RX_HINT_RESTRICT _src) { | |
typedef Uint64 RX_HINT_MAY_ALIAS Word; | |
return *reinterpret_cast<const Word*>(_src); | |
} | |
// Helper functions to load unaligned quantities.
RX_HINT_FORCE_INLINE static Uint16 loadu16(const Byte *RX_HINT_RESTRICT _src) { | |
#if defined(ALLOW_UNALIGNED_SCALAR_LOAD) | |
return loada16(_src); | |
#else | |
// Assemble the value through its in-memory representation so the result
// matches an aligned load of the same bytes regardless of endianness.
Uint16 word;
const auto bytes = reinterpret_cast<Byte*>(&word);
bytes[0] = _src[0];
bytes[1] = _src[1];
return word;
#endif // defined(ALLOW_UNALIGNED_SCALAR_LOAD) | |
} | |
RX_HINT_FORCE_INLINE static Uint32 loadu32(const Byte* RX_HINT_RESTRICT _src) { | |
#if defined(ALLOW_UNALIGNED_SCALAR_LOAD) | |
return loada32(_src); | |
#else | |
// Assemble the value through its in-memory representation so the result
// matches an aligned load of the same bytes regardless of endianness.
Uint32 word;
const auto bytes = reinterpret_cast<Byte*>(&word);
bytes[0] = _src[0];
bytes[1] = _src[1];
bytes[2] = _src[2];
bytes[3] = _src[3];
return word;
#endif // defined(ALLOW_UNALIGNED_SCALAR_LOAD) | |
} | |
RX_HINT_FORCE_INLINE static Uint64 loadu64(const Byte* RX_HINT_RESTRICT _src) { | |
#if defined(ALLOW_UNALIGNED_SCALAR_LOAD) | |
return loada64(_src); | |
#else | |
// Assemble the value through its in-memory representation so the result
// matches an aligned load of the same bytes regardless of endianness.
Uint64 word;
const auto bytes = reinterpret_cast<Byte*>(&word);
bytes[0] = _src[0];
bytes[1] = _src[1];
bytes[2] = _src[2];
bytes[3] = _src[3];
bytes[4] = _src[4];
bytes[5] = _src[5];
bytes[6] = _src[6];
bytes[7] = _src[7];
return word;
#endif // defined(ALLOW_UNALIGNED_SCALAR_LOAD) | |
} | |
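// Helper functions to store aligned quantities.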
RX_HINT_FORCE_INLINE static void storea16(Byte *RX_HINT_RESTRICT dst_, Uint16 _src) { | |
typedef Uint16 RX_HINT_MAY_ALIAS Word; | |
*reinterpret_cast<Word*>(dst_) = _src; | |
} | |
RX_HINT_FORCE_INLINE static void storea32(Byte *RX_HINT_RESTRICT dst_, Uint32 _src) { | |
typedef Uint32 RX_HINT_MAY_ALIAS Word; | |
*reinterpret_cast<Word*>(dst_) = _src; | |
} | |
RX_HINT_FORCE_INLINE static void storea64(Byte *RX_HINT_RESTRICT dst_, Uint64 _src) { | |
typedef Uint64 RX_HINT_MAY_ALIAS Word; | |
*reinterpret_cast<Word*>(dst_) = _src; | |
} | |
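// Helper functions to store unaligned quantities.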
RX_HINT_FORCE_INLINE static void storeu16(Byte *RX_HINT_RESTRICT dst_, Uint16 _src) { | |
#if defined(ALLOW_UNALIGNED_SCALAR_STORE) | |
storea16(dst_, _src); | |
#else | |
const auto src = reinterpret_cast<const Byte*>(&_src); | |
dst_[0] = src[0]; | |
dst_[1] = src[1];
#endif // defined(ALLOW_UNALIGNED_SCALAR_STORE) | |
} | |
RX_HINT_FORCE_INLINE static void storeu32(Byte *RX_HINT_RESTRICT dst_, Uint32 _src) { | |
#if defined(ALLOW_UNALIGNED_SCALAR_STORE) | |
storea32(dst_, _src); | |
#else | |
const auto src = reinterpret_cast<const Byte*>(&_src); | |
dst_[0] = src[0]; | |
dst_[1] = src[1]; | |
dst_[2] = src[2]; | |
dst_[3] = src[3]; | |
#endif // defined(ALLOW_UNALIGNED_SCALAR_STORE) | |
} | |
RX_HINT_FORCE_INLINE static void storeu64(Byte *RX_HINT_RESTRICT dst_, Uint64 _src) { | |
#if defined(ALLOW_UNALIGNED_SCALAR_STORE) | |
storea64(dst_, _src); | |
#else | |
const auto src = reinterpret_cast<const Byte*>(&_src); | |
dst_[0] = src[0]; | |
dst_[1] = src[1]; | |
dst_[2] = src[2]; | |
dst_[3] = src[3]; | |
dst_[4] = src[4]; | |
dst_[5] = src[5]; | |
dst_[6] = src[6]; | |
dst_[7] = src[7]; | |
#endif // defined(ALLOW_UNALIGNED_SCALAR_STORE) | |
} | |
template<Size SRC_ALIGNMENT, Size DST_ALIGNMENT, Size E> | |
RX_HINT_FORCE_INLINE static void copy_untyped_scalar(Byte *RX_HINT_RESTRICT dst_, const Byte *RX_HINT_RESTRICT _src) { | |
static_assert(E <= 8, "too large"); | |
switch (E) { | |
case 8: // 8 | |
// On 32-bit systems the use of 8-byte load and store tends to be slower, | |
// use 32-bit load store here. This optimization cannot be done inside | |
// {store,load}{a,u}64 since those operate with words and would involve | |
// shifts and masking which would produce worse code. | |
// | |
// This is a significant performance improvement for WASM32. | |
if constexpr (sizeof(void*) == 8) { | |
if constexpr (SRC_ALIGNMENT % 8 == 0) { | |
if constexpr (DST_ALIGNMENT % 8 == 0) { | |
storea64(dst_ + 0, loada64(_src + 0)); | |
} else { | |
storeu64(dst_ + 0, loada64(_src + 0)); | |
} | |
} else { | |
if constexpr (DST_ALIGNMENT % 8 == 0) { | |
storea64(dst_ + 0, loadu64(_src + 0)); | |
} else { | |
storeu64(dst_ + 0, loadu64(_src + 0)); | |
} | |
} | |
} else { | |
if constexpr (SRC_ALIGNMENT % 4 == 0) { | |
if constexpr (DST_ALIGNMENT % 4 == 0) { | |
storea32(dst_ + 0, loada32(_src + 0)); | |
storea32(dst_ + 4, loada32(_src + 4)); | |
} else { | |
storeu32(dst_ + 0, loada32(_src + 0)); | |
storeu32(dst_ + 4, loada32(_src + 4)); | |
} | |
} else { | |
if constexpr (DST_ALIGNMENT % 4 == 0) { | |
storea32(dst_ + 0, loadu32(_src + 0)); | |
storea32(dst_ + 4, loadu32(_src + 4)); | |
} else { | |
storeu32(dst_ + 0, loadu32(_src + 0)); | |
storeu32(dst_ + 4, loadu32(_src + 4)); | |
} | |
} | |
} | |
break; | |
case 7: // 4 + 2 + 1 | |
dst_[6] = _src[6]; | |
[[fallthrough]]; | |
case 6: // 4 + 2 | |
if constexpr (SRC_ALIGNMENT % 4 == 0) { | |
if constexpr (DST_ALIGNMENT % 4 == 0) { | |
storea32(dst_ + 0, loada32(_src + 0)); | |
storea16(dst_ + 4, loada16(_src + 4)); | |
} else { | |
storeu32(dst_ + 0, loada32(_src + 0)); | |
storeu16(dst_ + 4, loada16(_src + 4)); | |
} | |
} else { | |
if constexpr (DST_ALIGNMENT % 4 == 0) {
storea32(dst_ + 0, loadu32(_src + 0)); | |
storea16(dst_ + 4, loadu16(_src + 4)); | |
} else { | |
storeu32(dst_ + 0, loadu32(_src + 0)); | |
storeu16(dst_ + 4, loadu16(_src + 4)); | |
} | |
} | |
break; | |
case 5: // 4 + 1 | |
dst_[4] = _src[4]; | |
[[fallthrough]]; | |
case 4: // 4 | |
if constexpr (SRC_ALIGNMENT % 4 == 0) { | |
if constexpr (DST_ALIGNMENT % 4 == 0) { | |
storea32(dst_ + 0, loada32(_src + 0)); | |
} else { | |
storeu32(dst_ + 0, loada32(_src + 0)); | |
} | |
} else { | |
if constexpr (DST_ALIGNMENT % 4 == 0) { | |
storea32(dst_ + 0, loadu32(_src + 0)); | |
} else { | |
storeu32(dst_ + 0, loadu32(_src + 0)); | |
} | |
} | |
break; | |
case 3: // 2 + 1 | |
dst_[2] = _src[2]; | |
[[fallthrough]]; | |
case 2: // 2 | |
if constexpr (SRC_ALIGNMENT % 2 == 0) { | |
if constexpr (DST_ALIGNMENT % 2 == 0) { | |
storea16(dst_ + 0, loada16(_src + 0)); | |
} else { | |
storeu16(dst_ + 0, loada16(_src + 0)); | |
} | |
} else { | |
if constexpr (DST_ALIGNMENT % 2 == 0) { | |
storea16(dst_ + 0, loadu16(_src + 0)); | |
} else { | |
storeu16(dst_ + 0, loadu16(_src + 0)); | |
} | |
} | |
break; | |
case 1: | |
dst_[0] = _src[0]; | |
break; | |
} | |
} | |
template<Size SRC_ALIGNMENT, Size DST_ALIGNMENT, Size E> | |
RX_HINT_FORCE_INLINE static void copy_untyped_scalar_small(Byte *RX_HINT_RESTRICT dst_, const Byte *RX_HINT_RESTRICT _src) { | |
if constexpr (E <= 8) { | |
return copy_untyped_scalar<SRC_ALIGNMENT, DST_ALIGNMENT, E>(dst_, _src); | |
} else switch (E) { | |
// Handle copies from size [9, 16] bytes with this. | |
case 16: // 8 + 8 | |
copy_untyped_scalar<SRC_ALIGNMENT, DST_ALIGNMENT, 8>(dst_ + 0, _src + 0); | |
copy_untyped_scalar<SRC_ALIGNMENT, DST_ALIGNMENT, 8>(dst_ + 8, _src + 8); | |
break; | |
case 15: // 8 + 4 + 2 + 1 | |
dst_[14] = _src[14]; | |
[[fallthrough]]; | |
case 14: // 8 + 4 + 2 | |
copy_untyped_scalar<SRC_ALIGNMENT, DST_ALIGNMENT, 8>(dst_ + 0, _src + 0); | |
copy_untyped_scalar<SRC_ALIGNMENT, DST_ALIGNMENT, 4>(dst_ + 8, _src + 8); | |
copy_untyped_scalar<SRC_ALIGNMENT, DST_ALIGNMENT, 2>(dst_ + 12, _src + 12); | |
break; | |
case 13: // 8 + 4 + 1 | |
dst_[12] = _src[12]; | |
[[fallthrough]]; | |
case 12: // 8 + 4 | |
copy_untyped_scalar<SRC_ALIGNMENT, DST_ALIGNMENT, 8>(dst_ + 0, _src + 0); | |
copy_untyped_scalar<SRC_ALIGNMENT, DST_ALIGNMENT, 4>(dst_ + 8, _src + 8); | |
break; | |
case 11: // 8 + 2 + 1 | |
dst_[10] = _src[10]; | |
[[fallthrough]]; | |
case 10: // 8 + 2 | |
copy_untyped_scalar<SRC_ALIGNMENT, DST_ALIGNMENT, 8>(dst_ + 0, _src + 0); | |
copy_untyped_scalar<SRC_ALIGNMENT, DST_ALIGNMENT, 2>(dst_ + 8, _src + 8); | |
break; | |
case 9: | |
copy_untyped_scalar<SRC_ALIGNMENT, DST_ALIGNMENT, 8>(dst_ + 0, _src + 0); | |
dst_[8] = _src[8]; | |
break; | |
} | |
} | |
template<Size SRC_ALIGNMENT, Size DST_ALIGNMENT> | |
RX_HINT_FORCE_INLINE static void copy_untyped_vector_16(Byte *RX_HINT_RESTRICT dst_, const Byte *RX_HINT_RESTRICT _src) { | |
#if defined(USE_SSE2) | |
if constexpr (SRC_ALIGNMENT % 16 == 0) { | |
if constexpr (DST_ALIGNMENT % 16 == 0) { | |
_mm_store_si128(reinterpret_cast<__m128i*>(dst_), _mm_load_si128(reinterpret_cast<const __m128i*>(_src))); | |
} else { | |
_mm_storeu_si128(reinterpret_cast<__m128i*>(dst_), _mm_load_si128(reinterpret_cast<const __m128i*>(_src))); | |
} | |
} else { | |
if constexpr (DST_ALIGNMENT % 16 == 0) { | |
_mm_store_si128(reinterpret_cast<__m128i*>(dst_), _mm_loadu_si128(reinterpret_cast<const __m128i*>(_src))); | |
} else { | |
_mm_storeu_si128(reinterpret_cast<__m128i*>(dst_), _mm_loadu_si128(reinterpret_cast<const __m128i*>(_src))); | |
} | |
} | |
#else | |
copy_untyped_scalar_small<SRC_ALIGNMENT, DST_ALIGNMENT, 16>(dst_, _src); | |
#endif | |
} | |
// Do not inline these. | |
template<Size SRC_ALIGNMENT, Size DST_ALIGNMENT> | |
RX_HINT_FORCE_INLINE static void copy_untyped_vector_32(Byte *RX_HINT_RESTRICT dst_, const Byte *RX_HINT_RESTRICT _src) { | |
#if defined(USE_AVX) | |
if constexpr (SRC_ALIGNMENT % 32 == 0) { | |
if constexpr (DST_ALIGNMENT % 32 == 0) { | |
_mm256_store_si256(reinterpret_cast<__m256i*>(dst_), _mm256_load_si256(reinterpret_cast<const __m256i*>(_src))); | |
} else { | |
_mm256_storeu_si256(reinterpret_cast<__m256i*>(dst_), _mm256_load_si256(reinterpret_cast<const __m256i*>(_src))); | |
} | |
} else { | |
if constexpr (DST_ALIGNMENT % 32 == 0) { | |
_mm256_store_si256(reinterpret_cast<__m256i*>(dst_), _mm256_loadu_si256(reinterpret_cast<const __m256i*>(_src))); | |
} else { | |
_mm256_storeu_si256(reinterpret_cast<__m256i*>(dst_), _mm256_loadu_si256(reinterpret_cast<const __m256i*>(_src))); | |
} | |
} | |
#else | |
copy_untyped_vector_16<SRC_ALIGNMENT, DST_ALIGNMENT>(dst_ + 0, _src + 0); | |
copy_untyped_vector_16<SRC_ALIGNMENT, DST_ALIGNMENT>(dst_ + 16, _src + 16); | |
#endif | |
} | |
template<Size SRC_ALIGNMENT, Size DST_ALIGNMENT> | |
RX_HINT_FORCE_INLINE static void copy_untyped_vector_64(Byte *RX_HINT_RESTRICT dst_, const Byte *RX_HINT_RESTRICT _src) { | |
copy_untyped_vector_32<SRC_ALIGNMENT, DST_ALIGNMENT>(dst_ + 0, _src + 0); | |
copy_untyped_vector_32<SRC_ALIGNMENT, DST_ALIGNMENT>(dst_ + 32, _src + 32); | |
} | |
template<Size SRC_ALIGNMENT, Size DST_ALIGNMENT> | |
RX_HINT_FORCE_INLINE static void copy_untyped_vector_128(Byte *RX_HINT_RESTRICT dst_, const Byte *RX_HINT_RESTRICT _src) { | |
copy_untyped_vector_64<SRC_ALIGNMENT, DST_ALIGNMENT>(dst_ + 0, _src + 0); | |
copy_untyped_vector_64<SRC_ALIGNMENT, DST_ALIGNMENT>(dst_ + 64, _src + 64); | |
} | |
template<Size SRC_ALIGNMENT, Size DST_ALIGNMENT> | |
RX_HINT_FORCE_INLINE static void copy_untyped_vector_256(Byte *RX_HINT_RESTRICT dst_, const Byte *RX_HINT_RESTRICT _src) { | |
copy_untyped_vector_128<SRC_ALIGNMENT, DST_ALIGNMENT>(dst_ + 0, _src + 0); | |
copy_untyped_vector_128<SRC_ALIGNMENT, DST_ALIGNMENT>(dst_ + 128, _src + 128); | |
} | |
// To reduce code size depending on SIMD selection, only emit a dispatch here for | |
// sizes up to 64 in SCALAR mode, 128 in SSE2 mode, and 256 in AVX mode. | |
template<Size SRC_ALIGNMENT, Size DST_ALIGNMENT> | |
RX_HINT_FORCE_INLINE static void copy_untyped_small(Byte *RX_HINT_RESTRICT dst_, const Byte *RX_HINT_RESTRICT _src, Size _bytes) { | |
// Handle up to 15 tail bytes with scalar small copies up to the next multiple
// suitable for vectorization. Each CASE(index, offset, ...) below covers the
// sizes [offset, index]: the shared vector body copies the first `offset`
// bytes and the scalar tail handles the remaining 0-15 bytes. The vector body
// is done first since the order tends to matter for better optimization.
#define CASE(index, offset, ...) \ | |
case index - 0: __VA_ARGS__ copy_untyped_scalar_small<SRC_ALIGNMENT, DST_ALIGNMENT, 15>(dst_ + offset, _src + offset); break; \ | |
case index - 1: __VA_ARGS__ copy_untyped_scalar_small<SRC_ALIGNMENT, DST_ALIGNMENT, 14>(dst_ + offset, _src + offset); break; \ | |
case index - 2: __VA_ARGS__ copy_untyped_scalar_small<SRC_ALIGNMENT, DST_ALIGNMENT, 13>(dst_ + offset, _src + offset); break; \ | |
case index - 3: __VA_ARGS__ copy_untyped_scalar_small<SRC_ALIGNMENT, DST_ALIGNMENT, 12>(dst_ + offset, _src + offset); break; \ | |
case index - 4: __VA_ARGS__ copy_untyped_scalar_small<SRC_ALIGNMENT, DST_ALIGNMENT, 11>(dst_ + offset, _src + offset); break; \ | |
case index - 5: __VA_ARGS__ copy_untyped_scalar_small<SRC_ALIGNMENT, DST_ALIGNMENT, 10>(dst_ + offset, _src + offset); break; \ | |
case index - 6: __VA_ARGS__ copy_untyped_scalar_small<SRC_ALIGNMENT, DST_ALIGNMENT, 9>(dst_ + offset, _src + offset); break; \ | |
case index - 7: __VA_ARGS__ copy_untyped_scalar_small<SRC_ALIGNMENT, DST_ALIGNMENT, 8>(dst_ + offset, _src + offset); break; \ | |
case index - 8: __VA_ARGS__ copy_untyped_scalar_small<SRC_ALIGNMENT, DST_ALIGNMENT, 7>(dst_ + offset, _src + offset); break; \ | |
case index - 9: __VA_ARGS__ copy_untyped_scalar_small<SRC_ALIGNMENT, DST_ALIGNMENT, 6>(dst_ + offset, _src + offset); break; \ | |
case index - 10: __VA_ARGS__ copy_untyped_scalar_small<SRC_ALIGNMENT, DST_ALIGNMENT, 5>(dst_ + offset, _src + offset); break; \ | |
case index - 11: __VA_ARGS__ copy_untyped_scalar_small<SRC_ALIGNMENT, DST_ALIGNMENT, 4>(dst_ + offset, _src + offset); break; \ | |
case index - 12: __VA_ARGS__ copy_untyped_scalar_small<SRC_ALIGNMENT, DST_ALIGNMENT, 3>(dst_ + offset, _src + offset); break; \ | |
case index - 13: __VA_ARGS__ copy_untyped_scalar_small<SRC_ALIGNMENT, DST_ALIGNMENT, 2>(dst_ + offset, _src + offset); break; \ | |
case index - 14: __VA_ARGS__ copy_untyped_scalar_small<SRC_ALIGNMENT, DST_ALIGNMENT, 1>(dst_ + offset, _src + offset); break; \ | |
case offset: __VA_ARGS__ break | |
switch (_bytes) { | |
#if defined(USE_AVX) | |
case 256: | |
copy_untyped_vector_256<SRC_ALIGNMENT, DST_ALIGNMENT>(dst_ + 0, _src + 0); | |
break; | |
CASE(255, 240, { | |
// 128 + 64 + 32 + 16 | |
copy_untyped_vector_128<SRC_ALIGNMENT, DST_ALIGNMENT>(dst_ + 0, _src + 0); | |
copy_untyped_vector_64<SRC_ALIGNMENT, DST_ALIGNMENT>(dst_ + 128, _src + 128); | |
copy_untyped_vector_32<SRC_ALIGNMENT, DST_ALIGNMENT>(dst_ + 192, _src + 192); | |
copy_untyped_vector_16<SRC_ALIGNMENT, DST_ALIGNMENT>(dst_ + 224, _src + 224); | |
}); | |
CASE(239, 224, { | |
// 128 + 64 + 32 | |
copy_untyped_vector_128<SRC_ALIGNMENT, DST_ALIGNMENT>(dst_ + 0, _src + 0); | |
copy_untyped_vector_64<SRC_ALIGNMENT, DST_ALIGNMENT>(dst_ + 128, _src + 128); | |
copy_untyped_vector_32<SRC_ALIGNMENT, DST_ALIGNMENT>(dst_ + 192, _src + 192); | |
}); | |
CASE(223, 208, { | |
// 128 + 64 + 16 | |
copy_untyped_vector_128<SRC_ALIGNMENT, DST_ALIGNMENT>(dst_ + 0, _src + 0); | |
copy_untyped_vector_64<SRC_ALIGNMENT, DST_ALIGNMENT>(dst_ + 128, _src + 128); | |
copy_untyped_vector_16<SRC_ALIGNMENT, DST_ALIGNMENT>(dst_ + 192, _src + 192); | |
}); | |
CASE(207, 192, { | |
// 128 + 64 | |
copy_untyped_vector_128<SRC_ALIGNMENT, DST_ALIGNMENT>(dst_ + 0, _src + 0); | |
copy_untyped_vector_64<SRC_ALIGNMENT, DST_ALIGNMENT>(dst_ + 128, _src + 128); | |
}); | |
CASE(191, 176, { | |
// 128 + 32 + 16 | |
copy_untyped_vector_128<SRC_ALIGNMENT, DST_ALIGNMENT>(dst_ + 0, _src + 0); | |
copy_untyped_vector_32<SRC_ALIGNMENT, DST_ALIGNMENT>(dst_ + 128, _src + 128); | |
copy_untyped_vector_16<SRC_ALIGNMENT, DST_ALIGNMENT>(dst_ + 160, _src + 160); | |
}); | |
CASE(175, 160, { | |
// 128 + 32 | |
copy_untyped_vector_128<SRC_ALIGNMENT, DST_ALIGNMENT>(dst_ + 0, _src + 0); | |
copy_untyped_vector_32<SRC_ALIGNMENT, DST_ALIGNMENT>(dst_ + 128, _src + 128); | |
}); | |
CASE(159, 144, { | |
// 128 + 16 | |
copy_untyped_vector_128<SRC_ALIGNMENT, DST_ALIGNMENT>(dst_ + 0, _src + 0); | |
copy_untyped_vector_16<SRC_ALIGNMENT, DST_ALIGNMENT>(dst_ + 128, _src + 128); | |
}); | |
CASE(143, 128, { | |
copy_untyped_vector_128<SRC_ALIGNMENT, DST_ALIGNMENT>(dst_ + 0, _src + 0); | |
}); | |
#endif // defined(USE_AVX) | |
#if defined(USE_SSE2) | |
// AVX handles cases 143 down to 128; when AVX is disabled we need the lone
// 128 case here.
#if !defined(USE_AVX) | |
case 128: | |
copy_untyped_vector_128<SRC_ALIGNMENT, DST_ALIGNMENT>(dst_ + 0, _src + 0); | |
break; | |
#endif // !defined(USE_AVX) | |
CASE(127, 112, { | |
copy_untyped_vector_64<SRC_ALIGNMENT, DST_ALIGNMENT>(dst_ + 0, _src + 0); | |
copy_untyped_vector_32<SRC_ALIGNMENT, DST_ALIGNMENT>(dst_ + 64, _src + 64); | |
copy_untyped_vector_16<SRC_ALIGNMENT, DST_ALIGNMENT>(dst_ + 96, _src + 96); | |
}); | |
CASE(111, 96, { | |
copy_untyped_vector_64<SRC_ALIGNMENT, DST_ALIGNMENT>(dst_ + 0, _src + 0); | |
copy_untyped_vector_32<SRC_ALIGNMENT, DST_ALIGNMENT>(dst_ + 64, _src + 64); | |
}); | |
CASE(95, 80, { | |
copy_untyped_vector_64<SRC_ALIGNMENT, DST_ALIGNMENT>(dst_ + 0, _src + 0); | |
copy_untyped_vector_16<SRC_ALIGNMENT, DST_ALIGNMENT>(dst_ + 64, _src + 64); | |
}); | |
CASE(79, 64, { | |
copy_untyped_vector_64<SRC_ALIGNMENT, DST_ALIGNMENT>(dst_ + 0, _src + 0); | |
}); | |
#endif // defined(USE_SSE2) | |
// SSE2 handles cases 79 down to 64; when SSE2 is disabled we need the lone
// 64 case here.
#if !defined(USE_SSE2) | |
case 64: | |
copy_untyped_vector_64<SRC_ALIGNMENT, DST_ALIGNMENT>(dst_ + 0, _src + 0); | |
break; | |
#endif // !defined(USE_SSE2) | |
CASE(63, 48, { | |
copy_untyped_vector_32<SRC_ALIGNMENT, DST_ALIGNMENT>(dst_ + 0, _src + 0); | |
copy_untyped_vector_16<SRC_ALIGNMENT, DST_ALIGNMENT>(dst_ + 32, _src + 32); | |
}); | |
CASE(47, 32, { | |
copy_untyped_vector_32<SRC_ALIGNMENT, DST_ALIGNMENT>(dst_ + 0, _src + 0); | |
}); | |
CASE(31, 16, { | |
copy_untyped_vector_16<SRC_ALIGNMENT, DST_ALIGNMENT>(dst_ + 0, _src + 0); | |
}); | |
CASE(15, 0, {}); | |
} | |
} | |
RX_HINT_FORCE_INLINE static void copy_untyped_small_dispatch(Byte *RX_HINT_RESTRICT dst_, const Byte *RX_HINT_RESTRICT _src, Size _bytes) { | |
// Count the number of trailing zero bits to determine the alignment of each
// pointer: two raised to this count is the alignment. See the illustrative
// De Bruijn sketch after this function.
const auto dst_alignment = Algorithm::min(bit_search_lsb(reinterpret_cast<UintPtr>(dst_)), 5_z); | |
const auto src_alignment = Algorithm::min(bit_search_lsb(reinterpret_cast<UintPtr>(_src)), 5_z); | |
// TABLE[src_alignment][dst_alignment] | |
static constexpr void (*const TABLE[6][6])(Byte *RX_HINT_RESTRICT, const Byte *RX_HINT_RESTRICT, Size) = { | |
{ | |
&copy_untyped_small<1, 1>, &copy_untyped_small<1, 2>,
&copy_untyped_small<1, 4>, &copy_untyped_small<1, 8>,
&copy_untyped_small<1, 16>, &copy_untyped_small<1, 32>
},{
&copy_untyped_small<2, 1>, &copy_untyped_small<2, 2>,
&copy_untyped_small<2, 4>, &copy_untyped_small<2, 8>,
&copy_untyped_small<2, 16>, &copy_untyped_small<2, 32>
},{
&copy_untyped_small<4, 1>, &copy_untyped_small<4, 2>,
&copy_untyped_small<4, 4>, &copy_untyped_small<4, 8>,
&copy_untyped_small<4, 16>, &copy_untyped_small<4, 32>
},{
&copy_untyped_small<8, 1>, &copy_untyped_small<8, 2>,
&copy_untyped_small<8, 4>, &copy_untyped_small<8, 8>,
&copy_untyped_small<8, 16>, &copy_untyped_small<8, 32>
},{
&copy_untyped_small<16, 1>, &copy_untyped_small<16, 2>,
&copy_untyped_small<16, 4>, &copy_untyped_small<16, 8>,
&copy_untyped_small<16, 16>, &copy_untyped_small<16, 32>
},{
&copy_untyped_small<32, 1>, &copy_untyped_small<32, 2>,
&copy_untyped_small<32, 4>, &copy_untyped_small<32, 8>,
&copy_untyped_small<32, 16>, &copy_untyped_small<32, 32>
} | |
}; | |
TABLE[src_alignment][dst_alignment](dst_, _src, _bytes); | |
} | |
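// For reference, a minimal sketch of the De Bruijn trailing-zero count that
// the header comment describes. This sketch is illustrative only; the actual
// implementation used above is bit_search_lsb from rx/core/utility/bit.h and
// its details may differ.
[[maybe_unused]] static Uint32 ctz32_debruijn_sketch(Uint32 _value) {
static constexpr const Uint32 TABLE[32] = {
0, 1, 28, 2, 29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 4, 8,
31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6, 11, 5, 10, 9
};
// Isolate the lowest set bit, multiply by the De Bruijn constant, and use
// the top five bits of the product as a perfect-hash index into TABLE.
return TABLE[((_value & (0u - _value)) * 0x077CB531u) >> 27];
}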
#if defined(USE_SSE2) | |
// STREAMING implies DST_ALIGNMENT. Cannot use streaming store without aligned | |
// destination. | |
template<Size SRC_ALIGNMENT, Size DST_ALIGNMENT, bool STREAMING> | |
RX_HINT_NO_INLINE static void copy_untyped_large_sse2(Byte *RX_HINT_RESTRICT &dst_, const Byte *RX_HINT_RESTRICT &src_, Size &bytes_) { | |
auto src = reinterpret_cast<const Byte*>(src_); | |
auto dst = reinterpret_cast<Byte*>(dst_); | |
Size bytes = bytes_; | |
_mm_prefetch(reinterpret_cast<const char*>(src), _MM_HINT_NTA); | |
for (; bytes >= 128; bytes -= 128) { | |
__m128i m0, m1, m2, m3, m4, m5, m6, m7; | |
if constexpr (SRC_ALIGNMENT % 16 == 0) { | |
m0 = _mm_load_si128(reinterpret_cast<const __m128i*>(src) + 0); | |
m1 = _mm_load_si128(reinterpret_cast<const __m128i*>(src) + 1); | |
m2 = _mm_load_si128(reinterpret_cast<const __m128i*>(src) + 2); | |
m3 = _mm_load_si128(reinterpret_cast<const __m128i*>(src) + 3); | |
m4 = _mm_load_si128(reinterpret_cast<const __m128i*>(src) + 4); | |
m5 = _mm_load_si128(reinterpret_cast<const __m128i*>(src) + 5); | |
m6 = _mm_load_si128(reinterpret_cast<const __m128i*>(src) + 6); | |
m7 = _mm_load_si128(reinterpret_cast<const __m128i*>(src) + 7); | |
} else { | |
m0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src) + 0); | |
m1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src) + 1); | |
m2 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src) + 2); | |
m3 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src) + 3); | |
m4 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src) + 4); | |
m5 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src) + 5); | |
m6 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src) + 6); | |
m7 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src) + 7); | |
} | |
_mm_prefetch(reinterpret_cast<const char*>(src + 256), _MM_HINT_NTA); | |
src += 128; // This needs to be immediately after _mm_prefetch. | |
if constexpr (DST_ALIGNMENT % 16 == 0 && STREAMING) { | |
_mm_stream_si128(reinterpret_cast<__m128i*>(dst) + 0, m0); | |
_mm_stream_si128(reinterpret_cast<__m128i*>(dst) + 1, m1); | |
_mm_stream_si128(reinterpret_cast<__m128i*>(dst) + 2, m2); | |
_mm_stream_si128(reinterpret_cast<__m128i*>(dst) + 3, m3); | |
_mm_stream_si128(reinterpret_cast<__m128i*>(dst) + 4, m4); | |
_mm_stream_si128(reinterpret_cast<__m128i*>(dst) + 5, m5); | |
_mm_stream_si128(reinterpret_cast<__m128i*>(dst) + 6, m6); | |
_mm_stream_si128(reinterpret_cast<__m128i*>(dst) + 7, m7); | |
} else if constexpr (DST_ALIGNMENT % 16 == 0) { | |
_mm_store_si128(reinterpret_cast<__m128i*>(dst) + 0, m0); | |
_mm_store_si128(reinterpret_cast<__m128i*>(dst) + 1, m1); | |
_mm_store_si128(reinterpret_cast<__m128i*>(dst) + 2, m2); | |
_mm_store_si128(reinterpret_cast<__m128i*>(dst) + 3, m3); | |
_mm_store_si128(reinterpret_cast<__m128i*>(dst) + 4, m4); | |
_mm_store_si128(reinterpret_cast<__m128i*>(dst) + 5, m5); | |
_mm_store_si128(reinterpret_cast<__m128i*>(dst) + 6, m6); | |
_mm_store_si128(reinterpret_cast<__m128i*>(dst) + 7, m7); | |
} else { | |
_mm_storeu_si128(reinterpret_cast<__m128i*>(dst) + 0, m0); | |
_mm_storeu_si128(reinterpret_cast<__m128i*>(dst) + 1, m1); | |
_mm_storeu_si128(reinterpret_cast<__m128i*>(dst) + 2, m2); | |
_mm_storeu_si128(reinterpret_cast<__m128i*>(dst) + 3, m3); | |
_mm_storeu_si128(reinterpret_cast<__m128i*>(dst) + 4, m4); | |
_mm_storeu_si128(reinterpret_cast<__m128i*>(dst) + 5, m5); | |
_mm_storeu_si128(reinterpret_cast<__m128i*>(dst) + 6, m6); | |
_mm_storeu_si128(reinterpret_cast<__m128i*>(dst) + 7, m7); | |
} | |
dst += 128; | |
} | |
_mm_sfence(); | |
// Update out parameters. | |
src_ = src; | |
dst_ = dst; | |
bytes_ = bytes; | |
} | |
RX_HINT_FORCE_INLINE static void copy_untyped_large_sse2_dispatch(Byte *RX_HINT_RESTRICT &dst_, const Byte *RX_HINT_RESTRICT &src_, Size &bytes_) { | |
// The destination was aligned to a 16-byte boundary above, so its exponent
// from bit_search_lsb is at least 4; both exponents are clamped to 5 (32-byte
// alignment) to index the dispatch table.
const auto dst_alignment = Algorithm::min(bit_search_lsb(reinterpret_cast<UintPtr>(dst_)), 5_z); | |
const auto src_alignment = Algorithm::min(bit_search_lsb(reinterpret_cast<UintPtr>(src_)), 5_z); | |
static constexpr void (*const TABLE[6][2][6])(Byte *RX_HINT_RESTRICT&, const Byte *RX_HINT_RESTRICT&, Size&) = { | |
{ | |
{ | |
&copy_untyped_large_sse2<1, 1, false>, &copy_untyped_large_sse2<1, 2, false>,
&copy_untyped_large_sse2<1, 4, false>, &copy_untyped_large_sse2<1, 8, false>,
&copy_untyped_large_sse2<1, 16, false>, &copy_untyped_large_sse2<1, 32, false>
},{
&copy_untyped_large_sse2<1, 1, true>, &copy_untyped_large_sse2<1, 2, true>,
&copy_untyped_large_sse2<1, 4, true>, &copy_untyped_large_sse2<1, 8, true>,
&copy_untyped_large_sse2<1, 16, true>, &copy_untyped_large_sse2<1, 32, true>
}
},{
{
&copy_untyped_large_sse2<2, 1, false>, &copy_untyped_large_sse2<2, 2, false>,
&copy_untyped_large_sse2<2, 4, false>, &copy_untyped_large_sse2<2, 8, false>,
&copy_untyped_large_sse2<2, 16, false>, &copy_untyped_large_sse2<2, 32, false>
},{
&copy_untyped_large_sse2<2, 1, true>, &copy_untyped_large_sse2<2, 2, true>,
&copy_untyped_large_sse2<2, 4, true>, &copy_untyped_large_sse2<2, 8, true>,
&copy_untyped_large_sse2<2, 16, true>, &copy_untyped_large_sse2<2, 32, true>
}
},{
{
&copy_untyped_large_sse2<4, 1, false>, &copy_untyped_large_sse2<4, 2, false>,
&copy_untyped_large_sse2<4, 4, false>, &copy_untyped_large_sse2<4, 8, false>,
&copy_untyped_large_sse2<4, 16, false>, &copy_untyped_large_sse2<4, 32, false>
},{
&copy_untyped_large_sse2<4, 1, true>, &copy_untyped_large_sse2<4, 2, true>,
&copy_untyped_large_sse2<4, 4, true>, &copy_untyped_large_sse2<4, 8, true>,
&copy_untyped_large_sse2<4, 16, true>, &copy_untyped_large_sse2<4, 32, true>
}
},{
{
&copy_untyped_large_sse2<8, 1, false>, &copy_untyped_large_sse2<8, 2, false>,
&copy_untyped_large_sse2<8, 4, false>, &copy_untyped_large_sse2<8, 8, false>,
&copy_untyped_large_sse2<8, 16, false>, &copy_untyped_large_sse2<8, 32, false>
},{
&copy_untyped_large_sse2<8, 1, true>, &copy_untyped_large_sse2<8, 2, true>,
&copy_untyped_large_sse2<8, 4, true>, &copy_untyped_large_sse2<8, 8, true>,
&copy_untyped_large_sse2<8, 16, true>, &copy_untyped_large_sse2<8, 32, true>
}
},{
{
&copy_untyped_large_sse2<16, 1, false>, &copy_untyped_large_sse2<16, 2, false>,
&copy_untyped_large_sse2<16, 4, false>, &copy_untyped_large_sse2<16, 8, false>,
&copy_untyped_large_sse2<16, 16, false>, &copy_untyped_large_sse2<16, 32, false>
},{
&copy_untyped_large_sse2<16, 1, true>, &copy_untyped_large_sse2<16, 2, true>,
&copy_untyped_large_sse2<16, 4, true>, &copy_untyped_large_sse2<16, 8, true>,
&copy_untyped_large_sse2<16, 16, true>, &copy_untyped_large_sse2<16, 32, true>
}
},{
{
&copy_untyped_large_sse2<32, 1, false>, &copy_untyped_large_sse2<32, 2, false>,
&copy_untyped_large_sse2<32, 4, false>, &copy_untyped_large_sse2<32, 8, false>,
&copy_untyped_large_sse2<32, 16, false>, &copy_untyped_large_sse2<32, 32, false>
},{
&copy_untyped_large_sse2<32, 1, true>, &copy_untyped_large_sse2<32, 2, true>,
&copy_untyped_large_sse2<32, 4, true>, &copy_untyped_large_sse2<32, 8, true>,
&copy_untyped_large_sse2<32, 16, true>, &copy_untyped_large_sse2<32, 32, true>
} | |
} | |
}; | |
// Medium-sized copies up to the L2 cache size use regular stores to benefit
// from caching; larger copies switch to the streaming variants to avoid
// polluting the cache.
static constexpr const Size L2_CACHE_SIZE = 0x200000; // 2 MiB | |
TABLE[src_alignment][bytes_ > L2_CACHE_SIZE][dst_alignment](dst_, src_, bytes_); | |
} | |
#endif // defined(USE_SSE2) | |
#if defined(USE_AVX) | |
// STREAMING implies DST_ALIGNMENT. Cannot use streaming store without aligned | |
// destination. | |
template<Size SRC_ALIGNMENT, Size DST_ALIGNMENT, bool STREAMING> | |
RX_HINT_NO_INLINE static void copy_untyped_large_avx(Byte *RX_HINT_RESTRICT &dst_, const Byte *RX_HINT_RESTRICT &src_, Size &bytes_) { | |
auto src = reinterpret_cast<const Byte*>(src_); | |
auto dst = reinterpret_cast<Byte*>(dst_); | |
Size bytes = bytes_; | |
_mm_prefetch(reinterpret_cast<const char*>(src), _MM_HINT_NTA); | |
for (; bytes >= 256; bytes -= 256) { | |
__m256i m0, m1, m2, m3, m4, m5, m6, m7; | |
if constexpr (SRC_ALIGNMENT % 32 == 0) { | |
m0 = _mm256_load_si256(reinterpret_cast<const __m256i*>(src) + 0); | |
m1 = _mm256_load_si256(reinterpret_cast<const __m256i*>(src) + 1); | |
m2 = _mm256_load_si256(reinterpret_cast<const __m256i*>(src) + 2); | |
m3 = _mm256_load_si256(reinterpret_cast<const __m256i*>(src) + 3); | |
m4 = _mm256_load_si256(reinterpret_cast<const __m256i*>(src) + 4); | |
m5 = _mm256_load_si256(reinterpret_cast<const __m256i*>(src) + 5); | |
m6 = _mm256_load_si256(reinterpret_cast<const __m256i*>(src) + 6); | |
m7 = _mm256_load_si256(reinterpret_cast<const __m256i*>(src) + 7); | |
} else { | |
m0 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(src) + 0); | |
m1 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(src) + 1); | |
m2 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(src) + 2); | |
m3 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(src) + 3); | |
m4 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(src) + 4); | |
m5 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(src) + 5); | |
m6 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(src) + 6); | |
m7 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(src) + 7); | |
} | |
_mm_prefetch(reinterpret_cast<const char*>(src + 512), _MM_HINT_NTA); | |
src += 256; // This needs to be immediately after _mm_prefetch. | |
if constexpr (DST_ALIGNMENT % 32 == 0 && STREAMING) { | |
_mm256_stream_si256(reinterpret_cast<__m256i*>(dst) + 0, m0); | |
_mm256_stream_si256(reinterpret_cast<__m256i*>(dst) + 1, m1); | |
_mm256_stream_si256(reinterpret_cast<__m256i*>(dst) + 2, m2); | |
_mm256_stream_si256(reinterpret_cast<__m256i*>(dst) + 3, m3); | |
_mm256_stream_si256(reinterpret_cast<__m256i*>(dst) + 4, m4); | |
_mm256_stream_si256(reinterpret_cast<__m256i*>(dst) + 5, m5); | |
_mm256_stream_si256(reinterpret_cast<__m256i*>(dst) + 6, m6); | |
_mm256_stream_si256(reinterpret_cast<__m256i*>(dst) + 7, m7); | |
} else if constexpr (DST_ALIGNMENT % 32 == 0) { | |
_mm256_store_si256(reinterpret_cast<__m256i*>(dst) + 0, m0); | |
_mm256_store_si256(reinterpret_cast<__m256i*>(dst) + 1, m1); | |
_mm256_store_si256(reinterpret_cast<__m256i*>(dst) + 2, m2); | |
_mm256_store_si256(reinterpret_cast<__m256i*>(dst) + 3, m3); | |
_mm256_store_si256(reinterpret_cast<__m256i*>(dst) + 4, m4); | |
_mm256_store_si256(reinterpret_cast<__m256i*>(dst) + 5, m5); | |
_mm256_store_si256(reinterpret_cast<__m256i*>(dst) + 6, m6); | |
_mm256_store_si256(reinterpret_cast<__m256i*>(dst) + 7, m7); | |
} else { | |
_mm256_storeu_si256(reinterpret_cast<__m256i*>(dst) + 0, m0); | |
_mm256_storeu_si256(reinterpret_cast<__m256i*>(dst) + 1, m1); | |
_mm256_storeu_si256(reinterpret_cast<__m256i*>(dst) + 2, m2); | |
_mm256_storeu_si256(reinterpret_cast<__m256i*>(dst) + 3, m3); | |
_mm256_storeu_si256(reinterpret_cast<__m256i*>(dst) + 4, m4); | |
_mm256_storeu_si256(reinterpret_cast<__m256i*>(dst) + 5, m5); | |
_mm256_storeu_si256(reinterpret_cast<__m256i*>(dst) + 6, m6); | |
_mm256_storeu_si256(reinterpret_cast<__m256i*>(dst) + 7, m7); | |
} | |
dst += 256; | |
} | |
_mm_sfence(); | |
// Update out parameters. | |
src_ = src; | |
dst_ = dst; | |
bytes_ = bytes; | |
} | |
RX_HINT_FORCE_INLINE static void copy_untyped_large_avx_dispatch(Byte *RX_HINT_RESTRICT &dst_, const Byte *RX_HINT_RESTRICT &src_, Size &bytes_) { | |
// The destination was aligned to a 32-byte boundary above, so its exponent
// from bit_search_lsb is at least 5; both exponents are clamped to 5 (32-byte
// alignment) to index the dispatch table.
const auto dst_alignment = Algorithm::min(bit_search_lsb(reinterpret_cast<UintPtr>(dst_)), 5_z); | |
const auto src_alignment = Algorithm::min(bit_search_lsb(reinterpret_cast<UintPtr>(src_)), 5_z); | |
static constexpr void (*const TABLE[6][2][6])(Byte *RX_HINT_RESTRICT&, const Byte *RX_HINT_RESTRICT&, Size&) = { | |
{ | |
{ | |
&copy_untyped_large_avx<1, 1, false>, &copy_untyped_large_avx<1, 2, false>,
&copy_untyped_large_avx<1, 4, false>, &copy_untyped_large_avx<1, 8, false>,
&copy_untyped_large_avx<1, 16, false>, &copy_untyped_large_avx<1, 32, false>
},{
&copy_untyped_large_avx<1, 1, true>, &copy_untyped_large_avx<1, 2, true>,
&copy_untyped_large_avx<1, 4, true>, &copy_untyped_large_avx<1, 8, true>,
&copy_untyped_large_avx<1, 16, true>, &copy_untyped_large_avx<1, 32, true>
}
},{
{
&copy_untyped_large_avx<2, 1, false>, &copy_untyped_large_avx<2, 2, false>,
&copy_untyped_large_avx<2, 4, false>, &copy_untyped_large_avx<2, 8, false>,
&copy_untyped_large_avx<2, 16, false>, &copy_untyped_large_avx<2, 32, false>
},{
&copy_untyped_large_avx<2, 1, true>, &copy_untyped_large_avx<2, 2, true>,
&copy_untyped_large_avx<2, 4, true>, &copy_untyped_large_avx<2, 8, true>,
&copy_untyped_large_avx<2, 16, true>, &copy_untyped_large_avx<2, 32, true>
}
},{
{
&copy_untyped_large_avx<4, 1, false>, &copy_untyped_large_avx<4, 2, false>,
&copy_untyped_large_avx<4, 4, false>, &copy_untyped_large_avx<4, 8, false>,
&copy_untyped_large_avx<4, 16, false>, &copy_untyped_large_avx<4, 32, false>
},{
&copy_untyped_large_avx<4, 1, true>, &copy_untyped_large_avx<4, 2, true>,
&copy_untyped_large_avx<4, 4, true>, &copy_untyped_large_avx<4, 8, true>,
&copy_untyped_large_avx<4, 16, true>, &copy_untyped_large_avx<4, 32, true>
}
},{
{
&copy_untyped_large_avx<8, 1, false>, &copy_untyped_large_avx<8, 2, false>,
&copy_untyped_large_avx<8, 4, false>, &copy_untyped_large_avx<8, 8, false>,
&copy_untyped_large_avx<8, 16, false>, &copy_untyped_large_avx<8, 32, false>
},{
&copy_untyped_large_avx<8, 1, true>, &copy_untyped_large_avx<8, 2, true>,
&copy_untyped_large_avx<8, 4, true>, &copy_untyped_large_avx<8, 8, true>,
&copy_untyped_large_avx<8, 16, true>, &copy_untyped_large_avx<8, 32, true>
}
},{
{
&copy_untyped_large_avx<16, 1, false>, &copy_untyped_large_avx<16, 2, false>,
&copy_untyped_large_avx<16, 4, false>, &copy_untyped_large_avx<16, 8, false>,
&copy_untyped_large_avx<16, 16, false>, &copy_untyped_large_avx<16, 32, false>
},{
&copy_untyped_large_avx<16, 1, true>, &copy_untyped_large_avx<16, 2, true>,
&copy_untyped_large_avx<16, 4, true>, &copy_untyped_large_avx<16, 8, true>,
&copy_untyped_large_avx<16, 16, true>, &copy_untyped_large_avx<16, 32, true>
}
},{
{
&copy_untyped_large_avx<32, 1, false>, &copy_untyped_large_avx<32, 2, false>,
&copy_untyped_large_avx<32, 4, false>, &copy_untyped_large_avx<32, 8, false>,
&copy_untyped_large_avx<32, 16, false>, &copy_untyped_large_avx<32, 32, false>
},{
&copy_untyped_large_avx<32, 1, true>, &copy_untyped_large_avx<32, 2, true>,
&copy_untyped_large_avx<32, 4, true>, &copy_untyped_large_avx<32, 8, true>,
&copy_untyped_large_avx<32, 16, true>, &copy_untyped_large_avx<32, 32, true>
} | |
} | |
}; | |
// Medium-sized copies up to the L2 cache size use regular stores to benefit
// from caching; larger copies switch to the streaming variants to avoid
// polluting the cache.
static constexpr const Size L2_CACHE_SIZE = 0x200000; // 2 MiB | |
TABLE[src_alignment][bytes_ > L2_CACHE_SIZE][dst_alignment](dst_, src_, bytes_); | |
} | |
#endif // defined(USE_AVX) | |
void* copy_untyped(void *RX_HINT_RESTRICT dst_, const void *RX_HINT_RESTRICT _src, Size _bytes) { | |
auto dst = reinterpret_cast<Byte*>(dst_); | |
auto src = reinterpret_cast<const Byte*>(_src); | |
// Help check for undefined calls to memcpy. | |
RX_ASSERT(dst, "null destination"); | |
RX_ASSERT(src, "null source"); | |
if (_bytes <= SMALL_SIZE) { | |
copy_untyped_small_dispatch(dst, src, _bytes); | |
return dst_; | |
} | |
#if defined(USE_AVX) | |
// Copies larger than SMALL_SIZE bytes, process with AVX if available. | |
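// Align the destination to a 32-byte boundary. The single unaligned 32-byte
// copy below covers these padding bytes up front; this is safe since
// _bytes > SMALL_SIZE (256 for AVX) at this point.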
const auto padding = (32 - (reinterpret_cast<UintPtr>(dst) & 31)) & 31; | |
_mm256_storeu_si256(reinterpret_cast<__m256i*>(dst), _mm256_loadu_si256(reinterpret_cast<const __m256i*>(src))); | |
dst += padding; | |
src += padding; | |
_bytes -= padding; | |
copy_untyped_large_avx_dispatch(dst, src, _bytes); | |
copy_untyped_small_dispatch(dst, src, _bytes); | |
#elif defined(USE_SSE2) | |
// Copies larger than SMALL_SIZE bytes, process with SSE2 if available. | |
// Align destination to 16-byte boundary. | |
const auto padding = (16 - (reinterpret_cast<UintPtr>(dst) & 15)) & 15; | |
// Handle padding bytes first, if any. | |
_mm_storeu_si128(reinterpret_cast<__m128i*>(dst), _mm_loadu_si128(reinterpret_cast<const __m128i*>(src))); | |
dst += padding; | |
src += padding; | |
_bytes -= padding; | |
copy_untyped_large_sse2_dispatch(dst, src, _bytes); | |
copy_untyped_small_dispatch(dst, src, _bytes); | |
#else | |
#if defined(RX_BYTE_ORDER_LITTLE_ENDIAN) | |
#define LS(x, y) ((x) >> (y)) | |
#define RS(x, y) ((x) << (y)) | |
#else | |
#define LS(x, y) ((x) << (y)) | |
#define RS(x, y) ((x) >> (y)) | |
#endif | |
// Align to 4-byte boundary. 8-byte boundary was also tested but ended up being | |
// about 0.33x slower on average across AARCH64, AMD64, and WASM32. | |
for (; reinterpret_cast<UintPtr>(src) % 4 && _bytes; _bytes--) { | |
*dst++ = *src++; | |
} | |
// The source and destination can be up to 4-byte aligned, but it would also | |
// be helpful to use 8-byte load and store on 64-bit architectures that | |
// support unaligned scalar load and stores. This helps AARCH64 quite a bit. | |
// Do not do this for 32-bit architectures though.
#if defined(ALLOW_UNALIGNED_SCALAR_LOAD) && defined(ALLOW_UNALIGNED_SCALAR_STORE)
constexpr bool ALLOW_64_UNALIGNED = sizeof(void*) == 8;
#else
constexpr bool ALLOW_64_UNALIGNED = false;
#endif
// Destination is aligned on 4-byte boundary. | |
if (reinterpret_cast<UintPtr>(dst) % 4 == 0) { | |
for (; _bytes >= 16; src += 16, dst += 16, _bytes -= 16) { | |
if constexpr (ALLOW_64_UNALIGNED) { | |
storea64(dst + 0, loada64(src + 0)); | |
storea64(dst + 8, loada64(src + 8)); | |
} else { | |
storea32(dst + 0, loada32(src + 0)); | |
storea32(dst + 4, loada32(src + 4)); | |
storea32(dst + 8, loada32(src + 8)); | |
storea32(dst + 12, loada32(src + 12)); | |
} | |
} | |
if (_bytes & 8) { | |
if constexpr (ALLOW_64_UNALIGNED) { | |
storea64(dst + 0, loada64(src + 0)); | |
} else { | |
storea32(dst + 0, loada32(src + 0)); | |
storea32(dst + 4, loada32(src + 4)); | |
} | |
dst += 8; | |
src += 8; | |
} | |
if (_bytes & 4) { | |
storea32(dst + 0, loada32(src + 0)); | |
dst += 4; | |
src += 4; | |
} | |
if (_bytes & 2) { | |
storea16(dst + 0, loada16(src + 0)); | |
dst += 2; | |
src += 2; | |
} | |
if (_bytes & 1) { | |
*dst = *src; | |
} | |
return dst_; | |
} | |
// Handle destination misalignment. | |
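// The destination is misaligned by 1, 2, or 3 bytes here. Copy the leading
// bytes individually to align it, then combine adjacent aligned 32-bit source
// words with the endian-aware LS/RS shifts so every word store below is
// aligned.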
Uint32 w, x; | |
if (_bytes >= 32) switch (reinterpret_cast<UintPtr>(dst) % 4) { | |
case 1: | |
w = loadu32(src); | |
*dst++ = *src++; | |
*dst++ = *src++; | |
*dst++ = *src++; | |
_bytes -= 3; | |
for (; _bytes >= 17; src += 16, dst += 16, _bytes -= 16) { | |
x = loada32(src + 1); | |
storea32(dst + 0, LS(w, 24) | RS(x, 8)); | |
w = loada32(src + 5); | |
storea32(dst + 4, LS(x, 24) | RS(w, 8)); | |
x = loada32(src + 9); | |
storea32(dst + 8, LS(w, 24) | RS(x, 8)); | |
w = loada32(src + 13); | |
storea32(dst + 12, LS(x, 24) | RS(w, 8)); | |
} | |
break; | |
case 2: | |
w = loadu32(src); | |
*dst++ = *src++; | |
*dst++ = *src++; | |
_bytes -= 2; | |
for (; _bytes >= 18; src += 16, dst += 16, _bytes -= 16) { | |
x = loada32(src + 2); | |
storea32(dst + 0, LS(w, 16) | RS(x, 16)); | |
w = loada32(src + 6); | |
storea32(dst + 4, LS(x, 16) | RS(w, 16)); | |
x = loada32(src + 10); | |
storea32(dst + 8, LS(w, 16) | RS(x, 16)); | |
w = loada32(src + 14); | |
storea32(dst + 12, LS(x, 16) | RS(w, 16)); | |
} | |
break; | |
case 3: | |
w = loadu32(src); | |
*dst++ = *src++; | |
_bytes -= 1; | |
for (; _bytes >= 19; src += 16, dst += 16, _bytes -= 16) { | |
x = loada32(src + 3); | |
storea32(dst + 0, LS(w, 8) | RS(x, 24)); | |
w = loada32(src + 7); | |
storea32(dst + 4, LS(x, 8) | RS(w, 24)); | |
x = loada32(src + 11); | |
storea32(dst + 8, LS(w, 8) | RS(x, 24)); | |
w = loada32(src + 15); | |
storea32(dst + 12, LS(x, 8) | RS(w, 24)); | |
} | |
break; | |
} | |
// The source is no longer 4-byte aligned here, so use the unaligned load
// helpers for the word-sized tail loads.
if (_bytes & 16) {
if constexpr (ALLOW_64_UNALIGNED) {
storea64(dst + 0, loadu64(src + 0));
storea64(dst + 8, loadu64(src + 8));
} else {
storea32(dst + 0, loadu32(src + 0));
storea32(dst + 4, loadu32(src + 4));
storea32(dst + 8, loadu32(src + 8));
storea32(dst + 12, loadu32(src + 12));
}
src += 16;
dst += 16;
}
if (_bytes & 8) {
if constexpr (ALLOW_64_UNALIGNED) {
storea64(dst + 0, loadu64(src + 0));
} else {
storea32(dst + 0, loadu32(src + 0));
storea32(dst + 4, loadu32(src + 4));
}
src += 8;
dst += 8;
}
if (_bytes & 4) {
storea32(dst + 0, loadu32(src + 0));
src += 4;
dst += 4;
}
if (_bytes & 2) {
storea16(dst + 0, loadu16(src + 0));
src += 2;
dst += 2;
}
if (_bytes & 1) {
*dst = *src;
}
#endif | |
return dst_; | |
} | |
} // namespace Rx::Memory |
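// Illustrative usage of the public entry point above. The wrapper below is
// hypothetical and not part of the library; it only shows the calling
// convention: non-null, non-overlapping pointers and a byte count.
namespace Rx::Memory {
[[maybe_unused]] static void copy_untyped_usage_example(Byte* dst_, const Byte* _src, Size _bytes) {
// Behaves like memcpy: the regions must not overlap.
copy_untyped(dst_, _src, _bytes);
}
} // namespace Rx::Memory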