A fast memory copy with a slew of optimizations
// The following is an extremely optimized memcpy with several optimizations. | |
// | |
// * SSE2 for 16-byte copies, unrolled up to 128 bytes. Blocks larger than
// 128 bytes are processed 128 bytes at a time. Copies larger than the L2
// cache size use streaming stores to avoid polluting the cache. Prefetch
// hints are used for large copies, prefetching 256 bytes ahead.
// | |
// * AVX for 32-byte copies, unrolled up to 256 bytes. Blocks larger than
// 256 bytes are processed 256 bytes at a time. Copies larger than the L2
// cache size use streaming stores to avoid polluting the cache. Prefetch
// hints are used for large copies, prefetching 512 bytes ahead.
// | |
// * Scalar loads and stores of up to 4 bytes on 32-bit and 8 bytes on 64-bit
// systems are used for general non-vector copies and remainder bytes,
// unrolled to 16 bytes at a time on 64-bit systems and 8 bytes at a time on
// 32-bit systems.
// | |
// * Where possible, aligned loads and stores are used by both the scalar and
// vector routes for every possible copy size, leading to a combinatorial
// explosion of code size as everything is heavily specialized by both source
// and destination alignment, from as high as 256-byte alignment (for AVX)
// down to 2-byte alignment (Uint16 scalar).
// | |
// * Where possible, on systems that allow unaligned loads and stores of
// words, the unaligned routines here use them directly, avoiding the need to
// assemble words with shifts and masks. Shifts and masks are generally only
// faster on systems with dedicated barrel-shifter hardware, which these
// targets lack.
// | |
// * Inlining is forced aggressively to get serial sleds of move instructions,
// avoiding jump instructions as much as possible, especially within the
// massive dispatch switch tables. Similarly, the large kernel functions which
// exhaust all vector registers are explicitly marked to prevent inlining so
// that call instructions are always generated instead, as inlining them leads
// to a lot of register spilling, ruining the kernel's performance.
// | |
// * Tables for function dispatch are used instead of a bunch of conditional
// branches, both to make use of conditional move instructions on systems that
// support them and because it keeps a lot of cold code out of the dispatch.
// | |
// * An unaligned destination in the scalar code is handled by performing a
// very small unaligned byte copy until alignment is reached, tracking the
// unaligned byte(s) in an appropriately-sized word for the word copies and
// interleaving shifts and ors for those bytes. This optimization turns out to
// pipeline well on scalar out-of-order processors, but applying the same
// optimization to vector code is worse as it involves a lot of pack and
// unpack instructions which penalize vector pipelining, so an unaligned
// destination is handled with unaligned vector instructions instead, with the
// 15-byte remainder done with scalar copies.
// | |
// * The alignment of the source and destination pointers is calculated by
// counting the number of trailing zero bits in the pointer; this count is
// log2(alignment) and is used to index the alignment dispatch tables.
// Computing it takes a single native bit-counting instruction where
// available; on systems that lack one it becomes a very fast constant-time
// lookup within a De Bruijn sequence table with a constant multiplier, rather
// than using conditional branches or moves.
// | |
// * On modern x86 the use of "rep movsb" is faster for small to medium-sized
// copies. For large copies it tends to trash the L2 cache and is slower than
// SSE2 or AVX with streaming stores. The unrolled scalar 16-byte => ... =>
// 2-byte copy optimizes to "rep movsb" in GCC and Clang at all optimization
// levels (including debug) when conditionally guarded for N <= 32, which it
// is, so the optimization is not explicitly done here.
// | |
// * A lot of strict aliasing is violated here to allow for word copies. This
// strictly invokes undefined behavior, however with the use of compiler
// intrinsics it's possible to make it implementation-defined behavior. This
// is done aggressively and carefully to avoid strict-aliasing optimizations
// from miscompiling this code. Look for RX_HINT_MAY_ALIAS.
// | |
// * Since copies cannot overlap (that's what "memmove" is for), restrict | |
// pointers are used aggressively as well to avoid unnecessary generation of | |
// additional load instructions in the scalar code. | |
#include "rx/core/assert.h" // RX_ASSERT | |
#include "rx/core/config.h" // RX_{ARCHITECTURE,COMPILLER}_* | |
#include "rx/core/hints/force_inline.h" // RX_HINT_FORCE_INLINE | |
#include "rx/core/hints/no_inline.h" // RX_HINT_NO_INLINE | |
#include "rx/core/hints/may_alias.h" // RX_HINT_MAY_ALIAS | |
#include "rx/core/hints/restrict.h" // RX_HINT_RESTRICT | |
#include "rx/core/utility/bit.h" // bit_search_lsb | |
#include "rx/core/algorithm/min.h" // Algorithm::min | |
// We do not support x86 without SSE2 support. | |
#if defined(RX_ARCHITECTURE_AMD64) || defined(RX_ARCHITECTURE_X86) | |
#define USE_SSE2 | |
// #define USE_AVX | |
#define ALLOW_UNALIGNED_SCALAR_LOAD | |
#define ALLOW_UNALIGNED_SCALAR_STORE | |
#elif defined(RX_ARCHITECTURE_AARCH64) | |
#define ALLOW_UNALIGNED_SCALAR_LOAD | |
#define ALLOW_UNALIGNED_SCALAR_STORE | |
#elif defined(RX_ARCHITECTURE_WASM32) | |
// Emscripten supports SSE2 on WASM32, but WASM32 is extremely slow when you
// use unaligned scalar loads and stores, since memory is implemented in
// browsers with a typed array and several typed-array views, which causes
// tier-up to generate a ton of shifts and masking, so try to avoid that.
#define USE_SSE2 | |
#endif | |
#if defined(USE_SSE2) | |
#include <emmintrin.h> | |
#endif // defined(USE_SSE2) | |
#if defined(USE_AVX) | |
#include <immintrin.h> | |
#endif // defined(USE_AVX) | |
namespace Rx::Memory { | |
// The unrolled small copy size to use depending on what features are selected. | |
#if defined(USE_AVX) | |
static inline constexpr const Size SMALL_SIZE = 256; | |
#elif defined(USE_SSE2) | |
static inline constexpr const Size SMALL_SIZE = 128; | |
#else | |
static inline constexpr const Size SMALL_SIZE = 64; | |
#endif | |
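// Helper functions to load aligned quantities.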
RX_HINT_FORCE_INLINE static Uint16 loada16(const Byte *RX_HINT_RESTRICT _src) { | |
typedef Uint16 RX_HINT_MAY_ALIAS Word; | |
return *reinterpret_cast<const Word*>(_src); | |
} | |
RX_HINT_FORCE_INLINE static Uint32 loada32(const Byte* RX_HINT_RESTRICT _src) { | |
typedef Uint32 RX_HINT_MAY_ALIAS Word; | |
return *reinterpret_cast<const Word*>(_src); | |
} | |
RX_HINT_FORCE_INLINE static Uint64 loada64(const Byte* RX_HINT_RESTRICT _src) { | |
typedef Uint64 RX_HINT_MAY_ALIAS Word; | |
return *reinterpret_cast<const Word*>(_src); | |
} | |
// Helper functions to load unaligned quantities.
RX_HINT_FORCE_INLINE static Uint16 loadu16(const Byte *RX_HINT_RESTRICT _src) { | |
#if defined(ALLOW_UNALIGNED_SCALAR_LOAD) | |
return loada16(_src); | |
#else | |
// Assemble the value through its in-memory representation so the result
// matches an aligned load of the same bytes regardless of endianness.
Uint16 word;
const auto bytes = reinterpret_cast<Byte*>(&word);
bytes[0] = _src[0];
bytes[1] = _src[1];
return word;
#endif // defined(ALLOW_UNALIGNED_SCALAR_LOAD) | |
} | |
RX_HINT_FORCE_INLINE static Uint32 loadu32(const Byte* RX_HINT_RESTRICT _src) { | |
#if defined(ALLOW_UNALIGNED_SCALAR_LOAD) | |
return loada32(_src); | |
#else | |
// Assemble the value through its in-memory representation so the result
// matches an aligned load of the same bytes regardless of endianness.
Uint32 word;
const auto bytes = reinterpret_cast<Byte*>(&word);
bytes[0] = _src[0];
bytes[1] = _src[1];
bytes[2] = _src[2];
bytes[3] = _src[3];
return word;
#endif // defined(ALLOW_UNALIGNED_SCALAR_LOAD) | |
} | |
RX_HINT_FORCE_INLINE static Uint64 loadu64(const Byte* RX_HINT_RESTRICT _src) { | |
#if defined(ALLOW_UNALIGNED_SCALAR_LOAD) | |
return loada64(_src); | |
#else | |
// Assemble the value through its in-memory representation so the result
// matches an aligned load of the same bytes regardless of endianness.
Uint64 word;
const auto bytes = reinterpret_cast<Byte*>(&word);
bytes[0] = _src[0];
bytes[1] = _src[1];
bytes[2] = _src[2];
bytes[3] = _src[3];
bytes[4] = _src[4];
bytes[5] = _src[5];
bytes[6] = _src[6];
bytes[7] = _src[7];
return word;
#endif // defined(ALLOW_UNALIGNED_SCALAR_LOAD) | |
} | |
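// Helper functions to store aligned quantities.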
RX_HINT_FORCE_INLINE static void storea16(Byte *RX_HINT_RESTRICT dst_, Uint16 _src) { | |
typedef Uint16 RX_HINT_MAY_ALIAS Word; | |
*reinterpret_cast<Word*>(dst_) = _src; | |
} | |
RX_HINT_FORCE_INLINE static void storea32(Byte *RX_HINT_RESTRICT dst_, Uint32 _src) { | |
typedef Uint32 RX_HINT_MAY_ALIAS Word; | |
*reinterpret_cast<Word*>(dst_) = _src; | |
} | |
RX_HINT_FORCE_INLINE static void storea64(Byte *RX_HINT_RESTRICT dst_, Uint64 _src) { | |
typedef Uint64 RX_HINT_MAY_ALIAS Word; | |
*reinterpret_cast<Word*>(dst_) = _src; | |
} | |
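// Helper functions to store unaligned quantities.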
RX_HINT_FORCE_INLINE static void storeu16(Byte *RX_HINT_RESTRICT dst_, Uint16 _src) { | |
#if defined(ALLOW_UNALIGNED_SCALAR_STORE) | |
storea16(dst_, _src); | |
#else | |
const auto src = reinterpret_cast<const Byte*>(&_src); | |
dst_[0] = src[0]; | |
dst_[1] = src[1];
#endif // defined(ALLOW_UNALIGNED_SCALAR_STORE) | |
} | |
RX_HINT_FORCE_INLINE static void storeu32(Byte *RX_HINT_RESTRICT dst_, Uint32 _src) { | |
#if defined(ALLOW_UNALIGNED_SCALAR_STORE) | |
storea32(dst_, _src); | |
#else | |
const auto src = reinterpret_cast<const Byte*>(&_src); | |
dst_[0] = src[0]; | |
dst_[1] = src[1]; | |
dst_[2] = src[2]; | |
dst_[3] = src[3]; | |
#endif // defined(ALLOW_UNALIGNED_SCALAR_STORE) | |
} | |
RX_HINT_FORCE_INLINE static void storeu64(Byte *RX_HINT_RESTRICT dst_, Uint64 _src) { | |
#if defined(ALLOW_UNALIGNED_SCALAR_STORE) | |
storea64(dst_, _src); | |
#else | |
const auto src = reinterpret_cast<const Byte*>(&_src); | |
dst_[0] = src[0]; | |
dst_[1] = src[1]; | |
dst_[2] = src[2]; | |
dst_[3] = src[3]; | |
dst_[4] = src[4]; | |
dst_[5] = src[5]; | |
dst_[6] = src[6]; | |
dst_[7] = src[7]; | |
#endif // defined(ALLOW_UNALIGNED_SCALAR_STORE) | |
} | |
template<Size SRC_ALIGNMENT, Size DST_ALIGNMENT, Size E> | |
RX_HINT_FORCE_INLINE static void copy_untyped_scalar(Byte *RX_HINT_RESTRICT dst_, const Byte *RX_HINT_RESTRICT _src) { | |
static_assert(E <= 8, "too large"); | |
switch (E) { | |
case 8: // 8 | |
// On 32-bit systems the use of 8-byte load and store tends to be slower, | |
// use 32-bit load store here. This optimization cannot be done inside | |
// {store,load}{a,u}64 since those operate with words and would involve | |
// shifts and masking which would produce worse code. | |
// | |
// This is a significant performance improvement for WASM32. | |
if constexpr (sizeof(void*) == 8) { | |
if constexpr (SRC_ALIGNMENT % 8 == 0) { | |
if constexpr (DST_ALIGNMENT % 8 == 0) { | |
storea64(dst_ + 0, loada64(_src + 0)); | |
} else { | |
storeu64(dst_ + 0, loada64(_src + 0)); | |
} | |
} else { | |
if constexpr (DST_ALIGNMENT % 8 == 0) { | |
storea64(dst_ + 0, loadu64(_src + 0)); | |
} else { | |
storeu64(dst_ + 0, loadu64(_src + 0)); | |
} | |
} | |
} else { | |
if constexpr (SRC_ALIGNMENT % 4 == 0) { | |
if constexpr (DST_ALIGNMENT % 4 == 0) { | |
storea32(dst_ + 0, loada32(_src + 0)); | |
storea32(dst_ + 4, loada32(_src + 4)); | |
} else { | |
storeu32(dst_ + 0, loada32(_src + 0)); | |
storeu32(dst_ + 4, loada32(_src + 4)); | |
} | |
} else { | |
if constexpr (DST_ALIGNMENT % 4 == 0) { | |
storea32(dst_ + 0, loadu32(_src + 0)); | |
storea32(dst_ + 4, loadu32(_src + 4)); | |
} else { | |
storeu32(dst_ + 0, loadu32(_src + 0)); | |
storeu32(dst_ + 4, loadu32(_src + 4)); | |
} | |
} | |
} | |
break; | |
case 7: // 4 + 2 + 1 | |
dst_[6] = _src[6]; | |
[[fallthrough]]; | |
case 6: // 4 + 2 | |
if constexpr (SRC_ALIGNMENT % 4 == 0) { | |
if constexpr (DST_ALIGNMENT % 4 == 0) { | |
storea32(dst_ + 0, loada32(_src + 0)); | |
storea16(dst_ + 4, loada16(_src + 4)); | |
} else { | |
storeu32(dst_ + 0, loada32(_src + 0)); | |
storeu16(dst_ + 4, loada16(_src + 4)); | |
} | |
} else { | |
if constexpr (DST_ALIGNMENT % 4 == 0) {
storea32(dst_ + 0, loadu32(_src + 0)); | |
storea16(dst_ + 4, loadu16(_src + 4)); | |
} else { | |
storeu32(dst_ + 0, loadu32(_src + 0)); | |
storeu16(dst_ + 4, loadu16(_src + 4)); | |
} | |
} | |
break; | |
case 5: // 4 + 1 | |
dst_[4] = _src[4]; | |
[[fallthrough]]; | |
case 4: // 4 | |
if constexpr (SRC_ALIGNMENT % 4 == 0) { | |
if constexpr (DST_ALIGNMENT % 4 == 0) { | |
storea32(dst_ + 0, loada32(_src + 0)); | |
} else { | |
storeu32(dst_ + 0, loada32(_src + 0)); | |
} | |
} else { | |
if constexpr (DST_ALIGNMENT % 4 == 0) { | |
storea32(dst_ + 0, loadu32(_src + 0)); | |
} else { | |
storeu32(dst_ + 0, loadu32(_src + 0)); | |
} | |
} | |
break; | |
case 3: // 2 + 1 | |
dst_[2] = _src[2]; | |
[[fallthrough]]; | |
case 2: // 2 | |
if constexpr (SRC_ALIGNMENT % 2 == 0) { | |
if constexpr (DST_ALIGNMENT % 2 == 0) { | |
storea16(dst_ + 0, loada16(_src + 0)); | |
} else { | |
storeu16(dst_ + 0, loada16(_src + 0)); | |
} | |
} else { | |
if constexpr (DST_ALIGNMENT % 2 == 0) { | |
storea16(dst_ + 0, loadu16(_src + 0)); | |
} else { | |
storeu16(dst_ + 0, loadu16(_src + 0)); | |
} | |
} | |
break; | |
case 1: | |
dst_[0] = _src[0]; | |
break; | |
} | |
} | |
template<Size SRC_ALIGNMENT, Size DST_ALIGNMENT, Size E> | |
RX_HINT_FORCE_INLINE static void copy_untyped_scalar_small(Byte *RX_HINT_RESTRICT dst_, const Byte *RX_HINT_RESTRICT _src) { | |
if constexpr (E <= 8) { | |
return copy_untyped_scalar<SRC_ALIGNMENT, DST_ALIGNMENT, E>(dst_, _src); | |
} else switch (E) { | |
// Handle copies from size [9, 16] bytes with this. | |
case 16: // 8 + 8 | |
copy_untyped_scalar<SRC_ALIGNMENT, DST_ALIGNMENT, 8>(dst_ + 0, _src + 0); | |
copy_untyped_scalar<SRC_ALIGNMENT, DST_ALIGNMENT, 8>(dst_ + 8, _src + 8); | |
break; | |
case 15: // 8 + 4 + 2 + 1 | |
dst_[14] = _src[14]; | |
[[fallthrough]]; | |
case 14: // 8 + 4 + 2 | |
copy_untyped_scalar<SRC_ALIGNMENT, DST_ALIGNMENT, 8>(dst_ + 0, _src + 0); | |
copy_untyped_scalar<SRC_ALIGNMENT, DST_ALIGNMENT, 4>(dst_ + 8, _src + 8); | |
copy_untyped_scalar<SRC_ALIGNMENT, DST_ALIGNMENT, 2>(dst_ + 12, _src + 12); | |
break; | |
case 13: // 8 + 4 + 1 | |
dst_[12] = _src[12]; | |
[[fallthrough]]; | |
case 12: // 8 + 4 | |
copy_untyped_scalar<SRC_ALIGNMENT, DST_ALIGNMENT, 8>(dst_ + 0, _src + 0); | |
copy_untyped_scalar<SRC_ALIGNMENT, DST_ALIGNMENT, 4>(dst_ + 8, _src + 8); | |
break; | |
case 11: // 8 + 2 + 1 | |
dst_[10] = _src[10]; | |
[[fallthrough]]; | |
case 10: // 8 + 2 | |
copy_untyped_scalar<SRC_ALIGNMENT, DST_ALIGNMENT, 8>(dst_ + 0, _src + 0); | |
copy_untyped_scalar<SRC_ALIGNMENT, DST_ALIGNMENT, 2>(dst_ + 8, _src + 8); | |
break; | |
case 9: | |
copy_untyped_scalar<SRC_ALIGNMENT, DST_ALIGNMENT, 8>(dst_ + 0, _src + 0); | |
dst_[8] = _src[8]; | |
break; | |
} | |
} | |
template<Size SRC_ALIGNMENT, Size DST_ALIGNMENT> | |
RX_HINT_FORCE_INLINE static void copy_untyped_vector_16(Byte *RX_HINT_RESTRICT dst_, const Byte *RX_HINT_RESTRICT _src) { | |
#if defined(USE_SSE2) | |
if constexpr (SRC_ALIGNMENT % 16 == 0) { | |
if constexpr (DST_ALIGNMENT % 16 == 0) { | |
_mm_store_si128(reinterpret_cast<__m128i*>(dst_), _mm_load_si128(reinterpret_cast<const __m128i*>(_src))); | |
} else { | |
_mm_storeu_si128(reinterpret_cast<__m128i*>(dst_), _mm_load_si128(reinterpret_cast<const __m128i*>(_src))); | |
} | |
} else { | |
if constexpr (DST_ALIGNMENT % 16 == 0) { | |
_mm_store_si128(reinterpret_cast<__m128i*>(dst_), _mm_loadu_si128(reinterpret_cast<const __m128i*>(_src))); | |
} else { | |
_mm_storeu_si128(reinterpret_cast<__m128i*>(dst_), _mm_loadu_si128(reinterpret_cast<const __m128i*>(_src))); | |
} | |
} | |
#else | |
copy_untyped_scalar_small<SRC_ALIGNMENT, DST_ALIGNMENT, 16>(dst_, _src); | |
#endif | |
} | |
// Do not inline these. | |
template<Size SRC_ALIGNMENT, Size DST_ALIGNMENT> | |
RX_HINT_FORCE_INLINE static void copy_untyped_vector_32(Byte *RX_HINT_RESTRICT dst_, const Byte *RX_HINT_RESTRICT _src) { | |
#if defined(USE_AVX) | |
if constexpr (SRC_ALIGNMENT % 32 == 0) { | |
if constexpr (DST_ALIGNMENT % 32 == 0) { | |
_mm256_store_si256(reinterpret_cast<__m256i*>(dst_), _mm256_load_si256(reinterpret_cast<const __m256i*>(_src))); | |
} else { | |
_mm256_storeu_si256(reinterpret_cast<__m256i*>(dst_), _mm256_load_si256(reinterpret_cast<const __m256i*>(_src))); | |
} | |
} else { | |
if constexpr (DST_ALIGNMENT % 32 == 0) { | |
_mm256_store_si256(reinterpret_cast<__m256i*>(dst_), _mm256_loadu_si256(reinterpret_cast<const __m256i*>(_src))); | |
} else { | |
_mm256_storeu_si256(reinterpret_cast<__m256i*>(dst_), _mm256_loadu_si256(reinterpret_cast<const __m256i*>(_src))); | |
} | |
} | |
#else | |
copy_untyped_vector_16<SRC_ALIGNMENT, DST_ALIGNMENT>(dst_ + 0, _src + 0); | |
copy_untyped_vector_16<SRC_ALIGNMENT, DST_ALIGNMENT>(dst_ + 16, _src + 16); | |
#endif | |
} | |
template<Size SRC_ALIGNMENT, Size DST_ALIGNMENT> | |
RX_HINT_FORCE_INLINE static void copy_untyped_vector_64(Byte *RX_HINT_RESTRICT dst_, const Byte *RX_HINT_RESTRICT _src) { | |
copy_untyped_vector_32<SRC_ALIGNMENT, DST_ALIGNMENT>(dst_ + 0, _src + 0); | |
copy_untyped_vector_32<SRC_ALIGNMENT, DST_ALIGNMENT>(dst_ + 32, _src + 32); | |
} | |
template<Size SRC_ALIGNMENT, Size DST_ALIGNMENT> | |
RX_HINT_FORCE_INLINE static void copy_untyped_vector_128(Byte *RX_HINT_RESTRICT dst_, const Byte *RX_HINT_RESTRICT _src) { | |
copy_untyped_vector_64<SRC_ALIGNMENT, DST_ALIGNMENT>(dst_ + 0, _src + 0); | |
copy_untyped_vector_64<SRC_ALIGNMENT, DST_ALIGNMENT>(dst_ + 64, _src + 64); | |
} | |
template<Size SRC_ALIGNMENT, Size DST_ALIGNMENT> | |
RX_HINT_FORCE_INLINE static void copy_untyped_vector_256(Byte *RX_HINT_RESTRICT dst_, const Byte *RX_HINT_RESTRICT _src) { | |
copy_untyped_vector_128<SRC_ALIGNMENT, DST_ALIGNMENT>(dst_ + 0, _src + 0); | |
copy_untyped_vector_128<SRC_ALIGNMENT, DST_ALIGNMENT>(dst_ + 128, _src + 128); | |
} | |
// To reduce code size depending on SIMD selection, only emit a dispatch here for | |
// sizes up to 64 in SCALAR mode, 128 in SSE2 mode, and 256 in AVX mode. | |
template<Size SRC_ALIGNMENT, Size DST_ALIGNMENT> | |
RX_HINT_FORCE_INLINE static void copy_untyped_small(Byte *RX_HINT_RESTRICT dst_, const Byte *RX_HINT_RESTRICT _src, Size _bytes) { | |
// Handle up to 15 tail bytes with scalar small copies up to the next multiple
// suitable for vectorization. Each CASE(index, offset, ...) below covers the
// sizes [offset, index]: the shared vector body copies the first `offset`
// bytes and the scalar tail handles the remaining 0-15 bytes. The vector body
// is done first since the order tends to matter for better optimization.
#define CASE(index, offset, ...) \ | |
case index - 0: __VA_ARGS__ copy_untyped_scalar_small<SRC_ALIGNMENT, DST_ALIGNMENT, 15>(dst_ + offset, _src + offset); break; \ | |
case index - 1: __VA_ARGS__ copy_untyped_scalar_small<SRC_ALIGNMENT, DST_ALIGNMENT, 14>(dst_ + offset, _src + offset); break; \ | |
case index - 2: __VA_ARGS__ copy_untyped_scalar_small<SRC_ALIGNMENT, DST_ALIGNMENT, 13>(dst_ + offset, _src + offset); break; \ | |
case index - 3: __VA_ARGS__ copy_untyped_scalar_small<SRC_ALIGNMENT, DST_ALIGNMENT, 12>(dst_ + offset, _src + offset); break; \ | |
case index - 4: __VA_ARGS__ copy_untyped_scalar_small<SRC_ALIGNMENT, DST_ALIGNMENT, 11>(dst_ + offset, _src + offset); break; \ | |
case index - 5: __VA_ARGS__ copy_untyped_scalar_small<SRC_ALIGNMENT, DST_ALIGNMENT, 10>(dst_ + offset, _src + offset); break; \ | |
case index - 6: __VA_ARGS__ copy_untyped_scalar_small<SRC_ALIGNMENT, DST_ALIGNMENT, 9>(dst_ + offset, _src + offset); break; \ | |
case index - 7: __VA_ARGS__ copy_untyped_scalar_small<SRC_ALIGNMENT, DST_ALIGNMENT, 8>(dst_ + offset, _src + offset); break; \ | |
case index - 8: __VA_ARGS__ copy_untyped_scalar_small<SRC_ALIGNMENT, DST_ALIGNMENT, 7>(dst_ + offset, _src + offset); break; \ | |
case index - 9: __VA_ARGS__ copy_untyped_scalar_small<SRC_ALIGNMENT, DST_ALIGNMENT, 6>(dst_ + offset, _src + offset); break; \ | |
case index - 10: __VA_ARGS__ copy_untyped_scalar_small<SRC_ALIGNMENT, DST_ALIGNMENT, 5>(dst_ + offset, _src + offset); break; \ | |
case index - 11: __VA_ARGS__ copy_untyped_scalar_small<SRC_ALIGNMENT, DST_ALIGNMENT, 4>(dst_ + offset, _src + offset); break; \ | |
case index - 12: __VA_ARGS__ copy_untyped_scalar_small<SRC_ALIGNMENT, DST_ALIGNMENT, 3>(dst_ + offset, _src + offset); break; \ | |
case index - 13: __VA_ARGS__ copy_untyped_scalar_small<SRC_ALIGNMENT, DST_ALIGNMENT, 2>(dst_ + offset, _src + offset); break; \ | |
case index - 14: __VA_ARGS__ copy_untyped_scalar_small<SRC_ALIGNMENT, DST_ALIGNMENT, 1>(dst_ + offset, _src + offset); break; \ | |
case offset: __VA_ARGS__ break | |
switch (_bytes) { | |
#if defined(USE_AVX) | |
case 256: | |
copy_untyped_vector_256<SRC_ALIGNMENT, DST_ALIGNMENT>(dst_ + 0, _src + 0); | |
break; | |
CASE(255, 240, { | |
// 128 + 64 + 32 + 16 | |
copy_untyped_vector_128<SRC_ALIGNMENT, DST_ALIGNMENT>(dst_ + 0, _src + 0); | |
copy_untyped_vector_64<SRC_ALIGNMENT, DST_ALIGNMENT>(dst_ + 128, _src + 128); | |
copy_untyped_vector_32<SRC_ALIGNMENT, DST_ALIGNMENT>(dst_ + 192, _src + 192); | |
copy_untyped_vector_16<SRC_ALIGNMENT, DST_ALIGNMENT>(dst_ + 224, _src + 224); | |
}); | |
CASE(239, 224, { | |
// 128 + 64 + 32 | |
copy_untyped_vector_128<SRC_ALIGNMENT, DST_ALIGNMENT>(dst_ + 0, _src + 0); | |
copy_untyped_vector_64<SRC_ALIGNMENT, DST_ALIGNMENT>(dst_ + 128, _src + 128); | |
copy_untyped_vector_32<SRC_ALIGNMENT, DST_ALIGNMENT>(dst_ + 192, _src + 192); | |
}); | |
CASE(223, 208, { | |
// 128 + 64 + 16 | |
copy_untyped_vector_128<SRC_ALIGNMENT, DST_ALIGNMENT>(dst_ + 0, _src + 0); | |
copy_untyped_vector_64<SRC_ALIGNMENT, DST_ALIGNMENT>(dst_ + 128, _src + 128); | |
copy_untyped_vector_16<SRC_ALIGNMENT, DST_ALIGNMENT>(dst_ + 192, _src + 192); | |
}); | |
CASE(207, 192, { | |
// 128 + 64 | |
copy_untyped_vector_128<SRC_ALIGNMENT, DST_ALIGNMENT>(dst_ + 0, _src + 0); | |
copy_untyped_vector_64<SRC_ALIGNMENT, DST_ALIGNMENT>(dst_ + 128, _src + 128); | |
}); | |
CASE(191, 176, { | |
// 128 + 32 + 16 | |
copy_untyped_vector_128<SRC_ALIGNMENT, DST_ALIGNMENT>(dst_ + 0, _src + 0); | |
copy_untyped_vector_32<SRC_ALIGNMENT, DST_ALIGNMENT>(dst_ + 128, _src + 128); | |
copy_untyped_vector_16<SRC_ALIGNMENT, DST_ALIGNMENT>(dst_ + 160, _src + 160); | |
}); | |
CASE(175, 160, { | |
// 128 + 32 | |
copy_untyped_vector_128<SRC_ALIGNMENT, DST_ALIGNMENT>(dst_ + 0, _src + 0); | |
copy_untyped_vector_32<SRC_ALIGNMENT, DST_ALIGNMENT>(dst_ + 128, _src + 128); | |
}); | |
CASE(159, 144, { | |
// 128 + 16 | |
copy_untyped_vector_128<SRC_ALIGNMENT, DST_ALIGNMENT>(dst_ + 0, _src + 0); | |
copy_untyped_vector_16<SRC_ALIGNMENT, DST_ALIGNMENT>(dst_ + 128, _src + 128); | |
}); | |
CASE(143, 128, { | |
copy_untyped_vector_128<SRC_ALIGNMENT, DST_ALIGNMENT>(dst_ + 0, _src + 0); | |
}); | |
#endif // defined(USE_AVX) | |
#if defined(USE_SSE2) | |
// AVX handles cases 143 down to 128; when AVX is disabled we need the lone
// 128 case here.
#if !defined(USE_AVX) | |
case 128: | |
copy_untyped_vector_128<SRC_ALIGNMENT, DST_ALIGNMENT>(dst_ + 0, _src + 0); | |
break; | |
#endif // !defined(USE_AVX) | |
CASE(127, 112, { | |
copy_untyped_vector_64<SRC_ALIGNMENT, DST_ALIGNMENT>(dst_ + 0, _src + 0); | |
copy_untyped_vector_32<SRC_ALIGNMENT, DST_ALIGNMENT>(dst_ + 64, _src + 64); | |
copy_untyped_vector_16<SRC_ALIGNMENT, DST_ALIGNMENT>(dst_ + 96, _src + 96); | |
}); | |
CASE(111, 96, { | |
copy_untyped_vector_64<SRC_ALIGNMENT, DST_ALIGNMENT>(dst_ + 0, _src + 0); | |
copy_untyped_vector_32<SRC_ALIGNMENT, DST_ALIGNMENT>(dst_ + 64, _src + 64); | |
}); | |
CASE(95, 80, { | |
copy_untyped_vector_64<SRC_ALIGNMENT, DST_ALIGNMENT>(dst_ + 0, _src + 0); | |
copy_untyped_vector_16<SRC_ALIGNMENT, DST_ALIGNMENT>(dst_ + 64, _src + 64); | |
}); | |
CASE(79, 64, { | |
copy_untyped_vector_64<SRC_ALIGNMENT, DST_ALIGNMENT>(dst_ + 0, _src + 0); | |
}); | |
#endif // defined(USE_SSE2) | |
// SSE2 handles cases 79 down to 64; when SSE2 is disabled we need the lone
// 64 case here.
#if !defined(USE_SSE2) | |
case 64: | |
copy_untyped_vector_64<SRC_ALIGNMENT, DST_ALIGNMENT>(dst_ + 0, _src + 0); | |
break; | |
#endif // !defined(USE_SSE2) | |
CASE(63, 48, { | |
copy_untyped_vector_32<SRC_ALIGNMENT, DST_ALIGNMENT>(dst_ + 0, _src + 0); | |
copy_untyped_vector_16<SRC_ALIGNMENT, DST_ALIGNMENT>(dst_ + 32, _src + 32); | |
}); | |
CASE(47, 32, { | |
copy_untyped_vector_32<SRC_ALIGNMENT, DST_ALIGNMENT>(dst_ + 0, _src + 0); | |
}); | |
CASE(31, 16, { | |
copy_untyped_vector_16<SRC_ALIGNMENT, DST_ALIGNMENT>(dst_ + 0, _src + 0); | |
}); | |
CASE(15, 0, {}); | |
} | |
} | |
RX_HINT_FORCE_INLINE static void copy_untyped_small_dispatch(Byte *RX_HINT_RESTRICT dst_, const Byte *RX_HINT_RESTRICT _src, Size _bytes) { | |
// Count the number of trailing zero bits to determine the alignment of each
// pointer: two raised to this count is the alignment. See the illustrative
// De Bruijn sketch after this function.
const auto dst_alignment = Algorithm::min(bit_search_lsb(reinterpret_cast<UintPtr>(dst_)), 5_z); | |
const auto src_alignment = Algorithm::min(bit_search_lsb(reinterpret_cast<UintPtr>(_src)), 5_z); | |
// TABLE[src_alignment][dst_alignment] | |
static constexpr void (*const TABLE[6][6])(Byte *RX_HINT_RESTRICT, const Byte *RX_HINT_RESTRICT, Size) = { | |
{ | |
&copy_untyped_small<1, 1>, &copy_untyped_small<1, 2>,
&copy_untyped_small<1, 4>, &copy_untyped_small<1, 8>,
&copy_untyped_small<1, 16>, &copy_untyped_small<1, 32>
},{
&copy_untyped_small<2, 1>, &copy_untyped_small<2, 2>,
&copy_untyped_small<2, 4>, &copy_untyped_small<2, 8>,
&copy_untyped_small<2, 16>, &copy_untyped_small<2, 32>
},{
&copy_untyped_small<4, 1>, &copy_untyped_small<4, 2>,
&copy_untyped_small<4, 4>, &copy_untyped_small<4, 8>,
&copy_untyped_small<4, 16>, &copy_untyped_small<4, 32>
},{
&copy_untyped_small<8, 1>, &copy_untyped_small<8, 2>,
&copy_untyped_small<8, 4>, &copy_untyped_small<8, 8>,
&copy_untyped_small<8, 16>, &copy_untyped_small<8, 32>
},{
&copy_untyped_small<16, 1>, &copy_untyped_small<16, 2>,
&copy_untyped_small<16, 4>, &copy_untyped_small<16, 8>,
&copy_untyped_small<16, 16>, &copy_untyped_small<16, 32>
},{
&copy_untyped_small<32, 1>, &copy_untyped_small<32, 2>,
&copy_untyped_small<32, 4>, &copy_untyped_small<32, 8>,
&copy_untyped_small<32, 16>, &copy_untyped_small<32, 32>
} | |
}; | |
TABLE[src_alignment][dst_alignment](dst_, _src, _bytes); | |
} | |
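// For reference, a minimal sketch of the De Bruijn trailing-zero count that
// the header comment describes. This sketch is illustrative only; the actual
// implementation used above is bit_search_lsb from rx/core/utility/bit.h and
// its details may differ.
[[maybe_unused]] static Uint32 ctz32_debruijn_sketch(Uint32 _value) {
static constexpr const Uint32 TABLE[32] = {
0, 1, 28, 2, 29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 4, 8,
31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6, 11, 5, 10, 9
};
// Isolate the lowest set bit, multiply by the De Bruijn constant, and use
// the top five bits of the product as a perfect-hash index into TABLE.
return TABLE[((_value & (0u - _value)) * 0x077CB531u) >> 27];
}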
#if defined(USE_SSE2) | |
// STREAMING implies DST_ALIGNMENT. Cannot use streaming store without aligned | |
// destination. | |
template<Size SRC_ALIGNMENT, Size DST_ALIGNMENT, bool STREAMING> | |
RX_HINT_NO_INLINE static void copy_untyped_large_sse2(Byte *RX_HINT_RESTRICT &dst_, const Byte *RX_HINT_RESTRICT &src_, Size &bytes_) { | |
auto src = reinterpret_cast<const Byte*>(src_); | |
auto dst = reinterpret_cast<Byte*>(dst_); | |
Size bytes = bytes_; | |
_mm_prefetch(reinterpret_cast<const char*>(src), _MM_HINT_NTA); | |
for (; bytes >= 128; bytes -= 128) { | |
__m128i m0, m1, m2, m3, m4, m5, m6, m7; | |
if constexpr (SRC_ALIGNMENT % 16 == 0) { | |
m0 = _mm_load_si128(reinterpret_cast<const __m128i*>(src) + 0); | |
m1 = _mm_load_si128(reinterpret_cast<const __m128i*>(src) + 1); | |
m2 = _mm_load_si128(reinterpret_cast<const __m128i*>(src) + 2); | |
m3 = _mm_load_si128(reinterpret_cast<const __m128i*>(src) + 3); | |
m4 = _mm_load_si128(reinterpret_cast<const __m128i*>(src) + 4); | |
m5 = _mm_load_si128(reinterpret_cast<const __m128i*>(src) + 5); | |
m6 = _mm_load_si128(reinterpret_cast<const __m128i*>(src) + 6); | |
m7 = _mm_load_si128(reinterpret_cast<const __m128i*>(src) + 7); | |
} else { | |
m0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src) + 0); | |
m1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src) + 1); | |
m2 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src) + 2); | |
m3 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src) + 3); | |
m4 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src) + 4); | |
m5 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src) + 5); | |
m6 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src) + 6); | |
m7 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src) + 7); | |
} | |
_mm_prefetch(reinterpret_cast<const char*>(src + 256), _MM_HINT_NTA); | |
src += 128; // This needs to be immediately after _mm_prefetch. | |
if constexpr (DST_ALIGNMENT % 16 == 0 && STREAMING) { | |
_mm_stream_si128(reinterpret_cast<__m128i*>(dst) + 0, m0); | |
_mm_stream_si128(reinterpret_cast<__m128i*>(dst) + 1, m1); | |
_mm_stream_si128(reinterpret_cast<__m128i*>(dst) + 2, m2); | |
_mm_stream_si128(reinterpret_cast<__m128i*>(dst) + 3, m3); | |
_mm_stream_si128(reinterpret_cast<__m128i*>(dst) + 4, m4); | |
_mm_stream_si128(reinterpret_cast<__m128i*>(dst) + 5, m5); | |
_mm_stream_si128(reinterpret_cast<__m128i*>(dst) + 6, m6); | |
_mm_stream_si128(reinterpret_cast<__m128i*>(dst) + 7, m7); | |
} else if constexpr (DST_ALIGNMENT % 16 == 0) { | |
_mm_store_si128(reinterpret_cast<__m128i*>(dst) + 0, m0); | |
_mm_store_si128(reinterpret_cast<__m128i*>(dst) + 1, m1); | |
_mm_store_si128(reinterpret_cast<__m128i*>(dst) + 2, m2); | |
_mm_store_si128(reinterpret_cast<__m128i*>(dst) + 3, m3); | |
_mm_store_si128(reinterpret_cast<__m128i*>(dst) + 4, m4); | |
_mm_store_si128(reinterpret_cast<__m128i*>(dst) + 5, m5); | |
_mm_store_si128(reinterpret_cast<__m128i*>(dst) + 6, m6); | |
_mm_store_si128(reinterpret_cast<__m128i*>(dst) + 7, m7); | |
} else { | |
_mm_storeu_si128(reinterpret_cast<__m128i*>(dst) + 0, m0); | |
_mm_storeu_si128(reinterpret_cast<__m128i*>(dst) + 1, m1); | |
_mm_storeu_si128(reinterpret_cast<__m128i*>(dst) + 2, m2); | |
_mm_storeu_si128(reinterpret_cast<__m128i*>(dst) + 3, m3); | |
_mm_storeu_si128(reinterpret_cast<__m128i*>(dst) + 4, m4); | |
_mm_storeu_si128(reinterpret_cast<__m128i*>(dst) + 5, m5); | |
_mm_storeu_si128(reinterpret_cast<__m128i*>(dst) + 6, m6); | |
_mm_storeu_si128(reinterpret_cast<__m128i*>(dst) + 7, m7); | |
} | |
dst += 128; | |
} | |
_mm_sfence(); | |
// Update out parameters. | |
src_ = src; | |
dst_ = dst; | |
bytes_ = bytes; | |
} | |
RX_HINT_FORCE_INLINE static void copy_untyped_large_sse2_dispatch(Byte *RX_HINT_RESTRICT &dst_, const Byte *RX_HINT_RESTRICT &src_, Size &bytes_) { | |
// The destination was aligned to a 16-byte boundary above, so its exponent
// from bit_search_lsb is at least 4; both exponents are clamped to 5 (32-byte
// alignment) to index the dispatch table.
const auto dst_alignment = Algorithm::min(bit_search_lsb(reinterpret_cast<UintPtr>(dst_)), 5_z); | |
const auto src_alignment = Algorithm::min(bit_search_lsb(reinterpret_cast<UintPtr>(src_)), 5_z); | |
static constexpr void (*const TABLE[6][2][6])(Byte *RX_HINT_RESTRICT&, const Byte *RX_HINT_RESTRICT&, Size&) = { | |
{ | |
{ | |
&copy_untyped_large_sse2<1, 1, false>, &copy_untyped_large_sse2<1, 2, false>,
&copy_untyped_large_sse2<1, 4, false>, &copy_untyped_large_sse2<1, 8, false>,
&copy_untyped_large_sse2<1, 16, false>, &copy_untyped_large_sse2<1, 32, false>
},{
&copy_untyped_large_sse2<1, 1, true>, &copy_untyped_large_sse2<1, 2, true>,
&copy_untyped_large_sse2<1, 4, true>, &copy_untyped_large_sse2<1, 8, true>,
&copy_untyped_large_sse2<1, 16, true>, &copy_untyped_large_sse2<1, 32, true>
}
},{
{
&copy_untyped_large_sse2<2, 1, false>, &copy_untyped_large_sse2<2, 2, false>,
&copy_untyped_large_sse2<2, 4, false>, &copy_untyped_large_sse2<2, 8, false>,
&copy_untyped_large_sse2<2, 16, false>, &copy_untyped_large_sse2<2, 32, false>
},{
&copy_untyped_large_sse2<2, 1, true>, &copy_untyped_large_sse2<2, 2, true>,
&copy_untyped_large_sse2<2, 4, true>, &copy_untyped_large_sse2<2, 8, true>,
&copy_untyped_large_sse2<2, 16, true>, &copy_untyped_large_sse2<2, 32, true>
}
},{
{
&copy_untyped_large_sse2<4, 1, false>, &copy_untyped_large_sse2<4, 2, false>,
&copy_untyped_large_sse2<4, 4, false>, &copy_untyped_large_sse2<4, 8, false>,
&copy_untyped_large_sse2<4, 16, false>, &copy_untyped_large_sse2<4, 32, false>
},{
&copy_untyped_large_sse2<4, 1, true>, &copy_untyped_large_sse2<4, 2, true>,
&copy_untyped_large_sse2<4, 4, true>, &copy_untyped_large_sse2<4, 8, true>,
&copy_untyped_large_sse2<4, 16, true>, &copy_untyped_large_sse2<4, 32, true>
}
},{
{
&copy_untyped_large_sse2<8, 1, false>, &copy_untyped_large_sse2<8, 2, false>,
&copy_untyped_large_sse2<8, 4, false>, &copy_untyped_large_sse2<8, 8, false>,
&copy_untyped_large_sse2<8, 16, false>, &copy_untyped_large_sse2<8, 32, false>
},{
&copy_untyped_large_sse2<8, 1, true>, &copy_untyped_large_sse2<8, 2, true>,
&copy_untyped_large_sse2<8, 4, true>, &copy_untyped_large_sse2<8, 8, true>,
&copy_untyped_large_sse2<8, 16, true>, &copy_untyped_large_sse2<8, 32, true>
}
},{
{
&copy_untyped_large_sse2<16, 1, false>, &copy_untyped_large_sse2<16, 2, false>,
&copy_untyped_large_sse2<16, 4, false>, &copy_untyped_large_sse2<16, 8, false>,
&copy_untyped_large_sse2<16, 16, false>, &copy_untyped_large_sse2<16, 32, false>
},{
&copy_untyped_large_sse2<16, 1, true>, &copy_untyped_large_sse2<16, 2, true>,
&copy_untyped_large_sse2<16, 4, true>, &copy_untyped_large_sse2<16, 8, true>,
&copy_untyped_large_sse2<16, 16, true>, &copy_untyped_large_sse2<16, 32, true>
}
},{
{
&copy_untyped_large_sse2<32, 1, false>, &copy_untyped_large_sse2<32, 2, false>,
&copy_untyped_large_sse2<32, 4, false>, &copy_untyped_large_sse2<32, 8, false>,
&copy_untyped_large_sse2<32, 16, false>, &copy_untyped_large_sse2<32, 32, false>
},{
&copy_untyped_large_sse2<32, 1, true>, &copy_untyped_large_sse2<32, 2, true>,
&copy_untyped_large_sse2<32, 4, true>, &copy_untyped_large_sse2<32, 8, true>,
&copy_untyped_large_sse2<32, 16, true>, &copy_untyped_large_sse2<32, 32, true>
} | |
} | |
}; | |
// Medium-sized copies up to the L2 cache size use regular stores to benefit
// from caching; larger copies switch to the streaming variants to avoid
// polluting the cache.
static constexpr const Size L2_CACHE_SIZE = 0x200000; // 2 MiB | |
TABLE[src_alignment][bytes_ > L2_CACHE_SIZE][dst_alignment](dst_, src_, bytes_); | |
} | |
#endif // defined(USE_SSE2) | |
#if defined(USE_AVX) | |
// STREAMING implies DST_ALIGNMENT. Cannot use streaming store without aligned | |
// destination. | |
template<Size SRC_ALIGNMENT, Size DST_ALIGNMENT, bool STREAMING> | |
RX_HINT_NO_INLINE static void copy_untyped_large_avx(Byte *RX_HINT_RESTRICT &dst_, const Byte *RX_HINT_RESTRICT &src_, Size &bytes_) { | |
auto src = reinterpret_cast<const Byte*>(src_); | |
auto dst = reinterpret_cast<Byte*>(dst_); | |
Size bytes = bytes_; | |
_mm_prefetch(reinterpret_cast<const char*>(src), _MM_HINT_NTA); | |
for (; bytes >= 256; bytes -= 256) { | |
__m256i m0, m1, m2, m3, m4, m5, m6, m7; | |
if constexpr (SRC_ALIGNMENT % 32 == 0) { | |
m0 = _mm256_load_si256(reinterpret_cast<const __m256i*>(src) + 0); | |
m1 = _mm256_load_si256(reinterpret_cast<const __m256i*>(src) + 1); | |
m2 = _mm256_load_si256(reinterpret_cast<const __m256i*>(src) + 2); | |
m3 = _mm256_load_si256(reinterpret_cast<const __m256i*>(src) + 3); | |
m4 = _mm256_load_si256(reinterpret_cast<const __m256i*>(src) + 4); | |
m5 = _mm256_load_si256(reinterpret_cast<const __m256i*>(src) + 5); | |
m6 = _mm256_load_si256(reinterpret_cast<const __m256i*>(src) + 6); | |
m7 = _mm256_load_si256(reinterpret_cast<const __m256i*>(src) + 7); | |
} else { | |
m0 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(src) + 0); | |
m1 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(src) + 1); | |
m2 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(src) + 2); | |
m3 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(src) + 3); | |
m4 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(src) + 4); | |
m5 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(src) + 5); | |
m6 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(src) + 6); | |
m7 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(src) + 7); | |
} | |
_mm_prefetch(reinterpret_cast<const char*>(src + 512), _MM_HINT_NTA); | |
src += 256; // This needs to be immediately after _mm_prefetch. | |
if constexpr (DST_ALIGNMENT % 32 == 0 && STREAMING) { | |
_mm256_stream_si256(reinterpret_cast<__m256i*>(dst) + 0, m0); | |
_mm256_stream_si256(reinterpret_cast<__m256i*>(dst) + 1, m1); | |
_mm256_stream_si256(reinterpret_cast<__m256i*>(dst) + 2, m2); | |
_mm256_stream_si256(reinterpret_cast<__m256i*>(dst) + 3, m3); | |
_mm256_stream_si256(reinterpret_cast<__m256i*>(dst) + 4, m4); | |
_mm256_stream_si256(reinterpret_cast<__m256i*>(dst) + 5, m5); | |
_mm256_stream_si256(reinterpret_cast<__m256i*>(dst) + 6, m6); | |
_mm256_stream_si256(reinterpret_cast<__m256i*>(dst) + 7, m7); | |
} else if constexpr (DST_ALIGNMENT % 32 == 0) { | |
_mm256_store_si256(reinterpret_cast<__m256i*>(dst) + 0, m0); | |
_mm256_store_si256(reinterpret_cast<__m256i*>(dst) + 1, m1); | |
_mm256_store_si256(reinterpret_cast<__m256i*>(dst) + 2, m2); | |
_mm256_store_si256(reinterpret_cast<__m256i*>(dst) + 3, m3); | |
_mm256_store_si256(reinterpret_cast<__m256i*>(dst) + 4, m4); | |
_mm256_store_si256(reinterpret_cast<__m256i*>(dst) + 5, m5); | |
_mm256_store_si256(reinterpret_cast<__m256i*>(dst) + 6, m6); | |
_mm256_store_si256(reinterpret_cast<__m256i*>(dst) + 7, m7); | |
} else { | |
_mm256_storeu_si256(reinterpret_cast<__m256i*>(dst) + 0, m0); | |
_mm256_storeu_si256(reinterpret_cast<__m256i*>(dst) + 1, m1); | |
_mm256_storeu_si256(reinterpret_cast<__m256i*>(dst) + 2, m2); | |
_mm256_storeu_si256(reinterpret_cast<__m256i*>(dst) + 3, m3); | |
_mm256_storeu_si256(reinterpret_cast<__m256i*>(dst) + 4, m4); | |
_mm256_storeu_si256(reinterpret_cast<__m256i*>(dst) + 5, m5); | |
_mm256_storeu_si256(reinterpret_cast<__m256i*>(dst) + 6, m6); | |
_mm256_storeu_si256(reinterpret_cast<__m256i*>(dst) + 7, m7); | |
} | |
dst += 256; | |
} | |
_mm_sfence(); | |
// Update out parameters. | |
src_ = src; | |
dst_ = dst; | |
bytes_ = bytes; | |
} | |
RX_HINT_FORCE_INLINE static void copy_untyped_large_avx_dispatch(Byte *RX_HINT_RESTRICT &dst_, const Byte *RX_HINT_RESTRICT &src_, Size &bytes_) { | |
// The destination was aligned to a 32-byte boundary above, so its exponent
// from bit_search_lsb is at least 5; both exponents are clamped to 5 (32-byte
// alignment) to index the dispatch table.
const auto dst_alignment = Algorithm::min(bit_search_lsb(reinterpret_cast<UintPtr>(dst_)), 5_z); | |
const auto src_alignment = Algorithm::min(bit_search_lsb(reinterpret_cast<UintPtr>(src_)), 5_z); | |
static constexpr void (*const TABLE[6][2][6])(Byte *RX_HINT_RESTRICT&, const Byte *RX_HINT_RESTRICT&, Size&) = { | |
{ | |
{ | |
&copy_untyped_large_avx<1, 1, false>, &copy_untyped_large_avx<1, 2, false>,
&copy_untyped_large_avx<1, 4, false>, &copy_untyped_large_avx<1, 8, false>,
&copy_untyped_large_avx<1, 16, false>, &copy_untyped_large_avx<1, 32, false>
},{
&copy_untyped_large_avx<1, 1, true>, &copy_untyped_large_avx<1, 2, true>,
&copy_untyped_large_avx<1, 4, true>, &copy_untyped_large_avx<1, 8, true>,
&copy_untyped_large_avx<1, 16, true>, &copy_untyped_large_avx<1, 32, true>
}
},{
{
&copy_untyped_large_avx<2, 1, false>, &copy_untyped_large_avx<2, 2, false>,
&copy_untyped_large_avx<2, 4, false>, &copy_untyped_large_avx<2, 8, false>,
&copy_untyped_large_avx<2, 16, false>, &copy_untyped_large_avx<2, 32, false>
},{
&copy_untyped_large_avx<2, 1, true>, &copy_untyped_large_avx<2, 2, true>,
&copy_untyped_large_avx<2, 4, true>, &copy_untyped_large_avx<2, 8, true>,
&copy_untyped_large_avx<2, 16, true>, &copy_untyped_large_avx<2, 32, true>
}
},{
{
&copy_untyped_large_avx<4, 1, false>, &copy_untyped_large_avx<4, 2, false>,
&copy_untyped_large_avx<4, 4, false>, &copy_untyped_large_avx<4, 8, false>,
&copy_untyped_large_avx<4, 16, false>, &copy_untyped_large_avx<4, 32, false>
},{
&copy_untyped_large_avx<4, 1, true>, &copy_untyped_large_avx<4, 2, true>,
&copy_untyped_large_avx<4, 4, true>, &copy_untyped_large_avx<4, 8, true>,
&copy_untyped_large_avx<4, 16, true>, &copy_untyped_large_avx<4, 32, true>
}
},{
{
&copy_untyped_large_avx<8, 1, false>, &copy_untyped_large_avx<8, 2, false>,
&copy_untyped_large_avx<8, 4, false>, &copy_untyped_large_avx<8, 8, false>,
&copy_untyped_large_avx<8, 16, false>, &copy_untyped_large_avx<8, 32, false>
},{
&copy_untyped_large_avx<8, 1, true>, &copy_untyped_large_avx<8, 2, true>,
&copy_untyped_large_avx<8, 4, true>, &copy_untyped_large_avx<8, 8, true>,
&copy_untyped_large_avx<8, 16, true>, &copy_untyped_large_avx<8, 32, true>
}
},{
{
&copy_untyped_large_avx<16, 1, false>, &copy_untyped_large_avx<16, 2, false>,
&copy_untyped_large_avx<16, 4, false>, &copy_untyped_large_avx<16, 8, false>,
&copy_untyped_large_avx<16, 16, false>, &copy_untyped_large_avx<16, 32, false>
},{
&copy_untyped_large_avx<16, 1, true>, &copy_untyped_large_avx<16, 2, true>,
&copy_untyped_large_avx<16, 4, true>, &copy_untyped_large_avx<16, 8, true>,
&copy_untyped_large_avx<16, 16, true>, &copy_untyped_large_avx<16, 32, true>
}
},{
{
&copy_untyped_large_avx<32, 1, false>, &copy_untyped_large_avx<32, 2, false>,
&copy_untyped_large_avx<32, 4, false>, &copy_untyped_large_avx<32, 8, false>,
&copy_untyped_large_avx<32, 16, false>, &copy_untyped_large_avx<32, 32, false>
},{
&copy_untyped_large_avx<32, 1, true>, &copy_untyped_large_avx<32, 2, true>,
&copy_untyped_large_avx<32, 4, true>, &copy_untyped_large_avx<32, 8, true>,
&copy_untyped_large_avx<32, 16, true>, &copy_untyped_large_avx<32, 32, true>
} | |
} | |
}; | |
// Medium-sized copies up to the L2 cache size use regular stores to benefit
// from caching; larger copies switch to the streaming variants to avoid
// polluting the cache.
static constexpr const Size L2_CACHE_SIZE = 0x200000; // 2 MiB | |
TABLE[src_alignment][bytes_ > L2_CACHE_SIZE][dst_alignment](dst_, src_, bytes_); | |
} | |
#endif // defined(USE_AVX) | |
void* copy_untyped(void *RX_HINT_RESTRICT dst_, const void *RX_HINT_RESTRICT _src, Size _bytes) { | |
auto dst = reinterpret_cast<Byte*>(dst_); | |
auto src = reinterpret_cast<const Byte*>(_src); | |
// Help check for undefined calls to memcpy. | |
RX_ASSERT(dst, "null destination"); | |
RX_ASSERT(src, "null source"); | |
if (_bytes <= SMALL_SIZE) { | |
copy_untyped_small_dispatch(dst, src, _bytes); | |
return dst_; | |
} | |
#if defined(USE_AVX) | |
// Copies larger than SMALL_SIZE bytes, process with AVX if available. | |
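// Align the destination to a 32-byte boundary. The single unaligned 32-byte
// copy below covers these padding bytes up front; this is safe since
// _bytes > SMALL_SIZE (256 for AVX) at this point.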
const auto padding = (32 - (reinterpret_cast<UintPtr>(dst) & 31)) & 31; | |
_mm256_storeu_si256(reinterpret_cast<__m256i*>(dst), _mm256_loadu_si256(reinterpret_cast<const __m256i*>(src))); | |
dst += padding; | |
src += padding; | |
_bytes -= padding; | |
copy_untyped_large_avx_dispatch(dst, src, _bytes); | |
copy_untyped_small_dispatch(dst, src, _bytes); | |
#elif defined(USE_SSE2) | |
// Copies larger than SMALL_SIZE bytes, process with SSE2 if available. | |
// Align destination to 16-byte boundary. | |
const auto padding = (16 - (reinterpret_cast<UintPtr>(dst) & 15)) & 15; | |
// Handle padding bytes first, if any. | |
_mm_storeu_si128(reinterpret_cast<__m128i*>(dst), _mm_loadu_si128(reinterpret_cast<const __m128i*>(src))); | |
dst += padding; | |
src += padding; | |
_bytes -= padding; | |
copy_untyped_large_sse2_dispatch(dst, src, _bytes); | |
copy_untyped_small_dispatch(dst, src, _bytes); | |
#else | |
#if defined(RX_BYTE_ORDER_LITTLE_ENDIAN) | |
#define LS(x, y) ((x) >> (y)) | |
#define RS(x, y) ((x) << (y)) | |
#else | |
#define LS(x, y) ((x) << (y)) | |
#define RS(x, y) ((x) >> (y)) | |
#endif | |
// Align to 4-byte boundary. 8-byte boundary was also tested but ended up being | |
// about 0.33x slower on average across AARCH64, AMD64, and WASM32. | |
for (; reinterpret_cast<UintPtr>(src) % 4 && _bytes; _bytes--) { | |
*dst++ = *src++; | |
} | |
// The source and destination can be up to 4-byte aligned, but it would also | |
// be helpful to use 8-byte load and store on 64-bit architectures that | |
// support unaligned scalar load and stores. This helps AARCH64 quite a bit. | |
// Do not do this for 32-bit architectures though.
#if defined(ALLOW_UNALIGNED_SCALAR_LOAD) && defined(ALLOW_UNALIGNED_SCALAR_STORE)
constexpr bool ALLOW_64_UNALIGNED = sizeof(void*) == 8;
#else
constexpr bool ALLOW_64_UNALIGNED = false;
#endif
// Destination is aligned on 4-byte boundary. | |
if (reinterpret_cast<UintPtr>(dst) % 4 == 0) { | |
for (; _bytes >= 16; src += 16, dst += 16, _bytes -= 16) { | |
if constexpr (ALLOW_64_UNALIGNED) { | |
storea64(dst + 0, loada64(src + 0)); | |
storea64(dst + 8, loada64(src + 8)); | |
} else { | |
storea32(dst + 0, loada32(src + 0)); | |
storea32(dst + 4, loada32(src + 4)); | |
storea32(dst + 8, loada32(src + 8)); | |
storea32(dst + 12, loada32(src + 12)); | |
} | |
} | |
if (_bytes & 8) { | |
if constexpr (ALLOW_64_UNALIGNED) { | |
storea64(dst + 0, loada64(src + 0)); | |
} else { | |
storea32(dst + 0, loada32(src + 0)); | |
storea32(dst + 4, loada32(src + 4)); | |
} | |
dst += 8; | |
src += 8; | |
} | |
if (_bytes & 4) { | |
storea32(dst + 0, loada32(src + 0)); | |
dst += 4; | |
src += 4; | |
} | |
if (_bytes & 2) { | |
storea16(dst + 0, loada16(src + 0)); | |
dst += 2; | |
src += 2; | |
} | |
if (_bytes & 1) { | |
*dst = *src; | |
} | |
return dst_; | |
} | |
// Handle destination misalignment. | |
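// The destination is misaligned by 1, 2, or 3 bytes here. Copy the leading
// bytes individually to align it, then combine adjacent aligned 32-bit source
// words with the endian-aware LS/RS shifts so every word store below is
// aligned.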
Uint32 w, x; | |
if (_bytes >= 32) switch (reinterpret_cast<UintPtr>(dst) % 4) { | |
case 1: | |
w = loadu32(src); | |
*dst++ = *src++; | |
*dst++ = *src++; | |
*dst++ = *src++; | |
_bytes -= 3; | |
for (; _bytes >= 17; src += 16, dst += 16, _bytes -= 16) { | |
x = loada32(src + 1); | |
storea32(dst + 0, LS(w, 24) | RS(x, 8)); | |
w = loada32(src + 5); | |
storea32(dst + 4, LS(x, 24) | RS(w, 8)); | |
x = loada32(src + 9); | |
storea32(dst + 8, LS(w, 24) | RS(x, 8)); | |
w = loada32(src + 13); | |
storea32(dst + 12, LS(x, 24) | RS(w, 8)); | |
} | |
break; | |
case 2: | |
w = loadu32(src); | |
*dst++ = *src++; | |
*dst++ = *src++; | |
_bytes -= 2; | |
for (; _bytes >= 18; src += 16, dst += 16, _bytes -= 16) { | |
x = loada32(src + 2); | |
storea32(dst + 0, LS(w, 16) | RS(x, 16)); | |
w = loada32(src + 6); | |
storea32(dst + 4, LS(x, 16) | RS(w, 16)); | |
x = loada32(src + 10); | |
storea32(dst + 8, LS(w, 16) | RS(x, 16)); | |
w = loada32(src + 14); | |
storea32(dst + 12, LS(x, 16) | RS(w, 16)); | |
} | |
break; | |
case 3: | |
w = loadu32(src); | |
*dst++ = *src++; | |
_bytes -= 1; | |
for (; _bytes >= 19; src += 16, dst += 16, _bytes -= 16) { | |
x = loada32(src + 3); | |
storea32(dst + 0, LS(w, 8) | RS(x, 24)); | |
w = loada32(src + 7); | |
storea32(dst + 4, LS(x, 8) | RS(w, 24)); | |
x = loada32(src + 11); | |
storea32(dst + 8, LS(w, 8) | RS(x, 24)); | |
w = loada32(src + 15); | |
storea32(dst + 12, LS(x, 8) | RS(w, 24)); | |
} | |
break; | |
} | |
// The source is no longer 4-byte aligned here, so use the unaligned load
// helpers for the word-sized tail loads.
if (_bytes & 16) {
if constexpr (ALLOW_64_UNALIGNED) {
storea64(dst + 0, loadu64(src + 0));
storea64(dst + 8, loadu64(src + 8));
} else {
storea32(dst + 0, loadu32(src + 0));
storea32(dst + 4, loadu32(src + 4));
storea32(dst + 8, loadu32(src + 8));
storea32(dst + 12, loadu32(src + 12));
}
src += 16;
dst += 16;
}
if (_bytes & 8) {
if constexpr (ALLOW_64_UNALIGNED) {
storea64(dst + 0, loadu64(src + 0));
} else {
storea32(dst + 0, loadu32(src + 0));
storea32(dst + 4, loadu32(src + 4));
}
src += 8;
dst += 8;
}
if (_bytes & 4) {
storea32(dst + 0, loadu32(src + 0));
src += 4;
dst += 4;
}
if (_bytes & 2) {
storea16(dst + 0, loadu16(src + 0));
src += 2;
dst += 2;
}
if (_bytes & 1) {
*dst = *src;
}
#endif | |
return dst_; | |
} | |
} // namespace Rx::Memory |
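// Illustrative usage of the public entry point above. The wrapper below is
// hypothetical and not part of the library; it only shows the calling
// convention: non-null, non-overlapping pointers and a byte count.
namespace Rx::Memory {
[[maybe_unused]] static void copy_untyped_usage_example(Byte* dst_, const Byte* _src, Size _bytes) {
// Behaves like memcpy: the regions must not overlap.
copy_untyped(dst_, _src, _bytes);
}
} // namespace Rx::Memory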