Skip to content

Instantly share code, notes, and snippets.

@JinShil
Created June 6, 2019 12:38
Show Gist options
  • Save JinShil/f2a6b288fd00e81222d7f18e0deeeef0 to your computer and use it in GitHub Desktop.
Save JinShil/f2a6b288fd00e81222d7f18e0deeeef0 to your computer and use it in GitHub Desktop.
rte_memcpy in D
import core.simd: void16, loadUnaligned, storeUnaligned;
/**
 * Copies exactly 16 bytes from `src` to `dst` with one unaligned vector
 * load followed by one unaligned vector store. Neither pointer needs any
 * particular alignment.
 */
pragma(inline, true)
void rte_mov16(ubyte *dst, ubyte*src) {
    auto from = cast(void16*) src;
    auto to = cast(void16*) dst;
    storeUnaligned(to, loadUnaligned(from));
}
/**
 * Copies exactly 32 bytes from `src` to `dst` as two consecutive unaligned
 * 16-byte vector moves, lowest addresses first.
 */
pragma(inline, true)
void rte_mov32(ubyte *dst, ubyte *src) {
    /* Stepping a void16 pointer by one advances it by 16 bytes. */
    auto from = cast(void16*) src;
    auto to = cast(void16*) dst;
    storeUnaligned(to, loadUnaligned(from));
    storeUnaligned(to + 1, loadUnaligned(from + 1));
}
/**
 * Copies exactly 48 bytes from `src` to `dst` as three consecutive
 * unaligned 16-byte vector moves, lowest addresses first.
 */
pragma(inline, true)
void rte_mov48(ubyte *dst, ubyte *src) {
    /* Stepping a void16 pointer by one advances it by 16 bytes. */
    auto from = cast(void16*) src;
    auto to = cast(void16*) dst;
    storeUnaligned(to, loadUnaligned(from));
    storeUnaligned(to + 1, loadUnaligned(from + 1));
    storeUnaligned(to + 2, loadUnaligned(from + 2));
}
/**
 * Copies exactly 64 bytes from `src` to `dst` as four consecutive
 * unaligned 16-byte vector moves, lowest addresses first.
 */
pragma(inline, true)
void rte_mov64(ubyte *dst, ubyte *src) {
    /* Stepping a void16 pointer by one advances it by 16 bytes. */
    auto from = cast(void16*) src;
    auto to = cast(void16*) dst;
    storeUnaligned(to, loadUnaligned(from));
    storeUnaligned(to + 1, loadUnaligned(from + 1));
    storeUnaligned(to + 2, loadUnaligned(from + 2));
    storeUnaligned(to + 3, loadUnaligned(from + 3));
}
/**
 * Copies exactly 128 bytes from `src` to `dst` as eight consecutive
 * unaligned 16-byte vector moves, lowest addresses first.
 */
pragma(inline, true)
void rte_mov128(ubyte *dst, ubyte *src) {
    /* Stepping a void16 pointer by one advances it by 16 bytes. */
    auto from = cast(void16*) src;
    auto to = cast(void16*) dst;
    storeUnaligned(to, loadUnaligned(from));
    storeUnaligned(to + 1, loadUnaligned(from + 1));
    storeUnaligned(to + 2, loadUnaligned(from + 2));
    storeUnaligned(to + 3, loadUnaligned(from + 3));
    storeUnaligned(to + 4, loadUnaligned(from + 4));
    storeUnaligned(to + 5, loadUnaligned(from + 5));
    storeUnaligned(to + 6, loadUnaligned(from + 6));
    storeUnaligned(to + 7, loadUnaligned(from + 7));
}
/**
 * Copies exactly 256 bytes from `src` to `dst` as two 128-byte halves,
 * lower half first. Unlike the smaller helpers, this one is explicitly
 * marked as not to be inlined.
 */
pragma(inline, false)
void rte_mov256(ubyte *dst, ubyte *src) {
    enum size_t half = 128;
    ubyte *dstUpper = dst + half;
    ubyte *srcUpper = src + half;
    rte_mov128(dst, src);
    rte_mov128(dstUpper, srcUpper);
}
/**
 * Copies `n` bytes from `src` to `dst` using unaligned 16-byte vector moves.
 *
 * Strategy, by size:
 *   - n < 16:    scalar copies of 1, 2, 4 and 8 bytes selected by the bits of n.
 *   - n <= 128:  one block copy anchored at the start and one anchored at the
 *                end of the region; the two may overlap in the middle.
 *   - otherwise: 256-byte blocks, then up to three 64-byte blocks, then up to
 *                three 16-byte blocks, then one final 16-byte copy anchored
 *                at the end of the region for any leftover bytes.
 *
 * No overlap detection is performed and data is always copied front to
 * back, so the source and destination regions should not overlap.
 *
 * Params:
 *   dst = start of the destination region (at least `n` writable bytes)
 *   src = start of the source region (at least `n` readable bytes)
 *   n   = number of bytes to copy
 *
 * Returns: the original value of `dst`.
 */
void *rte_memcpy_func(void *dst, void *src, size_t n) {
    void *ret = dst;
    /* Byte-typed cursors so that all pointer arithmetic below is in bytes. */
    ubyte *d = cast(ubyte *)dst;
    ubyte *s = cast(ubyte *)src;

    /* We can't copy < 16 bytes using XMM registers so do it manually. */
    if (n < 16) {
        if (n & 0x01) {
            *d = *s;
            d += ubyte.sizeof;
            s += ubyte.sizeof;
        }
        if (n & 0x02) {
            *cast(ushort *)d = *cast(ushort *)s;
            d += ushort.sizeof;
            s += ushort.sizeof;
        }
        if (n & 0x04) {
            *cast(uint *)d = *cast(uint *)s;
            d += uint.sizeof;
            s += uint.sizeof;
        }
        if (n & 0x08) {
            *cast(ulong *)d = *cast(ulong *)s;
        }
        return ret;
    }

    /*
     * Special fast cases for <= 128 bytes: copy one block from the start
     * and one block ending exactly at the end of the region. The second
     * copy may re-copy bytes already written by the first.
     */
    if (n <= 32) {
        rte_mov16(d, s);
        rte_mov16(d - 16 + n, s - 16 + n);
        return ret;
    }
    if (n <= 64) {
        rte_mov32(d, s);
        rte_mov32(d - 32 + n, s - 32 + n);
        return ret;
    }
    if (n <= 128) {
        rte_mov64(d, s);
        rte_mov64(d - 64 + n, s - 64 + n);
        return ret;
    }

    /*
     * For large copies > 128 bytes. This combination of 256, 64 and 16 byte
     * copies was found to be faster than doing 128 and 32 byte copies as
     * well.
     */
    for ( ; n >= 256; n -= 256) {
        rte_mov256(d, s);
        d += 256;
        s += 256;
    }

    /*
     * We split the remaining bytes (which will be less than 256) into
     * 64byte (2^6) chunks.
     * Using incrementing integers in the case labels of a switch statement
     * encourages the compiler to use a jump table. To get incrementing
     * integers, we shift the 2 relevant bits to the LSB position to first
     * get decrementing integers, and then subtract.
     *
     * D does not allow implicit fallthrough out of a non-empty case, so
     * each case continues into the next one with an explicit `goto case;`.
     */
    switch (3 - (n >> 6)) {
    case 0x00:
        rte_mov64(d, s);
        n -= 64;
        d += 64;
        s += 64;
        goto case;
    case 0x01:
        rte_mov64(d, s);
        n -= 64;
        d += 64;
        s += 64;
        goto case;
    case 0x02:
        rte_mov64(d, s);
        n -= 64;
        d += 64;
        s += 64;
        break;
    default:
        break;
    }

    /*
     * We split the remaining bytes (which will be less than 64) into
     * 16byte (2^4) chunks, using the same switch structure as above.
     */
    switch (3 - (n >> 4)) {
    case 0x00:
        rte_mov16(d, s);
        n -= 16;
        d += 16;
        s += 16;
        goto case;
    case 0x01:
        rte_mov16(d, s);
        n -= 16;
        d += 16;
        s += 16;
        goto case;
    case 0x02:
        rte_mov16(d, s);
        n -= 16;
        d += 16;
        s += 16;
        break;
    default:
        break;
    }

    /*
     * Copy any remaining bytes (fewer than 16), without going beyond the
     * end of the buffers: the final 16-byte move is anchored at the end of
     * the region and overlaps bytes that were already copied.
     */
    if (n != 0) {
        rte_mov16(d - 16 + n, s - 16 + n);
    }
    return ret;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment