rte_memcpy in D
import core.simd : void16, loadUnaligned, storeUnaligned;

/* Copy 16 bytes from src to dst using one unaligned 128-bit load/store. */
pragma(inline, true)
void rte_mov16(ubyte *dst, ubyte *src) {
    storeUnaligned(cast(void16*)(dst), loadUnaligned(cast(void16*)(src)));
}
pragma(inline, true)
void rte_mov32(ubyte *dst, ubyte *src) {
    storeUnaligned(cast(void16*)(dst), loadUnaligned(cast(void16*)(src)));
    storeUnaligned(cast(void16*)(dst + 16), loadUnaligned(cast(void16*)(src + 16)));
}
pragma(inline, true)
void rte_mov48(ubyte *dst, ubyte *src) {
    storeUnaligned(cast(void16*)(dst), loadUnaligned(cast(void16*)(src)));
    storeUnaligned(cast(void16*)(dst + 16), loadUnaligned(cast(void16*)(src + 16)));
    storeUnaligned(cast(void16*)(dst + 32), loadUnaligned(cast(void16*)(src + 32)));
}
pragma(inline, true)
void rte_mov64(ubyte *dst, ubyte *src) {
    storeUnaligned(cast(void16*)(dst), loadUnaligned(cast(void16*)(src)));
    storeUnaligned(cast(void16*)(dst + 16), loadUnaligned(cast(void16*)(src + 16)));
    storeUnaligned(cast(void16*)(dst + 32), loadUnaligned(cast(void16*)(src + 32)));
    storeUnaligned(cast(void16*)(dst + 48), loadUnaligned(cast(void16*)(src + 48)));
}
pragma(inline, true)
void rte_mov128(ubyte *dst, ubyte *src) {
    storeUnaligned(cast(void16*)(dst), loadUnaligned(cast(void16*)(src)));
    storeUnaligned(cast(void16*)(dst + 16), loadUnaligned(cast(void16*)(src + 16)));
    storeUnaligned(cast(void16*)(dst + 32), loadUnaligned(cast(void16*)(src + 32)));
    storeUnaligned(cast(void16*)(dst + 48), loadUnaligned(cast(void16*)(src + 48)));
    storeUnaligned(cast(void16*)(dst + 64), loadUnaligned(cast(void16*)(src + 64)));
    storeUnaligned(cast(void16*)(dst + 80), loadUnaligned(cast(void16*)(src + 80)));
    storeUnaligned(cast(void16*)(dst + 96), loadUnaligned(cast(void16*)(src + 96)));
    storeUnaligned(cast(void16*)(dst + 112), loadUnaligned(cast(void16*)(src + 112)));
}
pragma(inline, false)
void rte_mov256(ubyte *dst, ubyte *src) {
    rte_mov128(dst, src);
    rte_mov128(dst + 128, src + 128);
}
void *rte_memcpy_func(void *dst, void *src, size_t n) {
    void *ret = dst;

    /* We can't copy < 16 bytes using XMM registers so do it manually. */
    if (n < 16) {
        if (n & 0x01) {
            *cast(ubyte *)dst = *cast(ubyte *)src;
            dst = cast(ubyte *)dst + 1;
            src = cast(ubyte *)src + 1;
        }
        if (n & 0x02) {
            *cast(ushort *)dst = *cast(ushort *)src;
            dst = cast(ushort *)dst + 1;
            src = cast(ushort *)src + 1;
        }
        if (n & 0x04) {
            *cast(uint *)dst = *cast(uint *)src;
            dst = cast(uint *)dst + 1;
            src = cast(uint *)src + 1;
        }
        if (n & 0x08) {
            *cast(ulong *)dst = *cast(ulong *)src;
        }
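        /* Example: n == 7 takes the 1-, 2-, and 4-byte branches above,
         * copying all 7 bytes. */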
        return ret;
    }

    /* Special fast cases for <= 128 bytes */
    if (n <= 32) {
        rte_mov16(cast(ubyte *)dst, cast(ubyte *)src);
        rte_mov16(cast(ubyte *)dst - 16 + n, cast(ubyte *)src - 16 + n);
        return ret;
    }
    if (n <= 64) {
        rte_mov32(cast(ubyte *)dst, cast(ubyte *)src);
        rte_mov32(cast(ubyte *)dst - 32 + n, cast(ubyte *)src - 32 + n);
        return ret;
    }
    if (n <= 128) {
        rte_mov64(cast(ubyte *)dst, cast(ubyte *)src);
        rte_mov64(cast(ubyte *)dst - 64 + n, cast(ubyte *)src - 64 + n);
        return ret;
    }

    /*
     * For large copies > 128 bytes. This combination of 256, 64 and 16 byte
     * copies was found to be faster than doing 128 and 32 byte copies as
     * well.
     */
    for ( ; n >= 256; n -= 256) {
        rte_mov256(cast(ubyte *)dst, cast(ubyte *)src);
        dst = cast(ubyte *)dst + 256;
        src = cast(ubyte *)src + 256;
    }

    /*
     * We split the remaining bytes (which will be less than 256) into
     * 64-byte (2^6) chunks.
     * Using incrementing integers in the case labels of a switch statement
     * encourages the compiler to use a jump table. To get incrementing
     * integers, we shift the 2 relevant bits to the LSB position to first
     * get decrementing integers, and then subtract.
     */
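    /* Example: n == 200 gives n >> 6 == 3, so the switch below enters at
     * case 0x00 and all three 64-byte copies run, leaving n == 8. */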
    switch (3 - (n >> 6)) {
    case 0x00:
        rte_mov64(cast(ubyte *)dst, cast(ubyte *)src);
        n -= 64;
        dst = cast(ubyte *)dst + 64;
        src = cast(ubyte *)src + 64;
        goto case;    /* explicit fall-through to the next 64-byte copy */
    case 0x01:
        rte_mov64(cast(ubyte *)dst, cast(ubyte *)src);
        n -= 64;
        dst = cast(ubyte *)dst + 64;
        src = cast(ubyte *)src + 64;
        goto case;    /* explicit fall-through to the next 64-byte copy */
    case 0x02:
        rte_mov64(cast(ubyte *)dst, cast(ubyte *)src);
        n -= 64;
        dst = cast(ubyte *)dst + 64;
        src = cast(ubyte *)src + 64;
        goto default; /* D requires explicit fall-through */
    default: break;
    }

    /*
     * We split the remaining bytes (which will be less than 64) into
     * 16-byte (2^4) chunks, using the same switch structure as above.
     */
    switch (3 - (n >> 4)) {
    case 0x00:
        rte_mov16(cast(ubyte *)dst, cast(ubyte *)src);
        n -= 16;
        dst = cast(ubyte *)dst + 16;
        src = cast(ubyte *)src + 16;
        goto case;    /* explicit fall-through to the next 16-byte copy */
    case 0x01:
        rte_mov16(cast(ubyte *)dst, cast(ubyte *)src);
        n -= 16;
        dst = cast(ubyte *)dst + 16;
        src = cast(ubyte *)src + 16;
        goto case;    /* explicit fall-through to the next 16-byte copy */
    case 0x02:
        rte_mov16(cast(ubyte *)dst, cast(ubyte *)src);
        n -= 16;
        dst = cast(ubyte *)dst + 16;
        src = cast(ubyte *)src + 16;
        goto default; /* D requires explicit fall-through */
    default: break;
    }

    /*
     * Copy any remaining bytes, without going beyond end of buffers.
     * The 16-byte tail copy is aligned to the end of the buffer, so it may
     * re-copy a few bytes already written above; that is harmless.
     */
    if (n != 0) {
        rte_mov16(cast(ubyte *)dst - 16 + n, cast(ubyte *)src - 16 + n);
    }

    return ret;
}
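
A quick way to exercise the port (not part of the original gist, just an illustrative sketch) is a unittest that copies buffers of assorted sizes through rte_memcpy_func and compares the result byte for byte:

/* Illustrative unittest (an addition, not from the original code): verify
 * rte_memcpy_func against a plain element-wise fill for sizes that hit
 * every branch above (sub-16, <=32/64/128, and the large-copy path). */
unittest {
    foreach (n; [1, 7, 15, 16, 31, 33, 63, 64, 100, 128, 200, 256, 1000]) {
        auto src = new ubyte[n];
        foreach (i, ref b; src)
            b = cast(ubyte)(i & 0xFF);
        auto dst = new ubyte[n];
        rte_memcpy_func(dst.ptr, src.ptr, n);
        assert(dst == src);
    }
}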