Created
June 14, 2019 09:27
-
-
Save JinShil/f0948d381a5df40958a47dc2e4cde80a to your computer and use it in GitHub Desktop.
memcpy implemented in D
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/*
Boost Software License - Version 1.0 - August 17th, 2003

Permission is hereby granted, free of charge, to any person or organization
obtaining a copy of the software and accompanying documentation covered by
this license (the "Software") to use, reproduce, display, distribute,
execute, and transmit the Software, and to prepare derivative works of the
Software, and to permit third-parties to whom the Software is furnished to
do so, all subject to the following:

The copyright notices in the Software and this entire statement, including
the above license grant, this restriction and the following disclaimer,
must be included in all copies of the Software, in whole or in part, and
all derivative works of the Software, unless such copies or derivative
works are solely in the form of machine-executable object code generated by
a source language processor.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE.
*/
import std.datetime.stopwatch; | |
import core.stdc.string; | |
import S_struct; | |
import std.random; | |
import std.traits; | |
import std.stdio; | |
/// Returns true iff `x` is a positive power of two (exactly one bit set).
/// Zero (no bits set) is not a power of two.
bool isPowerOf2(T)(T x)
if (isIntegral!T)
{
    if (x == 0)
        return false;
    // A power of two has a single set bit; clearing the lowest set bit
    // (x & (x - 1)) therefore yields zero exactly for powers of two.
    return (x & (x - 1)) == 0;
}
/// One step of a decomposed block copy: copy `size` bytes starting at
/// byte `offset` within the block.
struct Increment
{
    size_t size;   // number of bytes to copy in this step
    size_t offset; // byte offset from the start of the block
}
/// Input range that decomposes a copy of `n` bytes into power-of-two
/// sized steps (`Increment`s). The final step may be widened and shifted
/// backwards so that it overlaps earlier steps: a few bytes get copied
/// twice, but every step uses a wide (fast) move.
struct IncrementRange
{
    Increment i;          // current step (exposed via front)
    size_t remainingSize; // bytes not yet scheduled
    size_t blockSize;     // total size of the block being decomposed

    this(size_t n)
    {
        blockSize = n;
        remainingSize = n;
        popFront(); // prime the first step
    }

    /// Exhausted once no step is pending (size_t is unsigned, so
    /// `== 0` is equivalent to the original `<= 0`).
    @property bool empty()
    {
        return i.size == 0;
    }

    /// The pending copy step.
    @property Increment front()
    {
        return i;
    }

    /// Compute the next step.
    void popFront()
    {
        import core.bitop: bsr;

        // NOTE: This algorithm is naive. It should be improved.
        i.offset += i.size;

        if (remainingSize == 0)
        {
            i.size = 0; // marks the range empty
            return;
        }

        // Largest power of two not exceeding the remaining byte count.
        i.size = 1LU << bsr(remainingSize);

        // Overlapping-tail optimization. Example: for size 7,
        // instead of copying
        //   4 bytes at offset 0, 2 bytes at offset 4, 1 byte at offset 6
        // copy
        //   4 bytes at offset 0, 4 bytes at offset 3
        // Some bytes are copied twice, but it's faster.
        immutable bool isTailStep = remainingSize < blockSize;
        if (isTailStep && blockSize > 4 && i.size < 4)
        {
            i.size = 4;
            i.offset = blockSize - 4;
            remainingSize = 0;
        }
        else if (isTailStep && blockSize > 8 && i.size < 8)
        {
            i.size = 8;
            i.offset = blockSize - 8;
            remainingSize = 0;
        }
        else if (isTailStep && blockSize > 16 && i.size < 16)
        {
            i.size = 16;
            i.offset = blockSize - 16;
            remainingSize = 0;
        }
        else
        {
            remainingSize -= i.size;
        }
    }
}
/// Baseline implementation: delegate to the C runtime's memcpy,
/// copying exactly T.sizeof bytes from src to dst.
void Cmemcpy(T)(T *dst, const T *src)
{
    pragma(inline, true)
    memcpy(cast(void*) dst, cast(const(void)*) src, T.sizeof);
}
/// Copy a single scalar value. A plain assignment lets the compiler
/// choose the optimal move instruction for the type's size.
void Dmemcpy(T)(T *dst, const T *src)
if (isScalarType!T)
{
    pragma(inline, true)
    dst[0] = src[0];
}
/// Copy a struct of T.sizeof bytes from src to dst, selecting a copy
/// strategy at compile time based on the size.
void Dmemcpy(T)(T *dst, const T *src)
if (is(T == struct))
{
    // 1/2/4/8-byte structs: reinterpret as the same-size integer and
    // defer to the scalar overload (compiles to a single move).
    static if (T.sizeof == 1)
    {
        pragma(inline, true)
        Dmemcpy(cast(ubyte*)(dst), cast(const ubyte*)(src));
        return;
    }
    else static if (T.sizeof == 2)
    {
        pragma(inline, true)
        Dmemcpy(cast(ushort*)(dst), cast(const ushort*)(src));
        return;
    }
    else static if (T.sizeof == 4)
    {
        pragma(inline, true)
        Dmemcpy(cast(uint*)(dst), cast(const uint*)(src));
        return;
    }
    else static if (T.sizeof == 8)
    {
        pragma(inline, true)
        Dmemcpy(cast(ulong*)(dst), cast(const ulong*)(src));
        return;
    }
    // 16 bytes: one unaligned 16-byte SIMD load/store pair.
    else static if (T.sizeof == 16)
    {
        pragma(inline, true)
        import core.simd: void16, storeUnaligned, loadUnaligned;
        storeUnaligned(cast(void16*)(dst), loadUnaligned(cast(const void16*)(src)));
        return;
    }
    // 32/48/64 bytes: straight-line sequence of 16-byte moves,
    // unrolled at compile time.
    else static if (T.sizeof <= 64 && (T.sizeof % 16) == 0)
    {
        pragma(inline, true)
        import core.simd: void16, storeUnaligned, loadUnaligned;
        static foreach(i; 0 .. T.sizeof/16)
        {
            storeUnaligned(cast(void16*)(dst) + i, loadUnaligned(cast(const void16*)(src) + i));
        }
        return;
    }
    // Odd sizes below 88: decompose into power-of-two sub-copies
    // (possibly overlapping at the tail) via IncrementRange; each step
    // re-enters this template through the S!(n) wrapper struct.
    // Could not get this to inline for greater than 88
    else static if (T.sizeof < 88 && !isPowerOf2(T.sizeof))
    {
        // pragma(inline, true) // DMD BUG! causes compiler to crash
        static foreach(i; IncrementRange(T.sizeof))
        {
            pragma(inline, true)
            Dmemcpy(
                cast(S!(i.size)*)(cast(ubyte*)dst + i.offset),
                cast(const S!(i.size)*)(cast(const ubyte*)src + i.offset));
        }
        return;
    }
    // Exactly 128 bytes: eight 16-byte moves plus prefetch hints.
    else static if (T.sizeof == 128)
    {
        // pragma(inline, true)
        import core.simd;
        // NOTE(review): `src` is a T* with T.sizeof == 128, so this
        // pointer arithmetic is scaled by 128 — these prefetch roughly
        // 53 KB and 76 KB ahead, not 0x1a0/0x260 bytes. The intent was
        // probably byte offsets (cast(const void*)src + 0x1a0) — confirm.
        prefetch!(false, 3)(src+0x1a0);
        prefetch!(false, 3)(src+0x260);
        static foreach(i; 0 .. T.sizeof/16)
        {
            storeUnaligned(cast(void16*)(dst) + i, loadUnaligned(cast(const void16*)(src) + i));
        }
    }
    // Same decomposition as the < 88 branch, without the per-step
    // inline pragma.
    // Sizes 209 ~ 256 perform poorly in this block for unknown reasons
    else static if (T.sizeof < 209 && !isPowerOf2(T.sizeof))
    {
        // pragma(inline, true)
        static foreach(i; IncrementRange(T.sizeof))
        {
            Dmemcpy(
                cast(S!(i.size)*)(cast(ubyte*)dst + i.offset),
                cast(const S!(i.size)*)(cast(const ubyte*)src + i.offset));
        }
        return;
    }
    // Everything else (>= 209 bytes): hand-written AVX copy loop.
    // NOTE(review): relies on `dst` still being in RDI and `src` in RSI
    // on entry (SysV AMD64 integer argument registers) — confirm DMD
    // does not spill/move them before a non-naked asm block.
    else
    {
        pragma(inline, false)
        asm pure nothrow @nogc
        {
            mov RDX, T.sizeof;  // RDX = n, bytes remaining
            mov ECX, ESI;       // save `src` (low 32 bits suffice for % 32)
            and ECX, 0x1f;      // mod = src % 32
            je L4;
            // if (mod) -> copy enough bytes to reach 32-byte alignment:
            // copy a full 32 bytes, then advance by only (32 - mod) so
            // the overlap is harmlessly re-copied by the aligned loop.
            vmovdqu YMM0, [RSI];
            vmovdqu [RDI], YMM0;
            // %t0 = 32 - mod
            mov RAX, 0x20;
            sub RAX, RCX;
            //cdqe ;
            // src += %t0
            add RSI, RAX;
            // dst += %t0
            add RDI, RAX;
            // n -= %t0
            sub RDX, RAX;
            align 16;
        L4:
            // Main loop: 128 bytes per iteration via four 32-byte moves.
            // Because of the above, (at least) the loads
            // are 32-byte aligned.
            vmovdqu YMM0, [RSI];
            vmovdqu YMM1, [RSI+0x20];
            vmovdqu YMM2, [RSI+0x40];
            vmovdqu YMM3, [RSI+0x60];
            vmovdqu [RDI], YMM0;
            vmovdqu [RDI+0x20], YMM1;
            vmovdqu [RDI+0x40], YMM2;
            vmovdqu [RDI+0x60], YMM3;
            // src += 128;
            add RSI, 128;
            // dst += 128;
            add RDI, 128;
            // n -= 128;
            sub RDX, 128;
            // if (n >= 128) loop
            cmp RDX, 128;
            jge L4;
        L2:
            // Tail: n is now in [0, 128).
            test RDX, RDX;
            je L3;
            // if (n != 0) -> copy the remaining <= 128 bytes: 64 bytes
            // from the current position, then the last 64 bytes of the
            // buffer (the two copies overlap when n < 128).
            // NOTE(review): when n < 64 here, this first 64-byte copy
            // reads AND writes past the end of the buffers before the
            // final overlapped copy lands at the true end — possible
            // out-of-bounds access. Reachable, e.g. T.sizeof = 209 with
            // src % 32 == 2 leaves n = 51 at this point — verify.
            vmovdqu YMM0, [RSI];
            vmovdqu YMM1, [RSI+0x20];
            vmovdqu [RDI], YMM0;
            vmovdqu [RDI+0x20], YMM1;
            sub RDX, 0x40;
            add RSI, RDX;
            add RDI, RDX;
            vmovdqu YMM0, [RSI];
            vmovdqu YMM1, [RSI+0x20];
            vmovdqu [RDI], YMM0;
            vmovdqu [RDI+0x20], YMM1;
        L3:
            // Clear upper YMM state to avoid AVX->SSE transition
            // penalties in callers.
            vzeroupper;
        }
        return;
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment