@JinShil · Created June 14, 2019
memcpy implemented in D
/*
Boost Software License - Version 1.0 - August 17th, 2003
Permission is hereby granted, free of charge, to any person or organization
obtaining a copy of the software and accompanying documentation covered by
this license (the "Software") to use, reproduce, display, distribute,
execute, and transmit the Software, and to prepare derivative works of the
Software, and to permit third-parties to whom the Software is furnished to
do so, all subject to the following:
The copyright notices in the Software and this entire statement, including
the above license grant, this restriction and the following disclaimer,
must be included in all copies of the Software, in whole or in part, and
all derivative works of the Software, unless such copies or derivative
works are solely in the form of machine-executable object code generated by
a source language processor.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE.
*/
import std.datetime.stopwatch;
import core.stdc.string;
import S_struct;
import std.random;
import std.traits;
import std.stdio;
bool isPowerOf2(T)(T x)
    if (isIntegral!T)
{
    return (x != 0) && ((x & (x - 1)) == 0);
}
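
// A minimal sanity check for isPowerOf2: zero and non-powers of two are
// rejected, exact powers of two are accepted.
unittest
{
    assert(!isPowerOf2(0));
    assert(isPowerOf2(1));
    assert(isPowerOf2(64));
    assert(!isPowerOf2(7));
    assert(!isPowerOf2(96));
}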
struct Increment
{
    size_t size;
    size_t offset;
}

struct IncrementRange
{
    Increment i;
    size_t remainingSize;
    size_t blockSize;

    this(size_t n)
    {
        blockSize = n;
        remainingSize = n;
        popFront();
    }

    @property bool empty()
    {
        return i.size <= 0;
    }

    @property Increment front()
    {
        return i;
    }

    void popFront()
    {
        import core.bitop: bsr;

        // NOTE: This algorithm is naive. It should be improved.
        i.offset += i.size;
        if (remainingSize != 0)
        {
            i.size = 1LU << bsr(remainingSize);
        }
        else
        {
            i.size = 0;
            return;
        }

        // Example: For size 7 (see the unittest after this struct)
        // Instead of copying..
        //   4 bytes at offset 0
        //   2 bytes at offset 4
        //   1 byte  at offset 6
        // copy..
        //   4 bytes at offset 0
        //   4 bytes at offset 3
        // Some bytes are copied twice, but it's faster.
        if (blockSize > 4 && remainingSize < blockSize && i.size < 4)
        {
            i.size = 4;
            i.offset = blockSize - 4;
            remainingSize = 0;
        }
        else if (blockSize > 8 && remainingSize < blockSize && i.size < 8)
        {
            i.size = 8;
            i.offset = blockSize - 8;
            remainingSize = 0;
        }
        else if (blockSize > 16 && remainingSize < blockSize && i.size < 16)
        {
            i.size = 16;
            i.offset = blockSize - 16;
            remainingSize = 0;
        }
        else
        {
            remainingSize -= i.size;
        }
    }
}
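
// For a 7-byte block, IncrementRange yields two overlapping 4-byte copies
// (offsets 0 and 3) rather than 4 + 2 + 1, matching the example in popFront.
unittest
{
    Increment[] steps;
    foreach (inc; IncrementRange(7))
        steps ~= inc;
    assert(steps == [Increment(4, 0), Increment(4, 3)]);
}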
void Cmemcpy(T)(T *dst, const T *src)
{
    pragma(inline, true)
    memcpy(dst, src, T.sizeof);
}

void Dmemcpy(T)(T *dst, const T *src)
    if (isScalarType!T)
{
    pragma(inline, true)
    *dst = *src;
}
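
// The scalar overload is just a dereferencing assignment; a quick check:
unittest
{
    int a = 42;
    int b;
    Dmemcpy(&b, &a);
    assert(b == 42);
}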
void Dmemcpy(T)(T *dst, const T *src)
    if (is(T == struct))
{
    static if (T.sizeof == 1)
    {
        pragma(inline, true)
        Dmemcpy(cast(ubyte*)(dst), cast(const ubyte*)(src));
        return;
    }
    else static if (T.sizeof == 2)
    {
        pragma(inline, true)
        Dmemcpy(cast(ushort*)(dst), cast(const ushort*)(src));
        return;
    }
    else static if (T.sizeof == 4)
    {
        pragma(inline, true)
        Dmemcpy(cast(uint*)(dst), cast(const uint*)(src));
        return;
    }
    else static if (T.sizeof == 8)
    {
        pragma(inline, true)
        Dmemcpy(cast(ulong*)(dst), cast(const ulong*)(src));
        return;
    }
    else static if (T.sizeof == 16)
    {
        pragma(inline, true)
        import core.simd: void16, storeUnaligned, loadUnaligned;
        storeUnaligned(cast(void16*)(dst), loadUnaligned(cast(const void16*)(src)));
        return;
    }
    else static if (T.sizeof <= 64 && (T.sizeof % 16) == 0)
    {
        pragma(inline, true)
        import core.simd: void16, storeUnaligned, loadUnaligned;
        static foreach(i; 0 .. T.sizeof/16)
        {
            storeUnaligned(cast(void16*)(dst) + i, loadUnaligned(cast(const void16*)(src) + i));
        }
        return;
    }
    // Could not get this to inline for sizes greater than 88
    else static if (T.sizeof < 88 && !isPowerOf2(T.sizeof))
    {
        // pragma(inline, true) // DMD BUG! causes compiler to crash
        static foreach(i; IncrementRange(T.sizeof))
        {
            pragma(inline, true)
            Dmemcpy(
                cast(S!(i.size)*)(cast(ubyte*)dst + i.offset),
                cast(const S!(i.size)*)(cast(const ubyte*)src + i.offset));
        }
        return;
    }
    else static if (T.sizeof == 128)
    {
        // pragma(inline, true)
        import core.simd;
        prefetch!(false, 3)(src+0x1a0);
        prefetch!(false, 3)(src+0x260);
        static foreach(i; 0 .. T.sizeof/16)
        {
            storeUnaligned(cast(void16*)(dst) + i, loadUnaligned(cast(const void16*)(src) + i));
        }
    }
    // Sizes 209 ~ 256 perform poorly in this block for unknown reasons
    else static if (T.sizeof < 209 && !isPowerOf2(T.sizeof))
    {
        // pragma(inline, true)
        static foreach(i; IncrementRange(T.sizeof))
        {
            Dmemcpy(
                cast(S!(i.size)*)(cast(ubyte*)dst + i.offset),
                cast(const S!(i.size)*)(cast(const ubyte*)src + i.offset));
        }
        return;
    }
    else
    {
        pragma(inline, false)
        asm pure nothrow @nogc
        {
            mov RDX, T.sizeof;
            mov ECX, ESI;   // save `src`
            and ECX, 0x1f;  // mod = src % 32
            je L4;
            // if (mod) -> copy enough bytes to reach 32-byte alignment
            vmovdqu YMM0, [RSI];
            vmovdqu [RDI], YMM0;
            // %t0 = 32 - mod
            mov RAX, 0x20;
            sub RAX, RCX;
            //cdqe ;
            // src += %t0
            add RSI, RAX;
            // dst += %t0
            add RDI, RAX;
            // n -= %t0
            sub RDX, RAX;
            align 16;
        L4:
            // Because of the above, (at least) the loads
            // are 32-byte aligned.
            vmovdqu YMM0, [RSI];
            vmovdqu YMM1, [RSI+0x20];
            vmovdqu YMM2, [RSI+0x40];
            vmovdqu YMM3, [RSI+0x60];
            vmovdqu [RDI], YMM0;
            vmovdqu [RDI+0x20], YMM1;
            vmovdqu [RDI+0x40], YMM2;
            vmovdqu [RDI+0x60], YMM3;
            // src += 128;
            add RSI, 128;
            // dst += 128;
            add RDI, 128;
            // n -= 128;
            sub RDX, 128;
            // if (n >= 128) loop
            cmp RDX, 128;
            jge L4;
        L2:
            test RDX, RDX;
            je L3;
            // if (n != 0) -> copy the remaining <= 128 bytes
            vmovdqu YMM0, [RSI];
            vmovdqu YMM1, [RSI+0x20];
            vmovdqu [RDI], YMM0;
            vmovdqu [RDI+0x20], YMM1;
            sub RDX, 0x40;
            add RSI, RDX;
            add RDI, RDX;
            vmovdqu YMM0, [RSI];
            vmovdqu YMM1, [RSI+0x20];
            vmovdqu [RDI], YMM0;
            vmovdqu [RDI+0x20], YMM1;
        L3:
            vzeroupper;
        }
        return;
    }
}
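
// Usage sketch: `Pair` is a hypothetical 8-byte POD struct (unrelated to the
// S template from the S_struct module) used only to exercise the
// T.sizeof == 8 branch of Dmemcpy and compare the result with C memcpy.
// The SIMD and inline-asm branches additionally assume an x86-64 target.
private struct Pair { int a; int b; }

unittest
{
    Pair src = Pair(1, 2);
    Pair viaD, viaC;
    Dmemcpy(&viaD, &src);
    Cmemcpy(&viaC, &src);
    assert(viaD == src);
    assert(viaC == src);
}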