Created
June 14, 2019 09:27
-
-
Save JinShil/f0948d381a5df40958a47dc2e4cde80a to your computer and use it in GitHub Desktop.
memcpy implemented in D
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/*
Boost Software License - Version 1.0 - August 17th, 2003

Permission is hereby granted, free of charge, to any person or organization
obtaining a copy of the software and accompanying documentation covered by
this license (the "Software") to use, reproduce, display, distribute,
execute, and transmit the Software, and to prepare derivative works of the
Software, and to permit third-parties to whom the Software is furnished to
do so, all subject to the following:

The copyright notices in the Software and this entire statement, including
the above license grant, this restriction and the following disclaimer,
must be included in all copies of the Software, in whole or in part, and
all derivative works of the Software, unless such copies or derivative
works are solely in the form of machine-executable object code generated by
a source language processor.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE.
*/
import std.datetime.stopwatch; | |
import core.stdc.string; | |
import S_struct; | |
import std.random; | |
import std.traits; | |
import std.stdio; | |
/// Returns true iff `x` is a positive power of two (exactly one bit set).
/// Zero (no bits set) is not a power of two.
bool isPowerOf2(T)(T x)
if (isIntegral!T)
{
    if (x == 0)
        return false;
    // A power of two has a single set bit; clearing the lowest set bit
    // (x & (x - 1)) therefore yields zero exactly for powers of two.
    return (x & (x - 1)) == 0;
}
/// One step of a decomposed block copy: copy `size` bytes starting at
/// byte `offset` within the block.
struct Increment
{
    size_t size;   // number of bytes to copy in this step
    size_t offset; // byte offset from the start of the block
}
/// Input range that decomposes a copy of `n` bytes into power-of-two
/// sized steps (`Increment`s). The final step may be widened and shifted
/// backwards so that it overlaps earlier steps: a few bytes get copied
/// twice, but every step uses a wide (fast) move.
struct IncrementRange
{
    Increment i;          // current step (exposed via front)
    size_t remainingSize; // bytes not yet scheduled
    size_t blockSize;     // total size of the block being decomposed

    this(size_t n)
    {
        blockSize = n;
        remainingSize = n;
        popFront(); // prime the first step
    }

    /// Exhausted once no step is pending (size_t is unsigned, so
    /// `== 0` is equivalent to the original `<= 0`).
    @property bool empty()
    {
        return i.size == 0;
    }

    /// The pending copy step.
    @property Increment front()
    {
        return i;
    }

    /// Compute the next step.
    void popFront()
    {
        import core.bitop: bsr;

        // NOTE: This algorithm is naive. It should be improved.
        i.offset += i.size;

        if (remainingSize == 0)
        {
            i.size = 0; // marks the range empty
            return;
        }

        // Largest power of two not exceeding the remaining byte count.
        i.size = 1LU << bsr(remainingSize);

        // Overlapping-tail optimization. Example: for size 7,
        // instead of copying
        //   4 bytes at offset 0, 2 bytes at offset 4, 1 byte at offset 6
        // copy
        //   4 bytes at offset 0, 4 bytes at offset 3
        // Some bytes are copied twice, but it's faster.
        immutable bool isTailStep = remainingSize < blockSize;
        if (isTailStep && blockSize > 4 && i.size < 4)
        {
            i.size = 4;
            i.offset = blockSize - 4;
            remainingSize = 0;
        }
        else if (isTailStep && blockSize > 8 && i.size < 8)
        {
            i.size = 8;
            i.offset = blockSize - 8;
            remainingSize = 0;
        }
        else if (isTailStep && blockSize > 16 && i.size < 16)
        {
            i.size = 16;
            i.offset = blockSize - 16;
            remainingSize = 0;
        }
        else
        {
            remainingSize -= i.size;
        }
    }
}
/// Baseline implementation: delegate to the C runtime's memcpy,
/// copying exactly T.sizeof bytes from src to dst.
void Cmemcpy(T)(T *dst, const T *src)
{
    pragma(inline, true)
    memcpy(cast(void*) dst, cast(const(void)*) src, T.sizeof);
}
/// Copy a single scalar value. A plain assignment lets the compiler
/// choose the optimal move instruction for the type's size.
void Dmemcpy(T)(T *dst, const T *src)
if (isScalarType!T)
{
    pragma(inline, true)
    dst[0] = src[0];
}
/// Copy a struct of T.sizeof bytes from src to dst, selecting a copy
/// strategy at compile time based on the size.
void Dmemcpy(T)(T *dst, const T *src)
if (is(T == struct))
{
    // 1/2/4/8-byte structs: reinterpret as the same-size integer and
    // defer to the scalar overload (compiles to a single move).
    static if (T.sizeof == 1)
    {
        pragma(inline, true)
        Dmemcpy(cast(ubyte*)(dst), cast(const ubyte*)(src));
        return;
    }
    else static if (T.sizeof == 2)
    {
        pragma(inline, true)
        Dmemcpy(cast(ushort*)(dst), cast(const ushort*)(src));
        return;
    }
    else static if (T.sizeof == 4)
    {
        pragma(inline, true)
        Dmemcpy(cast(uint*)(dst), cast(const uint*)(src));
        return;
    }
    else static if (T.sizeof == 8)
    {
        pragma(inline, true)
        Dmemcpy(cast(ulong*)(dst), cast(const ulong*)(src));
        return;
    }
    // 16 bytes: one unaligned 16-byte SIMD load/store pair.
    else static if (T.sizeof == 16)
    {
        pragma(inline, true)
        import core.simd: void16, storeUnaligned, loadUnaligned;
        storeUnaligned(cast(void16*)(dst), loadUnaligned(cast(const void16*)(src)));
        return;
    }
    // 32/48/64 bytes: straight-line sequence of 16-byte moves,
    // unrolled at compile time.
    else static if (T.sizeof <= 64 && (T.sizeof % 16) == 0)
    {
        pragma(inline, true)
        import core.simd: void16, storeUnaligned, loadUnaligned;
        static foreach(i; 0 .. T.sizeof/16)
        {
            storeUnaligned(cast(void16*)(dst) + i, loadUnaligned(cast(const void16*)(src) + i));
        }
        return;
    }
    // Odd sizes below 88: decompose into power-of-two sub-copies
    // (possibly overlapping at the tail) via IncrementRange; each step
    // re-enters this template through the S!(n) wrapper struct.
    // Could not get this to inline for greater than 88
    else static if (T.sizeof < 88 && !isPowerOf2(T.sizeof))
    {
        // pragma(inline, true) // DMD BUG! causes compiler to crash
        static foreach(i; IncrementRange(T.sizeof))
        {
            pragma(inline, true)
            Dmemcpy(
                cast(S!(i.size)*)(cast(ubyte*)dst + i.offset),
                cast(const S!(i.size)*)(cast(const ubyte*)src + i.offset));
        }
        return;
    }
    // Exactly 128 bytes: eight 16-byte moves plus prefetch hints.
    else static if (T.sizeof == 128)
    {
        // pragma(inline, true)
        import core.simd;
        // NOTE(review): `src` is a T* with T.sizeof == 128, so this
        // pointer arithmetic is scaled by 128 — these prefetch roughly
        // 53 KB and 76 KB ahead, not 0x1a0/0x260 bytes. The intent was
        // probably byte offsets (cast(const void*)src + 0x1a0) — confirm.
        prefetch!(false, 3)(src+0x1a0);
        prefetch!(false, 3)(src+0x260);
        static foreach(i; 0 .. T.sizeof/16)
        {
            storeUnaligned(cast(void16*)(dst) + i, loadUnaligned(cast(const void16*)(src) + i));
        }
    }
    // Same decomposition as the < 88 branch, without the per-step
    // inline pragma.
    // Sizes 209 ~ 256 perform poorly in this block for unknown reasons
    else static if (T.sizeof < 209 && !isPowerOf2(T.sizeof))
    {
        // pragma(inline, true)
        static foreach(i; IncrementRange(T.sizeof))
        {
            Dmemcpy(
                cast(S!(i.size)*)(cast(ubyte*)dst + i.offset),
                cast(const S!(i.size)*)(cast(const ubyte*)src + i.offset));
        }
        return;
    }
    // Everything else (>= 209 bytes): hand-written AVX copy loop.
    // NOTE(review): relies on `dst` still being in RDI and `src` in RSI
    // on entry (SysV AMD64 integer argument registers) — confirm DMD
    // does not spill/move them before a non-naked asm block.
    else
    {
        pragma(inline, false)
        asm pure nothrow @nogc
        {
            mov RDX, T.sizeof;  // RDX = n, bytes remaining
            mov ECX, ESI;       // save `src` (low 32 bits suffice for % 32)
            and ECX, 0x1f;      // mod = src % 32
            je L4;
            // if (mod) -> copy enough bytes to reach 32-byte alignment:
            // copy a full 32 bytes, then advance by only (32 - mod) so
            // the overlap is harmlessly re-copied by the aligned loop.
            vmovdqu YMM0, [RSI];
            vmovdqu [RDI], YMM0;
            // %t0 = 32 - mod
            mov RAX, 0x20;
            sub RAX, RCX;
            //cdqe ;
            // src += %t0
            add RSI, RAX;
            // dst += %t0
            add RDI, RAX;
            // n -= %t0
            sub RDX, RAX;
            align 16;
        L4:
            // Main loop: 128 bytes per iteration via four 32-byte moves.
            // Because of the above, (at least) the loads
            // are 32-byte aligned.
            vmovdqu YMM0, [RSI];
            vmovdqu YMM1, [RSI+0x20];
            vmovdqu YMM2, [RSI+0x40];
            vmovdqu YMM3, [RSI+0x60];
            vmovdqu [RDI], YMM0;
            vmovdqu [RDI+0x20], YMM1;
            vmovdqu [RDI+0x40], YMM2;
            vmovdqu [RDI+0x60], YMM3;
            // src += 128;
            add RSI, 128;
            // dst += 128;
            add RDI, 128;
            // n -= 128;
            sub RDX, 128;
            // if (n >= 128) loop
            cmp RDX, 128;
            jge L4;
        L2:
            // Tail: n is now in [0, 128).
            test RDX, RDX;
            je L3;
            // if (n != 0) -> copy the remaining <= 128 bytes: 64 bytes
            // from the current position, then the last 64 bytes of the
            // buffer (the two copies overlap when n < 128).
            // NOTE(review): when n < 64 here, this first 64-byte copy
            // reads AND writes past the end of the buffers before the
            // final overlapped copy lands at the true end — possible
            // out-of-bounds access. Reachable, e.g. T.sizeof = 209 with
            // src % 32 == 2 leaves n = 51 at this point — verify.
            vmovdqu YMM0, [RSI];
            vmovdqu YMM1, [RSI+0x20];
            vmovdqu [RDI], YMM0;
            vmovdqu [RDI+0x20], YMM1;
            sub RDX, 0x40;
            add RSI, RDX;
            add RDI, RDX;
            vmovdqu YMM0, [RSI];
            vmovdqu YMM1, [RSI+0x20];
            vmovdqu [RDI], YMM0;
            vmovdqu [RDI+0x20], YMM1;
        L3:
            // Clear upper YMM state to avoid AVX->SSE transition
            // penalties in callers.
            vzeroupper;
        }
        return;
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment