JinShil/rte_memcpy.d

## rte_memcpy.d
import core.simd: void16, loadUnaligned, storeUnaligned;

/**
 * Copies 16 bytes from `src` to `dst` with a single unaligned vector
 * load followed by a single unaligned vector store.
 *
 * Params:
 *     dst = first byte to write; 16 bytes are written
 *     src = first byte to read; 16 bytes are read
 */
pragma(inline, true)
void rte_mov16(ubyte *dst, ubyte *src) {
    /* The whole chunk is read before anything is written. */
    auto chunk = loadUnaligned(cast(void16*) src);
    storeUnaligned(cast(void16*) dst, chunk);
}

/**
 * Copies 32 bytes from `src` to `dst` as two consecutive 16-byte
 * unaligned vector moves (low chunk first, then high chunk).
 *
 * Params:
 *     dst = first byte to write; 32 bytes are written
 *     src = first byte to read; 32 bytes are read
 */
pragma(inline, true)
void rte_mov32(ubyte *dst, ubyte *src) {
    /* View both buffers as arrays of 16-byte chunks; +1 steps 16 bytes. */
    auto to = cast(void16*) dst;
    auto from = cast(void16*) src;

    storeUnaligned(to, loadUnaligned(from));
    storeUnaligned(to + 1, loadUnaligned(from + 1));
}

/**
 * Copies 48 bytes from `src` to `dst` as three consecutive 16-byte
 * unaligned vector moves, in ascending address order.
 *
 * Params:
 *     dst = first byte to write; 48 bytes are written
 *     src = first byte to read; 48 bytes are read
 */
pragma(inline, true)
void rte_mov48(ubyte *dst, ubyte *src) {
    /* View both buffers as arrays of 16-byte chunks; +k steps k*16 bytes. */
    auto to = cast(void16*) dst;
    auto from = cast(void16*) src;

    storeUnaligned(to, loadUnaligned(from));
    storeUnaligned(to + 1, loadUnaligned(from + 1));
    storeUnaligned(to + 2, loadUnaligned(from + 2));
}

/**
 * Copies 64 bytes from `src` to `dst` as four consecutive 16-byte
 * unaligned vector moves, in ascending address order.
 *
 * Params:
 *     dst = first byte to write; 64 bytes are written
 *     src = first byte to read; 64 bytes are read
 */
pragma(inline, true)
void rte_mov64(ubyte *dst, ubyte *src) {
    /* View both buffers as arrays of 16-byte chunks; +k steps k*16 bytes. */
    auto to = cast(void16*) dst;
    auto from = cast(void16*) src;

    storeUnaligned(to, loadUnaligned(from));
    storeUnaligned(to + 1, loadUnaligned(from + 1));
    storeUnaligned(to + 2, loadUnaligned(from + 2));
    storeUnaligned(to + 3, loadUnaligned(from + 3));
}

/**
 * Copies 128 bytes from `src` to `dst` as eight consecutive 16-byte
 * unaligned vector moves, in ascending address order.
 *
 * Params:
 *     dst = first byte to write; 128 bytes are written
 *     src = first byte to read; 128 bytes are read
 */
pragma(inline, true)
void rte_mov128(ubyte *dst, ubyte *src) {
    /* View both buffers as arrays of 16-byte chunks; +k steps k*16 bytes. */
    auto to = cast(void16*) dst;
    auto from = cast(void16*) src;

    /* First 64 bytes. */
    storeUnaligned(to, loadUnaligned(from));
    storeUnaligned(to + 1, loadUnaligned(from + 1));
    storeUnaligned(to + 2, loadUnaligned(from + 2));
    storeUnaligned(to + 3, loadUnaligned(from + 3));

    /* Second 64 bytes. */
    storeUnaligned(to + 4, loadUnaligned(from + 4));
    storeUnaligned(to + 5, loadUnaligned(from + 5));
    storeUnaligned(to + 6, loadUnaligned(from + 6));
    storeUnaligned(to + 7, loadUnaligned(from + 7));
}

/**
 * Copies 256 bytes from `src` to `dst` as two back-to-back 128-byte
 * copies (lower half first, then upper half).
 *
 * Params:
 *     dst = first byte to write; 256 bytes are written
 *     src = first byte to read; 256 bytes are read
 */
pragma(inline, false)
void rte_mov256(ubyte *dst, ubyte *src) {
	enum size_t HALF = 128;

	rte_mov128(&dst[0], &src[0]);
	rte_mov128(&dst[HALF], &src[HALF]);
}

/**
 * Copies `n` bytes from `src` to `dst` using unaligned 16-byte vector moves.
 *
 * Strategy by size:
 *   - n < 16:   scalar 1/2/4/8-byte moves selected by the low bits of `n`.
 *   - n <= 128: two moves of 16, 32 or 64 bytes, one anchored at the start
 *               and one anchored at the end of the range; the two moves
 *               overlap in the middle unless `n` is exactly 32, 64 or 128.
 *   - n > 128:  256-byte blocks, then 64-byte blocks, then 16-byte blocks,
 *               then one 16-byte move anchored at the end for the rest.
 *
 * There is no special handling of overlapping `src`/`dst` ranges here.
 *
 * Params:
 *     dst = start of the destination range (`n` bytes are written)
 *     src = start of the source range (`n` bytes are read)
 *     n   = number of bytes to copy
 *
 * Returns: the `dst` pointer that was passed in.
 */
void *rte_memcpy_func(void *dst, void *src, size_t n) {
	/* Byte cursors; `dst` itself stays untouched so it can be returned. */
	ubyte *d = cast(ubyte *)dst;
	ubyte *s = cast(ubyte *)src;

	/* We can't copy < 16 bytes using XMM registers so do it manually. */
	if (n < 16) {
		/* Each set bit of n contributes one move of that many bytes. */
		if (n & 0x01) {
			*d = *s;
			d += 1;
			s += 1;
		}
		if (n & 0x02) {
			*cast(ushort *)d = *cast(ushort *)s;
			d += 2;
			s += 2;
		}
		if (n & 0x04) {
			*cast(uint *)d = *cast(uint *)s;
			d += 4;
			s += 4;
		}
		if (n & 0x08) {
			*cast(ulong *)d = *cast(ulong *)s;
		}
		return dst;
	}

	/*
	 * Special fast cases for <= 128 bytes: one move from the start and one
	 * move that ends exactly at byte n.
	 */
	if (n <= 32) {
		rte_mov16(d, s);
		rte_mov16(d - 16 + n, s - 16 + n);
		return dst;
	}

	if (n <= 64) {
		rte_mov32(d, s);
		rte_mov32(d - 32 + n, s - 32 + n);
		return dst;
	}

	if (n <= 128) {
		rte_mov64(d, s);
		rte_mov64(d - 64 + n, s - 64 + n);
		return dst;
	}

	/*
	 * For large copies > 128 bytes. This combination of 256, 64 and 16 byte
	 * copies was found to be faster than doing 128 and 32 byte copies as
	 * well.
	 */
	for ( ; n >= 256; n -= 256) {
		rte_mov256(d, s);
		d += 256;
		s += 256;
	}

	/*
	 * We split the remaining bytes (which will be less than 256) into
	 * 64byte (2^6) chunks.
	 * Using incrementing integers in the case labels of a switch statement
	 * encourages the compiler to use a jump table. To get incrementing
	 * integers, we shift the 2 relevant bits to the LSB position to first
	 * get decrementing integers, and then subtract.
	 *
	 * D does not allow implicit fall-through between cases, so each case
	 * continues into the next one with an explicit `goto case;`.
	 */
	switch (3 - (n >> 6)) {
	case 0x00:
		rte_mov64(d, s);
		n -= 64;
		d += 64;
		s += 64;
		goto case;
	case 0x01:
		rte_mov64(d, s);
		n -= 64;
		d += 64;
		s += 64;
		goto case;
	case 0x02:
		rte_mov64(d, s);
		n -= 64;
		d += 64;
		s += 64;
		break;
	default:
		break;
	}

	/*
	 * We split the remaining bytes (which will be less than 64) into
	 * 16byte (2^4) chunks, using the same switch structure as above.
	 */
	switch (3 - (n >> 4)) {
	case 0x00:
		rte_mov16(d, s);
		n -= 16;
		d += 16;
		s += 16;
		goto case;
	case 0x01:
		rte_mov16(d, s);
		n -= 16;
		d += 16;
		s += 16;
		goto case;
	case 0x02:
		rte_mov16(d, s);
		n -= 16;
		d += 16;
		s += 16;
		break;
	default:
		break;
	}

	/*
	 * Copy any remaining bytes, without going beyond end of buffers: the
	 * final move ends exactly at the last byte and re-copies some bytes that
	 * were already written (more than 128 bytes precede this point).
	 */
	if (n != 0) {
		rte_mov16(d - 16 + n, s - 16 + n);
	}
	return dst;
}
	import core.simd: void16, loadUnaligned, storeUnaligned;

	/**
	 * Copies 16 bytes from `src` to `dst` with one unaligned vector load and
	 * one unaligned vector store.
	 *
	 * Params:
	 *     dst = first byte to write; 16 bytes are written
	 *     src = first byte to read; 16 bytes are read
	 */
	pragma(inline, true)
	void rte_mov16(ubyte *dst, ubyte *src) {
		storeUnaligned(cast(void16*)(dst), loadUnaligned(cast(void16*)(src)));
	}

	/**
	 * Copies 32 bytes from `src` to `dst` as two 16-byte unaligned vector
	 * moves, in ascending address order.
	 *
	 * Params:
	 *     dst = first byte to write; 32 bytes are written
	 *     src = first byte to read; 32 bytes are read
	 */
	pragma(inline, true)
	void rte_mov32(ubyte *dst, ubyte *src) {
		storeUnaligned(cast(void16*)(dst), loadUnaligned(cast(void16*)(src)));
		storeUnaligned(cast(void16*)(dst + 16), loadUnaligned(cast(void16*)(src + 16)));
	}

	/**
	 * Copies 48 bytes from `src` to `dst` as three 16-byte unaligned vector
	 * moves, in ascending address order.
	 *
	 * Params:
	 *     dst = first byte to write; 48 bytes are written
	 *     src = first byte to read; 48 bytes are read
	 */
	pragma(inline, true)
	void rte_mov48(ubyte *dst, ubyte *src) {
		storeUnaligned(cast(void16*)(dst), loadUnaligned(cast(void16*)(src)));
		storeUnaligned(cast(void16*)(dst + 16), loadUnaligned(cast(void16*)(src + 16)));
		storeUnaligned(cast(void16*)(dst + 32), loadUnaligned(cast(void16*)(src + 32)));
	}

	/**
	 * Copies 64 bytes from `src` to `dst` as four 16-byte unaligned vector
	 * moves, in ascending address order.
	 *
	 * Params:
	 *     dst = first byte to write; 64 bytes are written
	 *     src = first byte to read; 64 bytes are read
	 */
	pragma(inline, true)
	void rte_mov64(ubyte *dst, ubyte *src) {
		storeUnaligned(cast(void16*)(dst), loadUnaligned(cast(void16*)(src)));
		storeUnaligned(cast(void16*)(dst + 16), loadUnaligned(cast(void16*)(src + 16)));
		storeUnaligned(cast(void16*)(dst + 32), loadUnaligned(cast(void16*)(src + 32)));
		storeUnaligned(cast(void16*)(dst + 48), loadUnaligned(cast(void16*)(src + 48)));
	}

	/**
	 * Copies 128 bytes from `src` to `dst` as eight 16-byte unaligned vector
	 * moves, in ascending address order.
	 *
	 * Params:
	 *     dst = first byte to write; 128 bytes are written
	 *     src = first byte to read; 128 bytes are read
	 */
	pragma(inline, true)
	void rte_mov128(ubyte *dst, ubyte *src) {
		storeUnaligned(cast(void16*)(dst), loadUnaligned(cast(void16*)(src)));
		storeUnaligned(cast(void16*)(dst + 16), loadUnaligned(cast(void16*)(src + 16)));
		storeUnaligned(cast(void16*)(dst + 32), loadUnaligned(cast(void16*)(src + 32)));
		storeUnaligned(cast(void16*)(dst + 48), loadUnaligned(cast(void16*)(src + 48)));
		storeUnaligned(cast(void16*)(dst + 64), loadUnaligned(cast(void16*)(src + 64)));
		storeUnaligned(cast(void16*)(dst + 80), loadUnaligned(cast(void16*)(src + 80)));
		storeUnaligned(cast(void16*)(dst + 96), loadUnaligned(cast(void16*)(src + 96)));
		storeUnaligned(cast(void16*)(dst + 112), loadUnaligned(cast(void16*)(src + 112)));
	}

	/**
	 * Copies 256 bytes from `src` to `dst` as two back-to-back 128-byte
	 * copies (lower half first, then upper half).
	 *
	 * Params:
	 *     dst = first byte to write; 256 bytes are written
	 *     src = first byte to read; 256 bytes are read
	 */
	pragma(inline, false)
	void rte_mov256(ubyte *dst, ubyte *src) {
		rte_mov128(dst, src);
		rte_mov128(dst + 128, src + 128);
	}

	/**
	 * Copies `n` bytes from `src` to `dst` using unaligned 16-byte vector moves.
	 *
	 * Strategy by size:
	 *   - n < 16:   scalar 1/2/4/8-byte moves selected by the low bits of `n`.
	 *   - n <= 128: two moves of 16, 32 or 64 bytes, one anchored at the start
	 *               and one anchored at the end of the range; the two moves
	 *               overlap in the middle unless `n` is exactly 32, 64 or 128.
	 *   - n > 128:  256-byte blocks, then 64-byte blocks, then 16-byte blocks,
	 *               then one 16-byte move anchored at the end for the rest.
	 *
	 * There is no special handling of overlapping `src`/`dst` ranges here.
	 *
	 * Params:
	 *     dst = start of the destination range (`n` bytes are written)
	 *     src = start of the source range (`n` bytes are read)
	 *     n   = number of bytes to copy
	 *
	 * Returns: the `dst` pointer that was passed in.
	 */
	void *rte_memcpy_func(void *dst, void *src, size_t n) {
		/* Byte cursors; `dst` itself stays untouched so it can be returned. */
		ubyte *d = cast(ubyte *)dst;
		ubyte *s = cast(ubyte *)src;

		/* We can't copy < 16 bytes using XMM registers so do it manually. */
		if (n < 16) {
			/* Each set bit of n contributes one move of that many bytes. */
			if (n & 0x01) {
				*d = *s;
				d += 1;
				s += 1;
			}
			if (n & 0x02) {
				*cast(ushort *)d = *cast(ushort *)s;
				d += 2;
				s += 2;
			}
			if (n & 0x04) {
				*cast(uint *)d = *cast(uint *)s;
				d += 4;
				s += 4;
			}
			if (n & 0x08) {
				*cast(ulong *)d = *cast(ulong *)s;
			}
			return dst;
		}

		/*
		 * Special fast cases for <= 128 bytes: one move from the start and
		 * one move that ends exactly at byte n.
		 */
		if (n <= 32) {
			rte_mov16(d, s);
			rte_mov16(d - 16 + n, s - 16 + n);
			return dst;
		}

		if (n <= 64) {
			rte_mov32(d, s);
			rte_mov32(d - 32 + n, s - 32 + n);
			return dst;
		}

		if (n <= 128) {
			rte_mov64(d, s);
			rte_mov64(d - 64 + n, s - 64 + n);
			return dst;
		}

		/*
		 * For large copies > 128 bytes. This combination of 256, 64 and 16
		 * byte copies was found to be faster than doing 128 and 32 byte
		 * copies as well.
		 */
		for ( ; n >= 256; n -= 256) {
			rte_mov256(d, s);
			d += 256;
			s += 256;
		}

		/*
		 * We split the remaining bytes (which will be less than 256) into
		 * 64byte (2^6) chunks.
		 * Using incrementing integers in the case labels of a switch
		 * statement encourages the compiler to use a jump table. To get
		 * incrementing integers, we shift the 2 relevant bits to the LSB
		 * position to first get decrementing integers, and then subtract.
		 *
		 * D does not allow implicit fall-through between cases, so each case
		 * continues into the next one with an explicit `goto case;`.
		 */
		switch (3 - (n >> 6)) {
		case 0x00:
			rte_mov64(d, s);
			n -= 64;
			d += 64;
			s += 64;
			goto case;
		case 0x01:
			rte_mov64(d, s);
			n -= 64;
			d += 64;
			s += 64;
			goto case;
		case 0x02:
			rte_mov64(d, s);
			n -= 64;
			d += 64;
			s += 64;
			break;
		default:
			break;
		}

		/*
		 * We split the remaining bytes (which will be less than 64) into
		 * 16byte (2^4) chunks, using the same switch structure as above.
		 */
		switch (3 - (n >> 4)) {
		case 0x00:
			rte_mov16(d, s);
			n -= 16;
			d += 16;
			s += 16;
			goto case;
		case 0x01:
			rte_mov16(d, s);
			n -= 16;
			d += 16;
			s += 16;
			goto case;
		case 0x02:
			rte_mov16(d, s);
			n -= 16;
			d += 16;
			s += 16;
			break;
		default:
			break;
		}

		/*
		 * Copy any remaining bytes, without going beyond end of buffers: the
		 * final move ends exactly at the last byte and re-copies some bytes
		 * that were already written (more than 128 bytes precede this point).
		 */
		if (n != 0) {
			rte_mov16(d - 16 + n, s - 16 + n);
		}
		return dst;
	}