Created
March 10, 2019 17:30
-
-
Save tannergooding/91517e4f5a51eb2fcbea731140cc83fe to your computer and use it in GitHub Desktop.
Simple benchmarks
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
using System; | |
using System.Diagnostics; | |
using System.Runtime.CompilerServices; | |
using System.Runtime.Intrinsics; | |
using System.Runtime.Intrinsics.X86; | |
using BenchmarkDotNet.Attributes; | |
using nuint = System.UInt64; | |
namespace ConsoleApp3 | |
{ | |
public unsafe class Memcpy | |
{ | |
/// <summary>Number of bytes each benchmark copies: 100MB + 1 byte.</summary>
public const uint BufferSize = (1024 * 1024 * 100) + 1;

// Allocation size of both arrays: BufferSize plus (32 - BufferSize % 32) extra bytes,
// so the arrays are longer than the copied range. ValidateNotEqual fills the extra
// bytes with sentinel values and ValidateEqual checks they were not overwritten.
private const uint PaddedBufferSize = BufferSize + (32 - (BufferSize % 32));

/// <summary>Bytes that the benchmarks read from.</summary>
private readonly byte[] source = new byte[PaddedBufferSize];

/// <summary>Bytes that the benchmarks write to.</summary>
private readonly byte[] destination = new byte[PaddedBufferSize];
/// <summary>
/// Table of 17 rows of 16 bytes each. Row <c>k</c> (0..16) has its first <c>k</c>
/// bytes set to 0xFF and the remaining bytes set to 0x00.
/// </summary>
/// <remarks>
/// NOTE(review): nothing in the visible code reads this table; presumably it is
/// meant for masking the leading bytes of a 16-byte block - confirm before relying on it.
/// </remarks>
public static ReadOnlySpan<byte> LeadingAlignmentMask
{
    get
    {
        return new byte[]
        {
            0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // row  0
            0xFF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // row  1
            0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // row  2
            0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // row  3
            0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // row  4
            0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // row  5
            0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // row  6
            0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // row  7
            0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // row  8
            0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // row  9
            0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // row 10
            0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00, 0x00, // row 11
            0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00, // row 12
            0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, // row 13
            0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, // row 14
            0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x00, // row 15
            0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, // row 16
        };
    }
}
/// <summary>
/// Table of 17 rows of 16 bytes each. Row <c>k</c> (0..16) has its last <c>k</c>
/// bytes set to 0xFF and the preceding bytes set to 0x00.
/// </summary>
/// <remarks>
/// NOTE(review): nothing in the visible code reads this table; presumably it is
/// meant for masking the trailing bytes of a 16-byte block - confirm before relying on it.
/// </remarks>
public static ReadOnlySpan<byte> TrailingAlignmentMask
{
    get
    {
        return new byte[]
        {
            0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // row  0
            0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFF, // row  1
            0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFF, 0xFF, // row  2
            0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFF, 0xFF, 0xFF, // row  3
            0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFF, 0xFF, 0xFF, 0xFF, // row  4
            0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, // row  5
            0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, // row  6
            0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, // row  7
            0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, // row  8
            0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, // row  9
            0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, // row 10
            0x00, 0x00, 0x00, 0x00, 0x00, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, // row 11
            0x00, 0x00, 0x00, 0x00, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, // row 12
            0x00, 0x00, 0x00, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, // row 13
            0x00, 0x00, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, // row 14
            0x00, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, // row 15
            0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, // row 16
        };
    }
}
/// <summary>
/// Timer used by the DEBUG-only validation helpers: started in ValidateNotEqual,
/// stopped, printed and reset in ValidateEqual.
/// </summary>
private readonly Stopwatch stopwatch = new Stopwatch();
/// <summary>
/// Fills both buffers with random bytes (source first, then destination) drawn
/// from a single <see cref="Random"/> instance.
/// </summary>
public Memcpy()
{
    var random = new Random();

    random.NextBytes(source);
    random.NextBytes(destination);
}
/// <summary>
/// Copies <see cref="BufferSize"/> bytes from source to destination with
/// <see cref="Unsafe.CopyBlock(void*, void*, uint)"/>.
/// </summary>
[Benchmark]
public void UnsafeCopyBlock()
{
    ValidateNotEqual();

    // Pin both arrays for the duration of the copy.
    fixed (byte* pFrom = &source[0], pTo = &destination[0])
    {
        Unsafe.CopyBlock(pTo, pFrom, BufferSize);
    }

    ValidateEqual();
}
/// <summary>
/// Copies <see cref="BufferSize"/> bytes from source to destination with
/// <see cref="Unsafe.CopyBlockUnaligned(void*, void*, uint)"/>.
/// </summary>
[Benchmark]
public void UnsafeCopyBlockUnaligned()
{
    ValidateNotEqual();

    // Pin both arrays for the duration of the copy.
    fixed (byte* pFrom = &source[0], pTo = &destination[0])
    {
        Unsafe.CopyBlockUnaligned(pTo, pFrom, BufferSize);
    }

    ValidateEqual();
}
/// <summary>
/// Copies <see cref="BufferSize"/> bytes from source to destination with
/// <see cref="Buffer.MemoryCopy(void*, void*, ulong, ulong)"/>, passing
/// <see cref="BufferSize"/> as both the destination capacity and the byte count.
/// </summary>
[Benchmark]
public void BufferMemoryCopy()
{
    ValidateNotEqual();

    // Pin both arrays for the duration of the copy.
    fixed (byte* pFrom = &source[0], pTo = &destination[0])
    {
        Buffer.MemoryCopy(pFrom, pTo, BufferSize, BufferSize);
    }

    ValidateEqual();
}
/// <summary>
/// Copies <see cref="BufferSize"/> bytes from source to destination one byte at a
/// time through raw pointers.
/// </summary>
[Benchmark]
public void Naive()
{
    ValidateNotEqual();

    fixed (byte* pFrom = &source[0], pTo = &destination[0])
    {
        int index = 0;

        while (index < BufferSize)
        {
            pTo[index] = pFrom[index];
            index++;
        }
    }

    ValidateEqual();
}
/// <summary>
/// Copies <see cref="BufferSize"/> bytes from source to destination in 16-byte
/// blocks, using aligned loads (source pointer aligned to 16 bytes) and
/// unaligned stores in the main loop.
/// </summary>
/// <remarks>
/// The leading and trailing partial blocks are handled with full-width unaligned
/// copies that overlap the main loop's range, so the copy assumes the buffer is
/// at least one block long (true for <see cref="BufferSize"/>).
/// </remarks>
[Benchmark]
public void AlignedLoad128()
{
    const nuint BlockSize = 16;

    ValidateNotEqual();

    fixed (byte* pSource = &source[0])
    fixed (byte* pDestination = &destination[0])
    {
        byte* pSrc = pSource;
        byte* pDst = pDestination;

        nuint length = BufferSize;
        nuint address = (nuint)(pSrc);

        // Bytes to skip to reach the next BlockSize boundary of the source.
        // The outer modulo yields 0 (instead of BlockSize) when the pointer is
        // already aligned, so the leading block is only taken when needed.
        nuint misalignment = (BlockSize - (address % BlockSize)) % BlockSize;
        nuint remainder = 0;

        if (misalignment != 0)
        {
            // Copy one full unaligned block, then advance only to the boundary;
            // the first aligned block overlaps the bytes copied here.
            var block = Sse2.LoadVector128(pSrc);
            Sse2.Store(pDst, block);

            pSrc += misalignment;
            pDst += misalignment;
            length -= misalignment;
        }

        if (length >= BlockSize)
        {
            remainder = length % BlockSize;

            for (byte* pEnd = pDst + (length - remainder); pDst < pEnd; pDst += BlockSize, pSrc += BlockSize)
            {
                var block = Sse2.LoadAlignedVector128(pSrc);
                Sse2.Store(pDst, block);
            }
        }
        else
        {
            remainder = length;
        }

        if (remainder != 0)
        {
            // Step back so the final block ends exactly on the last byte to copy;
            // it re-copies some bytes that were already written.
            misalignment = BlockSize - remainder;

            pSrc -= misalignment;
            pDst -= misalignment;

            var block = Sse2.LoadVector128(pSrc);
            Sse2.Store(pDst, block);
        }
    }

    ValidateEqual();
}
/// <summary>
/// Copies <see cref="BufferSize"/> bytes from source to destination in 16-byte
/// blocks, using unaligned loads and aligned stores (destination pointer aligned
/// to 16 bytes) in the main loop.
/// </summary>
/// <remarks>
/// The leading and trailing partial blocks are handled with full-width unaligned
/// copies that overlap the main loop's range, so the copy assumes the buffer is
/// at least one block long (true for <see cref="BufferSize"/>).
/// </remarks>
[Benchmark]
public void AlignedStore128()
{
    const nuint BlockSize = 16;

    ValidateNotEqual();

    fixed (byte* pSource = &source[0])
    fixed (byte* pDestination = &destination[0])
    {
        byte* pSrc = pSource;
        byte* pDst = pDestination;

        nuint length = BufferSize;
        nuint address = (nuint)(pDst);

        // Bytes to skip to reach the next BlockSize boundary of the destination.
        // The outer modulo yields 0 (instead of BlockSize) when the pointer is
        // already aligned, so the leading block is only taken when needed.
        nuint misalignment = (BlockSize - (address % BlockSize)) % BlockSize;
        nuint remainder = 0;

        if (misalignment != 0)
        {
            // Copy one full unaligned block, then advance only to the boundary;
            // the first aligned block overlaps the bytes copied here.
            var block = Sse2.LoadVector128(pSrc);
            Sse2.Store(pDst, block);

            pSrc += misalignment;
            pDst += misalignment;
            length -= misalignment;
        }

        if (length >= BlockSize)
        {
            remainder = length % BlockSize;

            for (byte* pEnd = pDst + (length - remainder); pDst < pEnd; pDst += BlockSize, pSrc += BlockSize)
            {
                var block = Sse2.LoadVector128(pSrc);
                Sse2.StoreAligned(pDst, block);
            }
        }
        else
        {
            remainder = length;
        }

        if (remainder != 0)
        {
            // Step back so the final block ends exactly on the last byte to copy;
            // it re-copies some bytes that were already written.
            misalignment = BlockSize - remainder;

            pSrc -= misalignment;
            pDst -= misalignment;

            var block = Sse2.LoadVector128(pSrc);
            Sse2.Store(pDst, block);
        }
    }

    ValidateEqual();
}
/// <summary>
/// Copies <see cref="BufferSize"/> bytes from source to destination in 16-byte
/// blocks, using unaligned loads and aligned non-temporal stores (destination
/// pointer aligned to 16 bytes) in the main loop.
/// </summary>
/// <remarks>
/// The leading and trailing partial blocks are handled with full-width unaligned
/// copies that overlap the main loop's range, so the copy assumes the buffer is
/// at least one block long (true for <see cref="BufferSize"/>).
/// </remarks>
[Benchmark]
public void AlignedStoreNonTemporal128()
{
    const nuint BlockSize = 16;

    ValidateNotEqual();

    fixed (byte* pSource = &source[0])
    fixed (byte* pDestination = &destination[0])
    {
        byte* pSrc = pSource;
        byte* pDst = pDestination;

        nuint length = BufferSize;
        nuint address = (nuint)(pDst);

        // Bytes to skip to reach the next BlockSize boundary of the destination.
        // The outer modulo yields 0 (instead of BlockSize) when the pointer is
        // already aligned, so the leading block is only taken when needed.
        nuint misalignment = (BlockSize - (address % BlockSize)) % BlockSize;
        nuint remainder = 0;

        if (misalignment != 0)
        {
            // Copy one full unaligned block, then advance only to the boundary;
            // the first aligned block overlaps the bytes copied here.
            var block = Sse2.LoadVector128(pSrc);
            Sse2.Store(pDst, block);

            pSrc += misalignment;
            pDst += misalignment;
            length -= misalignment;
        }

        if (length >= BlockSize)
        {
            remainder = length % BlockSize;

            for (byte* pEnd = pDst + (length - remainder); pDst < pEnd; pDst += BlockSize, pSrc += BlockSize)
            {
                var block = Sse2.LoadVector128(pSrc);
                Sse2.StoreAlignedNonTemporal(pDst, block);
            }
        }
        else
        {
            remainder = length;
        }

        if (remainder != 0)
        {
            // Step back so the final block ends exactly on the last byte to copy;
            // it re-copies some bytes that were already written.
            misalignment = BlockSize - remainder;

            pSrc -= misalignment;
            pDst -= misalignment;

            var block = Sse2.LoadVector128(pSrc);
            Sse2.Store(pDst, block);
        }
    }

    ValidateEqual();
}
/// <summary>
/// Copies <see cref="BufferSize"/> bytes from source to destination in 16-byte
/// blocks, using unaligned loads and unaligned stores throughout.
/// </summary>
/// <remarks>
/// A trailing partial block is handled with one full-width copy that ends on the
/// last byte and overlaps the main loop's range, so the copy assumes the buffer
/// is at least one block long (true for <see cref="BufferSize"/>).
/// </remarks>
[Benchmark]
public void Unaligned128()
{
    const nuint BlockSize = 16;

    ValidateNotEqual();

    fixed (byte* pSource = &source[0])
    fixed (byte* pDestination = &destination[0])
    {
        byte* pSrc = pSource;
        byte* pDst = pDestination;

        nuint length = BufferSize;
        nuint remainder = 0;

        if (length >= BlockSize)
        {
            remainder = length % BlockSize;

            for (byte* pEnd = pDst + (length - remainder); pDst < pEnd; pDst += BlockSize, pSrc += BlockSize)
            {
                var block = Sse2.LoadVector128(pSrc);

                // Store through Sse2, matching the Sse2 load and the other
                // 128-bit benchmarks in this class.
                Sse2.Store(pDst, block);
            }
        }
        else
        {
            remainder = length;
        }

        if (remainder != 0)
        {
            // Step back so the final block ends exactly on the last byte to copy;
            // it re-copies some bytes that were already written.
            nuint misalignment = BlockSize - remainder;

            pSrc -= misalignment;
            pDst -= misalignment;

            var block = Sse2.LoadVector128(pSrc);
            Sse2.Store(pDst, block);
        }
    }

    ValidateEqual();
}
/// <summary>
/// Copies <see cref="BufferSize"/> bytes from source to destination in 32-byte
/// blocks, using aligned loads (source pointer aligned to 32 bytes) and
/// unaligned stores in the main loop.
/// </summary>
/// <remarks>
/// The leading and trailing partial blocks are handled with full-width unaligned
/// copies that overlap the main loop's range, so the copy assumes the buffer is
/// at least one block long (true for <see cref="BufferSize"/>).
/// </remarks>
[Benchmark]
public void AlignedLoad256()
{
    const nuint BlockSize = 32;

    ValidateNotEqual();

    fixed (byte* pSource = &source[0])
    fixed (byte* pDestination = &destination[0])
    {
        byte* pSrc = pSource;
        byte* pDst = pDestination;

        nuint length = BufferSize;
        nuint address = (nuint)(pSrc);

        // Bytes to skip to reach the next BlockSize boundary of the source.
        // The outer modulo yields 0 (instead of BlockSize) when the pointer is
        // already aligned, so the leading block is only taken when needed.
        nuint misalignment = (BlockSize - (address % BlockSize)) % BlockSize;
        nuint remainder = 0;

        if (misalignment != 0)
        {
            // Copy one full unaligned block, then advance only to the boundary;
            // the first aligned block overlaps the bytes copied here.
            var block = Avx.LoadVector256(pSrc);
            Avx.Store(pDst, block);

            pSrc += misalignment;
            pDst += misalignment;
            length -= misalignment;
        }

        if (length >= BlockSize)
        {
            remainder = length % BlockSize;

            for (byte* pEnd = pDst + (length - remainder); pDst < pEnd; pDst += BlockSize, pSrc += BlockSize)
            {
                var block = Avx.LoadAlignedVector256(pSrc);
                Avx.Store(pDst, block);
            }
        }
        else
        {
            remainder = length;
        }

        if (remainder != 0)
        {
            // Step back so the final block ends exactly on the last byte to copy;
            // it re-copies some bytes that were already written.
            misalignment = BlockSize - remainder;

            pSrc -= misalignment;
            pDst -= misalignment;

            var block = Avx.LoadVector256(pSrc);
            Avx.Store(pDst, block);
        }
    }

    ValidateEqual();
}
/// <summary>
/// Copies <see cref="BufferSize"/> bytes from source to destination in 32-byte
/// blocks, using unaligned loads and aligned stores (destination pointer aligned
/// to 32 bytes) in the main loop.
/// </summary>
/// <remarks>
/// The leading and trailing partial blocks are handled with full-width unaligned
/// copies that overlap the main loop's range, so the copy assumes the buffer is
/// at least one block long (true for <see cref="BufferSize"/>).
/// </remarks>
[Benchmark]
public void AlignedStore256()
{
    const nuint BlockSize = 32;

    ValidateNotEqual();

    fixed (byte* pSource = &source[0])
    fixed (byte* pDestination = &destination[0])
    {
        byte* pSrc = pSource;
        byte* pDst = pDestination;

        nuint length = BufferSize;
        nuint address = (nuint)(pDst);

        // Bytes to skip to reach the next BlockSize boundary of the destination.
        // The outer modulo yields 0 (instead of BlockSize) when the pointer is
        // already aligned, so the leading block is only taken when needed.
        nuint misalignment = (BlockSize - (address % BlockSize)) % BlockSize;
        nuint remainder = 0;

        if (misalignment != 0)
        {
            // Copy one full unaligned block, then advance only to the boundary;
            // the first aligned block overlaps the bytes copied here.
            var block = Avx.LoadVector256(pSrc);
            Avx.Store(pDst, block);

            pSrc += misalignment;
            pDst += misalignment;
            length -= misalignment;
        }

        if (length >= BlockSize)
        {
            remainder = length % BlockSize;

            for (byte* pEnd = pDst + (length - remainder); pDst < pEnd; pDst += BlockSize, pSrc += BlockSize)
            {
                var block = Avx.LoadVector256(pSrc);
                Avx.StoreAligned(pDst, block);
            }
        }
        else
        {
            remainder = length;
        }

        if (remainder != 0)
        {
            // Step back so the final block ends exactly on the last byte to copy;
            // it re-copies some bytes that were already written.
            misalignment = BlockSize - remainder;

            pSrc -= misalignment;
            pDst -= misalignment;

            var block = Avx.LoadVector256(pSrc);
            Avx.Store(pDst, block);
        }
    }

    ValidateEqual();
}
/// <summary>
/// Copies <see cref="BufferSize"/> bytes from source to destination in 32-byte
/// blocks, using unaligned loads and aligned non-temporal stores (destination
/// pointer aligned to 32 bytes) in the main loop.
/// </summary>
/// <remarks>
/// The leading and trailing partial blocks are handled with full-width unaligned
/// copies that overlap the main loop's range, so the copy assumes the buffer is
/// at least one block long (true for <see cref="BufferSize"/>).
/// </remarks>
[Benchmark]
public void AlignedStoreNonTemporal256()
{
    const nuint BlockSize = 32;

    ValidateNotEqual();

    fixed (byte* pSource = &source[0])
    fixed (byte* pDestination = &destination[0])
    {
        byte* pSrc = pSource;
        byte* pDst = pDestination;

        nuint length = BufferSize;
        nuint address = (nuint)(pDst);

        // Bytes to skip to reach the next BlockSize boundary of the destination.
        // The outer modulo yields 0 (instead of BlockSize) when the pointer is
        // already aligned, so the leading block is only taken when needed.
        nuint misalignment = (BlockSize - (address % BlockSize)) % BlockSize;
        nuint remainder = 0;

        if (misalignment != 0)
        {
            // Copy one full unaligned block, then advance only to the boundary;
            // the first aligned block overlaps the bytes copied here.
            var block = Avx.LoadVector256(pSrc);
            Avx.Store(pDst, block);

            pSrc += misalignment;
            pDst += misalignment;
            length -= misalignment;
        }

        if (length >= BlockSize)
        {
            remainder = length % BlockSize;

            for (byte* pEnd = pDst + (length - remainder); pDst < pEnd; pDst += BlockSize, pSrc += BlockSize)
            {
                var block = Avx.LoadVector256(pSrc);
                Avx.StoreAlignedNonTemporal(pDst, block);
            }
        }
        else
        {
            remainder = length;
        }

        if (remainder != 0)
        {
            // Step back so the final block ends exactly on the last byte to copy;
            // it re-copies some bytes that were already written.
            misalignment = BlockSize - remainder;

            pSrc -= misalignment;
            pDst -= misalignment;

            var block = Avx.LoadVector256(pSrc);
            Avx.Store(pDst, block);
        }
    }

    ValidateEqual();
}
/// <summary>
/// Copies <see cref="BufferSize"/> bytes from source to destination in 32-byte
/// blocks, using unaligned AVX loads and stores throughout.
/// </summary>
/// <remarks>
/// A trailing partial block is handled with one full-width copy that ends on the
/// last byte and overlaps the range already copied by the main loop.
/// </remarks>
[Benchmark]
public void Unaligned256()
{
    const nuint BlockSize = 32;

    ValidateNotEqual();

    fixed (byte* pSource = &source[0], pDestination = &destination[0])
    {
        byte* pFrom = pSource;
        byte* pTo = pDestination;

        nuint total = BufferSize;
        nuint tail;

        if (total < BlockSize)
        {
            // Not even one whole block: everything is tail.
            tail = total;
        }
        else
        {
            tail = total % BlockSize;
            byte* pStop = pTo + (total - tail);

            while (pTo < pStop)
            {
                Avx.Store(pTo, Avx.LoadVector256(pFrom));

                pTo += BlockSize;
                pFrom += BlockSize;
            }
        }

        if (tail != 0)
        {
            // Step back so the final block ends exactly on the last byte to copy.
            nuint overlap = BlockSize - tail;

            pFrom -= overlap;
            pTo -= overlap;

            Avx.Store(pTo, Avx.LoadVector256(pFrom));
        }
    }

    ValidateEqual();
}
/// <summary>
/// DEBUG-only pre-check: prints the caller's name, re-randomizes both buffers,
/// writes sentinel bytes (0xAA for source, 0xBB for destination) into the
/// padding past <see cref="BufferSize"/>, and starts the stopwatch once the two
/// payloads are confirmed to differ.
/// </summary>
/// <param name="callerMemberName">Name of the calling member; supplied by the compiler.</param>
/// <exception cref="Exception">The first <see cref="BufferSize"/> bytes of both buffers are identical.</exception>
[Conditional("DEBUG")]
public void ValidateNotEqual([CallerMemberName] string callerMemberName = "")
{
    const int PayloadLength = (int)BufferSize;

    Console.Write(callerMemberName);

    var random = new Random();

    random.NextBytes(source);
    source.AsSpan(PayloadLength).Fill(0xAA);

    random.NextBytes(destination);
    destination.AsSpan(PayloadLength).Fill(0xBB);

    ReadOnlySpan<byte> sourcePayload = source.AsSpan(0, PayloadLength);
    ReadOnlySpan<byte> destinationPayload = destination.AsSpan(0, PayloadLength);

    if (sourcePayload.SequenceEqual(destinationPayload))
    {
        throw new Exception("Data should not be equal");
    }

    stopwatch.Start();
}
/// <summary>
/// DEBUG-only post-check: stops the stopwatch, prints the elapsed milliseconds,
/// resets it, then verifies that the first <see cref="BufferSize"/> bytes of both
/// buffers match and that the padding sentinels (0xAA in source, 0xBB in
/// destination) are intact.
/// </summary>
/// <exception cref="Exception">The payloads differ, or a padding byte was overwritten.</exception>
[Conditional("DEBUG")]
public void ValidateEqual()
{
    const int PayloadLength = (int)BufferSize;

    stopwatch.Stop();
    Console.WriteLine($" | {stopwatch.ElapsedMilliseconds} ms");
    stopwatch.Reset();

    ReadOnlySpan<byte> sourcePayload = source.AsSpan(0, PayloadLength);
    ReadOnlySpan<byte> destinationPayload = destination.AsSpan(0, PayloadLength);

    if (!sourcePayload.SequenceEqual(destinationPayload))
    {
        throw new Exception("Data should be equal");
    }

    // The padding after the payload must still hold the sentinels written by
    // ValidateNotEqual; anything else means a copy touched bytes past BufferSize.
    foreach (byte padding in source.AsSpan(PayloadLength))
    {
        if (padding != 0xAA)
        {
            throw new Exception("Data should be 0xAA");
        }
    }

    foreach (byte padding in destination.AsSpan(PayloadLength))
    {
        if (padding != 0xBB)
        {
            throw new Exception("Data should be 0xBB");
        }
    }
}
} | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
using BenchmarkDotNet.Running; | |
namespace ConsoleApp3 | |
{ | |
/// <summary>
/// Entry point. DEBUG builds call every copy routine once on a single
/// <see cref="Memcpy"/> instance; other builds hand the class to BenchmarkDotNet.
/// </summary>
class Program
{
    static void Main(string[] args)
    {
#if DEBUG
        var copier = new Memcpy();

        // Baselines.
        copier.UnsafeCopyBlock();
        copier.UnsafeCopyBlockUnaligned();
        copier.BufferMemoryCopy();
        copier.Naive();

        // 128-bit variants.
        copier.AlignedLoad128();
        copier.AlignedStore128();
        copier.AlignedStoreNonTemporal128();
        copier.Unaligned128();

        // 256-bit variants.
        copier.AlignedLoad256();
        copier.AlignedStore256();
        copier.AlignedStoreNonTemporal256();
        copier.Unaligned256();
#else
        _ = BenchmarkRunner.Run<Memcpy>();
#endif
    }
}
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment