Skip to content

Instantly share code, notes, and snippets.

@tannergooding
Created March 10, 2019 17:30
Show Gist options
  • Save tannergooding/91517e4f5a51eb2fcbea731140cc83fe to your computer and use it in GitHub Desktop.
Save tannergooding/91517e4f5a51eb2fcbea731140cc83fe to your computer and use it in GitHub Desktop.
Simple benchmarks
using System;
using System.Diagnostics;
using System.Runtime.CompilerServices;
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.X86;
using BenchmarkDotNet.Attributes;
using nuint = System.UInt64;
namespace ConsoleApp3
{
public unsafe class Memcpy
{
// Number of bytes every benchmark copies. The extra byte makes the length a non-multiple of
// both SIMD block sizes used below (16 and 32), so each vectorized benchmark has a partial tail.
public const uint BufferSize = (1024 * 1024 * 100) + 1; // 100MB + 1 byte
// Both arrays are allocated past BufferSize, rounding the length up to a multiple of 32.
// ValidateNotEqual fills the bytes beyond BufferSize with 0xAA (source) and 0xBB (destination),
// and ValidateEqual checks that those padding bytes are unchanged after a copy.
private byte[] source = new byte[BufferSize + (32 - (BufferSize % 32))];
private byte[] destination = new byte[BufferSize + (32 - (BufferSize % 32))];
/// <summary>
/// Table of 17 rows of 16 bytes each. Row n (0 through 16) has its first n bytes set to 0xFF
/// and the remaining bytes set to 0x00.
/// </summary>
/// <remarks>
/// NOTE(review): no benchmark visible in this file reads this table; presumably it was intended
/// for masked handling of a leading partial block - confirm before relying on it.
/// </remarks>
public static ReadOnlySpan<byte> LeadingAlignmentMask => new byte[]
{
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0xFF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00, 0x00,
0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00,
0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00,
0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00,
0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x00,
0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
};
/// <summary>
/// Table of 17 rows of 16 bytes each. Row n (0 through 16) has its last n bytes set to 0xFF
/// and the preceding bytes set to 0x00.
/// </summary>
/// <remarks>
/// NOTE(review): no benchmark visible in this file reads this table; presumably it was intended
/// for masked handling of a trailing partial block - confirm before relying on it.
/// </remarks>
public static ReadOnlySpan<byte> TrailingAlignmentMask => new byte[]
{
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFF,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFF, 0xFF,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFF, 0xFF, 0xFF,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFF, 0xFF, 0xFF, 0xFF,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
0x00, 0x00, 0x00, 0x00, 0x00, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
0x00, 0x00, 0x00, 0x00, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
0x00, 0x00, 0x00, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
0x00, 0x00, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
0x00, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
};
// Times each copy in DEBUG runs: started at the end of ValidateNotEqual, then stopped,
// printed and reset at the start of ValidateEqual.
private Stopwatch stopwatch = new Stopwatch();
/// <summary>
/// Creates the benchmark instance and fills both buffers with random bytes,
/// source first and then destination, from a single <see cref="Random"/>.
/// </summary>
public Memcpy()
{
    var random = new Random();

    // Order matters for reproducing the original sequence of draws: source, then destination.
    foreach (byte[] buffer in new[] { source, destination })
    {
        random.NextBytes(buffer);
    }
}
/// <summary>
/// Copies <see cref="BufferSize"/> bytes from source to destination with
/// <see cref="Unsafe.CopyBlock(void*, void*, uint)"/>.
/// </summary>
[Benchmark]
public void UnsafeCopyBlock()
{
    ValidateNotEqual();

    // Pin both arrays for the duration of the raw-pointer copy.
    fixed (byte* from = &source[0], to = &destination[0])
    {
        Unsafe.CopyBlock(to, from, BufferSize);
    }

    ValidateEqual();
}
/// <summary>
/// Copies <see cref="BufferSize"/> bytes from source to destination with
/// <see cref="Unsafe.CopyBlockUnaligned(void*, void*, uint)"/>.
/// </summary>
[Benchmark]
public void UnsafeCopyBlockUnaligned()
{
    ValidateNotEqual();

    // Pin both arrays for the duration of the raw-pointer copy.
    fixed (byte* from = &source[0], to = &destination[0])
    {
        Unsafe.CopyBlockUnaligned(to, from, BufferSize);
    }

    ValidateEqual();
}
/// <summary>
/// Copies <see cref="BufferSize"/> bytes from source to destination with
/// <see cref="Buffer.MemoryCopy(void*, void*, ulong, ulong)"/>, passing
/// <see cref="BufferSize"/> as both the destination capacity and the byte count.
/// </summary>
[Benchmark]
public void BufferMemoryCopy()
{
    ValidateNotEqual();

    // Pin both arrays for the duration of the raw-pointer copy.
    fixed (byte* from = &source[0], to = &destination[0])
    {
        // Note the argument order: source first, then destination.
        Buffer.MemoryCopy(from, to, BufferSize, BufferSize);
    }

    ValidateEqual();
}
/// <summary>
/// Baseline: copies <see cref="BufferSize"/> bytes from source to destination
/// one byte at a time through pinned pointers.
/// </summary>
[Benchmark]
public void Naive()
{
    ValidateNotEqual();

    fixed (byte* from = &source[0], to = &destination[0])
    {
        // The index type and loop shape are kept as-is on purpose: this is the code being measured.
        for (int offset = 0; offset < BufferSize; offset++)
        {
            to[offset] = from[offset];
        }
    }

    ValidateEqual();
}
/// <summary>
/// Copies <see cref="BufferSize"/> bytes from source to destination in 16-byte blocks,
/// using aligned loads (<c>Sse2.LoadAlignedVector128</c>) and unaligned stores in the main loop.
/// </summary>
/// <remarks>
/// The source pointer is brought to a 16-byte boundary by one unaligned lead-in block, and the
/// final partial block is handled by one overlapping unaligned block. Both steps rely on the
/// copy length being at least one block, which holds for <see cref="BufferSize"/>.
/// </remarks>
[Benchmark]
public void AlignedLoad128()
{
    const nuint BlockSize = 16;

    ValidateNotEqual();

    fixed (byte* pSource = &source[0])
    fixed (byte* pDestination = &destination[0])
    {
        byte* pSrc = pSource;
        byte* pDst = pDestination;

        nuint length = BufferSize;
        nuint address = (nuint)(pSrc);

        // Bytes up to the next 16-byte boundary of the source. The outer modulo maps the
        // already-aligned case to 0 (it used to yield BlockSize), so the lead-in below is
        // genuinely skipped when the source is aligned.
        nuint misalignment = (BlockSize - (address % BlockSize)) % BlockSize;
        nuint remainder = 0;

        if (misalignment != 0)
        {
            // Copy one full unaligned block, then advance only to the boundary; the
            // overlapping bytes are simply copied again by the aligned loop.
            var block = Sse2.LoadVector128(pSrc);
            Sse2.Store(pDst, block);

            pSrc += misalignment;
            pDst += misalignment;
            length -= misalignment;
        }

        if (length >= BlockSize)
        {
            remainder = length % BlockSize;

            for (byte* pEnd = pDst + (length - remainder); pDst < pEnd; pDst += BlockSize, pSrc += BlockSize)
            {
                var block = Sse2.LoadAlignedVector128(pSrc);
                Sse2.Store(pDst, block);
            }
        }
        else
        {
            remainder = length;
        }

        if (remainder != 0)
        {
            // Step back so the last full block ends exactly at the end of the data;
            // it overlaps bytes that were already copied.
            misalignment = BlockSize - remainder;

            pSrc -= misalignment;
            pDst -= misalignment;

            var block = Sse2.LoadVector128(pSrc);
            Sse2.Store(pDst, block);
        }
    }

    ValidateEqual();
}
/// <summary>
/// Copies <see cref="BufferSize"/> bytes from source to destination in 16-byte blocks,
/// using unaligned loads and aligned stores (<c>Sse2.StoreAligned</c>) in the main loop.
/// </summary>
/// <remarks>
/// The destination pointer is brought to a 16-byte boundary by one unaligned lead-in block, and
/// the final partial block is handled by one overlapping unaligned block. Both steps rely on the
/// copy length being at least one block, which holds for <see cref="BufferSize"/>.
/// </remarks>
[Benchmark]
public void AlignedStore128()
{
    const nuint BlockSize = 16;

    ValidateNotEqual();

    fixed (byte* pSource = &source[0])
    fixed (byte* pDestination = &destination[0])
    {
        byte* pSrc = pSource;
        byte* pDst = pDestination;

        nuint length = BufferSize;
        nuint address = (nuint)(pDst);

        // Bytes up to the next 16-byte boundary of the destination. The outer modulo maps the
        // already-aligned case to 0 (it used to yield BlockSize), so the lead-in below is
        // genuinely skipped when the destination is aligned.
        nuint misalignment = (BlockSize - (address % BlockSize)) % BlockSize;
        nuint remainder = 0;

        if (misalignment != 0)
        {
            // Copy one full unaligned block, then advance only to the boundary; the
            // overlapping bytes are simply copied again by the aligned loop.
            var block = Sse2.LoadVector128(pSrc);
            Sse2.Store(pDst, block);

            pSrc += misalignment;
            pDst += misalignment;
            length -= misalignment;
        }

        if (length >= BlockSize)
        {
            remainder = length % BlockSize;

            for (byte* pEnd = pDst + (length - remainder); pDst < pEnd; pDst += BlockSize, pSrc += BlockSize)
            {
                var block = Sse2.LoadVector128(pSrc);
                Sse2.StoreAligned(pDst, block);
            }
        }
        else
        {
            remainder = length;
        }

        if (remainder != 0)
        {
            // Step back so the last full block ends exactly at the end of the data;
            // it overlaps bytes that were already copied.
            misalignment = BlockSize - remainder;

            pSrc -= misalignment;
            pDst -= misalignment;

            var block = Sse2.LoadVector128(pSrc);
            Sse2.Store(pDst, block);
        }
    }

    ValidateEqual();
}
/// <summary>
/// Copies <see cref="BufferSize"/> bytes from source to destination in 16-byte blocks,
/// using unaligned loads and <c>Sse2.StoreAlignedNonTemporal</c> stores in the main loop.
/// </summary>
/// <remarks>
/// The destination pointer is brought to a 16-byte boundary by one unaligned lead-in block, and
/// the final partial block is handled by one overlapping unaligned block. Both steps rely on the
/// copy length being at least one block, which holds for <see cref="BufferSize"/>.
/// </remarks>
[Benchmark]
public void AlignedStoreNonTemporal128()
{
    const nuint BlockSize = 16;

    ValidateNotEqual();

    fixed (byte* pSource = &source[0])
    fixed (byte* pDestination = &destination[0])
    {
        byte* pSrc = pSource;
        byte* pDst = pDestination;

        nuint length = BufferSize;
        nuint address = (nuint)(pDst);

        // Bytes up to the next 16-byte boundary of the destination. The outer modulo maps the
        // already-aligned case to 0 (it used to yield BlockSize), so the lead-in below is
        // genuinely skipped when the destination is aligned.
        nuint misalignment = (BlockSize - (address % BlockSize)) % BlockSize;
        nuint remainder = 0;

        if (misalignment != 0)
        {
            // Copy one full unaligned block, then advance only to the boundary; the
            // overlapping bytes are simply copied again by the aligned loop.
            var block = Sse2.LoadVector128(pSrc);
            Sse2.Store(pDst, block);

            pSrc += misalignment;
            pDst += misalignment;
            length -= misalignment;
        }

        if (length >= BlockSize)
        {
            remainder = length % BlockSize;

            for (byte* pEnd = pDst + (length - remainder); pDst < pEnd; pDst += BlockSize, pSrc += BlockSize)
            {
                var block = Sse2.LoadVector128(pSrc);
                Sse2.StoreAlignedNonTemporal(pDst, block);
            }
        }
        else
        {
            remainder = length;
        }

        if (remainder != 0)
        {
            // Step back so the last full block ends exactly at the end of the data;
            // it overlaps bytes that were already copied.
            misalignment = BlockSize - remainder;

            pSrc -= misalignment;
            pDst -= misalignment;

            var block = Sse2.LoadVector128(pSrc);
            Sse2.Store(pDst, block);
        }
    }

    ValidateEqual();
}
/// <summary>
/// Copies <see cref="BufferSize"/> bytes from source to destination in 16-byte blocks,
/// using unaligned SSE2 loads and stores throughout, with no alignment lead-in.
/// </summary>
/// <remarks>
/// The final partial block is handled by one overlapping block that ends exactly at the end of
/// the data; this relies on the copy length being at least one block, which holds for
/// <see cref="BufferSize"/>.
/// </remarks>
[Benchmark]
public void Unaligned128()
{
    const nuint BlockSize = 16;

    ValidateNotEqual();

    fixed (byte* pSource = &source[0])
    fixed (byte* pDestination = &destination[0])
    {
        byte* pSrc = pSource;
        byte* pDst = pDestination;

        nuint length = BufferSize;
        nuint remainder = 0;

        if (length >= BlockSize)
        {
            remainder = length % BlockSize;

            for (byte* pEnd = pDst + (length - remainder); pDst < pEnd; pDst += BlockSize, pSrc += BlockSize)
            {
                var block = Sse2.LoadVector128(pSrc);

                // Sse2.Store is the 128-bit store; spelling it through Avx wrongly suggested
                // an AVX requirement and differed from the other 128-bit benchmarks.
                Sse2.Store(pDst, block);
            }
        }
        else
        {
            remainder = length;
        }

        if (remainder != 0)
        {
            // Step back so the last full block ends exactly at the end of the data;
            // it overlaps bytes that were already copied.
            nuint misalignment = BlockSize - remainder;

            pSrc -= misalignment;
            pDst -= misalignment;

            var block = Sse2.LoadVector128(pSrc);
            Sse2.Store(pDst, block);
        }
    }

    ValidateEqual();
}
/// <summary>
/// Copies <see cref="BufferSize"/> bytes from source to destination in 32-byte blocks,
/// using aligned loads (<c>Avx.LoadAlignedVector256</c>) and unaligned stores in the main loop.
/// </summary>
/// <remarks>
/// The source pointer is brought to a 32-byte boundary by one unaligned lead-in block, and the
/// final partial block is handled by one overlapping unaligned block. Both steps rely on the
/// copy length being at least one block, which holds for <see cref="BufferSize"/>.
/// </remarks>
[Benchmark]
public void AlignedLoad256()
{
    const nuint BlockSize = 32;

    ValidateNotEqual();

    fixed (byte* pSource = &source[0])
    fixed (byte* pDestination = &destination[0])
    {
        byte* pSrc = pSource;
        byte* pDst = pDestination;

        nuint length = BufferSize;
        nuint address = (nuint)(pSrc);

        // Bytes up to the next 32-byte boundary of the source. The outer modulo maps the
        // already-aligned case to 0 (it used to yield BlockSize), so the lead-in below is
        // genuinely skipped when the source is aligned.
        nuint misalignment = (BlockSize - (address % BlockSize)) % BlockSize;
        nuint remainder = 0;

        if (misalignment != 0)
        {
            // Copy one full unaligned block, then advance only to the boundary; the
            // overlapping bytes are simply copied again by the aligned loop.
            var block = Avx.LoadVector256(pSrc);
            Avx.Store(pDst, block);

            pSrc += misalignment;
            pDst += misalignment;
            length -= misalignment;
        }

        if (length >= BlockSize)
        {
            remainder = length % BlockSize;

            for (byte* pEnd = pDst + (length - remainder); pDst < pEnd; pDst += BlockSize, pSrc += BlockSize)
            {
                var block = Avx.LoadAlignedVector256(pSrc);
                Avx.Store(pDst, block);
            }
        }
        else
        {
            remainder = length;
        }

        if (remainder != 0)
        {
            // Step back so the last full block ends exactly at the end of the data;
            // it overlaps bytes that were already copied.
            misalignment = BlockSize - remainder;

            pSrc -= misalignment;
            pDst -= misalignment;

            var block = Avx.LoadVector256(pSrc);
            Avx.Store(pDst, block);
        }
    }

    ValidateEqual();
}
/// <summary>
/// Copies <see cref="BufferSize"/> bytes from source to destination in 32-byte blocks,
/// using unaligned loads and aligned stores (<c>Avx.StoreAligned</c>) in the main loop.
/// </summary>
/// <remarks>
/// The destination pointer is brought to a 32-byte boundary by one unaligned lead-in block, and
/// the final partial block is handled by one overlapping unaligned block. Both steps rely on the
/// copy length being at least one block, which holds for <see cref="BufferSize"/>.
/// </remarks>
[Benchmark]
public void AlignedStore256()
{
    const nuint BlockSize = 32;

    ValidateNotEqual();

    fixed (byte* pSource = &source[0])
    fixed (byte* pDestination = &destination[0])
    {
        byte* pSrc = pSource;
        byte* pDst = pDestination;

        nuint length = BufferSize;
        nuint address = (nuint)(pDst);

        // Bytes up to the next 32-byte boundary of the destination. The outer modulo maps the
        // already-aligned case to 0 (it used to yield BlockSize), so the lead-in below is
        // genuinely skipped when the destination is aligned.
        nuint misalignment = (BlockSize - (address % BlockSize)) % BlockSize;
        nuint remainder = 0;

        if (misalignment != 0)
        {
            // Copy one full unaligned block, then advance only to the boundary; the
            // overlapping bytes are simply copied again by the aligned loop.
            var block = Avx.LoadVector256(pSrc);
            Avx.Store(pDst, block);

            pSrc += misalignment;
            pDst += misalignment;
            length -= misalignment;
        }

        if (length >= BlockSize)
        {
            remainder = length % BlockSize;

            for (byte* pEnd = pDst + (length - remainder); pDst < pEnd; pDst += BlockSize, pSrc += BlockSize)
            {
                var block = Avx.LoadVector256(pSrc);
                Avx.StoreAligned(pDst, block);
            }
        }
        else
        {
            remainder = length;
        }

        if (remainder != 0)
        {
            // Step back so the last full block ends exactly at the end of the data;
            // it overlaps bytes that were already copied.
            misalignment = BlockSize - remainder;

            pSrc -= misalignment;
            pDst -= misalignment;

            var block = Avx.LoadVector256(pSrc);
            Avx.Store(pDst, block);
        }
    }

    ValidateEqual();
}
/// <summary>
/// Copies <see cref="BufferSize"/> bytes from source to destination in 32-byte blocks,
/// using unaligned loads and <c>Avx.StoreAlignedNonTemporal</c> stores in the main loop.
/// </summary>
/// <remarks>
/// The destination pointer is brought to a 32-byte boundary by one unaligned lead-in block, and
/// the final partial block is handled by one overlapping unaligned block. Both steps rely on the
/// copy length being at least one block, which holds for <see cref="BufferSize"/>.
/// </remarks>
[Benchmark]
public void AlignedStoreNonTemporal256()
{
    const nuint BlockSize = 32;

    ValidateNotEqual();

    fixed (byte* pSource = &source[0])
    fixed (byte* pDestination = &destination[0])
    {
        byte* pSrc = pSource;
        byte* pDst = pDestination;

        nuint length = BufferSize;
        nuint address = (nuint)(pDst);

        // Bytes up to the next 32-byte boundary of the destination. The outer modulo maps the
        // already-aligned case to 0 (it used to yield BlockSize), so the lead-in below is
        // genuinely skipped when the destination is aligned.
        nuint misalignment = (BlockSize - (address % BlockSize)) % BlockSize;
        nuint remainder = 0;

        if (misalignment != 0)
        {
            // Copy one full unaligned block, then advance only to the boundary; the
            // overlapping bytes are simply copied again by the aligned loop.
            var block = Avx.LoadVector256(pSrc);
            Avx.Store(pDst, block);

            pSrc += misalignment;
            pDst += misalignment;
            length -= misalignment;
        }

        if (length >= BlockSize)
        {
            remainder = length % BlockSize;

            for (byte* pEnd = pDst + (length - remainder); pDst < pEnd; pDst += BlockSize, pSrc += BlockSize)
            {
                var block = Avx.LoadVector256(pSrc);
                Avx.StoreAlignedNonTemporal(pDst, block);
            }
        }
        else
        {
            remainder = length;
        }

        if (remainder != 0)
        {
            // Step back so the last full block ends exactly at the end of the data;
            // it overlaps bytes that were already copied.
            misalignment = BlockSize - remainder;

            pSrc -= misalignment;
            pDst -= misalignment;

            var block = Avx.LoadVector256(pSrc);
            Avx.Store(pDst, block);
        }
    }

    ValidateEqual();
}
/// <summary>
/// Copies <see cref="BufferSize"/> bytes from source to destination in 32-byte blocks,
/// using unaligned AVX loads and stores throughout, with no alignment lead-in.
/// </summary>
[Benchmark]
public void Unaligned256()
{
    const nuint BlockSize = 32;

    ValidateNotEqual();

    fixed (byte* pinnedSource = &source[0], pinnedDestination = &destination[0])
    {
        byte* from = pinnedSource;
        byte* to = pinnedDestination;

        nuint total = BufferSize;
        nuint tail;

        if (total < BlockSize)
        {
            // Less than one block: everything is left to the tail step.
            tail = total;
        }
        else
        {
            tail = total % BlockSize;
            byte* stop = to + (total - tail);

            // Whole blocks.
            while (to < stop)
            {
                Avx.Store(to, Avx.LoadVector256(from));

                to += BlockSize;
                from += BlockSize;
            }
        }

        if (tail != 0)
        {
            // One last full block placed so it ends exactly at the end of the data;
            // it overlaps bytes the loop already copied.
            nuint backtrack = BlockSize - tail;
            Avx.Store(to - backtrack, Avx.LoadVector256(from - backtrack));
        }
    }

    ValidateEqual();
}
/// <summary>
/// DEBUG-only pre-copy check: prints the calling benchmark's name, refills both buffers with
/// random data, writes the padding markers (0xAA after the source payload, 0xBB after the
/// destination payload), verifies the two payloads differ, and starts the stopwatch.
/// </summary>
/// <param name="callerMemberName">Name of the calling member, supplied by the compiler.</param>
/// <exception cref="Exception">The first <see cref="BufferSize"/> bytes of both buffers are identical.</exception>
[Conditional("DEBUG")]
public void ValidateNotEqual([CallerMemberName] string callerMemberName = "")
{
    Console.Write(callerMemberName);

    const int Payload = (int)BufferSize;
    var random = new Random();

    // Randomize everything, then overwrite the bytes past the payload with the marker value.
    random.NextBytes(source);
    source.AsSpan(Payload).Fill(0xAA);

    random.NextBytes(destination);
    destination.AsSpan(Payload).Fill(0xBB);

    ReadOnlySpan<byte> expected = source.AsSpan(0, Payload);
    ReadOnlySpan<byte> actual = destination.AsSpan(0, Payload);

    if (expected.SequenceEqual(actual))
    {
        throw new Exception("Data should not be equal");
    }

    // Timing begins only after the buffers are known to differ.
    stopwatch.Start();
}
/// <summary>
/// DEBUG-only post-copy check: stops the stopwatch, prints and resets the elapsed time, then
/// verifies that the payloads match and that the padding bytes after each payload still hold
/// their markers (0xAA for source, 0xBB for destination).
/// </summary>
/// <exception cref="Exception">A payload byte differs or a padding byte was overwritten.</exception>
[Conditional("DEBUG")]
public void ValidateEqual()
{
    stopwatch.Stop();
    Console.WriteLine($" | {stopwatch.ElapsedMilliseconds} ms");
    stopwatch.Reset();

    const int Payload = (int)BufferSize;

    ReadOnlySpan<byte> expected = source.AsSpan(0, Payload);
    ReadOnlySpan<byte> actual = destination.AsSpan(0, Payload);

    if (!expected.SequenceEqual(actual))
    {
        throw new Exception("Data should be equal");
    }

    // Padding after the source payload must still hold its marker.
    foreach (byte guard in source.AsSpan(Payload))
    {
        if (guard != 0xAA)
        {
            throw new Exception("Data should be 0xAA");
        }
    }

    // Padding after the destination payload must not have been overwritten by the copy.
    foreach (byte guard in destination.AsSpan(Payload))
    {
        if (guard != 0xBB)
        {
            throw new Exception("Data should be 0xBB");
        }
    }
}
}
}
using BenchmarkDotNet.Running;
namespace ConsoleApp3
{
/// <summary>
/// Entry point. DEBUG builds call every <see cref="Memcpy"/> benchmark method once, in order;
/// other builds hand the class to BenchmarkDotNet.
/// </summary>
class Program
{
    static void Main(string[] args)
    {
#if DEBUG
        // One pass over every copy strategy on a single instance.
        var benchmarks = new Memcpy();

        benchmarks.UnsafeCopyBlock();
        benchmarks.UnsafeCopyBlockUnaligned();
        benchmarks.BufferMemoryCopy();
        benchmarks.Naive();

        benchmarks.AlignedLoad128();
        benchmarks.AlignedStore128();
        benchmarks.AlignedStoreNonTemporal128();
        benchmarks.Unaligned128();

        benchmarks.AlignedLoad256();
        benchmarks.AlignedStore256();
        benchmarks.AlignedStoreNonTemporal256();
        benchmarks.Unaligned256();
#else
        // The returned summary is not used.
        BenchmarkRunner.Run<Memcpy>();
#endif
    }
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment