Skip to content

Instantly share code, notes, and snippets.

@antiduh
Last active February 8, 2018 18:50
Show Gist options
  • Save antiduh/aa111df5c6ebb9fd75a5bbc04afce1e3 to your computer and use it in GitHub Desktop.
Save antiduh/aa111df5c6ebb9fd75a5bbc04afce1e3 to your computer and use it in GitHub Desktop.
Fastest byte array merge for reddit, https://www.reddit.com/r/csharp/comments/7w09vc/
using System;
using System.Collections.Generic;
using System.Diagnostics;
using System.Numerics;
using System.Threading;
namespace MergeBenchmark
{
internal static class Program
{
/// <summary>
/// The main entry point for the application.
/// </summary>
[STAThread]
private static void Main()
{
/*
Results:
Naive : 1074.0 ms, 878.7 mbytes/sec
ThreadedNaive_MaxThreads : 262.0 ms, 3602.0 mbytes/sec
ThreadedNaive_NoHyperThreads : 327.0 ms, 2886.0 mbytes/sec
Unrolled : 805.0 ms, 1172.3 mbytes/sec
ThreadedUnrolled_MaxThreads : 263.0 ms, 3588.3 mbytes/sec
ThreadedUnrolled_NoHyperthreads : 322.0 ms, 2930.8 mbytes/sec
Vector : 210.0 ms, 4493.9 mbytes/sec
ThreadedVectorize_MaxThreads : 177.0 ms, 5331.7 mbytes/sec
ThreadedVectorize_NoHyperthreads : 181.0 ms, 5213.9 mbytes/sec
*/
if( Vector.IsHardwareAccelerated == false )
{
throw new InvalidOperationException( "Invalid benchmark" );
}
Process.GetCurrentProcess().PriorityClass = ProcessPriorityClass.High;
int numBytes = 900 * 1024 * 1024;
byte[] data = new byte[ numBytes ];
byte[] mask = new byte[ numBytes ];
byte[] result = new byte[ numBytes ];
FillData( data, mask );
TimeMerger( "Naive", numBytes, () => Naive( data, mask, result ) );
TimeMerger( "ThreadedNaive_MaxThreads", numBytes, () => ThreadedNaive_MaxThreads( data, mask, result ) );
TimeMerger( "ThreadedNaive_NoHyperThreads", numBytes, () => ThreadedNaive_NoHyperThreads( data, mask, result ) );
Console.WriteLine();
TimeMerger( "Unrolled", numBytes, () => Unrolled( data, mask, result ) );
TimeMerger( "ThreadedUnrolled_MaxThreads", numBytes, () => ThreadedUnrolled_MaxThreads( data, mask, result ) );
TimeMerger( "ThreadedUnrolled_NoHyperthreads", numBytes, () => ThreadedUnrolled_NoHyperthreads( data, mask, result ) );
Console.WriteLine();
TimeMerger( "Vector", numBytes, () => Vectorized( data, mask, result ) );
TimeMerger( "ThreadedVectorize_MaxThreads", numBytes, () => ThreadedVectorize_MaxThreads( data, mask, result ) );
TimeMerger( "ThreadedVectorize_NoHyperthreads", numBytes, () => ThreadedVectorize_NoHyperthreads( data, mask, result ) );
Console.WriteLine( "Press any key to continue..." );
Console.ReadLine();
}
private static void FillData( byte[] data, byte[] mask )
{
for( int i = 0; i < mask.Length; i += 10 )
{
data[i + 0] = (byte)( i + 0 );
data[i + 1] = (byte)( i + 1 );
data[i + 2] = (byte)( i + 2 );
data[i + 3] = (byte)( i + 3 );
data[i + 4] = (byte)( i + 4 );
data[i + 5] = (byte)( i + 5 );
data[i + 6] = (byte)( i + 6 );
data[i + 7] = (byte)( i + 7 );
data[i + 8] = (byte)( i + 8 );
data[i + 9] = (byte)( i + 9 );
}
for( int i = 0; i < mask.Length; i += 10 )
{
mask[i + 0] = 42;
mask[i + 1] = 0;
mask[i + 2] = 0;
mask[i + 3] = 0;
mask[i + 4] = 0;
mask[i + 5] = 0;
mask[i + 6] = 0;
mask[i + 7] = 0;
mask[i + 8] = 0;
mask[i + 9] = 0;
}
}
private static void TimeMerger( string mergerName, int numBytes, Action merger )
{
Stopwatch watch = new Stopwatch();
int numRepeats = 5;
// Prime the merge impl to ensure JIT is already done.
merger();
watch.Start();
for( int i = 0; i < numRepeats; i++ )
{
merger();
}
watch.Stop();
long millisecsPerRepeat = watch.ElapsedMilliseconds / numRepeats;
double throughput = (numBytes / 1000.0) / millisecsPerRepeat;
Console.WriteLine( "{0,-33}: {1:0.0} ms, {2:0.0} mbytes/sec", mergerName, millisecsPerRepeat, throughput );
}
private static void Naive( byte[] data, byte[] mask, byte[] result )
{
for( int i = 0; i < data.Length; i++ )
{
result[i] = ( mask[i] == 0 ) ? data[i] : mask[i];
}
}
private static void Unrolled( byte[] data, byte[] mask, byte[] result )
{
for( int i = 0; i < data.Length; i += 10 )
{
result[i + 0] = ( mask[i + 0] == 0 ) ? data[i + 0] : mask[i + 0];
result[i + 1] = ( mask[i + 1] == 0 ) ? data[i + 1] : mask[i + 1];
result[i + 2] = ( mask[i + 2] == 0 ) ? data[i + 2] : mask[i + 2];
result[i + 3] = ( mask[i + 3] == 0 ) ? data[i + 3] : mask[i + 3];
result[i + 4] = ( mask[i + 4] == 0 ) ? data[i + 4] : mask[i + 4];
result[i + 5] = ( mask[i + 5] == 0 ) ? data[i + 5] : mask[i + 5];
result[i + 6] = ( mask[i + 6] == 0 ) ? data[i + 6] : mask[i + 6];
result[i + 7] = ( mask[i + 7] == 0 ) ? data[i + 7] : mask[i + 7];
result[i + 8] = ( mask[i + 8] == 0 ) ? data[i + 8] : mask[i + 8];
result[i + 9] = ( mask[i + 9] == 0 ) ? data[i + 9] : mask[i + 9];
}
}
private static void Vectorized( byte[] data, byte[] mask, byte[] result )
{
int byteVectorSize = Vector<byte>.Count;
Vector<byte> sourceVect;
Vector<byte> maskVect;
Vector<byte> emptyMaskBits;
Vector<byte> resultVector;
for( int i = 0; i < data.Length; i += byteVectorSize )
{
sourceVect = new Vector<byte>( data, i );
maskVect = new Vector<byte>( mask, i );
emptyMaskBits = Vector.Equals( maskVect, Vector<byte>.Zero );
resultVector = ( sourceVect & emptyMaskBits ) | maskVect;
resultVector.CopyTo( result, i );
}
}
private static void ThreadedNaive_MaxThreads( byte[] data, byte[] mask, byte[] result )
{
int numThreads = Environment.ProcessorCount;
ThreadedNaive_Body( numThreads, data, mask, result );
}
private static void ThreadedNaive_NoHyperThreads( byte[] data, byte[] mask, byte[] result )
{
int numThreads = Environment.ProcessorCount / 2;
ThreadedNaive_Body( numThreads, data, mask, result );
}
private static void ThreadedNaive_Body( int numThreads, byte[] data, byte[] mask, byte[] result )
{
int runLength = (int)(data.Length / numThreads);
var tokens = new List<ManualResetEventSlim>( numThreads );
for( int cpu = 0; cpu < numThreads; cpu++ )
{
int cpuInside = cpu;
int start = runLength * cpu;
int end = start + runLength - 1;
tokens.Add( new ManualResetEventSlim( false ) );
ThreadPool.QueueUserWorkItem( ( ignored ) =>
{
for( int i = start; i <= end; i++ )
{
result[i] = ( mask[i] == 0 ) ? data[i] : mask[i];
}
tokens[cpuInside].Set();
} );
}
foreach( var token in tokens )
{
token.Wait();
}
}
private static void ThreadedUnrolled_MaxThreads( byte[] data, byte[] mask, byte[] result )
{
int numThreads = Environment.ProcessorCount;
ThreadedUnrolled_Body( numThreads, data, mask, result );
}
private static void ThreadedUnrolled_NoHyperthreads( byte[] data, byte[] mask, byte[] result )
{
int numThreads = Environment.ProcessorCount / 2;
ThreadedUnrolled_Body( numThreads, data, mask, result );
}
private static void ThreadedUnrolled_Body( int numThreads, byte[] data, byte[] mask, byte[] result )
{
int runLength = (int)(data.Length / numThreads);
var tokens = new List<ManualResetEventSlim>( numThreads );
for( int cpu = 0; cpu < numThreads; cpu++ )
{
int cpuInside = cpu;
int start = runLength * cpu;
int end = start + runLength - 1;
tokens.Add( new ManualResetEventSlim( false ) );
ThreadPool.QueueUserWorkItem( ( ignored ) =>
{
for( int i = start; i <= end; i += 10 )
{
result[i + 0] = ( mask[i + 0] == 0 ) ? data[i + 0] : mask[i + 0];
result[i + 1] = ( mask[i + 1] == 0 ) ? data[i + 1] : mask[i + 1];
result[i + 2] = ( mask[i + 2] == 0 ) ? data[i + 2] : mask[i + 2];
result[i + 3] = ( mask[i + 3] == 0 ) ? data[i + 3] : mask[i + 3];
result[i + 4] = ( mask[i + 4] == 0 ) ? data[i + 4] : mask[i + 4];
result[i + 5] = ( mask[i + 5] == 0 ) ? data[i + 5] : mask[i + 5];
result[i + 6] = ( mask[i + 6] == 0 ) ? data[i + 6] : mask[i + 6];
result[i + 7] = ( mask[i + 7] == 0 ) ? data[i + 7] : mask[i + 7];
result[i + 8] = ( mask[i + 8] == 0 ) ? data[i + 8] : mask[i + 8];
result[i + 9] = ( mask[i + 9] == 0 ) ? data[i + 9] : mask[i + 9];
}
tokens[cpuInside].Set();
} );
}
foreach( var token in tokens )
{
token.Wait();
}
}
private static void ThreadedVectorize_MaxThreads( byte[] data, byte[] mask, byte[] result )
{
int numThreads = Environment.ProcessorCount;
ThreadedVectorize_Body( numThreads, data, mask, result );
}
private static void ThreadedVectorize_NoHyperthreads( byte[] data, byte[] mask, byte[] result )
{
int numThreads = Environment.ProcessorCount / 2;
ThreadedVectorize_Body( numThreads, data, mask, result );
}
private static void ThreadedVectorize_Body( int numThreads, byte[] data, byte[] mask, byte[] result )
{
int runLength = (int)(data.Length / numThreads);
var tokens = new List<ManualResetEventSlim>( numThreads );
for( int cpu = 0; cpu < numThreads; cpu++ )
{
int cpuInside = cpu;
int start = runLength * cpu;
int end = start + runLength - 1;
tokens.Add( new ManualResetEventSlim( false ) );
ThreadPool.QueueUserWorkItem( ( ignored ) =>
{
int byteVectorSize = Vector<byte>.Count;
Vector<byte> sourceVect;
Vector<byte> maskVect;
Vector<byte> emptyMaskBits;
Vector<byte> resultVector;
for( int i = start; i < end; i += byteVectorSize )
{
sourceVect = new Vector<byte>( data, i );
maskVect = new Vector<byte>( mask, i );
emptyMaskBits = Vector.Equals( maskVect, Vector<byte>.Zero );
resultVector = ( sourceVect & emptyMaskBits ) | maskVect;
resultVector.CopyTo( result, i );
}
tokens[cpuInside].Set();
} );
}
foreach( var token in tokens )
{
token.Wait();
}
}
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment