Last active
February 8, 2018 18:50
-
-
Save antiduh/aa111df5c6ebb9fd75a5bbc04afce1e3 to your computer and use it in GitHub Desktop.
Fastest byte array merge for reddit, https://www.reddit.com/r/csharp/comments/7w09vc/
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
using System; | |
using System.Collections.Generic; | |
using System.Diagnostics; | |
using System.Numerics; | |
using System.Threading; | |
namespace MergeBenchmark
{
    /// <summary>
    /// Benchmarks several implementations of a byte-array merge, where
    /// result[i] = (mask[i] == 0) ? data[i] : mask[i]. Compares naive, manually
    /// unrolled, and SIMD (Vector&lt;byte&gt;) loops, each single-threaded and
    /// partitioned across thread-pool threads.
    /// </summary>
    internal static class Program
    {
        /// <summary>
        /// The main entry point for the application.
        /// </summary>
        [STAThread]
        private static void Main()
        {
            /*
            Results:
             Naive                            : 1074.0 ms, 878.7 mbytes/sec
             ThreadedNaive_MaxThreads         : 262.0 ms, 3602.0 mbytes/sec
             ThreadedNaive_NoHyperThreads     : 327.0 ms, 2886.0 mbytes/sec
             Unrolled                         : 805.0 ms, 1172.3 mbytes/sec
             ThreadedUnrolled_MaxThreads      : 263.0 ms, 3588.3 mbytes/sec
             ThreadedUnrolled_NoHyperthreads  : 322.0 ms, 2930.8 mbytes/sec
             Vector                           : 210.0 ms, 4493.9 mbytes/sec
             ThreadedVectorize_MaxThreads     : 177.0 ms, 5331.7 mbytes/sec
             ThreadedVectorize_NoHyperthreads : 181.0 ms, 5213.9 mbytes/sec
            */

            // The vector results are only meaningful when Vector<T> is backed by
            // real SIMD hardware; refuse to produce misleading numbers otherwise.
            if( Vector.IsHardwareAccelerated == false )
            {
                throw new InvalidOperationException( "Invalid benchmark" );
            }

            // Reduce scheduling noise from other processes while measuring.
            Process.GetCurrentProcess().PriorityClass = ProcessPriorityClass.High;

            int numBytes = 900 * 1024 * 1024;

            byte[] data = new byte[ numBytes ];
            byte[] mask = new byte[ numBytes ];
            byte[] result = new byte[ numBytes ];

            FillData( data, mask );

            TimeMerger( "Naive", numBytes, () => Naive( data, mask, result ) );
            TimeMerger( "ThreadedNaive_MaxThreads", numBytes, () => ThreadedNaive_MaxThreads( data, mask, result ) );
            TimeMerger( "ThreadedNaive_NoHyperThreads", numBytes, () => ThreadedNaive_NoHyperThreads( data, mask, result ) );

            Console.WriteLine();

            TimeMerger( "Unrolled", numBytes, () => Unrolled( data, mask, result ) );
            TimeMerger( "ThreadedUnrolled_MaxThreads", numBytes, () => ThreadedUnrolled_MaxThreads( data, mask, result ) );
            TimeMerger( "ThreadedUnrolled_NoHyperthreads", numBytes, () => ThreadedUnrolled_NoHyperthreads( data, mask, result ) );

            Console.WriteLine();

            TimeMerger( "Vector", numBytes, () => Vectorized( data, mask, result ) );
            TimeMerger( "ThreadedVectorize_MaxThreads", numBytes, () => ThreadedVectorize_MaxThreads( data, mask, result ) );
            TimeMerger( "ThreadedVectorize_NoHyperthreads", numBytes, () => ThreadedVectorize_NoHyperthreads( data, mask, result ) );

            Console.WriteLine( "Press any key to continue..." );
            Console.ReadLine();
        }

        /// <summary>
        /// Fills <paramref name="data"/> with a repeating byte ramp (data[i] == (byte)i)
        /// and <paramref name="mask"/> with 42 at every tenth index, 0 elsewhere, so
        /// roughly 10% of merge positions take the mask branch. Works for any length;
        /// the original unrolled version required length % 10 == 0.
        /// </summary>
        private static void FillData( byte[] data, byte[] mask )
        {
            for( int i = 0; i < data.Length; i++ )
            {
                data[i] = (byte)i;
                mask[i] = ( i % 10 == 0 ) ? (byte)42 : (byte)0;
            }
        }

        /// <summary>
        /// Runs <paramref name="merger"/> once untimed to warm the JIT, then times
        /// several repeats and prints the average per-repeat time and throughput
        /// in mbytes/sec.
        /// </summary>
        private static void TimeMerger( string mergerName, int numBytes, Action merger )
        {
            Stopwatch watch = new Stopwatch();
            int numRepeats = 5;

            // Prime the merge impl to ensure JIT is already done.
            merger();

            watch.Start();
            for( int i = 0; i < numRepeats; i++ )
            {
                merger();
            }
            watch.Stop();

            // Use fractional milliseconds; integer ElapsedMilliseconds / numRepeats
            // truncated and made the "{1:0.0}" format pointless.
            double millisecsPerRepeat = watch.Elapsed.TotalMilliseconds / numRepeats;

            // (bytes / ms) / 1000 == megabytes per second.
            double throughput = ( numBytes / 1000.0 ) / millisecsPerRepeat;

            Console.WriteLine( "{0,-33}: {1:0.0} ms, {2:0.0} mbytes/sec", mergerName, millisecsPerRepeat, throughput );
        }

        /// <summary>
        /// Scalar merge of the half-open range [start, end). Shared by the simple
        /// implementations and used as the tail handler by the unrolled/vectorized ones.
        /// </summary>
        private static void MergeRange( byte[] data, byte[] mask, byte[] result, int start, int end )
        {
            for( int i = start; i < end; i++ )
            {
                result[i] = ( mask[i] == 0 ) ? data[i] : mask[i];
            }
        }

        private static void Naive( byte[] data, byte[] mask, byte[] result )
        {
            MergeRange( data, mask, result, 0, data.Length );
        }

        private static void Unrolled( byte[] data, byte[] mask, byte[] result )
        {
            UnrolledRange( data, mask, result, 0, data.Length );
        }

        /// <summary>
        /// 10x manually unrolled merge of [start, end). Any remainder that does not
        /// fill a full unrolled iteration is finished with a scalar tail loop, so the
        /// range length no longer has to be a multiple of 10.
        /// </summary>
        private static void UnrolledRange( byte[] data, byte[] mask, byte[] result, int start, int end )
        {
            int span = end - start;
            int limit = start + ( span - span % 10 );

            for( int i = start; i < limit; i += 10 )
            {
                result[i + 0] = ( mask[i + 0] == 0 ) ? data[i + 0] : mask[i + 0];
                result[i + 1] = ( mask[i + 1] == 0 ) ? data[i + 1] : mask[i + 1];
                result[i + 2] = ( mask[i + 2] == 0 ) ? data[i + 2] : mask[i + 2];
                result[i + 3] = ( mask[i + 3] == 0 ) ? data[i + 3] : mask[i + 3];
                result[i + 4] = ( mask[i + 4] == 0 ) ? data[i + 4] : mask[i + 4];
                result[i + 5] = ( mask[i + 5] == 0 ) ? data[i + 5] : mask[i + 5];
                result[i + 6] = ( mask[i + 6] == 0 ) ? data[i + 6] : mask[i + 6];
                result[i + 7] = ( mask[i + 7] == 0 ) ? data[i + 7] : mask[i + 7];
                result[i + 8] = ( mask[i + 8] == 0 ) ? data[i + 8] : mask[i + 8];
                result[i + 9] = ( mask[i + 9] == 0 ) ? data[i + 9] : mask[i + 9];
            }

            MergeRange( data, mask, result, limit, end );
        }

        private static void Vectorized( byte[] data, byte[] mask, byte[] result )
        {
            VectorizedRange( data, mask, result, 0, data.Length );
        }

        /// <summary>
        /// SIMD merge of [start, end) using Vector&lt;byte&gt;. Any remainder smaller
        /// than one vector is finished with a scalar tail loop, so the range length
        /// no longer has to be a multiple of Vector&lt;byte&gt;.Count (the original
        /// threaded version could skip bytes or read past its chunk otherwise).
        /// </summary>
        private static void VectorizedRange( byte[] data, byte[] mask, byte[] result, int start, int end )
        {
            int byteVectorSize = Vector<byte>.Count;
            int span = end - start;
            int limit = start + ( span - span % byteVectorSize );

            for( int i = start; i < limit; i += byteVectorSize )
            {
                var sourceVect = new Vector<byte>( data, i );
                var maskVect = new Vector<byte>( mask, i );

                // Lanes where the mask byte is zero become all-ones, letting the
                // data byte through; every other lane keeps the mask byte.
                Vector<byte> emptyMaskBits = Vector.Equals( maskVect, Vector<byte>.Zero );

                Vector<byte> resultVector = ( sourceVect & emptyMaskBits ) | maskVect;
                resultVector.CopyTo( result, i );
            }

            MergeRange( data, mask, result, limit, end );
        }

        /// <summary>
        /// Partitions [0, length) into one contiguous chunk per thread, queues
        /// <paramref name="worker"/>( startInclusive, endExclusive ) for each chunk on
        /// the thread pool, and blocks until every chunk completes. The final chunk
        /// absorbs the remainder of the integer division, so the whole array is always
        /// covered (the original bodies silently dropped length % numThreads bytes and
        /// leaked their ManualResetEventSlim instances).
        /// </summary>
        private static void RunPartitioned( int numThreads, int length, Action<int, int> worker )
        {
            int runLength = length / numThreads;

            using( var pending = new CountdownEvent( numThreads ) )
            {
                for( int cpu = 0; cpu < numThreads; cpu++ )
                {
                    int start = runLength * cpu;

                    // Last chunk takes the tail left over by the integer division.
                    int end = ( cpu == numThreads - 1 ) ? length : start + runLength;

                    ThreadPool.QueueUserWorkItem( ( ignored ) =>
                    {
                        worker( start, end );
                        pending.Signal();
                    } );
                }

                pending.Wait();
            }
        }

        /// <summary>
        /// Returns half the processor count as a proxy for "physical cores only",
        /// clamped to at least 1 so single-core machines don't divide by zero.
        /// </summary>
        private static int NoHyperThreadCount()
        {
            return Math.Max( 1, Environment.ProcessorCount / 2 );
        }

        private static void ThreadedNaive_MaxThreads( byte[] data, byte[] mask, byte[] result )
        {
            ThreadedNaive_Body( Environment.ProcessorCount, data, mask, result );
        }

        private static void ThreadedNaive_NoHyperThreads( byte[] data, byte[] mask, byte[] result )
        {
            ThreadedNaive_Body( NoHyperThreadCount(), data, mask, result );
        }

        private static void ThreadedNaive_Body( int numThreads, byte[] data, byte[] mask, byte[] result )
        {
            RunPartitioned( numThreads, data.Length, ( start, end ) =>
            {
                MergeRange( data, mask, result, start, end );
            } );
        }

        private static void ThreadedUnrolled_MaxThreads( byte[] data, byte[] mask, byte[] result )
        {
            ThreadedUnrolled_Body( Environment.ProcessorCount, data, mask, result );
        }

        private static void ThreadedUnrolled_NoHyperthreads( byte[] data, byte[] mask, byte[] result )
        {
            ThreadedUnrolled_Body( NoHyperThreadCount(), data, mask, result );
        }

        private static void ThreadedUnrolled_Body( int numThreads, byte[] data, byte[] mask, byte[] result )
        {
            RunPartitioned( numThreads, data.Length, ( start, end ) =>
            {
                UnrolledRange( data, mask, result, start, end );
            } );
        }

        private static void ThreadedVectorize_MaxThreads( byte[] data, byte[] mask, byte[] result )
        {
            ThreadedVectorize_Body( Environment.ProcessorCount, data, mask, result );
        }

        private static void ThreadedVectorize_NoHyperthreads( byte[] data, byte[] mask, byte[] result )
        {
            ThreadedVectorize_Body( NoHyperThreadCount(), data, mask, result );
        }

        private static void ThreadedVectorize_Body( int numThreads, byte[] data, byte[] mask, byte[] result )
        {
            RunPartitioned( numThreads, data.Length, ( start, end ) =>
            {
                VectorizedRange( data, mask, result, start, end );
            } );
        }
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment