Created
November 23, 2019 11:54
-
-
Save Const-me/3cb028a44abe2f207d147c167563b297 to your computer and use it in GitHub Desktop.
Manually propagated invariants, and fixed benchmarking code
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
using System; | |
using System.Diagnostics; | |
using System.Runtime.Intrinsics; | |
using System.Runtime.Intrinsics.X86; | |
namespace SimdBrightness
{
    /// <summary>Benchmark of an SSE-accelerated RGBA to grayscale conversion.</summary>
    static class Program
    {
        /// <summary>Number of pixels consumed by one iteration of the vectorized loop.</summary>
        const int pixelsPerBlock = 16;

        /// <summary>Load 4 RGBA pixels (16 bytes) from an unaligned address.</summary>
        /// <param name="src">Pointer to at least 16 readable bytes.</param>
        /// <returns>One pixel per 32-bit lane.</returns>
        static unsafe Vector128<int> load4( byte* src )
        {
            return Sse2.LoadVector128( (int*)src );
        }

        /// <summary>Pack red channel of 8 pixels into ushort values in [ 0 .. 0xFF00 ] interval</summary>
        /// <param name="a">First 4 pixels.</param>
        /// <param name="b">Next 4 pixels.</param>
        /// <param name="mask">Mask selecting the lowest byte of every 32-bit lane.</param>
        static Vector128<ushort> packRed( Vector128<int> a, Vector128<int> b, Vector128<int> mask )
        {
            a = Sse2.And( a, mask );
            b = Sse2.And( b, mask );
            // After packing, every 16-bit lane holds 0..0xFF with a zero high byte,
            // so a whole-register shift by one byte moves each value into the high byte of its own lane.
            return Sse2.ShiftLeftLogical128BitLane( Sse41.PackUnsignedSaturate( a, b ), 1 );
        }

        /// <summary>Pack green channel of 8 pixels into ushort values in [ 0 .. 0xFF00 ] interval</summary>
        /// <param name="a">First 4 pixels.</param>
        /// <param name="b">Next 4 pixels.</param>
        /// <param name="mask">Mask selecting the second byte of every 32-bit lane.</param>
        static Vector128<ushort> packGreen( Vector128<int> a, Vector128<int> b, Vector128<int> mask )
        {
            // Green already sits in bits 8..15, no shift required.
            a = Sse2.And( a, mask );
            b = Sse2.And( b, mask );
            return Sse41.PackUnsignedSaturate( a, b );
        }

        /// <summary>Pack blue channel of 8 pixels into ushort values in [ 0 .. 0xFF00 ] interval</summary>
        /// <param name="a">First 4 pixels.</param>
        /// <param name="b">Next 4 pixels.</param>
        /// <param name="mask">Mask selecting the second byte of every 32-bit lane.</param>
        static Vector128<ushort> packBlue( Vector128<int> a, Vector128<int> b, Vector128<int> mask )
        {
            // Move the third byte of every pixel into the second byte; bytes that
            // cross over from the neighbouring lane are removed by the mask below.
            a = Sse2.ShiftRightLogical128BitLane( a, 1 );
            b = Sse2.ShiftRightLogical128BitLane( b, 1 );
            a = Sse2.And( a, mask );
            b = Sse2.And( b, mask );
            return Sse41.PackUnsignedSaturate( a, b );
        }

        /// <summary>Load 8 pixels (32 bytes), split into RGB channels.</summary>
        /// <param name="src">Pointer to at least 32 readable bytes.</param>
        /// <param name="red">Receives the red channel, one ushort per pixel.</param>
        /// <param name="green">Receives the green channel, one ushort per pixel.</param>
        /// <param name="blue">Receives the blue channel, one ushort per pixel.</param>
        /// <param name="lowByte">Mask with 0xFF in every 32-bit lane.</param>
        /// <param name="secondByte">Mask with 0xFF00 in every 32-bit lane.</param>
        static unsafe void loadRgb( byte* src, out Vector128<ushort> red, out Vector128<ushort> green, out Vector128<ushort> blue, Vector128<int> lowByte, Vector128<int> secondByte )
        {
            var a = load4( src );
            var b = load4( src + 16 );
            red = packRed( a, b, lowByte );
            green = packGreen( a, b, secondByte );
            blue = packBlue( a, b, secondByte );
        }

        /// <summary>Compute brightness of 8 pixels</summary>
        /// <remarks>Channels are expected in the high byte of each ushort lane; the result is in [ 0 .. 0xFF ] per lane.</remarks>
        static Vector128<short> brightness( Vector128<ushort> r, Vector128<ushort> g, Vector128<ushort> b, Vector128<ushort> redMul, Vector128<ushort> greenMul, Vector128<ushort> blueMul )
        {
            // High half of the 16x16 bit product, i.e. ( channel * multiplier ) >> 16
            r = Sse2.MultiplyHigh( r, redMul );
            g = Sse2.MultiplyHigh( g, greenMul );
            b = Sse2.MultiplyHigh( b, blueMul );
            var result = Sse2.AddSaturate( Sse2.AddSaturate( r, g ), b );
            return Vector128.AsInt16( Sse2.ShiftRightLogical( result, 8 ) );
        }

        // Channel weights in 0.16 fixed point, truncated towards zero.
        const ushort mulRed = (ushort)( 0.29891 * 0x10000 );
        const ushort mulGreen = (ushort)( 0.58661 * 0x10000 );
        const ushort mulBlue = (ushort)( 0.11448 * 0x10000 );

        /// <summary>Compute brightness of a single pixel, bit-exact with the vectorized <see cref="brightness"/>.</summary>
        static byte scalarBrightness( byte r, byte g, byte b )
        {
            // The SIMD path evaluates ( ( c << 8 ) * mul ) >> 16 for every channel, which equals ( c * mul ) >> 8.
            int sum = ( ( r * mulRed ) >> 8 ) + ( ( g * mulGreen ) >> 8 ) + ( ( b * mulBlue ) >> 8 );
            // Two saturating 16-bit additions of non-negative values are the same as clamping the exact sum.
            return (byte)( Math.Min( sum, ushort.MaxValue ) >> 8 );
        }

        /// <summary>Convert buffer from RGBA to grayscale.</summary>
        /// <param name="src">Source pixels, 4 bytes each in R, G, B, A order; at least <paramref name="count"/> * 4 bytes.</param>
        /// <param name="dst">Destination buffer, one byte per pixel; at least <paramref name="count"/> bytes.</param>
        /// <param name="count">Number of pixels to convert; zero or negative values do nothing.</param>
        /// <remarks>
        /// <para>If your image has line paddings, you'll want to call this once per line, not for the complete image.</para>
        /// <para>Whole groups of 16 pixels are processed with SSE4.1; the remaining pixels, or all of them when SSE4.1 is unavailable, go through an equivalent scalar path, so the buffers are never accessed past <paramref name="count"/> pixels.</para>
        /// <para>All intermediate steps round down, so the output can be slightly below the exact weighted sum, e.g. pure white yields 254.</para>
        /// </remarks>
        static unsafe void convertToGrayscale( byte* src, byte* dst, long count )
        {
            if( count <= 0 )
            {
                return;
            }

            // Only complete blocks are vectorized, a partial block would read and write out of bounds.
            long vectorPixels = Sse41.IsSupported ? count - ( count % pixelsPerBlock ) : 0;
            if( vectorPixels > 0 )
            {
                var lowByte = Vector128.Create( 0xFF );
                var secondByte = Vector128.Create( 0xFF00 );
                var redMul = Vector128.Create( mulRed );
                var greenMul = Vector128.Create( mulGreen );
                var blueMul = Vector128.Create( mulBlue );

                byte* srcEnd = src + vectorPixels * 4;
                while( src < srcEnd )
                {
                    loadRgb( src, out var r, out var g, out var b, lowByte, secondByte );
                    var low = brightness( r, g, b, redMul, greenMul, blueMul );
                    loadRgb( src + 32, out r, out g, out b, lowByte, secondByte );
                    var hi = brightness( r, g, b, redMul, greenMul, blueMul );
                    var bytes = Sse2.PackUnsignedSaturate( low, hi );
                    Sse2.Store( dst, bytes );
                    src += 64;
                    dst += 16;
                }
            }

            // Scalar tail: the last ( count % 16 ) pixels, or everything without SSE4.1.
            for( long remaining = count - vectorPixels; remaining > 0; remaining-- )
            {
                *dst = scalarBrightness( src[ 0 ], src[ 1 ], src[ 2 ] );
                src += 4;
                dst++;
            }
        }

        const int pixelCount = 1024 * 1024;

        /// <summary>Run the conversion twice on random data and print the duration of the second run.</summary>
        static unsafe void Main( string[] args )
        {
            byte[] source = new byte[ 4 * pixelCount ];
            byte[] dest = new byte[ pixelCount ];

            // First time the code runs much slower, 8x slower for 1M pixels, 2x slower for 511M pixels, because it takes time to produce x86 out of IL.
            // Could also be due to caching but I doubt it, 511M pixels take almost 2GB RAM and I only have 16MB L3 cache.
            new Random( 0 ).NextBytes( source );
            fixed( byte* pSource = source )
            fixed( byte* pDest = dest )
            {
                convertToGrayscale( pSource, pDest, pixelCount );
            }

            // Second time it's actually useful, measuring this one.
            new Random( 11 ).NextBytes( source );
            Stopwatch sw;
            fixed( byte* pSource = source )
            fixed( byte* pDest = dest )
            {
                sw = Stopwatch.StartNew();
                convertToGrayscale( pSource, pDest, pixelCount );
                sw.Stop();
            }
            Console.WriteLine( "{0} ms", sw.Elapsed.TotalMilliseconds );
        }
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment