Created
April 22, 2017 10:35
-
-
Save mgravell/52009d6e890a389101867488af5350c5 to your computer and use it in GitHub Desktop.
Why is Vector.Widen slower than looping?
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
using BenchmarkDotNet.Attributes; | |
using BenchmarkDotNet.Running; | |
using System; | |
using System.Numerics; | |
using System.Runtime.CompilerServices; | |
namespace WidenYUSlow | |
{ | |
/* | |
packages: (may need myget feeds) | |
<PackageReference Include="benchmarkdotnet" Version="0.10.4" /> | |
<PackageReference Include="System.Numerics.Vectors" Version="4.4.0-preview1-25219-04" /> | |
<PackageReference Include="System.Runtime.CompilerServices.Unsafe" Version="4.4.0-preview1-25219-04" /> | |
<PackageReference Include="System.Memory" Version="4.4.0-preview1-25219-04" /> | |
my results: | |
Method | Mean | Error | StdDev | Scaled | ScaledSD | | |
-------------- |------------:|----------:|----------:|-------:|---------:| | |
Baseline | 111.7766 us | 0.3519 us | 0.3119 us | 1.00 | 0.00 | | |
VectorizeCopy | 135.9905 us | 1.2109 us | 1.0734 us | 1.22 | 0.01 | | |
VectorizeSpan | 135.2417 us | 1.4157 us | 1.1053 us | 1.21 | 0.01 | | |
*/ | |
public class Program | |
{ | |
// Entry point: runs every [Benchmark] method declared on Program and writes
// the returned summary object to the console.
static void Main()
{
    // Touch a static field first so the static constructor (and the startup
    // checks it performs) has run before benchmarking begins.
    GC.KeepAlive(_ascii);

    Console.WriteLine(BenchmarkRunner.Run<Program>());
}
// Type initializer: allocates the shared input/output buffers, fills the input
// with the values 0..127 repeated 512 times, then runs each decoder once and
// verifies both the decoded data and the reported loop counts.
static Program()
{
    const int Repeats = 512;
    const int AsciiRange = 128;

    _ascii = new byte[Repeats * AsciiRange];
    _chars = new char[_ascii.Length];
    for (int i = 0; i < _ascii.Length; i++)
    {
        _ascii[i] = (byte)(i % AsciiRange);
    }

    // The vectorized decoders only enter their SIMD loop when
    // Vector.IsHardwareAccelerated is true; otherwise every element is handled
    // by their scalar "stragglers" loop. The expected counts must follow the
    // same rule, or this sanity check would reject correct output on runtimes
    // without hardware acceleration.
    int expectedVectorLoops = 0;
    int expectedByteLoops = _ascii.Length;
    if (Vector.IsHardwareAccelerated)
    {
        expectedVectorLoops = _ascii.Length / Vector<byte>.Count;
        expectedByteLoops = _ascii.Length % Vector<byte>.Count;
    }

    // sense-check all versions during startup
    DecodeAsciiByteByByte(_ascii, _chars, out int vectorLoops, out int byteLoops);
    AssertAndErase(_ascii, _chars, vectorLoops, 0, byteLoops, _ascii.Length);

    DecodeAsciiVectorizedViaCopy(_ascii, _chars, out vectorLoops, out byteLoops);
    AssertAndErase(_ascii, _chars, vectorLoops, expectedVectorLoops, byteLoops, expectedByteLoops);

    DecodeAsciiVectorizedViaSpan(_ascii, _chars, out vectorLoops, out byteLoops);
    AssertAndErase(_ascii, _chars, vectorLoops, expectedVectorLoops, byteLoops, expectedByteLoops);
}
// Verifies one decoder run and resets the output buffer for the next one.
// For every index of ascii, chars must hold the same value widened to char;
// each verified slot is zeroed as the scan proceeds. The loop counters are
// compared only after the data scan has completed.
// Throws InvalidOperationException on the first data mismatch, or when either
// actual loop count differs from its expected value.
static void AssertAndErase(byte[] ascii, char[] chars, int actualVectorLoops, int expectedVectorLoops,
    int actualByteLoops, int expectedByteLoops)
{
    int position = 0;
    while (position < ascii.Length)
    {
        if (chars[position] != (char)ascii[position])
        {
            throw new InvalidOperationException("Data mismatch");
        }

        chars[position] = '\0';
        position++;
    }

    if (actualVectorLoops != expectedVectorLoops)
    {
        throw new InvalidOperationException("Vector loop mismatch");
    }

    if (actualByteLoops != expectedByteLoops)
    {
        throw new InvalidOperationException("Byte loop mismatch");
    }
}
// Input bytes handed to every decoder; allocated and filled by the static constructor.
private static byte[] _ascii;

// Output buffer the decoders write into; allocated by the static constructor
// with the same length as _ascii.
private static char[] _chars;

public Program()
{
    // Reference both static buffers so the static constructor is forced to
    // run before any benchmark method executes on this instance.
    GC.KeepAlive(_chars);
    GC.KeepAlive(_ascii);
}
// Number of decoder calls each benchmark method makes per invocation; the same
// value is passed to the [Benchmark] attribute as OperationsPerInvoke.
const int OperationsPerInvoke = 250;

// Baseline benchmark: decodes the shared buffer with the plain element-by-element
// loop OperationsPerInvoke times and returns the sum of the decoder's return values.
[Benchmark(Baseline = true, OperationsPerInvoke = OperationsPerInvoke)]
public int Baseline()
{
    int charsDecoded = 0;
    for (int pass = 0; pass < OperationsPerInvoke; pass++)
    {
        // the loop counters are only needed by the startup checks
        charsDecoded += DecodeAsciiByteByByte(_ascii, _chars, out _, out _);
    }

    return charsDecoded;
}
// Benchmark for the pointer-based Vector.Widen decoder: runs it
// OperationsPerInvoke times over the shared buffers and returns the sum of
// the decoder's return values.
[Benchmark(OperationsPerInvoke = OperationsPerInvoke)]
public int VectorizeCopy()
{
    int charsDecoded = 0;
    for (int pass = 0; pass < OperationsPerInvoke; pass++)
    {
        // the loop counters are only needed by the startup checks
        charsDecoded += DecodeAsciiVectorizedViaCopy(_ascii, _chars, out _, out _);
    }

    return charsDecoded;
}
// Benchmark for the Span-based Vector.Widen decoder: runs it
// OperationsPerInvoke times over the shared buffers and returns the sum of
// the decoder's return values.
[Benchmark(OperationsPerInvoke = OperationsPerInvoke)]
public int VectorizeSpan()
{
    int charsDecoded = 0;
    for (int pass = 0; pass < OperationsPerInvoke; pass++)
    {
        // the loop counters are only needed by the startup checks
        charsDecoded += DecodeAsciiVectorizedViaSpan(_ascii, _chars, out _, out _);
    }

    return charsDecoded;
}
// Scalar reference decoder: copies each byte of ascii into the matching slot
// of chars, widened to char.
// Processes min(ascii.Length, chars.Length) elements and returns that count.
// vectorLoops is always 0; byteLoops is incremented once per element copied.
private static int DecodeAsciiByteByByte(byte[] ascii, char[] chars, out int vectorLoops, out int byteLoops)
{
    vectorLoops = 0;
    byteLoops = 0;

    int limit = Math.Min(ascii.Length, chars.Length);
    int offset = 0;
    while (offset < limit)
    {
        chars[offset] = (char)ascii[offset];
        byteLoops++;
        offset++;
    }

    return limit;
}
// Vectorized decoder using pinned pointers.
// When Vector.IsHardwareAccelerated is true, reads Vector<byte>.Count bytes at
// a time, widens them with Vector.Widen into two Vector<ushort> values and
// writes both halves to the char buffer; whatever is left over (or everything,
// when acceleration is unavailable) is copied one element at a time.
// Processes min(ascii.Length, chars.Length) elements and returns that count.
// vectorLoops counts SIMD iterations; byteLoops counts scalar iterations.
private static unsafe int DecodeAsciiVectorizedViaCopy(byte[] ascii, char[] chars, out int vectorLoops, out int byteLoops)
{
    vectorLoops = 0;
    byteLoops = 0;

    int maxBytes = Math.Min(ascii.Length, chars.Length);
    int scalarStart = 0;

    if (Vector.IsHardwareAccelerated)
    {
        int blocks = maxBytes / Vector<byte>.Count;

        // the scalar loop below resumes right after the last whole vector
        scalarStart = blocks * Vector<byte>.Count;

        fixed (byte* pinnedSource = ascii)
        fixed (char* pinnedTarget = chars)
        {
            byte* source = pinnedSource;
            char* target = pinnedTarget;

            for (int remaining = blocks; remaining != 0; remaining--)
            {
                Vector<byte> narrow = Unsafe.Read<Vector<byte>>(source);
                source += Vector<byte>.Count;

                // one vector of bytes widens into two vectors of ushort
                Vector.Widen(narrow, out Vector<ushort> firstHalf, out Vector<ushort> secondHalf);

                Unsafe.Write(target, firstHalf);
                target += Vector<ushort>.Count;
                Unsafe.Write(target, secondHalf);
                target += Vector<ushort>.Count;

                vectorLoops++;
            }
        }
    }

    // any stragglers
    for (int index = scalarStart; index < maxBytes; index++)
    {
        chars[index] = (char)ascii[index];
        byteLoops++;
    }

    return maxBytes;
}
// Vectorized decoder using Span reinterpretation instead of pinned pointers.
// When Vector.IsHardwareAccelerated is true, the byte array is viewed as a span
// of Vector<byte> and each element is widened with Vector.Widen directly into
// two consecutive Vector<ushort> slots of the char buffer; whatever is left
// over (or everything, when acceleration is unavailable) is copied one element
// at a time.
// Processes min(ascii.Length, chars.Length) elements and returns that count.
// vectorLoops counts SIMD iterations; byteLoops counts scalar iterations.
private static int DecodeAsciiVectorizedViaSpan(byte[] ascii, char[] chars, out int vectorLoops, out int byteLoops)
{
    vectorLoops = 0;
    byteLoops = 0;

    int maxBytes = Math.Min(ascii.Length, chars.Length);
    int scalarStart = 0;

    if (Vector.IsHardwareAccelerated)
    {
        // view the input bytes as a span of Vector<byte>
        Span<byte> asciiSpan = new Span<byte>(ascii);
        var source = asciiSpan.NonPortableCast<byte, Vector<byte>>();

        // take a *reference* to the first Vector<ushort>-sized slot of chars
        Span<char> charSpan = new Span<char>(chars);
        ref Vector<ushort> target = ref charSpan
            .NonPortableCast<char, Vector<ushort>>()
            .DangerousGetPinnableReference();

        int blocks = maxBytes / Vector<byte>.Count;

        // the scalar loop below resumes right after the last whole vector
        scalarStart = blocks * Vector<byte>.Count;

        // each input vector fills two consecutive output slots
        for (int block = 0, slot = 0; block < blocks; block++, slot += 2)
        {
            Vector.Widen(source[block],
                out Unsafe.Add(ref target, slot),
                out Unsafe.Add(ref target, slot + 1));
            vectorLoops++;
        }
    }

    // any stragglers
    for (int index = scalarStart; index < maxBytes; index++)
    {
        chars[index] = (char)ascii[index];
        byteLoops++;
    }

    return maxBytes;
}
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment