Skip to content

Instantly share code, notes, and snippets.

@mgravell
Created April 22, 2017 10:35
Show Gist options
  • Save mgravell/52009d6e890a389101867488af5350c5 to your computer and use it in GitHub Desktop.
Save mgravell/52009d6e890a389101867488af5350c5 to your computer and use it in GitHub Desktop.
Why is Vector.Widen slower than looping?
using BenchmarkDotNet.Attributes;
using BenchmarkDotNet.Running;
using System;
using System.Numerics;
using System.Runtime.CompilerServices;
namespace WidenYUSlow
{
/*
packages: (may need myget feeds)
<PackageReference Include="benchmarkdotnet" Version="0.10.4" />
<PackageReference Include="System.Numerics.Vectors" Version="4.4.0-preview1-25219-04" />
<PackageReference Include="System.Runtime.CompilerServices.Unsafe" Version="4.4.0-preview1-25219-04" />
<PackageReference Include="System.Memory" Version="4.4.0-preview1-25219-04" />
my results:
Method | Mean | Error | StdDev | Scaled | ScaledSD |
-------------- |------------:|----------:|----------:|-------:|---------:|
Baseline | 111.7766 us | 0.3519 us | 0.3119 us | 1.00 | 0.00 |
VectorizeCopy | 135.9905 us | 1.2109 us | 1.0734 us | 1.22 | 0.01 |
VectorizeSpan | 135.2417 us | 1.4157 us | 1.1053 us | 1.21 | 0.01 |
*/
public class Program
{
/// <summary>
/// Entry point: runs the benchmarks declared on <see cref="Program"/> and writes the returned summary to the console.
/// </summary>
static void Main()
{
    // Touch a static field so the static constructor (and its sense-checks) is forced to run first.
    GC.KeepAlive(_ascii);

    Console.WriteLine(BenchmarkRunner.Run<Program>());
}
/// <summary>
/// Allocates the shared input/output buffers, fills the input with repeating values 0..127,
/// and sense-checks every decoder once before any benchmark runs.
/// </summary>
/// <remarks>
/// A failed check throws <see cref="InvalidOperationException"/> out of the type initializer.
/// </remarks>
static Program()
{
    const int Repeats = 512;
    _ascii = new byte[Repeats * 128];
    _chars = new char[_ascii.Length];

    int index = 0;
    for (int i = 0; i < Repeats; i++)
    {
        for (int j = 0; j < 128; j++)
        {
            _ascii[index++] = (byte)j;
        }
    }

    // The vectorized decoders only take their SIMD path when Vector.IsHardwareAccelerated is true;
    // otherwise every element goes through their scalar tail loop. Derive the expected counters
    // from that same condition so the sense-check does not fail on non-accelerated hardware.
    int expectedVectorLoops = Vector.IsHardwareAccelerated ? _ascii.Length / Vector<byte>.Count : 0;
    int expectedByteLoops = _ascii.Length - (expectedVectorLoops * Vector<byte>.Count);

    // sense-check all versions during startup
    DecodeAsciiByteByByte(_ascii, _chars, out int vectorLoops, out int byteLoops);
    AssertAndErase(_ascii, _chars, vectorLoops, 0, byteLoops, _ascii.Length);

    DecodeAsciiVectorizedViaCopy(_ascii, _chars, out vectorLoops, out byteLoops);
    AssertAndErase(_ascii, _chars, vectorLoops, expectedVectorLoops, byteLoops, expectedByteLoops);

    DecodeAsciiVectorizedViaSpan(_ascii, _chars, out vectorLoops, out byteLoops);
    AssertAndErase(_ascii, _chars, vectorLoops, expectedVectorLoops, byteLoops, expectedByteLoops);
}
/// <summary>
/// Checks that <paramref name="chars"/> holds the char-widened value of every element of
/// <paramref name="ascii"/>, resetting each verified char to zero along the way, and then checks
/// the reported loop counters against the expected ones.
/// </summary>
/// <param name="ascii">The source bytes that were decoded.</param>
/// <param name="chars">The decoded output; each verified element is zeroed.</param>
/// <param name="actualVectorLoops">Vector iterations reported by the decoder.</param>
/// <param name="expectedVectorLoops">Vector iterations the caller expects.</param>
/// <param name="actualByteLoops">Scalar iterations reported by the decoder.</param>
/// <param name="expectedByteLoops">Scalar iterations the caller expects.</param>
/// <exception cref="InvalidOperationException">The data or either counter does not match.</exception>
static void AssertAndErase(byte[] ascii, char[] chars, int actualVectorLoops, int expectedVectorLoops,
    int actualByteLoops, int expectedByteLoops)
{
    int position = 0;
    while (position < ascii.Length)
    {
        if (chars[position] != (char)ascii[position])
        {
            throw new InvalidOperationException("Data mismatch");
        }

        // wipe the slot so the next decoder under test starts from a clean buffer
        chars[position] = '\0';
        position++;
    }

    if (expectedVectorLoops != actualVectorLoops)
    {
        throw new InvalidOperationException("Vector loop mismatch");
    }

    if (expectedByteLoops != actualByteLoops)
    {
        throw new InvalidOperationException("Byte loop mismatch");
    }
}
// Shared buffers, assigned by the static constructor: the input bytes and the output chars
// that every decoder reads from / writes to.
private static byte[] _ascii;
private static char[] _chars;

/// <summary>
/// Instance constructor; it only touches the static buffers.
/// </summary>
public Program()
{
    // touch the static fields to force .cctor
    GC.KeepAlive(_ascii);
    GC.KeepAlive(_chars);
}
// Number of decode passes each benchmark method performs per invocation; the same value is
// passed to every [Benchmark] attribute as OperationsPerInvoke.
const int OperationsPerInvoke = 250;

/// <summary>
/// Baseline benchmark: decodes the shared buffer with the byte-by-byte decoder
/// <c>OperationsPerInvoke</c> times.
/// </summary>
/// <returns>The sum of the element counts returned by each decode pass.</returns>
[Benchmark(Baseline = true, OperationsPerInvoke = OperationsPerInvoke)]
public int Baseline()
{
    int decoded = 0;
    int remaining = OperationsPerInvoke;
    while (remaining-- > 0)
    {
        // the loop counters are not needed here, so discard them
        decoded += DecodeAsciiByteByByte(_ascii, _chars, out _, out _);
    }

    return decoded;
}
/// <summary>
/// Benchmark for the pointer-based vectorized decoder: decodes the shared buffer
/// <c>OperationsPerInvoke</c> times.
/// </summary>
/// <returns>The sum of the element counts returned by each decode pass.</returns>
[Benchmark(OperationsPerInvoke = OperationsPerInvoke)]
public int VectorizeCopy()
{
    int decoded = 0;
    int remaining = OperationsPerInvoke;
    while (remaining-- > 0)
    {
        // the loop counters are not needed here, so discard them
        decoded += DecodeAsciiVectorizedViaCopy(_ascii, _chars, out _, out _);
    }

    return decoded;
}
/// <summary>
/// Benchmark for the span-based vectorized decoder: decodes the shared buffer
/// <c>OperationsPerInvoke</c> times.
/// </summary>
/// <returns>The sum of the element counts returned by each decode pass.</returns>
[Benchmark(OperationsPerInvoke = OperationsPerInvoke)]
public int VectorizeSpan()
{
    int decoded = 0;
    int remaining = OperationsPerInvoke;
    while (remaining-- > 0)
    {
        // the loop counters are not needed here, so discard them
        decoded += DecodeAsciiVectorizedViaSpan(_ascii, _chars, out _, out _);
    }

    return decoded;
}
/// <summary>
/// Scalar decoder: casts each byte of <paramref name="ascii"/> to a char and stores it at the
/// same index of <paramref name="chars"/>, one element at a time.
/// </summary>
/// <param name="ascii">Source bytes.</param>
/// <param name="chars">Destination chars.</param>
/// <param name="vectorLoops">Always set to 0; this decoder has no vector path.</param>
/// <param name="byteLoops">Receives the number of scalar iterations performed.</param>
/// <returns>The number of elements decoded: the smaller of the two array lengths.</returns>
private static int DecodeAsciiByteByByte(byte[] ascii, char[] chars, out int vectorLoops, out int byteLoops)
{
    vectorLoops = 0;
    byteLoops = 0;

    // never read or write past the shorter of the two buffers
    int limit = Math.Min(ascii.Length, chars.Length);

    int position = 0;
    while (position < limit)
    {
        chars[position] = (char)ascii[position];
        byteLoops++;
        position++;
    }

    return limit;
}
/// <summary>
/// Vectorized decoder using raw pointers: when <see cref="Vector.IsHardwareAccelerated"/> is true,
/// reads one <c>Vector&lt;byte&gt;</c> at a time from <paramref name="ascii"/>, widens it into two
/// <c>Vector&lt;ushort&gt;</c> halves and writes both to <paramref name="chars"/>. Any elements that
/// do not fill a whole vector (or all elements, without acceleration) are decoded one by one.
/// </summary>
/// <param name="ascii">Source bytes.</param>
/// <param name="chars">Destination chars.</param>
/// <param name="vectorLoops">Receives the number of vector iterations performed.</param>
/// <param name="byteLoops">Receives the number of scalar iterations performed.</param>
/// <returns>The number of elements decoded: the smaller of the two array lengths.</returns>
private static unsafe int DecodeAsciiVectorizedViaCopy(byte[] ascii, char[] chars, out int vectorLoops, out int byteLoops)
{
    vectorLoops = 0;
    byteLoops = 0;

    int limit = Math.Min(ascii.Length, chars.Length);
    int position = 0;

    if (Vector.IsHardwareAccelerated)
    {
        int bytesPerVector = Vector<byte>.Count;
        int charsPerVector = Vector<ushort>.Count;
        int fullVectors = limit / bytesPerVector;

        // the scalar tail below resumes right after the last whole vector
        position = fullVectors * bytesPerVector;

        fixed (byte* sourceStart = ascii)
        fixed (char* targetStart = chars)
        {
            byte* src = sourceStart;
            char* dst = targetStart;
            for (int block = 0; block < fullVectors; block++)
            {
                Vector<byte> narrow = Unsafe.Read<Vector<byte>>(src);
                Vector.Widen(narrow, out Vector<ushort> low, out Vector<ushort> high);

                // the two widened halves land back-to-back in the output
                Unsafe.Write(dst, low);
                Unsafe.Write(dst + charsPerVector, high);

                src += bytesPerVector;
                dst += 2 * charsPerVector;
                vectorLoops++;
            }
        }
    }

    // any stragglers
    while (position < limit)
    {
        chars[position] = (char)ascii[position];
        byteLoops++;
        position++;
    }

    return limit;
}
/// <summary>
/// Vectorized decoder using spans: when <see cref="Vector.IsHardwareAccelerated"/> is true, treats
/// <paramref name="ascii"/> as a span of <c>Vector&lt;byte&gt;</c> and widens each element directly
/// into two consecutive <c>Vector&lt;ushort&gt;</c> slots of <paramref name="chars"/>. Any elements
/// that do not fill a whole vector (or all elements, without acceleration) are decoded one by one.
/// </summary>
/// <param name="ascii">Source bytes.</param>
/// <param name="chars">Destination chars.</param>
/// <param name="vectorLoops">Receives the number of vector iterations performed.</param>
/// <param name="byteLoops">Receives the number of scalar iterations performed.</param>
/// <returns>The number of elements decoded: the smaller of the two array lengths.</returns>
private static int DecodeAsciiVectorizedViaSpan(byte[] ascii, char[] chars, out int vectorLoops, out int byteLoops)
{
    vectorLoops = 0;
    byteLoops = 0;

    int limit = Math.Min(ascii.Length, chars.Length);
    int position = 0;

    if (Vector.IsHardwareAccelerated)
    {
        // treat ascii as a span of Vector<byte>
        var source = new Span<byte>(ascii)
            .NonPortableCast<byte, Vector<byte>>();

        // get a *reference* to the start of chars as a Vector<ushort>
        ref var target = ref new Span<char>(chars)
            .NonPortableCast<char, Vector<ushort>>()
            .DangerousGetPinnableReference();

        int fullVectors = limit / Vector<byte>.Count;

        // the scalar tail below resumes right after the last whole vector
        position = fullVectors * Vector<byte>.Count;

        for (int block = 0; block < fullVectors; block++)
        {
            // each input vector fills two consecutive output slots
            int slot = block * 2;
            Vector.Widen(source[block],
                out Unsafe.Add(ref target, slot),
                out Unsafe.Add(ref target, slot + 1));
            vectorLoops++;
        }
    }

    // any stragglers
    while (position < limit)
    {
        chars[position] = (char)ascii[position];
        byteLoops++;
        position++;
    }

    return limit;
}
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment