Skip to content

Instantly share code, notes, and snippets.

@alexyakunin
Created July 31, 2019 23:09
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save alexyakunin/8f18137df926f679affb2467ac139285 to your computer and use it in GitHub Desktop.
Save alexyakunin/8f18137df926f679affb2467ac139285 to your computer and use it in GitHub Desktop.
private static unsafe (long, int) ComputeSumSimd(
ReadOnlyMemory<byte> buffer, long sum, int n)
{
var span = buffer.Span;
fixed (byte* pStart = span) {
return ComputeSumSimd(new IntPtr(pStart), span.Length, sum, n);
}
}
private static unsafe (long, int) ComputeSumSimd(
IntPtr start, long length, long sum, int n)
{
var b7F = (byte) 127;
var bFF = (byte) 255;
var m7F = Avx2.BroadcastScalarToVector256(&b7F);
var mFF = Avx2.BroadcastScalarToVector256(&bFF);
var sum0 = Vector256<long>.Zero;
var sum7 = Vector256<long>.Zero;
var sum14 = Vector256<long>.Zero;
var sum21 = Vector256<long>.Zero;
var p = (byte*) start;
var pEnd = p + length;
// The following SIMD loop assumes n == 0 when it starts,
// so we have to reach this point "manually" here
if (n != 0) {
while (p < pEnd) {
var b = *p++;
n = (b & 127) + (n << 7);
if ((b & 128) != 0) {
sum += n;
n = 0;
break;
}
}
}
// SIMD loop
while (p + 36 <= pEnd) {
// Offset 0
var x = Avx2.LoadVector256(p);
// f indicates whether *p has flag (assuming p iterates through vector indexes);
// f[i] is either 0 (no flag) or -1 (i.e. all byte bits are set)
var f = Avx2.CompareGreaterThan(Vector256<sbyte>.Zero, x.AsSByte()).AsByte();
x = Avx2.And(x, m7F);
// Offset 1
var x1 = Avx2.LoadVector256(p + 1);
var f1 = Avx2.CompareGreaterThan(Vector256<sbyte>.Zero, x1.AsSByte()).AsByte();
// f01 indicates whether *p flag sequence is (0,1); similarly, f[i] is either 0 or -1
var f01 = Avx2.CompareGreaterThan(f.AsSByte(), f1.AsSByte()).AsByte();
// Offset 2
var x2 = Avx2.LoadVector256(p + 2);
var f2 = Avx2.CompareGreaterThan(Vector256<sbyte>.Zero, x2.AsSByte()).AsByte();
var f00 = Avx2.Or(f, f1);
// f001 indicates whether *p flag sequence is (0,0,1)
var f001 = Avx2.CompareGreaterThan(f00.AsSByte(), f2.AsSByte()).AsByte();
var f000 = Avx2.Or(f00, f2);
// f0001 indicates whether *p flag sequence is (0,0,0,1)
// we assume here that the 4th byte always has a flag (i.e. the encoding
// is valid), so we don't read it.
var f0001 = Avx2.CompareGreaterThan(f000.AsSByte(), mFF.AsSByte()).AsByte();
sum0 = Avx2.Add(sum0, Avx2.SumAbsoluteDifferences(Avx2.And(x, f), Vector256<byte>.Zero).AsInt64());
sum7 = Avx2.Add(sum7, Avx2.SumAbsoluteDifferences(Avx2.And(x, f01), Vector256<byte>.Zero).AsInt64());
sum14 = Avx2.Add(sum14, Avx2.SumAbsoluteDifferences(Avx2.And(x, f001), Vector256<byte>.Zero).AsInt64());
sum21 = Avx2.Add(sum21, Avx2.SumAbsoluteDifferences(Avx2.And(x, f0001), Vector256<byte>.Zero).AsInt64());
p += 32;
}
var s07 = Avx2.Add(sum0, Avx2.ShiftLeftLogical(sum7, 7));
var s1421 = Avx2.Add(Avx2.ShiftLeftLogical(sum14, 14), Avx2.ShiftLeftLogical(sum21, 21));
var s = Avx2.Add(s07, s1421);
sum += s.GetElement(0) + s.GetElement(1) + s.GetElement(2) + s.GetElement(3);
n = 0; // Fine assuming we'll process 4+ items in the following loop
while (p < pEnd) {
var b = *p++;
n = (b & 127) + (n << 7);
if ((b & 128) != 0) {
sum += n;
n = 0;
}
}
return (sum, n);
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment