-
-
Save alexyakunin/8f18137df926f679affb2467ac139285 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
private static unsafe (long, int) ComputeSumSimd( | |
ReadOnlyMemory<byte> buffer, long sum, int n) | |
{ | |
var span = buffer.Span; | |
fixed (byte* pStart = span) { | |
return ComputeSumSimd(new IntPtr(pStart), span.Length, sum, n); | |
} | |
} | |
private static unsafe (long, int) ComputeSumSimd( | |
IntPtr start, long length, long sum, int n) | |
{ | |
var b7F = (byte) 127; | |
var bFF = (byte) 255; | |
var m7F = Avx2.BroadcastScalarToVector256(&b7F); | |
var mFF = Avx2.BroadcastScalarToVector256(&bFF); | |
var sum0 = Vector256<long>.Zero; | |
var sum7 = Vector256<long>.Zero; | |
var sum14 = Vector256<long>.Zero; | |
var sum21 = Vector256<long>.Zero; | |
var p = (byte*) start; | |
var pEnd = p + length; | |
// The following SIMD loop assumes n == 0 when it starts, | |
// so we have to reach this point "manually" here | |
if (n != 0) { | |
while (p < pEnd) { | |
var b = *p++; | |
n = (b & 127) + (n << 7); | |
if ((b & 128) != 0) { | |
sum += n; | |
n = 0; | |
break; | |
} | |
} | |
} | |
// SIMD loop | |
while (p + 36 <= pEnd) { | |
// Offset 0 | |
var x = Avx2.LoadVector256(p); | |
// f indicates whether *p has flag (assuming p iterates through vector indexes); | |
// f[i] is either 0 (no flag) or -1 (i.e. all byte bits are set) | |
var f = Avx2.CompareGreaterThan(Vector256<sbyte>.Zero, x.AsSByte()).AsByte(); | |
x = Avx2.And(x, m7F); | |
// Offset 1 | |
var x1 = Avx2.LoadVector256(p + 1); | |
var f1 = Avx2.CompareGreaterThan(Vector256<sbyte>.Zero, x1.AsSByte()).AsByte(); | |
// f01 indicates whether *p flag sequence is (0,1); similarly, f[i] is either 0 or -1 | |
var f01 = Avx2.CompareGreaterThan(f.AsSByte(), f1.AsSByte()).AsByte(); | |
// Offset 2 | |
var x2 = Avx2.LoadVector256(p + 2); | |
var f2 = Avx2.CompareGreaterThan(Vector256<sbyte>.Zero, x2.AsSByte()).AsByte(); | |
var f00 = Avx2.Or(f, f1); | |
// f001 indicates whether *p flag sequence is (0,0,1) | |
var f001 = Avx2.CompareGreaterThan(f00.AsSByte(), f2.AsSByte()).AsByte(); | |
var f000 = Avx2.Or(f00, f2); | |
// f0001 indicates whether *p flag sequence is (0,0,0,1) | |
// we assume here that the 4th byte always has a flag (i.e. the encoding | |
// is valid), so we don't read it. | |
var f0001 = Avx2.CompareGreaterThan(f000.AsSByte(), mFF.AsSByte()).AsByte(); | |
sum0 = Avx2.Add(sum0, Avx2.SumAbsoluteDifferences(Avx2.And(x, f), Vector256<byte>.Zero).AsInt64()); | |
sum7 = Avx2.Add(sum7, Avx2.SumAbsoluteDifferences(Avx2.And(x, f01), Vector256<byte>.Zero).AsInt64()); | |
sum14 = Avx2.Add(sum14, Avx2.SumAbsoluteDifferences(Avx2.And(x, f001), Vector256<byte>.Zero).AsInt64()); | |
sum21 = Avx2.Add(sum21, Avx2.SumAbsoluteDifferences(Avx2.And(x, f0001), Vector256<byte>.Zero).AsInt64()); | |
p += 32; | |
} | |
var s07 = Avx2.Add(sum0, Avx2.ShiftLeftLogical(sum7, 7)); | |
var s1421 = Avx2.Add(Avx2.ShiftLeftLogical(sum14, 14), Avx2.ShiftLeftLogical(sum21, 21)); | |
var s = Avx2.Add(s07, s1421); | |
sum += s.GetElement(0) + s.GetElement(1) + s.GetElement(2) + s.GetElement(3); | |
n = 0; // Fine assuming we'll process 4+ items in the following loop | |
while (p < pEnd) { | |
var b = *p++; | |
n = (b & 127) + (n << 7); | |
if ((b & 128) != 0) { | |
sum += n; | |
n = 0; | |
} | |
} | |
return (sum, n); | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment