Created
April 30, 2020 08:23
-
-
Save Turnerj/3887f6dcb8d56efa5c2881dd5793e3a7 to your computer and use it in GitHub Desktop.
Adler32 C# with Intrinsics (Not Passing Tests)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Copyright (c) Six Labors and contributors. | |
// See LICENSE for more details. | |
using System.Runtime.CompilerServices; | |
using System.Runtime.InteropServices; | |
#if NETCOREAPP3_1 | |
using System.Runtime.Intrinsics; | |
using System.Runtime.Intrinsics.X86; | |
#endif | |
namespace SixLabors.ZlibStream | |
{ | |
/// <summary>
/// Computes Adler-32 checksums. When compiled for .NET Core 3.1 and SSSE3 is
/// available, whole 32-byte blocks are processed with vector instructions; the
/// remaining bytes (or all bytes otherwise) are processed by a scalar loop.
/// </summary>
internal static class Adler32
{
    // Largest prime smaller than 65536.
    private const int BASE = 65521;

    // NMAX is the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1.
    private const int NMAX = 5552;

    /// <summary>
    /// Updates a running Adler-32 checksum with a range of bytes.
    /// </summary>
    /// <param name="adler">The checksum so far: s1 in bits 0-15, s2 in bits 16-31.</param>
    /// <param name="buffer">The array containing the bytes to add to the checksum.</param>
    /// <param name="index">The offset in <paramref name="buffer"/> of the first byte to process.</param>
    /// <param name="length">The number of bytes to process.</param>
    /// <returns>The updated checksum, recombined as <c>s1 | (s2 &lt;&lt; 16)</c>.</returns>
    /// <remarks>
    /// No argument validation is performed; the range described by
    /// <paramref name="index"/> and <paramref name="length"/> must lie inside
    /// <paramref name="buffer"/>.
    /// </remarks>
    [MethodImpl(InliningOptions.HotPath | InliningOptions.ShortMethod)]
    public static unsafe long Calculate(long adler, byte[] buffer, int index, int length)
    {
        /*
         * Split Adler-32 into component sums. Both halves are masked so that
         * stray bits above bit 31 of the 64-bit input cannot leak into s2.
         */
        var s1 = adler & 0xffff;
        var s2 = (adler >> 16) & 0xffff;

#if NETCOREAPP3_1
        // Only SSE2/SSE3/SSSE3 instructions are used below; SSSE3 support implies the others.
        if (Ssse3.IsSupported)
        {
            /*
             * Process the data in blocks.
             */
            const int BLOCK_SIZE = 1 << 5;

            var blocks = length / BLOCK_SIZE;

            fixed (byte* bufferPtr = buffer)
            {
                // Start at the caller supplied offset, not at the start of the array.
                var localBufferPtr = bufferPtr + index;

                // The scalar loop below continues after the bytes consumed here.
                length -= blocks * BLOCK_SIZE;
                index += blocks * BLOCK_SIZE;

                // Weights for s2: the first byte of a 32 byte block counts 32 times,
                // the last byte once. Element 0 of each vector pairs with the first
                // loaded byte, so the weights are listed in descending order.
                Vector128<sbyte> tap1 = Vector128.Create((sbyte)32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17);
                Vector128<sbyte> tap2 = Vector128.Create((sbyte)16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1);
                Vector128<byte> zero = Vector128<byte>.Zero;
                Vector128<short> ones = Vector128.Create((short)1);

                while (blocks > 0)
                {
                    var n = NMAX / BLOCK_SIZE; /* The NMAX constraint. */
                    if (n > blocks)
                    {
                        n = blocks;
                    }

                    blocks -= n;

                    /*
                     * Process n blocks of data. At most NMAX data bytes can be
                     * processed before s2 must be reduced modulo BASE.
                     */
                    Vector128<int> v_ps = Vector128.Create(0, 0, 0, (int)s1 * n);
                    Vector128<int> v_s2 = Vector128.Create(0, 0, 0, (int)s2);
                    Vector128<int> v_s1 = Vector128<int>.Zero;

                    do
                    {
                        /*
                         * Load 32 input bytes.
                         */
                        Vector128<byte> bytes1 = Sse3.LoadDquVector128(localBufferPtr);
                        Vector128<byte> bytes2 = Sse3.LoadDquVector128(localBufferPtr + 16);

                        /*
                         * Add previous block byte sum to v_ps.
                         */
                        v_ps = Sse2.Add(v_ps, v_s1);

                        /*
                         * Horizontally add the bytes for s1, multiply-adds the
                         * bytes by [ 32, 31, 30, ... ] for s2.
                         *
                         * The byte sums are produced in 16-bit lanes 0 and 4 with all
                         * other lanes zero, so the register is reinterpreted as four
                         * 32-bit lanes. A widening conversion would keep only the low
                         * four 16-bit lanes and drop the sum of the upper eight bytes.
                         */
                        v_s1 = Sse2.Add(v_s1, Sse2.SumAbsoluteDifferences(bytes1, zero).AsInt32());
                        Vector128<short> mad1 = Ssse3.MultiplyAddAdjacent(bytes1, tap1);
                        v_s2 = Sse2.Add(v_s2, Sse2.MultiplyAddAdjacent(mad1, ones));

                        v_s1 = Sse2.Add(v_s1, Sse2.SumAbsoluteDifferences(bytes2, zero).AsInt32());
                        Vector128<short> mad2 = Ssse3.MultiplyAddAdjacent(bytes2, tap2);
                        v_s2 = Sse2.Add(v_s2, Sse2.MultiplyAddAdjacent(mad2, ones));

                        localBufferPtr += BLOCK_SIZE;
                    }
                    while (--n > 0);

                    // Each of the n blocks contributes 32 (1 << 5) times the running s1 to s2.
                    v_s2 = Sse2.Add(v_s2, Sse2.ShiftLeftLogical(v_ps, 5));

                    /*
                     * Sum epi32 ints v_s1(s2) and accumulate in s1(s2).
                     */
                    const byte S2301 = 0b1011_0001; /* A B C D -> B A D C */
                    const byte S1032 = 0b0100_1110; /* A B C D -> C D A B */

                    v_s1 = Sse2.Add(v_s1, Sse2.Shuffle(v_s1, S2301));
                    v_s1 = Sse2.Add(v_s1, Sse2.Shuffle(v_s1, S1032));

                    s1 += Sse2.ConvertToInt32(v_s1);

                    v_s2 = Sse2.Add(v_s2, Sse2.Shuffle(v_s2, S2301));
                    v_s2 = Sse2.Add(v_s2, Sse2.Shuffle(v_s2, S1032));

                    // The NMAX bound keeps this sum below 2^32, not below 2^31, so the
                    // lane is read as unsigned to avoid sign-extending into the long.
                    s2 = Sse2.ConvertToUInt32(v_s2.AsUInt32());

                    /*
                     * Reduce.
                     */
                    s1 %= BASE;
                    s2 %= BASE;
                }
            }
        }
#endif

        /*
         * Handle leftover data.
         */
        if (length > 0)
        {
            ref byte bufferRef = ref MemoryMarshal.GetReference<byte>(buffer);

            // Unrolled: 16 bytes per iteration.
            while (length >= 16)
            {
                s1 += Unsafe.Add(ref bufferRef, index++);
                s2 += s1;
                s1 += Unsafe.Add(ref bufferRef, index++);
                s2 += s1;
                s1 += Unsafe.Add(ref bufferRef, index++);
                s2 += s1;
                s1 += Unsafe.Add(ref bufferRef, index++);
                s2 += s1;
                s1 += Unsafe.Add(ref bufferRef, index++);
                s2 += s1;
                s1 += Unsafe.Add(ref bufferRef, index++);
                s2 += s1;
                s1 += Unsafe.Add(ref bufferRef, index++);
                s2 += s1;
                s1 += Unsafe.Add(ref bufferRef, index++);
                s2 += s1;
                s1 += Unsafe.Add(ref bufferRef, index++);
                s2 += s1;
                s1 += Unsafe.Add(ref bufferRef, index++);
                s2 += s1;
                s1 += Unsafe.Add(ref bufferRef, index++);
                s2 += s1;
                s1 += Unsafe.Add(ref bufferRef, index++);
                s2 += s1;
                s1 += Unsafe.Add(ref bufferRef, index++);
                s2 += s1;
                s1 += Unsafe.Add(ref bufferRef, index++);
                s2 += s1;
                s1 += Unsafe.Add(ref bufferRef, index++);
                s2 += s1;
                s1 += Unsafe.Add(ref bufferRef, index++);
                s2 += s1;
                length -= 16;
            }

            // Fewer than 16 bytes remain.
            while (length > 0)
            {
                s1 += Unsafe.Add(ref bufferRef, index++);
                s2 += s1;
                length--;
            }

            s1 %= BASE;
            s2 %= BASE;
        }

        /*
         * Return the recombined sums.
         */
        return s1 | (s2 << 16);
    }
}
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment