Skip to content

Instantly share code, notes, and snippets.

@xoofx
Last active September 14, 2019 07:51
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save xoofx/7126ec8ac1ce01e98cef04771c48cb33 to your computer and use it in GitHub Desktop.
Save xoofx/7126ec8ac1ce01e98cef04771c48cb33 to your computer and use it in GitHub Desktop.
Experiment with XXH3Fast and XXH3FastSSE
using System.Runtime.CompilerServices;
using System.Runtime.Intrinsics;
using System.Text;
using BenchmarkDotNet.Attributes;
using StarkPlatform.NativeCompiler.Utils;
namespace StarkPlatform.NativeCompiler.Benchmarks
{
[DisassemblyDiagnoser]
public class BenchXXH3
{
[Benchmark]
public Vector128<ulong> FastHash()
{
var xxH3 = new XXH3Fast();
xxH3.Write(1);
xxH3.Write(2);
xxH3.Write(3);
xxH3.Write(4);
return xxH3.Compute(4);
}
[Benchmark]
public Vector128<ulong> FastHash_SSE()
{
var xxH3 = new XXH3FastSSE();
xxH3.Write(1);
xxH3.Write(2);
xxH3.Write(3);
xxH3.Write(4);
return xxH3.Compute(4);
}
static unsafe string ToString<T>(Vector256<T> vector) where T : struct
{
var ptr = (byte*)Unsafe.AsPointer(ref vector);
var text = new StringBuilder(32 * 2);
for (int i = 0; i < 32; i++)
{
if (i > 0 && (i % 4) == 0)
{
text.Append(' ');
}
text.Append(ptr[i].ToString("x2"));
}
return text.ToString();
}
static unsafe string ToString<T>(Vector128<T> vector) where T : struct
{
var ptr = (byte*) Unsafe.AsPointer(ref vector);
var text = new StringBuilder(16 * 2);
for (int i = 0; i < 16; i++)
{
if (i > 0 && (i % 4) == 0)
{
text.Append(' ');
}
text.Append(ptr[i].ToString("x2"));
}
return text.ToString();
}
}
}
using BenchmarkDotNet.Running;
namespace StarkPlatform.NativeCompiler.Benchmarks
{
public class Program
{
static void Main(string[] args)
{
BenchmarkRunner.Run<BenchXXH3>();
}
}
}
BenchmarkDotNet=v0.11.5, OS=Windows 10.0.18362
AMD Ryzen 9 3900X, 1 CPU, 24 logical and 12 physical cores
.NET Core SDK=3.0.100-preview9-014004
  [Host]     : .NET Core 3.0.0-preview9-19423-09 (CoreCLR 4.700.19.42102, CoreFX 4.700.19.42104), 64bit RyuJIT
  DefaultJob : .NET Core 3.0.0-preview9-19423-09 (CoreCLR 4.700.19.42102, CoreFX 4.700.19.42104), 64bit RyuJIT

Method Mean Error StdDev
FastHash 22.98 ns 0.2925 ns 0.2593 ns
FastHash_SSE 22.96 ns 0.1420 ns 0.1328 ns
// License BSD-2 Clause - Copyright (c) 2019-present, Alexandre Mutel
// Derived work from: https://github.com/Cyan4973/xxHash/blob/dev/xxh3.h
// xxHash - Extremely Fast Hash algorithm
// Development source file for `xxh3`
// Copyright (C) 2019-present, Yann Collet.
// BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
using System;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
using System.Runtime.Intrinsics;
namespace StarkPlatform.NativeCompiler.Utils
{
/// <summary>
/// An incremental 128-bits hash derived from XXH3
/// </summary>
/// <remarks>
/// This is a derived implementation from XXH3.
/// The main difference with the original algorithm is that we expect the working set
/// of data to be small (&lt; 1024 bytes), so we are basically rewriting the function
/// XXH3_accumulate_512
/// https://github.com/Cyan4973/xxHash/blob/77fd98f6b57c6c17ec43606c74fa8cb96eb17dc0/xxh3.h#L663-L684
/// to work per integer instead and we are only using 32 bytes for the accumulator (instead of 64 bytes)
/// as in the original XXH3
/// </remarks>
[StructLayout(LayoutKind.Explicit)]
public unsafe struct XXH3Fast
{
[FieldOffset(0)]
private fixed uint _accumulator[ACCUMULATOR_COUNT];
[FieldOffset(0)]
private fixed byte _accumulatorBytes[ACCUMULATOR_SIZE];
[FieldOffset(32)]
private int _secretOffset;
[FieldOffset(36)]
private int _accOffset;
/// <summary>
/// Write an integer value to the hash
/// </summary>
/// <param name="data">The integer data to add to the hash</param>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public void Write(uint data)
{
ref var localSecret = ref Unsafe.AddByteOffset(ref SecretUInt, new IntPtr(_secretOffset));
_secretOffset = (_secretOffset + sizeof(uint)) & (XXH_SECRET_DEFAULT_SIZE - 1);
var key1 = localSecret ^ data;
Unsafe.As<byte, uint>(ref _accumulatorBytes[_accOffset]) += (ushort)key1 * (key1 >> 16) + data;
_accOffset = (_accOffset + sizeof(uint)) & (ACCUMULATOR_SIZE - 1);
}
/// <summary>
/// Computes the final hash from previous written data.
/// </summary>
/// <param name="count">The number of <see cref="Write"/> issued</param>
/// <returns>A hash as a 128-bits</returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public Vector128<ulong> Compute(uint count)
{
return Vector128.Create(XXH3_mergeAccs_low(count * PRIME64_1), XXH3_mergeAccs_high(~(count * PRIME64_2)));
}
// ReSharper disable InconsistentNaming
private const ulong PRIME64_1 = 11400714785074694791UL;
private const ulong PRIME64_2 = 14029467366897019727UL;
private const ulong PRIME64_3 = 1609587929392839161UL;
private const int ACCUMULATOR_COUNT = 8;
private const int ACCUMULATOR_SIZE = ACCUMULATOR_COUNT * sizeof(uint);
private const int XXH_SECRET_DEFAULT_SIZE = 192;
private static ReadOnlySpan<byte> SecretAsBytes => new ReadOnlySpan<byte>(new byte[XXH_SECRET_DEFAULT_SIZE] {
0xb8, 0xfe, 0x6c, 0x39, 0x23, 0xa4, 0x4b, 0xbe, 0x7c, 0x01, 0x81, 0x2c, 0xf7, 0x21, 0xad, 0x1c,
0xde, 0xd4, 0x6d, 0xe9, 0x83, 0x90, 0x97, 0xdb, 0x72, 0x40, 0xa4, 0xa4, 0xb7, 0xb3, 0x67, 0x1f,
0xcb, 0x79, 0xe6, 0x4e, 0xcc, 0xc0, 0xe5, 0x78, 0x82, 0x5a, 0xd0, 0x7d, 0xcc, 0xff, 0x72, 0x21,
0xb8, 0x08, 0x46, 0x74, 0xf7, 0x43, 0x24, 0x8e, 0xe0, 0x35, 0x90, 0xe6, 0x81, 0x3a, 0x26, 0x4c,
0x3c, 0x28, 0x52, 0xbb, 0x91, 0xc3, 0x00, 0xcb, 0x88, 0xd0, 0x65, 0x8b, 0x1b, 0x53, 0x2e, 0xa3,
0x71, 0x64, 0x48, 0x97, 0xa2, 0x0d, 0xf9, 0x4e, 0x38, 0x19, 0xef, 0x46, 0xa9, 0xde, 0xac, 0xd8,
0xa8, 0xfa, 0x76, 0x3f, 0xe3, 0x9c, 0x34, 0x3f, 0xf9, 0xdc, 0xbb, 0xc7, 0xc7, 0x0b, 0x4f, 0x1d,
0x8a, 0x51, 0xe0, 0x4b, 0xcd, 0xb4, 0x59, 0x31, 0xc8, 0x9f, 0x7e, 0xc9, 0xd9, 0x78, 0x73, 0x64,
0xea, 0xc5, 0xac, 0x83, 0x34, 0xd3, 0xeb, 0xc3, 0xc5, 0x81, 0xa0, 0xff, 0xfa, 0x13, 0x63, 0xeb,
0x17, 0x0d, 0xdd, 0x51, 0xb7, 0xf0, 0xda, 0x49, 0xd3, 0x16, 0x55, 0x26, 0x29, 0xd4, 0x68, 0x9e,
0x2b, 0x16, 0xbe, 0x58, 0x7d, 0x47, 0xa1, 0xfc, 0x8f, 0xf8, 0xb8, 0xd1, 0x7a, 0xd0, 0x31, 0xce,
0x45, 0xcb, 0x3a, 0x8f, 0x95, 0x16, 0x04, 0x28, 0xaf, 0xd7, 0xfb, 0xca, 0xbb, 0x4b, 0x40, 0x7e,
});
private static ref uint SecretUInt
{
[MethodImpl(MethodImplOptions.AggressiveInlining)]
get => ref Unsafe.As<byte, uint>(ref Unsafe.AsRef(SecretAsBytes.GetPinnableReference()));
}
private static ref uint SecretUIntLow
{
[MethodImpl(MethodImplOptions.AggressiveInlining)]
get => ref Unsafe.As<byte, uint>(ref Unsafe.AsRef(SecretAsBytes[11]));
}
private static ref uint SecretUIntHigh
{
[MethodImpl(MethodImplOptions.AggressiveInlining)]
get => ref Unsafe.As<byte, uint>(ref Unsafe.AsRef(SecretAsBytes[XXH_SECRET_DEFAULT_SIZE - ACCUMULATOR_SIZE - 11]));
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static ulong XXH3_mix2Accs(uint acc1, uint acc2, uint secret1, uint secret2)
{
return (ulong)(acc1 ^ secret1) * (acc2 ^ secret2);
}
private ulong XXH3_mergeAccs_low(ulong start)
{
// inlined version of XXH3_mergeAccs with SecretUIntLow
ulong result64 = start;
result64 += XXH3_mix2Accs(_accumulator[0], _accumulator[1], 0xad21f72c, 0x6dd4de1c);
result64 += XXH3_mix2Accs(_accumulator[2], _accumulator[3], 0x979083e9, 0xa44072db);
result64 += XXH3_mix2Accs(_accumulator[4], _accumulator[5], 0x67b3b7a4, 0xe679cb1f);
result64 += XXH3_mix2Accs(_accumulator[6], _accumulator[7], 0xe5c0cc4e, 0xd05a8278);
return XXH3_avalanche(result64);
}
private ulong XXH3_mergeAccs_high(ulong start)
{
// inlined version of XXH3_mergeAccs with SecretUIntHigh
ulong result64 = start;
result64 += XXH3_mix2Accs(_accumulator[0], _accumulator[1], 0xd349daf0, 0x29265516);
result64 += XXH3_mix2Accs(_accumulator[2], _accumulator[3], 0x2b9e68d4, 0x7d58be16);
result64 += XXH3_mix2Accs(_accumulator[4], _accumulator[5], 0x8ffca147, 0x7ad1b8f8);
result64 += XXH3_mix2Accs(_accumulator[6], _accumulator[7], 0x45ce31d0, 0x958f3acb);
return XXH3_avalanche(result64);
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static ulong XXH3_avalanche(ulong h64)
{
h64 ^= h64 >> 37;
h64 *= PRIME64_3;
h64 ^= h64 >> 32;
return h64;
}
// ReSharper restore InconsistentNaming
}
}
// License BSD-2 Clause - Copyright (c) 2019-present, Alexandre Mutel
// Derived work from: https://github.com/Cyan4973/xxHash/blob/dev/xxh3.h
// xxHash - Extremely Fast Hash algorithm
// Development source file for `xxh3`
// Copyright (C) 2019-present, Yann Collet.
// BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
using System;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.X86;
namespace StarkPlatform.NativeCompiler.Utils
{
/// <summary>
/// An incremental 128-bits hash derived from XXH3. SSE version of <see cref="XXH3Fast"/> but SSE is only used in the final compute hash.
/// </summary>
/// <remarks>
/// This is a derived implementation from XXH3.
/// The main difference with the original algorithm is that we expect the working set
/// of data to be small (&lt; 1024 bytes), so we are basically rewriting the function
/// XXH3_accumulate_512
/// https://github.com/Cyan4973/xxHash/blob/77fd98f6b57c6c17ec43606c74fa8cb96eb17dc0/xxh3.h#L663-L684
/// to work per integer instead and we are only using 32 bytes for the accumulator (instead of 64 bytes)
/// as in the original XXH3
/// </remarks>
[StructLayout(LayoutKind.Explicit)]
public unsafe struct XXH3FastSSE
{
[FieldOffset(0)]
private Vector128<uint> _a;
[FieldOffset(16)]
private Vector128<uint> _b;
[FieldOffset(0)]
private fixed uint _accumulator[ACCUMULATOR_COUNT];
[FieldOffset(0)]
private fixed byte _accumulatorBytes[ACCUMULATOR_SIZE];
[FieldOffset(32)]
private int _secretOffset;
[FieldOffset(36)]
private int _accOffset;
/// <summary>
/// Write an integer value to the hash
/// </summary>
/// <param name="data">The integer data to add to the hash</param>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public void Write(uint data)
{
ref var localSecret = ref Unsafe.AddByteOffset(ref SecretUInt, new IntPtr(_secretOffset));
_secretOffset = (_secretOffset + sizeof(uint)) & (XXH_SECRET_DEFAULT_SIZE - 1);
var key1 = localSecret ^ data;
Unsafe.As<byte, uint>(ref _accumulatorBytes[_accOffset]) += (ushort)key1 * (key1 >> 16) + data;
_accOffset = (_accOffset + sizeof(uint)) & (ACCUMULATOR_SIZE - 1);
}
/// <summary>
/// Computes the final hash from previous written data.
/// </summary>
/// <param name="count">The number of <see cref="Write"/> issued</param>
/// <returns>A hash as a 128-bits</returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public Vector128<ulong> Compute(uint count)
{
return Vector128.Create(XXH3_mergeAccs_low(count * PRIME64_1), XXH3_mergeAccs_high(~(count * PRIME64_2)));
}
// ReSharper disable InconsistentNaming
private const ulong PRIME64_1 = 11400714785074694791UL;
private const ulong PRIME64_2 = 14029467366897019727UL;
private const ulong PRIME64_3 = 1609587929392839161UL;
private const int ACCUMULATOR_COUNT = 8;
private const int ACCUMULATOR_SIZE = ACCUMULATOR_COUNT * sizeof(uint);
private const int XXH_SECRET_DEFAULT_SIZE = 192;
private static ReadOnlySpan<byte> SecretAsBytes => new ReadOnlySpan<byte>(new byte[XXH_SECRET_DEFAULT_SIZE] {
0xb8, 0xfe, 0x6c, 0x39, 0x23, 0xa4, 0x4b, 0xbe, 0x7c, 0x01, 0x81, 0x2c, 0xf7, 0x21, 0xad, 0x1c,
0xde, 0xd4, 0x6d, 0xe9, 0x83, 0x90, 0x97, 0xdb, 0x72, 0x40, 0xa4, 0xa4, 0xb7, 0xb3, 0x67, 0x1f,
0xcb, 0x79, 0xe6, 0x4e, 0xcc, 0xc0, 0xe5, 0x78, 0x82, 0x5a, 0xd0, 0x7d, 0xcc, 0xff, 0x72, 0x21,
0xb8, 0x08, 0x46, 0x74, 0xf7, 0x43, 0x24, 0x8e, 0xe0, 0x35, 0x90, 0xe6, 0x81, 0x3a, 0x26, 0x4c,
0x3c, 0x28, 0x52, 0xbb, 0x91, 0xc3, 0x00, 0xcb, 0x88, 0xd0, 0x65, 0x8b, 0x1b, 0x53, 0x2e, 0xa3,
0x71, 0x64, 0x48, 0x97, 0xa2, 0x0d, 0xf9, 0x4e, 0x38, 0x19, 0xef, 0x46, 0xa9, 0xde, 0xac, 0xd8,
0xa8, 0xfa, 0x76, 0x3f, 0xe3, 0x9c, 0x34, 0x3f, 0xf9, 0xdc, 0xbb, 0xc7, 0xc7, 0x0b, 0x4f, 0x1d,
0x8a, 0x51, 0xe0, 0x4b, 0xcd, 0xb4, 0x59, 0x31, 0xc8, 0x9f, 0x7e, 0xc9, 0xd9, 0x78, 0x73, 0x64,
0xea, 0xc5, 0xac, 0x83, 0x34, 0xd3, 0xeb, 0xc3, 0xc5, 0x81, 0xa0, 0xff, 0xfa, 0x13, 0x63, 0xeb,
0x17, 0x0d, 0xdd, 0x51, 0xb7, 0xf0, 0xda, 0x49, 0xd3, 0x16, 0x55, 0x26, 0x29, 0xd4, 0x68, 0x9e,
0x2b, 0x16, 0xbe, 0x58, 0x7d, 0x47, 0xa1, 0xfc, 0x8f, 0xf8, 0xb8, 0xd1, 0x7a, 0xd0, 0x31, 0xce,
0x45, 0xcb, 0x3a, 0x8f, 0x95, 0x16, 0x04, 0x28, 0xaf, 0xd7, 0xfb, 0xca, 0xbb, 0x4b, 0x40, 0x7e,
});
private static ref uint SecretUInt
{
[MethodImpl(MethodImplOptions.AggressiveInlining)]
get => ref Unsafe.As<byte, uint>(ref Unsafe.AsRef(SecretAsBytes.GetPinnableReference()));
}
private static ref uint SecretUIntLow
{
[MethodImpl(MethodImplOptions.AggressiveInlining)]
get => ref Unsafe.As<byte, uint>(ref Unsafe.AsRef(SecretAsBytes[11]));
}
private static ref uint SecretUIntLow128
{
[MethodImpl(MethodImplOptions.AggressiveInlining)]
get => ref Unsafe.As<byte, uint>(ref Unsafe.AsRef(SecretAsBytes[11 + 16]));
}
private static ref uint SecretUIntHigh
{
[MethodImpl(MethodImplOptions.AggressiveInlining)]
get => ref Unsafe.As<byte, uint>(ref Unsafe.AsRef(SecretAsBytes[XXH_SECRET_DEFAULT_SIZE - ACCUMULATOR_SIZE - 11]));
}
private static ref uint SecretUIntHigh128
{
[MethodImpl(MethodImplOptions.AggressiveInlining)]
get => ref Unsafe.As<byte, uint>(ref Unsafe.AsRef(SecretAsBytes[XXH_SECRET_DEFAULT_SIZE - ACCUMULATOR_SIZE - 11 + 16]));
}
private ulong XXH3_mergeAccs_low(ulong start)
{
var a = Sse2.Xor(_a, Sse2.LoadVector128((uint*)Unsafe.AsPointer(ref SecretUIntLow)));
var lefta = Sse2.Multiply(a, Vector128.AsUInt32(Sse2.Shuffle(Vector128.AsSingle(a), Vector128.AsSingle(a), 0b_10_11_00_01)));
var b = Sse2.Xor(_b, Sse2.LoadVector128((uint*)Unsafe.AsPointer(ref SecretUIntLow128)));
var leftb = Sse2.Multiply(b, Vector128.AsUInt32(Sse2.Shuffle(Vector128.AsSingle(b), Vector128.AsSingle(b), 0b_10_11_00_01)));
var result = Sse2.Add(lefta, leftb);
var result64 = Sse41.X64.Extract(Sse2.Add(result, Vector128.AsUInt64(Sse2.Shuffle(Vector128.AsSingle(result), Vector128.AsSingle(result), 0b_00_00_11_10))), 0);
return XXH3_avalanche(result64 + start);
}
private ulong XXH3_mergeAccs_high(ulong start)
{
var a = Sse2.Xor(_a, Sse2.LoadVector128((uint*)Unsafe.AsPointer(ref SecretUIntHigh)));
var lefta = Sse2.Multiply(a, Vector128.AsUInt32(Sse2.Shuffle(Vector128.AsSingle(a), Vector128.AsSingle(a), 0b_10_11_00_01)));
var b = Sse2.Xor(_b, Sse2.LoadVector128((uint*)Unsafe.AsPointer(ref SecretUIntHigh128)));
var leftb = Sse2.Multiply(b, Vector128.AsUInt32(Sse2.Shuffle(Vector128.AsSingle(b), Vector128.AsSingle(b), 0b_10_11_00_01)));
var result = Sse2.Add(lefta, leftb);
var result64 = Sse41.X64.Extract(Sse2.Add(result, Vector128.AsUInt64(Sse2.Shuffle(Vector128.AsSingle(result), Vector128.AsSingle(result), 0b_00_00_11_10))), 0);
return XXH3_avalanche(result64 + start);
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static ulong XXH3_avalanche(ulong h64)
{
h64 ^= h64 >> 37;
h64 *= PRIME64_3;
h64 ^= h64 >> 32;
return h64;
}
// ReSharper restore InconsistentNaming
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment