Skip to content

Instantly share code, notes, and snippets.

@mjsabby
Last active August 15, 2023 04:59
Show Gist options
  • Save mjsabby/5b04f2d05b7922136ba8cf2f218aa6df to your computer and use it in GitHub Desktop.
Save mjsabby/5b04f2d05b7922136ba8cf2f218aa6df to your computer and use it in GitHub Desktop.
Parallel XxHash32 in C# using AVX2
namespace ParallelXxHash32
{
using System;
using System.Diagnostics;
using System.Runtime.CompilerServices;
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.X86;
using BenchmarkDotNet.Attributes;
using BenchmarkDotNet.Running;
using static Extensions;
internal static class Extensions
{
/// <summary>
/// Expects 8 32-bit integers input and space for 8 32-bit integer output
/// </summary>
[MethodImpl(MethodImplOptions.NoInlining)]
public unsafe static void ParallelXxHash32Avx2(uint* input, uint* output)
{
const uint PRIME32_2 = 2246822519U;
const uint PRIME32_3 = 3266489917U;
const uint PRIME32_4 = 668265263U;
const uint PRIME32_5 = 374761393U;
var h = Vector256.Create(PRIME32_5);
h = Vector256.Add(h, Vector256.Create(4U));
var v = Vector256.Load(input);
h = Vector256.Add(h, Avx2.MultiplyLow(v, Vector256.Create(PRIME32_3)));
h = Avx2.MultiplyLow(Vector256.BitwiseOr(Vector256.ShiftLeft(h, 17), Vector256.ShiftRightLogical(h, 32 - 17)), Vector256.Create(PRIME32_4));
h = Vector256.Xor(h, Vector256.ShiftRightLogical(h, 15));
h = Avx2.MultiplyLow(h, Vector256.Create(PRIME32_2));
h = Vector256.Xor(h, Vector256.ShiftRightLogical(h, 13));
h = Avx2.MultiplyLow(h, Vector256.Create(PRIME32_3));
h = Vector256.Xor(h, Vector256.ShiftRightLogical(h, 16));
Vector256.Store(h, output);
}
/// <summary>
/// Expects 8 32-bit integers input and space for 8 32-bit integer output
/// </summary>
[MethodImpl(MethodImplOptions.NoInlining)]
public unsafe static void ParallelXxHash32Arm(uint* input, uint* output)
{
XxHash32.TryHash
const uint PRIME32_2 = 2246822519U;
const uint PRIME32_3 = 3266489917U;
const uint PRIME32_4 = 668265263U;
const uint PRIME32_5 = 374761393U;
var h = Vector256.Create(PRIME32_5);
h = Vector256.Add(h, Vector256.Create(4U));
var v = Vector256.Load(input);
h = Vector256.Add(h, Avx2.MultiplyLow(v, Vector256.Create(PRIME32_3)));
h = Avx2.MultiplyLow(Vector256.BitwiseOr(Vector256.ShiftLeft(h, 17), Vector256.ShiftRightLogical(h, 32 - 17)), Vector256.Create(PRIME32_4));
h = Vector256.Xor(h, Vector256.ShiftRightLogical(h, 15));
h = Avx2.MultiplyLow(h, Vector256.Create(PRIME32_2));
h = Vector256.Xor(h, Vector256.ShiftRightLogical(h, 13));
h = Avx2.MultiplyLow(h, Vector256.Create(PRIME32_3));
h = Vector256.Xor(h, Vector256.ShiftRightLogical(h, 16));
Vector256.Store(h, output);
}
}
[DisassemblyDiagnoser]
public unsafe class Benchmarks
{
private readonly uint[] arr = { 1, 4, 7, 10, 13, 16, 19, 22 };
private uint[] result = new uint[8];
[Benchmark]
public unsafe void Test()
{
fixed (uint* ptr = &arr[0])
fixed (uint* resultPtr = &result[0])
{
ParallelXxHash32(ptr, resultPtr);
}
}
}
internal static class Program
{
public static unsafe void Main(string[] args)
{
var sw = new Stopwatch();
sw.Start();
sw.Stop();
sw.Reset();
sw.Start();
uint[] arr = { 1, 4, 7, 10, 13, 16, 19, 22 };
uint total = 0;
uint[] result = new uint[8];
fixed (uint* keys = &arr[0])
fixed (uint* resultPtr = &result[0])
{
total = MiniBenchmark(keys, resultPtr);
}
sw.Stop();
Console.WriteLine($"{total} in {sw.ElapsedMilliseconds}ms");
var summary = BenchmarkRunner.Run<Benchmarks>();
}
[MethodImpl(MethodImplOptions.NoInlining)]
private static unsafe uint MiniBenchmark(uint* keys, uint* resultPtr)
{
uint total = 0;
for (int i = 0; i < 999999999; i++)
{
ParallelXxHash32(keys, resultPtr);
total += resultPtr[0];
}
return total;
}
}
}
@mjsabby
Copy link
Author

mjsabby commented Aug 15, 2023

#include
#include
#include "immintrin.h"

__declspec(noinline) void parallel(const uint32_t* keys, uint32_t res[8])
{
const uint32_t SEED = 0x3afc8e77;
const uint32_t PRIME32_2 = 2246822519U;
const uint32_t PRIME32_3 = 3266489917U;
const uint32_t PRIME32_4 = 668265263U;
const uint32_t PRIME32_5 = 374761393U;

__m256i h = _mm256_set1_epi32(SEED + PRIME32_5);
h = _mm256_add_epi32(h, _mm256_set1_epi32(4U));

__m256i v = _mm256_loadu_si256((__m256i*)(keys));
h = _mm256_add_epi32(h, _mm256_mullo_epi32(v, _mm256_set1_epi32(PRIME32_3)));
h = _mm256_mullo_epi32(_mm256_or_si256(_mm256_slli_epi32(h, 17), _mm256_srli_epi32(h, 32 - 17)), _mm256_set1_epi32(PRIME32_4));

h = _mm256_xor_si256(h, _mm256_srli_epi32(h, 15));
h = _mm256_mullo_epi32(h, _mm256_set1_epi32(PRIME32_2));
h = _mm256_xor_si256(h, _mm256_srli_epi32(h, 13));
h = _mm256_mullo_epi32(h, _mm256_set1_epi32(PRIME32_3));
h = _mm256_xor_si256(h, _mm256_srli_epi32(h, 16));

_mm256_storeu_si256((__m256i*) res, h);

}

__declspec(noinline) uint32_t benchmark(const uint32_t* keys, uint32_t res[8])
{
uint32_t total = 0;

for (int i = 0; i < 999999999; ++i)
{
    parallel(keys, res);
    total += res[0];
}

return total;

}

int main()
{
uint32_t lcols[][8] = { 1, 4, 7, 10, 13, 16, 19, 22 };
uint32_t total = 0;
uint32_t res[8];
auto t1 = std::chrono::high_resolution_clock::now();
total = benchmark(lcols[0], res);
auto t2 = std::chrono::high_resolution_clock::now();
std::chrono::duration<double, std::milli> ms_double = t2 - t1;
std::cout << total << " in " << ms_double.count() << "ms\n";
}

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment