Last active
August 15, 2023 04:59
-
-
Save mjsabby/5b04f2d05b7922136ba8cf2f218aa6df to your computer and use it in GitHub Desktop.
Parallel XxHash32 in C# using AVX2
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
namespace ParallelXxHash32 | |
{ | |
using System; | |
using System.Diagnostics; | |
using System.Runtime.CompilerServices; | |
using System.Runtime.Intrinsics; | |
using System.Runtime.Intrinsics.X86; | |
using BenchmarkDotNet.Attributes; | |
using BenchmarkDotNet.Running; | |
using static Extensions; | |
internal static class Extensions | |
{ | |
/// <summary> | |
/// Expects 8 32-bit integers input and space for 8 32-bit integer output | |
/// </summary> | |
[MethodImpl(MethodImplOptions.NoInlining)] | |
public unsafe static void ParallelXxHash32Avx2(uint* input, uint* output) | |
{ | |
const uint PRIME32_2 = 2246822519U; | |
const uint PRIME32_3 = 3266489917U; | |
const uint PRIME32_4 = 668265263U; | |
const uint PRIME32_5 = 374761393U; | |
var h = Vector256.Create(PRIME32_5); | |
h = Vector256.Add(h, Vector256.Create(4U)); | |
var v = Vector256.Load(input); | |
h = Vector256.Add(h, Avx2.MultiplyLow(v, Vector256.Create(PRIME32_3))); | |
h = Avx2.MultiplyLow(Vector256.BitwiseOr(Vector256.ShiftLeft(h, 17), Vector256.ShiftRightLogical(h, 32 - 17)), Vector256.Create(PRIME32_4)); | |
h = Vector256.Xor(h, Vector256.ShiftRightLogical(h, 15)); | |
h = Avx2.MultiplyLow(h, Vector256.Create(PRIME32_2)); | |
h = Vector256.Xor(h, Vector256.ShiftRightLogical(h, 13)); | |
h = Avx2.MultiplyLow(h, Vector256.Create(PRIME32_3)); | |
h = Vector256.Xor(h, Vector256.ShiftRightLogical(h, 16)); | |
Vector256.Store(h, output); | |
} | |
/// <summary> | |
/// Expects 8 32-bit integers input and space for 8 32-bit integer output | |
/// </summary> | |
[MethodImpl(MethodImplOptions.NoInlining)] | |
public unsafe static void ParallelXxHash32Arm(uint* input, uint* output) | |
{ | |
XxHash32.TryHash | |
const uint PRIME32_2 = 2246822519U; | |
const uint PRIME32_3 = 3266489917U; | |
const uint PRIME32_4 = 668265263U; | |
const uint PRIME32_5 = 374761393U; | |
var h = Vector256.Create(PRIME32_5); | |
h = Vector256.Add(h, Vector256.Create(4U)); | |
var v = Vector256.Load(input); | |
h = Vector256.Add(h, Avx2.MultiplyLow(v, Vector256.Create(PRIME32_3))); | |
h = Avx2.MultiplyLow(Vector256.BitwiseOr(Vector256.ShiftLeft(h, 17), Vector256.ShiftRightLogical(h, 32 - 17)), Vector256.Create(PRIME32_4)); | |
h = Vector256.Xor(h, Vector256.ShiftRightLogical(h, 15)); | |
h = Avx2.MultiplyLow(h, Vector256.Create(PRIME32_2)); | |
h = Vector256.Xor(h, Vector256.ShiftRightLogical(h, 13)); | |
h = Avx2.MultiplyLow(h, Vector256.Create(PRIME32_3)); | |
h = Vector256.Xor(h, Vector256.ShiftRightLogical(h, 16)); | |
Vector256.Store(h, output); | |
} | |
} | |
[DisassemblyDiagnoser] | |
public unsafe class Benchmarks | |
{ | |
private readonly uint[] arr = { 1, 4, 7, 10, 13, 16, 19, 22 }; | |
private uint[] result = new uint[8]; | |
[Benchmark] | |
public unsafe void Test() | |
{ | |
fixed (uint* ptr = &arr[0]) | |
fixed (uint* resultPtr = &result[0]) | |
{ | |
ParallelXxHash32(ptr, resultPtr); | |
} | |
} | |
} | |
internal static class Program | |
{ | |
public static unsafe void Main(string[] args) | |
{ | |
var sw = new Stopwatch(); | |
sw.Start(); | |
sw.Stop(); | |
sw.Reset(); | |
sw.Start(); | |
uint[] arr = { 1, 4, 7, 10, 13, 16, 19, 22 }; | |
uint total = 0; | |
uint[] result = new uint[8]; | |
fixed (uint* keys = &arr[0]) | |
fixed (uint* resultPtr = &result[0]) | |
{ | |
total = MiniBenchmark(keys, resultPtr); | |
} | |
sw.Stop(); | |
Console.WriteLine($"{total} in {sw.ElapsedMilliseconds}ms"); | |
var summary = BenchmarkRunner.Run<Benchmarks>(); | |
} | |
[MethodImpl(MethodImplOptions.NoInlining)] | |
private static unsafe uint MiniBenchmark(uint* keys, uint* resultPtr) | |
{ | |
uint total = 0; | |
for (int i = 0; i < 999999999; i++) | |
{ | |
ParallelXxHash32(keys, resultPtr); | |
total += resultPtr[0]; | |
} | |
return total; | |
} | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
#include
#include
#include "immintrin.h"
__declspec(noinline) void parallel(const uint32_t* keys, uint32_t res[8])
{
const uint32_t SEED = 0x3afc8e77;
const uint32_t PRIME32_2 = 2246822519U;
const uint32_t PRIME32_3 = 3266489917U;
const uint32_t PRIME32_4 = 668265263U;
const uint32_t PRIME32_5 = 374761393U;
}
__declspec(noinline) uint32_t benchmark(const uint32_t* keys, uint32_t res[8])
{
uint32_t total = 0;
}
int main()
{
uint32_t lcols[][8] = { 1, 4, 7, 10, 13, 16, 19, 22 };
uint32_t total = 0;
uint32_t res[8];
auto t1 = std::chrono::high_resolution_clock::now();
total = benchmark(lcols[0], res);
auto t2 = std::chrono::high_resolution_clock::now();
std::chrono::duration<double, std::milli> ms_double = t2 - t1;
std::cout << total << " in " << ms_double.count() << "ms\n";
}