mjsabby/ParallelXxHash32.cs

## ParallelXxHash32.cs
namespace ParallelXxHash32
{
    using System;
    using System.Diagnostics;
    using System.Runtime.CompilerServices;
    using System.Runtime.Intrinsics;
    using System.Runtime.Intrinsics.X86;
    using BenchmarkDotNet.Attributes;
    using BenchmarkDotNet.Running;
    using static Extensions;

    internal static class Extensions
    {
        /// <summary>
        /// Expects 8 32-bit integers input and space for 8 32-bit integer output
        /// </summary>
        [MethodImpl(MethodImplOptions.NoInlining)]
        public unsafe static void ParallelXxHash32Avx2(uint* input, uint* output)
        {
            const uint PRIME32_2 = 2246822519U;
            const uint PRIME32_3 = 3266489917U;
            const uint PRIME32_4 = 668265263U;
            const uint PRIME32_5 = 374761393U;

            var h = Vector256.Create(PRIME32_5);
            h = Vector256.Add(h, Vector256.Create(4U));

            var v = Vector256.Load(input);
            h = Vector256.Add(h, Avx2.MultiplyLow(v, Vector256.Create(PRIME32_3)));
            h = Avx2.MultiplyLow(Vector256.BitwiseOr(Vector256.ShiftLeft(h, 17), Vector256.ShiftRightLogical(h, 32 - 17)), Vector256.Create(PRIME32_4));

            h = Vector256.Xor(h, Vector256.ShiftRightLogical(h, 15));
            h = Avx2.MultiplyLow(h, Vector256.Create(PRIME32_2));
            h = Vector256.Xor(h, Vector256.ShiftRightLogical(h, 13));
            h = Avx2.MultiplyLow(h, Vector256.Create(PRIME32_3));
            h = Vector256.Xor(h, Vector256.ShiftRightLogical(h, 16));

            Vector256.Store(h, output);
        }

        /// <summary>
        /// Expects 8 32-bit integers input and space for 8 32-bit integer output
        /// </summary>
        [MethodImpl(MethodImplOptions.NoInlining)]
        public unsafe static void ParallelXxHash32Arm(uint* input, uint* output)
        {
            XxHash32.TryHash


            const uint PRIME32_2 = 2246822519U;
            const uint PRIME32_3 = 3266489917U;
            const uint PRIME32_4 = 668265263U;
            const uint PRIME32_5 = 374761393U;

            var h = Vector256.Create(PRIME32_5);
            h = Vector256.Add(h, Vector256.Create(4U));

            var v = Vector256.Load(input);
            h = Vector256.Add(h, Avx2.MultiplyLow(v, Vector256.Create(PRIME32_3)));
            h = Avx2.MultiplyLow(Vector256.BitwiseOr(Vector256.ShiftLeft(h, 17), Vector256.ShiftRightLogical(h, 32 - 17)), Vector256.Create(PRIME32_4));

            h = Vector256.Xor(h, Vector256.ShiftRightLogical(h, 15));
            h = Avx2.MultiplyLow(h, Vector256.Create(PRIME32_2));
            h = Vector256.Xor(h, Vector256.ShiftRightLogical(h, 13));
            h = Avx2.MultiplyLow(h, Vector256.Create(PRIME32_3));
            h = Vector256.Xor(h, Vector256.ShiftRightLogical(h, 16));

            Vector256.Store(h, output);
        }
    }

    [DisassemblyDiagnoser]
    public unsafe class Benchmarks
    {
        private readonly uint[] arr = { 1, 4, 7, 10, 13, 16, 19, 22 };
        private uint[] result = new uint[8];

        [Benchmark]
        public unsafe void Test()
        {
            fixed (uint* ptr = &arr[0])
            fixed (uint* resultPtr = &result[0])
            {
                ParallelXxHash32(ptr, resultPtr);
            }
        }
    }

    internal static class Program
    {
        public static unsafe void Main(string[] args)
        {
            var sw = new Stopwatch();

            sw.Start();
            sw.Stop();
            sw.Reset();
            sw.Start();

            uint[] arr = { 1, 4, 7, 10, 13, 16, 19, 22 };
            uint total = 0;
            uint[] result = new uint[8];

            fixed (uint* keys = &arr[0])
            fixed (uint* resultPtr = &result[0])
            {
                total = MiniBenchmark(keys, resultPtr);
            }

            sw.Stop();

            Console.WriteLine($"{total} in {sw.ElapsedMilliseconds}ms");

            var summary = BenchmarkRunner.Run<Benchmarks>();
        }

        [MethodImpl(MethodImplOptions.NoInlining)]
        private static unsafe uint MiniBenchmark(uint* keys, uint* resultPtr)
        {
            uint total = 0;
            for (int i = 0; i < 999999999; i++)
            {
                ParallelXxHash32(keys, resultPtr);
                total += resultPtr[0];
            }

            return total;
        }
    }
}
	namespace ParallelXxHash32
	{
	using System;
	using System.Diagnostics;
	using System.Runtime.CompilerServices;
	using System.Runtime.Intrinsics;
	using System.Runtime.Intrinsics.X86;
	using BenchmarkDotNet.Attributes;
	using BenchmarkDotNet.Running;
	using static Extensions;

	internal static class Extensions
	{
	/// <summary>
	/// Expects 8 32-bit integers input and space for 8 32-bit integer output
	/// </summary>
	[MethodImpl(MethodImplOptions.NoInlining)]
	public unsafe static void ParallelXxHash32Avx2(uint* input, uint* output)
	{
	const uint PRIME32_2 = 2246822519U;
	const uint PRIME32_3 = 3266489917U;
	const uint PRIME32_4 = 668265263U;
	const uint PRIME32_5 = 374761393U;

	var h = Vector256.Create(PRIME32_5);
	h = Vector256.Add(h, Vector256.Create(4U));

	var v = Vector256.Load(input);
	h = Vector256.Add(h, Avx2.MultiplyLow(v, Vector256.Create(PRIME32_3)));
	h = Avx2.MultiplyLow(Vector256.BitwiseOr(Vector256.ShiftLeft(h, 17), Vector256.ShiftRightLogical(h, 32 - 17)), Vector256.Create(PRIME32_4));

	h = Vector256.Xor(h, Vector256.ShiftRightLogical(h, 15));
	h = Avx2.MultiplyLow(h, Vector256.Create(PRIME32_2));
	h = Vector256.Xor(h, Vector256.ShiftRightLogical(h, 13));
	h = Avx2.MultiplyLow(h, Vector256.Create(PRIME32_3));
	h = Vector256.Xor(h, Vector256.ShiftRightLogical(h, 16));

	Vector256.Store(h, output);
	}

	/// <summary>
	/// Expects 8 32-bit integers input and space for 8 32-bit integer output
	/// </summary>
	[MethodImpl(MethodImplOptions.NoInlining)]
	public unsafe static void ParallelXxHash32Arm(uint* input, uint* output)
	{
	XxHash32.TryHash


	const uint PRIME32_2 = 2246822519U;
	const uint PRIME32_3 = 3266489917U;
	const uint PRIME32_4 = 668265263U;
	const uint PRIME32_5 = 374761393U;

	var h = Vector256.Create(PRIME32_5);
	h = Vector256.Add(h, Vector256.Create(4U));

	var v = Vector256.Load(input);
	h = Vector256.Add(h, Avx2.MultiplyLow(v, Vector256.Create(PRIME32_3)));
	h = Avx2.MultiplyLow(Vector256.BitwiseOr(Vector256.ShiftLeft(h, 17), Vector256.ShiftRightLogical(h, 32 - 17)), Vector256.Create(PRIME32_4));

	h = Vector256.Xor(h, Vector256.ShiftRightLogical(h, 15));
	h = Avx2.MultiplyLow(h, Vector256.Create(PRIME32_2));
	h = Vector256.Xor(h, Vector256.ShiftRightLogical(h, 13));
	h = Avx2.MultiplyLow(h, Vector256.Create(PRIME32_3));
	h = Vector256.Xor(h, Vector256.ShiftRightLogical(h, 16));

	Vector256.Store(h, output);
	}
	}

	[DisassemblyDiagnoser]
	public unsafe class Benchmarks
	{
	private readonly uint[] arr = { 1, 4, 7, 10, 13, 16, 19, 22 };
	private uint[] result = new uint[8];

	[Benchmark]
	public unsafe void Test()
	{
	fixed (uint* ptr = &arr[0])
	fixed (uint* resultPtr = &result[0])
	{
	ParallelXxHash32(ptr, resultPtr);
	}
	}
	}

	internal static class Program
	{
	public static unsafe void Main(string[] args)
	{
	var sw = new Stopwatch();

	sw.Start();
	sw.Stop();
	sw.Reset();
	sw.Start();

	uint[] arr = { 1, 4, 7, 10, 13, 16, 19, 22 };
	uint total = 0;
	uint[] result = new uint[8];

	fixed (uint* keys = &arr[0])
	fixed (uint* resultPtr = &result[0])
	{
	total = MiniBenchmark(keys, resultPtr);
	}

	sw.Stop();

	Console.WriteLine($"{total} in {sw.ElapsedMilliseconds}ms");

	var summary = BenchmarkRunner.Run<Benchmarks>();
	}

	[MethodImpl(MethodImplOptions.NoInlining)]
	private static unsafe uint MiniBenchmark(uint* keys, uint* resultPtr)
	{
	uint total = 0;
	for (int i = 0; i < 999999999; i++)
	{
	ParallelXxHash32(keys, resultPtr);
	total += resultPtr[0];
	}

	return total;
	}
	}
	}