cocowalla/Maths.cs

## Maths.cs
// Non-intrinsics path is based on https://stackoverflow.com/a/51587262/25758, which is faster than any other portable 64-bit multiplication function I've found

public static class Maths
{
    /// <summary>
    /// Multiplies 2 unsigned 64-bit integers, returning the result in 2 ulongs representing the hi and lo bits
    /// of the resulting 128-bit integer
    /// </summary>
    /// <remarks>
    /// <seealso cref="System.Numerics.BigInteger"/> can perform multiplication on large integers, but it's
    /// comparatively slow, and an equivalent method allocates around 360B/call
    /// </remarks>
    /// <param name="x">First 64-bit integer</param>
    /// <param name="y">Second 64-bit integer</param>
    /// <returns>Product of <paramref name="x"/> and <paramref name="y"/></returns>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public static (ulong Hi, ulong Lo) Multiply64(ulong x, ulong y)
    {
        ulong hi;
        ulong lo;

// Use BMI2 intrinsics where available
#if NETCOREAPP3_0
        if (System.Runtime.Intrinsics.X86.Bmi2.X64.IsSupported)
        {
            hi = System.Runtime.Intrinsics.X86.Bmi2.X64.MultiplyNoFlags(x, y, &lo);
            return (hi, lo);
        }
#endif

        lo = x * y;

        ulong x0 = (uint)x;
        ulong x1 = x >> 32;

        ulong y0 = (uint)y;
        ulong y1 = y >> 32;

        ulong p11 = x1 * y1;
        ulong p01 = x0 * y1;
        ulong p10 = x1 * y0;
        ulong p00 = x0 * y0;

        // 64-bit product + two 32-bit values
        ulong middle = p10 + (p00 >> 32) + (uint)p01;

        // 64-bit product + two 32-bit values
        hi = p11 + (middle >> 32) + (p01 >> 32);

        return (hi, lo);
    }
}
	// Non-intrinsics path is based on https://stackoverflow.com/a/51587262/25758, which is faster than any other portable 64-bit multiplication function I've found

	public static class Maths
	{
	/// <summary>
	/// Multiplies 2 unsigned 64-bit integers, returning the result in 2 ulongs representing the hi and lo bits
	/// of the resulting 128-bit integer
	/// </summary>
	/// <remarks>
	/// <seealso cref="System.Numerics.BigInteger"/> can perform multiplication on large integers, but it's
	/// comparatively slow, and an equivalent method allocates around 360B/call
	/// </remarks>
	/// <param name="x">First 64-bit integer</param>
	/// <param name="y">Second 64-bit integer</param>
	/// <returns>Product of <paramref name="x"/> and <paramref name="y"/></returns>
	[MethodImpl(MethodImplOptions.AggressiveInlining)]
	public static (ulong Hi, ulong Lo) Multiply64(ulong x, ulong y)
	{
	ulong hi;
	ulong lo;

	// Use BMI2 intrinsics where available
	#if NETCOREAPP3_0
	if (System.Runtime.Intrinsics.X86.Bmi2.X64.IsSupported)
	{
	hi = System.Runtime.Intrinsics.X86.Bmi2.X64.MultiplyNoFlags(x, y, &lo);
	return (hi, lo);
	}
	#endif

	lo = x * y;

	ulong x0 = (uint)x;
	ulong x1 = x >> 32;

	ulong y0 = (uint)y;
	ulong y1 = y >> 32;

	ulong p11 = x1 * y1;
	ulong p01 = x0 * y1;
	ulong p10 = x1 * y0;
	ulong p00 = x0 * y0;

	// 64-bit product + two 32-bit values
	ulong middle = p10 + (p00 >> 32) + (uint)p01;

	// 64-bit product + two 32-bit values
	hi = p11 + (middle >> 32) + (p01 >> 32);

	return (hi, lo);
	}
	}