Skip to content

Instantly share code, notes, and snippets.

@gfoidl
Last active October 17, 2022 11:02
Show Gist options
  • Save gfoidl/c8e883c9432d994a4a4d7c30b501aef1 to your computer and use it in GitHub Desktop.
Save gfoidl/c8e883c9432d994a4a4d7c30b501aef1 to your computer and use it in GitHub Desktop.
AVX2 random
// Cf. https://github.com/BenjaminAbt/SustainableCode/tree/main/csharp/random-string
// And see also https://github.com/BenjaminAbt/SustainableCode/blob/main/csharp/random-string-vector
//#define BENCH
using System.Diagnostics;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.X86;
using BenchmarkDotNet.Attributes;
// Entry point (top-level statements).
//  * DEBUG build:            print one scalar sample and ten vectorized samples for eyeballing.
//  * Release with BENCH:     run the BenchmarkDotNet suite.
//  * Release without BENCH:  call the vectorized path 100 times, pausing 100 ms before every
//                            tenth call (presumably to give tiered JIT time to promote the
//                            method, e.g. for inspecting the disassembly -- confirm).
Benchmark bench = new() { CharLength = 82 };

#if DEBUG
Console.WriteLine(bench.StringCreate());
Console.WriteLine();

for (int sample = 0; sample < 10; ++sample)
{
    Console.WriteLine(bench.Vectorized());
}
#else
#if BENCH
BenchmarkDotNet.Running.BenchmarkRunner.Run<Benchmark>();
#else
for (int call = 0; call < 100; ++call)
{
    if (call % 10 == 0)
    {
        Thread.Sleep(100);
    }

    _ = bench.Vectorized();
}
#endif
#endif
/// <summary>
/// BenchmarkDotNet harness that compares the scalar <c>string.Create</c> sample (baseline)
/// with the vectorized sample for several string lengths.
/// </summary>
public class Benchmark
{
    /// <summary>Number of characters in each generated string.</summary>
    [Params(10, 100, 1000)]
    public int CharLength { get; set; } = 100;

    /// <summary>Baseline: generates the string with the scalar sample.</summary>
    [Benchmark(Baseline = true)]
    public string StringCreate()
    {
        return StringCreateSample.CreateRandomString(CharLength);
    }

    /// <summary>Generates the string with the vectorized sample.</summary>
    [Benchmark]
    public string Vectorized()
    {
        return VectorSample.CreateRandomString(CharLength);
    }
}
/// <summary>Character sets shared by the random-string samples.</summary>
public static class SampleConstants
{
    /// <summary>The 26 upper-case ASCII letters.</summary>
    public const string UpperChars = "ABCDEFGHIJKLMNOPQRSTUVWXYZ";

    /// <summary>The 26 lower-case ASCII letters.</summary>
    public const string LowerChars = "abcdefghijklmnopqrstuvwxyz";

    /// <summary>The 10 decimal digits.</summary>
    public const string Digits = "0123456789";

    /// <summary>Upper-case letters, then lower-case letters, then digits (62 characters).</summary>
    public const string AlphNum = $"{UpperChars}{LowerChars}{Digits}";
}
/// <summary>
/// Scalar reference implementation: fills a string character by character from
/// <see cref="SampleConstants.AlphNum"/> using a fixed-seed <see cref="Random"/>.
/// </summary>
public static class StringCreateSample
{
    // Seeded with 0; shared by every call.
    private static readonly Random s_random = new(0);

    /// <summary>Creates a string of <paramref name="length"/> random alphanumeric characters.</summary>
    /// <param name="length">Number of characters to generate.</param>
    /// <returns>The newly created string.</returns>
    public static string CreateRandomString(int length)
        => string.Create<object?>(length, null, static (buffer, _) => CreateRandomString(buffer));

    // Overwrites every element of the buffer with a randomly chosen alphabet character.
    private static void CreateRandomString(Span<char> buffer)
    {
        const string Alphabet = SampleConstants.AlphNum;

        foreach (ref char slot in buffer)
        {
            slot = Alphabet[s_random.Next(Alphabet.Length)];
        }
    }
}
/// <summary>
/// Creates random alphanumeric strings with an AVX2/FMA vectorized kernel and falls back to a
/// scalar loop when the required instruction sets are unavailable or the string is shorter
/// than two <c>Vector256&lt;ushort&gt;</c> (32 chars).
/// </summary>
public static class VectorSample
{
    // Fixed-seed generator, used only by the scalar fallback.
    private static readonly Random s_random = new(0);

    /// <summary>Creates a string of <paramref name="length"/> random alphanumeric characters.</summary>
    /// <param name="length">Number of characters to generate.</param>
    /// <returns>The newly created string.</returns>
    public static string CreateRandomString(int length)
    {
        return string.Create<object?>(length, null, static (buffer, _) => CreateRandomString(buffer));
    }

    // Dispatches to the vectorized or the scalar implementation.
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    private static unsafe void CreateRandomString(Span<char> buffer)
    {
        // The vectorized kernel uses Fma.MultiplySubtract in addition to AVX2 instructions.
        // AVX2 and FMA are reported independently, so both must be checked; otherwise the FMA
        // call throws PlatformNotSupportedException where only AVX2 is available.
        if (Avx2.IsSupported && Fma.IsSupported && buffer.Length >= 2 * Vector256<ushort>.Count)
        {
#if DEBUG
            // Helps to detect writes beyond the allowed range -- see below.
            buffer[^1] = '=';
#endif
            // For JIT-TC to kick in no stackalloc must occur.
            // Thus it's hoisted to here. And that's why the scalar path is moved
            // into its own method.
            byte* seedChars = stackalloc byte[64];
            CreateRandomStringVectorized(buffer, seedChars);
        }
        else
        {
            CreateRandomStringScalar(buffer);
        }
    }

    // Scalar fallback: one Random.Next call per character.
    private static void CreateRandomStringScalar(Span<char> buffer)
    {
        const string Chars = SampleConstants.AlphNum;
        int charsLength = Chars.Length;

        for (int i = 0; i < buffer.Length; ++i)
        {
            buffer[i] = Chars[s_random.Next(charsLength)];
        }
    }

    // Vectorized kernel. Requires buffer.Length >= 32 and a 64-byte scratch area in seedChars.
    // Writes 32 chars per iteration; the tail is handled by one final (possibly overlapping)
    // write that ends exactly at the end of the buffer.
    [SkipLocalsInit]
    private static unsafe void CreateRandomStringVectorized(Span<char> buffer, byte* seedChars)
    {
        const string Chars = SampleConstants.AlphNum;
        Debug.Assert(Chars.Length == 62);

        // Eight 32-bit xorshift states, one per int lane.
        Vector256<int> seed = Vector256.Create(
            Random.Shared.NextInt64(),
            Random.Shared.NextInt64(),
            Random.Shared.NextInt64(),
            Random.Shared.NextInt64()
        ).AsInt32();

        // seedChars could also be given as ROS<byte>, depending on use case.
        // Especially with C# 11's UTF-8 literals, e.g. "ABCD..."u8
        ref ushort chars = ref Unsafe.As<char, ushort>(ref Unsafe.AsRef(Chars.GetPinnableReference()));
        PackToBytes(ref chars, seedChars);

        Vector256<byte> seedVec0 = Vector256.Load(seedChars);
        Vector256<byte> seedVec1 = Vector256.Load(seedChars + Vector256<byte>.Count);

        // Exclusive upper bound of the generated indices: [0, 32) truncates to 0..31.
        // (Count - 1 would yield only 0..30, so the masked indices 15 (byte shuffle) and
        // 7 (dword permute) would be picked less often than all the others.)
        // (2 - 2^-23) * 32 - 32 = 32 - 2^-18 is exactly representable, so 32 is never produced.
        Vector256<float> upperForVec = Vector256.Create((float)Vector256<byte>.Count);
        Vector256<float> one = Vector256.Create(1f);
        Vector256<int> mantissaMask = Vector256.Create(0x7FFFFF);

        ref ushort dest = ref Unsafe.As<char, ushort>(ref MemoryMarshal.GetReference(buffer));
        ref ushort twoVectorsAwayFromEnd = ref Unsafe.Add(ref dest, (uint)(buffer.Length - 2 * Vector256<ushort>.Count));

        do
        {
            Core(ref dest, seedVec0, seedVec1, ref seed, mantissaMask, one, upperForVec);
            dest = ref Unsafe.Add(ref dest, 2 * Vector256<ushort>.Count);
        }
        while (Unsafe.IsAddressLessThan(ref dest, ref twoVectorsAwayFromEnd));

        // Final block, aligned to the end of the buffer; may overlap the previous one.
        Core(ref twoVectorsAwayFromEnd, seedVec0, seedVec1, ref seed, mantissaMask, one, upperForVec);
        //---------------------------------------------------------------------
        // Narrows the 62 UTF-16 alphabet chars to bytes in seed[0..61] and fills
        // seed[62..63] with two additional randomly chosen alphabet chars.
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        static void PackToBytes(ref ushort chars, byte* seed)
        {
            ref short charsAsInt16 = ref Unsafe.As<ushort, short>(ref chars);
#if DEBUG
            // Clear the seed (len = 32, i.e. Vector256<byte> size)
            Vector256<byte>.Zero.Store(seed);

            // Helps to detect writes beyond the allowed range -- see below.
            seed[62] = (byte)'=';
            seed[63] = (byte)'=';
#endif
            // We read 32 chars, pack them to 32 bytes
            // Then 30 chars remain
            //
            // Use hw-intrinsics as they don't perform additional AND like Vector256.Narrow does
            Vector256<byte> narrowed256 = Avx2.PackUnsignedSaturate(
                Vector256.LoadUnsafe(ref charsAsInt16),
                Vector256.LoadUnsafe(ref charsAsInt16, (uint)Vector256<ushort>.Count));

            // The 256-bit pack works per 128-bit lane, so restore the element order.
            narrowed256 = Avx2.Permute4x64(narrowed256.AsInt64(), 0b_11_01_10_00).AsByte();
            narrowed256.Store(seed);

            nuint offset = 2 * (uint)Vector256<ushort>.Count;

            // We read 16 chars, pack them to 16 bytes
            // Then 14 chars remain
            Vector128<byte> narrowed128 = Sse2.PackUnsignedSaturate(
                Vector128.LoadUnsafe(ref charsAsInt16, offset),
                Vector128.LoadUnsafe(ref charsAsInt16, offset + (uint)Vector128<ushort>.Count));
            narrowed128.Store(seed + Vector256<byte>.Count);

            // For the remaining 14 chars we read 16 chars from the end, as the operation is idempotent.
            offset = 62 - 2 * (uint)Vector128<ushort>.Count;
            narrowed128 = Sse2.PackUnsignedSaturate(
                Vector128.LoadUnsafe(ref charsAsInt16, offset),
                Vector128.LoadUnsafe(ref charsAsInt16, offset + (uint)Vector128<ushort>.Count));
            narrowed128.Store(seed + offset);
#if DEBUG
            Debug.Assert(seed[62] == (byte)'=');
            Debug.Assert(seed[63] == (byte)'=');
#endif
            // The 62 Chars are narrowed to 62 bytes, so add another two random bytes (chars)
            // so the whole range of 64 bytes can be used. With regard to entropy it would be
            // better to leave them off, as 2/62 are more likely this way. But for speed it's
            // better.
            seed[62] = (byte)Unsafe.Add(ref chars, Random.Shared.Next(62));
            seed[63] = (byte)Unsafe.Add(ref chars, Random.Shared.Next(62));
        }
        //---------------------------------------------------------------------
        // Produces 32 random chars and stores them at dest (two Vector256<ushort> stores).
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        static void Core(
            ref ushort dest,
            Vector256<byte> seedVec0,
            Vector256<byte> seedVec1,
            ref Vector256<int> seed,
            Vector256<int> mantissaMask,
            Vector256<float> one,
            Vector256<float> upperForVector)
        {
            Vector256<byte> shuffleMask = NextRandomByteVector(ref seed, mantissaMask, one, upperForVector);

            //               seedVec0                            seedVec1
            // ABCDEFGHIJKLMNOP QRSTUVWXYZabcdef    ghijklmnopqrstuv wxyz0123456789<>
            //                 vec0                                vec1
            // ANBGJCKHNKIMKLDE ZUVXWRVSUbQWZVZR    gthmpiqntqosqrjk 50132x1y07w2515x
            Vector256<byte> vec0 = Avx2.Shuffle(seedVec0, shuffleMask);
            Vector256<byte> vec1 = Avx2.Shuffle(seedVec1, shuffleMask);

            Vector256<int> permuteMask = NextRandomVector(ref seed, mantissaMask, one, upperForVector);

            //                          vec0                                      vec1
            // before: ANBG JCKH NKIM KLDE ZUVX WRVS UbQW ZVZR    gthm piqn tqos qrjk 5013 2x1y 07w2 515x
            // after:  WRVS ZVZR JCKH KLDE KLDE ZVZR KLDE NKIM    2x1y 515x piqn qrjk qrjk 515x qrjk tqos
            vec0 = Avx2.PermuteVar8x32(vec0.AsInt32(), permuteMask).AsByte();
            vec1 = Avx2.PermuteVar8x32(vec1.AsInt32(), permuteMask).AsByte();

            // after blend: 2RVyZ15RJiqnqLDkKrDE5VZxqLjkNKIM
            // Bytes whose shuffle index is even are taken from vec1, the others from vec0.
            Vector256<byte> blendMask = Vector256.Equals(shuffleMask & Vector256.Create((byte)1), Vector256<byte>.Zero);
            Vector256<byte> res = Avx2.BlendVariable(vec0, vec1, blendMask);

            (Vector256<ushort> lower, Vector256<ushort> upper) = Vector256.Widen(res);
            lower.StoreUnsafe(ref dest);
            upper.StoreUnsafe(ref dest, (uint)Vector256<ushort>.Count);
        }
        //---------------------------------------------------------------------
        // Combines four random int vectors (each element in [0, upper)) into one vector of
        // 32 random bytes by placing them into the four byte positions of each int lane.
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        static Vector256<byte> NextRandomByteVector(
            ref Vector256<int> seed,
            Vector256<int> mantissaMask,
            Vector256<float> one,
            Vector256<float> upper)
        {
            Vector256<int> rnd0 = NextRandomVector(ref seed, mantissaMask, one, upper);
            Vector256<int> rnd1 = NextRandomVector(ref seed, mantissaMask, one, upper);
            Vector256<int> rnd2 = NextRandomVector(ref seed, mantissaMask, one, upper);
            Vector256<int> rnd3 = NextRandomVector(ref seed, mantissaMask, one, upper);

            rnd1 = Vector256.ShiftLeft(rnd1, 8);
            rnd2 = Vector256.ShiftLeft(rnd2, 16);
            rnd3 = Vector256.ShiftLeft(rnd3, 24);

            Vector256<int> rnd = (rnd0 | rnd1) | (rnd2 | rnd3);
            return rnd.AsByte();
        }
        //---------------------------------------------------------------------
        // Advances the eight xorshift states and returns eight random ints in [0, upper).
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        static Vector256<int> NextRandomVector(
            ref Vector256<int> seed,
            Vector256<int> mantissaMask,
            Vector256<float> one,
            Vector256<float> upper)
        {
            // Xorshift (cool how easy :-))
            // The right shift must be logical: with an arithmetic shift the sign bit is
            // replicated, so `seed ^ (seed >> 17)` always clears bit 31 and the step is no
            // longer a bijection on the 32-bit state.
            seed ^= Vector256.ShiftLeft(seed, 13);
            seed ^= Vector256.ShiftRightLogical(seed, 17);
            seed ^= Vector256.ShiftLeft(seed, 5);

            // Convert random ints to floats out of [1, 2), cf. https://stackoverflow.com/a/70565649/347870
            Vector256<int> mantissa = seed & mantissaMask;
            Vector256<float> val = mantissa.AsSingle() | one;
            val = Fma.MultiplySubtract(val, upper, upper);          // Scale from [1, 2) to [0, upper)
            Vector256<int> rnd = Vector256.ConvertToInt32(val);     // Convert back to int out of [0, upper) by truncation

            return rnd;
        }
    }
}
using System.Runtime.InteropServices;
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.X86;
// Stand-alone check of the vectorized xorshift generator: draws random int vectors,
// validates that every element is within 0..64 and prints a histogram of the values.
// The variables below are captured by the local functions further down.

// Bit pattern of 1.0f; OR-ed onto the random mantissa bits in NextRandomVector.
Vector256<float> one = Vector256.Create(1f);
// Scale factor 65: NextRandomVector scales to [0, 65) and truncates, i.e. 0..64.
Vector256<float> upper = Vector256.Create((float)(64 + 1));
// Maps a generated value to the number of times it has been seen.
Dictionary<int, long> histogram = new();
// Random numbers ;-)
//Vector256<int> seed = Vector256.Create(
// Random.Shared.Next(),
// Random.Shared.Next(),
// Random.Shared.Next(),
// Random.Shared.Next(),
// Random.Shared.Next(),
// Random.Shared.Next(),
// Random.Shared.Next(),
// Random.Shared.Next()
//);
// Four random 64-bit values reinterpreted as eight 32-bit generator states.
Vector256<int> seed = Vector256.Create(
Random.Shared.NextInt64(),
Random.Shared.NextInt64(),
Random.Shared.NextInt64(),
Random.Shared.NextInt64()
).AsInt32();
// Selects the low 23 bits (the float mantissa) of each state.
Vector256<int> mantissaMask = Vector256.Create(0x7FFFFF);
for (int i = 0; i < 1_000_000; ++i)
{
Vector256<int> rnd = NextRandomVector();
Validate(rnd);
AddToHistogram(rnd);
// Prints the vector and then blocks until a key is pressed.
// NOTE(review): this runs on each of the 1,000,000 iterations -- looks like a
// debugging leftover; confirm whether it should stay.
Console.WriteLine(rnd);
Console.ReadKey();
}
PrintHistogram();
// Advances the eight xorshift states held in `seed` and returns eight random ints
// in [0, upper), where `upper` is the script-level scale factor.
Vector256<int> NextRandomVector()
{
    // Xorshift (cool how easy :-))
    // The right shift must be logical: with an arithmetic shift the sign bit is
    // replicated, so `seed ^ (seed >> 17)` always clears bit 31 and the step is no
    // longer a bijection on the 32-bit state.
    seed ^= Vector256.ShiftLeft(seed, 13);
    seed ^= Vector256.ShiftRightLogical(seed, 17);
    seed ^= Vector256.ShiftLeft(seed, 5);

    // Convert random ints to floats out of [1, 2), cf. https://stackoverflow.com/a/70565649/347870
    Vector256<int> mantissa = seed & mantissaMask;
    Vector256<float> val = mantissa.AsSingle() | one;
    val = Fma.MultiplySubtract(val, upper, upper);          // Scale from [1, 2) to [0, upper)
    Vector256<int> rnd = Vector256.ConvertToInt32(val);     // Convert back to int out of [0, upper) by truncation

    return rnd;
}
// Throws if any element of the vector lies outside the inclusive range 0..64.
static void Validate(Vector256<int> rnd)
{
    for (int lane = 0; lane < Vector256<int>.Count; ++lane)
    {
        if (rnd[lane] is >= 0 and <= 64)
        {
            continue;
        }

        throw new Exception("Out of range");
    }
}
// Increments the histogram counter of every element in the vector,
// creating the counter (starting at 0) on first sight of a value.
void AddToHistogram(Vector256<int> rnd)
{
    int lane = 0;

    while (lane < Vector256<int>.Count)
    {
        int bucket = rnd[lane];
        ref long hits = ref CollectionsMarshal.GetValueRefOrAddDefault(histogram, bucket, out _);
        hits += 1;
        lane++;
    }
}
// Prints one line per value, ordered by value: the value, its count and the
// difference between the count and the (integer) mean count.
void PrintHistogram()
{
    long total = 0;

    foreach (KeyValuePair<int, long> entry in histogram)
    {
        total += entry.Value;
    }

    long mean = total / histogram.Count;

    foreach ((int bucket, long hits) in histogram.OrderBy(h => h.Key))
    {
        long deviation = hits - mean;
        Console.WriteLine($"{bucket,3}\t{hits,5}\t{deviation,5}");
    }
}
@gfoidl
Copy link
Author

gfoidl commented Oct 14, 2022

To get a random byte vector, combine four random int vectors, e.g.:

Vector256<int> rnd0 = NextRandomVector();
Vector256<int> rnd1 = NextRandomVector();
Vector256<int> rnd2 = NextRandomVector();
Vector256<int> rnd3 = NextRandomVector();

rnd1 = Vector256.ShiftLeft(rnd1,  8);
rnd2 = Vector256.ShiftLeft(rnd2, 16);
rnd3 = Vector256.ShiftLeft(rnd3, 24);

Vector256<byte> rnd = ((rnd0 | rnd1) | (rnd2 | rnd3)).AsByte();

@gfoidl
Copy link
Author

gfoidl commented Oct 14, 2022

Benchmark for random string:

|       Method | CharLength |        Mean |     Error |    StdDev | Ratio | RatioSD |
|------------- |----------- |------------:|----------:|----------:|------:|--------:|
| StringCreate |         10 |    151.2 ns |   3.10 ns |   5.60 ns |  1.00 |    0.00 |
|   Vectorized |         10 |    143.1 ns |   2.94 ns |   5.67 ns |  0.95 |    0.05 |
|              |            |             |           |           |       |         |
| StringCreate |        100 |  1,253.0 ns |  24.07 ns |  51.30 ns |  1.00 |    0.00 |
|   Vectorized |        100 |    149.8 ns |   3.07 ns |   8.31 ns |  0.12 |    0.01 |
|              |            |             |           |           |       |         |
| StringCreate |       1000 | 12,472.4 ns | 245.81 ns | 461.69 ns |  1.00 |    0.00 |
|   Vectorized |       1000 |    710.7 ns |  16.74 ns |  49.35 ns |  0.06 |    0.00 |
And the codegen for x64 gives nice machine code:
DOTNET_JitDisasm: "VectorSample:CreateRandomStringVectorized(Span`1,long)"
; Assembly listing for method VectorSample:CreateRandomStringVectorized(Span`1,long)
; Emitting BLENDED_CODE for X64 CPU with AVX - Windows
; Tier-1 compilation
; optimized code
; rsp based frame
; fully interruptible
; No PGO data
; 0 inlinees with PGO data; 28 single block inlinees; 0 inlinees without PGO data

G_M000_IG01:                ;; offset=0000H
       57                   push     rdi
       56                   push     rsi
       55                   push     rbp
       53                   push     rbx
       4881EC98000000       sub      rsp, 152
       C5F877               vzeroupper
       C5F829B42480000000   vmovaps  qword ptr [rsp+80H], xmm6
       C5F8297C2470         vmovaps  qword ptr [rsp+70H], xmm7
       C57829442460         vmovaps  qword ptr [rsp+60H], xmm8
       C578294C2450         vmovaps  qword ptr [rsp+50H], xmm9
       C57829542440         vmovaps  qword ptr [rsp+40H], xmm10
       488BF9               mov      rdi, rcx
       488BF2               mov      rsi, rdx

G_M000_IG02:                ;; offset=0035H
       48B96003C04480020000 mov      rcx, 0x28044C00360
       488B19               mov      rbx, gword ptr [rcx]
       488BCB               mov      rcx, rbx
       FF1565191200         call     [ThreadSafeRandom:NextInt64():long:this]
       C4E1F96EC0           vmovd    xmm0, rax
       C5F911442430         vmovupd  xmmword ptr [rsp+30H], xmm0
       488BCB               mov      rcx, rbx
       FF1551191200         call     [ThreadSafeRandom:NextInt64():long:this]
       C5F910442430         vmovupd  xmm0, xmmword ptr [rsp+30H]
       C4E3F922C001         vpinsrq  xmm0, xmm0, rax, 1
       C5F911442430         vmovupd  xmmword ptr [rsp+30H], xmm0
       488BCB               mov      rcx, rbx
       FF1536191200         call     [ThreadSafeRandom:NextInt64():long:this]
       C4E1F96EC0           vmovd    xmm0, rax
       C5F911442420         vmovupd  xmmword ptr [rsp+20H], xmm0
       488BCB               mov      rcx, rbx
       FF1522191200         call     [ThreadSafeRandom:NextInt64():long:this]
       C5F910442420         vmovupd  xmm0, xmmword ptr [rsp+20H]
       C4E3F922C001         vpinsrq  xmm0, xmm0, rax, 1
       C5F9104C2430         vmovupd  xmm1, xmmword ptr [rsp+30H]
       C4E37518F001         vinsertf128 ymm6, ymm1, xmm0, 1
       48B9F820C04480020000 mov      rcx, 0x28044C020F8
       488B29               mov      rbp, gword ptr [rcx]
       4883C50C             add      rbp, 12
       C5FE6F4500           vmovdqu  ymm0, ymmword ptr[rbp]
       C5FD674520           vpackuswb ymm0, ymm0, ymmword ptr[rbp+20H]
       C4E3FD00C0D8         vpermq   ymm0, ymm0, -40
       C5FE7F06             vmovdqu  ymmword ptr[rsi], ymm0
       C5FA6F4540           vmovdqu  xmm0, xmmword ptr [rbp+40H]
       C5F9674550           vpackuswb xmm0, xmm0, xmmword ptr [rbp+50H]
       C5FA7F4620           vmovdqu  xmmword ptr [rsi+20H], xmm0
       C5FA6F455C           vmovdqu  xmm0, xmmword ptr [rbp+5CH]
       C5F967456C           vpackuswb xmm0, xmm0, xmmword ptr [rbp+6CH]
       C5FA7F462E           vmovdqu  xmmword ptr [rsi+2EH], xmm0
       488BCB               mov      rcx, rbx
       BA3E000000           mov      edx, 62
       C4E37D19F701         vextractf128 xmm7, ymm6, 1
       FF15A3181200         call     [ThreadSafeRandom:Next(int):int:this]
       4863C8               movsxd   rcx, eax
       0FB64C4D00           movzx    rcx, byte  ptr [rbp+2*rcx]
       884E3E               mov      byte  ptr [rsi+3EH], cl
       488BCB               mov      rcx, rbx
       BA3E000000           mov      edx, 62
       FF158A181200         call     [ThreadSafeRandom:Next(int):int:this]
       4898                 cdqe
       0FB6444500           movzx    rax, byte  ptr [rbp+2*rax]
       88463F               mov      byte  ptr [rsi+3FH], al
       C5FE6F06             vmovdqu  ymm0, ymmword ptr[rsi]
       C5FE6F4E20           vmovdqu  ymm1, ymmword ptr[rsi+20H]
       C5FD1015EF030000     vmovupd  ymm2, ymmword ptr[reloc @RWD00]
       C5FD101D07040000     vmovupd  ymm3, ymmword ptr[reloc @RWD32]
       C5FD10251F040000     vmovupd  ymm4, ymmword ptr[reloc @RWD64]
       488B07               mov      rax, bword ptr [rdi]
       8B5708               mov      edx, dword ptr [rdi+08H]

G_M000_IG03:                ;; offset=0147H
       83C2E0               add      edx, -32
       488D1450             lea      rdx, bword ptr [rax+2*rdx]
       C5FD102D2A040000     vmovupd  ymm5, ymmword ptr[reloc @RWD96]
       C4E34D18F701         vinsertf128 ymm6, ymm6, xmm7, 1
                            align    [0 bytes for IG04]

G_M000_IG04:                ;; offset=015CH
       C5C572F60D           vpslld   ymm7, ymm6, 13
       C5CDEFF7             vpxor    ymm6, ymm6, ymm7
       C5C572E611           vpsrad   ymm7, ymm6, 17
       C5CDEFF7             vpxor    ymm6, ymm6, ymm7
       C5C572F605           vpslld   ymm7, ymm6, 5
       C5CDEFF7             vpxor    ymm6, ymm6, ymm7
       C5CDDBFC             vpand    ymm7, ymm6, ymm4
       C5C456FB             vorps    ymm7, ymm7, ymm3
       C4E26DAAFA           vfmsub213ps ymm7, ymm2, ymm2
       C5FE5BFF             vcvttps2dq ymm7, ymm7
       C5BD72F60D           vpslld   ymm8, ymm6, 13
       C4C14DEFF0           vpxor    ymm6, ymm6, ymm8
       C5BD72E611           vpsrad   ymm8, ymm6, 17
       C4C14DEFF0           vpxor    ymm6, ymm6, ymm8
       C5BD72F605           vpslld   ymm8, ymm6, 5
       C4C14DEFF0           vpxor    ymm6, ymm6, ymm8
       C54DDBC4             vpand    ymm8, ymm6, ymm4
       C53C56C3             vorps    ymm8, ymm8, ymm3
       C4626DAAC2           vfmsub213ps ymm8, ymm2, ymm2
       C4417E5BC0           vcvttps2dq ymm8, ymm8
       C5B572F60D           vpslld   ymm9, ymm6, 13
       C4C14DEFF1           vpxor    ymm6, ymm6, ymm9
       C5B572E611           vpsrad   ymm9, ymm6, 17
       C4C14DEFF1           vpxor    ymm6, ymm6, ymm9
       C5B572F605           vpslld   ymm9, ymm6, 5
       C4C14DEFF1           vpxor    ymm6, ymm6, ymm9
       C54DDBCC             vpand    ymm9, ymm6, ymm4
       C53456CB             vorps    ymm9, ymm9, ymm3
       C4626DAACA           vfmsub213ps ymm9, ymm2, ymm2
       C4417E5BC9           vcvttps2dq ymm9, ymm9
       C5AD72F60D           vpslld   ymm10, ymm6, 13
       C4C14DEFF2           vpxor    ymm6, ymm6, ymm10
       C5AD72E611           vpsrad   ymm10, ymm6, 17
       C4C14DEFF2           vpxor    ymm6, ymm6, ymm10
       C5AD72F605           vpslld   ymm10, ymm6, 5
       C4C14DEFF2           vpxor    ymm6, ymm6, ymm10
       C54DDBD4             vpand    ymm10, ymm6, ymm4
       C52C56D3             vorps    ymm10, ymm10, ymm3
       C4626DAAD2           vfmsub213ps ymm10, ymm2, ymm2
       C4417E5BD2           vcvttps2dq ymm10, ymm10
       C4C13D72F008         vpslld   ymm8, ymm8, 8
       C4C13572F110         vpslld   ymm9, ymm9, 16
       C4C12D72F218         vpslld   ymm10, ymm10, 24
       C4C145EBF8           vpor     ymm7, ymm7, ymm8
       C44135EBC2           vpor     ymm8, ymm9, ymm10
       C4C145EBF8           vpor     ymm7, ymm7, ymm8
       C4627500C7           vpshufb  ymm8, ymm1, ymm7
       C5B572F60D           vpslld   ymm9, ymm6, 13
       C4C14DEFF1           vpxor    ymm6, ymm6, ymm9
       C5B572E611           vpsrad   ymm9, ymm6, 17
       C4C14DEFF1           vpxor    ymm6, ymm6, ymm9
       C5B572F605           vpslld   ymm9, ymm6, 5
       C4C14DEFF1           vpxor    ymm6, ymm6, ymm9
       C54DDBCC             vpand    ymm9, ymm6, ymm4
       C53456CB             vorps    ymm9, ymm9, ymm3
       C4626DAACA           vfmsub213ps ymm9, ymm2, ymm2
       C4417E5BC9           vcvttps2dq ymm9, ymm9
       C4423536C0           vpermd   ymm8, ymm9, ymm8
       C4627D00D7           vpshufb  ymm10, ymm0, ymm7
       C4423536CA           vpermd   ymm9, ymm9, ymm10
       C5C5DBFD             vpand    ymm7, ymm7, ymm5
       C4412C57D2           vxorps   ymm10, ymm10, ymm10
       C4C14574FA           vpcmpeqb ymm7, ymm7, ymm10
       C4C3354CF870         vpblendvb ymm7, ymm9, ymm8, ymm7
       C57C28C7             vmovaps  ymm8, ymm7
       C4427D30C0           vpmovzxbw ymm8, ymm8
       C4E37D19FF01         vextractf128 xmm7, ymm7, 1
       C4E27D30FF           vpmovzxbw ymm7, ymm7

G_M000_IG05:                ;; offset=02A5H
       C57E7F00             vmovdqu  ymmword ptr[rax], ymm8
       C5FE7F7820           vmovdqu  ymmword ptr[rax+20H], ymm7
       4883C040             add      rax, 64
       483BC2               cmp      rax, rdx
       0F82A1FEFFFF         jb       G_M000_IG04

G_M000_IG06:                ;; offset=02BBH
       C5ED72F60D           vpslld   ymm2, ymm6, 13
       C5CDEFF2             vpxor    ymm6, ymm6, ymm2
       C5ED72E611           vpsrad   ymm2, ymm6, 17
       C5CDEFF2             vpxor    ymm6, ymm6, ymm2
       C5ED72F605           vpslld   ymm2, ymm6, 5
       C5CDEFF2             vpxor    ymm6, ymm6, ymm2
       C5CDDB1582020000     vpand    ymm2, ymm6, ymmword ptr[reloc @RWD64]     ; https://github.com/dotnet/runtime/issues/76781
       C5EC56155A020000     vorps    ymm2, ymm2, ymmword ptr[reloc @RWD32]
       C5FD101D32020000     vmovupd  ymm3, ymmword ptr[reloc @RWD00]
       C4E265AA1529020000   vfmsub213ps ymm2, ymm3, ymmword ptr[reloc @RWD00]
       C5FE5BD2             vcvttps2dq ymm2, ymm2
       C5E572F60D           vpslld   ymm3, ymm6, 13
       C5CDEFF3             vpxor    ymm6, ymm6, ymm3
       C5E572E611           vpsrad   ymm3, ymm6, 17
       C5CDEFF3             vpxor    ymm6, ymm6, ymm3
       C5E572F605           vpslld   ymm3, ymm6, 5
       C5CDEFF3             vpxor    ymm6, ymm6, ymm3
       C5CDDB1D42020000     vpand    ymm3, ymm6, ymmword ptr[reloc @RWD64]
       C5E4561D1A020000     vorps    ymm3, ymm3, ymmword ptr[reloc @RWD32]
       C5FD1025F2010000     vmovupd  ymm4, ymmword ptr[reloc @RWD00]
       C4E25DAA1DE9010000   vfmsub213ps ymm3, ymm4, ymmword ptr[reloc @RWD00]
       C5FE5BDB             vcvttps2dq ymm3, ymm3
       C5DD72F60D           vpslld   ymm4, ymm6, 13
       C5CDEFF4             vpxor    ymm6, ymm6, ymm4
       C5DD72E611           vpsrad   ymm4, ymm6, 17
       C5CDEFF4             vpxor    ymm6, ymm6, ymm4
       C5DD72F605           vpslld   ymm4, ymm6, 5
       C5CDEFF4             vpxor    ymm6, ymm6, ymm4
       C5CDDB2502020000     vpand    ymm4, ymm6, ymmword ptr[reloc @RWD64]
       C5DC5625DA010000     vorps    ymm4, ymm4, ymmword ptr[reloc @RWD32]
       C5FD103DB2010000     vmovupd  ymm7, ymmword ptr[reloc @RWD00]
       C4E245AA25A9010000   vfmsub213ps ymm4, ymm7, ymmword ptr[reloc @RWD00]
       C5FE5BE4             vcvttps2dq ymm4, ymm4
       C5C572F60D           vpslld   ymm7, ymm6, 13
       C5CDEFF7             vpxor    ymm6, ymm6, ymm7
       C5C572E611           vpsrad   ymm7, ymm6, 17
       C5CDEFF7             vpxor    ymm6, ymm6, ymm7
       C5C572F605           vpslld   ymm7, ymm6, 5
       C5CDEFF7             vpxor    ymm6, ymm6, ymm7
       C5CDDB3DC2010000     vpand    ymm7, ymm6, ymmword ptr[reloc @RWD64]
       C5C4563D9A010000     vorps    ymm7, ymm7, ymmword ptr[reloc @RWD32]
       C57D100572010000     vmovupd  ymm8, ymmword ptr[reloc @RWD00]
       C4E23DAA3D69010000   vfmsub213ps ymm7, ymm8, ymmword ptr[reloc @RWD00]
       C5FE5BFF             vcvttps2dq ymm7, ymm7
       C5E572F308           vpslld   ymm3, ymm3, 8
       C5DD72F410           vpslld   ymm4, ymm4, 16
       C5C572F718           vpslld   ymm7, ymm7, 24
       C5EDEBD3             vpor     ymm2, ymm2, ymm3
       C5DDEBDF             vpor     ymm3, ymm4, ymm7
       C5EDEBD3             vpor     ymm2, ymm2, ymm3
       C4E27500CA           vpshufb  ymm1, ymm1, ymm2
       C5E572F60D           vpslld   ymm3, ymm6, 13
       C5CDEFF3             vpxor    ymm6, ymm6, ymm3
       C5E572E611           vpsrad   ymm3, ymm6, 17
       C5CDEFF3             vpxor    ymm6, ymm6, ymm3
       C5E572F605           vpslld   ymm3, ymm6, 5
       C5CDEFF3             vpxor    ymm6, ymm6, ymm3
       C5CDDB1D62010000     vpand    ymm3, ymm6, ymmword ptr[reloc @RWD64]
       C5E4561D3A010000     vorps    ymm3, ymm3, ymmword ptr[reloc @RWD32]
       C5FD102512010000     vmovupd  ymm4, ymmword ptr[reloc @RWD00]
       C4E25DAA1D09010000   vfmsub213ps ymm3, ymm4, ymmword ptr[reloc @RWD00]
       C5FE5BDB             vcvttps2dq ymm3, ymm3
       C4E26536C9           vpermd   ymm1, ymm3, ymm1
       C4E27D00C2           vpshufb  ymm0, ymm0, ymm2
       C4E26536C0           vpermd   ymm0, ymm3, ymm0
       C5EDDBD5             vpand    ymm2, ymm2, ymm5

G_M000_IG07:                ;; offset=042EH
       C5E457DB             vxorps   ymm3, ymm3, ymm3
       C5ED74D3             vpcmpeqb ymm2, ymm2, ymm3
       C4E37D4CC120         vpblendvb ymm0, ymm0, ymm1, ymm2
       C5FC28C8             vmovaps  ymm1, ymm0
       C4E27D30C9           vpmovzxbw ymm1, ymm1
       C4E37D19C001         vextractf128 xmm0, ymm0, 1
       C4E27D30C0           vpmovzxbw ymm0, ymm0
       C5FE7F0A             vmovdqu  ymmword ptr[rdx], ymm1
       C5FE7F4220           vmovdqu  ymmword ptr[rdx+20H], ymm0

G_M000_IG08:                ;; offset=0459H
       C5F828B42480000000   vmovaps  xmm6, qword ptr [rsp+80H]
       C5F8287C2470         vmovaps  xmm7, qword ptr [rsp+70H]
       C57828442460         vmovaps  xmm8, qword ptr [rsp+60H]
       C578284C2450         vmovaps  xmm9, qword ptr [rsp+50H]
       C57828542440         vmovaps  xmm10, qword ptr [rsp+40H]
       C5F877               vzeroupper
       4881C498000000       add      rsp, 152
       5B                   pop      rbx
       5D                   pop      rbp
       5E                   pop      rsi
       5F                   pop      rdi
       C3                   ret

RWD00   dq      41F8000041F80000h, 41F8000041F80000h, 41F8000041F80000h, 41F8000041F80000h
RWD32   dq      3F8000003F800000h, 3F8000003F800000h, 3F8000003F800000h, 3F8000003F800000h
RWD64   dq      007FFFFF007FFFFFh, 007FFFFF007FFFFFh, 007FFFFF007FFFFFh, 007FFFFF007FFFFFh
RWD96   dq      0101010101010101h, 0101010101010101h, 0101010101010101h, 0101010101010101h

; Total bytes of code 1161

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment