Skip to content

Instantly share code, notes, and snippets.

@benaadams
Last active February 13, 2020 21:46
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save benaadams/21d3bd5207314c075cc559a97f924526 to your computer and use it in GitHub Desktop.
Save benaadams/21d3bd5207314c075cc559a97f924526 to your computer and use it in GitHub Desktop.
using System;
using System.Buffers;
using System.Diagnostics;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.X86;
using System.Text;
using nint = System.Int64;
/// <summary>
/// Console smoke test for <c>Conversion.AsciiToString</c>: prints one sample conversion,
/// then round-trips strings of ascending ASCII characters for every length from 0 to 127.
/// </summary>
public class Program
{
    static void Main()
    {
        // Largest length checked. A string of this length holds the chars 0..126,
        // so every character is still valid ASCII.
        const int MaxLength = 127;

        Console.WriteLine("Input : The quick brown fox jumped over the lazy dog");
        Console.WriteLine(
            Conversion.AsciiToString(
                Encoding.ASCII.GetBytes(
                    "Output: The quick brown fox jumped over the lazy dog")));

        // The bound is inclusive so that the range actually tested matches the
        // "0 to 127" claim in the success message below.
        for (var length = 0; length <= MaxLength; length++)
        {
            // Build the expected string: char at index j has the value j.
            var expected = string.Create(length, length, (span, _) =>
            {
                for (int j = 0; j < span.Length; j++)
                {
                    span[j] = (char)j;
                }
            });

            var bytes = Encoding.ASCII.GetBytes(expected);
            if (Conversion.AsciiToString(bytes) != expected)
            {
                Console.WriteLine($"Fail at length {length}");
                return;
            }
        }

        Console.WriteLine("Success for lengths 0 to 127 of ascending ascii chars");
    }
}
/// <summary>
/// Converts byte sequences to strings by widening each byte to one UTF-16 char,
/// using AVX2 or SSE2 vector instructions when the hardware supports them.
/// </summary>
/// <remarks>
/// No validation or decoding is performed: every code path zero-extends the byte, so
/// bytes in the range 0x80-0xFF become the chars U+0080-U+00FF rather than being
/// rejected or replaced. Only pass data that is known to be ASCII.
/// </remarks>
public class Conversion
{
    // Bound to an instance on purpose: per the original author, delegates to instance
    // methods are faster to invoke than delegates to static methods.
    private static readonly SpanAction<char, ReadOnlyMemory<byte>> s_asciiToString
        = new Conversion().AsciiToString;

    /// <summary>Creates a string with one char per byte of <paramref name="input"/>.</summary>
    /// <param name="input">The bytes to widen; expected to be ASCII (see class remarks).</param>
    /// <returns>A new string whose length equals <c>input.Length</c>.</returns>
    public static string AsciiToString(ReadOnlyMemory<byte> input)
        => string.Create(input.Length, input, s_asciiToString);

    /// <summary>
    /// Fills <paramref name="output"/> with the zero-extended bytes of <paramref name="memory"/>.
    /// </summary>
    /// <param name="output">Destination chars; must be exactly as long as the input.</param>
    /// <param name="memory">Source bytes.</param>
    private void AsciiToString(Span<char> output, ReadOnlyMemory<byte> memory)
    {
        var input = memory.Span;
        // The vector paths below use unchecked reads/writes and rely on this equality.
        Debug.Assert(output.Length == input.Length);

        ref var inputStart = ref MemoryMarshal.GetReference(input);
        ref var outputStart = ref MemoryMarshal.GetReference(output);

        if (Avx2.IsSupported && input.Length >= Vector256<byte>.Count)
        {
            // Start of the last full 32-byte vector that fits in the input.
            nint lastStart = input.Length - Vector256<byte>.Count;

            for (nint offset = 0; offset < lastStart; offset += Vector256<byte>.Count)
            {
                WidenVector256(ref inputStart, ref outputStart, offset);
            }

            // Final vector is taken from the end; it may overlap the previous iteration,
            // which is harmless because the overlapping chars are rewritten with equal values.
            WidenVector256(ref inputStart, ref outputStart, lastStart);
            return;
        }

        if (Sse2.IsSupported && input.Length >= Vector128<byte>.Count)
        {
            // Start of the last full 16-byte vector that fits in the input.
            nint lastStart = input.Length - Vector128<byte>.Count;

            for (nint offset = 0; offset < lastStart; offset += Vector128<byte>.Count)
            {
                WidenVector128(ref inputStart, ref outputStart, offset);
            }

            // Final vector from the end, possibly overlapping the previous iteration.
            WidenVector128(ref inputStart, ref outputStart, lastStart);
            return;
        }

        // Scalar fallback for short inputs or hardware without SSE2.
        // The rest is an exercise for the reader to improve.
        for (int i = 0; i < input.Length; i++)
        {
            output[i] = (char)input[i];
        }
    }

    /// <summary>
    /// Widens the 32 bytes at <paramref name="offset"/> in <paramref name="source"/> to the
    /// 32 chars at the same element offset in <paramref name="destination"/>.
    /// Callers must have checked <c>Avx2.IsSupported</c> and that 32 elements are available.
    /// </summary>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    private static void WidenVector256(ref byte source, ref char destination, nint offset)
    {
        Vector256<byte> zero = Vector256<byte>.Zero;
        Vector256<byte> ascii = Unsafe.ReadUnaligned<Vector256<byte>>(
            ref Unsafe.Add(ref source, (IntPtr)offset));

        // The 256-bit unpack instructions interleave within each 128-bit lane, so the
        // quadwords are permuted first. 0xd4 selects quadwords (0, 1, 1, 3): UnpackLow then
        // reads quadword 0 from the lower lane and quadword 1 from the upper lane,
        // yielding bytes 0..15 interleaved with zeros.
        Vector256<byte> lowHalf = Avx2.Permute4x64(ascii.AsUInt64(), 0xd4).AsByte();
        Unsafe.WriteUnaligned(
            ref Unsafe.As<char, byte>(ref Unsafe.Add(ref destination, (IntPtr)offset)),
            Avx2.UnpackLow(lowHalf, zero));

        // 0xe8 selects quadwords (0, 2, 2, 3): UnpackHigh reads quadword 2 from the lower
        // lane and quadword 3 from the upper lane, yielding bytes 16..31 interleaved with zeros.
        Vector256<byte> highHalf = Avx2.Permute4x64(ascii.AsUInt64(), 0xe8).AsByte();
        Unsafe.WriteUnaligned(
            ref Unsafe.As<char, byte>(
                ref Unsafe.Add(ref destination, (IntPtr)(offset + Vector256<ushort>.Count))),
            Avx2.UnpackHigh(highHalf, zero));
    }

    /// <summary>
    /// Widens the 16 bytes at <paramref name="offset"/> in <paramref name="source"/> to the
    /// 16 chars at the same element offset in <paramref name="destination"/>.
    /// Callers must have checked <c>Sse2.IsSupported</c> and that 16 elements are available.
    /// </summary>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    private static void WidenVector128(ref byte source, ref char destination, nint offset)
    {
        Vector128<byte> zero = Vector128<byte>.Zero;
        Vector128<byte> ascii = Unsafe.ReadUnaligned<Vector128<byte>>(
            ref Unsafe.Add(ref source, (IntPtr)offset));

        // Interleaving with zero turns each byte into a little-endian 16-bit char.
        Unsafe.WriteUnaligned(
            ref Unsafe.As<char, byte>(ref Unsafe.Add(ref destination, (IntPtr)offset)),
            Sse2.UnpackLow(ascii, zero));
        Unsafe.WriteUnaligned(
            ref Unsafe.As<char, byte>(
                ref Unsafe.Add(ref destination, (IntPtr)(offset + Vector128<ushort>.Count))),
            Sse2.UnpackHigh(ascii, zero));
    }
}
@benaadams
Copy link
Author

G_M18455_IG01:
       push     rdi
       push     rsi
       push     rbp
       push     rbx
       sub      rsp, 56
       vzeroupper 
       xor      rax, rax
       mov      qword ptr [rsp+28H], rax
       mov      qword ptr [rsp+30H], rax
       mov      rdi, rdx
       mov      rsi, r8
						;; bbWeight=1    PerfScore 8.00
G_M18455_IG02:
       xor      rbx, rbx
       xor      ebp, ebp
       mov      rcx, gword ptr [rsi]
       test     rcx, rcx
       je       SHORT G_M18455_IG08
						;; bbWeight=1    PerfScore 3.75
G_M18455_IG03:
       mov      rdx, 0xD1FFAB1E
       cmp      qword ptr [rcx], rdx
       jne      SHORT G_M18455_IG04
       lea      rbx, bword ptr [rcx+12]
       mov      ebp, dword ptr [rcx+8]
       jmp      SHORT G_M18455_IG06
						;; bbWeight=0.25 PerfScore 1.94
G_M18455_IG04:
       mov      rdx, qword ptr [rcx]
       test     dword ptr [rdx], 0xD1FFAB1E
       je       SHORT G_M18455_IG05
       lea      rbx, bword ptr [rcx+16]
       mov      ebp, dword ptr [rcx+8]
       jmp      SHORT G_M18455_IG06
						;; bbWeight=0.25 PerfScore 2.38
G_M18455_IG05:
       lea      rdx, bword ptr [rsp+28H]
       mov      rax, qword ptr [rcx]
       mov      rax, qword ptr [rax+64]
       call     qword ptr [rax+40]MemoryManager`1:GetSpan():Span`1:this
       mov      rbx, bword ptr [rsp+28H]
       mov      ebp, dword ptr [rsp+30H]
						;; bbWeight=0.25 PerfScore 2.38
G_M18455_IG06:
       mov      eax, dword ptr [rsi+8]
       and      eax, 0xD1FFAB1E
       mov      edx, dword ptr [rsi+12]
       mov      ecx, edx
       add      rcx, rax
       mov      r8d, ebp
       cmp      rcx, r8
       ja       G_M18455_IG20
						;; bbWeight=0.25 PerfScore 1.56
G_M18455_IG07:
       add      rbx, rax
       mov      ebp, edx
						;; bbWeight=0.25 PerfScore 0.13
G_M18455_IG08:
       mov      rax, rbx
       mov      rdx, rax
       mov      rcx, bword ptr [rdi]
       xor      r8, r8
       lea      r9d, [rbp-32]
       movsxd   r9, r9d
       test     r9, r9
       jl       G_M18455_IG13
						;; bbWeight=1    PerfScore 4.75
G_M18455_IG09:
       vxorps   ymm0, ymm0, ymm0
       test     r9, r9
       jle      SHORT G_M18455_IG11
						;; bbWeight=0.50 PerfScore 0.79
G_M18455_IG10:
       vmovupd  ymm1, ymmword ptr[rdx+r8]
       vpermq   ymm2, ymm1, -44
       vpunpcklbw ymm2, ymm2, ymm0
       vmovupd  ymmword ptr[rcx+2*r8], ymm2
       vpermq   ymm1, ymm1, -24
       lea      rax, [r8+16]
       vpunpckhbw ymm1, ymm1, ymm0
       vmovupd  ymmword ptr[rcx+2*rax], ymm1
       add      r8, 32
       cmp      r9, r8
       jg       SHORT G_M18455_IG10
						;; bbWeight=4    PerfScore 72.00
G_M18455_IG11:
       add      ebp, -32
       movsxd   r8, ebp
       vmovupd  ymm1, ymmword ptr[rdx+r8]
       vpermq   ymm2, ymm1, -44
       vpunpcklbw ymm2, ymm2, ymm0
       vmovupd  ymmword ptr[rcx+2*r8], ymm2
       vpermq   ymm1, ymm1, -24
       lea      r9, [r8+16]
       vpunpckhbw ymm0, ymm1, ymm0
       vmovupd  ymmword ptr[rcx+2*r9], ymm0
						;; bbWeight=0.50 PerfScore 8.50
G_M18455_IG12:
       vzeroupper 
       add      rsp, 56
       pop      rbx
       pop      rbp
       pop      rsi
       pop      rdi
       ret      
						;; bbWeight=0.50 PerfScore 2.13
G_M18455_IG13:
       lea      r9d, [rbp-16]
       movsxd   r10, r9d
       mov      r9, r10
       test     r9, r9
       jl       SHORT G_M18455_IG17
       vxorps   xmm0, xmm0, xmm0
       test     r9, r9
       jle      SHORT G_M18455_IG15
						;; bbWeight=0.50 PerfScore 1.92
G_M18455_IG14:
       vmovupd  xmm1, xmmword ptr [rdx+r8]
       vpunpcklbw xmm2, xmm1, xmm0
       vmovupd  xmmword ptr [rcx+2*r8], xmm2
       lea      rax, [r8+8]
       vpunpckhbw xmm1, xmm1, xmm0
       vmovupd  xmmword ptr [rcx+2*rax], xmm1
       add      r8, 16
       cmp      r9, r8
       jg       SHORT G_M18455_IG14
						;; bbWeight=4    PerfScore 56.00
G_M18455_IG15:
       mov      r8, r10
       vmovupd  xmm1, xmmword ptr [rdx+r8]
       vpunpcklbw xmm2, xmm1, xmm0
       vmovupd  xmmword ptr [rcx+2*r8], xmm2
       lea      rax, [r8+8]
       vpunpckhbw xmm0, xmm1, xmm0
       vmovupd  xmmword ptr [rcx+2*rax], xmm0
						;; bbWeight=0.50 PerfScore 6.38
G_M18455_IG16:
       vzeroupper 
       add      rsp, 56
       pop      rbx
       pop      rbp
       pop      rsi
       pop      rdi
       ret      
						;; bbWeight=0.50 PerfScore 2.13
G_M18455_IG17:
       xor      edx, edx
       test     ebp, ebp
       jle      SHORT G_M18455_IG19
						;; bbWeight=0.50 PerfScore 0.75
G_M18455_IG18:
       movsxd   r8, edx
       movsxd   r9, edx
       movzx    r9, byte  ptr [rax+r9]
       mov      word  ptr [rcx+2*r8], r9w
       inc      edx
       cmp      edx, ebp
       jl       SHORT G_M18455_IG18
						;; bbWeight=4    PerfScore 20.00
G_M18455_IG19:
       vzeroupper 
       add      rsp, 56
       pop      rbx
       pop      rbp
       pop      rsi
       pop      rdi
       ret      
						;; bbWeight=0.50 PerfScore 2.13
G_M18455_IG20:
       call     ThrowHelper:ThrowArgumentOutOfRangeException()
       int3     
						;; bbWeight=0    PerfScore 0.00

; Total bytes of code 441, prolog size 29, PerfScore 242.98, (MethodHash=e529b842) for method Conversion:AsciiToString(Span`1,ReadOnlyMemory`1):this
; ============================================================

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment