Skip to content

Instantly share code, notes, and snippets.

@commonsensesoftware
Last active January 15, 2020 03:10
Show Gist options
  • Save commonsensesoftware/aa59ac440650437b6baa607b21b28522 to your computer and use it in GitHub Desktop.
Save commonsensesoftware/aa59ac440650437b6baa607b21b28522 to your computer and use it in GitHub Desktop.
Sample Base64Url Implementation
namespace Microsoft.Example
{
using System;
using System.Buffers;
using System.Diagnostics.CodeAnalysis;
using System.Runtime.CompilerServices;
using Microsoft.Example.Text;
using static System.Text.Encoding;
/// <summary>
/// Represents the Base 64 Encoding with URL and Filename Safe Alphabet
/// <seealso href="https://tools.ietf.org/html/rfc4648#section-4"/>.
/// </summary>
public static class Base64Url
{
// REF: based on value found in: https://github.com/dotnet/aspnetcore/blob/master/src/Shared/WebEncoders/WebEncoders.cs
private const int RentSize = 128;
private const byte Pad = (byte)'=';
/// <summary>
/// Encodes the specified binary as a Base 64 URL.
/// </summary>
/// <param name="binary">The binary to encode.</param>
/// <returns>The encoded text in the Base 64 URL format.</returns>
public static string Encode(ReadOnlySpan<byte> binary)
{
var bufferSize = Base64UrlEncoding.GetMaxEncodedToUtf8Length(binary.Length);
var rented = default(byte[]);
Span<byte> buffer = bufferSize <= RentSize ? stackalloc byte[bufferSize] : Rent(bufferSize, ref rented);
Base64UrlEncoding.EncodeToUtf8(binary, buffer, out var consumed, out var written);
var length = LengthWithoutPadding(buffer, written);
var base64Url = ASCII.GetString(buffer.Slice(0, length));
if (rented != null)
{
ArrayPool<byte>.Shared.Return(rented);
}
return base64Url;
}
/// <summary>
/// Encodes the specified binary as a Base 64 URL.
/// </summary>
/// <param name="binary">The binary to encode.</param>
/// <returns>The encoded text in the Base 64 URL format.</returns>
public static string Encode(byte[] binary) => Encode(binary.AsSpan());
/// <summary>
/// Encodes the specified text as a Base 64 URL.
/// </summary>
/// <param name="text">The text to encode.</param>
/// <returns>The encoded text in the Base 64 URL format.</returns>
public static string Encode(string text)
{
var bufferSize = UTF8.GetMaxByteCount(text.Length);
var rented = default(byte[]);
Span<byte> buffer = bufferSize <= RentSize ? stackalloc byte[bufferSize] : Rent(bufferSize, ref rented);
var length = UTF8.GetBytes(text.AsSpan(), buffer);
var base64Url = Encode(buffer.Slice(0, length));
if (rented != null)
{
ArrayPool<byte>.Shared.Return(rented);
}
return base64Url;
}
/// <summary>
/// Decodes the specified Base 64 URL as plain text.
/// </summary>
/// <param name="base64Url">The Base 64 URL encoded value to decode.</param>
/// <returns>The decoded, plain text.</returns>
public static string Decode(string base64Url)
{
var buffer = base64Url.AsSpan();
var length = buffer.Length;
var padding = ComputeRequiredPadding(length);
var inputSize = length + padding;
var outputSize = Base64UrlEncoding.GetMaxDecodedFromUtf8Length(inputSize);
var rentedInput = default(byte[]);
var rentedOutput = default(byte[]);
Span<byte> input = inputSize <= RentSize ? stackalloc byte[inputSize] : Rent(inputSize, ref rentedInput);
Span<byte> output = outputSize <= RentSize ? stackalloc byte[outputSize] : Rent(outputSize, ref rentedOutput);
ASCII.GetBytes(buffer, input);
AppendPadding(input, padding);
Base64UrlEncoding.DecodeFromUtf8(input, output, out var consumed, out var written);
if (rentedInput != null)
{
ArrayPool<byte>.Shared.Return(rentedInput);
}
var text = UTF8.GetString(output.Slice(0, written));
if (rentedOutput != null)
{
ArrayPool<byte>.Shared.Return(rentedOutput);
}
return text;
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static Span<byte> Rent(int size, [AllowNull] ref byte[] rented) =>
new Span<byte>(rented = ArrayPool<byte>.Shared.Rent(size), 0, size);
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static int LengthWithoutPadding(Span<byte> output, int size)
{
if (output[^2] == Pad)
{
return size - 2;
}
if (output[^1] == Pad)
{
return size - 1;
}
return size;
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static int ComputeRequiredPadding(int length) => 4 - (length % 4);
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static void AppendPadding(Span<byte> input, int amount) =>
input.Slice(input.Length - amount, amount).Fill(Pad);
}
}
namespace Microsoft.Example.Text
{
using System;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
internal static partial class Base64UrlEncoding
{
private static TVector ReadVector<TVector>(ReadOnlySpan<sbyte> data)
{
ref sbyte tmp = ref MemoryMarshal.GetReference(data);
return Unsafe.As<sbyte, TVector>(ref tmp);
}
}
}
namespace Microsoft.Example.Text
{
using System;
using System.Buffers;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.X86;
internal static partial class Base64UrlEncoding
{
public static unsafe OperationStatus DecodeFromUtf8(ReadOnlySpan<byte> utf8, Span<byte> bytes, out int bytesConsumed, out int bytesWritten, bool isFinalBlock = true)
{
if (utf8.IsEmpty)
{
bytesConsumed = 0;
bytesWritten = 0;
return OperationStatus.Done;
}
fixed (byte* srcBytes = &MemoryMarshal.GetReference(utf8))
fixed (byte* destBytes = &MemoryMarshal.GetReference(bytes))
{
int srcLength = utf8.Length & ~0x3; // only decode input up to the closest multiple of 4.
int destLength = bytes.Length;
int maxSrcLength = srcLength;
int decodedLength = GetMaxDecodedFromUtf8Length(srcLength);
// max. 2 padding chars
if (destLength < decodedLength - 2)
{
// For overflow see comment below
maxSrcLength = destLength / 3 * 4;
}
byte* src = srcBytes;
byte* dest = destBytes;
byte* srcEnd = srcBytes + (uint)srcLength;
byte* srcMax = srcBytes + (uint)maxSrcLength;
if (maxSrcLength >= 24)
{
byte* end = srcMax - 45;
if (Avx2.IsSupported && (end >= src))
{
Avx2Decode(ref src, ref dest, end, maxSrcLength, destLength, srcBytes, destBytes);
if (src == srcEnd)
{
goto DoneExit;
}
}
end = srcMax - 24;
if (Ssse3.IsSupported && (end >= src))
{
Ssse3Decode(ref src, ref dest, end, maxSrcLength, destLength, srcBytes, destBytes);
if (src == srcEnd)
{
goto DoneExit;
}
}
}
// Last bytes could have padding characters, so process them separately and treat them as valid only if isFinalBlock is true
// if isFinalBlock is false, padding characters are considered invalid
int skipLastChunk = isFinalBlock ? 4 : 0;
if (destLength >= decodedLength)
{
maxSrcLength = srcLength - skipLastChunk;
}
else
{
// This should never overflow since destLength here is less than int.MaxValue / 4 * 3 (i.e. 1610612733)
// Therefore, (destLength / 3) * 4 will always be less than 2147483641
maxSrcLength = (destLength / 3) * 4;
}
ref sbyte decodingMap = ref MemoryMarshal.GetReference(s_decodingMap);
srcMax = srcBytes + (uint)maxSrcLength;
while (src < srcMax)
{
int result = Decode(src, ref decodingMap);
if (result < 0)
{
goto InvalidDataExit;
}
WriteThreeLowOrderBytes(dest, result);
src += 4;
dest += 3;
}
if (maxSrcLength != srcLength - skipLastChunk)
{
goto DestinationTooSmallExit;
}
// If input is less than 4 bytes, srcLength == sourceIndex == 0
// If input is not a multiple of 4, sourceIndex == srcLength != 0
if (src == srcEnd)
{
if (isFinalBlock)
{
goto InvalidDataExit;
}
goto NeedMoreDataExit;
}
// if isFinalBlock is false, we will never reach this point
// Handle last four bytes. There are 0, 1, 2 padding chars.
uint t0 = srcEnd[-4];
uint t1 = srcEnd[-3];
uint t2 = srcEnd[-2];
uint t3 = srcEnd[-1];
int i0 = Unsafe.Add(ref decodingMap, (IntPtr)t0);
int i1 = Unsafe.Add(ref decodingMap, (IntPtr)t1);
i0 <<= 18;
i1 <<= 12;
i0 |= i1;
byte* destMax = destBytes + (uint)destLength;
if (t3 != EncodingPad)
{
int i2 = Unsafe.Add(ref decodingMap, (IntPtr)t2);
int i3 = Unsafe.Add(ref decodingMap, (IntPtr)t3);
i2 <<= 6;
i0 |= i3;
i0 |= i2;
if (i0 < 0)
{
goto InvalidDataExit;
}
if (dest + 3 > destMax)
{
goto DestinationTooSmallExit;
}
WriteThreeLowOrderBytes(dest, i0);
dest += 3;
}
else if (t2 != EncodingPad)
{
int i2 = Unsafe.Add(ref decodingMap, (IntPtr)t2);
i2 <<= 6;
i0 |= i2;
if (i0 < 0)
{
goto InvalidDataExit;
}
if (dest + 2 > destMax)
{
goto DestinationTooSmallExit;
}
dest[0] = (byte)(i0 >> 16);
dest[1] = (byte)(i0 >> 8);
dest += 2;
}
else
{
if (i0 < 0)
{
goto InvalidDataExit;
}
if (dest + 1 > destMax)
{
goto DestinationTooSmallExit;
}
dest[0] = (byte)(i0 >> 16);
dest += 1;
}
src += 4;
if (srcLength != utf8.Length)
{
goto InvalidDataExit;
}
DoneExit:
bytesConsumed = (int)(src - srcBytes);
bytesWritten = (int)(dest - destBytes);
return OperationStatus.Done;
DestinationTooSmallExit:
if (srcLength != utf8.Length && isFinalBlock)
{
goto InvalidDataExit; // if input is not a multiple of 4, and there is no more data, return invalid data instead
}
bytesConsumed = (int)(src - srcBytes);
bytesWritten = (int)(dest - destBytes);
return OperationStatus.DestinationTooSmall;
NeedMoreDataExit:
bytesConsumed = (int)(src - srcBytes);
bytesWritten = (int)(dest - destBytes);
return OperationStatus.NeedMoreData;
InvalidDataExit:
bytesConsumed = (int)(src - srcBytes);
bytesWritten = (int)(dest - destBytes);
return OperationStatus.InvalidData;
}
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static int GetMaxDecodedFromUtf8Length(int length)
{
if (length < 0)
{
throw new ArgumentOutOfRangeException(nameof(length));
}
return (length >> 2) * 3;
}
public static unsafe OperationStatus DecodeFromUtf8InPlace(Span<byte> buffer, out int bytesWritten)
{
if (buffer.IsEmpty)
{
bytesWritten = 0;
return OperationStatus.Done;
}
fixed (byte* bufferBytes = &MemoryMarshal.GetReference(buffer))
{
int bufferLength = buffer.Length;
uint sourceIndex = 0;
uint destIndex = 0;
// only decode input if it is a multiple of 4
if (bufferLength != ((bufferLength >> 2) * 4))
{
goto InvalidExit;
}
if (bufferLength == 0)
{
goto DoneExit;
}
ref sbyte decodingMap = ref MemoryMarshal.GetReference(s_decodingMap);
while (sourceIndex < bufferLength - 4)
{
int result = Decode(bufferBytes + sourceIndex, ref decodingMap);
if (result < 0)
{
goto InvalidExit;
}
WriteThreeLowOrderBytes(bufferBytes + destIndex, result);
destIndex += 3;
sourceIndex += 4;
}
uint t0 = bufferBytes[bufferLength - 4];
uint t1 = bufferBytes[bufferLength - 3];
uint t2 = bufferBytes[bufferLength - 2];
uint t3 = bufferBytes[bufferLength - 1];
int i0 = Unsafe.Add(ref decodingMap, (IntPtr)t0);
int i1 = Unsafe.Add(ref decodingMap, (IntPtr)t1);
i0 <<= 18;
i1 <<= 12;
i0 |= i1;
if (t3 != EncodingPad)
{
int i2 = Unsafe.Add(ref decodingMap, (IntPtr)t2);
int i3 = Unsafe.Add(ref decodingMap, (IntPtr)t3);
i2 <<= 6;
i0 |= i3;
i0 |= i2;
if (i0 < 0)
{
goto InvalidExit;
}
WriteThreeLowOrderBytes(bufferBytes + destIndex, i0);
destIndex += 3;
}
else if (t2 != EncodingPad)
{
int i2 = Unsafe.Add(ref decodingMap, (IntPtr)t2);
i2 <<= 6;
i0 |= i2;
if (i0 < 0)
{
goto InvalidExit;
}
bufferBytes[destIndex] = (byte)(i0 >> 16);
bufferBytes[destIndex + 1] = (byte)(i0 >> 8);
destIndex += 2;
}
else
{
if (i0 < 0)
{
goto InvalidExit;
}
bufferBytes[destIndex] = (byte)(i0 >> 16);
destIndex += 1;
}
DoneExit:
bytesWritten = (int)destIndex;
return OperationStatus.Done;
InvalidExit:
bytesWritten = (int)destIndex;
return OperationStatus.InvalidData;
}
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static unsafe void Avx2Decode(ref byte* srcBytes, ref byte* destBytes, byte* srcEnd, int sourceLength, int destLength, byte* srcStart, byte* destStart)
{
// If we have AVX2 support, pick off 32 bytes at a time for as long as we can,
// but make sure that we quit before seeing any == markers at the end of the
// string. Also, because we write 8 zeroes at the end of the output, ensure
// that there are at least 11 valid bytes of input data remaining to close the
// gap. 32 + 2 + 11 = 45 bytes.
// See SSSE3-version below for an explanation of how the code works.
// The JIT won't hoist these "constants", so help it
Vector256<sbyte> lutHi = ReadVector<Vector256<sbyte>>(s_avxDecodeLutHi);
Vector256<sbyte> lutLo = ReadVector<Vector256<sbyte>>(s_avxDecodeLutLo);
Vector256<sbyte> lutShift = ReadVector<Vector256<sbyte>>(s_avxDecodeLutShift);
Vector256<sbyte> mask2F = Vector256.Create((sbyte)'/');
Vector256<sbyte> mergeConstant0 = Vector256.Create(0x01400140).AsSByte();
Vector256<short> mergeConstant1 = Vector256.Create(0x00011000).AsInt16();
Vector256<sbyte> packBytesInLaneMask = ReadVector<Vector256<sbyte>>(s_avxDecodePackBytesInLaneMask);
Vector256<int> packLanesControl = ReadVector<Vector256<sbyte>>(s_avxDecodePackLanesControl).AsInt32();
byte* src = srcBytes;
byte* dest = destBytes;
//while (remaining >= 45)
do
{
Vector256<sbyte> str = Avx.LoadVector256(src).AsSByte();
Vector256<sbyte> hiNibbles = Avx2.And(Avx2.ShiftRightLogical(str.AsInt32(), 4).AsSByte(), mask2F);
Vector256<sbyte> loNibbles = Avx2.And(str, mask2F);
Vector256<sbyte> hi = Avx2.Shuffle(lutHi, hiNibbles);
Vector256<sbyte> lo = Avx2.Shuffle(lutLo, loNibbles);
if (!Avx.TestZ(lo, hi))
{
break;
}
Vector256<sbyte> eq2F = Avx2.CompareEqual(str, mask2F);
Vector256<sbyte> shift = Avx2.Shuffle(lutShift, Avx2.Add(eq2F, hiNibbles));
str = Avx2.Add(str, shift);
// in, lower lane, bits, upper case are most significant bits, lower case are least significant bits:
// 00llllll 00kkkkLL 00jjKKKK 00JJJJJJ
// 00iiiiii 00hhhhII 00ggHHHH 00GGGGGG
// 00ffffff 00eeeeFF 00ddEEEE 00DDDDDD
// 00cccccc 00bbbbCC 00aaBBBB 00AAAAAA
Vector256<short> merge_ab_and_bc = Avx2.MultiplyAddAdjacent(str.AsByte(), mergeConstant0);
// 0000kkkk LLllllll 0000JJJJ JJjjKKKK
// 0000hhhh IIiiiiii 0000GGGG GGggHHHH
// 0000eeee FFffffff 0000DDDD DDddEEEE
// 0000bbbb CCcccccc 0000AAAA AAaaBBBB
Vector256<int> output = Avx2.MultiplyAddAdjacent(merge_ab_and_bc, mergeConstant1);
// 00000000 JJJJJJjj KKKKkkkk LLllllll
// 00000000 GGGGGGgg HHHHhhhh IIiiiiii
// 00000000 DDDDDDdd EEEEeeee FFffffff
// 00000000 AAAAAAaa BBBBbbbb CCcccccc
// Pack bytes together in each lane:
output = Avx2.Shuffle(output.AsSByte(), packBytesInLaneMask).AsInt32();
// 00000000 00000000 00000000 00000000
// LLllllll KKKKkkkk JJJJJJjj IIiiiiii
// HHHHhhhh GGGGGGgg FFffffff EEEEeeee
// DDDDDDdd CCcccccc BBBBbbbb AAAAAAaa
// Pack lanes
str = Avx2.PermuteVar8x32(output, packLanesControl).AsSByte();
Avx.Store(dest, str.AsByte());
src += 32;
dest += 24;
}
while (src <= srcEnd);
srcBytes = src;
destBytes = dest;
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static unsafe void Ssse3Decode(ref byte* srcBytes, ref byte* destBytes, byte* srcEnd, int sourceLength, int destLength, byte* srcStart, byte* destStart)
{
// If we have SSSE3 support, pick off 16 bytes at a time for as long as we can,
// but make sure that we quit before seeing any == markers at the end of the
// string. Also, because we write four zeroes at the end of the output, ensure
// that there are at least 6 valid bytes of input data remaining to close the
// gap. 16 + 2 + 6 = 24 bytes.
// The input consists of six character sets in the Base64 alphabet,
// which we need to map back to the 6-bit values they represent.
// There are three ranges, two singles, and then there's the rest.
//
// # From To Add Characters
// 1 [43] [62] +19 +
// 2 [47] [63] +16 /
// 3 [48..57] [52..61] +4 0..9
// 4 [65..90] [0..25] -65 A..Z
// 5 [97..122] [26..51] -71 a..z
// (6) Everything else => invalid input
// We will use LUTS for character validation & offset computation
// Remember that 0x2X and 0x0X are the same index for _mm_shuffle_epi8,
// this allows to mask with 0x2F instead of 0x0F and thus save one constant declaration (register and/or memory access)
// For offsets:
// Perfect hash for lut = ((src>>4)&0x2F)+((src==0x2F)?0xFF:0x00)
// 0000 = garbage
// 0001 = /
// 0010 = +
// 0011 = 0-9
// 0100 = A-Z
// 0101 = A-Z
// 0110 = a-z
// 0111 = a-z
// 1000 >= garbage
// For validation, here's the table.
// A character is valid if and only if the AND of the 2 lookups equals 0:
// hi \ lo 0000 0001 0010 0011 0100 0101 0110 0111 1000 1001 1010 1011 1100 1101 1110 1111
// LUT 0x15 0x11 0x11 0x11 0x11 0x11 0x11 0x11 0x11 0x11 0x13 0x1A 0x1B 0x1B 0x1B 0x1A
// 0000 0X10 char NUL SOH STX ETX EOT ENQ ACK BEL BS HT LF VT FF CR SO SI
// andlut 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10
// 0001 0x10 char DLE DC1 DC2 DC3 DC4 NAK SYN ETB CAN EM SUB ESC FS GS RS US
// andlut 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10
// 0010 0x01 char ! " # $ % & ' ( ) * + , - . /
// andlut 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x00 0x01 0x01 0x01 0x00
// 0011 0x02 char 0 1 2 3 4 5 6 7 8 9 : ; < = > ?
// andlut 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x02 0x02 0x02 0x02 0x02 0x02
// 0100 0x04 char @ A B C D E F G H I J K L M N 0
// andlut 0x04 0x00 0x00 0x00 0X00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00
// 0101 0x08 char P Q R S T U V W X Y Z [ \ ] ^ _
// andlut 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x08 0x08 0x08 0x08 0x08
// 0110 0x04 char ` a b c d e f g h i j k l m n o
// andlut 0x04 0x00 0x00 0x00 0X00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00
// 0111 0X08 char p q r s t u v w x y z { | } ~
// andlut 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x08 0x08 0x08 0x08 0x08
// 1000 0x10 andlut 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10
// 1001 0x10 andlut 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10
// 1010 0x10 andlut 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10
// 1011 0x10 andlut 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10
// 1100 0x10 andlut 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10
// 1101 0x10 andlut 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10
// 1110 0x10 andlut 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10
// 1111 0x10 andlut 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10
// The JIT won't hoist these "constants", so help it
Vector128<sbyte> lutHi = ReadVector<Vector128<sbyte>>(s_sseDecodeLutHi);
Vector128<sbyte> lutLo = ReadVector<Vector128<sbyte>>(s_sseDecodeLutLo);
Vector128<sbyte> lutShift = ReadVector<Vector128<sbyte>>(s_sseDecodeLutShift);
Vector128<sbyte> mask2F = Vector128.Create((sbyte)'/');
Vector128<sbyte> mergeConstant0 = Vector128.Create(0x01400140).AsSByte();
Vector128<short> mergeConstant1 = Vector128.Create(0x00011000).AsInt16();
Vector128<sbyte> packBytesMask = ReadVector<Vector128<sbyte>>(s_sseDecodePackBytesMask);
Vector128<sbyte> zero = Vector128<sbyte>.Zero;
byte* src = srcBytes;
byte* dest = destBytes;
//while (remaining >= 24)
do
{
Vector128<sbyte> str = Sse2.LoadVector128(src).AsSByte();
// lookup
Vector128<sbyte> hiNibbles = Sse2.And(Sse2.ShiftRightLogical(str.AsInt32(), 4).AsSByte(), mask2F);
Vector128<sbyte> loNibbles = Sse2.And(str, mask2F);
Vector128<sbyte> hi = Ssse3.Shuffle(lutHi, hiNibbles);
Vector128<sbyte> lo = Ssse3.Shuffle(lutLo, loNibbles);
// Check for invalid input: if any "and" values from lo and hi are not zero,
// fall back on bytewise code to do error checking and reporting:
if (Sse2.MoveMask(Sse2.CompareGreaterThan(Sse2.And(lo, hi), zero)) != 0)
{
break;
}
Vector128<sbyte> eq2F = Sse2.CompareEqual(str, mask2F);
Vector128<sbyte> shift = Ssse3.Shuffle(lutShift, Sse2.Add(eq2F, hiNibbles));
// Now simply add the delta values to the input:
str = Sse2.Add(str, shift);
// in, bits, upper case are most significant bits, lower case are least significant bits
// 00llllll 00kkkkLL 00jjKKKK 00JJJJJJ
// 00iiiiii 00hhhhII 00ggHHHH 00GGGGGG
// 00ffffff 00eeeeFF 00ddEEEE 00DDDDDD
// 00cccccc 00bbbbCC 00aaBBBB 00AAAAAA
Vector128<short> merge_ab_and_bc = Ssse3.MultiplyAddAdjacent(str.AsByte(), mergeConstant0);
// 0000kkkk LLllllll 0000JJJJ JJjjKKKK
// 0000hhhh IIiiiiii 0000GGGG GGggHHHH
// 0000eeee FFffffff 0000DDDD DDddEEEE
// 0000bbbb CCcccccc 0000AAAA AAaaBBBB
Vector128<int> output = Sse2.MultiplyAddAdjacent(merge_ab_and_bc, mergeConstant1);
// 00000000 JJJJJJjj KKKKkkkk LLllllll
// 00000000 GGGGGGgg HHHHhhhh IIiiiiii
// 00000000 DDDDDDdd EEEEeeee FFffffff
// 00000000 AAAAAAaa BBBBbbbb CCcccccc
// Pack bytes together:
str = Ssse3.Shuffle(output.AsSByte(), packBytesMask);
// 00000000 00000000 00000000 00000000
// LLllllll KKKKkkkk JJJJJJjj IIiiiiii
// HHHHhhhh GGGGGGgg FFffffff EEEEeeee
// DDDDDDdd CCcccccc BBBBbbbb AAAAAAaa
Sse2.Store(dest, str.AsByte());
src += 16;
dest += 12;
}
while (src <= srcEnd);
srcBytes = src;
destBytes = dest;
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static unsafe int Decode(byte* encodedBytes, ref sbyte decodingMap)
{
uint t0 = encodedBytes[0];
uint t1 = encodedBytes[1];
uint t2 = encodedBytes[2];
uint t3 = encodedBytes[3];
int i0 = Unsafe.Add(ref decodingMap, (IntPtr)t0);
int i1 = Unsafe.Add(ref decodingMap, (IntPtr)t1);
int i2 = Unsafe.Add(ref decodingMap, (IntPtr)t2);
int i3 = Unsafe.Add(ref decodingMap, (IntPtr)t3);
i0 <<= 18;
i1 <<= 12;
i2 <<= 6;
i0 |= i3;
i1 |= i2;
i0 |= i1;
return i0;
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static unsafe void WriteThreeLowOrderBytes(byte* destination, int value)
{
destination[0] = (byte)(value >> 16);
destination[1] = (byte)(value >> 8);
destination[2] = (byte)value;
}
// Pre-computing this table using a custom string(s_characters) and GenerateDecodingMapAndVerify (found in tests)
private static ReadOnlySpan<sbyte> s_decodingMap => new sbyte[] {
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 62, -1, -1, //62 is placed at index 45 (for -)
52, 53, 54, 55, 56, 57, 58, 59, 60, 61, -1, -1, -1, -1, -1, -1, //52-61 are placed at index 48-57 (for 0-9), 64 at index 61 (for =)
-1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, 63, //0-25 are placed at index 65-90 (for A-Z), 63 at index 95 (for _)
-1, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40,
41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, -1, -1, -1, -1, -1, //26-51 are placed at index 97-122 (for a-z)
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, // Bytes over 122 ('z') are invalid and cannot be decoded
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, // Hence, padding the map with 255, which indicates invalid input
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
};
private static ReadOnlySpan<sbyte> s_sseDecodePackBytesMask => new sbyte[] {
2, 1, 0, 6,
5, 4, 10, 9,
8, 14, 13, 12,
-1, -1, -1, -1
};
private static ReadOnlySpan<sbyte> s_sseDecodeLutLo => new sbyte[] {
0x15, 0x11, 0x11, 0x11,
0x11, 0x11, 0x11, 0x11,
0x11, 0x11, 0x13, 0x1A,
0x1B, 0x1B, 0x1B, 0x1A
};
private static ReadOnlySpan<sbyte> s_sseDecodeLutHi => new sbyte[] {
0x10, 0x10, 0x01, 0x02,
0x04, 0x08, 0x04, 0x08,
0x10, 0x10, 0x10, 0x10,
0x10, 0x10, 0x10, 0x10
};
private static ReadOnlySpan<sbyte> s_sseDecodeLutShift => new sbyte[] {
0, 16, 19, 4,
-65, -65, -71, -71,
0, 0, 0, 0,
0, 0, 0, 0
};
private static ReadOnlySpan<sbyte> s_avxDecodePackBytesInLaneMask => new sbyte[] {
2, 1, 0, 6,
5, 4, 10, 9,
8, 14, 13, 12,
-1, -1, -1, -1,
2, 1, 0, 6,
5, 4, 10, 9,
8, 14, 13, 12,
-1, -1, -1, -1
};
private static ReadOnlySpan<sbyte> s_avxDecodePackLanesControl => new sbyte[] {
0, 0, 0, 0,
1, 0, 0, 0,
2, 0, 0, 0,
4, 0, 0, 0,
5, 0, 0, 0,
6, 0, 0, 0,
-1, -1, -1, -1,
-1, -1, -1, -1
};
private static ReadOnlySpan<sbyte> s_avxDecodeLutLo => new sbyte[] {
0x15, 0x11, 0x11, 0x11,
0x11, 0x11, 0x11, 0x11,
0x11, 0x11, 0x13, 0x1A,
0x1B, 0x1B, 0x1B, 0x1A,
0x15, 0x11, 0x11, 0x11,
0x11, 0x11, 0x11, 0x11,
0x11, 0x11, 0x13, 0x1A,
0x1B, 0x1B, 0x1B, 0x1A
};
private static ReadOnlySpan<sbyte> s_avxDecodeLutHi => new sbyte[] {
0x10, 0x10, 0x01, 0x02,
0x04, 0x08, 0x04, 0x08,
0x10, 0x10, 0x10, 0x10,
0x10, 0x10, 0x10, 0x10,
0x10, 0x10, 0x01, 0x02,
0x04, 0x08, 0x04, 0x08,
0x10, 0x10, 0x10, 0x10,
0x10, 0x10, 0x10, 0x10
};
private static ReadOnlySpan<sbyte> s_avxDecodeLutShift => new sbyte[] {
0, 16, 19, 4,
-65, -65, -71, -71,
0, 0, 0, 0,
0, 0, 0, 0,
0, 16, 19, 4,
-65, -65, -71, -71,
0, 0, 0, 0,
0, 0, 0, 0
};
}
}
namespace Microsoft.Example.Text
{
using System;
using System.Buffers;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.X86;
internal static partial class Base64UrlEncoding
{
public static unsafe OperationStatus EncodeToUtf8(ReadOnlySpan<byte> bytes, Span<byte> utf8, out int bytesConsumed, out int bytesWritten, bool isFinalBlock = true)
{
if (bytes.IsEmpty)
{
bytesConsumed = 0;
bytesWritten = 0;
return OperationStatus.Done;
}
fixed (byte* srcBytes = &MemoryMarshal.GetReference(bytes))
fixed (byte* destBytes = &MemoryMarshal.GetReference(utf8))
{
int srcLength = bytes.Length;
int destLength = utf8.Length;
int maxSrcLength;
if (srcLength <= MaximumEncodeLength && destLength >= GetMaxEncodedToUtf8Length(srcLength))
{
maxSrcLength = srcLength;
}
else
{
maxSrcLength = (destLength >> 2) * 3;
}
byte* src = srcBytes;
byte* dest = destBytes;
byte* srcEnd = srcBytes + (uint)srcLength;
byte* srcMax = srcBytes + (uint)maxSrcLength;
if (maxSrcLength >= 16)
{
byte* end = srcMax - 32;
if (Avx2.IsSupported && (end >= src))
{
Avx2Encode(ref src, ref dest, end, maxSrcLength, destLength, srcBytes, destBytes);
if (src == srcEnd)
{
goto DoneExit;
}
}
end = srcMax - 16;
if (Ssse3.IsSupported && (end >= src))
{
Ssse3Encode(ref src, ref dest, end, maxSrcLength, destLength, srcBytes, destBytes);
if (src == srcEnd)
{
goto DoneExit;
}
}
}
ref byte encodingMap = ref MemoryMarshal.GetReference(s_encodingMap);
uint result = 0;
srcMax -= 2;
while (src < srcMax)
{
result = Encode(src, ref encodingMap);
Unsafe.WriteUnaligned(dest, result);
src += 3;
dest += 4;
}
if (srcMax + 2 != srcEnd)
{
goto DestinationTooSmallExit;
}
if (!isFinalBlock)
{
goto NeedMoreData;
}
if (src + 1 == srcEnd)
{
result = EncodeAndPadTwo(src, ref encodingMap);
Unsafe.WriteUnaligned(dest, result);
src += 1;
dest += 4;
}
else if (src + 2 == srcEnd)
{
result = EncodeAndPadOne(src, ref encodingMap);
Unsafe.WriteUnaligned(dest, result);
src += 2;
dest += 4;
}
DoneExit:
bytesConsumed = (int)(src - srcBytes);
bytesWritten = (int)(dest - destBytes);
return OperationStatus.Done;
DestinationTooSmallExit:
bytesConsumed = (int)(src - srcBytes);
bytesWritten = (int)(dest - destBytes);
return OperationStatus.DestinationTooSmall;
NeedMoreData:
bytesConsumed = (int)(src - srcBytes);
bytesWritten = (int)(dest - destBytes);
return OperationStatus.NeedMoreData;
}
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static int GetMaxEncodedToUtf8Length(int length)
{
if ((uint)length > MaximumEncodeLength)
{
throw new ArgumentOutOfRangeException(nameof(length));
}
return ((length + 2) / 3) * 4;
}
public static unsafe OperationStatus EncodeToUtf8InPlace(Span<byte> buffer, int dataLength, out int bytesWritten)
{
if (buffer.IsEmpty)
{
bytesWritten = 0;
return OperationStatus.Done;
}
fixed (byte* bufferBytes = &MemoryMarshal.GetReference(buffer))
{
int encodedLength = GetMaxEncodedToUtf8Length(dataLength);
if (buffer.Length < encodedLength)
{
goto FalseExit;
}
int leftover = dataLength - ((dataLength / 3) * 3); // how many bytes after packs of 3
uint destinationIndex = (uint)(encodedLength - 4);
uint sourceIndex = (uint)(dataLength - leftover);
uint result = 0;
ref byte encodingMap = ref MemoryMarshal.GetReference(s_encodingMap);
// encode last pack to avoid conditional in the main loop
if (leftover != 0)
{
if (leftover == 1)
{
result = EncodeAndPadTwo(bufferBytes + sourceIndex, ref encodingMap);
}
else
{
result = EncodeAndPadOne(bufferBytes + sourceIndex, ref encodingMap);
}
Unsafe.WriteUnaligned(bufferBytes + destinationIndex, result);
destinationIndex -= 4;
}
sourceIndex -= 3;
while ((int)sourceIndex >= 0)
{
result = Encode(bufferBytes + sourceIndex, ref encodingMap);
Unsafe.WriteUnaligned(bufferBytes + destinationIndex, result);
destinationIndex -= 4;
sourceIndex -= 3;
}
bytesWritten = encodedLength;
return OperationStatus.Done;
FalseExit:
bytesWritten = 0;
return OperationStatus.DestinationTooSmall;
}
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static unsafe void Avx2Encode(ref byte* srcBytes, ref byte* destBytes, byte* srcEnd, int sourceLength, int destLength, byte* srcStart, byte* destStart)
{
// If we have AVX2 support, pick off 24 bytes at a time for as long as we can.
// But because we read 32 bytes at a time, ensure we have enough room to do a
// full 32-byte read without segfaulting.
// translation from SSSE3 into AVX2 of procedure
// This one works with shifted (4 bytes) input in order to
// be able to work efficiently in the 2 128-bit lanes
// srcBytes, bytes MSB to LSB:
// 0 0 0 0 x w v u t s r q p o n m
// l k j i h g f e d c b a 0 0 0 0
// The JIT won't hoist these "constants", so help it
Vector256<sbyte> shuffleVec = ReadVector<Vector256<sbyte>>(s_avxEncodeShuffleVec);
Vector256<sbyte> maskAC = Vector256.Create(0x0fc0fc00).AsSByte();
Vector256<sbyte> maskBB = Vector256.Create(0x003f03f0).AsSByte();
Vector256<ushort> shiftAC = Vector256.Create(0x04000040).AsUInt16();
Vector256<short> shiftBB = Vector256.Create(0x01000010).AsInt16();
Vector256<byte> const51 = Vector256.Create((byte)51);
Vector256<sbyte> const25 = Vector256.Create((sbyte)25);
Vector256<sbyte> lut = ReadVector<Vector256<sbyte>>(s_avxEncodeLut);
byte* src = srcBytes;
byte* dest = destBytes;
// first load is done at c-0 not to get a segfault
Vector256<sbyte> str = Avx.LoadVector256(src).AsSByte();
// shift by 4 bytes, as required by Reshuffle
str = Avx2.PermuteVar8x32(str.AsInt32(), ReadVector<Vector256<sbyte>>(s_avxEncodePermuteVec).AsInt32()).AsSByte();
// Next loads are done at src-4, as required by Reshuffle, so shift it once
src -= 4;
while (true)
{
// Reshuffle
str = Avx2.Shuffle(str, shuffleVec);
// str, bytes MSB to LSB:
// w x v w
// t u s t
// q r p q
// n o m n
// k l j k
// h i g h
// e f d e
// b c a b
Vector256<sbyte> t0 = Avx2.And(str, maskAC);
// bits, upper case are most significant bits, lower case are least significant bits.
// 0000wwww XX000000 VVVVVV00 00000000
// 0000tttt UU000000 SSSSSS00 00000000
// 0000qqqq RR000000 PPPPPP00 00000000
// 0000nnnn OO000000 MMMMMM00 00000000
// 0000kkkk LL000000 JJJJJJ00 00000000
// 0000hhhh II000000 GGGGGG00 00000000
// 0000eeee FF000000 DDDDDD00 00000000
// 0000bbbb CC000000 AAAAAA00 00000000
Vector256<sbyte> t2 = Avx2.And(str, maskBB);
// 00000000 00xxxxxx 000000vv WWWW0000
// 00000000 00uuuuuu 000000ss TTTT0000
// 00000000 00rrrrrr 000000pp QQQQ0000
// 00000000 00oooooo 000000mm NNNN0000
// 00000000 00llllll 000000jj KKKK0000
// 00000000 00iiiiii 000000gg HHHH0000
// 00000000 00ffffff 000000dd EEEE0000
// 00000000 00cccccc 000000aa BBBB0000
Vector256<ushort> t1 = Avx2.MultiplyHigh(t0.AsUInt16(), shiftAC);
// 00000000 00wwwwXX 00000000 00VVVVVV
// 00000000 00ttttUU 00000000 00SSSSSS
// 00000000 00qqqqRR 00000000 00PPPPPP
// 00000000 00nnnnOO 00000000 00MMMMMM
// 00000000 00kkkkLL 00000000 00JJJJJJ
// 00000000 00hhhhII 00000000 00GGGGGG
// 00000000 00eeeeFF 00000000 00DDDDDD
// 00000000 00bbbbCC 00000000 00AAAAAA
Vector256<short> t3 = Avx2.MultiplyLow(t2.AsInt16(), shiftBB);
// 00xxxxxx 00000000 00vvWWWW 00000000
// 00uuuuuu 00000000 00ssTTTT 00000000
// 00rrrrrr 00000000 00ppQQQQ 00000000
// 00oooooo 00000000 00mmNNNN 00000000
// 00llllll 00000000 00jjKKKK 00000000
// 00iiiiii 00000000 00ggHHHH 00000000
// 00ffffff 00000000 00ddEEEE 00000000
// 00cccccc 00000000 00aaBBBB 00000000
str = Avx2.Or(t1.AsSByte(), t3.AsSByte());
// 00xxxxxx 00wwwwXX 00vvWWWW 00VVVVVV
// 00uuuuuu 00ttttUU 00ssTTTT 00SSSSSS
// 00rrrrrr 00qqqqRR 00ppQQQQ 00PPPPPP
// 00oooooo 00nnnnOO 00mmNNNN 00MMMMMM
// 00llllll 00kkkkLL 00jjKKKK 00JJJJJJ
// 00iiiiii 00hhhhII 00ggHHHH 00GGGGGG
// 00ffffff 00eeeeFF 00ddEEEE 00DDDDDD
// 00cccccc 00bbbbCC 00aaBBBB 00AAAAAA
// Translation
// LUT contains Absolute offset for all ranges:
// Translate values 0..63 to the Base64 alphabet. There are five sets:
// # From To Abs Index Characters
// 0 [0..25] [65..90] +65 0 ABCDEFGHIJKLMNOPQRSTUVWXYZ
// 1 [26..51] [97..122] +71 1 abcdefghijklmnopqrstuvwxyz
// 2 [52..61] [48..57] -4 [2..11] 0123456789
// 3 [62] [43] -19 12 +
// 4 [63] [47] -16 13 /
// Create LUT indices from input:
// the index for range #0 is right, others are 1 less than expected:
Vector256<byte> indices = Avx2.SubtractSaturate(str.AsByte(), const51);
// mask is 0xFF (-1) for range #[1..4] and 0x00 for range #0:
Vector256<sbyte> mask = Avx2.CompareGreaterThan(str, const25);
// substract -1, so add 1 to indices for range #[1..4], All indices are now correct:
Vector256<sbyte> tmp = Avx2.Subtract(indices.AsSByte(), mask);
// Add offsets to input values:
str = Avx2.Add(str, Avx2.Shuffle(lut, tmp));
Avx.Store(dest, str.AsByte());
src += 24;
dest += 32;
if (src > srcEnd)
{
break;
}
// Load at src-4, as required by Reshuffle (already shifted by -4)
str = Avx.LoadVector256(src).AsSByte();
}
srcBytes = src + 4;
destBytes = dest;
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static unsafe void Ssse3Encode(ref byte* srcBytes, ref byte* destBytes, byte* srcEnd, int sourceLength, int destLength, byte* srcStart, byte* destStart)
{
// If we have SSSE3 support, pick off 12 bytes at a time for as long as we can.
// But because we read 16 bytes at a time, ensure we have enough room to do a
// full 16-byte read without segfaulting.
// srcBytes, bytes MSB to LSB:
// 0 0 0 0 l k j i h g f e d c b a
// The JIT won't hoist these "constants", so help it
Vector128<sbyte> shuffleVec = ReadVector<Vector128<sbyte>>(s_sseEncodeShuffleVec);
Vector128<sbyte> maskAC = Vector128.Create(0x0fc0fc00).AsSByte();
Vector128<sbyte> maskBB = Vector128.Create(0x003f03f0).AsSByte();
Vector128<ushort> shiftAC = Vector128.Create(0x04000040).AsUInt16();
Vector128<short> shiftBB = Vector128.Create(0x01000010).AsInt16();
Vector128<byte> const51 = Vector128.Create((byte)51);
Vector128<sbyte> const25 = Vector128.Create((sbyte)25);
Vector128<sbyte> lut = ReadVector<Vector128<sbyte>>(s_sseEncodeLut);
byte* src = srcBytes;
byte* dest = destBytes;
//while (remaining >= 16)
do
{
Vector128<sbyte> str = Sse2.LoadVector128(src).AsSByte();
// Reshuffle
str = Ssse3.Shuffle(str, shuffleVec);
// str, bytes MSB to LSB:
// k l j k
// h i g h
// e f d e
// b c a b
Vector128<sbyte> t0 = Sse2.And(str, maskAC);
// bits, upper case are most significant bits, lower case are least significant bits
// 0000kkkk LL000000 JJJJJJ00 00000000
// 0000hhhh II000000 GGGGGG00 00000000
// 0000eeee FF000000 DDDDDD00 00000000
// 0000bbbb CC000000 AAAAAA00 00000000
Vector128<sbyte> t2 = Sse2.And(str, maskBB);
// 00000000 00llllll 000000jj KKKK0000
// 00000000 00iiiiii 000000gg HHHH0000
// 00000000 00ffffff 000000dd EEEE0000
// 00000000 00cccccc 000000aa BBBB0000
Vector128<ushort> t1 = Sse2.MultiplyHigh(t0.AsUInt16(), shiftAC);
// 00000000 00kkkkLL 00000000 00JJJJJJ
// 00000000 00hhhhII 00000000 00GGGGGG
// 00000000 00eeeeFF 00000000 00DDDDDD
// 00000000 00bbbbCC 00000000 00AAAAAA
Vector128<short> t3 = Sse2.MultiplyLow(t2.AsInt16(), shiftBB);
// 00llllll 00000000 00jjKKKK 00000000
// 00iiiiii 00000000 00ggHHHH 00000000
// 00ffffff 00000000 00ddEEEE 00000000
// 00cccccc 00000000 00aaBBBB 00000000
str = Sse2.Or(t1.AsSByte(), t3.AsSByte());
// 00llllll 00kkkkLL 00jjKKKK 00JJJJJJ
// 00iiiiii 00hhhhII 00ggHHHH 00GGGGGG
// 00ffffff 00eeeeFF 00ddEEEE 00DDDDDD
// 00cccccc 00bbbbCC 00aaBBBB 00AAAAAA
// Translation
// LUT contains Absolute offset for all ranges:
// Translate values 0..63 to the Base64 alphabet. There are five sets:
// # From To Abs Index Characters
// 0 [0..25] [65..90] +65 0 ABCDEFGHIJKLMNOPQRSTUVWXYZ
// 1 [26..51] [97..122] +71 1 abcdefghijklmnopqrstuvwxyz
// 2 [52..61] [48..57] -4 [2..11] 0123456789
// 3 [62] [43] -19 12 +
// 4 [63] [47] -16 13 /
// Create LUT indices from input:
// the index for range #0 is right, others are 1 less than expected:
Vector128<byte> indices = Sse2.SubtractSaturate(str.AsByte(), const51);
// mask is 0xFF (-1) for range #[1..4] and 0x00 for range #0:
Vector128<sbyte> mask = Sse2.CompareGreaterThan(str, const25);
// substract -1, so add 1 to indices for range #[1..4], All indices are now correct:
Vector128<sbyte> tmp = Sse2.Subtract(indices.AsSByte(), mask);
// Add offsets to input values:
str = Sse2.Add(str, Ssse3.Shuffle(lut, tmp));
Sse2.Store(dest, str.AsByte());
src += 12;
dest += 16;
}
while (src <= srcEnd);
srcBytes = src;
destBytes = dest;
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static unsafe uint Encode(byte* threeBytes, ref byte encodingMap)
{
uint t0 = threeBytes[0];
uint t1 = threeBytes[1];
uint t2 = threeBytes[2];
uint i = (t0 << 16) | (t1 << 8) | t2;
uint i0 = Unsafe.Add(ref encodingMap, (IntPtr)(i >> 18));
uint i1 = Unsafe.Add(ref encodingMap, (IntPtr)((i >> 12) & 0x3F));
uint i2 = Unsafe.Add(ref encodingMap, (IntPtr)((i >> 6) & 0x3F));
uint i3 = Unsafe.Add(ref encodingMap, (IntPtr)(i & 0x3F));
return i0 | (i1 << 8) | (i2 << 16) | (i3 << 24);
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static unsafe uint EncodeAndPadOne(byte* twoBytes, ref byte encodingMap)
{
uint t0 = twoBytes[0];
uint t1 = twoBytes[1];
uint i = (t0 << 16) | (t1 << 8);
uint i0 = Unsafe.Add(ref encodingMap, (IntPtr)(i >> 18));
uint i1 = Unsafe.Add(ref encodingMap, (IntPtr)((i >> 12) & 0x3F));
uint i2 = Unsafe.Add(ref encodingMap, (IntPtr)((i >> 6) & 0x3F));
return i0 | (i1 << 8) | (i2 << 16) | (EncodingPad << 24);
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static unsafe uint EncodeAndPadTwo(byte* oneByte, ref byte encodingMap)
{
uint t0 = oneByte[0];
uint i = t0 << 8;
uint i0 = Unsafe.Add(ref encodingMap, (IntPtr)(i >> 10));
uint i1 = Unsafe.Add(ref encodingMap, (IntPtr)((i >> 4) & 0x3F));
return i0 | (i1 << 8) | (EncodingPad << 16) | (EncodingPad << 24);
}
private const uint EncodingPad = '='; // '=', for padding
private const int MaximumEncodeLength = (int.MaxValue / 4) * 3; // 1610612733
// Pre-computing this table using a custom string(s_characters) and GenerateEncodingMapAndVerify (found in tests)
private static ReadOnlySpan<byte> s_encodingMap => new byte[] {
65, 66, 67, 68, 69, 70, 71, 72, //A..H
73, 74, 75, 76, 77, 78, 79, 80, //I..P
81, 82, 83, 84, 85, 86, 87, 88, //Q..X
89, 90, 97, 98, 99, 100, 101, 102, //Y..Z, a..f
103, 104, 105, 106, 107, 108, 109, 110, //g..n
111, 112, 113, 114, 115, 116, 117, 118, //o..v
119, 120, 121, 122, 48, 49, 50, 51, //w..z, 0..3
52, 53, 54, 55, 56, 57, 45, 95 //4..9, -, _
};
private static ReadOnlySpan<sbyte> s_sseEncodeShuffleVec => new sbyte[] {
1, 0, 2, 1,
4, 3, 5, 4,
7, 6, 8, 7,
10, 9, 11, 10
};
private static ReadOnlySpan<sbyte> s_sseEncodeLut => new sbyte[] {
65, 71, -4, -4,
-4, -4, -4, -4,
-4, -4, -4, -4,
-19, -16, 0, 0
};
private static ReadOnlySpan<sbyte> s_avxEncodePermuteVec => new sbyte[] {
0, 0, 0, 0,
0, 0, 0, 0,
1, 0, 0, 0,
2, 0, 0, 0,
3, 0, 0, 0,
4, 0, 0, 0,
5, 0, 0, 0,
6, 0, 0, 0
};
private static ReadOnlySpan<sbyte> s_avxEncodeShuffleVec => new sbyte[] {
5, 4, 6, 5,
8, 7, 9, 8,
11, 10, 12, 11,
14, 13, 15, 14,
1, 0, 2, 1,
4, 3, 5, 4,
7, 6, 8, 7,
10, 9, 11, 10
};
private static ReadOnlySpan<sbyte> s_avxEncodeLut => new sbyte[] {
65, 71, -4, -4,
-4, -4, -4, -4,
-4, -4, -4, -4,
-19, -16, 0, 0,
65, 71, -4, -4,
-4, -4, -4, -4,
-4, -4, -4, -4,
-19, -16, 0, 0
};
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment