Skip to content

Instantly share code, notes, and snippets.

@andrewmd5
Created March 15, 2023 18:28
Show Gist options
  • Save andrewmd5/0532fbe2ccd85e504abcd923d8eb94b3 to your computer and use it in GitHub Desktop.
Save andrewmd5/0532fbe2ccd85e504abcd923d8eb94b3 to your computer and use it in GitHub Desktop.
Detect if a byte-array or ReadOnlySpan<byte> contains only ASCII characters
using System;
using System.Buffers;
using System.Runtime.InteropServices;
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.Arm;
using System.Runtime.Intrinsics.X86;
using System.Text;
namespace idk
{
/// <summary>
/// Utility class for detecting whether a span of bytes or chars contains only ASCII
/// characters (values 0x00 through 0x7F). The byte overload dispatches to SSE2 on x86/x64,
/// AdvSimd on ARM64/ARM, and a scalar loop when no supported intrinsics are available.
/// </summary>
public static class AsciiChecker
{
    /// <summary>
    /// Checks whether the given <see cref="ReadOnlySpan{T}"/> of bytes contains only ASCII characters.
    /// Hardware intrinsics are used when the runtime reports them as supported; otherwise a
    /// software loop is used.
    /// </summary>
    /// <param name="data">The bytes to inspect. An empty span is considered ASCII.</param>
    /// <returns><c>true</c> if every byte is below 0x80; otherwise <c>false</c>.</returns>
    public static bool ContainsOnlyAscii(ReadOnlySpan<byte> data)
    {
        if (Sse2.IsSupported)
        {
            return ContainsOnlyAscii_X86(data);
        }

        if (AdvSimd.Arm64.IsSupported)
        {
            return ContainsOnlyAscii_Arm64(data);
        }

        if (AdvSimd.IsSupported)
        {
            return ContainsOnlyAscii_Arm(data);
        }

        return ContainsOnlyAscii_SoftwareFallback(data);
    }

    /// <summary>
    /// Checks whether the given <see cref="ReadOnlySpan{T}"/> of chars contains only ASCII characters.
    /// </summary>
    /// <param name="data">The UTF-16 code units to inspect. An empty span is considered ASCII.</param>
    /// <returns><c>true</c> if every char is U+007F or lower; otherwise <c>false</c>.</returns>
    /// <remarks>
    /// A UTF-16 code unit is ASCII exactly when its value is 0x7F or lower, so the chars are
    /// inspected directly. No transcoding to UTF-8 and no temporary buffer (stack or pooled)
    /// is required, and surrogate code units are simply reported as non-ASCII.
    /// </remarks>
    public static bool ContainsOnlyAscii(ReadOnlySpan<char> data)
    {
        for (int i = 0; i < data.Length; i++)
        {
            if (data[i] > 0x7F)
            {
                return false;
            }
        }

        return true;
    }

    /// <summary>
    /// Checks whether the given bytes contain only ASCII characters using ARM64 AdvSimd intrinsics.
    /// </summary>
    /// <param name="data">The bytes to inspect.</param>
    /// <returns><c>true</c> if every byte is below 0x80; otherwise <c>false</c>.</returns>
    private static unsafe bool ContainsOnlyAscii_Arm64(ReadOnlySpan<byte> data)
    {
        int length = data.Length;
        // Number of leading bytes that can be consumed in whole 16-byte vectors.
        int vectorizedLength = length - (length % Vector128<byte>.Count);

        fixed (byte* dataPtr = data)
        {
            for (int i = 0; i < vectorizedLength; i += Vector128<byte>.Count)
            {
                Vector128<byte> block = AdvSimd.LoadVector128(dataPtr + i);

                // Horizontal unsigned max over all 16 lanes: if the largest byte is above
                // 0x7F, at least one byte in the block is non-ASCII. Reducing across the
                // whole vector ensures the upper 8 bytes are inspected as well.
                if (AdvSimd.Arm64.MaxAcross(block).ToScalar() > 0x7F)
                {
                    return false;
                }
            }
        }

        // Fewer than 16 bytes remain; check them without relying on padded scratch memory.
        return ContainsOnlyAscii_SoftwareFallback(data.Slice(vectorizedLength));
    }

    /// <summary>
    /// Checks whether the given bytes contain only ASCII characters using ARM AdvSimd intrinsics
    /// on 64-bit vectors.
    /// </summary>
    /// <param name="data">The bytes to inspect.</param>
    /// <returns><c>true</c> if every byte is below 0x80; otherwise <c>false</c>.</returns>
    private static unsafe bool ContainsOnlyAscii_Arm(ReadOnlySpan<byte> data)
    {
        // The high bit of each of the 8 bytes packed into a 64-bit lane.
        const ulong HighBits = 0x8080808080808080UL;

        int length = data.Length;
        // Number of leading bytes that can be consumed in whole 8-byte vectors.
        int vectorizedLength = length - (length % Vector64<byte>.Count);

        fixed (byte* dataPtr = data)
        {
            for (int i = 0; i < vectorizedLength; i += Vector64<byte>.Count)
            {
                Vector64<byte> block = AdvSimd.LoadVector64(dataPtr + i);

                // Reinterpret all 8 bytes as one 64-bit integer so every byte's high bit
                // is tested, not just the first four.
                if ((block.AsUInt64().ToScalar() & HighBits) != 0)
                {
                    return false;
                }
            }
        }

        // Fewer than 8 bytes remain; check them without relying on padded scratch memory.
        return ContainsOnlyAscii_SoftwareFallback(data.Slice(vectorizedLength));
    }

    /// <summary>
    /// Checks whether the given bytes contain only ASCII characters using x86 SSE2 intrinsics.
    /// </summary>
    /// <param name="data">The bytes to inspect.</param>
    /// <returns><c>true</c> if every byte is below 0x80; otherwise <c>false</c>.</returns>
    private static unsafe bool ContainsOnlyAscii_X86(ReadOnlySpan<byte> data)
    {
        int length = data.Length;
        // Number of leading bytes that can be consumed in whole 16-byte vectors.
        int vectorizedLength = length - (length % Vector128<byte>.Count);

        fixed (byte* dataPtr = data)
        {
            for (int i = 0; i < vectorizedLength; i += Vector128<byte>.Count)
            {
                Vector128<byte> block = Sse2.LoadVector128(dataPtr + i);

                // MoveMask gathers the most significant bit of each byte into an int,
                // so a non-zero mask means at least one byte has its high bit set.
                if (Sse2.MoveMask(block) != 0)
                {
                    return false;
                }
            }
        }

        // The trailing (length % 16) bytes must be checked too; otherwise short inputs
        // and partial final blocks would always be reported as ASCII.
        return ContainsOnlyAscii_SoftwareFallback(data.Slice(vectorizedLength));
    }

    /// <summary>
    /// Checks whether the given bytes contain only ASCII characters using a plain loop.
    /// Also used by the vectorized paths to handle the bytes left over after the last full vector.
    /// </summary>
    /// <param name="data">The bytes to inspect. An empty span is considered ASCII.</param>
    /// <returns><c>true</c> if every byte is below 0x80; otherwise <c>false</c>.</returns>
    private static bool ContainsOnlyAscii_SoftwareFallback(ReadOnlySpan<byte> data)
    {
        for (int i = 0; i < data.Length; i++)
        {
            if (data[i] >= 0x80)
            {
                return false;
            }
        }

        return true;
    }
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment