Skip to content

Instantly share code, notes, and snippets.

@hypeartist
Last active May 23, 2020 22:52
Show Gist options
  • Save hypeartist/e4771578a77f08f2c88198130b53ef14 to your computer and use it in GitHub Desktop.
Save hypeartist/e4771578a77f08f2c88198130b53ef14 to your computer and use it in GitHub Desktop.
SimdQ
/// <summary>
/// Pixel-format constants plus precomputed SIMD helper data (byte-shuffle masks and a
/// coefficient table) shared by the vectorized blending code.
/// </summary>
/// <remarks>
/// The static data is built once by the static constructor; call <see cref="WarmUp"/> to force
/// that initialization ahead of the first use.
/// </remarks>
public unsafe struct PixelInfo
{
    /// <summary>Number of bits per color channel; also the shift used by the fixed-point blend math.</summary>
    public const int Shift = 8;

    /// <summary>Fixed-point scale, <c>1 &lt;&lt; Shift</c> (256).</summary>
    public const int Scale = 1 << Shift;

    /// <summary>Largest channel value, <c>Scale - 1</c> (255).</summary>
    public const int Mask = Scale - 1;

    /// <summary>Most significant bit of a channel, <c>1 &lt;&lt; (Shift - 1)</c> (128).</summary>
    public const int Msb = 1 << (Shift - 1);

    /// <summary>Constants describing the range of a coverage value.</summary>
    public struct Cover
    {
        public const int Shift = 8;
        public const int Size = 1 << Shift;
        public const int Mask = Size - 1;
        public const int Mask2 = Mask * Mask;
        public const int None = 0;
        public const int Full = Mask;
        public const int Full2 = Full * Full;
    }

    /// <summary>Size of one pixel in bytes.</summary>
    internal const int PixelSize = 4;

    // Size of one pixel in bits (32); used to place the second pixel's index in the upper half of a ulong.
    private const int PixelSizeBitCount = PixelSize << 3;

    // Byte offset of each channel inside a pixel.
    internal const int R = 2;
    internal const int G = 1;
    internal const int B = 0;
    internal const int A = 3;

    // Bit offset of each channel inside a pixel viewed as a 32-bit integer (byte offset * 8).
    internal const int ShiftR = R << 3;
    internal const int ShiftG = G << 3;
    internal const int ShiftB = B << 3;
    internal const int ShiftA = A << 3;

    // Byte-shuffle control words. Each ulong describes two 32-bit output lanes: the low byte of a lane
    // holds the index of the channel byte to fetch (channel offset + PixelSize * pixel number) and the
    // three upper bytes are 0xff, which makes the shuffle write zero there. The result is the selected
    // channel of each pixel zero-extended to a 32-bit integer.
    // The "56"/"78" words carry indices of 16 and above; they are only used in the upper half of the
    // 256-bit masks. NOTE(review): this relies on the 256-bit byte shuffle selecting within each
    // 128-bit lane using only the low four index bits - confirm against the instruction reference.
    private const ulong ShuffleMaskR12 = 0xffffff00_ffffff00 | ((ulong)(R + PixelSize * 1) << PixelSizeBitCount) | (R + PixelSize * 0);
    private const ulong ShuffleMaskR34 = 0xffffff00_ffffff00 | ((ulong)(R + PixelSize * 3) << PixelSizeBitCount) | (R + PixelSize * 2);
    private const ulong ShuffleMaskR56 = 0xffffff00_ffffff00 | ((ulong)(R + PixelSize * 5) << PixelSizeBitCount) | (R + PixelSize * 4);
    private const ulong ShuffleMaskR78 = 0xffffff00_ffffff00 | ((ulong)(R + PixelSize * 7) << PixelSizeBitCount) | (R + PixelSize * 6);
    private const ulong ShuffleMaskG12 = 0xffffff00_ffffff00 | ((ulong)(G + PixelSize * 1) << PixelSizeBitCount) | (G + PixelSize * 0);
    private const ulong ShuffleMaskG34 = 0xffffff00_ffffff00 | ((ulong)(G + PixelSize * 3) << PixelSizeBitCount) | (G + PixelSize * 2);
    private const ulong ShuffleMaskG56 = 0xffffff00_ffffff00 | ((ulong)(G + PixelSize * 5) << PixelSizeBitCount) | (G + PixelSize * 4);
    private const ulong ShuffleMaskG78 = 0xffffff00_ffffff00 | ((ulong)(G + PixelSize * 7) << PixelSizeBitCount) | (G + PixelSize * 6);
    private const ulong ShuffleMaskB12 = 0xffffff00_ffffff00 | ((ulong)(B + PixelSize * 1) << PixelSizeBitCount) | (B + PixelSize * 0);
    private const ulong ShuffleMaskB34 = 0xffffff00_ffffff00 | ((ulong)(B + PixelSize * 3) << PixelSizeBitCount) | (B + PixelSize * 2);
    private const ulong ShuffleMaskB56 = 0xffffff00_ffffff00 | ((ulong)(B + PixelSize * 5) << PixelSizeBitCount) | (B + PixelSize * 4);
    private const ulong ShuffleMaskB78 = 0xffffff00_ffffff00 | ((ulong)(B + PixelSize * 7) << PixelSizeBitCount) | (B + PixelSize * 6);
    private const ulong ShuffleMaskA12 = 0xffffff00_ffffff00 | ((ulong)(A + PixelSize * 1) << PixelSizeBitCount) | (A + PixelSize * 0);
    private const ulong ShuffleMaskA34 = 0xffffff00_ffffff00 | ((ulong)(A + PixelSize * 3) << PixelSizeBitCount) | (A + PixelSize * 2);
    private const ulong ShuffleMaskA56 = 0xffffff00_ffffff00 | ((ulong)(A + PixelSize * 5) << PixelSizeBitCount) | (A + PixelSize * 4);
    private const ulong ShuffleMaskA78 = 0xffffff00_ffffff00 | ((ulong)(A + PixelSize * 7) << PixelSizeBitCount) | (A + PixelSize * 6);

    // Number of entries in the coefficient table.
    private const int Size = 256;

    /// <summary>Number of 128-bit vectors (4 ints each) needed to hold the coefficient table.</summary>
    internal const int VecSize128 = Size >> 2;

    /// <summary>Number of 256-bit vectors (8 ints each) needed to hold the coefficient table.</summary>
    internal const int VecSize256 = Size >> 3;

    /// <summary>Constants and lookup data for the 128-bit (4 pixels per vector) code path.</summary>
    internal readonly struct Vector128Data
    {
        // Raw storage for the coefficient table; written once by the constructor below.
        private static Vector128X256 _data128;

        /// <summary>Shuffle masks that extract one channel of four pixels into four 32-bit lanes.</summary>
        public readonly Vector128<byte> ShuffleMaskR;
        public readonly Vector128<byte> ShuffleMaskG;
        public readonly Vector128<byte> ShuffleMaskB;
        public readonly Vector128<byte> ShuffleMaskA;

        /// <summary>Shuffle mask that zero-extends the first four bytes of a vector into four 32-bit lanes.</summary>
        public readonly Vector128<byte> ShuffleMaskC;

        /// <summary>Every lane set to <see cref="Mask"/>.</summary>
        public readonly Vector128<int> ShuffleMaskM;

        /// <summary>Every lane set to 1.</summary>
        public readonly Vector128<int> ShuffleMask1;

        /// <summary>
        /// Pointer to the coefficient table: <see cref="VecSize128"/> vectors whose lanes hold
        /// <c>MathHelpers.RoundToU32(i / 255.0 * Scale)</c> for consecutive <c>i</c> starting at 0.
        /// </summary>
        /// <remarks>
        /// NOTE(review): the pointer is taken from a static field with <c>Unsafe.AsPointer</c> and is not
        /// pinned; confirm that the target runtime keeps this storage at a fixed address.
        /// </remarks>
        public Vector128<int>* GradientCoefficients => (Vector128<int>*)Unsafe.AsPointer(ref _data128);

        /// <summary>Builds the masks and fills the coefficient table. The argument is ignored; it only
        /// distinguishes this constructor from the implicit parameterless one.</summary>
        public Vector128Data(bool _)
        {
            ShuffleMaskR = Vector128.Create(ShuffleMaskR12, ShuffleMaskR34).AsByte();
            ShuffleMaskG = Vector128.Create(ShuffleMaskG12, ShuffleMaskG34).AsByte();
            ShuffleMaskB = Vector128.Create(ShuffleMaskB12, ShuffleMaskB34).AsByte();
            ShuffleMaskA = Vector128.Create(ShuffleMaskA12, ShuffleMaskA34).AsByte();
            ShuffleMaskC = Vector128.Create(0xFFFFFF01FFFFFF00, 0xFFFFFF03FFFFFF02).AsByte();
            ShuffleMaskM = Vector128.Create(Mask);
            ShuffleMask1 = Vector128.Create(1);

            var data128 = (Vector128<int>*)Unsafe.AsPointer(ref _data128);
            // i counts vectors, j is the table index held in the first lane of vector i.
            for (int i = 0, j = 0; i < VecSize128; i++, j += 4)
            {
                data128[i] = Vector128.Create(
                    MathHelpers.RoundToU32(j / 255.0 * Scale),
                    MathHelpers.RoundToU32((j + 1) / 255.0 * Scale),
                    MathHelpers.RoundToU32((j + 2) / 255.0 * Scale),
                    MathHelpers.RoundToU32((j + 3) / 255.0 * Scale));
            }
        }
    }

    /// <summary>Constants and lookup data for the 256-bit (8 pixels per vector) code path.</summary>
    internal readonly struct Vector256Data
    {
        // Raw storage for the coefficient table; written once by the constructor below.
        private static Vector256X256 _data256;

        /// <summary>Shuffle masks that extract one channel of eight pixels into eight 32-bit lanes.</summary>
        public readonly Vector256<byte> ShuffleMaskR;
        public readonly Vector256<byte> ShuffleMaskG;
        public readonly Vector256<byte> ShuffleMaskB;
        public readonly Vector256<byte> ShuffleMaskA;

        /// <summary>Every lane set to <see cref="Mask"/>.</summary>
        public readonly Vector256<int> ShuffleMaskM;

        /// <summary>Every lane set to 1.</summary>
        public readonly Vector256<int> ShuffleMask1;

        /// <summary>
        /// Pointer to the coefficient table: <see cref="VecSize256"/> vectors whose lanes hold
        /// <c>MathHelpers.RoundToU32(i / 255.0 * Scale)</c> for consecutive <c>i</c> starting at 0.
        /// </summary>
        /// <remarks>
        /// NOTE(review): the pointer is taken from a static field with <c>Unsafe.AsPointer</c> and is not
        /// pinned; confirm that the target runtime keeps this storage at a fixed address.
        /// </remarks>
        public Vector256<int>* GradientCoefficients => (Vector256<int>*)Unsafe.AsPointer(ref _data256);

        /// <summary>Builds the masks and fills the coefficient table. The argument is ignored; it only
        /// distinguishes this constructor from the implicit parameterless one.</summary>
        public Vector256Data(bool _)
        {
            ShuffleMaskR = Vector256.Create(ShuffleMaskR12, ShuffleMaskR34, ShuffleMaskR56, ShuffleMaskR78).AsByte();
            ShuffleMaskG = Vector256.Create(ShuffleMaskG12, ShuffleMaskG34, ShuffleMaskG56, ShuffleMaskG78).AsByte();
            ShuffleMaskB = Vector256.Create(ShuffleMaskB12, ShuffleMaskB34, ShuffleMaskB56, ShuffleMaskB78).AsByte();
            ShuffleMaskA = Vector256.Create(ShuffleMaskA12, ShuffleMaskA34, ShuffleMaskA56, ShuffleMaskA78).AsByte();
            ShuffleMaskM = Vector256.Create(Mask);
            ShuffleMask1 = Vector256.Create(1);

            var data256 = (Vector256<int>*)Unsafe.AsPointer(ref _data256);
            // i counts vectors, j is the table index held in the first lane of vector i.
            for (int i = 0, j = 0; i < VecSize256; i++, j += 8)
            {
                data256[i] = Vector256.Create(
                    MathHelpers.RoundToU32(j / 255.0 * Scale),
                    MathHelpers.RoundToU32((j + 1) / 255.0 * Scale),
                    MathHelpers.RoundToU32((j + 2) / 255.0 * Scale),
                    MathHelpers.RoundToU32((j + 3) / 255.0 * Scale),
                    MathHelpers.RoundToU32((j + 4) / 255.0 * Scale),
                    MathHelpers.RoundToU32((j + 5) / 255.0 * Scale),
                    MathHelpers.RoundToU32((j + 6) / 255.0 * Scale),
                    MathHelpers.RoundToU32((j + 7) / 255.0 * Scale));
            }
        }
    }

    // Field-less blob that reserves 16 * 256 bytes of storage for the 128-bit coefficient table.
    [StructLayout(LayoutKind.Sequential, Size = 16 * 256)]
    private readonly struct Vector128X256
    {
    }

    // Field-less blob that reserves 32 * 256 bytes of storage for the 256-bit coefficient table.
    [StructLayout(LayoutKind.Sequential, Size = 32 * 256)]
    private readonly struct Vector256X256
    {
    }

    /// <summary>Precomputed data for the 4-pixel (128-bit) code path.</summary>
    internal static readonly Vector128Data Simd4;

    /// <summary>Precomputed data for the 8-pixel (256-bit) code path.</summary>
    internal static readonly Vector256Data Simd8;

    static PixelInfo()
    {
        Simd4 = new Vector128Data(true);
        Simd8 = new Vector256Data(true);
    }

    // Reading this static field is what forces the static constructor to run.
    private static readonly PixelInfo WarmUpInstance = new PixelInfo();

    /// <summary>
    /// Touches a static field of this type so that the static constructor (and therefore the
    /// construction of <see cref="Simd4"/> and <see cref="Simd8"/>) has run by the time this returns.
    /// </summary>
    /// <returns>A default <see cref="PixelInfo"/> value; it carries no data.</returns>
    public static PixelInfo WarmUp() => WarmUpInstance;
}
Usage:
/// <summary>
/// Alpha-blends source colors into a destination pixel buffer, scaling the source alpha of every
/// pixel by a per-pixel coverage byte. Runs 8 pixels at a time with AVX2 or 4 at a time with
/// SSE4.1 when available, and finishes the remainder with a scalar loop.
/// </summary>
/// <remarks>
/// The vector paths reinterpret pixels as 32-bit integers and pick channels with the byte offsets
/// in <c>PixelInfo</c>. NOTE(review): this assumes <c>Color</c> is a 4-byte struct whose field
/// layout matches <c>PixelInfo.R/G/B/A</c> - confirm against the <c>Color</c> declaration.
/// </remarks>
public readonly unsafe struct BlenderColor : IBlenderColor
{
    /// <summary>Blends one solid color into <paramref name="length"/> destination pixels.</summary>
    /// <param name="dst">First destination pixel; must have at least <paramref name="length"/> pixels.</param>
    /// <param name="length">Number of pixels to process; values of zero or less do nothing.</param>
    /// <param name="src">Color to blend into every pixel.</param>
    /// <param name="covers">One coverage byte per destination pixel.</param>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public void BlendPixels(Color* dst, int length, Color src, byte* covers)
    {
        if (Avx2.IsSupported && length >= 8)
        {
            BlendPixelSimd8(ref dst, ref length, src, ref covers);
            // At most 7 pixels remain; take one more 4-pixel step if possible.
            if (length >= 4)
            {
                BlendPixelSimd4(ref dst, ref length, src, ref covers);
            }
        }
        else if (Sse41.IsSupported && length >= 4)
        {
            BlendPixelSimd4(ref dst, ref length, src, ref covers);
        }

        // Scalar tail. dst, covers and length advance exactly once per pixel.
        while (length > 0)
        {
            var cover = *covers;
            var alpha = (src.A * (cover + 1)) >> 8;
            if (alpha == PixelInfo.Mask)
            {
                // Fully opaque: plain copy.
                *dst = src;
            }
            else
            {
                BlendPixel(dst, src.R, src.G, src.B, alpha, cover);
            }

            dst++;
            covers++;
            length--;
        }
    }

    /// <summary>Blends <paramref name="length"/> source pixels into the matching destination pixels.</summary>
    /// <param name="dst">First destination pixel; must have at least <paramref name="length"/> pixels.</param>
    /// <param name="length">Number of pixels to process; values of zero or less do nothing.</param>
    /// <param name="src">First source pixel; must have at least <paramref name="length"/> pixels.</param>
    /// <param name="covers">One coverage byte per pixel.</param>
    public void BlendPixels(Color* dst, int length, Color* src, byte* covers)
    {
        if (Avx2.IsSupported && length >= 8)
        {
            BlendPixelSimd8(ref dst, ref length, ref src, ref covers);
            // At most 7 pixels remain; take one more 4-pixel step if possible.
            if (length >= 4)
            {
                BlendPixelSimd4(ref dst, ref length, ref src, ref covers);
            }
        }
        else if (Sse41.IsSupported && length >= 4)
        {
            BlendPixelSimd4(ref dst, ref length, ref src, ref covers);
        }

        // Scalar tail. dst, src, covers and length advance exactly once per pixel.
        while (length > 0)
        {
            var cover = *covers;
            var alpha = (src->A * (cover + 1)) >> 8;
            if (alpha == PixelInfo.Mask)
            {
                // Fully opaque: plain copy.
                *dst = *src;
            }
            else
            {
                BlendPixel(dst, src->R, src->G, src->B, alpha, cover);
            }

            dst++;
            src++;
            covers++;
            length--;
        }
    }

    /// <summary>
    /// Blends one color into one destination pixel:
    /// each color channel becomes <c>(a * (c - d) + (d &lt;&lt; 8)) &gt;&gt; 8</c> and the alpha channel becomes
    /// <c>a + da - ((a * da + 255) &gt;&gt; 8)</c>.
    /// </summary>
    /// <param name="dst">Pixel to update in place.</param>
    /// <param name="r">Source red.</param>
    /// <param name="g">Source green.</param>
    /// <param name="b">Source blue.</param>
    /// <param name="a">Effective source alpha (already scaled by coverage).</param>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public void BlendPixel(Color* dst, int r, int g, int b, int a)
    {
        var dr = dst->R;
        var dg = dst->G;
        var db = dst->B;
        var da = dst->A;
        dst->R = (byte)((a * (r - dr) + (dr << PixelInfo.Shift)) >> PixelInfo.Shift);
        dst->G = (byte)((a * (g - dg) + (dg << PixelInfo.Shift)) >> PixelInfo.Shift);
        dst->B = (byte)((a * (b - db) + (db << PixelInfo.Shift)) >> PixelInfo.Shift);
        dst->A = (byte)(a + da - ((a * da + PixelInfo.Mask) >> PixelInfo.Shift));
    }

    /// <summary>
    /// Same as <see cref="BlendPixel(Color*, int, int, int, int)"/>; <paramref name="cover"/> is ignored
    /// because the caller has already folded it into <paramref name="a"/>.
    /// </summary>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public void BlendPixel(Color* dst, int r, int g, int b, int a, int cover) => BlendPixel(dst, r, g, b, a);

    /// <summary>
    /// Blends one solid color into as many whole groups of 4 pixels as <paramref name="length"/> allows.
    /// On return <paramref name="dst"/>, <paramref name="covers"/> and <paramref name="length"/> describe
    /// the unprocessed remainder (0..3 pixels). Requires SSE4.1.
    /// </summary>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public void BlendPixelSimd4(ref Color* dst, ref int length, Color src, ref byte* covers)
    {
        // Broadcast the source pixel to all four lanes and pre-extract its alpha.
        var vSrc = Vector128.Create(*(int*)&src).AsByte();
        var vSrcA = Ssse3.Shuffle(vSrc, PixelInfo.Simd4.ShuffleMaskA).AsInt32();
        while (length >= 4)
        {
            // Reads exactly 4 cover bytes and zero-extends them to 32-bit lanes.
            var vCvr = Sse41.ConvertToVector128Int32(covers);
            var vRes = Blend4(Sse2.LoadVector128((byte*)dst), vSrc, vSrcA, vCvr);
            Sse2.Store((int*)dst, vRes);
            length -= 4;
            dst += 4;
            covers += 4;
        }
    }

    /// <summary>
    /// Blends source pixels into destination pixels in whole groups of 4. On return all pointers and
    /// <paramref name="length"/> describe the unprocessed remainder (0..3 pixels). Requires SSE4.1.
    /// </summary>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public void BlendPixelSimd4(ref Color* dst, ref int length, ref Color* src, ref byte* covers)
    {
        while (length >= 4)
        {
            var vSrc = Sse2.LoadVector128((byte*)src);
            var vSrcA = Ssse3.Shuffle(vSrc, PixelInfo.Simd4.ShuffleMaskA).AsInt32();
            // Reads exactly 4 cover bytes and zero-extends them to 32-bit lanes.
            var vCvr = Sse41.ConvertToVector128Int32(covers);
            var vRes = Blend4(Sse2.LoadVector128((byte*)dst), vSrc, vSrcA, vCvr);
            Sse2.Store((int*)dst, vRes);
            length -= 4;
            dst += 4;
            src += 4;
            covers += 4;
        }
    }

    /// <summary>
    /// Vector form of <see cref="BlendPixel(Color*, int, int, int, int)"/> for four pixels, with the
    /// effective alpha computed per lane as <c>((cover + 1) * srcA) &gt;&gt; 8</c>.
    /// </summary>
    /// <param name="vDst">Four packed destination pixels.</param>
    /// <param name="vSrc">Four packed source pixels.</param>
    /// <param name="vSrcA">Source alpha of each pixel, one per 32-bit lane.</param>
    /// <param name="vCvr">Coverage of each pixel, one per 32-bit lane.</param>
    /// <returns>Four packed blended pixels.</returns>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    private static Vector128<int> Blend4(Vector128<byte> vDst, Vector128<byte> vSrc, Vector128<int> vSrcA, Vector128<int> vCvr)
    {
        // Effective alpha per pixel.
        var vSrcAc = Sse2.ShiftRightLogical(Sse41.MultiplyLow(Sse2.Add(vCvr, PixelInfo.Simd4.ShuffleMask1), vSrcA), PixelInfo.Shift);

        // Each color channel: (a * (src - dst) + (dst << 8)) >> 8, on channels widened to 32-bit lanes.
        var vSrcR = Ssse3.Shuffle(vSrc, PixelInfo.Simd4.ShuffleMaskR).AsInt32();
        var vDstR = Ssse3.Shuffle(vDst, PixelInfo.Simd4.ShuffleMaskR).AsInt32();
        var vResR = Sse2.ShiftRightLogical(Sse2.Add(Sse41.MultiplyLow(Sse2.Subtract(vSrcR, vDstR), vSrcAc), Sse2.ShiftLeftLogical(vDstR, PixelInfo.Shift)), PixelInfo.Shift);

        var vSrcG = Ssse3.Shuffle(vSrc, PixelInfo.Simd4.ShuffleMaskG).AsInt32();
        var vDstG = Ssse3.Shuffle(vDst, PixelInfo.Simd4.ShuffleMaskG).AsInt32();
        var vResG = Sse2.ShiftRightLogical(Sse2.Add(Sse41.MultiplyLow(Sse2.Subtract(vSrcG, vDstG), vSrcAc), Sse2.ShiftLeftLogical(vDstG, PixelInfo.Shift)), PixelInfo.Shift);

        var vSrcB = Ssse3.Shuffle(vSrc, PixelInfo.Simd4.ShuffleMaskB).AsInt32();
        var vDstB = Ssse3.Shuffle(vDst, PixelInfo.Simd4.ShuffleMaskB).AsInt32();
        var vResB = Sse2.ShiftRightLogical(Sse2.Add(Sse41.MultiplyLow(Sse2.Subtract(vSrcB, vDstB), vSrcAc), Sse2.ShiftLeftLogical(vDstB, PixelInfo.Shift)), PixelInfo.Shift);

        // Alpha channel: a + da - ((a * da + 255) >> 8).
        var vDstA = Ssse3.Shuffle(vDst, PixelInfo.Simd4.ShuffleMaskA).AsInt32();
        var vResA = Sse2.Subtract(Sse2.Add(vSrcAc, vDstA), Sse2.ShiftRightLogical(Sse2.Add(Sse41.MultiplyLow(vSrcAc, vDstA), PixelInfo.Simd4.ShuffleMaskM), PixelInfo.Shift));

        // Move every channel back to its bit position and merge into packed pixels.
        var rr = Sse2.ShiftLeftLogical(vResR, PixelInfo.ShiftR);
        var gg = Sse2.ShiftLeftLogical(vResG, PixelInfo.ShiftG);
        var bb = Sse2.ShiftLeftLogical(vResB, PixelInfo.ShiftB);
        var aa = Sse2.ShiftLeftLogical(vResA, PixelInfo.ShiftA);
        return Sse2.Or(rr, Sse2.Or(gg, Sse2.Or(bb, aa)));
    }

    /// <summary>
    /// Blends one solid color into as many whole groups of 8 pixels as <paramref name="length"/> allows.
    /// On return <paramref name="dst"/>, <paramref name="covers"/> and <paramref name="length"/> describe
    /// the unprocessed remainder (0..7 pixels). Requires AVX2.
    /// </summary>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public void BlendPixelSimd8(ref Color* dst, ref int length, Color src, ref byte* covers)
    {
        // Broadcast the source pixel to all eight lanes and pre-extract its alpha.
        var vSrc = Vector256.Create(*(int*)&src).AsByte();
        var vSrcA = Avx2.Shuffle(vSrc, PixelInfo.Simd8.ShuffleMaskA).AsInt32();
        while (length >= 8)
        {
            // Reads exactly 8 cover bytes and zero-extends them to 32-bit lanes.
            var vCvr = Avx2.ConvertToVector256Int32(covers);
            var vRes = BlendSimd8(Avx.LoadVector256((byte*)dst), vSrc, vSrcA, vCvr);
            Avx.Store((int*)dst, vRes);
            length -= 8;
            dst += 8;
            covers += 8;
        }
    }

    /// <summary>
    /// Blends source pixels into destination pixels in whole groups of 8. On return all pointers and
    /// <paramref name="length"/> describe the unprocessed remainder (0..7 pixels). Requires AVX2.
    /// </summary>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public void BlendPixelSimd8(ref Color* dst, ref int length, ref Color* src, ref byte* covers)
    {
        while (length >= 8)
        {
            var vSrc = Avx.LoadVector256((byte*)src);
            var vSrcA = Avx2.Shuffle(vSrc, PixelInfo.Simd8.ShuffleMaskA).AsInt32();
            // Reads exactly 8 cover bytes and zero-extends them to 32-bit lanes.
            var vCvr = Avx2.ConvertToVector256Int32(covers);
            var vRes = BlendSimd8(Avx.LoadVector256((byte*)dst), vSrc, vSrcA, vCvr);
            Avx.Store((int*)dst, vRes);
            length -= 8;
            dst += 8;
            src += 8;
            covers += 8;
        }
    }

    /// <summary>
    /// Vector form of <see cref="BlendPixel(Color*, int, int, int, int)"/> for eight pixels, with the
    /// effective alpha computed per lane as <c>((cover + 1) * srcA) &gt;&gt; 8</c>.
    /// </summary>
    /// <param name="vDst">Eight packed destination pixels.</param>
    /// <param name="vSrc">Eight packed source pixels.</param>
    /// <param name="vSrcA">Source alpha of each pixel, one per 32-bit lane.</param>
    /// <param name="vCvr">Coverage of each pixel, one per 32-bit lane.</param>
    /// <returns>Eight packed blended pixels.</returns>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    private static Vector256<int> BlendSimd8(Vector256<byte> vDst, Vector256<byte> vSrc, Vector256<int> vSrcA, Vector256<int> vCvr)
    {
        // Effective alpha per pixel.
        var vSrcAc = Avx2.ShiftRightLogical(Avx2.MultiplyLow(Avx2.Add(vCvr, PixelInfo.Simd8.ShuffleMask1), vSrcA), PixelInfo.Shift);

        // Each color channel: (a * (src - dst) + (dst << 8)) >> 8, on channels widened to 32-bit lanes.
        var vSrcR = Avx2.Shuffle(vSrc, PixelInfo.Simd8.ShuffleMaskR).AsInt32();
        var vDstR = Avx2.Shuffle(vDst, PixelInfo.Simd8.ShuffleMaskR).AsInt32();
        var vResR = Avx2.ShiftRightLogical(Avx2.Add(Avx2.MultiplyLow(Avx2.Subtract(vSrcR, vDstR), vSrcAc), Avx2.ShiftLeftLogical(vDstR, PixelInfo.Shift)), PixelInfo.Shift);

        var vSrcG = Avx2.Shuffle(vSrc, PixelInfo.Simd8.ShuffleMaskG).AsInt32();
        var vDstG = Avx2.Shuffle(vDst, PixelInfo.Simd8.ShuffleMaskG).AsInt32();
        var vResG = Avx2.ShiftRightLogical(Avx2.Add(Avx2.MultiplyLow(Avx2.Subtract(vSrcG, vDstG), vSrcAc), Avx2.ShiftLeftLogical(vDstG, PixelInfo.Shift)), PixelInfo.Shift);

        var vSrcB = Avx2.Shuffle(vSrc, PixelInfo.Simd8.ShuffleMaskB).AsInt32();
        var vDstB = Avx2.Shuffle(vDst, PixelInfo.Simd8.ShuffleMaskB).AsInt32();
        var vResB = Avx2.ShiftRightLogical(Avx2.Add(Avx2.MultiplyLow(Avx2.Subtract(vSrcB, vDstB), vSrcAc), Avx2.ShiftLeftLogical(vDstB, PixelInfo.Shift)), PixelInfo.Shift);

        // Alpha channel: a + da - ((a * da + 255) >> 8).
        var vDstA = Avx2.Shuffle(vDst, PixelInfo.Simd8.ShuffleMaskA).AsInt32();
        var vResA = Avx2.Subtract(Avx2.Add(vSrcAc, vDstA), Avx2.ShiftRightLogical(Avx2.Add(Avx2.MultiplyLow(vSrcAc, vDstA), PixelInfo.Simd8.ShuffleMaskM), PixelInfo.Shift));

        // Move every channel back to its bit position and merge into packed pixels.
        var rr = Avx2.ShiftLeftLogical(vResR, PixelInfo.ShiftR);
        var gg = Avx2.ShiftLeftLogical(vResG, PixelInfo.ShiftG);
        var bb = Avx2.ShiftLeftLogical(vResB, PixelInfo.ShiftB);
        var aa = Avx2.ShiftLeftLogical(vResA, PixelInfo.ShiftA);
        return Avx2.Or(rr, Avx2.Or(gg, Avx2.Or(bb, aa)));
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment