-
-
Save hypeartist/e4771578a77f08f2c88198130b53ef14 to your computer and use it in GitHub Desktop.
SimdQ
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/// <summary>
/// Fixed-point constants, per-channel byte-shuffle masks and lookup tables shared by the pixel blenders.
/// </summary>
/// <remarks>
/// The channel constants <see cref="R"/>, <see cref="G"/>, <see cref="B"/> and <see cref="A"/> are byte
/// indices inside a <see cref="PixelSize"/>-byte pixel; the matching <c>Shift*</c> constants are the same
/// indices expressed in bits.
/// </remarks>
public unsafe struct PixelInfo
{
    public const int Shift = 8;
    public const int Scale = 1 << Shift;
    public const int Mask = Scale - 1;
    public const int Msb = 1 << (Shift - 1);

    /// <summary>Constants describing the 8-bit coverage value range.</summary>
    public struct Cover
    {
        public const int Shift = 8;
        public const int Size = 1 << Shift;
        public const int Mask = Size - 1;
        public const int Mask2 = Mask * Mask;
        public const int None = 0;
        public const int Full = Mask;
        public const int Full2 = Full * Full;
    }

    internal const int PixelSize = 4;
    private const int PixelSizeBitCount = PixelSize << 3;

    // Byte index of each channel inside one pixel.
    internal const int R = 2;
    internal const int G = 1;
    internal const int B = 0;
    internal const int A = 3;

    // Bit offset of each channel inside one pixel viewed as a 32-bit integer.
    internal const int ShiftR = R << 3;
    internal const int ShiftG = G << 3;
    internal const int ShiftB = B << 3;
    internal const int ShiftA = A << 3;

    // Each 64-bit constant below is one half of a 128-bit byte-shuffle control word covering two pixels.
    // Byte 0 and byte 4 select the channel byte of the two pixels; the remaining bytes are 0xFF, which
    // makes the shuffle write zero there, so every 32-bit lane ends up holding one zero-extended channel.
    private const ulong ShuffleMaskR12 = 0xffffff00_ffffff00 | ((ulong)(R + PixelSize * 1) << PixelSizeBitCount) | (R + PixelSize * 0);
    private const ulong ShuffleMaskR34 = 0xffffff00_ffffff00 | ((ulong)(R + PixelSize * 3) << PixelSizeBitCount) | (R + PixelSize * 2);
    private const ulong ShuffleMaskR56 = 0xffffff00_ffffff00 | ((ulong)(R + PixelSize * 5) << PixelSizeBitCount) | (R + PixelSize * 4);
    private const ulong ShuffleMaskR78 = 0xffffff00_ffffff00 | ((ulong)(R + PixelSize * 7) << PixelSizeBitCount) | (R + PixelSize * 6);
    private const ulong ShuffleMaskG12 = 0xffffff00_ffffff00 | ((ulong)(G + PixelSize * 1) << PixelSizeBitCount) | (G + PixelSize * 0);
    private const ulong ShuffleMaskG34 = 0xffffff00_ffffff00 | ((ulong)(G + PixelSize * 3) << PixelSizeBitCount) | (G + PixelSize * 2);
    private const ulong ShuffleMaskG56 = 0xffffff00_ffffff00 | ((ulong)(G + PixelSize * 5) << PixelSizeBitCount) | (G + PixelSize * 4);
    private const ulong ShuffleMaskG78 = 0xffffff00_ffffff00 | ((ulong)(G + PixelSize * 7) << PixelSizeBitCount) | (G + PixelSize * 6);
    private const ulong ShuffleMaskB12 = 0xffffff00_ffffff00 | ((ulong)(B + PixelSize * 1) << PixelSizeBitCount) | (B + PixelSize * 0);
    private const ulong ShuffleMaskB34 = 0xffffff00_ffffff00 | ((ulong)(B + PixelSize * 3) << PixelSizeBitCount) | (B + PixelSize * 2);
    private const ulong ShuffleMaskB56 = 0xffffff00_ffffff00 | ((ulong)(B + PixelSize * 5) << PixelSizeBitCount) | (B + PixelSize * 4);
    private const ulong ShuffleMaskB78 = 0xffffff00_ffffff00 | ((ulong)(B + PixelSize * 7) << PixelSizeBitCount) | (B + PixelSize * 6);
    private const ulong ShuffleMaskA12 = 0xffffff00_ffffff00 | ((ulong)(A + PixelSize * 1) << PixelSizeBitCount) | (A + PixelSize * 0);
    private const ulong ShuffleMaskA34 = 0xffffff00_ffffff00 | ((ulong)(A + PixelSize * 3) << PixelSizeBitCount) | (A + PixelSize * 2);
    private const ulong ShuffleMaskA56 = 0xffffff00_ffffff00 | ((ulong)(A + PixelSize * 5) << PixelSizeBitCount) | (A + PixelSize * 4);
    private const ulong ShuffleMaskA78 = 0xffffff00_ffffff00 | ((ulong)(A + PixelSize * 7) << PixelSizeBitCount) | (A + PixelSize * 6);

    private const int Size = 256;
    internal const int VecSize128 = Size >> 2;
    internal const int VecSize256 = Size >> 3;

    // Byte sizes of the coefficient tables; same sizes the tables have always been reserved with.
    private const int TableBytes128 = 16 * Size;
    private const int TableBytes256 = 32 * Size;

    /// <summary>
    /// Allocates a zero-filled block of unmanaged memory that lives for the rest of the process.
    /// </summary>
    /// <remarks>
    /// Unmanaged memory is used because raw pointers into it are handed out and cached by callers;
    /// the GC neither tracks nor relocates it, unlike the storage of a static struct field.
    /// </remarks>
    private static void* AllocateZeroed(int byteCount)
    {
        var block = (void*)System.Runtime.InteropServices.Marshal.AllocHGlobal(byteCount);
        Unsafe.InitBlockUnaligned(block, 0, (uint)byteCount);
        return block;
    }

    /// <summary>Constant vectors and the coefficient table used by the 128-bit (4 pixels at a time) code paths.</summary>
    internal readonly struct Vector128Data
    {
        // Backing store of GradientCoefficients; unmanaged so the exposed pointer stays valid.
        private static readonly Vector128<int>* s_coefficients = (Vector128<int>*)AllocateZeroed(TableBytes128);

        public readonly Vector128<byte> ShuffleMaskR;
        public readonly Vector128<byte> ShuffleMaskG;
        public readonly Vector128<byte> ShuffleMaskB;
        public readonly Vector128<byte> ShuffleMaskA;

        /// <summary>Shuffle control that zero-extends bytes 0..3 of a vector into its four 32-bit lanes.</summary>
        public readonly Vector128<byte> ShuffleMaskC;

        /// <summary>Every lane equals <see cref="PixelInfo.Mask"/> (not a shuffle control despite the name).</summary>
        public readonly Vector128<int> ShuffleMaskM;

        /// <summary>Every lane equals 1 (not a shuffle control despite the name).</summary>
        public readonly Vector128<int> ShuffleMask1;

        /// <summary>
        /// Pointer to the table whose first <see cref="VecSize128"/> vectors hold, for scalar index
        /// <c>k</c> in 0..255, <c>MathHelpers.RoundToU32(k / 255.0 * Scale)</c>; the rest is zero.
        /// </summary>
        public Vector128<int>* GradientCoefficients => s_coefficients;

        /// <summary>Builds the constant vectors and fills the shared coefficient table.</summary>
        /// <param name="_">Unused; only distinguishes this constructor from the default one.</param>
        public Vector128Data(bool _)
        {
            ShuffleMaskR = Vector128.Create(ShuffleMaskR12, ShuffleMaskR34).AsByte();
            ShuffleMaskG = Vector128.Create(ShuffleMaskG12, ShuffleMaskG34).AsByte();
            ShuffleMaskB = Vector128.Create(ShuffleMaskB12, ShuffleMaskB34).AsByte();
            ShuffleMaskA = Vector128.Create(ShuffleMaskA12, ShuffleMaskA34).AsByte();
            ShuffleMaskC = Vector128.Create(0xFFFFFF01FFFFFF00, 0xFFFFFF03FFFFFF02).AsByte();
            ShuffleMaskM = Vector128.Create(Mask);
            ShuffleMask1 = Vector128.Create(1);

            var table = s_coefficients;
            for (int i = 0, j = 0; i < VecSize128; i++, j += 4)
            {
                table[i] = Vector128.Create(
                    MathHelpers.RoundToU32(j / 255.0 * Scale),
                    MathHelpers.RoundToU32((j + 1) / 255.0 * Scale),
                    MathHelpers.RoundToU32((j + 2) / 255.0 * Scale),
                    MathHelpers.RoundToU32((j + 3) / 255.0 * Scale));
            }
        }
    }

    /// <summary>Constant vectors and the coefficient table used by the 256-bit (8 pixels at a time) code paths.</summary>
    internal readonly struct Vector256Data
    {
        // Backing store of GradientCoefficients; unmanaged so the exposed pointer stays valid.
        private static readonly Vector256<int>* s_coefficients = (Vector256<int>*)AllocateZeroed(TableBytes256);

        public readonly Vector256<byte> ShuffleMaskR;
        public readonly Vector256<byte> ShuffleMaskG;
        public readonly Vector256<byte> ShuffleMaskB;
        public readonly Vector256<byte> ShuffleMaskA;

        /// <summary>Every lane equals <see cref="PixelInfo.Mask"/> (not a shuffle control despite the name).</summary>
        public readonly Vector256<int> ShuffleMaskM;

        /// <summary>Every lane equals 1 (not a shuffle control despite the name).</summary>
        public readonly Vector256<int> ShuffleMask1;

        /// <summary>
        /// Pointer to the table whose first <see cref="VecSize256"/> vectors hold, for scalar index
        /// <c>k</c> in 0..255, <c>MathHelpers.RoundToU32(k / 255.0 * Scale)</c>; the rest is zero.
        /// </summary>
        public Vector256<int>* GradientCoefficients => s_coefficients;

        /// <summary>Builds the constant vectors and fills the shared coefficient table.</summary>
        /// <param name="_">Unused; only distinguishes this constructor from the default one.</param>
        public Vector256Data(bool _)
        {
            ShuffleMaskR = Vector256.Create(ShuffleMaskR12, ShuffleMaskR34, ShuffleMaskR56, ShuffleMaskR78).AsByte();
            ShuffleMaskG = Vector256.Create(ShuffleMaskG12, ShuffleMaskG34, ShuffleMaskG56, ShuffleMaskG78).AsByte();
            ShuffleMaskB = Vector256.Create(ShuffleMaskB12, ShuffleMaskB34, ShuffleMaskB56, ShuffleMaskB78).AsByte();
            ShuffleMaskA = Vector256.Create(ShuffleMaskA12, ShuffleMaskA34, ShuffleMaskA56, ShuffleMaskA78).AsByte();
            ShuffleMaskM = Vector256.Create(Mask);
            ShuffleMask1 = Vector256.Create(1);

            var table = s_coefficients;
            for (int i = 0, j = 0; i < VecSize256; i++, j += 8)
            {
                table[i] = Vector256.Create(
                    MathHelpers.RoundToU32(j / 255.0 * Scale),
                    MathHelpers.RoundToU32((j + 1) / 255.0 * Scale),
                    MathHelpers.RoundToU32((j + 2) / 255.0 * Scale),
                    MathHelpers.RoundToU32((j + 3) / 255.0 * Scale),
                    MathHelpers.RoundToU32((j + 4) / 255.0 * Scale),
                    MathHelpers.RoundToU32((j + 5) / 255.0 * Scale),
                    MathHelpers.RoundToU32((j + 6) / 255.0 * Scale),
                    MathHelpers.RoundToU32((j + 7) / 255.0 * Scale));
            }
        }
    }

    /// <summary>Shared data for the 128-bit code paths.</summary>
    internal static readonly Vector128Data Simd4;

    /// <summary>Shared data for the 256-bit code paths.</summary>
    internal static readonly Vector256Data Simd8;

    static PixelInfo()
    {
        Simd4 = new Vector128Data(true);
        Simd8 = new Vector256Data(true);
    }

    private static readonly PixelInfo WarmUpInstance = new PixelInfo();

    /// <summary>
    /// Returns a default instance; touching this static member forces the type's static
    /// initialization (and therefore the table construction) to run.
    /// </summary>
    public static PixelInfo WarmUp() => WarmUpInstance;
}
Usage: | |
/// <summary>
/// Blends source colors onto destination pixels, weighting the source alpha by a per-pixel
/// 8-bit coverage value. Uses AVX2 / SSE4.1 paths for runs of 8 / 4 pixels and a scalar loop
/// for whatever remains.
/// </summary>
/// <remarks>
/// NOTE(review): the vector paths reinterpret <c>Color*</c> as packed 32-bit pixels, so
/// <c>Color</c> is assumed to be exactly <see cref="PixelInfo.PixelSize"/> bytes with the channel
/// byte order given by <see cref="PixelInfo"/> — confirm against the <c>Color</c> declaration.
/// </remarks>
public readonly unsafe struct BlenderColor : IBlenderColor
{
    /// <summary>Blends one constant color onto <paramref name="length"/> destination pixels.</summary>
    /// <param name="dst">First destination pixel; must address at least <paramref name="length"/> pixels.</param>
    /// <param name="length">Number of pixels to blend; values &lt;= 0 do nothing.</param>
    /// <param name="src">Source color applied to every pixel.</param>
    /// <param name="covers">One coverage byte per destination pixel.</param>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public void BlendPixels(Color* dst, int length, Color src, byte* covers)
    {
        if (length <= 0)
        {
            return;
        }

        if (Avx2.IsSupported && length >= 8)
        {
            BlendPixelSimd8(ref dst, ref length, src, ref covers);
            if (length >= 4)
            {
                BlendPixelSimd4(ref dst, ref length, src, ref covers);
            }
        }
        else if (Sse41.IsSupported && length >= 4)
        {
            BlendPixelSimd4(ref dst, ref length, src, ref covers);
        }

        // Scalar tail: every iteration consumes exactly one destination pixel and one cover.
        while (length > 0)
        {
            var alpha = (src.A * (*covers + 1)) >> 8;
            if (alpha == PixelInfo.Mask)
            {
                // Fully opaque result: plain copy.
                *dst = src;
            }
            else
            {
                BlendPixel(dst, src.R, src.G, src.B, alpha, *covers);
            }

            dst++;
            covers++;
            length--;
        }
    }

    /// <summary>Blends <paramref name="length"/> source pixels onto the matching destination pixels.</summary>
    /// <param name="dst">First destination pixel; must address at least <paramref name="length"/> pixels.</param>
    /// <param name="length">Number of pixels to blend; values &lt;= 0 do nothing.</param>
    /// <param name="src">First source pixel; must address at least <paramref name="length"/> pixels.</param>
    /// <param name="covers">One coverage byte per pixel.</param>
    public void BlendPixels(Color* dst, int length, Color* src, byte* covers)
    {
        if (length <= 0)
        {
            return;
        }

        if (Avx2.IsSupported && length >= 8)
        {
            BlendPixelSimd8(ref dst, ref length, ref src, ref covers);
            if (length >= 4)
            {
                BlendPixelSimd4(ref dst, ref length, ref src, ref covers);
            }
        }
        else if (Sse41.IsSupported && length >= 4)
        {
            BlendPixelSimd4(ref dst, ref length, ref src, ref covers);
        }

        // Scalar tail: source, destination and cover pointers advance in lock-step.
        while (length > 0)
        {
            var alpha = (src->A * (*covers + 1)) >> 8;
            if (alpha == PixelInfo.Mask)
            {
                // Fully opaque result: plain copy.
                *dst = *src;
            }
            else
            {
                BlendPixel(dst, src->R, src->G, src->B, alpha, *covers);
            }

            dst++;
            src++;
            covers++;
            length--;
        }
    }

    /// <summary>
    /// Blends one color onto one pixel: each color channel becomes
    /// <c>(a * (c - d) + (d &lt;&lt; Shift)) &gt;&gt; Shift</c> and alpha becomes
    /// <c>a + da - ((a * da + Mask) &gt;&gt; Shift)</c>.
    /// </summary>
    /// <param name="dst">Pixel to update in place.</param>
    /// <param name="r">Source red.</param>
    /// <param name="g">Source green.</param>
    /// <param name="b">Source blue.</param>
    /// <param name="a">Effective source alpha (already weighted by coverage).</param>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public void BlendPixel(Color* dst, int r, int g, int b, int a)
    {
        var dr = dst->R;
        var dg = dst->G;
        var db = dst->B;
        var da = dst->A;
        dst->R = (byte)((a * (r - dr) + (dr << PixelInfo.Shift)) >> PixelInfo.Shift);
        dst->G = (byte)((a * (g - dg) + (dg << PixelInfo.Shift)) >> PixelInfo.Shift);
        dst->B = (byte)((a * (b - db) + (db << PixelInfo.Shift)) >> PixelInfo.Shift);
        dst->A = (byte)(a + da - ((a * da + PixelInfo.Mask) >> PixelInfo.Shift));
    }

    /// <summary>
    /// Same as the five-argument overload; <paramref name="cover"/> is ignored because
    /// <paramref name="a"/> is expected to carry the coverage-weighted alpha already.
    /// </summary>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public void BlendPixel(Color* dst, int r, int g, int b, int a, int cover) => BlendPixel(dst, r, g, b, a);

    /// <summary>
    /// Blends a constant color in groups of 4 pixels, advancing <paramref name="dst"/> and
    /// <paramref name="covers"/> and decrementing <paramref name="length"/> by the pixels consumed.
    /// On return <paramref name="length"/> holds the 0..3 pixels left over.
    /// </summary>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public void BlendPixelSimd4(ref Color* dst, ref int length, Color src, ref byte* covers)
    {
        var vSrc = Vector128.Create(*(int*)&src).AsByte();
        var vSrcA = Ssse3.Shuffle(vSrc, PixelInfo.Simd4.ShuffleMaskA).AsInt32();
        var stop = length & 3;
        while (length > stop)
        {
            // Reads exactly 4 cover bytes and zero-extends them to 32-bit lanes.
            var vCvr = Sse41.ConvertToVector128Int32(covers);
            var vRes = Blend4(*(Vector128<byte>*)dst, vSrc, vSrcA, vCvr);
            Sse2.Store((int*)dst, vRes);
            length -= 4;
            dst += 4;
            covers += 4;
        }
    }

    /// <summary>
    /// Blends per-pixel source colors in groups of 4 pixels, advancing <paramref name="dst"/>,
    /// <paramref name="src"/> and <paramref name="covers"/> and decrementing <paramref name="length"/>
    /// by the pixels consumed. On return <paramref name="length"/> holds the 0..3 pixels left over.
    /// </summary>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public void BlendPixelSimd4(ref Color* dst, ref int length, ref Color* src, ref byte* covers)
    {
        var stop = length & 3;
        while (length > stop)
        {
            var vSrc = *(Vector128<byte>*)src;
            var vSrcA = Ssse3.Shuffle(vSrc, PixelInfo.Simd4.ShuffleMaskA).AsInt32();
            // Reads exactly 4 cover bytes and zero-extends them to 32-bit lanes.
            var vCvr = Sse41.ConvertToVector128Int32(covers);
            var vRes = Blend4(*(Vector128<byte>*)dst, vSrc, vSrcA, vCvr);
            Sse2.Store((int*)dst, vRes);
            length -= 4;
            dst += 4;
            src += 4;
            covers += 4;
        }
    }

    /// <summary>
    /// Vector form of <c>BlendPixel</c> for 4 pixels: splits each channel into 32-bit lanes,
    /// applies the scalar formulas lane-wise and re-packs the channels into 32-bit pixels.
    /// </summary>
    /// <param name="vDst">Four packed destination pixels.</param>
    /// <param name="vSrc">Four packed source pixels.</param>
    /// <param name="vSrcA">Source alpha of each pixel, one per 32-bit lane.</param>
    /// <param name="vCvr">Coverage of each pixel, one per 32-bit lane.</param>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    private static Vector128<int> Blend4(Vector128<byte> vDst, Vector128<byte> vSrc, Vector128<int> vSrcA, Vector128<int> vCvr)
    {
        // Effective alpha: (srcA * (cover + 1)) >> Shift.
        var vSrcAc = Sse2.ShiftRightLogical(Sse41.MultiplyLow(Sse2.Add(vCvr, PixelInfo.Simd4.ShuffleMask1), vSrcA), PixelInfo.Shift);

        var vSrcR = Ssse3.Shuffle(vSrc, PixelInfo.Simd4.ShuffleMaskR).AsInt32();
        var vDstR = Ssse3.Shuffle(vDst, PixelInfo.Simd4.ShuffleMaskR).AsInt32();
        var vResR = Sse2.ShiftRightLogical(Sse2.Add(Sse41.MultiplyLow(Sse2.Subtract(vSrcR, vDstR), vSrcAc), Sse2.ShiftLeftLogical(vDstR, PixelInfo.Shift)), PixelInfo.Shift);

        var vSrcG = Ssse3.Shuffle(vSrc, PixelInfo.Simd4.ShuffleMaskG).AsInt32();
        var vDstG = Ssse3.Shuffle(vDst, PixelInfo.Simd4.ShuffleMaskG).AsInt32();
        var vResG = Sse2.ShiftRightLogical(Sse2.Add(Sse41.MultiplyLow(Sse2.Subtract(vSrcG, vDstG), vSrcAc), Sse2.ShiftLeftLogical(vDstG, PixelInfo.Shift)), PixelInfo.Shift);

        var vSrcB = Ssse3.Shuffle(vSrc, PixelInfo.Simd4.ShuffleMaskB).AsInt32();
        var vDstB = Ssse3.Shuffle(vDst, PixelInfo.Simd4.ShuffleMaskB).AsInt32();
        var vResB = Sse2.ShiftRightLogical(Sse2.Add(Sse41.MultiplyLow(Sse2.Subtract(vSrcB, vDstB), vSrcAc), Sse2.ShiftLeftLogical(vDstB, PixelInfo.Shift)), PixelInfo.Shift);

        var vDstA = Ssse3.Shuffle(vDst, PixelInfo.Simd4.ShuffleMaskA).AsInt32();
        var vResA = Sse2.Subtract(Sse2.Add(vSrcAc, vDstA), Sse2.ShiftRightLogical(Sse2.Add(Sse41.MultiplyLow(vSrcAc, vDstA), PixelInfo.Simd4.ShuffleMaskM), PixelInfo.Shift));

        // Move every channel back to its byte position and merge.
        var rr = Sse2.ShiftLeftLogical(vResR, PixelInfo.ShiftR);
        var gg = Sse2.ShiftLeftLogical(vResG, PixelInfo.ShiftG);
        var bb = Sse2.ShiftLeftLogical(vResB, PixelInfo.ShiftB);
        var aa = Sse2.ShiftLeftLogical(vResA, PixelInfo.ShiftA);
        return Sse2.Or(rr, Sse2.Or(gg, Sse2.Or(bb, aa)));
    }

    /// <summary>
    /// Blends a constant color in groups of 8 pixels, advancing <paramref name="dst"/> and
    /// <paramref name="covers"/> and decrementing <paramref name="length"/> by the pixels consumed.
    /// On return <paramref name="length"/> holds the 0..7 pixels left over.
    /// </summary>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public void BlendPixelSimd8(ref Color* dst, ref int length, Color src, ref byte* covers)
    {
        var vSrc = Vector256.Create(*(int*)&src).AsByte();
        var vSrcA = Avx2.Shuffle(vSrc, PixelInfo.Simd8.ShuffleMaskA).AsInt32();
        var stop = length & 7;
        while (length > stop)
        {
            // Reads exactly 8 cover bytes and zero-extends them to 32-bit lanes.
            var vCvr = Avx2.ConvertToVector256Int32(covers);
            var vRes = BlendSimd8(*(Vector256<byte>*)dst, vSrc, vSrcA, vCvr);
            Avx.Store((int*)dst, vRes);
            length -= 8;
            dst += 8;
            covers += 8;
        }
    }

    /// <summary>
    /// Blends per-pixel source colors in groups of 8 pixels, advancing <paramref name="dst"/>,
    /// <paramref name="src"/> and <paramref name="covers"/> and decrementing <paramref name="length"/>
    /// by the pixels consumed. On return <paramref name="length"/> holds the 0..7 pixels left over.
    /// </summary>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public void BlendPixelSimd8(ref Color* dst, ref int length, ref Color* src, ref byte* covers)
    {
        var stop = length & 7;
        while (length > stop)
        {
            var vSrc = *(Vector256<byte>*)src;
            var vSrcA = Avx2.Shuffle(vSrc, PixelInfo.Simd8.ShuffleMaskA).AsInt32();
            // Reads exactly 8 cover bytes and zero-extends them to 32-bit lanes.
            var vCvr = Avx2.ConvertToVector256Int32(covers);
            var vRes = BlendSimd8(*(Vector256<byte>*)dst, vSrc, vSrcA, vCvr);
            Avx.Store((int*)dst, vRes);
            length -= 8;
            dst += 8;
            src += 8;
            covers += 8;
        }
    }

    /// <summary>
    /// Vector form of <c>BlendPixel</c> for 8 pixels: splits each channel into 32-bit lanes,
    /// applies the scalar formulas lane-wise and re-packs the channels into 32-bit pixels.
    /// </summary>
    /// <param name="vDst">Eight packed destination pixels.</param>
    /// <param name="vSrc">Eight packed source pixels.</param>
    /// <param name="vSrcA">Source alpha of each pixel, one per 32-bit lane.</param>
    /// <param name="vCvr">Coverage of each pixel, one per 32-bit lane.</param>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    private static Vector256<int> BlendSimd8(Vector256<byte> vDst, Vector256<byte> vSrc, Vector256<int> vSrcA, Vector256<int> vCvr)
    {
        // Effective alpha: (srcA * (cover + 1)) >> Shift.
        var vSrcAc = Avx2.ShiftRightLogical(Avx2.MultiplyLow(Avx2.Add(vCvr, PixelInfo.Simd8.ShuffleMask1), vSrcA), PixelInfo.Shift);

        var vSrcR = Avx2.Shuffle(vSrc, PixelInfo.Simd8.ShuffleMaskR).AsInt32();
        var vDstR = Avx2.Shuffle(vDst, PixelInfo.Simd8.ShuffleMaskR).AsInt32();
        var vResR = Avx2.ShiftRightLogical(Avx2.Add(Avx2.MultiplyLow(Avx2.Subtract(vSrcR, vDstR), vSrcAc), Avx2.ShiftLeftLogical(vDstR, PixelInfo.Shift)), PixelInfo.Shift);

        var vSrcG = Avx2.Shuffle(vSrc, PixelInfo.Simd8.ShuffleMaskG).AsInt32();
        var vDstG = Avx2.Shuffle(vDst, PixelInfo.Simd8.ShuffleMaskG).AsInt32();
        var vResG = Avx2.ShiftRightLogical(Avx2.Add(Avx2.MultiplyLow(Avx2.Subtract(vSrcG, vDstG), vSrcAc), Avx2.ShiftLeftLogical(vDstG, PixelInfo.Shift)), PixelInfo.Shift);

        var vSrcB = Avx2.Shuffle(vSrc, PixelInfo.Simd8.ShuffleMaskB).AsInt32();
        var vDstB = Avx2.Shuffle(vDst, PixelInfo.Simd8.ShuffleMaskB).AsInt32();
        var vResB = Avx2.ShiftRightLogical(Avx2.Add(Avx2.MultiplyLow(Avx2.Subtract(vSrcB, vDstB), vSrcAc), Avx2.ShiftLeftLogical(vDstB, PixelInfo.Shift)), PixelInfo.Shift);

        var vDstA = Avx2.Shuffle(vDst, PixelInfo.Simd8.ShuffleMaskA).AsInt32();
        var vResA = Avx2.Subtract(Avx2.Add(vSrcAc, vDstA), Avx2.ShiftRightLogical(Avx2.Add(Avx2.MultiplyLow(vSrcAc, vDstA), PixelInfo.Simd8.ShuffleMaskM), PixelInfo.Shift));

        // Move every channel back to its byte position and merge.
        var rr = Avx2.ShiftLeftLogical(vResR, PixelInfo.ShiftR);
        var gg = Avx2.ShiftLeftLogical(vResG, PixelInfo.ShiftG);
        var bb = Avx2.ShiftLeftLogical(vResB, PixelInfo.ShiftB);
        var aa = Avx2.ShiftLeftLogical(vResA, PixelInfo.ShiftA);
        return Avx2.Or(rr, Avx2.Or(gg, Avx2.Or(bb, aa)));
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment