Last active
June 16, 2024 09:24
-
-
Save acaly/7f446eb9525ceb6d80ad30a92d1637e3 to your computer and use it in GitHub Desktop.
C# image manipulation using SIMD
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
using BenchmarkDotNet.Attributes; | |
using BenchmarkDotNet.Running; | |
using System; | |
using System.Collections.Generic; | |
using System.Drawing; | |
using System.Linq; | |
using System.Runtime.CompilerServices; | |
using System.Runtime.InteropServices; | |
using System.Runtime.Intrinsics; | |
using System.Runtime.Intrinsics.X86; | |
using System.Text; | |
using System.Threading.Tasks; | |
namespace ImageManipulation | |
{ | |
public class Program | |
{ | |
/* | |
// * Summary * | |
BenchmarkDotNet v0.13.12, Windows 11 (10.0.22631.3737/23H2/2023Update/SunValley3) | |
13th Gen Intel Core i9-13900KF, 1 CPU, 32 logical and 24 physical cores | |
.NET SDK 8.0.100 | |
[Host] : .NET 8.0.0 (8.0.23.53103), X64 RyuJIT AVX2 [AttachedDebugger] | |
DefaultJob : .NET 8.0.0 (8.0.23.53103), X64 RyuJIT AVX2 | |
| Method | Mean | Error | StdDev | | |
|---------- |---------:|--------:|--------:| | |
| Run_Naive | 459.9 us | 2.16 us | 2.02 us | | |
| Run_128 | 246.0 us | 1.87 us | 1.75 us | | |
| Run_256 | 165.8 us | 3.25 us | 4.66 us | | |
// * Warnings * | |
Environment | |
Summary -> Benchmark was executed with attached debugger | |
*/ | |
/// <summary>Number of pixels in the 1080 x 768 test image.</summary>
private const int Size = 1080 * 768;

/// <summary>
/// Packed R, G, B bytes of the test image, produced once by <see cref="ReadImage"/>
/// when the type is initialised.
/// </summary>
private static readonly byte[] _image = ReadImage();

/// <summary>
/// Benchmark input, three bytes per pixel. The constructor replaces this buffer
/// with a copy of <see cref="_image"/>.
/// </summary>
private readonly byte[] _input = new byte[3 * Size];

/// <summary>Benchmark output, one gray byte per pixel.</summary>
private readonly byte[] _output = new byte[Size];
/// <summary>
/// Decodes <c>test.png</c> into a tightly packed buffer of R, G, B bytes
/// (three bytes per pixel, rows stored top to bottom).
/// </summary>
/// <returns>A new array of 1080 * 768 * 3 bytes.</returns>
private static byte[] ReadImage()
{
    const int width = 1080;
    const int height = 768;

    byte[] ret = new byte[width * height * 3];

    // Bitmap is IDisposable; dispose it as soon as the pixels have been copied
    // instead of leaving the native image (and the file it was opened from)
    // to the finalizer.
    using var image = new Bitmap(@"test.png");

    // Pixels are visited in row-major order, so a running index is equivalent
    // to 3 * (y * width + x) and avoids building a Span for every pixel.
    int offset = 0;
    for (int y = 0; y < height; ++y)
    {
        for (int x = 0; x < width; ++x)
        {
            var col = image.GetPixel(x, y);
            ret[offset++] = col.R;
            ret[offset++] = col.G;
            ret[offset++] = col.B;
        }
    }

    return ret;
}
/// <summary>
/// Gives this instance its own copy of the decoded image bytes to use as benchmark input.
/// </summary>
public Program()
{
    //Random.Shared.NextBytes(_input);
    _input = (byte[])_image.Clone();
}
/// <summary>
/// Saves the contents of <c>_output</c> as a gray image to <c>test_output.png</c>,
/// using each output byte for the red, green and blue component of its pixel.
/// </summary>
private void Write()
{
    const int width = 1080;
    const int height = 768;

    // Bitmap is IDisposable; release the native image once it has been saved.
    using var image = new Bitmap(width, height, System.Drawing.Imaging.PixelFormat.Format24bppRgb);

    for (int y = 0; y < height; ++y)
    {
        for (int x = 0; x < width; ++x)
        {
            byte gray = _output[y * width + x];
            image.SetPixel(x, y, Color.FromArgb(gray, gray, gray));
        }
    }

    image.Save(@"test_output.png");
}
/// <summary>
/// Weight of the red channel; <see cref="Run_256"/> converts it to 16-bit fixed point.
/// Defaults to 0.3.
/// </summary>
public float R { get; set; } = 0.3f;

/// <summary>
/// Weight of the green channel; <see cref="Run_256"/> converts it to 16-bit fixed point.
/// Defaults to 0.4.
/// </summary>
public float G { get; set; } = 0.4f;

/// <summary>
/// Weight of the blue channel; <see cref="Run_256"/> converts it to 16-bit fixed point.
/// Defaults to 0.3.
/// </summary>
public float B { get; set; } = 0.3f;
/// <summary>
/// Scalar baseline: converts every RGB triple in <c>_input</c> to one gray byte in <c>_output</c>.
/// </summary>
/// <remarks>
/// Uses the <see cref="R"/>, <see cref="G"/> and <see cref="B"/> weights with the same
/// <c>(value * weight) &gt;&gt; 16</c> fixed-point arithmetic as the vector kernels, so with the
/// default weights it produces the same bytes as <see cref="Run_128"/> and <see cref="Run_256"/>.
/// </remarks>
[Benchmark]
public void Run_Naive()
{
    // Weight as an unsigned 0.16 fixed-point number. Clamping keeps weights
    // outside [0, 1) from hitting an out-of-range float-to-integer conversion.
    static int ToFixed16(float weight) => (int)Math.Clamp(65536f * weight, 0f, 65535f);

    int rScale = ToFixed16(R);
    int gScale = ToFixed16(G);
    int bScale = ToFixed16(B);

    for (int i = 0; i < Size; ++i)
    {
        int px = i * 3;
        int r = (_input[px + 0] * rScale) >> 16;
        int g = (_input[px + 1] * gScale) >> 16;
        int b = (_input[px + 2] * bScale) >> 16;

        // Weights that sum to more than 1 could exceed a byte; saturate instead of wrapping.
        _output[i] = (byte)Math.Min(r + g + b, 255);
    }
}
/// <summary>
/// 128-bit SIMD conversion. Each iteration reads 48 input bytes (16 RGB pixels)
/// and stores 16 gray bytes.
/// </summary>
[Benchmark]
public void Run_128()
{
    ref uint src = ref Unsafe.As<byte, uint>(ref _input[0]);
    ref uint dst = ref Unsafe.As<byte, uint>(ref _output[0]);

    const int blockCount = Size / 16;
    for (int block = 0; block < blockCount; ++block)
    {
        // Offsets are counted in 32-bit words: 12 words = 48 bytes per block.
        int srcWord = block * 12;
        var bytes00 = Vector128.LoadUnsafe(ref Unsafe.Add(ref src, srcWord));
        var bytes16 = Vector128.LoadUnsafe(ref Unsafe.Add(ref src, srcWord + 4));
        var bytes32 = Vector128.LoadUnsafe(ref Unsafe.Add(ref src, srcWord + 8));

        // Each group of four pixels occupies 12 bytes, so groups 1..3 straddle the
        // 16-byte loads. AlignRight splices two neighbouring loads so that every
        // group handed to Conv starts at byte 0.
        var gray0 = Conv(bytes00).AsInt16();
        var gray1 = Conv(Ssse3.AlignRight(bytes16, bytes00, 12)).AsInt16();
        var gray2 = Conv(Ssse3.AlignRight(bytes32, bytes16, 8)).AsInt16();
        var gray3 = Conv(Ssse3.AlignRight(Vector128<uint>.Zero, bytes32, 4)).AsInt16();

        // Conv leaves pixel j in byte j of 32-bit lane j (all other bytes zero):
        //   0--- -1-- --2- ---3 + 4--- -5-- --6- ---7 = 01-- --23 45-- --67
        //   01-- --23 45-- --67 + ...                 = 0123 4567 ....
        var gray01 = Ssse3.HorizontalAdd(gray0, gray1);
        var gray23 = Ssse3.HorizontalAdd(gray2, gray3);
        var packed = Ssse3.HorizontalAdd(gray01, gray23);

        packed.AsUInt32().StoreUnsafe(ref Unsafe.Add(ref dst, block * 4));
    }
}
/// <summary>
/// 256-bit SIMD conversion using the <see cref="R"/>, <see cref="G"/> and <see cref="B"/> weights.
/// Each iteration reads 48 input bytes (16 RGB pixels) and stores 16 gray bytes.
/// </summary>
[Benchmark]
public void Run_256()
{
    // Weight as an unsigned 0.16 fixed-point number. A plain (ushort) cast of
    // 65536 * weight is out of range for weights >= 1 (or negative ones), and the
    // result of such an unchecked conversion is not a usable scale. Clamp first.
    static ushort ToFixed16(float weight) => (ushort)Math.Clamp(65536f * weight, 0f, 65535f);

    ref uint input = ref Unsafe.As<byte, uint>(ref _input[0]);
    ref uint output = ref Unsafe.As<byte, uint>(ref _output[0]);

    // One copy of each weight in all sixteen 16-bit lanes.
    var scale_vec_r = Vector256.Create(ToFixed16(R));
    var scale_vec_g = Vector256.Create(ToFixed16(G));
    var scale_vec_b = Vector256.Create(ToFixed16(B));

    for (int i = 0; i < Size / 16; ++i)
    {
        // Offsets are counted in 32-bit words: 12 words = 48 bytes per iteration.
        var read0 = Vector128.LoadUnsafe(ref Unsafe.Add(ref input, i * 12));
        var read1 = Vector128.LoadUnsafe(ref Unsafe.Add(ref input, i * 12 + 4));
        var read2 = Vector128.LoadUnsafe(ref Unsafe.Add(ref input, i * 12 + 8));

        // Re-align the four 12-byte pixel groups so that each starts at byte 0.
        var aligned0 = read0;
        var aligned1 = Ssse3.AlignRight(read1, read0, 12);
        var aligned2 = Ssse3.AlignRight(read2, read1, 8);
        var aligned3 = Ssse3.AlignRight(default, read2, 4);

        // Groups are paired as (0, 2) and (1, 3) because the 256-bit horizontal add
        // works within each 128-bit half: the low half then combines groups 0 and 1,
        // the high half groups 2 and 3.
        var write02 = Conv(aligned0, aligned2, scale_vec_r, scale_vec_g, scale_vec_b);
        var write13 = Conv(aligned1, aligned3, scale_vec_r, scale_vec_g, scale_vec_b);
        var hadd0123 = Avx2.HorizontalAdd(Vector256.AsInt16(write02), Vector256.AsInt16(write13));

        // Final merge of the two halves yields the 16 gray bytes in pixel order.
        var result = Ssse3.HorizontalAdd(hadd0123.GetLower(), hadd0123.GetUpper());
        Vector128.StoreUnsafe(Vector128.AsUInt32(result), ref Unsafe.Add(ref output, i * 4));
    }
}
/// <summary>
/// Converts four packed RGB pixels to gray using fixed weights of 0.3, 0.4 and 0.3.
/// </summary>
/// <param name="rgb_p3">
/// Twelve bytes R0 G0 B0 R1 G1 B1 R2 G2 B2 R3 G3 B3 in the first three 32-bit lanes;
/// the fourth lane is not read.
/// </param>
/// <returns>
/// A vector whose 32-bit lane j holds the gray value of pixel j shifted into byte j,
/// with every other byte zero.
/// </returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static Vector128<uint> Conv(Vector128<uint> rgb_p3)
{
    // Weights as unsigned 0.16 fixed-point numbers.
    const ushort redWeight = (ushort)(65536 * 0.3f);
    const ushort greenWeight = (ushort)(65536 * 0.4f);
    const ushort blueWeight = (ushort)(65536 * 0.3f);

    var words = rgb_p3.AsSingle();

    // For each channel: copy the source word that contains pixel j's byte into lane j,
    // mask that byte out, shift it down to bits 0..7, then take the high 16 bits of
    // (byte * weight), which is the byte scaled by the weight.

    // Red bytes sit at offsets 0, 3, 6, 9 -> words 0, 0, 1, 2.
    var redWords = Avx.Permute(words, 0b10010000).AsUInt32();
    var redBytes = Avx2.ShiftRightLogicalVariable(
        redWords & Vector128.Create(0x000000FFu, 0xFF000000u, 0x00FF0000u, 0x0000FF00u),
        Vector128.Create(0u, 24u, 16u, 8u));
    var red = Sse2.MultiplyHigh(redBytes.AsUInt16(), Vector128.Create(redWeight)).AsUInt32();

    // Green bytes sit at offsets 1, 4, 7, 10 -> words 0, 1, 1, 2.
    var greenWords = Avx.Permute(words, 0b10010100).AsUInt32();
    var greenBytes = Avx2.ShiftRightLogicalVariable(
        greenWords & Vector128.Create(0x0000FF00u, 0x000000FFu, 0xFF000000u, 0x00FF0000u),
        Vector128.Create(8u, 0u, 24u, 16u));
    var green = Sse2.MultiplyHigh(greenBytes.AsUInt16(), Vector128.Create(greenWeight)).AsUInt32();

    // Blue bytes sit at offsets 2, 5, 8, 11 -> words 0, 1, 2, 2.
    var blueWords = Avx.Permute(words, 0b10100100).AsUInt32();
    var blueBytes = Avx2.ShiftRightLogicalVariable(
        blueWords & Vector128.Create(0x00FF0000u, 0x0000FF00u, 0x000000FFu, 0xFF000000u),
        Vector128.Create(16u, 8u, 0u, 24u));
    var blue = Sse2.MultiplyHigh(blueBytes.AsUInt16(), Vector128.Create(blueWeight)).AsUInt32();

    // The three fixed weights add up to 65534, so a gray value is at most 253 and
    // always fits in one byte; no saturation is applied here.
    var gray = red + green + blue;

    // Move pixel j's value into byte j so the caller can merge lanes with horizontal adds.
    return Avx2.ShiftLeftLogicalVariable(gray, Vector128.Create(0u, 8u, 16u, 24u));
}
/// <summary>
/// Converts two groups of four packed RGB pixels to gray with caller-supplied weights.
/// </summary>
/// <param name="input1">
/// First group: twelve bytes R0 G0 B0 ... R3 G3 B3 in the first three 32-bit lanes;
/// the fourth lane is not read. Processed in the lower 128 bits.
/// </param>
/// <param name="input2">Second group, same layout. Processed in the upper 128 bits.</param>
/// <param name="rscale">Red weight as unsigned 0.16 fixed point, in every 16-bit lane.</param>
/// <param name="gscale">Green weight as unsigned 0.16 fixed point, in every 16-bit lane.</param>
/// <param name="bscale">Blue weight as unsigned 0.16 fixed point, in every 16-bit lane.</param>
/// <returns>
/// A vector in which, within each 128-bit half, 32-bit lane j holds the gray value of
/// pixel j (saturated to 255) shifted into byte j, with every other byte zero.
/// </returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static Vector256<uint> Conv(Vector128<uint> input1, Vector128<uint> input2,
    Vector256<ushort> rscale, Vector256<ushort> gscale, Vector256<ushort> bscale)
{
    var input = Vector256.Create(input1, input2);
    Vector256<uint> sum;

    // For each channel: copy the source word that contains pixel j's byte into lane j
    // (the permute acts on each 128-bit half separately), mask that byte out, shift it
    // down to bits 0..7, then take the high 16 bits of (byte * weight).
    {
        // Red bytes sit at offsets 0, 3, 6, 9 -> words 0, 0, 1, 2.
        var perm = Vector256.AsUInt32(Avx.Permute(Vector256.AsSingle(input), 0b10010000));
        var masked = Vector256.BitwiseAnd(perm,
            Vector256.Create(0x000000FF, 0xFF000000, 0x00FF0000, 0x0000FF00, 0x000000FF, 0xFF000000, 0x00FF0000, 0x0000FF00));
        var shifted = Avx2.ShiftRightLogicalVariable(masked,
            Vector256.Create(0u, 24u, 16u, 8u, 0u, 24u, 16u, 8u));
        var scaled = Vector256.AsUInt32(Avx2.MultiplyHigh(Vector256.AsUInt16(shifted), rscale));
        sum = scaled;
    }
    {
        // Green bytes sit at offsets 1, 4, 7, 10 -> words 0, 1, 1, 2.
        var perm = Vector256.AsUInt32(Avx.Permute(Vector256.AsSingle(input), 0b10010100));
        var masked = Vector256.BitwiseAnd(perm,
            Vector256.Create(0x0000FF00, 0x000000FF, 0xFF000000, 0x00FF0000, 0x0000FF00, 0x000000FF, 0xFF000000, 0x00FF0000));
        var shifted = Avx2.ShiftRightLogicalVariable(masked,
            Vector256.Create(8u, 0u, 24u, 16u, 8u, 0u, 24u, 16u));
        var scaled = Vector256.AsUInt32(Avx2.MultiplyHigh(Vector256.AsUInt16(shifted), gscale));
        sum += scaled;
    }
    {
        // Blue bytes sit at offsets 2, 5, 8, 11 -> words 0, 1, 2, 2.
        var perm = Vector256.AsUInt32(Avx.Permute(Vector256.AsSingle(input), 0b10100100));
        var masked = Vector256.BitwiseAnd(perm,
            Vector256.Create(0x00FF0000, 0x0000FF00, 0x000000FF, 0xFF000000, 0x00FF0000, 0x0000FF00, 0x000000FF, 0xFF000000));
        var shifted = Avx2.ShiftRightLogicalVariable(masked,
            Vector256.Create(16u, 8u, 0u, 24u, 16u, 8u, 0u, 24u));
        var scaled = Vector256.AsUInt32(Avx2.MultiplyHigh(Vector256.AsUInt16(shifted), bscale));
        sum += scaled;
    }

    // The weights come from the caller and may add up to more than 1.0, in which case
    // a lane can exceed 255. Without saturation the excess bits would land in the
    // neighbouring pixel's byte once the lanes are shifted and merged.
    sum = Avx2.Min(sum, Vector256.Create(255u));

    // Move pixel j's value into byte j so the caller can merge lanes with horizontal adds.
    return Avx2.ShiftLeftLogicalVariable(sum, Vector256.Create(0u, 8u, 16u, 24u, 0u, 8u, 16u, 24u));
}
/// <summary>Entry point: runs every <c>[Benchmark]</c> method of this class under BenchmarkDotNet.</summary>
private static void Main()
{
    // For a one-off visual check instead of a benchmark run:
    //var p = new Program();
    //p.Run_256();
    //p.Write();
    _ = BenchmarkRunner.Run<Program>();
}
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment