Skip to content

Instantly share code, notes, and snippets.

@acaly
Last active June 16, 2024 09:24
Show Gist options
  • Save acaly/7f446eb9525ceb6d80ad30a92d1637e3 to your computer and use it in GitHub Desktop.
Save acaly/7f446eb9525ceb6d80ad30a92d1637e3 to your computer and use it in GitHub Desktop.
C# image manipulation using SIMD
using BenchmarkDotNet.Attributes;
using BenchmarkDotNet.Running;
using System;
using System.Collections.Generic;
using System.Drawing;
using System.Linq;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.X86;
using System.Text;
using System.Threading.Tasks;
namespace ImageManipulation
{
public class Program
{
/*
// * Summary *
BenchmarkDotNet v0.13.12, Windows 11 (10.0.22631.3737/23H2/2023Update/SunValley3)
13th Gen Intel Core i9-13900KF, 1 CPU, 32 logical and 24 physical cores
.NET SDK 8.0.100
[Host] : .NET 8.0.0 (8.0.23.53103), X64 RyuJIT AVX2 [AttachedDebugger]
DefaultJob : .NET 8.0.0 (8.0.23.53103), X64 RyuJIT AVX2
| Method | Mean | Error | StdDev |
|---------- |---------:|--------:|--------:|
| Run_Naive | 459.9 us | 2.16 us | 2.02 us |
| Run_128 | 246.0 us | 1.87 us | 1.75 us |
| Run_256 | 165.8 us | 3.25 us | 4.66 us |
// * Warnings *
Environment
Summary -> Benchmark was executed with attached debugger
*/
/// <summary>Number of pixels processed per benchmark invocation (1080 x 768).</summary>
private const int Size = 1080 * 768;
/// <summary>Packed RGB bytes produced once by <see cref="ReadImage"/> and shared by every instance.</summary>
private static readonly byte[] _image = ReadImage();
/// <summary>
/// Per-instance RGB input, 3 bytes per pixel. It is assigned in the constructor, so no
/// placeholder array is allocated here (the previous initializer was overwritten immediately).
/// </summary>
private readonly byte[] _input;
/// <summary>Grayscale result buffer, one byte per pixel.</summary>
private readonly byte[] _output = new byte[Size];
/// <summary>
/// Decodes <c>test.png</c> into a tightly packed RGB byte array: 3 bytes per pixel
/// (R, G, B), rows stored top to bottom, 1080 x 768 pixels.
/// </summary>
/// <returns>A new array of 1080 * 768 * 3 bytes.</returns>
private static byte[] ReadImage()
{
    const int width = 1080;
    const int height = 768;
    byte[] ret = new byte[width * height * 3];

    // Bitmap owns a native GDI+ handle and keeps the source file open;
    // dispose it as soon as the pixels have been copied out.
    using var image = new Bitmap(@"test.png");

    // Pixels are visited in row-major order, so a running offset is
    // equivalent to recomputing 3 * (y * width + x) for every pixel.
    int offset = 0;
    for (int y = 0; y < height; ++y)
    {
        for (int x = 0; x < width; ++x)
        {
            var col = image.GetPixel(x, y);
            ret[offset++] = col.R;
            ret[offset++] = col.G;
            ret[offset++] = col.B;
        }
    }
    return ret;
}
/// <summary>
/// Gives each instance its own copy of the decoded image so the shared
/// <c>_image</c> array is never handed out directly.
/// </summary>
public Program()
{
    // Alternative for synthetic data: fill the input with Random.Shared.NextBytes instead.
    _input = (byte[])_image.Clone();
}
/// <summary>
/// Saves the current contents of <c>_output</c> as a grayscale image to
/// <c>test_output.png</c>, using each output byte for all three colour channels.
/// </summary>
private void Write()
{
    // Bitmap wraps a native GDI+ handle; release it once the file has been written.
    using var image = new Bitmap(1080, 768, System.Drawing.Imaging.PixelFormat.Format24bppRgb);
    for (int y = 0; y < 768; ++y)
    {
        for (int x = 0; x < 1080; ++x)
        {
            var w = _output[y * 1080 + x];
            Color c = Color.FromArgb(w, w, w);
            image.SetPixel(x, y, c);
        }
    }
    image.Save(@"test_output.png");
}
/// <summary>Weight of the red channel; <c>Run_256</c> converts it to a 16-bit fixed-point scale (65536 * R).</summary>
public float R { get; set; } = 0.3f;
/// <summary>Weight of the green channel; <c>Run_256</c> converts it to a 16-bit fixed-point scale (65536 * G).</summary>
public float G { get; set; } = 0.4f;
/// <summary>Weight of the blue channel; <c>Run_256</c> converts it to a 16-bit fixed-point scale (65536 * B).</summary>
public float B { get; set; } = 0.3f;
/// <summary>
/// Scalar baseline: converts every RGB pixel of <c>_input</c> to one grayscale byte in <c>_output</c>.
/// </summary>
/// <remarks>
/// Uses the <see cref="R"/>, <see cref="G"/> and <see cref="B"/> weights as 16-bit fixed-point
/// scales and keeps the high 16 bits of each product, i.e. the same arithmetic the vectorized
/// benchmarks perform with <c>MultiplyHigh</c>. The previous hard-coded 0.7 / 0.2 / 0.1 weights
/// made this baseline produce a different image from the SIMD versions.
/// </remarks>
[Benchmark]
public void Run_Naive()
{
    // Loop-invariant scales, computed once exactly as in Run_256.
    int rScale = (ushort)(65536 * R);
    int gScale = (ushort)(65536 * G);
    int bScale = (ushort)(65536 * B);
    for (int i = 0; i < Size; ++i)
    {
        // byte * ushort fits comfortably in an int; >> 16 keeps the high half of the product.
        var r = (_input[i * 3 + 0] * rScale) >> 16;
        var g = (_input[i * 3 + 1] * gScale) >> 16;
        var b = (_input[i * 3 + 2] * bScale) >> 16;
        _output[i] = (byte)(r + g + b);
    }
}
/// <summary>
/// 128-bit SIMD version: each iteration turns 16 RGB pixels (48 input bytes) into
/// 16 grayscale bytes using the single-vector <c>Conv</c> overload.
/// </summary>
[Benchmark]
public void Run_128()
{
    ref uint src = ref Unsafe.As<byte, uint>(ref _input[0]);
    ref uint dst = ref Unsafe.As<byte, uint>(ref _output[0]);
    const int blockCount = Size / 16;
    for (int block = 0; block < blockCount; ++block)
    {
        // 48 input bytes = 12 uints per block, read as three 16-byte chunks.
        int srcIndex = block * 12;
        var chunkA = Vector128.LoadUnsafe(ref Unsafe.Add(ref src, srcIndex));
        var chunkB = Vector128.LoadUnsafe(ref Unsafe.Add(ref src, srcIndex + 4));
        var chunkC = Vector128.LoadUnsafe(ref Unsafe.Add(ref src, srcIndex + 8));

        // Re-align so that every Conv call sees one 4-pixel group (12 bytes) at the
        // start of its vector: groups begin at byte 0, 12, 24 and 36 of the block.
        var gray0 = Conv(chunkA);
        var gray1 = Conv(Ssse3.AlignRight(chunkB, chunkA, 12));
        var gray2 = Conv(Ssse3.AlignRight(chunkC, chunkB, 8));
        var gray3 = Conv(Ssse3.AlignRight(Vector128<uint>.Zero, chunkC, 4));

        // Conv leaves pixel k of a group in byte k of 32-bit lane k. Two rounds of
        // 16-bit horizontal adds squeeze those sparse bytes together:
        //   0--- -1-- --2- ---3 + 4--- -5-- --6- ---7 = 01-- --23 45-- --67
        //   01-- --23 45-- --67 + ...                 = 0123 4567 ....
        var pair01 = Ssse3.HorizontalAdd(gray0.AsInt16(), gray1.AsInt16());
        var pair23 = Ssse3.HorizontalAdd(gray2.AsInt16(), gray3.AsInt16());
        var packed = Ssse3.HorizontalAdd(pair01, pair23);

        // 16 output bytes = 4 uints per block.
        Vector128.StoreUnsafe(packed.AsUInt32(), ref Unsafe.Add(ref dst, block * 4));
    }
}
/// <summary>
/// 256-bit SIMD version: each iteration turns 16 RGB pixels (48 input bytes) into
/// 16 grayscale bytes, converting two 4-pixel groups per <c>Conv</c> call.
/// Channel weights come from <see cref="R"/>, <see cref="G"/> and <see cref="B"/>.
/// </summary>
[Benchmark]
public void Run_256()
{
    ref uint src = ref Unsafe.As<byte, uint>(ref _input[0]);
    ref uint dst = ref Unsafe.As<byte, uint>(ref _output[0]);

    // 16-bit fixed-point weights, broadcast to every 16-bit element.
    var redScale = Vector256.Create((ushort)(65536 * R));
    var greenScale = Vector256.Create((ushort)(65536 * G));
    var blueScale = Vector256.Create((ushort)(65536 * B));

    const int blockCount = Size / 16;
    for (int block = 0; block < blockCount; ++block)
    {
        // 48 input bytes = 12 uints per block, read as three 16-byte chunks.
        int srcIndex = block * 12;
        var chunkA = Vector128.LoadUnsafe(ref Unsafe.Add(ref src, srcIndex));
        var chunkB = Vector128.LoadUnsafe(ref Unsafe.Add(ref src, srcIndex + 4));
        var chunkC = Vector128.LoadUnsafe(ref Unsafe.Add(ref src, srcIndex + 8));

        // Four 4-pixel groups, each moved to the start of its own vector.
        var group0 = chunkA;
        var group1 = Ssse3.AlignRight(chunkB, chunkA, 12);
        var group2 = Ssse3.AlignRight(chunkC, chunkB, 8);
        var group3 = Ssse3.AlignRight(Vector128<uint>.Zero, chunkC, 4);

        // Groups are paired (0,2) and (1,3) so that the per-lane horizontal add
        // below yields pixels 0-7 in the lower half and pixels 8-15 in the upper half.
        var gray02 = Conv(group0, group2, redScale, greenScale, blueScale);
        var gray13 = Conv(group1, group3, redScale, greenScale, blueScale);

        var halves = Avx2.HorizontalAdd(gray02.AsInt16(), gray13.AsInt16());
        var packed = Ssse3.HorizontalAdd(halves.GetLower(), halves.GetUpper());

        // 16 output bytes = 4 uints per block.
        Vector128.StoreUnsafe(packed.AsUInt32(), ref Unsafe.Add(ref dst, block * 4));
    }
}
/// <summary>
/// Converts four packed RGB pixels (the low 12 bytes of <paramref name="rgb_p3"/>) to grayscale
/// with fixed weights 0.3 / 0.4 / 0.3.
/// </summary>
/// <param name="rgb_p3">Bytes R0 G0 B0 R1 G1 B1 R2 G2 B2 R3 G3 B3 followed by 4 bytes that are never read.</param>
/// <returns>
/// A vector in which 32-bit lane k holds the gray value of pixel k shifted left by 8 * k bits,
/// i.e. in byte k of that lane, ready to be packed with horizontal adds.
/// </returns>
/// <remarks>Although it works on 128-bit vectors, this uses AVX and AVX2 instructions.</remarks>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static Vector128<uint> Conv(Vector128<uint> rgb_p3)
{
    const ushort redScale = (ushort)(65536 * 0.3f);
    const ushort greenScale = (ushort)(65536 * 0.4f);
    const ushort blueScale = (ushort)(65536 * 0.3f);

    Vector128<float> lanes = rgb_p3.AsSingle();

    // For each channel: copy the dword that contains pixel k's byte into lane k,
    // mask that byte out, shift it down to bit 0, then keep the high 16 bits of
    // (byte * scale). The upper 16 bits of every lane are zero before the multiply.
    var redBytes = Avx.Permute(lanes, 0b10010000).AsUInt32()
        & Vector128.Create(0x000000FFu, 0xFF000000u, 0x00FF0000u, 0x0000FF00u);
    var redValues = Avx2.ShiftRightLogicalVariable(redBytes, Vector128.Create(0u, 24u, 16u, 8u));
    var red = Sse2.MultiplyHigh(redValues.AsUInt16(), Vector128.Create(redScale)).AsUInt32();

    var greenBytes = Avx.Permute(lanes, 0b10010100).AsUInt32()
        & Vector128.Create(0x0000FF00u, 0x000000FFu, 0xFF000000u, 0x00FF0000u);
    var greenValues = Avx2.ShiftRightLogicalVariable(greenBytes, Vector128.Create(8u, 0u, 24u, 16u));
    var green = Sse2.MultiplyHigh(greenValues.AsUInt16(), Vector128.Create(greenScale)).AsUInt32();

    var blueBytes = Avx.Permute(lanes, 0b10100100).AsUInt32()
        & Vector128.Create(0x00FF0000u, 0x0000FF00u, 0x000000FFu, 0xFF000000u);
    var blueValues = Avx2.ShiftRightLogicalVariable(blueBytes, Vector128.Create(16u, 8u, 0u, 24u));
    var blue = Sse2.MultiplyHigh(blueValues.AsUInt16(), Vector128.Create(blueScale)).AsUInt32();

    // TODO: the channel sum is not saturated.
    // Move pixel k's result into byte k of lane k.
    return Avx2.ShiftLeftLogicalVariable(red + green + blue, Vector128.Create(0u, 8u, 16u, 24u));
}
/// <summary>
/// Converts two independent groups of four packed RGB pixels to grayscale in one 256-bit pass.
/// </summary>
/// <param name="input1">First group (low 12 bytes used); becomes the lower 128 bits.</param>
/// <param name="input2">Second group (low 12 bytes used); becomes the upper 128 bits.</param>
/// <param name="rscale">Red weight as 16-bit fixed point, broadcast to all elements.</param>
/// <param name="gscale">Green weight as 16-bit fixed point, broadcast to all elements.</param>
/// <param name="bscale">Blue weight as 16-bit fixed point, broadcast to all elements.</param>
/// <returns>
/// For each 128-bit half, 32-bit lane k holds the gray value of that group's pixel k
/// shifted left by 8 * k bits.
/// </returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static Vector256<uint> Conv(Vector128<uint> input1, Vector128<uint> input2,
    Vector256<ushort> rscale, Vector256<ushort> gscale, Vector256<ushort> bscale)
{
    Vector256<float> lanes = Vector256.Create(input1, input2).AsSingle();

    // Same recipe per channel as the 128-bit overload, with every constant repeated
    // for both halves: select the dword holding pixel k's byte, mask it, shift it to
    // bit 0, then keep the high 16 bits of (byte * scale).
    var redBytes = Avx.Permute(lanes, 0b10010000).AsUInt32()
        & Vector256.Create(0x000000FFu, 0xFF000000u, 0x00FF0000u, 0x0000FF00u,
                           0x000000FFu, 0xFF000000u, 0x00FF0000u, 0x0000FF00u);
    var redValues = Avx2.ShiftRightLogicalVariable(redBytes,
        Vector256.Create(0u, 24u, 16u, 8u, 0u, 24u, 16u, 8u));
    var red = Avx2.MultiplyHigh(redValues.AsUInt16(), rscale).AsUInt32();

    var greenBytes = Avx.Permute(lanes, 0b10010100).AsUInt32()
        & Vector256.Create(0x0000FF00u, 0x000000FFu, 0xFF000000u, 0x00FF0000u,
                           0x0000FF00u, 0x000000FFu, 0xFF000000u, 0x00FF0000u);
    var greenValues = Avx2.ShiftRightLogicalVariable(greenBytes,
        Vector256.Create(8u, 0u, 24u, 16u, 8u, 0u, 24u, 16u));
    var green = Avx2.MultiplyHigh(greenValues.AsUInt16(), gscale).AsUInt32();

    var blueBytes = Avx.Permute(lanes, 0b10100100).AsUInt32()
        & Vector256.Create(0x00FF0000u, 0x0000FF00u, 0x000000FFu, 0xFF000000u,
                           0x00FF0000u, 0x0000FF00u, 0x000000FFu, 0xFF000000u);
    var blueValues = Avx2.ShiftRightLogicalVariable(blueBytes,
        Vector256.Create(16u, 8u, 0u, 24u, 16u, 8u, 0u, 24u));
    var blue = Avx2.MultiplyHigh(blueValues.AsUInt16(), bscale).AsUInt32();

    // TODO: the channel sum is not saturated.
    // Move pixel k's result into byte k of lane k within each half.
    return Avx2.ShiftLeftLogicalVariable(red + green + blue,
        Vector256.Create(0u, 8u, 16u, 24u, 0u, 8u, 16u, 24u));
}
/// <summary>
/// Entry point: runs every <c>[Benchmark]</c> method of this class through BenchmarkDotNet.
/// </summary>
/// <remarks>
/// For a one-off visual check instead of a benchmark run, create a <c>Program</c>,
/// call <c>Run_256()</c> and then <c>Write()</c>.
/// </remarks>
private static void Main() => BenchmarkRunner.Run<Program>();
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment