Thoughts on accurate ARM NEON SDL alpha blending blitters
#include <arm_neon.h>
#include <SDL.h>
/**
 * NEON operates at up to 128-bit width and supports mostly the same operations that BlitNtoNPixelAlpha_SSE4_1 and
 * _AVX2 use, save for shuffling.
 * We have several problems to solve, some easier than others. We need to reorder the input src pixel stream
 * efficiently. At 128 bits this is 4-wide, similar to SSE4.1. Instead of shuffling in place, we can assemble lookup
 * tables that NEON uses to place the data appropriately into a new vector.
 * We need a similar table approach to extract alpha dynamically for the splat, because the lane-extract intrinsics
 * want immediate integers AFAICT, and switching on the SDL_PixelFormat->Ashift struct member is a performance
 * nightmare.
 * From there MixRGBA_NEON is nearly the same as the SSE4.1 intrinsic implementation. Missing from this code is the
 * required step where alpha is set to 0xFF in all src pixel input and only preserved in the input mask for the
 * channel multiply; a hedged sketch of that step follows this comment.
 *
 * Benchmarks indicate a jump from ~20 FPS with the optimized scalar form to over 90 FPS on an M1 Max under the
 * latest macOS and Apple Clang at `-O3`.
 */
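/*
 * Hedged sketch of the missing alpha-saturate step described above, not wired into the blit path below:
 * force the alpha byte of the (already reordered) source pixels to 0xFF so the channel multiply leaves
 * destination alpha opaque, while the separately splatted copy of source alpha still drives the blend.
 * The use of SDL_PixelFormat->Amask here is an assumption about how this would be hooked up.
 */
uint8x16_t saturate_src_alpha(uint8x16_t reordered_colors, const SDL_PixelFormat *dstfmt) {
    // Broadcast the 32-bit alpha mask (e.g. 0xFF000000 for ARGB8888) across all four pixels
    uint32x4_t amask = vdupq_n_u32(dstfmt->Amask);
    // OR the mask in so each pixel's alpha byte becomes 0xFF
    return vorrq_u8(reordered_colors, vreinterpretq_u8_u32(amask));
}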
// vtable fetches alpha from input pixels based on format for splat
uint8x8_t get_alpha_vtable(SDL_PixelFormat* dstfmt) {
    // Each lane indexes the alpha byte of the pixel it belongs to, so vtbl1_u8 replicates that
    // pixel's alpha across all four of its bytes (e.g. {3,3,3,3,7,7,7,7} when Ashift is 24)
    const uint8_t alpha_index = (uint8_t)(dstfmt->Ashift / 8);
    uint8_t pattern_data[8];
    for (int i = 0; i < 8; ++i) {
        pattern_data[i] = (uint8_t)((i & ~3) | alpha_index);
    }
    return vld1_u8(pattern_data);
}
// Uses the above vtable to pull alpha into a vector, already replicated across each pixel's bytes
uint8x16_t extract_alpha_values(uint8x16_t input_colors, uint8x8_t pattern) {
    // Run the table lookup on each 64-bit half (two pixels per half)
    uint8x8_t low_alpha = vtbl1_u8(vget_low_u8(input_colors), pattern);
    uint8x8_t high_alpha = vtbl1_u8(vget_high_u8(input_colors), pattern);
    return vcombine_u8(low_alpha, high_alpha);
}
// Splats alpha for MixRGBA_NEON input
uint8x16_t splat_alpha_to_new_vector(uint8x16_t reordered_colors, uint8x8_t alpha_mask) {
    // The table lookup already duplicates each pixel's alpha across all bytes of its 32-bit lane,
    // so this is a thin wrapper around the extraction step
    return extract_alpha_values(reordered_colors, alpha_mask);
}
// Helper
uint8x8x2_t convert_uint8x16_to_uint8x8x2(uint8x16_t input) {
    uint8x8x2_t output;
    output.val[0] = vget_low_u8(input);
    output.val[1] = vget_high_u8(input);
    return output;
}
// Produce a vtable defining the color reorder operation
uint8x8_t generate_reorder_vtable(const SDL_PixelFormat *srcfmt, const SDL_PixelFormat *dstfmt) {
    uint8_t shuffle_mask[8];
    // For 32-bpp formats the four channel shifts divided by 8 cover byte offsets 0-3,
    // so every table entry is written for both pixels in the 64-bit half
    for (int i = 0; i < 2; ++i) {
        shuffle_mask[dstfmt->Ashift / 8 + i * 4] = srcfmt->Ashift / 8 + i * 4;
        shuffle_mask[dstfmt->Rshift / 8 + i * 4] = srcfmt->Rshift / 8 + i * 4;
        shuffle_mask[dstfmt->Gshift / 8 + i * 4] = srcfmt->Gshift / 8 + i * 4;
        shuffle_mask[dstfmt->Bshift / 8 + i * 4] = srcfmt->Bshift / 8 + i * 4;
    }
    return vld1_u8(shuffle_mask);
}
// Reorder the pixels such that the source pixel stream is aligned to the destination format implied by the pattern
uint8x16_t reorder_pixels_argb8888_to_dstfmt(const uint8x16_t src_pixels, const uint8x8_t pattern) {
    // Split the four src pixels into two 64-bit NEON registers
    uint8x8x2_t src_data = convert_uint8x16_to_uint8x8x2(src_pixels);
    // Apply the vtable lookup on each register to produce dstfmt-ordered pixel data
    uint8x8_t low = vtbl1_u8(src_data.val[0], pattern);
    uint8x8_t high = vtbl1_u8(src_data.val[1], pattern);
    // Pack both results back into a 128-bit NEON register and return it
    return vcombine_u8(low, high);
}
// Blend four 32-bit pixels -- still missing the alpha-saturate step (see the sketch near the top)
uint8x16_t MixRGBA_NEON(uint8x16_t sC, uint8x16_t dC, uint8x16_t sA) {
    // Calculate (sC - dC) * sA as sC * sA - dC * sA in 16-bit lanes; the wraparound cancels once dC * 255 is added
    uint16x8_t diff_lo = vmull_u8(vget_low_u8(sC), vget_low_u8(sA));
    uint16x8_t diff_hi = vmull_u8(vget_high_u8(sC), vget_high_u8(sA));
    diff_lo = vsubq_u16(diff_lo, vmull_u8(vget_low_u8(dC), vget_low_u8(sA)));
    diff_hi = vsubq_u16(diff_hi, vmull_u8(vget_high_u8(dC), vget_high_u8(sA)));
    // Calculate (dC << 8) - dC, i.e. dC * 255
    uint16x8_t dC_lo = vmovl_u8(vget_low_u8(dC));
    uint16x8_t dC_hi = vmovl_u8(vget_high_u8(dC));
    dC_lo = vsubq_u16(vshlq_n_u16(dC_lo, 8), dC_lo);
    dC_hi = vsubq_u16(vshlq_n_u16(dC_hi, 8), dC_hi);
    // Add the two results and the constant 0x1
    uint16x8_t x_lo = vaddq_u16(vaddq_u16(diff_lo, dC_lo), vdupq_n_u16(0x1));
    uint16x8_t x_hi = vaddq_u16(vaddq_u16(diff_hi, dC_hi), vdupq_n_u16(0x1));
    // Divide by 255 with the usual trick: x = (x + (x >> 8)) >> 8
    x_lo = vaddq_u16(x_lo, vshrq_n_u16(x_lo, 8));
    x_hi = vaddq_u16(x_hi, vshrq_n_u16(x_hi, 8));
    x_lo = vshrq_n_u16(x_lo, 8);
    x_hi = vshrq_n_u16(x_hi, 8);
    // Narrow each 16-bit lane back to 8 bits and combine
    return vcombine_u8(vmovn_u16(x_lo), vmovn_u16(x_hi));
}
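/*
 * Hedged reference sketch (not part of the original gist): the same blend for a single channel in
 * scalar form, handy for checking MixRGBA_NEON lane by lane. It evaluates dC + (sC - dC) * sA / 255
 * with the identical "+ 1, then (x + (x >> 8)) >> 8" folding used above.
 */
Uint8 mix_one_channel_reference(Uint8 sC, Uint8 dC, Uint8 sA) {
    // Plain int arithmetic; the sum is always non-negative and fits in 16 bits
    int x = (sC - dC) * sA + dC * 255 + 1;
    return (Uint8)((x + (x >> 8)) >> 8);
}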
// Snippet demonstrating how to plug this in to BlitNtoNPixelAlpha; still a very buggy impl at this time
#ifdef SDL_NEON_INTRINSICS
    if (srcbpp == 4 && dstbpp == 4 && width >= 4 && SDL_HasNEON()) {
        uint8x8_t shuffle_mask = generate_reorder_vtable(srcfmt, dstfmt);
        uint8x8_t alpha_mask = get_alpha_vtable(dstfmt);
        int chunks = width / 4;
        while (height--) {
            for (int i = 0; i < chunks; i += 1) {
                uint8x16_t colors = vld1q_u8(src + i * 16);
                colors = reorder_pixels_argb8888_to_dstfmt(colors, shuffle_mask);
                uint8x16_t dst_colors = vld1q_u8(dst + i * 16);
                uint8x16_t alpha_splat = splat_alpha_to_new_vector(colors, alpha_mask);
                // NOTE: the alpha-saturate step (see the sketch near the top) is still missing here
                uint8x16_t mixed_colors = MixRGBA_NEON(colors, dst_colors, alpha_splat);
                vst1q_u8(dst + i * 16, mixed_colors);
            }
            // Handle remaining pixels when width is not a multiple of 4
            if (width % 4 != 0) {
                int remaining_pixels = width % 4;
                int offset = width - remaining_pixels;
                for (int i = offset; i < width; i++) {
                    DISEMBLE_RGBA(src + i * 4, srcbpp, srcfmt, Pixel, sR, sG, sB, sA);
                    if (sA) {
                        DISEMBLE_RGBA(dst + i * 4, dstbpp, dstfmt, Pixel, dR, dG, dB, dA);
                        ALPHA_BLEND_RGBA(sR, sG, sB, sA, dR, dG, dB, dA);
                        ASSEMBLE_RGBA(dst + i * 4, dstbpp, dstfmt, dR, dG, dB, dA);
                    }
                }
            }
            // Advance past the row just processed, then apply the per-row skip
            src += 4 * width;
            dst += 4 * width;
            src += srcskip;
            dst += dstskip;
        }
        return;
    }
#endif