Skip to content

Instantly share code, notes, and snippets.

@0x1F9F1
Last active September 10, 2023 09:49
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save 0x1F9F1/5ef2f3f822716fd375ef58c733b0251a to your computer and use it in GitHub Desktop.
Save 0x1F9F1/5ef2f3f822716fd375ef58c733b0251a to your computer and use it in GitHub Desktop.
Thoughts on SDL_BLENDMODE_BLEND
#include <stdint.h>
uint8_t blend_classic(uint8_t sC, uint8_t dC, uint8_t sA)
{
    // Textbook form of SDL_BLENDMODE_BLEND for a single 8-bit channel:
    // weight the source by alpha, the destination by (255 - alpha),
    // then divide the sum by 255. Costs two multiplies and one divide.
    const int src_part = sC * sA;
    const int dst_part = dC * (255 - sA);

    return (uint8_t)((src_part + dst_part) / 255);
}
uint8_t blend_sdl(uint8_t sC, uint8_t dC, uint8_t sA)
{
    // SDL's single-multiply form: dC + ((sC - dC) * sA) / 255.
    // Algebraically the same as the classic equation, but when sC < dC the
    // dividend is negative and C division truncates toward zero, so the
    // result can be one higher than the floor-based variants.
    const int delta = sC - dC;
    const int scaled = (delta * (int)sA) / 255;

    return (uint8_t)(scaled + dC);
}
uint8_t blend_mul1(uint8_t sC, uint8_t dC, uint8_t sA)
{
    // Same sum as blend_classic, with the divide by 255 written out by hand
    // as a fixed-point multiply: (x * 0x8081) >> 23.
    //
    // Compilers do not always pick the smallest reciprocal constant when they
    // strength-reduce a division. Choosing it manually is beneficial here:
    // 0x8081 has only three bits set, so the product is equivalent to
    // `x + (x << 7) + (x << 15)` and the multiply can be dropped entirely
    // when shifts and adds are cheaper.
    //
    // The same trick maps directly onto 16-bit SIMD:
    // _mm_srli_epi16(_mm_mulhi_epu16(colors, _mm_set1_epi16(-0x7F7F)), 7)
    const uint32_t weighted = (uint32_t)((sC * sA) + (dC * (255 - sA)));

    return (uint8_t)((weighted * 0x8081u) >> 23);
}
uint8_t blend_mul2(uint8_t sC, uint8_t dC, uint8_t sA)
{
    // blend_mul1 rearranged as (sC - dC) * sA + dC * 255, where the multiply
    // by 255 becomes a shift-and-subtract. Arguably the best overall choice
    // for scalar code.
    // The first term may be negative, but the total never is, so the
    // conversion to unsigned before the reciprocal multiply is harmless.
    const int delta_term = (sC - dC) * sA;
    const int dst_times_255 = (dC << 8) - dC;
    const uint32_t total = (uint32_t)(delta_term + dst_times_255);

    return (uint8_t)((total * 0x8081u) >> 23);
}
uint8_t blend_paper(uint8_t sC, uint8_t dC, uint8_t sA)
{
    // Same numerator as blend_mul2, but the divide by 255 uses the
    // add-and-shift sequence from https://arxiv.org/pdf/2202.02864.pdf
    // in place of a reciprocal multiply.
    uint16_t acc = (uint16_t)(((sC - dC) * sA) + ((dC << 8) - dC));

    // Bias of 1 gives floor(x / 255); a bias of 0x80 would round instead.
    acc = (uint16_t)(acc + 0x1U);
    acc = (uint16_t)(acc + (acc >> 8));

    return (uint8_t)(acc >> 8);
}
#include <immintrin.h>
__m128i __attribute__((target("ssse3"))) blend_simd(
    __m128i src, __m128i dst,
    __m128i src_shuffle, __m128i alpha_shuffle, __m128i alpha_mask)
{
    // Vector form of blend_mul2, applied to all 16 bytes of src/dst at once.
    //   dstRGB = (srcRGB * srcA) + (dstRGB * (1-srcA))
    //   dstA   = srcA + (dstA * (1-srcA)) = (1 * srcA) + (dstA * (1-srcA))
    // The alpha channel follows the colour formula once the source alpha
    // byte has been forced to 255, which is what alpha_mask is expected to do.
    const __m128i zero = _mm_setzero_si128();

    // alpha_shuffle is expected to replicate each pixel's alpha byte across
    // every channel of that pixel.
    const __m128i alpha = _mm_shuffle_epi8(src, alpha_shuffle);

    // Reorder the source bytes into the destination layout (src_shuffle),
    // then OR in alpha_mask so the source alpha bytes read as 255.
    const __m128i color = _mm_or_si128(_mm_shuffle_epi8(src, src_shuffle), alpha_mask);

    // Zero-extend every byte to a 16-bit lane: low eight bytes, high eight bytes.
    const __m128i s_lo = _mm_unpacklo_epi8(color, zero);
    const __m128i s_hi = _mm_unpackhi_epi8(color, zero);
    const __m128i d_lo = _mm_unpacklo_epi8(dst, zero);
    const __m128i d_hi = _mm_unpackhi_epi8(dst, zero);
    const __m128i a_lo = _mm_unpacklo_epi8(alpha, zero);
    const __m128i a_hi = _mm_unpackhi_epi8(alpha, zero);

    // x = ((src - dst) * srcA) + ((dst << 8) - dst)
    // Intermediate 16-bit wrap-around cancels out in the final sum.
    const __m128i d255_lo = _mm_sub_epi16(_mm_slli_epi16(d_lo, 8), d_lo);
    const __m128i d255_hi = _mm_sub_epi16(_mm_slli_epi16(d_hi, 8), d_hi);
    __m128i x_lo = _mm_add_epi16(_mm_mullo_epi16(_mm_sub_epi16(s_lo, d_lo), a_lo), d255_lo);
    __m128i x_hi = _mm_add_epi16(_mm_mullo_epi16(_mm_sub_epi16(s_hi, d_hi), a_hi), d255_hi);

#if 1
    // Divide by 255 as in blend_paper: x += 1; x += x >> 8; x >>= 8
    const __m128i bias = _mm_set1_epi16(1);
    x_lo = _mm_add_epi16(x_lo, bias);
    x_hi = _mm_add_epi16(x_hi, bias);
    x_lo = _mm_srli_epi16(_mm_add_epi16(x_lo, _mm_srli_epi16(x_lo, 8)), 8);
    x_hi = _mm_srli_epi16(_mm_add_epi16(x_hi, _mm_srli_epi16(x_hi, 8)), 8);
#else
    // Divide by 255 as in blend_mul2: (x * 0x8081) >> 23
    const __m128i recip = _mm_set1_epi16(-0x7F7F);
    x_lo = _mm_srli_epi16(_mm_mulhi_epu16(x_lo, recip), 7);
    x_hi = _mm_srli_epi16(_mm_mulhi_epu16(x_hi, recip), 7);
#endif

    // Narrow the sixteen 16-bit results back to bytes.
    return _mm_packus_epi16(x_lo, x_hi);
}
// Shared step for blend_scalar32: runs the blend_paper arithmetic on two
// 16-bit lanes packed in one 32-bit word (values in bits 0-7 and 16-23).
// Returns the accumulator just before the final shift, so each lane's
// result sits in the upper byte of that lane.
static uint32_t blend_scalar32_lanes(uint32_t s, uint32_t d, uint32_t a)
{
    uint32_t acc = ((s - d) * a) + (d << 8) - d;
    acc += 0x00010001; // Use 0x00800080 to round instead of floor
    acc += (acc >> 8) & 0x00FF00FF;
    return acc;
}

// blend_paper applied to a whole 32-bit pixel, two channels per multiply.
// The source alpha is taken from the top byte of src; bytes 0 and 2 are
// processed as one pair, bytes 1 and 3 as the other.
uint32_t blend_scalar32(uint32_t src, uint32_t dst)
{
    const uint32_t lane_mask = 0x00FF00FF;
    const uint32_t alpha = src >> 24;

    // Treat the source alpha byte as 255 so the alpha channel follows the
    // same equation as the colour channels.
    const uint32_t opaque = src | 0xFF000000;

    const uint32_t even = blend_scalar32_lanes(opaque & lane_mask, dst & lane_mask, alpha);
    const uint32_t odd = blend_scalar32_lanes((opaque >> 8) & lane_mask, (dst >> 8) & lane_mask, alpha);

    // Even bytes need shifting down into place; odd bytes are already there.
    return ((even >> 8) & lane_mask) | (odd & 0xFF00FF00);
}
// blend_paper applied to a whole 32-bit pixel with all four channels handled
// in a single 64-bit multiply. The source alpha is the top byte of src.
uint32_t blend_scalar64(uint32_t src, uint32_t dst)
{
    const uint64_t lane_mask = UINT64_C(0x00FF00FF00FF00FF);
    const uint64_t alpha = src >> 24;

    // Treat the source alpha byte as 255 so the alpha channel follows the
    // same equation as the colour channels.
    src |= 0xFF000000;

    // Spread the four bytes into four 16-bit lanes: bytes 0 and 2 stay in
    // the low half, bytes 1 and 3 land in the high half via the << 24 copy.
    const uint64_t s_wide = src;
    const uint64_t d_wide = dst;
    const uint64_t s_lanes = (s_wide | (s_wide << 24)) & lane_mask;
    const uint64_t d_lanes = (d_wide | (d_wide << 24)) & lane_mask;

    uint64_t acc = ((s_lanes - d_lanes) * alpha) + (d_lanes << 8) - d_lanes;
    acc += UINT64_C(0x0001000100010001); // Use 0x0080008000800080 to round instead of floor
    acc += (acc >> 8) & lane_mask;

    // Keep each lane's upper byte, then fold the high half back between
    // the low-half bytes to restore the original byte order.
    acc &= UINT64_C(0xFF00FF00FF00FF00);
    acc = (acc >> 8) | (acc >> 32);

    return (uint32_t)acc;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment