Last active
September 10, 2023 09:49
-
-
Save 0x1F9F1/5ef2f3f822716fd375ef58c733b0251a to your computer and use it in GitHub Desktop.
Thoughts on SDL_BLENDMODE_BLEND
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <stdint.h> | |
/*
 * Reference alpha blend of one 8-bit channel.
 *
 * sC: source channel value, dC: destination channel value, sA: source alpha.
 * Returns (sC * sA + dC * (255 - sA)) / 255, truncated. The operands are
 * promoted to int, so the largest intermediate is 255 * 255 = 65025.
 */
uint8_t blend_classic(uint8_t sC, uint8_t dC, uint8_t sA)
{
    // The blend equation as specified in SDL_BLENDMODE_BLEND, which uses 2 multiplies and a divide
    return ((sC * sA) + (dC * (255 - sA))) / 255;
}
/*
 * Alpha blend of one 8-bit channel using the single-multiply form
 * ((sC - dC) * sA) / 255 + dC.
 *
 * sC: source channel value, dC: destination channel value, sA: source alpha.
 * When sC < dC the dividend is negative and C integer division truncates
 * toward zero rather than flooring, so the result can be one higher than
 * the floor of the exact blend (e.g. sC=100, dC=200, sA=128 gives 150,
 * whereas the exact value is 149.8).
 */
uint8_t blend_sdl(uint8_t sC, uint8_t dC, uint8_t sA)
{
    // The SDL blend equation, which avoids one multiply.
    // It is correct mathematically, but the dividend can be negative,
    // which causes minor rounding issues
    return (((sC - dC) * sA) / 255) + dC;
}
/*
 * Alpha blend of one 8-bit channel: the same sum as
 * (sC * sA + dC * (255 - sA)) / 255, with the division by 255 done as a
 * multiply by 0x8081 followed by a right shift of 23.
 *
 * sC: source channel value, dC: destination channel value, sA: source alpha.
 * The sum is at most 65025, so the product with 0x8081 fits in 32 bits.
 */
uint8_t blend_mul1(uint8_t sC, uint8_t dC, uint8_t sA)
{
    // blend_classic, but manually doing the fixed-point division.
    // Compilers don't seem to be good at picking the smallest factor when optimising division.
    // In this case, it's generally beneficial to do so, because:
    // 0x8081 has only three bits set, and so is equivalent to `x + (x << 7) + (x << 15)`.
    // The compiler can avoid doing the multiply entirely if it would be faster.
    //
    // It works great with 16-bit SIMD:
    // _mm_srli_epi16(_mm_mulhi_epu16(colors, _mm_set1_epi16(-0x7F7F)), 7)
    return ((uint32_t)((sC * sA) + (dC * (255 - sA))) * 0x8081u) >> 23;
}
/*
 * Alpha blend of one 8-bit channel, computed as
 * ((sC - dC) * sA + dC * 255) / 255 with dC * 255 written as (dC << 8) - dC
 * and the division done as a multiply by 0x8081 and a shift of 23.
 *
 * sC: source channel value, dC: destination channel value, sA: source alpha.
 * (sC - dC) * sA may be negative, but the full sum equals
 * sC * sA + dC * (255 - sA), which lies in [0, 65025], so the value that is
 * cast to uint32_t is never negative.
 */
uint8_t blend_mul2(uint8_t sC, uint8_t dC, uint8_t sA)
{
    // Same as blend_mul1, but rearranged to allow converting a multiply by 255
    // into a shift-and-sub instead. IMO the best overall choice for scalar code.
    // This involves negative numbers before the division, but that doesn't matter.
    return ((uint32_t)(((sC - dC) * sA) + ((dC << 8) - dC)) * 0x8081u) >> 23;
}
/*
 * Alpha blend of one 8-bit channel using only adds and shifts for the
 * division by 255.
 *
 * sC: source channel value, dC: destination channel value, sA: source alpha.
 * The numerator is at most 65025; after the +1 and the += x >> 8 steps it is
 * at most 65280, so the 16-bit intermediate never wraps.
 */
uint8_t blend_paper(uint8_t sC, uint8_t dC, uint8_t sA)
{
    // Same as blend_mul2, but the division is replaced with the one from
    // https://arxiv.org/pdf/2202.02864.pdf
    uint16_t x = ((sC - dC) * sA) + ((dC << 8) - dC);
    x += 0x1U; // Use 0x80 to round instead of floor
    x += x >> 8;
    return x >> 8;
}
#include <immintrin.h> | |
/*
 * SIMD blend of 16 packed bytes of src over dst.
 *
 * src, dst:      packed 8-bit channels (presumably four 4-byte pixels each;
 *                confirm against the caller).
 * src_shuffle:   byte shuffle applied to src to put it in dst channel order.
 * alpha_shuffle: byte shuffle that copies each pixel's source alpha into
 *                every channel position of that pixel.
 * alpha_mask:    OR-ed into the shuffled src; expected to hold 0xFF in each
 *                alpha byte so the alpha channel blends as (1 * srcA).
 *
 * All intermediate maths is done in wrapping 16-bit lanes. The per-lane
 * value (src - dst) * srcA + dst * 255 equals src * srcA + dst * (255 - srcA),
 * which is in [0, 65025], so the wrapped 16-bit result is exact.
 */
__m128i __attribute__((target("ssse3"))) blend_simd(
    __m128i src, __m128i dst,
    __m128i src_shuffle, __m128i alpha_shuffle, __m128i alpha_mask)
{
    // SIMD implementation of blend_mul2.
    // dstRGB = (srcRGB * srcA) + (dstRGB * (1-srcA))
    // dstA = srcA + (dstA * (1-srcA)) = (1 * srcA) + (dstA * (1-srcA))
    const __m128i zero = _mm_setzero_si128();

    // Splat the alpha into all channels for each pixel
    __m128i srca = _mm_shuffle_epi8(src, alpha_shuffle);
    // Convert src to dst format
    src = _mm_shuffle_epi8(src, src_shuffle);
    // Set the alpha channels of src to 255
    src = _mm_or_si128(src, alpha_mask);

    // Widen every 8-bit channel to a 16-bit lane (low and high halves)
    __m128i src_lo = _mm_unpacklo_epi8(src, zero);
    __m128i src_hi = _mm_unpackhi_epi8(src, zero);
    __m128i dst_lo = _mm_unpacklo_epi8(dst, zero);
    __m128i dst_hi = _mm_unpackhi_epi8(dst, zero);
    __m128i srca_lo = _mm_unpacklo_epi8(srca, zero);
    __m128i srca_hi = _mm_unpackhi_epi8(srca, zero);

    // dst = ((src - dst) * srcA) + ((dst << 8) - dst)
    dst_lo = _mm_add_epi16(_mm_mullo_epi16(_mm_sub_epi16(src_lo, dst_lo), srca_lo), _mm_sub_epi16(_mm_slli_epi16(dst_lo, 8), dst_lo));
    dst_hi = _mm_add_epi16(_mm_mullo_epi16(_mm_sub_epi16(src_hi, dst_hi), srca_hi), _mm_sub_epi16(_mm_slli_epi16(dst_hi, 8), dst_hi));
#if 1
    // blend_paper
    dst_lo = _mm_add_epi16(dst_lo, _mm_set1_epi16(1));
    dst_hi = _mm_add_epi16(dst_hi, _mm_set1_epi16(1));
    dst_lo = _mm_srli_epi16(_mm_add_epi16(dst_lo, _mm_srli_epi16(dst_lo, 8)), 8);
    dst_hi = _mm_srli_epi16(_mm_add_epi16(dst_hi, _mm_srli_epi16(dst_hi, 8)), 8);
#else
    // blend_mul2
    dst_lo = _mm_srli_epi16(_mm_mulhi_epu16(dst_lo, _mm_set1_epi16(-0x7F7F)), 7);
    dst_hi = _mm_srli_epi16(_mm_mulhi_epu16(dst_hi, _mm_set1_epi16(-0x7F7F)), 7);
#endif
    // Every lane is now in [0, 255], so the saturating pack is lossless
    dst = _mm_packus_epi16(dst_lo, dst_hi);
    return dst;
}
// blend_paper, but doing two values at a time
//
// src and dst are 32-bit pixels of four 8-bit channels with alpha in the top
// byte (bits 24-31). Source alpha is taken from src, then src's alpha byte is
// forced to 255 so the alpha channel blends as (1 * srcA) + (dstA * (1-srcA)).
//
// Bytes 0 and 2 are processed together as two 16-bit lanes of one uint32_t,
// and bytes 1 and 3 likewise. The subtraction may borrow across lanes, but
// the complete expression equals src * srcA + dst * (255 - srcA) per lane,
// which is in [0, 65025], so modulo 2^32 every lane ends up exact.
uint32_t blend_scalar32(uint32_t src, uint32_t dst)
{
    uint32_t srcA = src >> 24;
    src |= 0xFF000000;

    uint32_t srcRB = src & 0x00FF00FF;
    uint32_t dstRB = dst & 0x00FF00FF;
    uint32_t srcGA = (src >> 8) & 0x00FF00FF;
    uint32_t dstGA = (dst >> 8) & 0x00FF00FF;

    uint32_t resRB = ((srcRB - dstRB) * srcA) + (dstRB << 8) - dstRB;
    resRB += 0x00010001; // Use 0x00800080 to round instead of floor
    resRB += (resRB >> 8) & 0x00FF00FF;
    // Move each lane's quotient (its high byte) down into byte 0 / byte 2
    resRB = (resRB >> 8) & 0x00FF00FF;

    uint32_t resGA = ((srcGA - dstGA) * srcA) + (dstGA << 8) - dstGA;
    resGA += 0x00010001; // Use 0x00800080 to round instead of floor
    resGA += (resGA >> 8) & 0x00FF00FF;
    // The quotient already sits in byte 1 / byte 3, so just mask it in place
    resGA &= 0xFF00FF00;

    return resRB | resGA;
}
// blend_paper, but doing four values at a time
//
// src and dst are 32-bit pixels of four 8-bit channels with alpha in the top
// byte (bits 24-31). Source alpha is taken from src, then src's alpha byte is
// forced to 255 so the alpha channel blends as (1 * srcA) + (dstA * (1-srcA)).
//
// The four bytes are spread into four 16-bit lanes of a uint64_t:
// (x | x << 24) & 0x00FF00FF00FF00FF places bytes 0, 2, 1, 3 in lanes 0..3.
// Borrows between lanes cancel out because the complete per-lane expression
// equals src * srcA + dst * (255 - srcA), which is in [0, 65025].
uint32_t blend_scalar64(uint32_t src, uint32_t dst)
{
    uint64_t srcA = src >> 24;
    src |= 0xFF000000;

    uint64_t srcRBGA = src;
    srcRBGA = (srcRBGA | (srcRBGA << 24)) & 0x00FF00FF00FF00FF;
    uint64_t dstRBGA = dst;
    dstRBGA = (dstRBGA | (dstRBGA << 24)) & 0x00FF00FF00FF00FF;

    uint64_t resRBGA = ((srcRBGA - dstRBGA) * srcA) + (dstRBGA << 8) - dstRBGA;
    resRBGA += 0x0001000100010001; // Use 0x0080008000800080 to round instead of floor
    resRBGA += (resRBGA >> 8) & 0x00FF00FF00FF00FF;
    // Keep each lane's quotient (its high byte)
    resRBGA &= 0xFF00FF00FF00FF00;
    // Undo the interleave: >> 8 restores bytes 0 and 2, >> 32 restores bytes 1 and 3
    resRBGA = (resRBGA >> 8) | (resRBGA >> 32);
    return (uint32_t)resRBGA;
}
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment