0x1F9F1/blend.cpp

## blend.cpp
#include <stdint.h>

uint8_t blend_classic(uint8_t sC, uint8_t dC, uint8_t sA)
{
    /*
     * Reference form of the SDL_BLENDMODE_BLEND equation:
     *   result = (sC * sA + dC * (255 - sA)) / 255
     * sC/dC are the source/destination channel values and sA is the source
     * alpha. Costs two multiplies and one integer division; the quotient is
     * truncated (floor, since every term is non-negative).
     */
    const int inv_alpha = 255 - sA;
    const int src_term = sC * sA;
    const int dst_term = dC * inv_alpha;

    return (uint8_t)((src_term + dst_term) / 255);
}

uint8_t blend_sdl(uint8_t sC, uint8_t dC, uint8_t sA)
{
    /*
     * SDL's single-multiply form: dC + ((sC - dC) * sA) / 255.
     * Algebraically the same blend as blend_classic, but when sC < dC the
     * dividend is negative and C's integer division truncates toward zero,
     * so the result can differ slightly from the floor-based variants.
     */
    const int delta = (int)(sC - dC);
    const int scaled = (delta * (int)sA) / 255;

    return (uint8_t)(scaled + dC);
}

uint8_t blend_mul1(uint8_t sC, uint8_t dC, uint8_t sA)
{
    /*
     * Same numerator as blend_classic, but the division by 255 is done by
     * hand as a fixed-point multiply: (x * 0x8081) >> 23.
     *
     * Why pick the factor manually: compilers do not always choose the
     * smallest reciprocal constant. 0x8081 has only three bits set, so the
     * multiply is equivalent to `x + (x << 7) + (x << 15)` and can be turned
     * into shifts and adds when that is cheaper.
     *
     * The same trick maps onto 16-bit SIMD lanes:
     *   _mm_srli_epi16(_mm_mulhi_epu16(colors, _mm_set1_epi16(-0x7F7F)), 7)
     */
    const uint32_t numerator = (uint32_t)((sC * sA) + (dC * (255 - sA)));
    const uint32_t scaled = numerator * 0x8081u;

    return (uint8_t)(scaled >> 23);
}

uint8_t blend_mul2(uint8_t sC, uint8_t dC, uint8_t sA)
{
    /*
     * blend_mul1 with the numerator rearranged as
     *   (sC - dC) * sA + dC * 255
     * so that the multiply by 255 becomes a shift and a subtract
     * ((dC << 8) - dC). The first term may be negative on its own, but the
     * sum is what gets divided, so that does not affect the result.
     * The division by 255 is again the (x * 0x8081) >> 23 reciprocal.
     */
    const int diff_term = (sC - dC) * sA;
    const int dst_times_255 = (dC << 8) - dC;
    const uint32_t numerator = (uint32_t)(diff_term + dst_times_255);

    return (uint8_t)((numerator * 0x8081u) >> 23);
}

uint8_t blend_paper(uint8_t sC, uint8_t dC, uint8_t sA)
{
    /*
     * Same numerator as blend_mul2, but the division by 255 uses the
     * add-and-shift sequence from https://arxiv.org/pdf/2202.02864.pdf,
     * which needs no multiply and stays within 16 bits.
     */
    const uint16_t numerator = (uint16_t)(((sC - dC) * sA) + ((dC << 8) - dC));

    // Bias of 1 gives a floor result; use 0x80 to round instead of floor.
    const uint16_t biased = (uint16_t)(numerator + 0x1U);

    // Fold the high byte back in, then keep the high byte as the quotient.
    const uint16_t folded = (uint16_t)(biased + (biased >> 8));

    return (uint8_t)(folded >> 8);
}

#include <immintrin.h>

__m128i __attribute__((target("ssse3"))) blend_simd(
  __m128i src, __m128i dst,
  __m128i src_shuffle, __m128i alpha_shuffle, __m128i alpha_mask)
{
    // Blends 16 byte-sized channels at once using the blend_mul2 numerator:
    //   dstRGB                            = (srcRGB * srcA) + (dstRGB * (1-srcA))
    //   dstA   = srcA + (dstA * (1-srcA)) = (1      * srcA) + (dstA   * (1-srcA))
    //
    // src_shuffle   - byte shuffle that converts src to the dst format
    // alpha_shuffle - byte shuffle that splats each pixel's alpha into all of its channels
    // alpha_mask    - OR-mask that sets the alpha channels of src to 255
    // Returns the blended bytes, packed back to 8 bits per channel.
    const __m128i zero = _mm_setzero_si128();

    // Alpha must be taken from src before src is reordered.
    const __m128i alpha8 = _mm_shuffle_epi8(src, alpha_shuffle);

    // Reorder src into the dst layout and force its alpha channels to 255,
    // which makes the alpha lane follow the same equation as the color lanes.
    const __m128i color8 = _mm_or_si128(_mm_shuffle_epi8(src, src_shuffle), alpha_mask);

    // Widen every byte to a 16-bit lane (low and high halves separately).
    const __m128i a_lo = _mm_unpacklo_epi8(alpha8, zero);
    const __m128i a_hi = _mm_unpackhi_epi8(alpha8, zero);
    const __m128i s_lo = _mm_unpacklo_epi8(color8, zero);
    const __m128i s_hi = _mm_unpackhi_epi8(color8, zero);
    const __m128i d_lo = _mm_unpacklo_epi8(dst, zero);
    const __m128i d_hi = _mm_unpackhi_epi8(dst, zero);

    // numerator = ((src - dst) * srcA) + ((dst << 8) - dst)
    const __m128i d255_lo = _mm_sub_epi16(_mm_slli_epi16(d_lo, 8), d_lo);
    const __m128i d255_hi = _mm_sub_epi16(_mm_slli_epi16(d_hi, 8), d_hi);
    const __m128i diff_lo = _mm_mullo_epi16(_mm_sub_epi16(s_lo, d_lo), a_lo);
    const __m128i diff_hi = _mm_mullo_epi16(_mm_sub_epi16(s_hi, d_hi), a_hi);

    __m128i out_lo = _mm_add_epi16(diff_lo, d255_lo);
    __m128i out_hi = _mm_add_epi16(diff_hi, d255_hi);

#if 1
    // Division by 255 as in blend_paper: add 1, fold in the high byte, shift.
    const __m128i one = _mm_set1_epi16(1);

    out_lo = _mm_add_epi16(out_lo, one);
    out_hi = _mm_add_epi16(out_hi, one);

    out_lo = _mm_srli_epi16(_mm_add_epi16(out_lo, _mm_srli_epi16(out_lo, 8)), 8);
    out_hi = _mm_srli_epi16(_mm_add_epi16(out_hi, _mm_srli_epi16(out_hi, 8)), 8);
#else
    // Division by 255 as in blend_mul2: high half of x * 0x8081, then >> 7.
    const __m128i recip = _mm_set1_epi16(-0x7F7F);

    out_lo = _mm_srli_epi16(_mm_mulhi_epu16(out_lo, recip), 7);
    out_hi = _mm_srli_epi16(_mm_mulhi_epu16(out_hi, recip), 7);
#endif

    // Narrow the 16-bit lanes back down to bytes.
    return _mm_packus_epi16(out_lo, out_hi);
}

// Runs the blend_paper arithmetic on two channels at once. Each channel sits
// in the low byte of a 16-bit half of `srcPair`/`dstPair`. The quotients are
// left in the HIGH byte of each half; the caller shifts or masks as needed.
static uint32_t blend_scalar32_pair(uint32_t srcPair, uint32_t dstPair, uint32_t alpha)
{
    uint32_t acc = ((srcPair - dstPair) * alpha) + (dstPair << 8) - dstPair;
    acc += 0x00010001; // Use 0x00800080 to round instead of floor
    acc += (acc >> 8) & 0x00FF00FF;
    return acc;
}

// blend_paper, but doing two values at a time.
// `src` and `dst` are packed 4x8-bit pixels; the source alpha is read from
// bits 31..24 of `src`. The top byte of src is then forced to 255 so that
// the top channel is blended with the same equation as the other three.
uint32_t blend_scalar32(uint32_t src, uint32_t dst)
{
    const uint32_t alpha = src >> 24;
    const uint32_t opaque = src | 0xFF000000;

    // Even bytes (0 and 2): results are shifted back down into place.
    const uint32_t even = blend_scalar32_pair(opaque & 0x00FF00FF, dst & 0x00FF00FF, alpha);

    // Odd bytes (1 and 3): results already sit in the right byte positions.
    const uint32_t odd = blend_scalar32_pair((opaque >> 8) & 0x00FF00FF, (dst >> 8) & 0x00FF00FF, alpha);

    return ((even >> 8) & 0x00FF00FF) | (odd & 0xFF00FF00);
}

// blend_paper, but doing four values at a time.
// The four bytes of each pixel are spread into the four 16-bit lanes of a
// 64-bit word, blended together, then gathered back into 32 bits. The source
// alpha is read from bits 31..24 of `src`, and that byte is then forced to
// 255 so the top channel uses the same equation as the other three.
uint32_t blend_scalar64(uint32_t src, uint32_t dst)
{
    const uint64_t low_bytes = UINT64_C(0x00FF00FF00FF00FF);
    const uint64_t high_bytes = UINT64_C(0xFF00FF00FF00FF00);

    const uint64_t alpha = src >> 24;
    const uint64_t s = src | 0xFF000000;
    const uint64_t d = dst;

    // Bytes 0 and 2 stay in place; the copy shifted by 24 drops bytes 1 and 3
    // into the upper two lanes. The mask discards the overlapping leftovers.
    const uint64_t srcLanes = (s | (s << 24)) & low_bytes;
    const uint64_t dstLanes = (d | (d << 24)) & low_bytes;

    uint64_t acc = ((srcLanes - dstLanes) * alpha) + (dstLanes << 8) - dstLanes;
    acc += UINT64_C(0x0001000100010001); // A bias of 0x0080 per lane would round instead of floor
    acc += (acc >> 8) & low_bytes;

    // Each lane's quotient is in its high byte; interleave them back to 32 bits.
    acc &= high_bytes;
    acc = (acc >> 8) | (acc >> 32);

    return (uint32_t)acc;
}
	#include <stdint.h>

	uint8_t blend_classic(uint8_t sC, uint8_t dC, uint8_t sA)
	{
		// SDL_BLENDMODE_BLEND as specified: source weighted by alpha plus
		// destination weighted by (255 - alpha), divided by 255.
		// Uses 2 multiplies and a divide.
		int sum = sC * sA;
		sum += dC * (255 - sA);
		return (uint8_t)(sum / 255);
	}

	uint8_t blend_sdl(uint8_t sC, uint8_t dC, uint8_t sA)
	{
		// The SDL blend equation, which gets by with a single multiply:
		//   dC + ((sC - dC) * sA) / 255
		// Mathematically equivalent to blend_classic, but the dividend goes
		// negative when sC < dC and the division then truncates toward zero,
		// which causes minor rounding differences.
		int value = (int)(sC - dC);
		value *= (int)sA;
		value /= 255;
		value += dC;
		return (uint8_t)value;
	}

	uint8_t blend_mul1(uint8_t sC, uint8_t dC, uint8_t sA)
	{
		// blend_classic with the division by 255 written out as a
		// fixed-point multiply and shift.
		// Compilers are not always good at picking the smallest factor when
		// optimising a division, and here the small one is beneficial:
		// 0x8081 has only three bits set, so the multiply is equivalent to
		// `x + (x << 7) + (x << 15)` and can be dropped entirely when shifts
		// and adds are faster.
		//
		// The same constant works well with 16-bit SIMD:
		// _mm_srli_epi16(_mm_mulhi_epu16(colors, _mm_set1_epi16(-0x7F7F)), 7)
		uint32_t acc = (uint32_t)((sC * sA) + (dC * (255 - sA)));
		acc *= 0x8081u;
		return (uint8_t)(acc >> 23);
	}

	uint8_t blend_mul2(uint8_t sC, uint8_t dC, uint8_t sA)
	{
		// Same as blend_mul1, with the numerator regrouped so the multiply
		// by 255 turns into a shift-and-subtract: (dC << 8) - dC.
		// The (sC - dC) * sA term can be negative by itself, but it is added
		// to the other term before the division, so that doesn't matter.
		uint32_t acc = (uint32_t)(((sC - dC) * sA) + ((dC << 8) - dC));
		acc *= 0x8081u;
		return (uint8_t)(acc >> 23);
	}

	uint8_t blend_paper(uint8_t sC, uint8_t dC, uint8_t sA)
	{
		// blend_mul2's numerator, divided by 255 with the multiply-free
		// sequence from https://arxiv.org/pdf/2202.02864.pdf
		uint16_t acc = (uint16_t)(((sC - dC) * sA) + ((dC << 8) - dC));

		// Use 0x80 to round instead of floor
		acc = (uint16_t)(acc + 0x1U);

		// Add the high byte back in; the quotient ends up in the high byte.
		acc = (uint16_t)(acc + (acc >> 8));
		return (uint8_t)(acc >> 8);
	}

	#include <immintrin.h>

	__m128i __attribute__((target("ssse3"))) blend_simd(
	__m128i src, __m128i dst,
	__m128i src_shuffle, __m128i alpha_shuffle, __m128i alpha_mask)
	{
		// SIMD blend over 16 byte channels, built on the blend_mul2 numerator.
		// dstRGB = (srcRGB * srcA) + (dstRGB * (1-srcA))
		// dstA = srcA + (dstA * (1-srcA)) = (1 * srcA) + (dstA * (1-srcA))
		//
		// src_shuffle converts src to the dst format, alpha_shuffle splats
		// each pixel's alpha across its channels, and alpha_mask sets the
		// alpha channels of src to 255.
		const __m128i zeros = _mm_setzero_si128();

		// Per-channel alpha, read before src is rearranged
		const __m128i alphas = _mm_shuffle_epi8(src, alpha_shuffle);

		// src in dst channel order, with its alpha channels set to 255
		const __m128i pixels = _mm_or_si128(_mm_shuffle_epi8(src, src_shuffle), alpha_mask);

		// Zero-extend bytes to 16-bit lanes
		const __m128i px_lo = _mm_unpacklo_epi8(pixels, zeros);
		const __m128i px_hi = _mm_unpackhi_epi8(pixels, zeros);
		const __m128i bg_lo = _mm_unpacklo_epi8(dst, zeros);
		const __m128i bg_hi = _mm_unpackhi_epi8(dst, zeros);
		const __m128i al_lo = _mm_unpacklo_epi8(alphas, zeros);
		const __m128i al_hi = _mm_unpackhi_epi8(alphas, zeros);

		// acc = ((src - dst) * srcA) + ((dst << 8) - dst)
		__m128i acc_lo = _mm_mullo_epi16(_mm_sub_epi16(px_lo, bg_lo), al_lo);
		__m128i acc_hi = _mm_mullo_epi16(_mm_sub_epi16(px_hi, bg_hi), al_hi);
		acc_lo = _mm_add_epi16(acc_lo, _mm_sub_epi16(_mm_slli_epi16(bg_lo, 8), bg_lo));
		acc_hi = _mm_add_epi16(acc_hi, _mm_sub_epi16(_mm_slli_epi16(bg_hi, 8), bg_hi));

	#if 1
		// blend_paper style division by 255
		acc_lo = _mm_add_epi16(acc_lo, _mm_set1_epi16(1));
		acc_hi = _mm_add_epi16(acc_hi, _mm_set1_epi16(1));

		acc_lo = _mm_add_epi16(acc_lo, _mm_srli_epi16(acc_lo, 8));
		acc_hi = _mm_add_epi16(acc_hi, _mm_srli_epi16(acc_hi, 8));

		acc_lo = _mm_srli_epi16(acc_lo, 8);
		acc_hi = _mm_srli_epi16(acc_hi, 8);
	#else
		// blend_mul2 style division by 255
		acc_lo = _mm_mulhi_epu16(acc_lo, _mm_set1_epi16(-0x7F7F));
		acc_hi = _mm_mulhi_epu16(acc_hi, _mm_set1_epi16(-0x7F7F));

		acc_lo = _mm_srli_epi16(acc_lo, 7);
		acc_hi = _mm_srli_epi16(acc_hi, 7);
	#endif

		// Pack the 16-bit lanes back into bytes
		return _mm_packus_epi16(acc_lo, acc_hi);
	}

	// blend_paper, but doing two values at a time.
	// `src` and `dst` are packed pixels of four 8-bit channels. The source
	// alpha is read from bits 31..24 of `src`; that byte is then forced to 255
	// so the top channel is blended with the same equation as the other three.
	// Returns the blended pixel in the same byte layout as the inputs.
	uint32_t blend_scalar32(uint32_t src, uint32_t dst)
	{
		uint32_t srcA = src >> 24;
		src |= 0xFF000000;

		// Bytes 0 and 2, each in the low byte of a 16-bit half
		uint32_t srcRB = src & 0x00FF00FF;
		uint32_t dstRB = dst & 0x00FF00FF;

		// Bytes 1 and 3, shifted down into the same positions
		uint32_t srcGA = (src >> 8) & 0x00FF00FF;
		uint32_t dstGA = (dst >> 8) & 0x00FF00FF;

		// Each 16-bit half holds (s - d) * a + d * 255, which fits in 16 bits,
		// so the two halves never carry into each other.
		uint32_t resRB = ((srcRB - dstRB) * srcA) + (dstRB << 8) - dstRB;
		resRB += 0x00010001; // Use 0x00800080 to round instead of floor
		resRB += (resRB >> 8) & 0x00FF00FF;
		resRB = (resRB >> 8) & 0x00FF00FF; // quotients move back to bytes 0 and 2

		uint32_t resGA = ((srcGA - dstGA) * srcA) + (dstGA << 8) - dstGA;
		resGA += 0x00010001; // Use 0x00800080 to round instead of floor
		resGA += (resGA >> 8) & 0x00FF00FF;
		resGA &= 0xFF00FF00; // quotients are already in bytes 1 and 3

		return resRB | resGA;
	}

	// blend_paper, but doing four values at a time.
	// The four bytes of `src` and `dst` are spread across the four 16-bit
	// lanes of a 64-bit word, blended together, then gathered back into a
	// 32-bit pixel. The source alpha is read from bits 31..24 of `src`; that
	// byte is then forced to 255 so the top channel is blended with the same
	// equation as the other three.
	uint32_t blend_scalar64(uint32_t src, uint32_t dst)
	{
		uint64_t srcA = src >> 24;
		src |= 0xFF000000;

		// Bytes 0 and 2 stay put; the copy shifted left by 24 places bytes 1
		// and 3 in the low bytes of the two upper lanes. The mask drops the
		// overlapping leftovers.
		uint64_t srcRBGA = src;
		srcRBGA = (srcRBGA | (srcRBGA << 24)) & 0x00FF00FF00FF00FF;

		uint64_t dstRBGA = dst;
		dstRBGA = (dstRBGA | (dstRBGA << 24)) & 0x00FF00FF00FF00FF;

		// Each lane holds (s - d) * a + d * 255, which fits in 16 bits.
		uint64_t resRBGA = ((srcRBGA - dstRBGA) * srcA) + (dstRBGA << 8) - dstRBGA;
		resRBGA += 0x0001000100010001; // A bias of 0x0080 per lane would round instead of floor
		resRBGA += (resRBGA >> 8) & 0x00FF00FF00FF00FF;

		// Quotients sit in the high byte of each lane; interleave them back
		// into the original byte order of a 32-bit pixel.
		resRBGA &= 0xFF00FF00FF00FF00;
		resRBGA = (resRBGA >> 8) | (resRBGA >> 32);

		return (uint32_t)resRBGA;
	}