Skip to content

Instantly share code, notes, and snippets.

@jdryg
Last active July 9, 2022 07:05
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save jdryg/67031564961dbf2b1465950e1e33fe7e to your computer and use it in GitHub Desktop.
Save jdryg/67031564961dbf2b1465950e1e33fe7e to your computer and use it in GitHub Desktop.
NV12 to RGBA SSE2
// Bit position of each 8-bit channel inside a packed 32-bit color value.
// Red occupies the lowest byte and alpha the highest.
#define COLOR32_RED_SHIFT   0
#define COLOR32_GREEN_SHIFT 8
#define COLOR32_BLUE_SHIFT  16
#define COLOR32_ALPHA_SHIFT 24

// Packs four channel values into one uint32_t by shifting each channel to its
// COLOR32_*_SHIFT position and OR-ing the results together.
// The arguments are not masked: callers are expected to pass values in [0, 255].
#define COLOR32(r, g, b, a) \
    (uint32_t)(((uint32_t)(r) << COLOR32_RED_SHIFT) | \
               ((uint32_t)(g) << COLOR32_GREEN_SHIFT) | \
               ((uint32_t)(b) << COLOR32_BLUE_SHIFT) | \
               ((uint32_t)(a) << COLOR32_ALPHA_SHIFT))
#if CAMERA_CONFIG_USE_SIMD
// SSE2-only 32-bit lane-wise multiply that keeps the low 32 bits of each product:
// dst[i] = (a[i] * b[i]) & 0xFFFFFFFF, for i = 0..3
// _mm_mul_epu32 only multiplies lanes 0 and 2 (producing two 64-bit products),
// so the odd lanes are shifted down and multiplied separately, then the low
// halves of all four products are interleaved back into lane order.
// NOTE(review): this shares its name with the SSE4.1 intrinsic declared in
// <smmintrin.h>; including that header next to this definition may conflict.
static inline __m128i _mm_mullo_epi32(const __m128i& a, const __m128i& b)
{
    // 64-bit products of lanes 0 and 2.
    const __m128i prodEven = _mm_mul_epu32(a, b);

    // Move lanes 1 and 3 into positions 0 and 2, then multiply them.
    const __m128i aOdd = _mm_srli_si128(a, 4);
    const __m128i bOdd = _mm_srli_si128(b, 4);
    const __m128i prodOdd = _mm_mul_epu32(aOdd, bOdd);

    // Gather the low dword of each 64-bit product into lanes 0 and 1.
    const __m128i lowEven = _mm_shuffle_epi32(prodEven, _MM_SHUFFLE(0, 0, 2, 0));
    const __m128i lowOdd = _mm_shuffle_epi32(prodOdd, _MM_SHUFFLE(0, 0, 2, 0));

    // Interleave: { even0, odd0, even1, odd1 } == products of lanes 0, 1, 2, 3.
    return _mm_unpacklo_epi32(lowEven, lowOdd);
}
// Integer flavour of _mm_shuffle_ps: reinterprets both integer vectors as
// packed floats, shuffles them, and reinterprets the result back as integers.
// The two low result dwords are selected from `a` and the two high result
// dwords from `b`, as encoded by imm8 (typically built with _MM_SHUFFLE).
#define _mm_shuffle_si128(a, b, imm8) \
    _mm_castps_si128( \
        _mm_shuffle_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b), imm8))
// Clamps each signed 32-bit lane of `a` to the [0, 255] range:
// dst[i] = min(max(a[i], 0), 255), for i = 0..3
// The clamp is done by narrowing with saturation (32 -> 16 bits signed, then
// 16 -> 8 bits unsigned) and widening the four resulting bytes back to dwords.
static inline __m128i _mm_saturate8_epi32(const __m128i& a)
{
    const __m128i zero = _mm_setzero_si128();

    // Signed saturation to int16; the four values sit in the low four words.
    const __m128i asWords = _mm_packs_epi32(a, a);

    // Unsigned saturation to uint8; the four values sit in the low four bytes.
    const __m128i asBytes = _mm_packus_epi16(asWords, asWords);

    // Zero-extend the low four bytes back to 32-bit lanes.
    const __m128i widened16 = _mm_unpacklo_epi8(asBytes, zero);
    return _mm_unpacklo_epi16(widened16, zero);
}
static void TransformImage_NV12(BYTE* pDst, LONG dstStride, const BYTE* pSrc, LONG srcStride, DWORD dwWidthInPixels, DWORD dwHeightInPixels)
{
const uint8_t* lpBitsY = pSrc;
const uint8_t* lpBitsCbCr = lpBitsY + (dwHeightInPixels * srcStride);
for (UINT y = 0; y < dwHeightInPixels; y += 2) {
const uint8_t* row0_Y = lpBitsY;
const uint8_t* row1_Y = lpBitsY + srcStride;
const uint8_t* row01_CbCr = lpBitsCbCr;
uint32_t* dstRow0_RGBA = (uint32_t*)pDst;
uint32_t* dstRow1_RGBA = (uint32_t*)(pDst + dstStride);
const uint32_t numIter = dwWidthInPixels >> 4; // 16 pixels per iteration
for (uint32_t i = 0; i < numIter; ++i) {
const __m128i xmm_zero = _mm_setzero_si128();
__m128i xmm_CbCr_u8_0 = _mm_loadu_si128((const __m128i*)row01_CbCr); // (uint8_t){ Cb0, Cr0, Cb1, Cr1, Cb2, Cr2, Cb3, Cr3, Cb4, Cr4, Cb5, Cr5, Cb6, Cr6, Cb7, Cr7 }
// Unpack bytes into words
__m128i xmm_CbCr_u16_0 = _mm_unpacklo_epi8(xmm_CbCr_u8_0, xmm_zero); // (uint16_t){ Cb0, Cr0, Cb1, Cr1, Cb2, Cr2, Cb3, Cr3 }
__m128i xmm_CbCr_u16_1 = _mm_unpackhi_epi8(xmm_CbCr_u8_0, xmm_zero); // (uint16_t){ Cb4, Cr4, Cb5, Cr5, Cb6, Cr6, Cb7, Cr7 }
// Unpack words into dwords
__m128i xmm_CbCr_u32_0 = _mm_unpacklo_epi16(xmm_CbCr_u16_0, xmm_zero); // (uint32_t){ Cb0, Cr0, Cb1, Cr1 }
__m128i xmm_CbCr_u32_1 = _mm_unpackhi_epi16(xmm_CbCr_u16_0, xmm_zero); // (uint32_t){ Cb2, Cr2, Cb3, Cr3 }
__m128i xmm_CbCr_u32_2 = _mm_unpacklo_epi16(xmm_CbCr_u16_1, xmm_zero); // (uint32_t){ Cb4, Cr4, Cb5, Cr5 }
__m128i xmm_CbCr_u32_3 = _mm_unpackhi_epi16(xmm_CbCr_u16_1, xmm_zero); // (uint32_t){ Cb6, Cr6, Cb7, Cr7 }
// Shuffle from { Cb, Cr, Cb, Cr } to { Cb, Cb, Cb, Cb } + { Cr, Cr, Cr, Cr }
__m128i xmm_Cb_u32_0 = _mm_shuffle_si128(xmm_CbCr_u32_0, xmm_CbCr_u32_1, _MM_SHUFFLE(2, 0, 2, 0)); // (uint32_t){ Cb0, Cb1, Cb2, Cb3 }
__m128i xmm_Cr_u32_0 = _mm_shuffle_si128(xmm_CbCr_u32_0, xmm_CbCr_u32_1, _MM_SHUFFLE(3, 1, 3, 1)); // (uint32_t){ Cr0, Cr1, Cr2, Cr3 }
__m128i xmm_Cb_u32_1 = _mm_shuffle_si128(xmm_CbCr_u32_2, xmm_CbCr_u32_3, _MM_SHUFFLE(2, 0, 2, 0)); // (uint32_t){ Cb4, Cb5, Cb6, Cb7 }
__m128i xmm_Cr_u32_1 = _mm_shuffle_si128(xmm_CbCr_u32_2, xmm_CbCr_u32_3, _MM_SHUFFLE(3, 1, 3, 1)); // (uint32_t){ Cr4, Cr5, Cr6, Cr7 }
// Calculate RGB shifts
// r_shift = -52224 + 0 * Cb + 409 * Cr
// g_shift = 39552 - 100 * Cb - 208 * Cr
// b_shift = -65920 + 516 * Cb + 0 * Cr
const __m128i xmm_rshift_off = _mm_set1_epi32(-52224);
const __m128i xmm_rshift_cr = _mm_set1_epi32(409);
__m128i xmm_rshift_0 = _mm_add_epi32(xmm_rshift_off, _mm_mullo_epi32(xmm_rshift_cr, xmm_Cr_u32_0)); // (uint32_t){ r_shift0, r_shift1, r_shift2, r_shift3 }
__m128i xmm_rshift_1 = _mm_add_epi32(xmm_rshift_off, _mm_mullo_epi32(xmm_rshift_cr, xmm_Cr_u32_1)); // (uint32_t){ r_shift4, r_shift5, r_shift6, r_shift7 }
const __m128i xmm_gshift_off = _mm_set1_epi32(39552);
const __m128i xmm_gshift_cb = _mm_set1_epi32(-100);
const __m128i xmm_gshift_cr = _mm_set1_epi32(-208);
__m128i xmm_gshift_0 = _mm_add_epi32(xmm_gshift_off, _mm_add_epi32(_mm_mullo_epi32(xmm_gshift_cb, xmm_Cb_u32_0), _mm_mullo_epi32(xmm_gshift_cr, xmm_Cr_u32_0))); // (uint32_t){ g_shift0, g_shift1, g_shift2, g_shift3 }
__m128i xmm_gshift_1 = _mm_add_epi32(xmm_gshift_off, _mm_add_epi32(_mm_mullo_epi32(xmm_gshift_cb, xmm_Cb_u32_1), _mm_mullo_epi32(xmm_gshift_cr, xmm_Cr_u32_1))); // (uint32_t){ g_shift4, g_shift5, g_shift6, g_shift7 }
const __m128i xmm_bshift_off = _mm_set1_epi32(-65920);
const __m128i xmm_bshift_cb = _mm_set1_epi32(516);
__m128i xmm_bshift_0 = _mm_add_epi32(xmm_bshift_off, _mm_mullo_epi32(xmm_bshift_cb, xmm_Cb_u32_0)); // (uint32_t){ b_shift0, b_shift1, b_shift2, b_shift3 }
__m128i xmm_bshift_1 = _mm_add_epi32(xmm_bshift_off, _mm_mullo_epi32(xmm_bshift_cb, xmm_Cb_u32_1)); // (uint32_t){ b_shift4, b_shift5, b_shift6, b_shift7 }
// Row #0
{
// Load Y values
__m128i xmm_Y_u8 = _mm_loadu_si128((const __m128i*)row0_Y); // (uint8_t){ Y0, Y1, Y2, Y3, Y4, Y5, Y6, Y7, Y8, Y9, Y10, Y11, Y12, Y13, Y14, Y15 }
// Unpack bytes into words
__m128i xmm_Y_u16_0 = _mm_unpacklo_epi8(xmm_Y_u8, xmm_zero); // (uint16_t){ Y0, Y1, Y2, Y3, Y4, Y5, Y6, Y7 }
__m128i xmm_Y_u16_1 = _mm_unpackhi_epi8(xmm_Y_u8, xmm_zero); // (uint16_t){ Y8, Y9, Y10, Y11, Y12, Y13, Y14, Y15 }
// Unpack words into dwords
__m128i xmm_Y_u32_0 = _mm_unpacklo_epi16(xmm_Y_u16_0, xmm_zero); // (uint32_t){ Y0, Y1, Y2, Y3 }
__m128i xmm_Y_u32_1 = _mm_unpackhi_epi16(xmm_Y_u16_0, xmm_zero); // (uint32_t){ Y4, Y5, Y6, Y7 }
__m128i xmm_Y_u32_2 = _mm_unpacklo_epi16(xmm_Y_u16_1, xmm_zero); // (uint32_t){ Y8, Y9, Y10, Y11 }
__m128i xmm_Y_u32_3 = _mm_unpackhi_epi16(xmm_Y_u16_1, xmm_zero); // (uint32_t){ Y12, Y13, Y14, Y15 }
// Calculate base color (only Y-dependent)
// base = -4768 + 298 * Y
const __m128i xmm_base_off = _mm_set1_epi32(-4768);
const __m128i xmm_base_Y = _mm_set1_epi32(298);
__m128i xmm_base_0 = _mm_add_epi32(xmm_base_off, _mm_mullo_epi32(xmm_base_Y, xmm_Y_u32_0)); // (uint32_t){ base0, base1, base2, base3 }
__m128i xmm_base_1 = _mm_add_epi32(xmm_base_off, _mm_mullo_epi32(xmm_base_Y, xmm_Y_u32_1)); // (uint32_t){ base4, base5, base6, base7 }
__m128i xmm_base_2 = _mm_add_epi32(xmm_base_off, _mm_mullo_epi32(xmm_base_Y, xmm_Y_u32_2)); // (uint32_t){ base8, base9, base10, base11 }
__m128i xmm_base_3 = _mm_add_epi32(xmm_base_off, _mm_mullo_epi32(xmm_base_Y, xmm_Y_u32_3)); // (uint32_t){ base12, base13, base14, base15 }
// Calculate R = (base + r_shift) >> 8
__m128i xmm_r_0 = _mm_srai_epi32(_mm_add_epi32(xmm_base_0, _mm_shuffle_epi32(xmm_rshift_0, _MM_SHUFFLE(1, 1, 0, 0))), 8); // (uint32_t){ r0, r1, r2, r3 }
__m128i xmm_r_1 = _mm_srai_epi32(_mm_add_epi32(xmm_base_1, _mm_shuffle_epi32(xmm_rshift_0, _MM_SHUFFLE(3, 3, 2, 2))), 8); // (uint32_t)( r4, r5, r6, r7 }
__m128i xmm_r_2 = _mm_srai_epi32(_mm_add_epi32(xmm_base_2, _mm_shuffle_epi32(xmm_rshift_1, _MM_SHUFFLE(1, 1, 0, 0))), 8); // (uint32_t){ r8, r9, r10, r11 }
__m128i xmm_r_3 = _mm_srai_epi32(_mm_add_epi32(xmm_base_3, _mm_shuffle_epi32(xmm_rshift_1, _MM_SHUFFLE(3, 3, 2, 2))), 8); // (uint32_t)( r12, r13, r14, r15 }
// Calculate G = (base + g_shift) >> 8
__m128i xmm_g_0 = _mm_srai_epi32(_mm_add_epi32(xmm_base_0, _mm_shuffle_epi32(xmm_gshift_0, _MM_SHUFFLE(1, 1, 0, 0))), 8); // (uint32_t){ g0, g1, g2, g3 }
__m128i xmm_g_1 = _mm_srai_epi32(_mm_add_epi32(xmm_base_1, _mm_shuffle_epi32(xmm_gshift_0, _MM_SHUFFLE(3, 3, 2, 2))), 8); // (uint32_t)( g4, g5, g6, g7 }
__m128i xmm_g_2 = _mm_srai_epi32(_mm_add_epi32(xmm_base_2, _mm_shuffle_epi32(xmm_gshift_1, _MM_SHUFFLE(1, 1, 0, 0))), 8); // (uint32_t){ g8, g9, g10, g11 }
__m128i xmm_g_3 = _mm_srai_epi32(_mm_add_epi32(xmm_base_3, _mm_shuffle_epi32(xmm_gshift_1, _MM_SHUFFLE(3, 3, 2, 2))), 8); // (uint32_t)( g12, g13, g14, g15 }
// Calculate B = (base + b_shift) >> 8
__m128i xmm_b_0 = _mm_srai_epi32(_mm_add_epi32(xmm_base_0, _mm_shuffle_epi32(xmm_bshift_0, _MM_SHUFFLE(1, 1, 0, 0))), 8); // (uint32_t){ b0, b1, b2, b3 }
__m128i xmm_b_1 = _mm_srai_epi32(_mm_add_epi32(xmm_base_1, _mm_shuffle_epi32(xmm_bshift_0, _MM_SHUFFLE(3, 3, 2, 2))), 8); // (uint32_t)( b4, b5, b6, b7 }
__m128i xmm_b_2 = _mm_srai_epi32(_mm_add_epi32(xmm_base_2, _mm_shuffle_epi32(xmm_bshift_1, _MM_SHUFFLE(1, 1, 0, 0))), 8); // (uint32_t){ b8, b9, b10, b11 }
__m128i xmm_b_3 = _mm_srai_epi32(_mm_add_epi32(xmm_base_3, _mm_shuffle_epi32(xmm_bshift_1, _MM_SHUFFLE(3, 3, 2, 2))), 8); // (uint32_t)( b12, b13, b14, b15 }
// Clamp RGB results to [0, 255]
xmm_r_0 = _mm_saturate8_epi32(xmm_r_0);
xmm_r_1 = _mm_saturate8_epi32(xmm_r_1);
xmm_r_2 = _mm_saturate8_epi32(xmm_r_2);
xmm_r_3 = _mm_saturate8_epi32(xmm_r_3);
xmm_g_0 = _mm_saturate8_epi32(xmm_g_0);
xmm_g_1 = _mm_saturate8_epi32(xmm_g_1);
xmm_g_2 = _mm_saturate8_epi32(xmm_g_2);
xmm_g_3 = _mm_saturate8_epi32(xmm_g_3);
xmm_b_0 = _mm_saturate8_epi32(xmm_b_0);
xmm_b_1 = _mm_saturate8_epi32(xmm_b_1);
xmm_b_2 = _mm_saturate8_epi32(xmm_b_2);
xmm_b_3 = _mm_saturate8_epi32(xmm_b_3);
// Merge R,G,B values into RGBA results
const __m128i xmm_a = _mm_slli_epi32(_mm_set1_epi32(255), COLOR32_ALPHA_SHIFT);
__m128i xmm_rgba_0 = _mm_or_si128(
_mm_or_si128(_mm_slli_epi32(xmm_r_0, COLOR32_RED_SHIFT), _mm_slli_epi32(xmm_g_0, COLOR32_GREEN_SHIFT)),
_mm_or_si128(_mm_slli_epi32(xmm_b_0, COLOR32_BLUE_SHIFT), xmm_a)
);
__m128i xmm_rgba_1 = _mm_or_si128(
_mm_or_si128(_mm_slli_epi32(xmm_r_1, COLOR32_RED_SHIFT), _mm_slli_epi32(xmm_g_1, COLOR32_GREEN_SHIFT)),
_mm_or_si128(_mm_slli_epi32(xmm_b_1, COLOR32_BLUE_SHIFT), xmm_a)
);
__m128i xmm_rgba_2 = _mm_or_si128(
_mm_or_si128(_mm_slli_epi32(xmm_r_2, COLOR32_RED_SHIFT), _mm_slli_epi32(xmm_g_2, COLOR32_GREEN_SHIFT)),
_mm_or_si128(_mm_slli_epi32(xmm_b_2, COLOR32_BLUE_SHIFT), xmm_a)
);
__m128i xmm_rgba_3 = _mm_or_si128(
_mm_or_si128(_mm_slli_epi32(xmm_r_3, COLOR32_RED_SHIFT), _mm_slli_epi32(xmm_g_3, COLOR32_GREEN_SHIFT)),
_mm_or_si128(_mm_slli_epi32(xmm_b_3, COLOR32_BLUE_SHIFT), xmm_a)
);
// Store results
_mm_storeu_si128((__m128i*)(dstRow0_RGBA + 0), xmm_rgba_0);
_mm_storeu_si128((__m128i*)(dstRow0_RGBA + 4), xmm_rgba_1);
_mm_storeu_si128((__m128i*)(dstRow0_RGBA + 8), xmm_rgba_2);
_mm_storeu_si128((__m128i*)(dstRow0_RGBA + 12), xmm_rgba_3);
dstRow0_RGBA += 16;
row0_Y += 16;
}
// Row #1
{
// Load Y values
__m128i xmm_Y_u8 = _mm_loadu_si128((const __m128i*)row1_Y); // (uint8_t){ Y0, Y1, Y2, Y3, Y4, Y5, Y6, Y7, Y8, Y9, Y10, Y11, Y12, Y13, Y14, Y15 }
// Unpack bytes into words
__m128i xmm_Y_u16_0 = _mm_unpacklo_epi8(xmm_Y_u8, xmm_zero); // (uint16_t){ Y0, Y1, Y2, Y3, Y4, Y5, Y6, Y7 }
__m128i xmm_Y_u16_1 = _mm_unpackhi_epi8(xmm_Y_u8, xmm_zero); // (uint16_t){ Y8, Y9, Y10, Y11, Y12, Y13, Y14, Y15 }
// Unpack words into dwords
__m128i xmm_Y_u32_0 = _mm_unpacklo_epi16(xmm_Y_u16_0, xmm_zero); // (uint32_t){ Y0, Y1, Y2, Y3 }
__m128i xmm_Y_u32_1 = _mm_unpackhi_epi16(xmm_Y_u16_0, xmm_zero); // (uint32_t){ Y4, Y5, Y6, Y7 }
__m128i xmm_Y_u32_2 = _mm_unpacklo_epi16(xmm_Y_u16_1, xmm_zero); // (uint32_t){ Y8, Y9, Y10, Y11 }
__m128i xmm_Y_u32_3 = _mm_unpackhi_epi16(xmm_Y_u16_1, xmm_zero); // (uint32_t){ Y12, Y13, Y14, Y15 }
// Calculate base color (only Y-dependent)
// base = -4768 + 298 * Y
const __m128i xmm_base_off = _mm_set1_epi32(-4768);
const __m128i xmm_base_Y = _mm_set1_epi32(298);
__m128i xmm_base_0 = _mm_add_epi32(xmm_base_off, _mm_mullo_epi32(xmm_base_Y, xmm_Y_u32_0)); // (uint32_t){ base0, base1, base2, base3 }
__m128i xmm_base_1 = _mm_add_epi32(xmm_base_off, _mm_mullo_epi32(xmm_base_Y, xmm_Y_u32_1)); // (uint32_t){ base4, base5, base6, base7 }
__m128i xmm_base_2 = _mm_add_epi32(xmm_base_off, _mm_mullo_epi32(xmm_base_Y, xmm_Y_u32_2)); // (uint32_t){ base8, base9, base10, base11 }
__m128i xmm_base_3 = _mm_add_epi32(xmm_base_off, _mm_mullo_epi32(xmm_base_Y, xmm_Y_u32_3)); // (uint32_t){ base12, base13, base14, base15 }
// Calculate R = (base + r_shift) >> 8
__m128i xmm_r_0 = _mm_srai_epi32(_mm_add_epi32(xmm_base_0, _mm_shuffle_epi32(xmm_rshift_0, _MM_SHUFFLE(1, 1, 0, 0))), 8); // (uint32_t){ r0, r1, r2, r3 }
__m128i xmm_r_1 = _mm_srai_epi32(_mm_add_epi32(xmm_base_1, _mm_shuffle_epi32(xmm_rshift_0, _MM_SHUFFLE(3, 3, 2, 2))), 8); // (uint32_t)( r4, r5, r6, r7 }
__m128i xmm_r_2 = _mm_srai_epi32(_mm_add_epi32(xmm_base_2, _mm_shuffle_epi32(xmm_rshift_1, _MM_SHUFFLE(1, 1, 0, 0))), 8); // (uint32_t){ r8, r9, r10, r11 }
__m128i xmm_r_3 = _mm_srai_epi32(_mm_add_epi32(xmm_base_3, _mm_shuffle_epi32(xmm_rshift_1, _MM_SHUFFLE(3, 3, 2, 2))), 8); // (uint32_t)( r12, r13, r14, r15 }
// Calculate G = (base + g_shift) >> 8
__m128i xmm_g_0 = _mm_srai_epi32(_mm_add_epi32(xmm_base_0, _mm_shuffle_epi32(xmm_gshift_0, _MM_SHUFFLE(1, 1, 0, 0))), 8); // (uint32_t){ g0, g1, g2, g3 }
__m128i xmm_g_1 = _mm_srai_epi32(_mm_add_epi32(xmm_base_1, _mm_shuffle_epi32(xmm_gshift_0, _MM_SHUFFLE(3, 3, 2, 2))), 8); // (uint32_t)( g4, g5, g6, g7 }
__m128i xmm_g_2 = _mm_srai_epi32(_mm_add_epi32(xmm_base_2, _mm_shuffle_epi32(xmm_gshift_1, _MM_SHUFFLE(1, 1, 0, 0))), 8); // (uint32_t){ g8, g9, g10, g11 }
__m128i xmm_g_3 = _mm_srai_epi32(_mm_add_epi32(xmm_base_3, _mm_shuffle_epi32(xmm_gshift_1, _MM_SHUFFLE(3, 3, 2, 2))), 8); // (uint32_t)( g12, g13, g14, g15 }
// Calculate B = (base + b_shift) >> 8
__m128i xmm_b_0 = _mm_srai_epi32(_mm_add_epi32(xmm_base_0, _mm_shuffle_epi32(xmm_bshift_0, _MM_SHUFFLE(1, 1, 0, 0))), 8); // (uint32_t){ b0, b1, b2, b3 }
__m128i xmm_b_1 = _mm_srai_epi32(_mm_add_epi32(xmm_base_1, _mm_shuffle_epi32(xmm_bshift_0, _MM_SHUFFLE(3, 3, 2, 2))), 8); // (uint32_t)( b4, b5, b6, b7 }
__m128i xmm_b_2 = _mm_srai_epi32(_mm_add_epi32(xmm_base_2, _mm_shuffle_epi32(xmm_bshift_1, _MM_SHUFFLE(1, 1, 0, 0))), 8); // (uint32_t){ b8, b9, b10, b11 }
__m128i xmm_b_3 = _mm_srai_epi32(_mm_add_epi32(xmm_base_3, _mm_shuffle_epi32(xmm_bshift_1, _MM_SHUFFLE(3, 3, 2, 2))), 8); // (uint32_t)( b12, b13, b14, b15 }
// Clamp RGB results to [0, 255]
xmm_r_0 = _mm_saturate8_epi32(xmm_r_0);
xmm_r_1 = _mm_saturate8_epi32(xmm_r_1);
xmm_r_2 = _mm_saturate8_epi32(xmm_r_2);
xmm_r_3 = _mm_saturate8_epi32(xmm_r_3);
xmm_g_0 = _mm_saturate8_epi32(xmm_g_0);
xmm_g_1 = _mm_saturate8_epi32(xmm_g_1);
xmm_g_2 = _mm_saturate8_epi32(xmm_g_2);
xmm_g_3 = _mm_saturate8_epi32(xmm_g_3);
xmm_b_0 = _mm_saturate8_epi32(xmm_b_0);
xmm_b_1 = _mm_saturate8_epi32(xmm_b_1);
xmm_b_2 = _mm_saturate8_epi32(xmm_b_2);
xmm_b_3 = _mm_saturate8_epi32(xmm_b_3);
// Merge R,G,B values into RGBA results
const __m128i xmm_a = _mm_slli_epi32(_mm_set1_epi32(255), COLOR32_ALPHA_SHIFT);
__m128i xmm_rgba_0 = _mm_or_si128(
_mm_or_si128(_mm_slli_epi32(xmm_r_0, COLOR32_RED_SHIFT), _mm_slli_epi32(xmm_g_0, COLOR32_GREEN_SHIFT)),
_mm_or_si128(_mm_slli_epi32(xmm_b_0, COLOR32_BLUE_SHIFT), xmm_a)
);
__m128i xmm_rgba_1 = _mm_or_si128(
_mm_or_si128(_mm_slli_epi32(xmm_r_1, COLOR32_RED_SHIFT), _mm_slli_epi32(xmm_g_1, COLOR32_GREEN_SHIFT)),
_mm_or_si128(_mm_slli_epi32(xmm_b_1, COLOR32_BLUE_SHIFT), xmm_a)
);
__m128i xmm_rgba_2 = _mm_or_si128(
_mm_or_si128(_mm_slli_epi32(xmm_r_2, COLOR32_RED_SHIFT), _mm_slli_epi32(xmm_g_2, COLOR32_GREEN_SHIFT)),
_mm_or_si128(_mm_slli_epi32(xmm_b_2, COLOR32_BLUE_SHIFT), xmm_a)
);
__m128i xmm_rgba_3 = _mm_or_si128(
_mm_or_si128(_mm_slli_epi32(xmm_r_3, COLOR32_RED_SHIFT), _mm_slli_epi32(xmm_g_3, COLOR32_GREEN_SHIFT)),
_mm_or_si128(_mm_slli_epi32(xmm_b_3, COLOR32_BLUE_SHIFT), xmm_a)
);
// Store results
_mm_storeu_si128((__m128i*)(dstRow1_RGBA + 0), xmm_rgba_0);
_mm_storeu_si128((__m128i*)(dstRow1_RGBA + 4), xmm_rgba_1);
_mm_storeu_si128((__m128i*)(dstRow1_RGBA + 8), xmm_rgba_2);
_mm_storeu_si128((__m128i*)(dstRow1_RGBA + 12), xmm_rgba_3);
dstRow1_RGBA += 16;
row1_Y += 16;
}
row01_CbCr += 16;
}
pDst += (2 * dstStride);
lpBitsY += (2 * srcStride);
lpBitsCbCr += srcStride;
}
}
#else // CAMERA_CONFIG_USE_SIMD
// Clamps an integer color component to the [0, 255] range and returns it as
// a byte: negative values become 0, values above 255 become 255.
static inline uint8_t Clip(int clr)
{
    if (clr < 0) {
        return 0;
    }
    if (clr > 255) {
        return 255;
    }
    return (uint8_t)clr;
}
// Converts one Y/Cr/Cb sample to a packed COLOR32 value using 8.8 fixed-point
// integer arithmetic:
//   C = y - 16, D = cb - 128, E = cr - 128
//   R = Clip((298 * C + 409 * E + 128) >> 8)
//   G = Clip((298 * C - 100 * D - 208 * E + 128) >> 8)
//   B = Clip((298 * C + 516 * D + 128) >> 8)
// The +128 term rounds to nearest before the shift by 8.
//   y, cr, cb - 8-bit sample values widened to int32_t
//   alpha     - value stored in the alpha channel of the result
// Returns the packed color built with COLOR32.
static inline uint32_t ConvertYCrCbToRGB(int32_t y, int32_t cr, int32_t cb, uint8_t alpha)
{
    const int32_t c = y - 16;
    const int32_t d = cb - 128;
    const int32_t e = cr - 128;

    const uint8_t r = Clip((298 * c + 409 * e + 128) >> 8);
    const uint8_t g = Clip((298 * c - 100 * d - 208 * e + 128) >> 8);
    const uint8_t b = Clip((298 * c + 516 * d + 128) >> 8);

    // Honor the caller-supplied alpha instead of a hard-coded 255.
    return COLOR32(r, g, b, alpha);
}
// https://github.com/pauldotknopf/WindowsSDK7-Samples/blob/master/multimedia/mediafoundation/MFCaptureD3D/device.cpp#L711
//
// Converts an NV12 image (a Y plane followed by an interleaved Cb/Cr plane at
// half vertical and horizontal resolution) into packed 32-bit pixels.
//   pDst, dstStride   - destination buffer and its row pitch in bytes
//   pSrc, srcStride   - source buffer and its row pitch in bytes; the CbCr
//                       plane is read from pSrc + dwHeightInPixels * srcStride
//   dwWidthInPixels, dwHeightInPixels - image size in pixels
// The image is walked in 2x2 pixel quads, each sharing one Cb/Cr pair. For an
// odd width or height the missing column/row of the last quad is skipped so
// nothing is read or written outside the image.
// NOTE(review): for an odd width the last quad still reads a full Cb/Cr pair,
// which assumes srcStride > width - confirm against callers.
static void TransformImage_NV12(BYTE* pDst, LONG dstStride, const BYTE* pSrc, LONG srcStride, DWORD dwWidthInPixels, DWORD dwHeightInPixels)
{
    const uint8_t* lpBitsY = pSrc;
    // Pointer-width signed arithmetic: DWORD * LONG would otherwise be
    // evaluated as a 32-bit unsigned product.
    const uint8_t* lpBitsCbCr = lpBitsY + ((intptr_t)dwHeightInPixels * srcStride);

    for (UINT y = 0; y < dwHeightInPixels; y += 2) {
        // The last pair has only one row when the height is odd.
        const int hasRow1 = (dwHeightInPixels - y) > 1;
        const uint8_t* lpLineY1 = lpBitsY;
        const uint8_t* lpLineY2 = hasRow1 ? (lpBitsY + srcStride) : lpBitsY;
        const uint8_t* lpLineCbCr = lpBitsCbCr;
        uint32_t* lpDibLine1 = (uint32_t*)pDst;
        uint32_t* lpDibLine2 = hasRow1 ? (uint32_t*)(pDst + dstStride) : lpDibLine1;

        for (UINT x = 0; x < dwWidthInPixels; x += 2) {
            // The last quad has only one column when the width is odd.
            const int hasCol1 = (dwWidthInPixels - x) > 1;
            // Chroma bytes are interleaved: Cb first, then Cr.
            const int32_t cb = (int32_t)lpLineCbCr[0];
            const int32_t cr = (int32_t)lpLineCbCr[1];

            lpDibLine1[0] = ConvertYCrCbToRGB((int32_t)lpLineY1[0], cr, cb, 255);
            if (hasCol1) {
                lpDibLine1[1] = ConvertYCrCbToRGB((int32_t)lpLineY1[1], cr, cb, 255);
            }
            if (hasRow1) {
                lpDibLine2[0] = ConvertYCrCbToRGB((int32_t)lpLineY2[0], cr, cb, 255);
                if (hasCol1) {
                    lpDibLine2[1] = ConvertYCrCbToRGB((int32_t)lpLineY2[1], cr, cb, 255);
                }
            }

            lpLineY1 += 2;
            lpLineY2 += 2;
            lpLineCbCr += 2;
            lpDibLine1 += 2;
            lpDibLine2 += 2;
        }

        pDst += (2 * dstStride);
        lpBitsY += (2 * srcStride);
        lpBitsCbCr += srcStride; // one chroma row per two luma rows
    }
}
#endif // CAMERA_CONFIG_USE_SIMD
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment