// Last active: July 9, 2022 07:05
// Save jdryg/67031564961dbf2b1465950e1e33fe7e to your computer and use it in GitHub Desktop.
// NV12 to RGBA SSE2
// This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
// Learn more about bidirectional Unicode characters
// Bit offset of each 8-bit channel inside a packed 32-bit colour value.
#define COLOR32_RED_SHIFT   0
#define COLOR32_GREEN_SHIFT 8
#define COLOR32_BLUE_SHIFT  16
#define COLOR32_ALPHA_SHIFT 24

// Packs four channel values into a single uint32_t using the shifts above.
// Each argument is widened to uint32_t before shifting but is NOT masked, so a
// value above 255 spills into the neighbouring channel.
// The whole expansion is parenthesized so it stays one operand in any context
// (e.g. `sizeof COLOR32(...)`).
#define COLOR32(r, g, b, a)                                   \
    ((uint32_t)(((uint32_t)(r) << COLOR32_RED_SHIFT) |        \
                ((uint32_t)(g) << COLOR32_GREEN_SHIFT) |      \
                ((uint32_t)(b) << COLOR32_BLUE_SHIFT) |       \
                ((uint32_t)(a) << COLOR32_ALPHA_SHIFT)))
#if CAMERA_CONFIG_USE_SIMD
// SSE2 stand-in for a packed 32-bit low multiply:
//   dst[i] = low 32 bits of (a[i] * b[i]), i = 0..3
// The low 32 bits of a product are identical for signed and unsigned operands,
// so the unsigned _mm_mul_epu32 also serves signed inputs.
// Parameters are taken by value so the helper compiles as both C and C++.
// NOTE(review): this name matches the SSE4.1 intrinsic and sits in the
// implementation-reserved namespace; presumably it clashes if <smmintrin.h>
// is ever included in this translation unit - confirm.
static inline __m128i _mm_mullo_epi32(__m128i a, __m128i b)
{
    // _mm_mul_epu32 multiplies lanes 0 and 2 only, yielding two 64-bit products.
    const __m128i prod_02 = _mm_mul_epu32(a, b);
    // Shift lanes 1 and 3 down into positions 0 and 2 and multiply those too.
    const __m128i prod_13 = _mm_mul_epu32(_mm_srli_si128(a, 4), _mm_srli_si128(b, 4));
    // Gather the low dword of each 64-bit product into lanes 0 and 1.
    const __m128i lo_02 = _mm_shuffle_epi32(prod_02, _MM_SHUFFLE(0, 0, 2, 0));
    const __m128i lo_13 = _mm_shuffle_epi32(prod_13, _MM_SHUFFLE(0, 0, 2, 0));
    // Interleave back into the original lane order: { p0, p1, p2, p3 }.
    return _mm_unpacklo_epi32(lo_02, lo_13);
}
// Two-source 32-bit lane shuffle for integer vectors, built on _mm_shuffle_ps:
//   dst = { a[imm8[1:0]], a[imm8[3:2]], b[imm8[5:4]], b[imm8[7:6]] }
// imm8 must be a compile-time constant (e.g. an _MM_SHUFFLE(...) expression).
#define _mm_shuffle_si128(a, b, imm8) \
    _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps((a)), _mm_castsi128_ps((b)), (imm8)))
// Clamps each signed 32-bit lane of a to the [0, 255] range; the result is
// still held in 32-bit lanes.
// dst[0..3] = Saturate8(a[0..3])
// The parameter is taken by value so the helper compiles as both C and C++.
static inline __m128i _mm_saturate8_epi32(__m128i a)
{
    const __m128i zero = _mm_setzero_si128();
    // dword -> word with SIGNED saturation, so negative inputs stay negative.
    const __m128i words = _mm_packs_epi32(a, a);
    // word -> byte with UNSIGNED saturation: < 0 becomes 0, > 255 becomes 255.
    const __m128i bytes = _mm_packus_epi16(words, words);
    // Zero-extend the low four bytes back to four dwords.
    return _mm_unpacklo_epi16(_mm_unpacklo_epi8(bytes, zero), zero);
}
static void TransformImage_NV12(BYTE* pDst, LONG dstStride, const BYTE* pSrc, LONG srcStride, DWORD dwWidthInPixels, DWORD dwHeightInPixels) | |
{ | |
const uint8_t* lpBitsY = pSrc; | |
const uint8_t* lpBitsCbCr = lpBitsY + (dwHeightInPixels * srcStride); | |
for (UINT y = 0; y < dwHeightInPixels; y += 2) { | |
const uint8_t* row0_Y = lpBitsY; | |
const uint8_t* row1_Y = lpBitsY + srcStride; | |
const uint8_t* row01_CbCr = lpBitsCbCr; | |
uint32_t* dstRow0_RGBA = (uint32_t*)pDst; | |
uint32_t* dstRow1_RGBA = (uint32_t*)(pDst + dstStride); | |
const uint32_t numIter = dwWidthInPixels >> 4; // 16 pixels per iteration | |
for (uint32_t i = 0; i < numIter; ++i) { | |
const __m128i xmm_zero = _mm_setzero_si128(); | |
__m128i xmm_CbCr_u8_0 = _mm_loadu_si128((const __m128i*)row01_CbCr); // (uint8_t){ Cb0, Cr0, Cb1, Cr1, Cb2, Cr2, Cb3, Cr3, Cb4, Cr4, Cb5, Cr5, Cb6, Cr6, Cb7, Cr7 } | |
// Unpack bytes into words | |
__m128i xmm_CbCr_u16_0 = _mm_unpacklo_epi8(xmm_CbCr_u8_0, xmm_zero); // (uint16_t){ Cb0, Cr0, Cb1, Cr1, Cb2, Cr2, Cb3, Cr3 } | |
__m128i xmm_CbCr_u16_1 = _mm_unpackhi_epi8(xmm_CbCr_u8_0, xmm_zero); // (uint16_t){ Cb4, Cr4, Cb5, Cr5, Cb6, Cr6, Cb7, Cr7 } | |
// Unpack words into dwords | |
__m128i xmm_CbCr_u32_0 = _mm_unpacklo_epi16(xmm_CbCr_u16_0, xmm_zero); // (uint32_t){ Cb0, Cr0, Cb1, Cr1 } | |
__m128i xmm_CbCr_u32_1 = _mm_unpackhi_epi16(xmm_CbCr_u16_0, xmm_zero); // (uint32_t){ Cb2, Cr2, Cb3, Cr3 } | |
__m128i xmm_CbCr_u32_2 = _mm_unpacklo_epi16(xmm_CbCr_u16_1, xmm_zero); // (uint32_t){ Cb4, Cr4, Cb5, Cr5 } | |
__m128i xmm_CbCr_u32_3 = _mm_unpackhi_epi16(xmm_CbCr_u16_1, xmm_zero); // (uint32_t){ Cb6, Cr6, Cb7, Cr7 } | |
// Shuffle from { Cb, Cr, Cb, Cr } to { Cb, Cb, Cb, Cb } + { Cr, Cr, Cr, Cr } | |
__m128i xmm_Cb_u32_0 = _mm_shuffle_si128(xmm_CbCr_u32_0, xmm_CbCr_u32_1, _MM_SHUFFLE(2, 0, 2, 0)); // (uint32_t){ Cb0, Cb1, Cb2, Cb3 } | |
__m128i xmm_Cr_u32_0 = _mm_shuffle_si128(xmm_CbCr_u32_0, xmm_CbCr_u32_1, _MM_SHUFFLE(3, 1, 3, 1)); // (uint32_t){ Cr0, Cr1, Cr2, Cr3 } | |
__m128i xmm_Cb_u32_1 = _mm_shuffle_si128(xmm_CbCr_u32_2, xmm_CbCr_u32_3, _MM_SHUFFLE(2, 0, 2, 0)); // (uint32_t){ Cb4, Cb5, Cb6, Cb7 } | |
__m128i xmm_Cr_u32_1 = _mm_shuffle_si128(xmm_CbCr_u32_2, xmm_CbCr_u32_3, _MM_SHUFFLE(3, 1, 3, 1)); // (uint32_t){ Cr4, Cr5, Cr6, Cr7 } | |
// Calculate RGB shifts | |
// r_shift = -52224 + 0 * Cb + 409 * Cr | |
// g_shift = 39552 - 100 * Cb - 208 * Cr | |
// b_shift = -65920 + 516 * Cb + 0 * Cr | |
const __m128i xmm_rshift_off = _mm_set1_epi32(-52224); | |
const __m128i xmm_rshift_cr = _mm_set1_epi32(409); | |
__m128i xmm_rshift_0 = _mm_add_epi32(xmm_rshift_off, _mm_mullo_epi32(xmm_rshift_cr, xmm_Cr_u32_0)); // (uint32_t){ r_shift0, r_shift1, r_shift2, r_shift3 } | |
__m128i xmm_rshift_1 = _mm_add_epi32(xmm_rshift_off, _mm_mullo_epi32(xmm_rshift_cr, xmm_Cr_u32_1)); // (uint32_t){ r_shift4, r_shift5, r_shift6, r_shift7 } | |
const __m128i xmm_gshift_off = _mm_set1_epi32(39552); | |
const __m128i xmm_gshift_cb = _mm_set1_epi32(-100); | |
const __m128i xmm_gshift_cr = _mm_set1_epi32(-208); | |
__m128i xmm_gshift_0 = _mm_add_epi32(xmm_gshift_off, _mm_add_epi32(_mm_mullo_epi32(xmm_gshift_cb, xmm_Cb_u32_0), _mm_mullo_epi32(xmm_gshift_cr, xmm_Cr_u32_0))); // (uint32_t){ g_shift0, g_shift1, g_shift2, g_shift3 } | |
__m128i xmm_gshift_1 = _mm_add_epi32(xmm_gshift_off, _mm_add_epi32(_mm_mullo_epi32(xmm_gshift_cb, xmm_Cb_u32_1), _mm_mullo_epi32(xmm_gshift_cr, xmm_Cr_u32_1))); // (uint32_t){ g_shift4, g_shift5, g_shift6, g_shift7 } | |
const __m128i xmm_bshift_off = _mm_set1_epi32(-65920); | |
const __m128i xmm_bshift_cb = _mm_set1_epi32(516); | |
__m128i xmm_bshift_0 = _mm_add_epi32(xmm_bshift_off, _mm_mullo_epi32(xmm_bshift_cb, xmm_Cb_u32_0)); // (uint32_t){ b_shift0, b_shift1, b_shift2, b_shift3 } | |
__m128i xmm_bshift_1 = _mm_add_epi32(xmm_bshift_off, _mm_mullo_epi32(xmm_bshift_cb, xmm_Cb_u32_1)); // (uint32_t){ b_shift4, b_shift5, b_shift6, b_shift7 } | |
// Row #0 | |
{ | |
// Load Y values | |
__m128i xmm_Y_u8 = _mm_loadu_si128((const __m128i*)row0_Y); // (uint8_t){ Y0, Y1, Y2, Y3, Y4, Y5, Y6, Y7, Y8, Y9, Y10, Y11, Y12, Y13, Y14, Y15 } | |
// Unpack bytes into words | |
__m128i xmm_Y_u16_0 = _mm_unpacklo_epi8(xmm_Y_u8, xmm_zero); // (uint16_t){ Y0, Y1, Y2, Y3, Y4, Y5, Y6, Y7 } | |
__m128i xmm_Y_u16_1 = _mm_unpackhi_epi8(xmm_Y_u8, xmm_zero); // (uint16_t){ Y8, Y9, Y10, Y11, Y12, Y13, Y14, Y15 } | |
// Unpack words into dwords | |
__m128i xmm_Y_u32_0 = _mm_unpacklo_epi16(xmm_Y_u16_0, xmm_zero); // (uint32_t){ Y0, Y1, Y2, Y3 } | |
__m128i xmm_Y_u32_1 = _mm_unpackhi_epi16(xmm_Y_u16_0, xmm_zero); // (uint32_t){ Y4, Y5, Y6, Y7 } | |
__m128i xmm_Y_u32_2 = _mm_unpacklo_epi16(xmm_Y_u16_1, xmm_zero); // (uint32_t){ Y8, Y9, Y10, Y11 } | |
__m128i xmm_Y_u32_3 = _mm_unpackhi_epi16(xmm_Y_u16_1, xmm_zero); // (uint32_t){ Y12, Y13, Y14, Y15 } | |
// Calculate base color (only Y-dependent) | |
// base = -4768 + 298 * Y | |
const __m128i xmm_base_off = _mm_set1_epi32(-4768); | |
const __m128i xmm_base_Y = _mm_set1_epi32(298); | |
__m128i xmm_base_0 = _mm_add_epi32(xmm_base_off, _mm_mullo_epi32(xmm_base_Y, xmm_Y_u32_0)); // (uint32_t){ base0, base1, base2, base3 } | |
__m128i xmm_base_1 = _mm_add_epi32(xmm_base_off, _mm_mullo_epi32(xmm_base_Y, xmm_Y_u32_1)); // (uint32_t){ base4, base5, base6, base7 } | |
__m128i xmm_base_2 = _mm_add_epi32(xmm_base_off, _mm_mullo_epi32(xmm_base_Y, xmm_Y_u32_2)); // (uint32_t){ base8, base9, base10, base11 } | |
__m128i xmm_base_3 = _mm_add_epi32(xmm_base_off, _mm_mullo_epi32(xmm_base_Y, xmm_Y_u32_3)); // (uint32_t){ base12, base13, base14, base15 } | |
// Calculate R = (base + r_shift) >> 8 | |
__m128i xmm_r_0 = _mm_srai_epi32(_mm_add_epi32(xmm_base_0, _mm_shuffle_epi32(xmm_rshift_0, _MM_SHUFFLE(1, 1, 0, 0))), 8); // (uint32_t){ r0, r1, r2, r3 } | |
__m128i xmm_r_1 = _mm_srai_epi32(_mm_add_epi32(xmm_base_1, _mm_shuffle_epi32(xmm_rshift_0, _MM_SHUFFLE(3, 3, 2, 2))), 8); // (uint32_t)( r4, r5, r6, r7 } | |
__m128i xmm_r_2 = _mm_srai_epi32(_mm_add_epi32(xmm_base_2, _mm_shuffle_epi32(xmm_rshift_1, _MM_SHUFFLE(1, 1, 0, 0))), 8); // (uint32_t){ r8, r9, r10, r11 } | |
__m128i xmm_r_3 = _mm_srai_epi32(_mm_add_epi32(xmm_base_3, _mm_shuffle_epi32(xmm_rshift_1, _MM_SHUFFLE(3, 3, 2, 2))), 8); // (uint32_t)( r12, r13, r14, r15 } | |
// Calculate G = (base + g_shift) >> 8 | |
__m128i xmm_g_0 = _mm_srai_epi32(_mm_add_epi32(xmm_base_0, _mm_shuffle_epi32(xmm_gshift_0, _MM_SHUFFLE(1, 1, 0, 0))), 8); // (uint32_t){ g0, g1, g2, g3 } | |
__m128i xmm_g_1 = _mm_srai_epi32(_mm_add_epi32(xmm_base_1, _mm_shuffle_epi32(xmm_gshift_0, _MM_SHUFFLE(3, 3, 2, 2))), 8); // (uint32_t)( g4, g5, g6, g7 } | |
__m128i xmm_g_2 = _mm_srai_epi32(_mm_add_epi32(xmm_base_2, _mm_shuffle_epi32(xmm_gshift_1, _MM_SHUFFLE(1, 1, 0, 0))), 8); // (uint32_t){ g8, g9, g10, g11 } | |
__m128i xmm_g_3 = _mm_srai_epi32(_mm_add_epi32(xmm_base_3, _mm_shuffle_epi32(xmm_gshift_1, _MM_SHUFFLE(3, 3, 2, 2))), 8); // (uint32_t)( g12, g13, g14, g15 } | |
// Calculate B = (base + b_shift) >> 8 | |
__m128i xmm_b_0 = _mm_srai_epi32(_mm_add_epi32(xmm_base_0, _mm_shuffle_epi32(xmm_bshift_0, _MM_SHUFFLE(1, 1, 0, 0))), 8); // (uint32_t){ b0, b1, b2, b3 } | |
__m128i xmm_b_1 = _mm_srai_epi32(_mm_add_epi32(xmm_base_1, _mm_shuffle_epi32(xmm_bshift_0, _MM_SHUFFLE(3, 3, 2, 2))), 8); // (uint32_t)( b4, b5, b6, b7 } | |
__m128i xmm_b_2 = _mm_srai_epi32(_mm_add_epi32(xmm_base_2, _mm_shuffle_epi32(xmm_bshift_1, _MM_SHUFFLE(1, 1, 0, 0))), 8); // (uint32_t){ b8, b9, b10, b11 } | |
__m128i xmm_b_3 = _mm_srai_epi32(_mm_add_epi32(xmm_base_3, _mm_shuffle_epi32(xmm_bshift_1, _MM_SHUFFLE(3, 3, 2, 2))), 8); // (uint32_t)( b12, b13, b14, b15 } | |
// Clamp RGB results to [0, 255] | |
xmm_r_0 = _mm_saturate8_epi32(xmm_r_0); | |
xmm_r_1 = _mm_saturate8_epi32(xmm_r_1); | |
xmm_r_2 = _mm_saturate8_epi32(xmm_r_2); | |
xmm_r_3 = _mm_saturate8_epi32(xmm_r_3); | |
xmm_g_0 = _mm_saturate8_epi32(xmm_g_0); | |
xmm_g_1 = _mm_saturate8_epi32(xmm_g_1); | |
xmm_g_2 = _mm_saturate8_epi32(xmm_g_2); | |
xmm_g_3 = _mm_saturate8_epi32(xmm_g_3); | |
xmm_b_0 = _mm_saturate8_epi32(xmm_b_0); | |
xmm_b_1 = _mm_saturate8_epi32(xmm_b_1); | |
xmm_b_2 = _mm_saturate8_epi32(xmm_b_2); | |
xmm_b_3 = _mm_saturate8_epi32(xmm_b_3); | |
// Merge R,G,B values into RGBA results | |
const __m128i xmm_a = _mm_slli_epi32(_mm_set1_epi32(255), COLOR32_ALPHA_SHIFT); | |
__m128i xmm_rgba_0 = _mm_or_si128( | |
_mm_or_si128(_mm_slli_epi32(xmm_r_0, COLOR32_RED_SHIFT), _mm_slli_epi32(xmm_g_0, COLOR32_GREEN_SHIFT)), | |
_mm_or_si128(_mm_slli_epi32(xmm_b_0, COLOR32_BLUE_SHIFT), xmm_a) | |
); | |
__m128i xmm_rgba_1 = _mm_or_si128( | |
_mm_or_si128(_mm_slli_epi32(xmm_r_1, COLOR32_RED_SHIFT), _mm_slli_epi32(xmm_g_1, COLOR32_GREEN_SHIFT)), | |
_mm_or_si128(_mm_slli_epi32(xmm_b_1, COLOR32_BLUE_SHIFT), xmm_a) | |
); | |
__m128i xmm_rgba_2 = _mm_or_si128( | |
_mm_or_si128(_mm_slli_epi32(xmm_r_2, COLOR32_RED_SHIFT), _mm_slli_epi32(xmm_g_2, COLOR32_GREEN_SHIFT)), | |
_mm_or_si128(_mm_slli_epi32(xmm_b_2, COLOR32_BLUE_SHIFT), xmm_a) | |
); | |
__m128i xmm_rgba_3 = _mm_or_si128( | |
_mm_or_si128(_mm_slli_epi32(xmm_r_3, COLOR32_RED_SHIFT), _mm_slli_epi32(xmm_g_3, COLOR32_GREEN_SHIFT)), | |
_mm_or_si128(_mm_slli_epi32(xmm_b_3, COLOR32_BLUE_SHIFT), xmm_a) | |
); | |
// Store results | |
_mm_storeu_si128((__m128i*)(dstRow0_RGBA + 0), xmm_rgba_0); | |
_mm_storeu_si128((__m128i*)(dstRow0_RGBA + 4), xmm_rgba_1); | |
_mm_storeu_si128((__m128i*)(dstRow0_RGBA + 8), xmm_rgba_2); | |
_mm_storeu_si128((__m128i*)(dstRow0_RGBA + 12), xmm_rgba_3); | |
dstRow0_RGBA += 16; | |
row0_Y += 16; | |
} | |
// Row #1 | |
{ | |
// Load Y values | |
__m128i xmm_Y_u8 = _mm_loadu_si128((const __m128i*)row1_Y); // (uint8_t){ Y0, Y1, Y2, Y3, Y4, Y5, Y6, Y7, Y8, Y9, Y10, Y11, Y12, Y13, Y14, Y15 } | |
// Unpack bytes into words | |
__m128i xmm_Y_u16_0 = _mm_unpacklo_epi8(xmm_Y_u8, xmm_zero); // (uint16_t){ Y0, Y1, Y2, Y3, Y4, Y5, Y6, Y7 } | |
__m128i xmm_Y_u16_1 = _mm_unpackhi_epi8(xmm_Y_u8, xmm_zero); // (uint16_t){ Y8, Y9, Y10, Y11, Y12, Y13, Y14, Y15 } | |
// Unpack words into dwords | |
__m128i xmm_Y_u32_0 = _mm_unpacklo_epi16(xmm_Y_u16_0, xmm_zero); // (uint32_t){ Y0, Y1, Y2, Y3 } | |
__m128i xmm_Y_u32_1 = _mm_unpackhi_epi16(xmm_Y_u16_0, xmm_zero); // (uint32_t){ Y4, Y5, Y6, Y7 } | |
__m128i xmm_Y_u32_2 = _mm_unpacklo_epi16(xmm_Y_u16_1, xmm_zero); // (uint32_t){ Y8, Y9, Y10, Y11 } | |
__m128i xmm_Y_u32_3 = _mm_unpackhi_epi16(xmm_Y_u16_1, xmm_zero); // (uint32_t){ Y12, Y13, Y14, Y15 } | |
// Calculate base color (only Y-dependent) | |
// base = -4768 + 298 * Y | |
const __m128i xmm_base_off = _mm_set1_epi32(-4768); | |
const __m128i xmm_base_Y = _mm_set1_epi32(298); | |
__m128i xmm_base_0 = _mm_add_epi32(xmm_base_off, _mm_mullo_epi32(xmm_base_Y, xmm_Y_u32_0)); // (uint32_t){ base0, base1, base2, base3 } | |
__m128i xmm_base_1 = _mm_add_epi32(xmm_base_off, _mm_mullo_epi32(xmm_base_Y, xmm_Y_u32_1)); // (uint32_t){ base4, base5, base6, base7 } | |
__m128i xmm_base_2 = _mm_add_epi32(xmm_base_off, _mm_mullo_epi32(xmm_base_Y, xmm_Y_u32_2)); // (uint32_t){ base8, base9, base10, base11 } | |
__m128i xmm_base_3 = _mm_add_epi32(xmm_base_off, _mm_mullo_epi32(xmm_base_Y, xmm_Y_u32_3)); // (uint32_t){ base12, base13, base14, base15 } | |
// Calculate R = (base + r_shift) >> 8 | |
__m128i xmm_r_0 = _mm_srai_epi32(_mm_add_epi32(xmm_base_0, _mm_shuffle_epi32(xmm_rshift_0, _MM_SHUFFLE(1, 1, 0, 0))), 8); // (uint32_t){ r0, r1, r2, r3 } | |
__m128i xmm_r_1 = _mm_srai_epi32(_mm_add_epi32(xmm_base_1, _mm_shuffle_epi32(xmm_rshift_0, _MM_SHUFFLE(3, 3, 2, 2))), 8); // (uint32_t)( r4, r5, r6, r7 } | |
__m128i xmm_r_2 = _mm_srai_epi32(_mm_add_epi32(xmm_base_2, _mm_shuffle_epi32(xmm_rshift_1, _MM_SHUFFLE(1, 1, 0, 0))), 8); // (uint32_t){ r8, r9, r10, r11 } | |
__m128i xmm_r_3 = _mm_srai_epi32(_mm_add_epi32(xmm_base_3, _mm_shuffle_epi32(xmm_rshift_1, _MM_SHUFFLE(3, 3, 2, 2))), 8); // (uint32_t)( r12, r13, r14, r15 } | |
// Calculate G = (base + g_shift) >> 8 | |
__m128i xmm_g_0 = _mm_srai_epi32(_mm_add_epi32(xmm_base_0, _mm_shuffle_epi32(xmm_gshift_0, _MM_SHUFFLE(1, 1, 0, 0))), 8); // (uint32_t){ g0, g1, g2, g3 } | |
__m128i xmm_g_1 = _mm_srai_epi32(_mm_add_epi32(xmm_base_1, _mm_shuffle_epi32(xmm_gshift_0, _MM_SHUFFLE(3, 3, 2, 2))), 8); // (uint32_t)( g4, g5, g6, g7 } | |
__m128i xmm_g_2 = _mm_srai_epi32(_mm_add_epi32(xmm_base_2, _mm_shuffle_epi32(xmm_gshift_1, _MM_SHUFFLE(1, 1, 0, 0))), 8); // (uint32_t){ g8, g9, g10, g11 } | |
__m128i xmm_g_3 = _mm_srai_epi32(_mm_add_epi32(xmm_base_3, _mm_shuffle_epi32(xmm_gshift_1, _MM_SHUFFLE(3, 3, 2, 2))), 8); // (uint32_t)( g12, g13, g14, g15 } | |
// Calculate B = (base + b_shift) >> 8 | |
__m128i xmm_b_0 = _mm_srai_epi32(_mm_add_epi32(xmm_base_0, _mm_shuffle_epi32(xmm_bshift_0, _MM_SHUFFLE(1, 1, 0, 0))), 8); // (uint32_t){ b0, b1, b2, b3 } | |
__m128i xmm_b_1 = _mm_srai_epi32(_mm_add_epi32(xmm_base_1, _mm_shuffle_epi32(xmm_bshift_0, _MM_SHUFFLE(3, 3, 2, 2))), 8); // (uint32_t)( b4, b5, b6, b7 } | |
__m128i xmm_b_2 = _mm_srai_epi32(_mm_add_epi32(xmm_base_2, _mm_shuffle_epi32(xmm_bshift_1, _MM_SHUFFLE(1, 1, 0, 0))), 8); // (uint32_t){ b8, b9, b10, b11 } | |
__m128i xmm_b_3 = _mm_srai_epi32(_mm_add_epi32(xmm_base_3, _mm_shuffle_epi32(xmm_bshift_1, _MM_SHUFFLE(3, 3, 2, 2))), 8); // (uint32_t)( b12, b13, b14, b15 } | |
// Clamp RGB results to [0, 255] | |
xmm_r_0 = _mm_saturate8_epi32(xmm_r_0); | |
xmm_r_1 = _mm_saturate8_epi32(xmm_r_1); | |
xmm_r_2 = _mm_saturate8_epi32(xmm_r_2); | |
xmm_r_3 = _mm_saturate8_epi32(xmm_r_3); | |
xmm_g_0 = _mm_saturate8_epi32(xmm_g_0); | |
xmm_g_1 = _mm_saturate8_epi32(xmm_g_1); | |
xmm_g_2 = _mm_saturate8_epi32(xmm_g_2); | |
xmm_g_3 = _mm_saturate8_epi32(xmm_g_3); | |
xmm_b_0 = _mm_saturate8_epi32(xmm_b_0); | |
xmm_b_1 = _mm_saturate8_epi32(xmm_b_1); | |
xmm_b_2 = _mm_saturate8_epi32(xmm_b_2); | |
xmm_b_3 = _mm_saturate8_epi32(xmm_b_3); | |
// Merge R,G,B values into RGBA results | |
const __m128i xmm_a = _mm_slli_epi32(_mm_set1_epi32(255), COLOR32_ALPHA_SHIFT); | |
__m128i xmm_rgba_0 = _mm_or_si128( | |
_mm_or_si128(_mm_slli_epi32(xmm_r_0, COLOR32_RED_SHIFT), _mm_slli_epi32(xmm_g_0, COLOR32_GREEN_SHIFT)), | |
_mm_or_si128(_mm_slli_epi32(xmm_b_0, COLOR32_BLUE_SHIFT), xmm_a) | |
); | |
__m128i xmm_rgba_1 = _mm_or_si128( | |
_mm_or_si128(_mm_slli_epi32(xmm_r_1, COLOR32_RED_SHIFT), _mm_slli_epi32(xmm_g_1, COLOR32_GREEN_SHIFT)), | |
_mm_or_si128(_mm_slli_epi32(xmm_b_1, COLOR32_BLUE_SHIFT), xmm_a) | |
); | |
__m128i xmm_rgba_2 = _mm_or_si128( | |
_mm_or_si128(_mm_slli_epi32(xmm_r_2, COLOR32_RED_SHIFT), _mm_slli_epi32(xmm_g_2, COLOR32_GREEN_SHIFT)), | |
_mm_or_si128(_mm_slli_epi32(xmm_b_2, COLOR32_BLUE_SHIFT), xmm_a) | |
); | |
__m128i xmm_rgba_3 = _mm_or_si128( | |
_mm_or_si128(_mm_slli_epi32(xmm_r_3, COLOR32_RED_SHIFT), _mm_slli_epi32(xmm_g_3, COLOR32_GREEN_SHIFT)), | |
_mm_or_si128(_mm_slli_epi32(xmm_b_3, COLOR32_BLUE_SHIFT), xmm_a) | |
); | |
// Store results | |
_mm_storeu_si128((__m128i*)(dstRow1_RGBA + 0), xmm_rgba_0); | |
_mm_storeu_si128((__m128i*)(dstRow1_RGBA + 4), xmm_rgba_1); | |
_mm_storeu_si128((__m128i*)(dstRow1_RGBA + 8), xmm_rgba_2); | |
_mm_storeu_si128((__m128i*)(dstRow1_RGBA + 12), xmm_rgba_3); | |
dstRow1_RGBA += 16; | |
row1_Y += 16; | |
} | |
row01_CbCr += 16; | |
} | |
pDst += (2 * dstStride); | |
lpBitsY += (2 * srcStride); | |
lpBitsCbCr += srcStride; | |
} | |
} | |
#else // CAMERA_CONFIG_USE_SIMD
// Clamps clr to the [0, 255] range and returns it as an 8-bit value.
static inline uint8_t Clip(int clr)
{
    return (uint8_t)(clr < 0 ? 0 : (clr > 255 ? 255 : clr));
}
// Converts one Y/Cr/Cb sample triple to a packed 32-bit colour (see COLOR32).
// Note the parameter order: Cr comes before Cb.
//   y, cr, cb : sample values; 16 is subtracted from y and 128 from each chroma
//               value before the fixed-point (8 fractional bits) multiply.
//   alpha     : value stored in the alpha channel.
// The "+ 128" term rounds to nearest before the ">> 8"; each channel is then
// clamped to [0, 255] by Clip.
// NOTE(review): the coefficients look like the usual BT.601 studio-range
// conversion - confirm if another colour matrix is expected.
static inline uint32_t ConvertYCrCbToRGB(int32_t y, int32_t cr, int32_t cb, uint8_t alpha)
{
    const int32_t c = y - 16;
    const int32_t d = cb - 128;
    const int32_t e = cr - 128;

    const uint8_t r = Clip((298 * c + 409 * e + 128) >> 8);
    const uint8_t g = Clip((298 * c - 100 * d - 208 * e + 128) >> 8);
    const uint8_t b = Clip((298 * c + 516 * d + 128) >> 8);

    // Honour the caller-supplied alpha instead of a hard-coded 255.
    return COLOR32(r, g, b, alpha);
}
// Converts an NV12 image to packed 32-bit RGBA (scalar path).
// Based on:
// https://github.com/pauldotknopf/WindowsSDK7-Samples/blob/master/multimedia/mediafoundation/MFCaptureD3D/device.cpp#L711
//
// pDst / dstStride          : destination pixels and its row pitch in bytes.
// pSrc / srcStride          : Y plane followed, at dwHeightInPixels * srcStride
//                             bytes, by the interleaved Cb/Cr plane; srcStride
//                             is the row pitch of both planes.
// dwWidthInPixels / Height  : image size in pixels.
//
// Each pass converts a 2x2 block of pixels that shares one Cb/Cr pair. When
// the width or height is odd, the missing second column / second row of the
// last block is skipped so nothing outside the image is read or written.
// NOTE(review): for odd heights the chroma plane is still assumed to start at
// dwHeightInPixels * srcStride - confirm against the producer of the buffer.
static void TransformImage_NV12(BYTE* pDst, LONG dstStride, const BYTE* pSrc, LONG srcStride, DWORD dwWidthInPixels, DWORD dwHeightInPixels)
{
    const uint8_t* lpBitsY = pSrc;
    const uint8_t* lpBitsCb = lpBitsY + (dwHeightInPixels * srcStride);
    const uint8_t* lpBitsCr = lpBitsCb + 1; // Cb and Cr are interleaved byte by byte

    for (UINT y = 0; y < dwHeightInPixels; y += 2) {
        const int hasRow2 = (y + 1) < dwHeightInPixels;
        const uint8_t* lpLineY1 = lpBitsY;
        const uint8_t* lpLineY2 = lpBitsY + srcStride;
        const uint8_t* lpLineCr = lpBitsCr;
        const uint8_t* lpLineCb = lpBitsCb;
        uint32_t* lpDibLine1 = (uint32_t*)pDst;
        uint32_t* lpDibLine2 = (uint32_t*)(pDst + dstStride);

        for (UINT x = 0; x < dwWidthInPixels; x += 2) {
            const int hasCol2 = (x + 1) < dwWidthInPixels;
            const int32_t cb = (int32_t)lpLineCb[0];
            const int32_t cr = (int32_t)lpLineCr[0];

            lpDibLine1[0] = ConvertYCrCbToRGB((int32_t)lpLineY1[0], cr, cb, 255);
            if (hasCol2) {
                lpDibLine1[1] = ConvertYCrCbToRGB((int32_t)lpLineY1[1], cr, cb, 255);
            }
            if (hasRow2) {
                lpDibLine2[0] = ConvertYCrCbToRGB((int32_t)lpLineY2[0], cr, cb, 255);
                if (hasCol2) {
                    lpDibLine2[1] = ConvertYCrCbToRGB((int32_t)lpLineY2[1], cr, cb, 255);
                }
            }

            lpLineY1 += 2;
            lpLineY2 += 2;
            lpLineCr += 2;
            lpLineCb += 2;
            lpDibLine1 += 2;
            lpDibLine2 += 2;
        }

        pDst += (2 * dstStride);
        lpBitsY += (2 * srcStride);
        // One chroma row serves two luma rows.
        lpBitsCr += srcStride;
        lpBitsCb += srcStride;
    }
}
#endif // CAMERA_CONFIG_USE_SIMD
// Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment