Skip to content

Instantly share code, notes, and snippets.

@3outeille
Created September 6, 2022 17:55
Show Gist options
  • Save 3outeille/7e91686b44dba99e93d11ac428b9f26f to your computer and use it in GitHub Desktop.
Save 3outeille/7e91686b44dba99e93d11ac428b9f26f to your computer and use it in GitHub Desktop.
fg_simulate_grain_blk8x8 sse4
void fg_simulate_grain_blk8x8_sse4(int32_t *grainStripe, uint32_t grainStripeOffsetBlk8,
uint32_t width, uint8_t log2ScaleFactor, int16_t scaleFactor, uint32_t kOffset, uint32_t lOffset, uint8_t h, uint8_t v, uint32_t xSize)
{
uint32_t idx_offset_l1, idx_offset_l2, idx_offset_l3, idx_offset_l4;
uint32_t grainStripeOffsetBlk8_l1, grainStripeOffsetBlk8_l2, grainStripeOffsetBlk8_l3, grainStripeOffsetBlk8_l4;
uint32_t idx_offset = ( h*NUM_CUT_OFF_FREQ + v ) * DATA_BASE_SIZE * DATA_BASE_SIZE;
__m128i scale = _mm_set_epi32(scaleFactor, scaleFactor, scaleFactor, scaleFactor);
for (uint32_t l = 0; l < 8; l+=4) {
idx_offset_l1 = idx_offset + (l + lOffset) * DATA_BASE_SIZE;
idx_offset_l2 = idx_offset + (l + 1 + lOffset) * DATA_BASE_SIZE;
idx_offset_l3 = idx_offset + (l + 2 + lOffset) * DATA_BASE_SIZE;
idx_offset_l4 = idx_offset + (l + 3 + lOffset) * DATA_BASE_SIZE;
grainStripeOffsetBlk8_l1 = grainStripeOffsetBlk8 + (l*width);
grainStripeOffsetBlk8_l2 = grainStripeOffsetBlk8 + ((l + 1)*width);
grainStripeOffsetBlk8_l3 = grainStripeOffsetBlk8 + ((l + 2)*width);
grainStripeOffsetBlk8_l4 = grainStripeOffsetBlk8 + ((l + 3)*width);
for (uint32_t k = 0; k < xSize; k+=4)
{
__m128i fg_data_1 = _mm_loadu_si64(fg_data_base + idx_offset_l1 + (k + kOffset));
__m128i fg_data_1_lo = _mm_cvtepi8_epi32(fg_data_1);
__m128i chunk_1 = _mm_srai_epi32(_mm_mullo_epi32(fg_data_1_lo, scale), log2ScaleFactor + GRAIN_SCALE);
_mm_store_si128(&grainStripe[grainStripeOffsetBlk8_l1 + k], chunk_1);
__m128i fg_data_2 = _mm_loadu_si64(fg_data_base + idx_offset_l2 + (k + kOffset));
__m128i fg_data_2_lo = _mm_cvtepi8_epi32(fg_data_2);
__m128i chunk_2 = _mm_srai_epi32(_mm_mullo_epi32(fg_data_2_lo, scale), log2ScaleFactor + GRAIN_SCALE);
_mm_store_si128(&grainStripe[grainStripeOffsetBlk8_l2 + k], chunk_2);
__m128i fg_data_3 = _mm_loadu_si64(fg_data_base + idx_offset_l3 + (k + kOffset));
__m128i fg_data_3_lo = _mm_cvtepi8_epi32(fg_data_3);
__m128i chunk_3 = _mm_srai_epi32(_mm_mullo_epi32(fg_data_3_lo, scale), log2ScaleFactor + GRAIN_SCALE);
_mm_store_si128(&grainStripe[grainStripeOffsetBlk8_l3 + k], chunk_3);
__m128i fg_data_4 = _mm_loadu_si64(fg_data_base + idx_offset_l4 + (k + kOffset));
__m128i fg_data_4_lo = _mm_cvtepi8_epi32(fg_data_4);
__m128i chunk_4 = _mm_srai_epi32(_mm_mullo_epi32(fg_data_4_lo, scale), log2ScaleFactor + GRAIN_SCALE);
_mm_store_si128(&grainStripe[grainStripeOffsetBlk8_l4 + k], chunk_4);
}
return;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment