Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
#include <assert.h>
#include <stdint.h>
#include <smmintrin.h>
#include <stdlib.h>
#include <stdio.h>
#include <time.h>
#include <string.h>
/* Usable size, in bytes, of the per-thread scratch buffer declared below. */
#define CACHED_BUFFER_SIZE 8192
/* TLS expands to the compiler-specific thread-local storage qualifier. */
#ifdef _MSC_VER
#define TLS __declspec(thread)
#else
#define TLS __thread
#endif
/*
 * Per-thread scratch buffer used by copy_frame_sse() to stage rows between
 * copy_from_uswc() and copy_2d().  The 15 extra bytes let copy_frame_sse()
 * round the start address up to a 16-byte boundary and still have
 * CACHED_BUFFER_SIZE bytes available.
 */
static TLS uint8_t cacheBlock[CACHED_BUFFER_SIZE + 15];
/*
 * Copy a width x height rectangle of bytes from pSrc to pDest, reading the
 * source with SSE4.1 streaming loads (_mm_stream_load_si128).
 *
 * pDest    - destination; its first row must be 16-byte aligned (asserted).
 * dstPitch - distance in bytes between consecutive destination rows.
 * pSrc     - source; any alignment is accepted (presumably USWC, i.e.
 *            write-combining, memory judging by the function name).
 * srcPitch - distance in bytes between consecutive source rows.
 * width    - number of bytes to copy per row.
 * height   - number of rows.
 *
 * Each row is copied in three phases: a byte-wise head up to the next
 * 16-byte boundary of the source, 64-byte blocks with streaming loads, and a
 * byte-wise tail.
 *
 * On GCC/Clang the function carries a target("sse4.1") attribute when the
 * translation unit is not already built with SSE4.1 enabled; the caller is
 * still responsible for only calling it on CPUs that support SSE4.1.
 */
#if defined(__GNUC__) && !defined(__SSE4_1__)
__attribute__((target("sse4.1")))
#endif
static void copy_from_uswc(uint8_t *pDest, uint32_t dstPitch,
                           uint8_t *pSrc, uint32_t srcPitch,
                           uint32_t width, uint32_t height)
{
    assert(((uintptr_t)pDest & 0x0f) == 0);

    /* Fence kept from the original code, ahead of the streaming loads. */
    _mm_mfence();

    for (uint32_t y = 0; y < height; ++y)
    {
        /*
         * Bytes needed to advance the source to the next 16-byte boundary:
         * (-addr) & 15, not addr & 15.  Streaming loads fault on addresses
         * that are not 16-byte aligned.  Clamp so a short row is never
         * over-copied.
         */
        uint32_t head = (uint32_t)(((uintptr_t)0 - (uintptr_t)pSrc) & 0x0f);
        if (head > width)
            head = width;

        uint32_t x = 0;
        for (; x < head; ++x)
            pDest[x] = pSrc[x];

        /*
         * From here on &pSrc[x] is aligned (or fewer than 64 bytes remain).
         * Pick the store flavour from the destination's actual alignment.
         * "width - x >= 64" cannot wrap because x <= width always holds.
         */
        if ((((uintptr_t)&pDest[x]) & 0x0f) == 0)
        {
            for (; width - x >= 64; x += 64)
            {
                const __m128i x0 = _mm_stream_load_si128((__m128i *)(&pSrc[x] + 0));
                const __m128i x1 = _mm_stream_load_si128((__m128i *)(&pSrc[x] + 16));
                const __m128i x2 = _mm_stream_load_si128((__m128i *)(&pSrc[x] + 32));
                const __m128i x3 = _mm_stream_load_si128((__m128i *)(&pSrc[x] + 48));

                _mm_store_si128((__m128i *)(&pDest[x] + 0), x0);
                _mm_store_si128((__m128i *)(&pDest[x] + 16), x1);
                _mm_store_si128((__m128i *)(&pDest[x] + 32), x2);
                _mm_store_si128((__m128i *)(&pDest[x] + 48), x3);
            }
        }
        else
        {
            for (; width - x >= 64; x += 64)
            {
                const __m128i x0 = _mm_stream_load_si128((__m128i *)(&pSrc[x] + 0));
                const __m128i x1 = _mm_stream_load_si128((__m128i *)(&pSrc[x] + 16));
                const __m128i x2 = _mm_stream_load_si128((__m128i *)(&pSrc[x] + 32));
                const __m128i x3 = _mm_stream_load_si128((__m128i *)(&pSrc[x] + 48));

                _mm_storeu_si128((__m128i *)(&pDest[x] + 0), x0);
                _mm_storeu_si128((__m128i *)(&pDest[x] + 16), x1);
                _mm_storeu_si128((__m128i *)(&pDest[x] + 32), x2);
                _mm_storeu_si128((__m128i *)(&pDest[x] + 48), x3);
            }
        }

        /* Remaining bytes that do not fill a whole 64-byte block. */
        for (; x < width; ++x)
            pDest[x] = pSrc[x];

        pDest += dstPitch;
        pSrc += srcPitch;
    }
}
/*
 * Copy a width x height rectangle of bytes from pSrc to pDest using 128-bit
 * SSE2 loads and stores.
 *
 * pDest    - destination; any alignment is accepted (checked per row).
 * dstPitch - distance in bytes between consecutive destination rows.
 * pSrc     - source; the first row must be 16-byte aligned (asserted).
 *            Aligned loads are used for every row, so each source row must
 *            be 16-byte aligned whenever width >= 64.
 * srcPitch - distance in bytes between consecutive source rows.
 * width    - number of bytes to copy per row.
 * height   - number of rows.
 */
static void copy_2d(uint8_t *pDest, uint32_t dstPitch,
                    uint8_t *pSrc, uint32_t srcPitch,
                    uint32_t width, uint32_t height)
{
    assert(((uintptr_t)pSrc & 0x0f) == 0);

    /* Fence kept from the original code, ahead of the block copy. */
    _mm_mfence();

    for (uint32_t y = 0; y < height; ++y)
    {
        uint32_t x = 0;

        /*
         * The destination alignment is constant within a row (x advances in
         * multiples of 64), so choose the store flavour once per row.
         * "width - x >= 64" cannot wrap because x <= width always holds.
         */
        if ((((uintptr_t)pDest) & 0x0f) == 0)
        {
            for (; width - x >= 64; x += 64)
            {
                const __m128i x0 = _mm_load_si128((__m128i *)(&pSrc[x] + 0));
                const __m128i x1 = _mm_load_si128((__m128i *)(&pSrc[x] + 16));
                const __m128i x2 = _mm_load_si128((__m128i *)(&pSrc[x] + 32));
                const __m128i x3 = _mm_load_si128((__m128i *)(&pSrc[x] + 48));

                _mm_store_si128((__m128i *)(&pDest[x] + 0), x0);
                _mm_store_si128((__m128i *)(&pDest[x] + 16), x1);
                _mm_store_si128((__m128i *)(&pDest[x] + 32), x2);
                _mm_store_si128((__m128i *)(&pDest[x] + 48), x3);
            }
        }
        else
        {
            for (; width - x >= 64; x += 64)
            {
                const __m128i x0 = _mm_load_si128((__m128i *)(&pSrc[x] + 0));
                const __m128i x1 = _mm_load_si128((__m128i *)(&pSrc[x] + 16));
                const __m128i x2 = _mm_load_si128((__m128i *)(&pSrc[x] + 32));
                const __m128i x3 = _mm_load_si128((__m128i *)(&pSrc[x] + 48));

                _mm_storeu_si128((__m128i *)(&pDest[x] + 0), x0);
                _mm_storeu_si128((__m128i *)(&pDest[x] + 16), x1);
                _mm_storeu_si128((__m128i *)(&pDest[x] + 32), x2);
                _mm_storeu_si128((__m128i *)(&pDest[x] + 48), x3);
            }
        }

        /* Remaining bytes that do not fill a whole 64-byte block. */
        for (; x < width; ++x)
            pDest[x] = pSrc[x];

        pDest += dstPitch;
        pSrc += srcPitch;
    }
}
/*
 * Copy a width x height rectangle of bytes from pSrc to pDest by staging it
 * through the per-thread scratch buffer cacheBlock: copy_from_uswc() fills
 * the scratch buffer from the source, then copy_2d() writes it out to the
 * destination.
 *
 * pDest     - destination buffer.
 * dst_pitch - distance in bytes between consecutive destination rows.
 * pSrc      - source buffer.
 * src_pitch - distance in bytes between consecutive source rows.
 * width     - number of bytes to copy per row.
 * height    - number of rows.
 *
 * The rectangle is processed in tiles that fit in CACHED_BUFFER_SIZE bytes.
 * Rows no wider than CACHED_BUFFER_SIZE are handled several at a time; wider
 * rows are split into column chunks of CACHED_BUFFER_SIZE bytes and handled
 * one row at a time.  An empty rectangle returns immediately.
 */
void copy_frame_sse(uint8_t *pDest, uint32_t dst_pitch,
                    uint8_t *pSrc, uint32_t src_pitch,
                    uint32_t width, uint32_t height)
{
    /* Nothing to copy; this also keeps the divisor below non-zero. */
    if (width == 0 || height == 0)
        return;

    /* Tile width is capped so that at least one padded row always fits. */
    const uint32_t tile_w = width < (uint32_t)CACHED_BUFFER_SIZE
                          ? width : (uint32_t)CACHED_BUFFER_SIZE;
    /* Scratch row pitch: tile width rounded up to a multiple of 16. */
    const uint32_t tile_pitch = (tile_w + 15u) & ~15u;
    /* Rows per tile; >= 1 because tile_pitch <= CACHED_BUFFER_SIZE. */
    const uint32_t hstep = (uint32_t)CACHED_BUFFER_SIZE / tile_pitch;

    /*
     * Round the scratch start up to a 16-byte boundary; the 15 spare bytes
     * in cacheBlock absorb the adjustment.
     */
    uint8_t *cache = cacheBlock
                   + (((uintptr_t)0 - (uintptr_t)cacheBlock) & 0x0f);

    uint32_t rows_left = height;
    while (rows_left > 0)
    {
        const uint32_t hblock = rows_left < hstep ? rows_left : hstep;

        /* A single pass when width <= CACHED_BUFFER_SIZE. */
        for (uint32_t x0 = 0; x0 < width; )
        {
            const uint32_t cw = (width - x0 < tile_w) ? width - x0 : tile_w;

            copy_from_uswc(cache, tile_pitch,
                           pSrc + x0, src_pitch,
                           cw, hblock);
            copy_2d(pDest + x0, dst_pitch,
                    cache, tile_pitch,
                    cw, hblock);
            x0 += cw;
        }

        /* Widen before multiplying so pitch * rows cannot wrap in 32 bits. */
        pSrc += (size_t)src_pitch * hblock;
        pDest += (size_t)dst_pitch * hblock;
        rows_left -= hblock;
    }

    _mm_mfence();
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment