TimoRoth/uswc_copy.c

## uswc_copy.c

#include <assert.h>
#include <stdint.h>
#include <smmintrin.h>
#include <stdlib.h>
#include <stdio.h>
#include <time.h>
#include <string.h>

/*
 * Usable size, in bytes, of the per-thread scratch buffer declared below.
 * copy_frame_sse() divides it by the padded row width to decide how many
 * rows it stages per pass.
 */
#define CACHED_BUFFER_SIZE 8192

/* TLS: compiler-specific spelling of the thread-local storage qualifier. */
#ifdef _MSC_VER
#define TLS __declspec(thread)
#else
#define TLS __thread
#endif

/*
 * Per-thread scratch buffer. The 15 extra bytes let copy_frame_sse() skip
 * forward to the first 16-byte aligned address inside the array and still
 * have CACHED_BUFFER_SIZE bytes available after it.
 */
static TLS uint8_t cacheBlock[CACHED_BUFFER_SIZE + 15];

/*
 * Copy a width x height rectangle of bytes from pSrc to pDest, reading the
 * bulk of every row with SSE4.1 streaming loads (_mm_stream_load_si128).
 * Going by the function name, pSrc is presumably uncached write-combining
 * (USWC) memory -- confirm against the callers.
 *
 *   pDest     destination; must be 16-byte aligned on entry (asserted)
 *   dstPitch  distance in bytes between consecutive destination rows
 *   pSrc      source; rows may start at any alignment
 *   srcPitch  distance in bytes between consecutive source rows
 *   width     number of bytes to copy per row
 *   height    number of rows
 *
 * Every row is handled in three parts: a byte-wise head that advances the
 * source to the next 16-byte boundary, 64-byte blocks moved through four
 * XMM values, and a byte-wise tail.
 */
#ifdef __GNUC__
/* Allows the SSE4.1 intrinsic below even when the rest of the translation
 * unit is built without -msse4.1. */
__attribute__((target("sse4.1")))
#endif
static void copy_from_uswc(uint8_t *pDest, uint32_t dstPitch,
        uint8_t *pSrc, uint32_t srcPitch,
        uint32_t width, uint32_t height)
{
        assert(((uintptr_t)pDest & 0x0f) == 0);

        /* Full fence ahead of the streaming loads (kept from the original). */
        _mm_mfence();

        for (uint32_t y = 0; y < height; ++y)
        {
                /*
                 * Number of leading bytes to copy one at a time so that
                 * pSrc + x lands on a 16-byte boundary, which the streaming
                 * load requires. Clamped so a narrow row is never overrun.
                 */
                uint32_t head = (uint32_t)((16 - ((uintptr_t)pSrc & 0x0f)) & 0x0f);
                if (head > width)
                        head = width;

                uint32_t x = 0;
                for (; x < head; ++x)
                        pDest[x] = pSrc[x];

                /* Aligned stores are only legal if the destination is
                 * 16-byte aligned at the point where the blocks start. */
                const int dstAligned = ((uintptr_t)(pDest + x) & 0x0f) == 0;

                /* x <= width here, so the subtraction cannot wrap. */
                for (; width - x >= 64; x += 64)
                {
                        const __m128i x0 = _mm_stream_load_si128((__m128i*)(pSrc + x + 0));
                        const __m128i x1 = _mm_stream_load_si128((__m128i*)(pSrc + x + 16));
                        const __m128i x2 = _mm_stream_load_si128((__m128i*)(pSrc + x + 32));
                        const __m128i x3 = _mm_stream_load_si128((__m128i*)(pSrc + x + 48));

                        if (dstAligned)
                        {
                                _mm_store_si128((__m128i*)(pDest + x + 0), x0);
                                _mm_store_si128((__m128i*)(pDest + x + 16), x1);
                                _mm_store_si128((__m128i*)(pDest + x + 32), x2);
                                _mm_store_si128((__m128i*)(pDest + x + 48), x3);
                        }
                        else
                        {
                                _mm_storeu_si128((__m128i*)(pDest + x + 0), x0);
                                _mm_storeu_si128((__m128i*)(pDest + x + 16), x1);
                                _mm_storeu_si128((__m128i*)(pDest + x + 32), x2);
                                _mm_storeu_si128((__m128i*)(pDest + x + 48), x3);
                        }
                }

                /* Whatever is left over after the last full 64-byte block. */
                for (; x < width; ++x)
                        pDest[x] = pSrc[x];

                pDest += dstPitch;
                pSrc += srcPitch;
        }
}

/*
 * Copy a width x height rectangle of bytes from a 16-byte aligned source
 * to pDest, 64 bytes at a time through four XMM values, with a byte-wise
 * tail for the remainder of each row.
 *
 *   pDest     destination; rows may start at any alignment
 *   dstPitch  distance in bytes between consecutive destination rows
 *   pSrc      source; must be 16-byte aligned on entry (asserted)
 *   srcPitch  distance in bytes between consecutive source rows; rows are
 *             read with aligned loads, so for width >= 64 every row start
 *             has to stay 16-byte aligned
 *   width     number of bytes to copy per row
 *   height    number of rows
 */
static void copy_2d(uint8_t *pDest, uint32_t dstPitch,
        uint8_t *pSrc, uint32_t srcPitch,
        uint32_t width, uint32_t height)
{
        assert(((uintptr_t)pSrc & 0x0f) == 0);

        /* Full fence ahead of the copy (kept from the original). */
        _mm_mfence();

        for (uint32_t y = 0; y < height; ++y)
        {
                /* Blocks start at x == 0 and advance by 64, so the row's
                 * own alignment decides which store flavour is legal. */
                const int dstAligned = ((uintptr_t)pDest & 0x0f) == 0;
                uint32_t x = 0;

                /* x <= width here, so the subtraction cannot wrap. */
                for (; width - x >= 64; x += 64)
                {
                        const __m128i x0 = _mm_load_si128((__m128i*)(pSrc + x + 0));
                        const __m128i x1 = _mm_load_si128((__m128i*)(pSrc + x + 16));
                        const __m128i x2 = _mm_load_si128((__m128i*)(pSrc + x + 32));
                        const __m128i x3 = _mm_load_si128((__m128i*)(pSrc + x + 48));

                        if (dstAligned)
                        {
                                _mm_store_si128((__m128i*)(pDest + x + 0), x0);
                                _mm_store_si128((__m128i*)(pDest + x + 16), x1);
                                _mm_store_si128((__m128i*)(pDest + x + 32), x2);
                                _mm_store_si128((__m128i*)(pDest + x + 48), x3);
                        }
                        else
                        {
                                _mm_storeu_si128((__m128i*)(pDest + x + 0), x0);
                                _mm_storeu_si128((__m128i*)(pDest + x + 16), x1);
                                _mm_storeu_si128((__m128i*)(pDest + x + 32), x2);
                                _mm_storeu_si128((__m128i*)(pDest + x + 48), x3);
                        }
                }

                /* Whatever is left over after the last full 64-byte block. */
                for (; x < width; ++x)
                        pDest[x] = pSrc[x];

                pDest += dstPitch;
                pSrc += srcPitch;
        }
}

/*
 * Copy a width x height rectangle of bytes from pSrc to pDest by bouncing
 * it through the thread-local cacheBlock scratch buffer: copy_from_uswc()
 * stages a band of rows in the buffer, then copy_2d() writes that band to
 * the destination.
 *
 *   pDest      destination
 *   dst_pitch  distance in bytes between consecutive destination rows
 *   pSrc       source
 *   src_pitch  distance in bytes between consecutive source rows
 *   width      number of bytes to copy per row
 *   height     number of rows
 *
 * Rows wider than CACHED_BUFFER_SIZE are processed as several column
 * strips of at most CACHED_BUFFER_SIZE bytes each, so any width works;
 * a width or height of zero copies nothing.
 */
void copy_frame_sse(uint8_t *pDest, uint32_t dst_pitch,
        uint8_t *pSrc, uint32_t src_pitch,
        uint32_t width, uint32_t height)
{
        /* First 16-byte aligned address inside cacheBlock; the array
         * carries 15 spare bytes for exactly this adjustment. */
        uint8_t *cache = cacheBlock;
        cache += (16 - ((uintptr_t)cache & 0x0f)) & 0x0f;

        uint32_t done = 0;      /* columns already copied */
        while (done < width)
        {
                uint32_t strip = width - done;
                if (strip > CACHED_BUFFER_SIZE)
                        strip = CACHED_BUFFER_SIZE;

                /* Scratch rows are padded to a multiple of 16 bytes so that
                 * every staged row starts 16-byte aligned. */
                const uint32_t w16 = (strip + 15) & ~(uint32_t)15;
                const uint32_t hstep = CACHED_BUFFER_SIZE / w16;
                assert(hstep > 0);

                uint8_t *src = pSrc + done;
                uint8_t *dst = pDest + done;

                for (uint32_t y = 0; y < height; )
                {
                        uint32_t hblock = height - y;
                        if (hblock > hstep)
                                hblock = hstep;

                        copy_from_uswc(cache, w16,
                                src, src_pitch,
                                strip, hblock);

                        copy_2d(dst, dst_pitch,
                                cache, w16,
                                strip, hblock);

                        /* Widen before multiplying so large pitches cannot
                         * wrap in 32 bits. */
                        src += (size_t)src_pitch * hblock;
                        dst += (size_t)dst_pitch * hblock;
                        y += hblock;
                }

                done += strip;
        }

        _mm_mfence();
}

	#include <assert.h>
	#include <stdint.h>
	#include <smmintrin.h>
	#include <stdlib.h>
	#include <stdio.h>
	#include <time.h>
	#include <string.h>

	/*
	 * Usable size, in bytes, of the per-thread scratch buffer declared below.
	 * copy_frame_sse() divides it by the padded row width to decide how many
	 * rows it stages per pass.
	 */
	#define CACHED_BUFFER_SIZE 8192

	/* TLS: compiler-specific spelling of the thread-local storage qualifier. */
	#ifdef _MSC_VER
	#define TLS __declspec(thread)
	#else
	#define TLS __thread
	#endif

	/*
	 * Per-thread scratch buffer. The 15 extra bytes let copy_frame_sse() skip
	 * forward to the first 16-byte aligned address inside the array and still
	 * have CACHED_BUFFER_SIZE bytes available after it.
	 */
	static TLS uint8_t cacheBlock[CACHED_BUFFER_SIZE + 15];

	/*
	 * Copy a width x height rectangle of bytes from pSrc to pDest, reading the
	 * bulk of every row with SSE4.1 streaming loads (_mm_stream_load_si128).
	 * Going by the function name, pSrc is presumably uncached write-combining
	 * (USWC) memory -- confirm against the callers.
	 *
	 *   pDest     destination; must be 16-byte aligned on entry (asserted)
	 *   dstPitch  distance in bytes between consecutive destination rows
	 *   pSrc      source; rows may start at any alignment
	 *   srcPitch  distance in bytes between consecutive source rows
	 *   width     number of bytes to copy per row
	 *   height    number of rows
	 *
	 * Every row is handled in three parts: a byte-wise head that advances the
	 * source to the next 16-byte boundary, 64-byte blocks moved through four
	 * XMM values, and a byte-wise tail.
	 */
	#ifdef __GNUC__
	/* Allows the SSE4.1 intrinsic below even when the rest of the translation
	 * unit is built without -msse4.1. */
	__attribute__((target("sse4.1")))
	#endif
	static void copy_from_uswc(uint8_t *pDest, uint32_t dstPitch,
		uint8_t *pSrc, uint32_t srcPitch,
		uint32_t width, uint32_t height)
	{
		assert(((uintptr_t)pDest & 0x0f) == 0);

		/* Full fence ahead of the streaming loads (kept from the original). */
		_mm_mfence();

		for (uint32_t y = 0; y < height; ++y)
		{
			/*
			 * Number of leading bytes to copy one at a time so that
			 * pSrc + x lands on a 16-byte boundary, which the streaming
			 * load requires. Clamped so a narrow row is never overrun.
			 */
			uint32_t head = (uint32_t)((16 - ((uintptr_t)pSrc & 0x0f)) & 0x0f);
			if (head > width)
				head = width;

			uint32_t x = 0;
			for (; x < head; ++x)
				pDest[x] = pSrc[x];

			/* Aligned stores are only legal if the destination is
			 * 16-byte aligned at the point where the blocks start. */
			const int dstAligned = ((uintptr_t)(pDest + x) & 0x0f) == 0;

			/* x <= width here, so the subtraction cannot wrap. */
			for (; width - x >= 64; x += 64)
			{
				const __m128i x0 = _mm_stream_load_si128((__m128i*)(pSrc + x + 0));
				const __m128i x1 = _mm_stream_load_si128((__m128i*)(pSrc + x + 16));
				const __m128i x2 = _mm_stream_load_si128((__m128i*)(pSrc + x + 32));
				const __m128i x3 = _mm_stream_load_si128((__m128i*)(pSrc + x + 48));

				if (dstAligned)
				{
					_mm_store_si128((__m128i*)(pDest + x + 0), x0);
					_mm_store_si128((__m128i*)(pDest + x + 16), x1);
					_mm_store_si128((__m128i*)(pDest + x + 32), x2);
					_mm_store_si128((__m128i*)(pDest + x + 48), x3);
				}
				else
				{
					_mm_storeu_si128((__m128i*)(pDest + x + 0), x0);
					_mm_storeu_si128((__m128i*)(pDest + x + 16), x1);
					_mm_storeu_si128((__m128i*)(pDest + x + 32), x2);
					_mm_storeu_si128((__m128i*)(pDest + x + 48), x3);
				}
			}

			/* Whatever is left over after the last full 64-byte block. */
			for (; x < width; ++x)
				pDest[x] = pSrc[x];

			pDest += dstPitch;
			pSrc += srcPitch;
		}
	}

	/*
	 * Copy a width x height rectangle of bytes from a 16-byte aligned source
	 * to pDest, 64 bytes at a time through four XMM values, with a byte-wise
	 * tail for the remainder of each row.
	 *
	 *   pDest     destination; rows may start at any alignment
	 *   dstPitch  distance in bytes between consecutive destination rows
	 *   pSrc      source; must be 16-byte aligned on entry (asserted)
	 *   srcPitch  distance in bytes between consecutive source rows; rows are
	 *             read with aligned loads, so for width >= 64 every row start
	 *             has to stay 16-byte aligned
	 *   width     number of bytes to copy per row
	 *   height    number of rows
	 */
	static void copy_2d(uint8_t *pDest, uint32_t dstPitch,
		uint8_t *pSrc, uint32_t srcPitch,
		uint32_t width, uint32_t height)
	{
		assert(((uintptr_t)pSrc & 0x0f) == 0);

		/* Full fence ahead of the copy (kept from the original). */
		_mm_mfence();

		for (uint32_t y = 0; y < height; ++y)
		{
			/* Blocks start at x == 0 and advance by 64, so the row's
			 * own alignment decides which store flavour is legal. */
			const int dstAligned = ((uintptr_t)pDest & 0x0f) == 0;
			uint32_t x = 0;

			/* x <= width here, so the subtraction cannot wrap. */
			for (; width - x >= 64; x += 64)
			{
				const __m128i x0 = _mm_load_si128((__m128i*)(pSrc + x + 0));
				const __m128i x1 = _mm_load_si128((__m128i*)(pSrc + x + 16));
				const __m128i x2 = _mm_load_si128((__m128i*)(pSrc + x + 32));
				const __m128i x3 = _mm_load_si128((__m128i*)(pSrc + x + 48));

				if (dstAligned)
				{
					_mm_store_si128((__m128i*)(pDest + x + 0), x0);
					_mm_store_si128((__m128i*)(pDest + x + 16), x1);
					_mm_store_si128((__m128i*)(pDest + x + 32), x2);
					_mm_store_si128((__m128i*)(pDest + x + 48), x3);
				}
				else
				{
					_mm_storeu_si128((__m128i*)(pDest + x + 0), x0);
					_mm_storeu_si128((__m128i*)(pDest + x + 16), x1);
					_mm_storeu_si128((__m128i*)(pDest + x + 32), x2);
					_mm_storeu_si128((__m128i*)(pDest + x + 48), x3);
				}
			}

			/* Whatever is left over after the last full 64-byte block. */
			for (; x < width; ++x)
				pDest[x] = pSrc[x];

			pDest += dstPitch;
			pSrc += srcPitch;
		}
	}

	/*
	 * Copy a width x height rectangle of bytes from pSrc to pDest by bouncing
	 * it through the thread-local cacheBlock scratch buffer: copy_from_uswc()
	 * stages a band of rows in the buffer, then copy_2d() writes that band to
	 * the destination.
	 *
	 *   pDest      destination
	 *   dst_pitch  distance in bytes between consecutive destination rows
	 *   pSrc       source
	 *   src_pitch  distance in bytes between consecutive source rows
	 *   width      number of bytes to copy per row
	 *   height     number of rows
	 *
	 * Rows wider than CACHED_BUFFER_SIZE are processed as several column
	 * strips of at most CACHED_BUFFER_SIZE bytes each, so any width works;
	 * a width or height of zero copies nothing.
	 */
	void copy_frame_sse(uint8_t *pDest, uint32_t dst_pitch,
		uint8_t *pSrc, uint32_t src_pitch,
		uint32_t width, uint32_t height)
	{
		/* First 16-byte aligned address inside cacheBlock; the array
		 * carries 15 spare bytes for exactly this adjustment. */
		uint8_t *cache = cacheBlock;
		cache += (16 - ((uintptr_t)cache & 0x0f)) & 0x0f;

		uint32_t done = 0;	/* columns already copied */
		while (done < width)
		{
			uint32_t strip = width - done;
			if (strip > CACHED_BUFFER_SIZE)
				strip = CACHED_BUFFER_SIZE;

			/* Scratch rows are padded to a multiple of 16 bytes so that
			 * every staged row starts 16-byte aligned. */
			const uint32_t w16 = (strip + 15) & ~(uint32_t)15;
			const uint32_t hstep = CACHED_BUFFER_SIZE / w16;
			assert(hstep > 0);

			uint8_t *src = pSrc + done;
			uint8_t *dst = pDest + done;

			for (uint32_t y = 0; y < height; )
			{
				uint32_t hblock = height - y;
				if (hblock > hstep)
					hblock = hstep;

				copy_from_uswc(cache, w16,
					src, src_pitch,
					strip, hblock);

				copy_2d(dst, dst_pitch,
					cache, w16,
					strip, hblock);

				/* Widen before multiplying so large pitches cannot
				 * wrap in 32 bits. */
				src += (size_t)src_pitch * hblock;
				dst += (size_t)dst_pitch * hblock;
				y += hblock;
			}

			done += strip;
		}

		_mm_mfence();
	}