Skip to content

Instantly share code, notes, and snippets.

@BtbN
Created August 19, 2016 18:26
Show Gist options
  • Save BtbN/35f0e54489d5494628405100b389fe93 to your computer and use it in GitHub Desktop.
Save BtbN/35f0e54489d5494628405100b389fe93 to your computer and use it in GitHub Desktop.
stdin
#include <assert.h>
#include <stdint.h>
#include <smmintrin.h>
#include <stdlib.h>
#include <stdio.h>
#include <time.h>
#include <string.h>
/* Capacity in bytes of the per-thread staging buffer declared below. */
#define CACHED_BUFFER_SIZE 8192
/* TLS expands to the thread-local storage specifier of the compiler in use. */
#ifdef _MSC_VER
#define TLS __declspec(thread)
#else
#define TLS __thread
#endif
/* Per-thread staging buffer. The 15 extra bytes leave room for the user to
 * round the start address up to a 16-byte boundary and still have
 * CACHED_BUFFER_SIZE usable bytes. */
static TLS uint8_t cacheBlock[CACHED_BUFFER_SIZE + 15];
/* Allow this one function to use SSE4.1 (MOVNTDQA) even when the translation
 * unit is not compiled with -msse4.1; expands to nothing on other compilers. */
#ifndef USWC_TARGET_SSE41
#if defined(__GNUC__) || defined(__clang__)
#define USWC_TARGET_SSE41 __attribute__((target("sse4.1")))
#else
#define USWC_TARGET_SSE41
#endif
#endif

/**
 * Copy a width x height rectangle of bytes from pSrc to pDest, reading the
 * bulk of each row with 16-byte streaming loads (_mm_stream_load_si128).
 * Judging by the name, the source is expected to be write-combined (USWC)
 * memory, but any readable memory works.
 *
 * @param pDest     destination; must be 16-byte aligned (asserted)
 * @param dstPitch  bytes between the starts of consecutive destination rows
 * @param pSrc      source; any alignment
 * @param srcPitch  bytes between the starts of consecutive source rows
 * @param width     bytes to copy per row
 * @param height    number of rows
 */
static USWC_TARGET_SSE41 void copy_from_uswc(uint8_t *pDest, uint32_t dstPitch,
                                             uint8_t *pSrc, uint32_t srcPitch,
                                             uint32_t width, uint32_t height)
{
    assert(((uintptr_t)pDest & 0x0f) == 0);
    _mm_mfence();

    for (uint32_t y = 0; y < height; ++y) {
        /* Streaming loads need a 16-byte aligned address, so first copy the
         * bytes that lead up to the next boundary: (16 - misalignment) mod 16,
         * never more than the row itself holds. */
        uint32_t head = (uint32_t)((16u - ((uintptr_t)pSrc & 0x0f)) & 0x0f);
        if (head > width)
            head = width;

        uint32_t x = 0;
        for (; x < head; ++x)
            pDest[x] = pSrc[x];

        /* Decide on aligned vs. unaligned stores from the actual destination
         * address of this row, not from the source alignment. */
        const int dst_aligned = (((uintptr_t)&pDest[x]) & 0x0f) == 0;

        /* 64 bytes per iteration; "width - x" cannot wrap because x <= width. */
        for (; width - x >= 64; x += 64) {
            __m128i *in = (__m128i *)&pSrc[x];
            __m128i *out = (__m128i *)&pDest[x];
            const __m128i v0 = _mm_stream_load_si128(in + 0);
            const __m128i v1 = _mm_stream_load_si128(in + 1);
            const __m128i v2 = _mm_stream_load_si128(in + 2);
            const __m128i v3 = _mm_stream_load_si128(in + 3);

            if (dst_aligned) {
                _mm_store_si128(out + 0, v0);
                _mm_store_si128(out + 1, v1);
                _mm_store_si128(out + 2, v2);
                _mm_store_si128(out + 3, v3);
            } else {
                _mm_storeu_si128(out + 0, v0);
                _mm_storeu_si128(out + 1, v1);
                _mm_storeu_si128(out + 2, v2);
                _mm_storeu_si128(out + 3, v3);
            }
        }

        /* Remaining tail of the row, byte by byte. */
        for (; x < width; ++x)
            pDest[x] = pSrc[x];

        pDest += dstPitch;
        pSrc += srcPitch;
    }
}
/**
 * Copy a width x height rectangle of bytes from an aligned source to an
 * arbitrarily aligned destination, 64 bytes at a time with SSE2 loads/stores.
 *
 * @param pDest     destination; any alignment (checked per row)
 * @param dstPitch  bytes between the starts of consecutive destination rows
 * @param pSrc      source; must be 16-byte aligned (asserted). Every row that
 *                  is at least 64 bytes wide is read with _mm_load_si128, so
 *                  srcPitch should keep later rows 16-byte aligned as well.
 * @param srcPitch  bytes between the starts of consecutive source rows
 * @param width     bytes to copy per row
 * @param height    number of rows
 */
static void copy_2d(uint8_t *pDest, uint32_t dstPitch,
                    uint8_t *pSrc, uint32_t srcPitch,
                    uint32_t width, uint32_t height)
{
    assert(((uintptr_t)pSrc & 0x0f) == 0);
    _mm_mfence();

    for (uint32_t y = 0; y < height; ++y) {
        const int dst_aligned = ((uintptr_t)pDest & 0x0f) == 0;
        uint32_t x = 0;

        /* "width - x" cannot wrap because x never exceeds width. */
        for (; width - x >= 64; x += 64) {
            const __m128i *in = (const __m128i *)&pSrc[x];
            __m128i *out = (__m128i *)&pDest[x];
            const __m128i v0 = _mm_load_si128(in + 0);
            const __m128i v1 = _mm_load_si128(in + 1);
            const __m128i v2 = _mm_load_si128(in + 2);
            const __m128i v3 = _mm_load_si128(in + 3);

            if (dst_aligned) {
                _mm_store_si128(out + 0, v0);
                _mm_store_si128(out + 1, v1);
                _mm_store_si128(out + 2, v2);
                _mm_store_si128(out + 3, v3);
            } else {
                _mm_storeu_si128(out + 0, v0);
                _mm_storeu_si128(out + 1, v1);
                _mm_storeu_si128(out + 2, v2);
                _mm_storeu_si128(out + 3, v3);
            }
        }

        /* Remaining tail of the row, byte by byte. */
        for (; x < width; ++x)
            pDest[x] = pSrc[x];

        pDest += dstPitch;
        pSrc += srcPitch;
    }
}
/**
 * Copy a width x height byte rectangle from pSrc to pDest in two stages:
 * blocks of rows are first pulled into a small per-thread buffer with
 * copy_from_uswc() and then written out with copy_2d().
 *
 * Rows wider than CACHED_BUFFER_SIZE are processed in vertical strips of at
 * most CACHED_BUFFER_SIZE bytes, so at least one row always fits in the
 * buffer. Empty rectangles (width or height of 0) are a no-op.
 *
 * @param pDest      destination frame
 * @param dst_pitch  bytes between the starts of consecutive destination rows
 * @param pSrc       source frame
 * @param src_pitch  bytes between the starts of consecutive source rows
 * @param width      bytes to copy per row
 * @param height     number of rows
 */
void copy_frame_sse(uint8_t *pDest, uint32_t dst_pitch,
                    uint8_t *pSrc, uint32_t src_pitch,
                    uint32_t width, uint32_t height)
{
    if (width == 0 || height == 0)
        return;

    /* Round the buffer start up to the next 16-byte boundary; cacheBlock
     * carries 15 spare bytes for exactly this purpose. */
    uint8_t *cache = cacheBlock + ((16u - ((uintptr_t)cacheBlock & 0x0f)) & 0x0f);

    for (uint32_t x0 = 0; x0 < width; ) {
        uint32_t strip = width - x0;
        if (strip > CACHED_BUFFER_SIZE)
            strip = CACHED_BUFFER_SIZE;

        /* Row stride inside the buffer, padded to 16 bytes so every buffered
         * row starts aligned. strip <= CACHED_BUFFER_SIZE guarantees
         * hstep >= 1. */
        const uint32_t w16 = (strip + 15u) & ~15u;
        const uint32_t hstep = CACHED_BUFFER_SIZE / w16;

        uint8_t *src_rows = pSrc + x0;
        uint8_t *dst_rows = pDest + x0;
        uint32_t rows_left = height;

        while (rows_left > 0) {
            const uint32_t hblock = rows_left < hstep ? rows_left : hstep;

            copy_from_uswc(cache, w16,
                           src_rows, src_pitch,
                           strip, hblock);
            copy_2d(dst_rows, dst_pitch,
                    cache, w16,
                    strip, hblock);

            /* Widen before multiplying so large frames do not wrap at 32 bits. */
            src_rows += (size_t)src_pitch * hblock;
            dst_rows += (size_t)dst_pitch * hblock;
            rows_left -= hblock;
        }

        x0 += strip;
    }

    _mm_mfence();
}
/**
 * Plain row-by-row plane copy built on memcpy().
 *
 * Copies `height` rows of `bytewidth` bytes each. After every row the
 * destination cursor advances by dst_linesize bytes and the source cursor by
 * src_linesize bytes. Nothing is copied when height is zero or negative.
 */
void av_image_copy_plane(uint8_t *dst, int dst_linesize,
                         const uint8_t *src, int src_linesize,
                         int bytewidth, int height)
{
    uint8_t *out_row = dst;
    const uint8_t *in_row = src;
    int rows_left = height;

    while (rows_left > 0) {
        memcpy(out_row, in_row, bytewidth);
        out_row += dst_linesize;
        in_row += src_linesize;
        rows_left--;
    }
}
#include <assert.h>
#include <stdint.h>
#include <smmintrin.h>
#include <stdlib.h>
#include <stdio.h>
#include <time.h>
#include <string.h>
/* Prototypes of the two copy routines being benchmarked. The parameter types
 * must be identical to those of the definitions: calling a function through
 * an incompatible declaration is undefined behaviour. */
void copy_frame_sse(uint8_t *pDest, uint32_t dst_pitch,
                    uint8_t *pSrc, uint32_t src_pitch,
                    uint32_t width, uint32_t height);
void av_image_copy_plane(uint8_t *dst, int dst_linesize,
                         const uint8_t *src, int src_linesize,
                         int bytewidth, int height);
/* Convert a clock() interval to milliseconds. The subtraction is done on the
 * clock_t values themselves so no precision is lost before the conversion. */
static double elapsed_ms(clock_t start, clock_t end)
{
    return (double)(end - start) * 1000.0 / CLOCKS_PER_SEC;
}

/**
 * Benchmark: five rounds, each timing 100000 copies of a 1920x1080 byte plane
 * (source pitch 2048) with copy_frame_sse() and then with
 * av_image_copy_plane(), printing the CPU time of each in milliseconds.
 *
 * Returns 0 on success, 1 if a buffer could not be allocated.
 */
int main(void)
{
    enum {
        SRC_PITCH = 2048,
        FRAME_WIDTH = 1920,
        FRAME_HEIGHT = 1080,
        ITERATIONS = 100000,
        ROUNDS = 5
    };
    const size_t src_size = (size_t)SRC_PITCH * FRAME_HEIGHT;
    const size_t dst_size = (size_t)FRAME_WIDTH * FRAME_HEIGHT;

    /* posix_memalign() wants a void **; go through real void * objects
     * instead of casting the address of a uint8_t *. */
    void *src_mem = NULL;
    void *dst_mem = NULL;
    if (posix_memalign(&src_mem, 16, src_size) != 0) {
        fprintf(stderr, "failed to allocate source buffer\n");
        return 1;
    }
    if (posix_memalign(&dst_mem, 16, dst_size) != 0) {
        fprintf(stderr, "failed to allocate destination buffer\n");
        free(src_mem);
        return 1;
    }
    uint8_t *dummySource = src_mem;
    uint8_t *dummyDestination = dst_mem;

    /* Give the source defined contents so the copies never read
     * uninitialised memory. */
    memset(dummySource, 0, src_size);

    for (int j = 0; j < ROUNDS; ++j) {
        clock_t startTime = clock();
        for (uint32_t i = 0; i < ITERATIONS; ++i)
            copy_frame_sse(dummyDestination, FRAME_WIDTH, dummySource, SRC_PITCH,
                           FRAME_WIDTH, FRAME_HEIGHT);
        clock_t endTime = clock();
        printf("Milliseconds for sse copy: %f\n", elapsed_ms(startTime, endTime));

        startTime = clock();
        for (uint32_t i = 0; i < ITERATIONS; ++i)
            av_image_copy_plane(dummyDestination, FRAME_WIDTH, dummySource, SRC_PITCH,
                                FRAME_WIDTH, FRAME_HEIGHT);
        endTime = clock();
        printf("Milliseconds for classic copy: %f\n", elapsed_ms(startTime, endTime));
    }

    free(dummyDestination);
    free(dummySource);
    /* The former system("pause") was dropped: "pause" is a Windows shell
     * built-in, while this program needs POSIX posix_memalign() to build. */
    return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment