Created
August 19, 2016 18:26
-
-
Save BtbN/35f0e54489d5494628405100b389fe93 to your computer and use it in GitHub Desktop.
stdin
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <assert.h> | |
#include <stdint.h> | |
#include <smmintrin.h> | |
#include <stdlib.h> | |
#include <stdio.h> | |
#include <time.h> | |
#include <string.h> | |
/* Usable size, in bytes, of the per-thread bounce buffer declared below. */
#define CACHED_BUFFER_SIZE 8192

/* Thread-local storage qualifier: MSVC spelling vs. the GNU-style keyword. */
#ifdef _MSC_VER
#define TLS __declspec(thread)
#else
#define TLS __thread
#endif

/*
 * Per-thread bounce buffer. The 15 extra bytes leave room for a user to round
 * the start address up to the next 16-byte boundary while still having
 * CACHED_BUFFER_SIZE usable bytes behind it.
 */
static TLS uint8_t cacheBlock[CACHED_BUFFER_SIZE + 15];
/*
 * Copy a width x height rectangle of bytes from pSrc to pDest, reading the
 * source with SSE4.1 streaming (non-temporal) loads.
 *
 * Going by its name, the source is presumably uncached write-combining
 * (USWC) memory -- confirm against the caller.
 *
 * pDest    destination; the first row must be 16-byte aligned (asserted)
 * dstPitch byte distance between consecutive destination rows
 * pSrc     source; any alignment is accepted
 * srcPitch byte distance between consecutive source rows
 * width    number of bytes to copy per row
 * height   number of rows
 *
 * Each row is handled in three steps: a byte-wise head that brings the source
 * pointer up to a 16-byte boundary (the streaming load requires an aligned
 * address), a main loop moving 64 bytes per iteration, and a byte-wise tail.
 */
#if defined(__GNUC__) && !defined(__SSE4_1__)
/* Lets this one function use SSE4.1 even when the file is built without
 * -msse4.1; the caller is still responsible for running on a capable CPU. */
__attribute__((target("sse4.1")))
#endif
static void copy_from_uswc(uint8_t *pDest, uint32_t dstPitch,
                           uint8_t *pSrc, uint32_t srcPitch,
                           uint32_t width, uint32_t height)
{
    assert(((uintptr_t)pDest & 0x0f) == 0);

    _mm_mfence();

    for (uint32_t y = 0; y < height; ++y)
    {
        /* Bytes needed to REACH the next 16-byte boundary of the source
         * (0 when already aligned), never more than the row itself. */
        uint32_t head = (uint32_t)((16u - ((uintptr_t)pSrc & 0x0f)) & 0x0f);
        if (head > width)
            head = width;

        uint32_t x = 0;
        for (; x < head; ++x)
            pDest[x] = pSrc[x];

        /* x advances in multiples of 64 from here on, so the destination
         * alignment observed now holds for the whole main loop. */
        const int dstAligned = (((uintptr_t)(pDest + x)) & 0x0f) == 0;

        /* x <= width here, so "width - x" cannot wrap (unlike "x + 63"). */
        for (; width - x >= 64; x += 64)
        {
            const __m128i x0 = _mm_stream_load_si128((__m128i *)(pSrc + x + 0));
            const __m128i x1 = _mm_stream_load_si128((__m128i *)(pSrc + x + 16));
            const __m128i x2 = _mm_stream_load_si128((__m128i *)(pSrc + x + 32));
            const __m128i x3 = _mm_stream_load_si128((__m128i *)(pSrc + x + 48));

            if (dstAligned)
            {
                _mm_store_si128((__m128i *)(pDest + x + 0), x0);
                _mm_store_si128((__m128i *)(pDest + x + 16), x1);
                _mm_store_si128((__m128i *)(pDest + x + 32), x2);
                _mm_store_si128((__m128i *)(pDest + x + 48), x3);
            }
            else
            {
                _mm_storeu_si128((__m128i *)(pDest + x + 0), x0);
                _mm_storeu_si128((__m128i *)(pDest + x + 16), x1);
                _mm_storeu_si128((__m128i *)(pDest + x + 32), x2);
                _mm_storeu_si128((__m128i *)(pDest + x + 48), x3);
            }
        }

        /* Tail: whatever is left after the last full 64-byte group. */
        for (; x < width; ++x)
            pDest[x] = pSrc[x];

        pDest += dstPitch;
        pSrc += srcPitch;
    }
}
/*
 * Copy a width x height rectangle of bytes from a 16-byte-aligned source to a
 * destination of arbitrary alignment, 64 bytes at a time with SSE2.
 *
 * pDest    destination; any alignment
 * dstPitch byte distance between consecutive destination rows
 * pSrc     source; the first row must be 16-byte aligned (asserted). Later
 *          rows are read with aligned loads too, so srcPitch has to keep them
 *          16-byte aligned whenever a row is at least 64 bytes wide.
 * srcPitch byte distance between consecutive source rows
 * width    number of bytes to copy per row
 * height   number of rows
 */
static void copy_2d(uint8_t *pDest, uint32_t dstPitch,
                    const uint8_t *pSrc, uint32_t srcPitch,
                    uint32_t width, uint32_t height)
{
    assert(((uintptr_t)pSrc & 0x0f) == 0);

    _mm_mfence();

    for (uint32_t y = 0; y < height; ++y)
    {
        /* The main loop starts at x == 0 and steps by 64, so the row's start
         * alignment decides between aligned and unaligned stores. */
        const int dstAligned = ((uintptr_t)pDest & 0x0f) == 0;
        uint32_t x = 0;

        /* x <= width here, so "width - x" cannot wrap (unlike "x + 63"). */
        for (; width - x >= 64; x += 64)
        {
            const __m128i x0 = _mm_load_si128((const __m128i *)(pSrc + x + 0));
            const __m128i x1 = _mm_load_si128((const __m128i *)(pSrc + x + 16));
            const __m128i x2 = _mm_load_si128((const __m128i *)(pSrc + x + 32));
            const __m128i x3 = _mm_load_si128((const __m128i *)(pSrc + x + 48));

            if (dstAligned)
            {
                _mm_store_si128((__m128i *)(pDest + x + 0), x0);
                _mm_store_si128((__m128i *)(pDest + x + 16), x1);
                _mm_store_si128((__m128i *)(pDest + x + 32), x2);
                _mm_store_si128((__m128i *)(pDest + x + 48), x3);
            }
            else
            {
                _mm_storeu_si128((__m128i *)(pDest + x + 0), x0);
                _mm_storeu_si128((__m128i *)(pDest + x + 16), x1);
                _mm_storeu_si128((__m128i *)(pDest + x + 32), x2);
                _mm_storeu_si128((__m128i *)(pDest + x + 48), x3);
            }
        }

        /* Tail: whatever is left after the last full 64-byte group. */
        for (; x < width; ++x)
            pDest[x] = pSrc[x];

        pDest += dstPitch;
        pSrc += srcPitch;
    }
}
void copy_frame_sse(uint8_t *pDest, uint32_t dst_pitch, | |
uint8_t *pSrc, uint32_t src_pitch, | |
uint32_t width, uint32_t height) | |
{ | |
const uint32_t w16 = (width + 15) & ~15; | |
const uint32_t hstep = CACHED_BUFFER_SIZE / w16; | |
assert(hstep > 0); | |
uint8_t *cache = cacheBlock; | |
while ((((uintptr_t)cache) & 0x0f) != 0) | |
cache += 1; | |
for (uint32_t y = 0; y < height; y += hstep) | |
{ | |
uint32_t hblock = hstep; | |
if (hblock > height - y) | |
hblock = height - y; | |
copy_from_uswc(cache, w16, | |
pSrc, src_pitch, | |
width, hblock); | |
copy_2d(pDest, dst_pitch, | |
cache, w16, | |
width, hblock); | |
pSrc += src_pitch * hblock; | |
pDest += dst_pitch * hblock; | |
} | |
_mm_mfence(); | |
} | |
/*
 * Reference plane copy: one memcpy() per row.
 *
 * dst          destination plane
 * dst_linesize byte step from one destination row to the next
 * src          source plane
 * src_linesize byte step from one source row to the next
 * bytewidth    number of bytes to copy per row
 * height       number of rows
 *
 * Does nothing when either pointer is NULL or when bytewidth or height is
 * not positive. A negative bytewidth in particular must never reach
 * memcpy(), where it would convert to an enormous size_t.
 */
void av_image_copy_plane(uint8_t *dst, int dst_linesize,
                         const uint8_t *src, int src_linesize,
                         int bytewidth, int height)
{
    if (dst == NULL || src == NULL || bytewidth <= 0)
        return;

    for (; height > 0; height--)
    {
        memcpy(dst, src, (size_t)bytewidth);
        dst += dst_linesize;
        src += src_linesize;
    }
}
#include <assert.h> | |
#include <stdint.h> | |
#include <smmintrin.h> | |
#include <stdlib.h> | |
#include <stdio.h> | |
#include <time.h> | |
#include <string.h> | |
/*
 * Prototypes of the two plane-copy routines exercised by the benchmark.
 * Parameter types mirror the definitions exactly: copy_frame_sse() takes
 * unsigned 32-bit pitches and sizes and a non-const source pointer.
 */
void copy_frame_sse(uint8_t *pDest, uint32_t dst_pitch,
                    uint8_t *pSrc, uint32_t src_pitch,
                    uint32_t width, uint32_t height);
void av_image_copy_plane(uint8_t *dst, int dst_linesize,
                         const uint8_t *src, int src_linesize,
                         int bytewidth, int height);
/*
 * Benchmark driver: times copy_frame_sse() against av_image_copy_plane() on a
 * 1920x1080 byte plane read from a buffer with a 2048-byte pitch. Five rounds
 * are run; each round performs 100000 copies per routine and prints the
 * elapsed CPU time in milliseconds.
 *
 * Returns 0 on success, EXIT_FAILURE if a buffer cannot be allocated.
 */
int main(int argc, char **argv)
{
    enum
    {
        SRC_PITCH = 2048,
        DST_PITCH = 1920,
        WIDTH = 1920,
        HEIGHT = 1080,
        ROUNDS = 5,
        ITERATIONS = 100000
    };

    (void)argc;
    (void)argv;

    /* posix_memalign() stores through a void **; go through real void *
     * objects rather than casting the address of a uint8_t *. */
    void *srcMem = NULL;
    void *dstMem = NULL;

    if (posix_memalign(&srcMem, 16, (size_t)SRC_PITCH * HEIGHT) != 0)
    {
        fprintf(stderr, "Failed to allocate source buffer\n");
        return EXIT_FAILURE;
    }
    if (posix_memalign(&dstMem, 16, (size_t)DST_PITCH * HEIGHT) != 0)
    {
        fprintf(stderr, "Failed to allocate destination buffer\n");
        free(srcMem);
        return EXIT_FAILURE;
    }

    uint8_t *dummySource = srcMem;
    uint8_t *dummyDestination = dstMem;

    /* Give the source defined contents before it is read. */
    memset(dummySource, 0x5a, (size_t)SRC_PITCH * HEIGHT);

    for (int j = 0; j < ROUNDS; ++j)
    {
        clock_t startTime = clock();
        for (uint32_t i = 0; i < ITERATIONS; ++i)
        {
            copy_frame_sse(dummyDestination, DST_PITCH, dummySource, SRC_PITCH, WIDTH, HEIGHT);
        }
        clock_t endTime = clock();

        /* Subtract in clock_t first, then convert: converting each stamp to
         * float separately discards low-order ticks on long runs. */
        double diff = (double)(endTime - startTime) * 1000.0 / CLOCKS_PER_SEC;
        printf("Milliseconds for sse copy: %f\n", diff);

        startTime = clock();
        for (uint32_t i = 0; i < ITERATIONS; ++i)
        {
            av_image_copy_plane(dummyDestination, DST_PITCH, dummySource, SRC_PITCH, WIDTH, HEIGHT);
        }
        endTime = clock();

        diff = (double)(endTime - startTime) * 1000.0 / CLOCKS_PER_SEC;
        printf("Milliseconds for classic copy: %f\n", diff);
    }

    free(dummyDestination);
    free(dummySource);

#ifdef _WIN32
    /* "pause" is a cmd.exe builtin; elsewhere the call only produces a shell
     * error, so it is limited to Windows. */
    system("pause");
#endif

    return 0;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment