Created
August 19, 2016 19:18
-
-
Save BtbN/56be86b002a21db35de5a4b66f78c483 to your computer and use it in GitHub Desktop.
stdin
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
diff --git a/libavutil/imgutils.c b/libavutil/imgutils.c | |
index 37808e5..96e94bf 100644 | |
--- a/libavutil/imgutils.c | |
+++ b/libavutil/imgutils.c | |
@@ -31,6 +31,8 @@ | |
#include "pixdesc.h" | |
#include "rational.h" | |
+#include <smmintrin.h> | |
+ | |
void av_image_fill_max_pixsteps(int max_pixsteps[4], int max_pixstep_comps[4], | |
const AVPixFmtDescriptor *pixdesc) | |
{ | |
@@ -284,6 +286,58 @@ int av_image_check_sar(unsigned int w, unsigned int h, AVRational sar) | |
return AVERROR(EINVAL); | |
} | |
+/* Copy a width x height byte plane from pSrc to pDest, fetching the source with | |
+ * SSE4.1 streaming loads (_mm_stream_load_si128), 64 bytes per iteration. | |
+ * Leading bytes up to the first 16-byte source boundary and the trailing | |
+ * remainder of each row are copied one byte at a time. | |
+ * dstPitch/srcPitch are per-row byte steps; the caller passes int linesizes, so | |
+ * they are read back as signed 32-bit values to keep negative steps working. */ | |
+static void copy_from_uswc(uint8_t *pDest, uint32_t dstPitch, | |
+                           const uint8_t *pSrc, uint32_t srcPitch, | |
+                           uint32_t width, uint32_t height) | |
+{ | |
+    /* Recover the sign hidden by the int -> uint32_t conversion at the call. */ | |
+    const intptr_t dst_step = (int32_t)dstPitch; | |
+    const intptr_t src_step = (int32_t)srcPitch; | |
+    uint32_t y; | |
+ | |
+    /* Fence retained as-is, issued once ahead of the streaming loads. */ | |
+    _mm_mfence(); | |
+ | |
+    for (y = 0; y < height; ++y) { | |
+        /* Bytes to copy singly so that pSrc + x lands on a 16-byte boundary, | |
+         * clamped so rows narrower than that are not overrun. */ | |
+        uint32_t head = (uint32_t)(-(uintptr_t)pSrc & 0x0f); | |
+        uint32_t x = 0; | |
+ | |
+        if (head > width) | |
+            head = width; | |
+        for (; x < head; ++x) | |
+            pDest[x] = pSrc[x]; | |
+ | |
+        /* x <= width at this point, so width - x cannot wrap around. */ | |
+        for (; width - x >= 64; x += 64) { | |
+            __m128i x0 = _mm_stream_load_si128((__m128i *)(pSrc + x +  0)); | |
+            __m128i x1 = _mm_stream_load_si128((__m128i *)(pSrc + x + 16)); | |
+            __m128i x2 = _mm_stream_load_si128((__m128i *)(pSrc + x + 32)); | |
+            __m128i x3 = _mm_stream_load_si128((__m128i *)(pSrc + x + 48)); | |
+ | |
+            /* pDest alignment is independent of pSrc, so store unaligned. */ | |
+            _mm_storeu_si128((__m128i *)(pDest + x +  0), x0); | |
+            _mm_storeu_si128((__m128i *)(pDest + x + 16), x1); | |
+            _mm_storeu_si128((__m128i *)(pDest + x + 32), x2); | |
+            _mm_storeu_si128((__m128i *)(pDest + x + 48), x3); | |
+        } | |
+ | |
+        /* Tail: whatever is left after the last full 64-byte group. */ | |
+        for (; x < width; ++x) | |
+            pDest[x] = pSrc[x]; | |
+ | |
+        pDest += dst_step; | |
+        pSrc  += src_step; | |
+    } | |
+} | |
+ | |
void av_image_copy_plane(uint8_t *dst, int dst_linesize, | |
const uint8_t *src, int src_linesize, | |
int bytewidth, int height) | |
@@ -292,11 +346,15 @@ void av_image_copy_plane(uint8_t *dst, int dst_linesize, | |
return; | |
av_assert0(abs(src_linesize) >= bytewidth); | |
av_assert0(abs(dst_linesize) >= bytewidth); | |
+#if 1 | |
+ copy_from_uswc(dst, dst_linesize, src, src_linesize, bytewidth, height); | |
+#else | |
for (;height > 0; height--) { | |
memcpy(dst, src, bytewidth); | |
dst += dst_linesize; | |
src += src_linesize; | |
} | |
+#endif | |
} | |
void av_image_copy(uint8_t *dst_data[4], int dst_linesizes[4], |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment