Skip to content

Instantly share code, notes, and snippets.

@BtbN
Created August 19, 2016 19:18
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save BtbN/56be86b002a21db35de5a4b66f78c483 to your computer and use it in GitHub Desktop.
stdin
diff --git a/libavutil/imgutils.c b/libavutil/imgutils.c
index 37808e5..96e94bf 100644
--- a/libavutil/imgutils.c
+++ b/libavutil/imgutils.c
@@ -31,6 +31,8 @@
#include "pixdesc.h"
#include "rational.h"
+#include <smmintrin.h>
+
void av_image_fill_max_pixsteps(int max_pixsteps[4], int max_pixstep_comps[4],
const AVPixFmtDescriptor *pixdesc)
{
@@ -284,6 +286,58 @@ int av_image_check_sar(unsigned int w, unsigned int h, AVRational sar)
return AVERROR(EINVAL);
}
+/**
+ * Copy a plane row by row, reading the source with SSE4.1 streaming loads.
+ *
+ * Going by the name, the source is presumably USWC (uncacheable
+ * write-combining) memory -- TODO confirm against the callers.
+ *
+ * Each row is copied in three phases: a bytewise head until the source
+ * pointer reaches a 16-byte boundary (required by _mm_stream_load_si128),
+ * 64-byte chunks using streaming loads, and a bytewise tail.
+ *
+ * NOTE(review): nothing here checks that the CPU supports SSE4.1; the
+ * caller has to guarantee that.
+ *
+ * @param pDest    first row of the destination plane
+ * @param dstPitch byte distance between destination rows; may be negative
+ * @param pSrc     first row of the source plane
+ * @param srcPitch byte distance between source rows; may be negative
+ * @param width    number of bytes to copy per row
+ * @param height   number of rows to copy
+ */
+static void copy_from_uswc(uint8_t *pDest, int dstPitch,
+                           const uint8_t *pSrc, int srcPitch,
+                           uint32_t width, uint32_t height)
+{
+    uint32_t y;
+
+    /* Single full fence before the first streaming load. */
+    _mm_mfence();
+
+    for (y = 0; y < height; ++y) {
+        /* Bytes needed to advance pSrc to the next 16-byte boundary
+         * (0 if it is already aligned), never more than the row itself. */
+        uint32_t head = (uint32_t)((16 - ((uintptr_t)pSrc & 0x0f)) & 0x0f);
+        uint32_t x = 0;
+        int dst_aligned;
+
+        if (head > width)
+            head = width;
+
+        for (; x < head; ++x)
+            pDest[x] = pSrc[x];
+
+        /* x only advances in multiples of 64 below, so the destination
+         * alignment observed here holds for every chunk of this row. */
+        dst_aligned = !((uintptr_t)(pDest + x) & 0x0f);
+
+        for (; x + 63 < width; x += 64) {
+            const __m128i x0 = _mm_stream_load_si128((__m128i *)(pSrc + x +  0));
+            const __m128i x1 = _mm_stream_load_si128((__m128i *)(pSrc + x + 16));
+            const __m128i x2 = _mm_stream_load_si128((__m128i *)(pSrc + x + 32));
+            const __m128i x3 = _mm_stream_load_si128((__m128i *)(pSrc + x + 48));
+
+            if (dst_aligned) {
+                _mm_store_si128((__m128i *)(pDest + x +  0), x0);
+                _mm_store_si128((__m128i *)(pDest + x + 16), x1);
+                _mm_store_si128((__m128i *)(pDest + x + 32), x2);
+                _mm_store_si128((__m128i *)(pDest + x + 48), x3);
+            } else {
+                _mm_storeu_si128((__m128i *)(pDest + x +  0), x0);
+                _mm_storeu_si128((__m128i *)(pDest + x + 16), x1);
+                _mm_storeu_si128((__m128i *)(pDest + x + 32), x2);
+                _mm_storeu_si128((__m128i *)(pDest + x + 48), x3);
+            }
+        }
+
+        /* Whatever is left after the last full 64-byte chunk. */
+        for (; x < width; ++x)
+            pDest[x] = pSrc[x];
+
+        /* Signed pitches so that bottom-up (negative stride) planes work. */
+        pDest += dstPitch;
+        pSrc  += srcPitch;
+    }
+}
+
void av_image_copy_plane(uint8_t *dst, int dst_linesize,
const uint8_t *src, int src_linesize,
int bytewidth, int height)
@@ -292,11 +346,15 @@ void av_image_copy_plane(uint8_t *dst, int dst_linesize,
return;
av_assert0(abs(src_linesize) >= bytewidth);
av_assert0(abs(dst_linesize) >= bytewidth);
+#if 1
+ copy_from_uswc(dst, dst_linesize, src, src_linesize, bytewidth, height);
+#else
for (;height > 0; height--) {
memcpy(dst, src, bytewidth);
dst += dst_linesize;
src += src_linesize;
}
+#endif
}
void av_image_copy(uint8_t *dst_data[4], int dst_linesizes[4],
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment