From 69503a3bc2874fc942ba237d5c437032eea86774 Mon Sep 17 00:00:00 2001
From: Tsahee Zidenberg <tsahee@annapurnalabs.com>
Date: Thu, 5 Nov 2015 15:07:17 +0200
Subject: [PATCH] libswscale: add neon implementations
This adds NEON optimizations (for both ARM and AArch64) to libswscale.
---
libswscale/neon/Makefile | 4 +
libswscale/neon/output_neon.c | 458 +++++++++++++++++++++
libswscale/neon/swscale_neon.c | 892 ++++++++++++++++++++++++++++++++++++++++
libswscale/output.c | 2 +-
libswscale/swscale.c | 8 +
libswscale/swscale_internal.h | 8 +
6 files changed, 1371 insertions(+), 1 deletion(-)
create mode 100644 libswscale/neon/Makefile
create mode 100644 libswscale/neon/output_neon.c
create mode 100644 libswscale/neon/swscale_neon.c
diff --git a/libswscale/neon/Makefile b/libswscale/neon/Makefile
new file mode 100644
index 0000000..d667c6d
--- /dev/null
+++ b/libswscale/neon/Makefile
@@ -0,0 +1,4 @@
+OBJS += neon/swscale_neon.o
+OBJS += neon/output_neon.o
+
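+# The intrinsics below freely mix vector types of the same size (e.g.
+# unsigned and signed lanes), so relax GCC's vector type checking: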
+CFLAGS += -flax-vector-conversions
diff --git a/libswscale/neon/output_neon.c b/libswscale/neon/output_neon.c
new file mode 100644
index 0000000..5692b67
--- /dev/null
+++ b/libswscale/neon/output_neon.c
@@ -0,0 +1,458 @@
+/*
+ * Neon-enhanced yuv2planeX
+ *
+ * based on the equivalent C code in libswscale
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+//Arm reference
+//http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.dui0491c/CIHJBEFE.html
+#include <inttypes.h>
+
+#include "config.h"
+#include "libswscale/swscale.h"
+#include "libswscale/swscale_internal.h"
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include <arm_neon.h>
+#include <stdio.h>
+
+// An almost-generic (filter sizes up to 12) NEON-optimized function.
+// It branches on filterSize in many places; inlining it with a constant
+// filterSize should let the compiler remove every branch except the for-loop.
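+//
+// For reference, the scalar loop this vectorizes (cf. yuv2planeX_8_c in
+// libswscale/output.c) is roughly:
+//     for (i = 0; i < dstW; i++) {
+//         int val = dither[(i + offset) & 7] << 12;
+//         for (j = 0; j < filterSize; j++)
+//             val += src[j][i] * filter[j];
+//         dest[i] = av_clip_uint8(val >> 19);
+//     }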
+static inline void yuv2planeX_8_neon_template(const int16_t *filter, int filterSize,
+ const int16_t **src, uint8_t *dest, int dstW, const uint8_t *dither, int offset)
+{
+ int i;
+ int32x4_t valsA, valsB;
+ int16x4_t destA_t, destB_t;
+ int16x8_t dest_t;
+ uint8x8_t destVec;
+ int32x4_t ditherA, ditherB;
+ int16x4_t srcLoader;
+ int16x4_t filterVecA, filterVecB, filterVecC;
+
+ // seed each 32-bit accumulator lane with its dither value << 12
+ // (s32 lane ops, so every value fills one full accumulator lane)
+ ditherA = vdupq_n_s32(0);
+ ditherB = vdupq_n_s32(0);
+ ditherA = vsetq_lane_s32(dither[(offset + 0) & 7] << 12, ditherA, 0);
+ ditherA = vsetq_lane_s32(dither[(offset + 1) & 7] << 12, ditherA, 1);
+ ditherA = vsetq_lane_s32(dither[(offset + 2) & 7] << 12, ditherA, 2);
+ ditherA = vsetq_lane_s32(dither[(offset + 3) & 7] << 12, ditherA, 3);
+ ditherB = vsetq_lane_s32(dither[(offset + 4) & 7] << 12, ditherB, 0);
+ ditherB = vsetq_lane_s32(dither[(offset + 5) & 7] << 12, ditherB, 1);
+ ditherB = vsetq_lane_s32(dither[(offset + 6) & 7] << 12, ditherB, 2);
+ ditherB = vsetq_lane_s32(dither[(offset + 7) & 7] << 12, ditherB, 3);
+
+ filterVecA = vld1_s16(filter);
+ if (filterSize > 4)
+ filterVecB = vld1_s16(filter + 4);
+ if (filterSize > 8)
+ filterVecC = vld1_s16(filter + 8);
+
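+ // zero the lanes past filterSize so partially loaded filter vectors
+ // cannot contribute to the accumulation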
+ if (filterSize < 12)
+ filterVecC = vset_lane_s16(0, filterVecC, 3);
+ if (filterSize < 11)
+ filterVecC = vset_lane_s16(0, filterVecC, 2);
+ if (filterSize < 10)
+ filterVecC = vset_lane_s16(0, filterVecC, 1);
+ if (filterSize < 9)
+ filterVecC = vset_lane_s16(0, filterVecC, 0);
+ if (filterSize < 8)
+ filterVecB = vset_lane_s16(0, filterVecB, 3);
+ if (filterSize < 7)
+ filterVecB = vset_lane_s16(0, filterVecB, 2);
+ if (filterSize < 6)
+ filterVecB = vset_lane_s16(0, filterVecB, 1);
+ if (filterSize < 5)
+ filterVecB = vset_lane_s16(0, filterVecB, 0);
+ if (filterSize < 4)
+ filterVecA = vset_lane_s16(0, filterVecA, 3);
+ if (filterSize < 3)
+ filterVecA = vset_lane_s16(0, filterVecA, 2);
+ if (filterSize < 2)
+ filterVecA = vset_lane_s16(0, filterVecA, 1);
+
+ for (i = 0; i < dstW; i += 8) {
+ valsA = ditherA;
+ valsB = ditherB;
+ srcLoader = vld1_s16(src[0] + i);
+ valsA = vmlal_lane_s16(valsA, srcLoader, filterVecA, 0);
+ srcLoader = vld1_s16(src[0] + i + 4);
+ valsB = vmlal_lane_s16(valsB, srcLoader, filterVecA, 0);
+ if (filterSize > 1) {
+ srcLoader = vld1_s16(src[1] + i);
+ valsA = vmlal_lane_s16(valsA, srcLoader, filterVecA, 1);
+ srcLoader = vld1_s16(src[1] + i + 4);
+ valsB = vmlal_lane_s16(valsB, srcLoader, filterVecA, 1);
+ }
+ if (filterSize > 2) {
+ srcLoader = vld1_s16(src[2] + i);
+ valsA = vmlal_lane_s16(valsA, srcLoader, filterVecA, 2);
+ srcLoader = vld1_s16(src[2] + i + 4);
+ valsB = vmlal_lane_s16(valsB, srcLoader, filterVecA, 2);
+ }
+ if (filterSize > 3) {
+ srcLoader = vld1_s16(src[3] + i);
+ valsA = vmlal_lane_s16(valsA, srcLoader, filterVecA, 3);
+ srcLoader = vld1_s16(src[3] + i + 4);
+ valsB = vmlal_lane_s16(valsB, srcLoader, filterVecA, 3);
+ }
+ if (filterSize > 4) {
+ srcLoader = vld1_s16(src[4] + i);
+ valsA = vmlal_lane_s16(valsA, srcLoader, filterVecB, 0);
+ srcLoader = vld1_s16(src[4] + i + 4);
+ valsB = vmlal_lane_s16(valsB, srcLoader, filterVecB, 0);
+ }
+ if (filterSize > 5) {
+ srcLoader = vld1_s16(src[5] + i);
+ valsA = vmlal_lane_s16(valsA, srcLoader, filterVecB, 1);
+ srcLoader = vld1_s16(src[5] + i + 4);
+ valsB = vmlal_lane_s16(valsB, srcLoader, filterVecB, 1);
+ }
+ if (filterSize > 6) {
+ srcLoader = vld1_s16(src[6] + i);
+ valsA = vmlal_lane_s16(valsA, srcLoader, filterVecB, 2);
+ srcLoader = vld1_s16(src[6] + i + 4);
+ valsB = vmlal_lane_s16(valsB, srcLoader, filterVecB, 2);
+ }
+ if (filterSize > 7) {
+ srcLoader = vld1_s16(src[7] + i);
+ valsA = vmlal_lane_s16(valsA, srcLoader, filterVecB, 3);
+ srcLoader = vld1_s16(src[7] + i + 4);
+ valsB = vmlal_lane_s16(valsB, srcLoader, filterVecB, 3);
+ }
+ if (filterSize > 8) {
+ srcLoader = vld1_s16(src[8] + i);
+ valsA = vmlal_lane_s16(valsA, srcLoader, filterVecC, 0);
+ srcLoader = vld1_s16(src[8] + i + 4);
+ valsB = vmlal_lane_s16(valsB, srcLoader, filterVecC, 0);
+ }
+ if (filterSize > 9) {
+ srcLoader = vld1_s16(src[9] + i);
+ valsA = vmlal_lane_s16(valsA, srcLoader, filterVecC, 1);
+ srcLoader = vld1_s16(src[9] + i + 4);
+ valsB = vmlal_lane_s16(valsB, srcLoader, filterVecC, 1);
+ }
+ if (filterSize > 10) {
+ srcLoader = vld1_s16(src[10] + i);
+ valsA = vmlal_lane_s16(valsA, srcLoader, filterVecC, 2);
+ srcLoader = vld1_s16(src[10] + i + 4);
+ valsB = vmlal_lane_s16(valsB, srcLoader, filterVecC, 2);
+ }
+ if (filterSize > 11) {
+ srcLoader = vld1_s16(src[11] + i);
+ valsA = vmlal_lane_s16(valsA, srcLoader, filterVecC, 3);
+ srcLoader = vld1_s16(src[11] + i + 4);
+ valsB = vmlal_lane_s16(valsB, srcLoader, filterVecC, 3);
+ }
+
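+ // >>16 plus the narrowing >>3 matches the C code's >>19; vqmovun_s16
+ // then saturates each 16-bit value to [0,255]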
+ valsA = vshrq_n_s32(valsA, 16);
+ valsB = vshrq_n_s32(valsB, 16);
+ destA_t = vshrn_n_s32(valsA, 3);
+ destB_t = vshrn_n_s32(valsB, 3);
+ dest_t = vcombine_s16(destA_t, destB_t);
+ destVec = vqmovun_s16(dest_t);
+ vst1_u8(dest + i, destVec);
+ }
+}
+
+//bilinear - vertical 3-tap filter, 1080p -> 720p (1080 -> 720)
+static inline void yuv2planeX_8_neon_v3(const int16_t *filter, const int16_t **src,
+ uint8_t *dest, int dstW, const uint8_t *dither, int offset)
+{
+ int i;
+ int32x4_t valsA, valsB;
+ int16x4_t destA_t, destB_t;
+ int16x8_t dest_t;
+ uint8x8_t destVec;
+ int32x4_t ditherA, ditherB;
+ int16x4_t filterVecA;
+ int16x8_t srcLoader0_16x8;
+ int16x8_t srcLoader1_16x8;
+ int16x8_t srcLoader2_16x8;
+
+ //preload to hide load latency
+ srcLoader0_16x8 = vld1q_s16(src[0]);
+ srcLoader1_16x8 = vld1q_s16(src[1]);
+ srcLoader2_16x8 = vld1q_s16(src[2]);
+
+ ditherA = vdupq_n_s32(0);
+ ditherB = vdupq_n_s32(0);
+ ditherA = vsetq_lane_s32(dither[(offset + 0) & 7] << 12, ditherA, 0);
+ ditherA = vsetq_lane_s32(dither[(offset + 1) & 7] << 12, ditherA, 1);
+ ditherA = vsetq_lane_s32(dither[(offset + 2) & 7] << 12, ditherA, 2);
+ ditherA = vsetq_lane_s32(dither[(offset + 3) & 7] << 12, ditherA, 3);
+ ditherB = vsetq_lane_s32(dither[(offset + 4) & 7] << 12, ditherB, 0);
+ ditherB = vsetq_lane_s32(dither[(offset + 5) & 7] << 12, ditherB, 1);
+ ditherB = vsetq_lane_s32(dither[(offset + 6) & 7] << 12, ditherB, 2);
+ ditherB = vsetq_lane_s32(dither[(offset + 7) & 7] << 12, ditherB, 3);
+
+ //load the filter
+ filterVecA = vld1_s16(filter);
+ filterVecA = vset_lane_s16(0, filterVecA, 3);
+
+ for (i = 0; i < dstW; i += 8) {
+
+ valsA = vmlal_lane_s16(ditherA, vget_low_s16(srcLoader0_16x8), filterVecA, 0);
+ valsB = vmlal_lane_s16(ditherB, vget_high_s16(srcLoader0_16x8), filterVecA, 0);
+ valsA = vmlal_lane_s16(valsA, vget_low_s16(srcLoader1_16x8), filterVecA, 1);
+ valsB = vmlal_lane_s16(valsB, vget_high_s16(srcLoader1_16x8), filterVecA, 1);
+ valsA = vmlal_lane_s16(valsA, vget_low_s16(srcLoader2_16x8), filterVecA, 2);
+ valsB = vmlal_lane_s16(valsB, vget_high_s16(srcLoader2_16x8), filterVecA, 2);
+
+ //load for next loop iteration
+ srcLoader0_16x8 = vld1q_s16(src[0] + i + 8);
+ srcLoader1_16x8 = vld1q_s16(src[1] + i + 8);
+ srcLoader2_16x8 = vld1q_s16(src[2] + i + 8);
+
+ //scale down (>>16 then narrowing >>3 == >>19), saturate to u8, and store
+ valsA = vshrq_n_s32(valsA, 16);
+ valsB = vshrq_n_s32(valsB, 16);
+ destA_t = vshrn_n_s32(valsA, 3);
+ destB_t = vshrn_n_s32(valsB, 3);
+ dest_t = vcombine_s16(destA_t, destB_t);
+ destVec = vqmovun_s16(dest_t);
+ vst1_u8(dest + i, destVec);
+ }
+}
+
+//bicubic - vertical 6-tap filter, 1080p -> 720p (1080 -> 720)
+static void yuv2planeX_8_neon_v6(const int16_t * filter, const int16_t ** src,
+ uint8_t * restrict dest, int dstW, const uint8_t *dither, int offset)
+{
+ int i;
+ int32x4_t valsA, valsB;
+ int16x4_t destA_t, destB_t;
+ int16x8_t dest_t;
+ uint8x8_t destVec;
+ int32x4_t ditherA, ditherB;
+ int16x4_t filterVecA, filterVecB;
+ int16x8_t srcLoader0_16x8;
+ int16x8_t srcLoader1_16x8;
+ int16x8_t srcLoader2_16x8;
+ int16x8_t srcLoader3_16x8;
+ int16x8_t srcLoader4_16x8;
+ int16x8_t srcLoader5_16x8;
+
+ ditherA = vdupq_n_s32(0);
+ ditherB = vdupq_n_s32(0);
+ ditherA = vsetq_lane_s32(dither[(offset + 0) & 7] << 12, ditherA, 0);
+ ditherA = vsetq_lane_s32(dither[(offset + 1) & 7] << 12, ditherA, 1);
+ ditherA = vsetq_lane_s32(dither[(offset + 2) & 7] << 12, ditherA, 2);
+ ditherA = vsetq_lane_s32(dither[(offset + 3) & 7] << 12, ditherA, 3);
+ ditherB = vsetq_lane_s32(dither[(offset + 4) & 7] << 12, ditherB, 0);
+ ditherB = vsetq_lane_s32(dither[(offset + 5) & 7] << 12, ditherB, 1);
+ ditherB = vsetq_lane_s32(dither[(offset + 6) & 7] << 12, ditherB, 2);
+ ditherB = vsetq_lane_s32(dither[(offset + 7) & 7] << 12, ditherB, 3);
+
+ //load the filters
+ filterVecA = vld1_s16(filter);
+ filterVecB = vld1_s16(filter + 4);
+ filterVecB = vset_lane_s16(0, filterVecB, 3);
+ filterVecB = vset_lane_s16(0, filterVecB, 2);
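+ // 6 taps: the first four in filterVecA, taps 4-5 in filterVecB
+ // (its upper two lanes are zeroed above)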
+
+ //preload to hide load latency
+ srcLoader0_16x8 = vld1q_s16(src[0]);
+ srcLoader1_16x8 = vld1q_s16(src[1]);
+ srcLoader2_16x8 = vld1q_s16(src[2]);
+ srcLoader3_16x8 = vld1q_s16(src[3]);
+ srcLoader4_16x8 = vld1q_s16(src[4]);
+ srcLoader5_16x8 = vld1q_s16(src[5]);
+
+ for (i = 0; i < dstW; i += 8) {
+ valsA = vmlal_lane_s16(ditherA, vget_low_s16(srcLoader0_16x8), filterVecA, 0);
+ valsB = vmlal_lane_s16(ditherB, vget_high_s16(srcLoader0_16x8), filterVecA, 0);
+ valsA = vmlal_lane_s16(valsA, vget_low_s16(srcLoader1_16x8), filterVecA, 1);
+ valsB = vmlal_lane_s16(valsB, vget_high_s16(srcLoader1_16x8), filterVecA, 1);
+ srcLoader0_16x8 = vld1q_s16(src[0] + i + 8); //load for next loop iteration, hides latency
+ srcLoader1_16x8 = vld1q_s16(src[1] + i + 8);
+
+ valsA = vmlal_lane_s16(valsA, vget_low_s16(srcLoader2_16x8), filterVecA, 2);
+ valsB = vmlal_lane_s16(valsB, vget_high_s16(srcLoader2_16x8), filterVecA, 2);
+ valsA = vmlal_lane_s16(valsA, vget_low_s16(srcLoader3_16x8), filterVecA, 3);
+ valsB = vmlal_lane_s16(valsB, vget_high_s16(srcLoader3_16x8), filterVecA, 3);
+ srcLoader2_16x8 = vld1q_s16(src[2] + i + 8);
+ srcLoader3_16x8 = vld1q_s16(src[3] + i + 8);
+
+ valsA = vmlal_lane_s16(valsA, vget_low_s16(srcLoader4_16x8), filterVecB, 0);
+ valsB = vmlal_lane_s16(valsB, vget_high_s16(srcLoader4_16x8), filterVecB, 0);
+ valsA = vmlal_lane_s16(valsA, vget_low_s16(srcLoader5_16x8), filterVecB, 1);
+ valsB = vmlal_lane_s16(valsB, vget_high_s16(srcLoader5_16x8), filterVecB, 1);
+ srcLoader4_16x8 = vld1q_s16(src[4] + i + 8);
+ srcLoader5_16x8 = vld1q_s16(src[5] + i + 8);
+
+ valsA = vshrq_n_s32(valsA, 16);
+ valsB = vshrq_n_s32(valsB, 16);
+ destA_t = vshrn_n_s32(valsA, 3);
+ destB_t = vshrn_n_s32(valsB, 3);
+ dest_t = vcombine_s16(destA_t, destB_t);
+ destVec = vqmovun_s16(dest_t);
+ vst1_u8(dest + i, destVec);
+ }
+}
+
+//bicubic - vertical 9-tap filter, 1080p -> 480p (1080 -> 480)
+static void yuv2planeX_8_neon_v9(const int16_t * filter, const int16_t ** src,
+ uint8_t * restrict dest, int dstW, const uint8_t *dither, int offset)
+{
+ int i;
+ int32x4_t valsA, valsB;
+ int16x4_t destA_t, destB_t;
+ int16x8_t dest_t;
+ uint8x8_t destVec;
+ int32x4_t ditherA, ditherB;
+ int16x4_t filterVecA, filterVecB, filterVecC;
+ int16x8_t srcLoader0_16x8;
+ int16x8_t srcLoader1_16x8;
+ int16x8_t srcLoader2_16x8;
+ int16x8_t srcLoader3_16x8;
+ int16x8_t srcLoader4_16x8;
+ int16x8_t srcLoader5_16x8;
+ int16x8_t srcLoader6_16x8;
+ int16x8_t srcLoader7_16x8;
+ int16x8_t srcLoader8_16x8;
+
+ ditherA = vdupq_n_s32(0);
+ ditherB = vdupq_n_s32(0);
+ ditherA = vsetq_lane_s32(dither[(offset + 0) & 7] << 12, ditherA, 0);
+ ditherA = vsetq_lane_s32(dither[(offset + 1) & 7] << 12, ditherA, 1);
+ ditherA = vsetq_lane_s32(dither[(offset + 2) & 7] << 12, ditherA, 2);
+ ditherA = vsetq_lane_s32(dither[(offset + 3) & 7] << 12, ditherA, 3);
+ ditherB = vsetq_lane_s32(dither[(offset + 4) & 7] << 12, ditherB, 0);
+ ditherB = vsetq_lane_s32(dither[(offset + 5) & 7] << 12, ditherB, 1);
+ ditherB = vsetq_lane_s32(dither[(offset + 6) & 7] << 12, ditherB, 2);
+ ditherB = vsetq_lane_s32(dither[(offset + 7) & 7] << 12, ditherB, 3);
+
+ filterVecA = vld1_s16(filter);
+ filterVecB = vld1_s16(filter + 4);
+ filterVecC = vld1_s16(filter + 8);
+ filterVecC = vset_lane_s16(0, filterVecC, 3);
+ filterVecC = vset_lane_s16(0, filterVecC, 2);
+ filterVecC = vset_lane_s16(0, filterVecC, 1);
+
+ //preload to hide load latency
+ srcLoader0_16x8 = vld1q_s16(src[0]);
+ srcLoader1_16x8 = vld1q_s16(src[1]);
+ srcLoader2_16x8 = vld1q_s16(src[2]);
+ srcLoader3_16x8 = vld1q_s16(src[3]);
+ srcLoader4_16x8 = vld1q_s16(src[4]);
+ srcLoader5_16x8 = vld1q_s16(src[5]);
+ srcLoader6_16x8 = vld1q_s16(src[6]);
+ srcLoader7_16x8 = vld1q_s16(src[7]);
+ srcLoader8_16x8 = vld1q_s16(src[8]);
+
+ for (i = 0; i < dstW; i += 8) {
+ valsA = vmlal_lane_s16(ditherA, vget_low_s16(srcLoader0_16x8), filterVecA, 0);
+ valsB = vmlal_lane_s16(ditherB, vget_high_s16(srcLoader0_16x8), filterVecA, 0);
+ valsA = vmlal_lane_s16(valsA, vget_low_s16(srcLoader1_16x8), filterVecA, 1);
+ valsB = vmlal_lane_s16(valsB, vget_high_s16(srcLoader1_16x8), filterVecA, 1);
+ srcLoader0_16x8 = vld1q_s16(src[0] + i + 8); //load for next loop iteration, hides latency
+ srcLoader1_16x8 = vld1q_s16(src[1] + i + 8);
+
+ valsA = vmlal_lane_s16(valsA, vget_low_s16(srcLoader2_16x8), filterVecA, 2);
+ valsB = vmlal_lane_s16(valsB, vget_high_s16(srcLoader2_16x8), filterVecA, 2);
+ valsA = vmlal_lane_s16(valsA, vget_low_s16(srcLoader3_16x8), filterVecA, 3);
+ valsB = vmlal_lane_s16(valsB, vget_high_s16(srcLoader3_16x8), filterVecA, 3);
+ srcLoader2_16x8 = vld1q_s16(src[2] + i + 8);
+ srcLoader3_16x8 = vld1q_s16(src[3] + i + 8);
+
+ valsA = vmlal_lane_s16(valsA, vget_low_s16(srcLoader4_16x8), filterVecB, 0);
+ valsB = vmlal_lane_s16(valsB, vget_high_s16(srcLoader4_16x8), filterVecB, 0);
+ valsA = vmlal_lane_s16(valsA, vget_low_s16(srcLoader5_16x8), filterVecB, 1);
+ valsB = vmlal_lane_s16(valsB, vget_high_s16(srcLoader5_16x8), filterVecB, 1);
+ srcLoader4_16x8 = vld1q_s16(src[4] + i + 8);
+ srcLoader5_16x8 = vld1q_s16(src[5] + i + 8);
+
+ valsA = vmlal_lane_s16(valsA, vget_low_s16(srcLoader6_16x8), filterVecB, 2);
+ valsB = vmlal_lane_s16(valsB, vget_high_s16(srcLoader6_16x8), filterVecB, 2);
+ valsA = vmlal_lane_s16(valsA, vget_low_s16(srcLoader7_16x8), filterVecB, 3);
+ valsB = vmlal_lane_s16(valsB, vget_high_s16(srcLoader7_16x8), filterVecB, 3);
+ srcLoader6_16x8 = vld1q_s16(src[6] + i + 8);
+ srcLoader7_16x8 = vld1q_s16(src[7] + i + 8);
+
+ valsA = vmlal_lane_s16(valsA, vget_low_s16(srcLoader8_16x8), filterVecC, 0);
+ valsB = vmlal_lane_s16(valsB, vget_high_s16(srcLoader8_16x8), filterVecC, 0);
+ srcLoader8_16x8 = vld1q_s16(src[8] + i + 8);
+
+ valsA = vshrq_n_s32(valsA, 16);
+ valsB = vshrq_n_s32(valsB, 16);
+ destA_t = vshrn_n_s32(valsA, 3);
+ destB_t = vshrn_n_s32(valsB, 3);
+ dest_t = vcombine_s16(destA_t, destB_t);
+ destVec = vqmovun_s16(dest_t);
+ vst1_u8(dest + i, destVec);
+ }
+}
+
+static void yuv2planeX_8_neon(const int16_t *filter, int filterSize, const int16_t **src,
+ uint8_t *dest, int dstW, const uint8_t *dither, int offset)
+{
+ static int alreadyPrinted = 0;
+ int optFuncFound = 1;
+
+ if (filterSize == 1)
+ yuv2planeX_8_neon_template(filter, 1, src, dest, dstW, dither, offset);
+ else if (filterSize == 2)
+ yuv2planeX_8_neon_template(filter, 2, src, dest, dstW, dither, offset);
+ else if (filterSize == 3)
+ yuv2planeX_8_neon_v3(filter, src, dest, dstW, dither, offset);
+ else if (filterSize == 4)
+ yuv2planeX_8_neon_template(filter, 4, src, dest, dstW, dither, offset);
+ else if (filterSize == 5)
+ yuv2planeX_8_neon_template(filter, 5, src, dest, dstW, dither, offset);
+ else if (filterSize == 6)
+ yuv2planeX_8_neon_v6(filter, src, dest, dstW, dither, offset);
+ else if (filterSize == 7)
+ yuv2planeX_8_neon_template(filter, 7, src, dest, dstW, dither, offset);
+ else if (filterSize == 8)
+ yuv2planeX_8_neon_template(filter, 8, src, dest, dstW, dither, offset);
+ else if (filterSize == 9)
+ yuv2planeX_8_neon_v9(filter, src, dest, dstW, dither, offset);
+ else if (filterSize == 10)
+ yuv2planeX_8_neon_template(filter, 10, src, dest, dstW, dither, offset);
+ else if (filterSize == 11)
+ yuv2planeX_8_neon_template(filter, 11, src, dest, dstW, dither, offset);
+ else if (filterSize == 12)
+ yuv2planeX_8_neon_template(filter, 12, src, dest, dstW, dither, offset);
+ else
+ optFuncFound = 0;
+
+ if (!optFuncFound)
+ yuv2planeX_8_c(filter, filterSize, src, dest, dstW, dither, offset);
+
+ if (!alreadyPrinted) {
+ if (optFuncFound)
+ fprintf(stderr, "filtersize supported in yuv2planeX_8_neon! (%d)\n", filterSize);
+ else
+ fprintf(stderr, "filtersize not supported in yuv2planeX_8_neon! (%d)\n", filterSize);
+ alreadyPrinted = 1;
+ }
+}
+
+void ff_sws_init_output_funcs_neon(SwsContext *c, yuv2planarX_fn *yuv2planeX)
+{
+ enum AVPixelFormat dstFormat = c->dstFormat;
+
+ if (is16BPS(dstFormat))
+ fprintf(stderr, "Not using yuv2planeX_8_neon is16BPS \n");
+ else if (is9_OR_10BPS(dstFormat))
+ fprintf(stderr, "Not using yuv2planeX_8_neon is9_OR_10BPS \n");
+ else
+ *yuv2planeX = yuv2planeX_8_neon;
+}
+
diff --git a/libswscale/neon/swscale_neon.c b/libswscale/neon/swscale_neon.c
new file mode 100644
index 0000000..2114143
--- /dev/null
+++ b/libswscale/neon/swscale_neon.c
@@ -0,0 +1,892 @@
+/*
+ * Neon-enhanced hScale
+ *
+ * based on the equivalent C code in libswscale
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+//Arm reference
+//http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.dui0491c/CIHJBEFE.html
+#include <inttypes.h>
+
+#include "config.h"
+#include "libswscale/swscale.h"
+#include "libswscale/swscale_internal.h"
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include <arm_neon.h>
+#include <stdio.h>
+
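+// PLD_NEXT(p): prefetch the cache line 256 bytes past p into L1
+// (no-op when built for neither ARM nor AArch64)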
+#if ARCH_ARM
+#define PLD_NEXT(pointer) \
+ __asm__( "pld [%0, #256] \n\t" : : "r"(pointer));
+#elif ARCH_AARCH64
+#define PLD_NEXT(pointer) \
+ __asm__( "prfm pldl1keep, %a0 \n\t" : : "p"(pointer + 256));
+#else
+#warning this file should only be built for ARM/AArch64
+#define PLD_NEXT(pointer)
+#endif
+
+// FFMIN_ARM deliberately expands to just its first argument: the NEON
+// paths skip the 15-bit clamp the C code applies with FFMIN, assuming
+// the filter sums stay in range
+#define FFMIN_ARM(a,b) a
+
+// Optimized Filter Functions
+//***********************************************************************************************************************
+//***********************************************************************************************************************
+//***********************************************************************************************************************
+
+//Horizontal 3-tap bilinear filter, 1080p -> 720p
+static void h3_1080p_1280x720_neon_bilinear(int16_t * restrict dst, int dstW,
+ const uint8_t * restrict src)
+{
+ int16x8_t filter_a;
+ uint8x16_t in_8x16;
+ int srcPos;
+ int i = 0;
+ int left_val = 0;
+ int val = 0;
+ int val2 = 0;
+ int16x4_t unpack;
+ uint8x8_t in;
+ uint8x8_t lo;
+ uint8x8_t hi;
+ int16x4_t out_16x4;
+ int16x8_t src_unpack;
+ int32x4_t accum;
+ int32x2_t aggregate;
+ int16x4_t filter_lo;
+ int16x4_t filter_hi;
+
+ //set up the bilinear filter
+ filter_a = vdupq_n_s16(0);
+ filter_a = vsetq_lane_s16(5461, filter_a, 0);
+ filter_a = vsetq_lane_s16(9103, filter_a, 1);
+ filter_a = vsetq_lane_s16(1820, filter_a, 2);
+ filter_a = vsetq_lane_s16(1820, filter_a, 5);
+ filter_a = vsetq_lane_s16(9103, filter_a, 6);
+ filter_a = vsetq_lane_s16(5461, filter_a, 7);
+ filter_lo = vget_low_s16(filter_a);
+ filter_hi = vget_high_s16(filter_a);
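+ // the two 3-tap phases {5461, 9103, 1820} and {1820, 9103, 5461} each
+ // sum to 16384 (Q14), so (8-bit sample * Q14 weight) >> 7 produces the
+ // 15-bit output range hScale8To15 expects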
+
+ //left edge (first output pixel)
+ left_val = ((int) src[0]) * 10240;
+ left_val += ((int) src[1]) * 6144;
+ left_val >>= 7;
+
+ //******************************************************************************************
+ //******************************************************************************************
+ //******************************************************************************************
+ //the offset loads below look odd, but the unaligned accesses still perform well;
+ //each iteration emits 4 outputs while consuming 6 new source pixels (2/3 ratio)
+ srcPos = -5;
+ in_8x16 = vld1q_u8(&src[srcPos + 6]); //x1-x16
+
+ // middle values ----------------------------------------------------------------------------
+ for (i = 0; i < dstW - 1; i += 4) {
+
+ srcPos += 6;
+ lo = vget_low_u8(in_8x16); //x1-x8
+ hi = vget_high_u8(in_8x16); //x9-x16
+ in_8x16 = vld1q_u8(&src[srcPos + 6]); //next 16 bytes, for the next loop iteration
+
+ //---------------------------------------------------------------
+ src_unpack = vmovl_u8(lo);
+ unpack = vget_low_s16(src_unpack); //promote to 16-bits
+
+ accum = vmull_s16(filter_lo, unpack);
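+ // two pairwise adds reduce the four partial products to a single sum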
+ aggregate = vpadd_s32(vget_low_s32(accum), vget_high_s32(accum));
+ aggregate = vpadd_s32(aggregate, aggregate);
+ val = vget_lane_s32(aggregate, 0) >> 7;
+
+ accum = vmull_s16(filter_hi, unpack);
+ aggregate = vpadd_s32(vget_low_s32(accum), vget_high_s32(accum));
+ aggregate = vpadd_s32(aggregate, aggregate);
+ val2 = vget_lane_s32(aggregate, 0) >> 7;
+
+ //store the values
+ out_16x4 = vset_lane_s16(left_val, out_16x4, 0);
+ out_16x4 = vset_lane_s16(val, out_16x4, 1);
+ out_16x4 = vset_lane_s16(val2, out_16x4, 2);
+
+ //next 2 values -------------------------------------------------------------------------
+ in = vext_u8(lo, hi, 3); //x4-x11
+ src_unpack = vmovl_u8(in);
+ unpack = vget_low_s16(src_unpack); //promote to 16-bits
+
+ accum = vmull_s16(filter_lo, unpack);
+ aggregate = vpadd_s32(vget_low_s32(accum), vget_high_s32(accum));
+ aggregate = vpadd_s32(aggregate, aggregate);
+ val = vget_lane_s32(aggregate, 0) >> 7;
+
+ accum = vmull_s16(filter_hi, unpack);
+ aggregate = vpadd_s32(vget_low_s32(accum), vget_high_s32(accum));
+ aggregate = vpadd_s32(aggregate, aggregate);
+ left_val = vget_lane_s32(aggregate, 0) >> 7; //for next loop iteration ...
+
+ //store the values out
+ out_16x4 = vset_lane_s16(val, out_16x4, 3);
+ vst1_s16(&dst[i], out_16x4); //store out 64-bits
+
+ }
+
+ //******************************************************************************************
+ //******************************************************************************************
+ //******************************************************************************************
+ //right edge ----------------------------------------------------------
+ i = dstW - 3;
+
+ //right edge
+ val = ((int) src[srcPos + 0]) * 5461;
+ val += ((int) src[srcPos + 1]) * 9103;
+ val += ((int) src[srcPos + 2]) * 1820;
+ dst[i] = val >> 7;
+
+ val = ((int) src[srcPos + 1]) * 1820;
+ val += ((int) src[srcPos + 2]) * 9103;
+ val += ((int) src[srcPos + 3]) * 5461;
+ dst[i + 1] = val >> 7;
+
+ val = ((int) src[srcPos + 3]) * 5461;
+ val += ((int) src[srcPos + 4]) * 10923;
+ dst[i + 2] = val >> 7;
+
+}
+
+//Horizontal 6-tap bicubic filter, 1080p -> 720p
+static void h6_1080p_1280x720_neon_bicubic(int16_t * restrict dst, int dstW,
+ const uint8_t * restrict src, const int32_t *filterPos)
+{
+
+ int i;
+ int32_t val;
+ int32_t val2;
+ int srcPos;
+ int16x8_t filter_a;
+ int16x8_t filter_b;
+ uint8x16_t in_8x16 = vld1q_u8(src);
+
+ //Left edge filter ---------------------------------------------------
+ val = ((int) src[0]) * 11014;
+ val += ((int) src[1]) * 6280;
+ val += ((int) src[2]) * -758;
+ val += ((int) src[3]) * -152;
+ dst[0] = val >> 7;
+
+ val2 = ((int) src[0]) * -752;
+ val2 += ((int) src[1]) * 6223;
+ val2 += ((int) src[2]) * 10171;
+ val2 += ((int) src[3]) * 1554;
+ val2 += ((int) src[4]) * -812;
+ dst[1] = val2 >> 7;
+
+ //middle values -----------------------------------------------------
+ filter_a = vdupq_n_s16(0);
+ filter_b = vdupq_n_s16(0);
+ filter_a = vsetq_lane_s16(-819, filter_a, 1);
+ filter_a = vsetq_lane_s16(1567, filter_a, 2);
+ filter_a = vsetq_lane_s16(10266, filter_a, 3);
+ filter_a = vsetq_lane_s16(6280, filter_a, 4);
+ filter_a = vsetq_lane_s16(-758, filter_a, 5);
+ filter_a = vsetq_lane_s16(-152, filter_a, 6);
+ filter_b = vsetq_lane_s16(-152, filter_b, 2);
+ filter_b = vsetq_lane_s16(-758, filter_b, 3);
+ filter_b = vsetq_lane_s16(6280, filter_b, 4);
+ filter_b = vsetq_lane_s16(10266, filter_b, 5);
+ filter_b = vsetq_lane_s16(1567, filter_b, 6);
+ filter_b = vsetq_lane_s16(-819, filter_b, 7);
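+ // like the bilinear weights, each 6-tap phase sums to 16384 (Q14);
+ // the taps sit at the lanes where the corresponding samples land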
+
+ srcPos = 0;
+ for (i = 0; i < dstW - 4; i += 4) {
+
+ uint8x8_t in;
+ uint8x8_t lo;
+ uint8x8_t hi;
+ int16x4_t out_16x4;
+ int16x8_t src_unpack;
+ int32x4_t accum;
+ int32x2_t aggregate;
+ int32x4_t accum2;
+ int32x2_t aggregate2;
+
+ //-------- loop 1
+ lo = vget_low_u8(in_8x16); //x0-x7
+ hi = vget_high_u8(in_8x16); //x8-x15
+ in_8x16 = vld1q_u8(&src[srcPos + 6]); //x0-x15
+
+ //first filter
+ src_unpack = vmovl_u8(lo);
+ accum = vmull_s16(vget_low_s16(filter_a), vget_low_s16(src_unpack));
+ accum = vmlal_s16(accum, vget_high_s16(filter_a), vget_high_s16(src_unpack));
+ aggregate = vpadd_s32(vget_low_s32(accum), vget_high_s32(accum));
+ aggregate = vpadd_s32(aggregate, aggregate);
+ val = vget_lane_s32(aggregate, 0) >> 7;
+
+ //second filter
+ accum2 = vmull_s16(vget_low_s16(filter_b), vget_low_s16(src_unpack));
+ accum2 = vmlal_s16(accum2, vget_high_s16(filter_b), vget_high_s16(src_unpack));
+ aggregate2 = vpadd_s32(vget_low_s32(accum2), vget_high_s32(accum2));
+ aggregate2 = vpadd_s32(aggregate2, aggregate2);
+ val2 = vget_lane_s32(aggregate2, 0) >> 7;
+
+ //store the values
+ out_16x4 = vset_lane_s16(val, out_16x4, 0);
+ out_16x4 = vset_lane_s16(val2, out_16x4, 1);
+
+ //-------- loop 2
+ in = vext_u8(lo, hi, 3); //x3-x10
+ src_unpack = vmovl_u8(in);
+
+ //first filter
+ accum = vmull_s16(vget_low_s16(filter_a), vget_low_s16(src_unpack));
+ accum = vmlal_s16(accum, vget_high_s16(filter_a), vget_high_s16(src_unpack));
+ aggregate = vpadd_s32(vget_low_s32(accum), vget_high_s32(accum));
+ aggregate = vpadd_s32(aggregate, aggregate);
+ val = vget_lane_s32(aggregate, 0) >> 7;
+
+ //second filter
+ accum2 = vmull_s16(vget_low_s16(filter_b), vget_low_s16(src_unpack));
+ accum2 = vmlal_s16(accum2, vget_high_s16(filter_b), vget_high_s16(src_unpack));
+ aggregate2 = vpadd_s32(vget_low_s32(accum2), vget_high_s32(accum2));
+ aggregate2 = vpadd_s32(aggregate2, aggregate2);
+ val2 = vget_lane_s32(aggregate2, 0) >> 7;
+
+ //store the values
+ out_16x4 = vset_lane_s16(val, out_16x4, 2);
+ out_16x4 = vset_lane_s16(val2, out_16x4, 3);
+ vst1_s16(&dst[i + 2], out_16x4);
+
+ //increment the input pointer
+ srcPos += 6;
+ }
+
+ //right edge filter -----------------------------------------------------------
+ i = dstW - 2;
+ srcPos = filterPos[i];
+ val = ((int) src[srcPos + 1]) * -819;
+ val += ((int) src[srcPos + 2]) * 1567;
+ val += ((int) src[srcPos + 3]) * 10266;
+ val += ((int) src[srcPos + 4]) * 6280;
+ val += ((int) src[srcPos + 5]) * -910;
+ dst[i] = val >> 7;
+
+ val2 = ((int) src[srcPos + 2]) * -152;
+ val2 += ((int) src[srcPos + 3]) * -758;
+ val2 += ((int) src[srcPos + 4]) * 6280;
+ val2 += ((int) src[srcPos + 5]) * 11014;
+ dst[i + 1] = val2 >> 7;
+
+}
+
+//Horizontal 8-tap bicubic filter, 4K -> 1080p (scalar C version)
+static void h8_3840x2160_1920x1080_c_bicubic(int16_t * restrict dst, int dstW,
+ const uint8_t * restrict src, const int32_t *filterPos)
+{
+
+ int i;
+ int srcPos;
+ int val;
+
+ //left edge filters --------------------------------------------
+ val = ((int) src[0]) * 8192;
+ val += ((int) src[1]) * 7142;
+ val += ((int) src[2]) * 1972;
+ val += ((int) src[3]) * -692;
+ val += ((int) src[4]) * -230;
+ dst[0] = FFMIN(val >> 7, (1 << 15) - 1);
+
+ val = ((int) src[0]) * -922;
+ val += ((int) src[1]) * 1972;
+ val += ((int) src[2]) * 7142;
+ val += ((int) src[3]) * 7142;
+ val += ((int) src[4]) * 1972;
+ val += ((int) src[5]) * -692;
+ val += ((int) src[6]) * -230;
+ dst[1] = FFMIN(val >> 7, (1 << 15) - 1);
+
+ //middle values
+ srcPos = filterPos[2];
+ for (i = 2; i < dstW - 2; i++) {
+ val = (int) (((int) src[srcPos + 0]) + ((int) src[srcPos + 7])) * -230;
+ val += (int) (((int) src[srcPos + 1]) + ((int) src[srcPos + 6])) * -692;
+ val += (int) (((int) src[srcPos + 2]) + ((int) src[srcPos + 5])) * 1972;
+ val += (int) (((int) src[srcPos + 3]) + ((int) src[srcPos + 4])) * 7142;
+ dst[i] = FFMIN(val >> 7, (1 << 15) - 1);
+ srcPos += 2;
+ }
+
+ //right edge filter
+ srcPos = filterPos[i];
+ val = ((int) src[srcPos + 1]) * -230;
+ val += ((int) src[srcPos + 2]) * -692;
+ val += ((int) src[srcPos + 3]) * 1972;
+ val += ((int) src[srcPos + 4]) * 7142;
+ val += ((int) src[srcPos + 5]) * 7142;
+ val += ((int) src[srcPos + 6]) * 1972;
+ val += ((int) src[srcPos + 7]) * -922;
+ dst[i] = FFMIN(val >> 7, (1 << 15) - 1);
+
+ val = ((int) src[srcPos + 3]) * -230;
+ val += ((int) src[srcPos + 4]) * -692;
+ val += ((int) src[srcPos + 5]) * 1972;
+ val += ((int) src[srcPos + 6]) * 7142;
+ val += ((int) src[srcPos + 7]) * 8192;
+ dst[i + 1] = FFMIN(val >> 7, (1 << 15) - 1);
+}
+
+//Horizontal 9-tap bicubic filter, 1080p -> 480p (1920 -> 852)
+static void h9_1080p_852x480_neon_bicubic(int16_t * restrict dst, int dstW,
+ const uint8_t * restrict src, const int16_t * restrict filter,
+ const int32_t *filterPos)
+{
+ int i;
+ int val;
+ int16x8_t zero_16x8 = vdupq_n_s16(0);
+ uint8x8_t lo;
+ uint8x8_t hi;
+ uint8x16_t in_8x16;
+ int16x8_t lo_unpack;
+ int16x8_t hi_unpack;
+ int32x4_t accum;
+ int32x2_t aggregate;
+ int16x8_t filter_a;
+ int16x8_t filter_b;
+ int j = 0;
+
+ //preload to hide load latency
+ in_8x16 = vld1q_u8(src);
+
+ for (i = 0; i < dstW; i++) {
+ int srcPos = filterPos[i + 1]; //load for next loop iteration
+
+ //load filter values
+ filter_a = vld1q_s16(&filter[j]);
+ filter_b = vsetq_lane_s16(filter[j + 8], zero_16x8, 0);
+
+ lo = vget_low_u8(in_8x16); //x0-x7
+ hi = vget_high_u8(in_8x16); //x8-x15
+ in_8x16 = vld1q_u8(&src[srcPos]); //x0-x15 load for next loop iteration
+
+ lo_unpack = vmovl_u8(lo);
+ hi_unpack = vmovl_u8(hi);
+ accum = vmull_s16(vget_low_s16(filter_a), vget_low_s16(lo_unpack));
+ accum = vmlal_s16(accum, vget_high_s16(filter_a), vget_high_s16(lo_unpack));
+ accum = vmlal_s16(accum, vget_low_s16(filter_b), vget_low_s16(hi_unpack));
+ aggregate = vpadd_s32(vget_low_s32(accum), vget_high_s32(accum));
+ aggregate = vpadd_s32(aggregate, aggregate);
+ val = vget_lane_s32(aggregate, 0);
+ dst[i] = FFMIN_ARM(val >> 7, (1 << 15) - 1);
+ j += 9;
+ }
+}
+
+
+//Horizontal 11-tap bicubic filter: 4K -> 720p, 1080p -> 480p (640x480), and 1080p -> 720p
+static void h11_1080p_640x480_neon_bicubic(int16_t * restrict dst, int dstW,
+ const uint8_t * restrict src, const int32_t *filterPos)
+{
+ int i;
+ int32_t val;
+ int srcPos;
+ uint8x16_t in_8x16 = vld1q_u8(src);
+ uint8x8_t zero_8x8 = vdup_n_u8(0);
+ int16x8_t filter_a;
+
+ //left edge filters --------------------------------------------
+ val = ((int) src[0]) * 5461;
+ val += ((int) src[1]) * 5462;
+ val += ((int) src[2]) * 4288;
+ val += ((int) src[3]) * 1901;
+ val += ((int) src[5]) * -485;
+ val += ((int) src[6]) * -243;
+ dst[0] = FFMIN(val >> 7, (1 << 15) - 1);
+
+ val = ((int) src[0]) * -728;
+ val += ((int) src[2]) * 1901;
+ val += ((int) src[3]) * 4288;
+ val += ((int) src[4]) * 5462;
+ val += ((int) src[5]) * 4288;
+ val += ((int) src[6]) * 1901;
+ val += ((int) src[8]) * -485;
+ val += ((int) src[9]) * -243;
+ dst[1] = FFMIN(val >> 7, (1 << 15) - 1);
+
+ //middle values --------------------------------------------
+ filter_a = vdupq_n_s16(0);
+ filter_a = vsetq_lane_s16(-243, filter_a, 2);
+ filter_a = vsetq_lane_s16(-485, filter_a, 3);
+ filter_a = vsetq_lane_s16(1901, filter_a, 5);
+ filter_a = vsetq_lane_s16(4288, filter_a, 6);
+ filter_a = vsetq_lane_s16(5462, filter_a, 7);
+
+ srcPos = 0;
+ for (i = 0; i < dstW - 4; i += 2) {
+ uint8x8_t in_rev;
+ uint8x8_t lo;
+ uint8x8_t hi;
+ int32x4_t accum;
+ int32x2_t aggregate;
+ uint16x8_t tmp_16x8;
+ uint8x8_t tmp_8x8;
+
+ //load the 16 samples
+ lo = vget_low_u8(in_8x16); //x0-x7
+ hi = vget_high_u8(in_8x16); //x8-x15
+ in_8x16 = vld1q_u8(&src[srcPos + 6]);
+
+ //unroll 1
+ in_rev = vext_u8(zero_8x8, hi, 7); //0, x8-x14
+ in_rev = vrev64_u8(in_rev); //x14-x8, 0
+
+ tmp_16x8 = vaddl_u8(lo, in_rev); //fold the symmetric taps: x2+x12 | x3+x11 | ...
+ accum = vmull_s16(vget_low_s16(filter_a), vget_low_s16(tmp_16x8));
+ accum = vmlal_s16(accum, vget_high_s16(filter_a), vget_high_s16(tmp_16x8));
+ aggregate = vpadd_s32(vget_low_s32(accum), vget_high_s32(accum));
+ aggregate = vpadd_s32(aggregate, aggregate);
+ val = vget_lane_s32(aggregate, 0);
+ dst[i + 2] = FFMIN(val >> 7, (1 << 15) - 1);
+
+ //unroll 2
+ //shift the input over 3 bytes and do the same thing
+ tmp_8x8 = vext_u8(lo, hi, 3); // x3-x10
+ hi = vext_u8(hi, zero_8x8, 3); //x11-x15
+ lo = tmp_8x8;
+
+ in_rev = vext_u8(zero_8x8, hi, 7);
+ in_rev = vrev64_u8(in_rev);
+ tmp_16x8 = vaddl_u8(lo, in_rev);
+ accum = vmull_s16(vget_low_s16(filter_a), vget_low_s16(tmp_16x8));
+ accum = vmlal_s16(accum, vget_high_s16(filter_a), vget_high_s16(tmp_16x8));
+ aggregate = vpadd_s32(vget_low_s32(accum), vget_high_s32(accum));
+ aggregate = vpadd_s32(aggregate, aggregate);
+ val = vget_lane_s32(aggregate, 0);
+ dst[i + 3] = FFMIN(val >> 7, (1 << 15) - 1);
+ srcPos += 6;
+
+ }
+
+ //right edge values
+ i = dstW - 2;
+ srcPos = filterPos[i];
+ val = ((int) src[srcPos + 1]) * -243;
+ val += ((int) src[srcPos + 2]) * -485;
+ val += ((int) src[srcPos + 4]) * 1901;
+ val += ((int) src[srcPos + 5]) * 4288;
+ val += ((int) src[srcPos + 6]) * 5462;
+ val += ((int) src[srcPos + 7]) * 4288;
+ val += ((int) src[srcPos + 8]) * 1901;
+ val += ((int) src[srcPos + 10]) * -728;
+ dst[i] = FFMIN(val >> 7, (1 << 15) - 1);
+
+ srcPos = filterPos[i + 1];
+ val = ((int) src[srcPos + 4]) * -243;
+ val += ((int) src[srcPos + 5]) * -485;
+ val += ((int) src[srcPos + 7]) * 1901;
+ val += ((int) src[srcPos + 8]) * 4288;
+ val += ((int) src[srcPos + 9]) * 5462;
+ val += ((int) src[srcPos + 10]) * 5461;
+ dst[i + 1] = FFMIN(val >> 7, (1 << 15) - 1);
+
+}
+
+//Horizontal 11-tap bicubic filter: 1080p -> 480p (720x480), also 1080p -> 720p
+static void h11_1080p_720x480_neon_bicubic(int16_t * restrict dst, int dstW,
+ const uint8_t * restrict src, const int32_t * restrict filterPos)
+{
+
+ int i;
+ int j = 0;
+ int val = 0;
+ int16x8_t filter_a;
+ int16x8_t filter_aa;
+ int16x8_t filter_b;
+ int16x8_t filter_bb;
+ int16x8_t filter_c;
+ int16x8_t filter_cc;
+ uint8x16_t in1_8x16;
+ uint8x16_t in2_8x16;
+
+ //left edge filter
+ int srcPos = filterPos[0];
+ val = ((int) src[srcPos + 0]) * 6110;
+ val += ((int) src[srcPos + 1]) * 6081;
+ val += ((int) src[srcPos + 2]) * 4037;
+ val += ((int) src[srcPos + 3]) * 1022;
+ val += ((int) src[srcPos + 4]) * -456;
+ val += ((int) src[srcPos + 5]) * -397;
+ val += ((int) src[srcPos + 6]) * -13;
+ dst[0] = FFMIN_ARM(val >> 7, (1 << 15) - 1);
+
+ srcPos = filterPos[1];
+ val = ((int) src[srcPos + 0]) * -794;
+ val += ((int) src[srcPos + 1]) * 272;
+ val += ((int) src[srcPos + 2]) * 3017;
+ val += ((int) src[srcPos + 3]) * 5697;
+ val += ((int) src[srcPos + 4]) * 5697;
+ val += ((int) src[srcPos + 5]) * 3017;
+ val += ((int) src[srcPos + 6]) * 272;
+ val += ((int) src[srcPos + 7]) * -546;
+ val += ((int) src[srcPos + 8]) * -248;
+ dst[1] = FFMIN_ARM(val >> 7, (1 << 15) - 1);
+
+ //set the filter taps
+ filter_a = vdupq_n_s16(0);
+ filter_a = vsetq_lane_s16(-13, filter_a, 1);
+ filter_a = vsetq_lane_s16(-395, filter_a, 2);
+ filter_a = vsetq_lane_s16(-457, filter_a, 3);
+ filter_a = vsetq_lane_s16(1013, filter_a, 4);
+ filter_a = vsetq_lane_s16(4026, filter_a, 5);
+ filter_a = vsetq_lane_s16(6079, filter_a, 6);
+ filter_a = vsetq_lane_s16(4969, filter_a, 7);
+ filter_aa = vdupq_n_s16(0);
+ filter_aa = vsetq_lane_s16(1978, filter_aa, 0);
+ filter_aa = vsetq_lane_s16(-199, filter_aa, 1);
+ filter_aa = vsetq_lane_s16(-511, filter_aa, 2);
+ filter_aa = vsetq_lane_s16(-106, filter_aa, 3);
+
+ filter_b = vdupq_n_s16(0);
+ filter_b = vsetq_lane_s16(-105, filter_b, 0);
+ filter_b = vsetq_lane_s16(-509, filter_b, 1);
+ filter_b = vsetq_lane_s16(-203, filter_b, 2);
+ filter_b = vsetq_lane_s16(1964, filter_b, 3);
+ filter_b = vsetq_lane_s16(4958, filter_b, 4);
+ filter_b = vsetq_lane_s16(6081, filter_b, 5);
+ filter_b = vsetq_lane_s16(4039, filter_b, 6);
+ filter_b = vsetq_lane_s16(1025, filter_b, 7);
+ filter_bb = vdupq_n_s16(0);
+ filter_bb = vsetq_lane_s16(-451, filter_bb, 0);
+ filter_bb = vsetq_lane_s16(-396, filter_bb, 1);
+ filter_bb = vsetq_lane_s16(-14, filter_bb, 2);
+
+ filter_c = vdupq_n_s16(0);
+ filter_c = vsetq_lane_s16(-248, filter_c, 3);
+ filter_c = vsetq_lane_s16(-249, filter_c, 4);
+ filter_cc = vdupq_n_s16(0);
+ filter_cc = vsetq_lane_s16(-546, filter_cc, 0);
+ filter_cc = vsetq_lane_s16(271, filter_cc, 1);
+ filter_cc = vsetq_lane_s16(3013, filter_cc, 2);
+ filter_cc = vsetq_lane_s16(5695, filter_cc, 3);
+ filter_cc = vsetq_lane_s16(5699, filter_cc, 4);
+ filter_cc = vsetq_lane_s16(3020, filter_cc, 5);
+ filter_cc = vsetq_lane_s16(274, filter_cc, 6);
+ filter_cc = vsetq_lane_s16(-545, filter_cc, 7);
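+ // three filter phases packed into (a,aa), (b,bb), (c,cc): each loop
+ // iteration below emits 3 outputs while consuming 8 new source pixels
+ // (matching the 720/1920 = 3/8 ratio)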
+
+ //preloads
+ srcPos = 0;
+ in1_8x16 = vld1q_u8(&src[srcPos]); //x0->x15
+ in2_8x16 = vld1q_u8(&src[srcPos + 16]); //x16->x31
+
+ for (i = 2; i < dstW - 3; i += 3) {
+ uint8x8_t lo;
+ uint8x8_t lo2;
+ uint8x8_t hi;
+ int16x8_t lo_unpack;
+ int16x8_t hi_unpack;
+ int16x8_t lo2_unpack;
+ int32x2_t aggregate;
+ int32x4_t accum;
+
+ //separate them
+ lo = vget_low_u8(in1_8x16); //x0-x7
+ hi = vget_high_u8(in1_8x16); //x8-x15
+ lo2 = vget_low_u8(in2_8x16); //x16-x24
+
+ //load for next loop iteration; the ranges overlap, so unrolling the
+ //loop further could avoid the extra load
+ in1_8x16 = vcombine_u8(hi, lo2); //equivalent to vld1q_u8(&src[srcPos + 8])
+ in2_8x16 = vld1q_u8(&src[srcPos + 24]);
+
+ //unpack and promote
+ lo_unpack = vmovl_u8(lo); // x0-X7 -> sint16_t
+ hi_unpack = vmovl_u8(hi); // x8-X15 -> sint16_t
+ lo2_unpack = vmovl_u8(lo2); // x16-X24 -> sint16_t
+
+ //unroll 1
+ accum = vmull_s16(vget_low_s16(filter_a), vget_low_s16(lo_unpack));
+ accum = vmlal_s16(accum, vget_high_s16(filter_a), vget_high_s16(lo_unpack));
+ accum = vmlal_s16(accum, vget_low_s16(filter_aa), vget_low_s16(hi_unpack));
+ aggregate = vpadd_s32(vget_low_s32(accum), vget_high_s32(accum));
+ aggregate = vpadd_s32(aggregate, aggregate);
+ val = vget_lane_s32(aggregate, 0);
+ dst[i] = FFMIN_ARM(val >> 7, (1 << 15) - 1);
+
+ //unroll 2
+ accum = vmull_s16(vget_low_s16(filter_b), vget_high_s16(lo_unpack));
+ accum = vmlal_s16(accum, vget_high_s16(filter_b), vget_low_s16(hi_unpack));
+ accum = vmlal_s16(accum, vget_low_s16(filter_bb), vget_high_s16(hi_unpack));
+ aggregate = vpadd_s32(vget_low_s32(accum), vget_high_s32(accum));
+ aggregate = vpadd_s32(aggregate, aggregate);
+ val = vget_lane_s32(aggregate, 0);
+ dst[i + 1] = FFMIN_ARM(val >> 7, (1 << 15) - 1);
+
+ //unroll 3
+ accum = vmull_s16(vget_low_s16(filter_c), vget_high_s16(lo_unpack));
+ accum = vmlal_s16(accum, vget_low_s16(filter_cc), vget_low_s16(hi_unpack));
+ accum = vmlal_s16(accum, vget_high_s16(filter_cc), vget_high_s16(hi_unpack));
+ accum = vmlal_s16(accum, vget_high_s16(filter_c), vget_low_s16(lo2_unpack));
+ aggregate = vpadd_s32(vget_low_s32(accum), vget_high_s32(accum));
+ aggregate = vpadd_s32(aggregate, aggregate);
+ val = vget_lane_s32(aggregate, 0);
+ dst[i + 2] = FFMIN_ARM(val >> 7, (1 << 15) - 1);
+ srcPos += 8;
+ }
+
+ //right edge filters
+ i = dstW - 3;
+ srcPos = filterPos[i];
+ j = i * 11;
+ val = ((int) src[srcPos + 0]) * 104;
+ val += ((int) src[srcPos + 1]) * -509;
+ val += ((int) src[srcPos + 2]) * -206;
+ val += ((int) src[srcPos + 3]) * 1956;
+ val += ((int) src[srcPos + 4]) * 4951;
+ val += ((int) src[srcPos + 5]) * 6083;
+ val += ((int) src[srcPos + 6]) * 4047;
+ val += ((int) src[srcPos + 7]) * 1032;
+ val += ((int) src[srcPos + 8]) * -454;
+ val += ((int) src[srcPos + 9]) * -398;
+ val += ((int) src[srcPos + 10]) * -14;
+ dst[i] = FFMIN_ARM(val >> 7, (1 << 15) - 1);
+
+ i++;
+ srcPos = filterPos[i];
+ val = ((int) src[srcPos + 2]) * -246;
+ val += ((int) src[srcPos + 3]) * -547;
+ val += ((int) src[srcPos + 4]) * 266;
+ val += ((int) src[srcPos + 5]) * 3005;
+ val += ((int) src[srcPos + 6]) * 5691;
+ val += ((int) src[srcPos + 7]) * 5703;
+ val += ((int) src[srcPos + 8]) * 3028;
+ val += ((int) src[srcPos + 9]) * 279;
+ val += ((int) src[srcPos + 10]) * -795;
+ dst[i] = FFMIN_ARM(val >> 7, (1 << 15) - 1);
+ j += 11;
+ i++;
+
+ srcPos = filterPos[i];
+ val = ((int) src[srcPos + 4]) * -13;
+ val += ((int) src[srcPos + 5]) * -396;
+ val += ((int) src[srcPos + 6]) * -456;
+ val += ((int) src[srcPos + 7]) * 1017;
+ val += ((int) src[srcPos + 8]) * 4031;
+ val += ((int) src[srcPos + 9]) * 6080;
+ val += ((int) src[srcPos + 10]) * 6121;
+ dst[i] = FFMIN_ARM(val >> 7, (1 << 15) - 1);
+}
+
+//Horizontal 11-tap bicubic filter, 4K -> 720p
+static void h11_3840x2160_1280x720_neon_bicubic(int16_t * restrict dst, int dstW,
+ const uint8_t * restrict src, const int32_t *filterPos)
+{
+ h11_1080p_640x480_neon_bicubic(dst, dstW, src, filterPos);
+}
+
+//Horizontal 24-tap bicubic filter, 4K -> 480p (scalar C version)
+static void h24_3840x2160_640x480_c_bicubic(int16_t * restrict dst, int dstW,
+ const uint8_t * restrict src, const int32_t *filterPos)
+{
+ int i;
+ int val;
+ int srcPos;
+
+ //left edge filter
+ val = ((int) src[0]) * 3124;
+ val += ((int) src[1]) * 2381;
+ val += ((int) src[2]) * 2687;
+ val += ((int) src[3]) * 2687;
+ val += ((int) src[4]) * 2381;
+ val += ((int) src[5]) * 1870;
+ val += ((int) src[6]) * 1259;
+ val += ((int) src[7]) * 657;
+ val += ((int) src[8]) * 169;
+ val += ((int) src[9]) * -115;
+ val += ((int) src[10]) * -231;
+ val += ((int) src[11]) * -232;
+ val += ((int) src[12]) * -166;
+ val += ((int) src[13]) * -77;
+ val += ((int) src[14]) * -10;
+ dst[0] = FFMIN(val >> 7, (1 << 15) - 1);
+
+ val = ((int) src[0]) * -485;
+ val += ((int) src[1]) * -231;
+ val += ((int) src[2]) * -115;
+ val += ((int) src[3]) * 169;
+ val += ((int) src[4]) * 657;
+ val += ((int) src[5]) * 1259;
+ val += ((int) src[6]) * 1870;
+ val += ((int) src[7]) * 2381;
+ val += ((int) src[8]) * 2687;
+ val += ((int) src[9]) * 2687;
+ val += ((int) src[10]) * 2381;
+ val += ((int) src[11]) * 1870;
+ val += ((int) src[12]) * 1259;
+ val += ((int) src[13]) * 657;
+ val += ((int) src[14]) * 169;
+ val += ((int) src[15]) * -115;
+ val += ((int) src[16]) * -231;
+ val += ((int) src[17]) * -232;
+ val += ((int) src[18]) * -166;
+ val += ((int) src[19]) * -77;
+ val += ((int) src[20]) * -10;
+ dst[1] = FFMIN(val >> 7, (1 << 15) - 1);
+
+ //middle values
+ srcPos = filterPos[2];
+ for (i = 2; i < dstW - 2; i++) {
+ val = (int) (((int) src[srcPos + 0]) + ((int) src[srcPos + 23])) * -10;
+ val += (int) (((int) src[srcPos + 1]) + ((int) src[srcPos + 22])) * -77;
+ val += (int) (((int) src[srcPos + 2]) + ((int) src[srcPos + 21])) * -166;
+ val += (int) (((int) src[srcPos + 3]) + ((int) src[srcPos + 20])) * -232;
+ val += (int) (((int) src[srcPos + 4]) + ((int) src[srcPos + 19])) * -231;
+ val += (int) (((int) src[srcPos + 5]) + ((int) src[srcPos + 18])) * -115;
+ val += (int) (((int) src[srcPos + 6]) + ((int) src[srcPos + 17])) * 169;
+ val += (int) (((int) src[srcPos + 7]) + ((int) src[srcPos + 16])) * 657;
+ val += (int) (((int) src[srcPos + 8]) + ((int) src[srcPos + 15])) * 1259;
+ val += (int) (((int) src[srcPos + 9]) + ((int) src[srcPos + 14])) * 1870;
+ val += (int) (((int) src[srcPos + 10]) + ((int) src[srcPos + 13])) * 2381;
+ val += (int) (((int) src[srcPos + 11]) + ((int) src[srcPos + 12])) * 2687;
+ dst[i] = FFMIN(val >> 7, (1 << 15) - 1);
+ srcPos += 6;
+ }
+
+ //right edge filter
+ srcPos = filterPos[i];
+ val = ((int) src[srcPos + 3]) * -10;
+ val += ((int) src[srcPos + 4]) * -77;
+ val += ((int) src[srcPos + 5]) * -166;
+ val += ((int) src[srcPos + 6]) * -232;
+ val += ((int) src[srcPos + 7]) * -231;
+ val += ((int) src[srcPos + 8]) * -115;
+ val += ((int) src[srcPos + 9]) * 169;
+ val += ((int) src[srcPos + 10]) * 657;
+ val += ((int) src[srcPos + 11]) * 1259;
+ val += ((int) src[srcPos + 12]) * 1870;
+ val += ((int) src[srcPos + 13]) * 2381;
+ val += ((int) src[srcPos + 14]) * 2687;
+ val += ((int) src[srcPos + 15]) * 2687;
+ val += ((int) src[srcPos + 16]) * 2381;
+ val += ((int) src[srcPos + 17]) * 1870;
+ val += ((int) src[srcPos + 18]) * 1259;
+ val += ((int) src[srcPos + 19]) * 657;
+ val += ((int) src[srcPos + 20]) * 169;
+ val += ((int) src[srcPos + 21]) * -115;
+ val += ((int) src[srcPos + 22]) * -231;
+ val += ((int) src[srcPos + 23]) * -485;
+ dst[i] = FFMIN(val >> 7, (1 << 15) - 1);
+
+ val = ((int) src[srcPos + 9]) * -10;
+ val += ((int) src[srcPos + 10]) * -77;
+ val += ((int) src[srcPos + 11]) * -166;
+ val += ((int) src[srcPos + 12]) * -232;
+ val += ((int) src[srcPos + 13]) * -231;
+ val += ((int) src[srcPos + 14]) * -115;
+ val += ((int) src[srcPos + 15]) * 169;
+ val += ((int) src[srcPos + 16]) * 657;
+ val += ((int) src[srcPos + 17]) * 1259;
+ val += ((int) src[srcPos + 18]) * 1870;
+ val += ((int) src[srcPos + 19]) * 2381;
+ val += ((int) src[srcPos + 20]) * 2687;
+ val += ((int) src[srcPos + 21]) * 2687;
+ val += ((int) src[srcPos + 22]) * 2381;
+ val += ((int) src[srcPos + 23]) * 3124;
+ dst[i + 1] = FFMIN(val >> 7, (1 << 15) - 1);
+}
+
+//******************************************************************************************************************
+//******************************************************************************************************************
+//******************************************************************************************************************
+static void hScale8To15_neon(SwsContext *c, int16_t *dst, int dstW, const uint8_t * src,
+ const int16_t * filter, const int32_t *filterPos, int filterSize)
+{
+
+ //use the original filter if we are not using the optimized bicubic methods
+ if (!(c->flags & SWS_BICUBIC)) {
+
+ //3-tap bilinear fast path: 1920 -> 1280 (luma) or 640 (chroma)
+ if (((c->srcW == 1920 && dstW == 1280) || (c->srcW == 1920 && dstW == 640))
+ && (filterSize == 3) && (c->flags & SWS_BILINEAR)) {
+ h3_1080p_1280x720_neon_bilinear(dst, dstW, src);
+ } else {
+ //original code default
+ int i;
+ for (i = 0; i < dstW; i++) {
+ int j;
+ int srcPos = filterPos[i];
+ int val = 0;
+ for (j = 0; j < filterSize; j++) {
+ val += ((int) src[srcPos + j]) * filter[filterSize * i + j];
+ }
+ dst[i] = FFMIN_ARM(val >> 7, (1 << 15) - 1);
+
+ }
+ }
+
+ } else { //use the bicubic optimized methods ----------------------------------------
+
+ //prefetch c->srcW bytes: one full luma line, or (for half-width
+ //chroma) roughly two chroma lines
+ for (int i = 0; i < c->srcW; i += 64)
+ PLD_NEXT(&src[i]);
+
+ // c->srcW is the luma width; dstW can be either the luma or the chroma width
+ if (((c->srcW == 1920 && dstW == 1280) || (c->srcW == 1920 && dstW == 640))
+ && (filterSize == 6)) {
+ h6_1080p_1280x720_neon_bicubic(dst, dstW, src, filterPos);
+ } else if (((c->srcW == 1920 && dstW == 852) || (c->srcW == 1920 && dstW == 426))
+ && (filterSize == 9)) {
+ h9_1080p_852x480_neon_bicubic(dst, dstW, src, filter, filterPos);
+ } else if (((c->srcW == 3840 && dstW == 1280) || (c->srcW == 3840 && dstW == 640))
+ && (filterSize == 11)) {
+ h11_3840x2160_1280x720_neon_bicubic(dst, dstW, src, filterPos);
+ } else if (((c->srcW == 1920 && dstW == 640) || (c->srcW == 1920 && dstW == 320))
+ && (filterSize == 11)) {
+ h11_1080p_640x480_neon_bicubic(dst, dstW, src, filterPos);
+ } else if (((c->srcW == 1920 && dstW == 720) || (c->srcW == 1920 && dstW == 360))
+ && (filterSize == 11)) {
+ h11_1080p_720x480_neon_bicubic(dst, dstW, src, filterPos);
+ } else if (((c->srcW == 3840 && dstW == 640) || (c->srcW == 3840 && dstW == 320))
+ && (filterSize == 24)) {
+ h24_3840x2160_640x480_c_bicubic(dst, dstW, src, filterPos);
+ } else if (((c->srcW == 3840 && dstW == 1920) || (c->srcW == 3840 && dstW == 960))
+ && (filterSize == 8)) {
+ h8_3840x2160_1920x1080_c_bicubic(dst, dstW, src, filterPos);
+ } else {
+ //original code default
+ int i;
+ for (i = 0; i < dstW; i++) {
+ int j;
+ int srcPos = filterPos[i];
+ int val = 0;
+ for (j = 0; j < filterSize; j++) {
+ val += ((int) src[srcPos + j]) * filter[filterSize * i + j];
+ }
+ dst[i] = FFMIN_ARM(val >> 7, (1 << 15) - 1);
+ }
+ }
+ } //else
+} //function
+
+av_cold void ff_sws_init_swscale_neon(SwsContext *c)
+{
+ if ((c->srcBpc == 8) && (c->dstBpc <= 14)) {
+ c->hyScale = c->hcScale = hScale8To15_neon;
+ } else {
+ fprintf(stderr, "Not using hScale8To15_neon c->srcBpc: %d, c->dstBpc:%d\n",
+ c->srcBpc, c->dstBpc);
+ }
+}
+
diff --git a/libswscale/output.c b/libswscale/output.c
index 4b70626..7865751 100644
--- a/libswscale/output.c
+++ b/libswscale/output.c
@@ -250,7 +250,7 @@ yuv2NBPS(14, LE, 0, 10, int16_t)
yuv2NBPS(16, BE, 1, 16, int32_t)
yuv2NBPS(16, LE, 0, 16, int32_t)
-static void yuv2planeX_8_c(const int16_t *filter, int filterSize,
+void yuv2planeX_8_c(const int16_t *filter, int filterSize,
const int16_t **src, uint8_t *dest, int dstW,
const uint8_t *dither, int offset)
{
diff --git a/libswscale/swscale.c b/libswscale/swscale.c
index 1769348..a722957 100644
--- a/libswscale/swscale.c
+++ b/libswscale/swscale.c
@@ -676,6 +676,9 @@ static int swscale(SwsContext *c, const uint8_t *src[],
* this array's tail */
ff_sws_init_output_funcs(c, &yuv2plane1, &yuv2planeX, &yuv2nv12cX,
&yuv2packed1, &yuv2packed2, &yuv2packedX, &yuv2anyX);
+ if (HAVE_INTRINSICS_NEON)
+ ff_sws_init_output_funcs_neon(c, &yuv2planeX);
+
use_mmx_vfilter= 0;
ff_init_vscale_pfn(c, yuv2plane1, yuv2planeX, yuv2nv12cX,
yuv2packed1, yuv2packed2, yuv2packedX, yuv2anyX, use_mmx_vfilter);
@@ -856,6 +859,9 @@ static av_cold void sws_init_swscale(SwsContext *c)
&c->yuv2nv12cX, &c->yuv2packed1,
&c->yuv2packed2, &c->yuv2packedX, &c->yuv2anyX);
+ if (HAVE_INTRINSICS_NEON)
+ ff_sws_init_output_funcs_neon(c, &c->yuv2planeX);
+
ff_sws_init_input_funcs(c);
@@ -889,6 +895,8 @@ SwsFunc ff_getSwsFunc(SwsContext *c)
ff_sws_init_swscale_ppc(c);
if (ARCH_X86)
ff_sws_init_swscale_x86(c);
+ if (HAVE_INTRINSICS_NEON)
+ ff_sws_init_swscale_neon(c);
return swscale;
}
diff --git a/libswscale/swscale_internal.h b/libswscale/swscale_internal.h
index a53fdc4..c81f481 100644
--- a/libswscale/swscale_internal.h
+++ b/libswscale/swscale_internal.h
@@ -117,6 +117,12 @@ typedef void (*yuv2planar1_fn)(const int16_t *src, uint8_t *dest, int dstW,
typedef void (*yuv2planarX_fn)(const int16_t *filter, int filterSize,
const int16_t **src, uint8_t *dest, int dstW,
const uint8_t *dither, int offset);
+/**
+ * Default/generic C implementation of yuv2planarX_fn for 8 bits per sample
+ */
+void yuv2planeX_8_c(const int16_t *filter, int filterSize, const int16_t **src,
+ uint8_t *dest, int dstW, const uint8_t *dither, int offset);
+
/**
* Write one line of horizontally scaled chroma to interleaved output
@@ -891,8 +897,10 @@ void ff_sws_init_output_funcs(SwsContext *c,
yuv2packed2_fn *yuv2packed2,
yuv2packedX_fn *yuv2packedX,
yuv2anyX_fn *yuv2anyX);
+void ff_sws_init_output_funcs_neon(SwsContext *c, yuv2planarX_fn *yuv2planeX);
void ff_sws_init_swscale_ppc(SwsContext *c);
void ff_sws_init_swscale_x86(SwsContext *c);
+void ff_sws_init_swscale_neon(SwsContext *c);
void ff_hyscale_fast_c(SwsContext *c, int16_t *dst, int dstWidth,
const uint8_t *src, int srcW, int xInc);
--
1.7.9.5