Created
February 17, 2016 14:54
-
-
Save rcombs/541a7715a6213f71f91a to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
From 69503a3bc2874fc942ba237d5c437032eea86774 Mon Sep 17 00:00:00 2001 | |
From: Tsahee Zidenberg <tsahee@annapurnalabs.com> | |
Date: Thu, 5 Nov 2015 15:07:17 +0200 | |
Subject: [PATCH] libswscale: add neon implementations | |
This adds neon optimization (for both arm and aarch64) to libswscale. | |
--- | |
libswscale/neon/Makefile | 4 + | |
libswscale/neon/output_neon.c | 458 +++++++++++++++++++++ | |
libswscale/neon/swscale_neon.c | 892 ++++++++++++++++++++++++++++++++++++++++ | |
libswscale/output.c | 2 +- | |
libswscale/swscale.c | 8 + | |
libswscale/swscale_internal.h | 8 + | |
6 files changed, 1371 insertions(+), 1 deletion(-) | |
create mode 100644 libswscale/neon/Makefile | |
create mode 100644 libswscale/neon/output_neon.c | |
create mode 100644 libswscale/neon/swscale_neon.c | |
diff --git a/libswscale/neon/Makefile b/libswscale/neon/Makefile | |
new file mode 100644 | |
index 0000000..d667c6d | |
--- /dev/null | |
+++ b/libswscale/neon/Makefile | |
@@ -0,0 +1,4 @@ | |
+OBJS += neon/swscale_neon.o | |
+OBJS += neon/output_neon.o | |
+ | |
+CFLAGS += -flax-vector-conversions | |
diff --git a/libswscale/neon/output_neon.c b/libswscale/neon/output_neon.c | |
new file mode 100644 | |
index 0000000..5692b67 | |
--- /dev/null | |
+++ b/libswscale/neon/output_neon.c | |
@@ -0,0 +1,458 @@ | |
+/* | |
+ * Neon-enhanced yuv2planeX | |
+ * | |
+ * based on the equivalent C code in libswscale | |
+ * | |
+ * This file is part of FFmpeg. | |
+ * | |
+ * FFmpeg is free software; you can redistribute it and/or | |
+ * modify it under the terms of the GNU Lesser General Public | |
+ * License as published by the Free Software Foundation; either | |
+ * version 2.1 of the License, or (at your option) any later version. | |
+ * | |
+ * FFmpeg is distributed in the hope that it will be useful, | |
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
+ * Lesser General Public License for more details. | |
+ * | |
+ * You should have received a copy of the GNU Lesser General Public | |
+ * License along with FFmpeg; if not, write to the Free Software | |
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | |
+ */ | |
+ | |
+//Arm reference | |
+//http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.dui0491c/CIHJBEFE.html | |
+#include <inttypes.h> | |
+ | |
+#include "config.h" | |
+#include "libswscale/swscale.h" | |
+#include "libswscale/swscale_internal.h" | |
+#include "libavutil/attributes.h" | |
+#include "libavutil/cpu.h" | |
+#include <arm_neon.h> | |
+#include <stdio.h> | |
+ | |
+// almost-generic (filtersizes up to 12) neon optimized function, | |
+// it has many conditions on filtersize. inlining it with a constant filtersize | |
+// should be able to remove all branches except for the for-loop | |
+static inline void yuv2planeX_8_neon_template(const int16_t *filter, int filterSize, | |
+ const int16_t **src, uint8_t *dest, int dstW, const uint8_t *dither, int offset) | |
+{ | |
+ int i; | |
+ int32x4_t valsA, valsB; | |
+ int16x4_t destA_t, destB_t; | |
+ int16x8_t dest_t; | |
+ uint8x8_t destVec; | |
+ int32x4_t ditherA, ditherB; | |
+ int16x4_t srcLoader; | |
+ int16x4_t filterVecA, filterVecB, filterVecC; | |
+ | |
+ ditherA = vdupq_n_u32(0); | |
+ ditherB = vdupq_n_u32(0); | |
+ ditherA = vsetq_lane_s16(dither[(offset + 0) & 7] << 12, ditherA, 0); | |
+ ditherA = vsetq_lane_s16(dither[(offset + 1) & 7] << 12, ditherA, 1); | |
+ ditherA = vsetq_lane_s16(dither[(offset + 2) & 7] << 12, ditherA, 2); | |
+ ditherA = vsetq_lane_s16(dither[(offset + 3) & 7] << 12, ditherA, 3); | |
+ ditherB = vsetq_lane_s16(dither[(offset + 4) & 7] << 12, ditherB, 0); | |
+ ditherB = vsetq_lane_s16(dither[(offset + 5) & 7] << 12, ditherB, 1); | |
+ ditherB = vsetq_lane_s16(dither[(offset + 6) & 7] << 12, ditherB, 2); | |
+ ditherB = vsetq_lane_s16(dither[(offset + 7) & 7] << 12, ditherB, 3); | |
+ | |
+ filterVecA = vld1_s16(filter); | |
+ if (filterSize > 4) | |
+ filterVecB = vld1_s16(filter + 4); | |
+ if (filterSize > 8) | |
+ filterVecC = vld1_s16(filter + 8); | |
+ | |
+ if (filterSize < 12) | |
+ filterVecC = vset_lane_s16(0, filterVecC, 3); | |
+ if (filterSize < 11) | |
+ filterVecC = vset_lane_s16(0, filterVecC, 2); | |
+ if (filterSize < 10) | |
+ filterVecC = vset_lane_s16(0, filterVecC, 1); | |
+ if (filterSize < 9) | |
+ filterVecC = vset_lane_s16(0, filterVecC, 0); | |
+ if (filterSize < 8) | |
+ filterVecB = vset_lane_s16(0, filterVecB, 3); | |
+ if (filterSize < 7) | |
+ filterVecB = vset_lane_s16(0, filterVecB, 2); | |
+ if (filterSize < 6) | |
+ filterVecB = vset_lane_s16(0, filterVecB, 1); | |
+ if (filterSize < 5) | |
+ filterVecB = vset_lane_s16(0, filterVecB, 0); | |
+ if (filterSize < 4) | |
+ filterVecA = vset_lane_s16(0, filterVecA, 3); | |
+ if (filterSize < 3) | |
+ filterVecA = vset_lane_s16(0, filterVecA, 2); | |
+ if (filterSize < 2) | |
+ filterVecA = vset_lane_s16(0, filterVecA, 1); | |
+ | |
+ for (i = 0; i < dstW; i += 8) { | |
+ valsA = ditherA; | |
+ valsB = ditherB; | |
+ srcLoader = vld1_s16(src[0] + i); | |
+ valsA = vmlal_lane_s16(valsA, srcLoader, filterVecA, 0); | |
+ srcLoader = vld1_s16(src[0] + i + 4); | |
+ valsB = vmlal_lane_s16(valsB, srcLoader, filterVecA, 0); | |
+ if (filterSize > 1) { | |
+ srcLoader = vld1_s16(src[1] + i); | |
+ valsA = vmlal_lane_s16(valsA, srcLoader, filterVecA, 1); | |
+ srcLoader = vld1_s16(src[1] + i + 4); | |
+ valsB = vmlal_lane_s16(valsB, srcLoader, filterVecA, 1); | |
+ } | |
+ if (filterSize > 2) { | |
+ srcLoader = vld1_s16(src[2] + i); | |
+ valsA = vmlal_lane_s16(valsA, srcLoader, filterVecA, 2); | |
+ srcLoader = vld1_s16(src[2] + i + 4); | |
+ valsB = vmlal_lane_s16(valsB, srcLoader, filterVecA, 2); | |
+ } | |
+ if (filterSize > 3) { | |
+ srcLoader = vld1_s16(src[3] + i); | |
+ valsA = vmlal_lane_s16(valsA, srcLoader, filterVecA, 3); | |
+ srcLoader = vld1_s16(src[3] + i + 4); | |
+ valsB = vmlal_lane_s16(valsB, srcLoader, filterVecA, 3); | |
+ } | |
+ if (filterSize > 4) { | |
+ srcLoader = vld1_s16(src[4] + i); | |
+ valsA = vmlal_lane_s16(valsA, srcLoader, filterVecB, 0); | |
+ srcLoader = vld1_s16(src[4] + i + 4); | |
+ valsB = vmlal_lane_s16(valsB, srcLoader, filterVecB, 0); | |
+ } | |
+ if (filterSize > 5) { | |
+ srcLoader = vld1_s16(src[5] + i); | |
+ valsA = vmlal_lane_s16(valsA, srcLoader, filterVecB, 1); | |
+ srcLoader = vld1_s16(src[5] + i + 4); | |
+ valsB = vmlal_lane_s16(valsB, srcLoader, filterVecB, 1); | |
+ } | |
+ if (filterSize > 6) { | |
+ srcLoader = vld1_s16(src[6] + i); | |
+ valsA = vmlal_lane_s16(valsA, srcLoader, filterVecB, 2); | |
+ srcLoader = vld1_s16(src[6] + i + 4); | |
+ valsB = vmlal_lane_s16(valsB, srcLoader, filterVecB, 2); | |
+ } | |
+ if (filterSize > 7) { | |
+ srcLoader = vld1_s16(src[7] + i); | |
+ valsA = vmlal_lane_s16(valsA, srcLoader, filterVecB, 3); | |
+ srcLoader = vld1_s16(src[7] + i + 4); | |
+ valsB = vmlal_lane_s16(valsB, srcLoader, filterVecB, 3); | |
+ } | |
+ if (filterSize > 8) { | |
+ srcLoader = vld1_s16(src[8] + i); | |
+ valsA = vmlal_lane_s16(valsA, srcLoader, filterVecC, 0); | |
+ srcLoader = vld1_s16(src[8] + i + 4); | |
+ valsB = vmlal_lane_s16(valsB, srcLoader, filterVecC, 0); | |
+ } | |
+ if (filterSize > 9) { | |
+ srcLoader = vld1_s16(src[9] + i); | |
+ valsA = vmlal_lane_s16(valsA, srcLoader, filterVecC, 1); | |
+ srcLoader = vld1_s16(src[9] + i + 4); | |
+ valsB = vmlal_lane_s16(valsB, srcLoader, filterVecC, 1); | |
+ } | |
+ if (filterSize > 10) { | |
+ srcLoader = vld1_s16(src[10] + i); | |
+ valsA = vmlal_lane_s16(valsA, srcLoader, filterVecC, 2); | |
+ srcLoader = vld1_s16(src[10] + i + 4); | |
+ valsB = vmlal_lane_s16(valsB, srcLoader, filterVecC, 2); | |
+ } | |
+ if (filterSize > 11) { | |
+ srcLoader = vld1_s16(src[11] + i); | |
+ valsA = vmlal_lane_s16(valsA, srcLoader, filterVecC, 3); | |
+ srcLoader = vld1_s16(src[11] + i + 4); | |
+ valsB = vmlal_lane_s16(valsB, srcLoader, filterVecC, 3); | |
+ } | |
+ | |
+ valsA = vshrq_n_s32(valsA, 16); | |
+ valsB = vshrq_n_s32(valsB, 16); | |
+ destA_t = vshrn_n_s32(valsA, 3); | |
+ destB_t = vshrn_n_s32(valsB, 3); | |
+ dest_t = vcombine_s16(destA_t, destB_t); | |
+ destVec = vqmovun_s16(dest_t); | |
+ vst1_u8(dest + i, destVec); | |
+ } | |
+} | |
+ | |
+//bilinear - Vertical 3 taps filter 1080p -> 720p (1080 -> 720) | |
+static inline void yuv2planeX_8_neon_v3(const int16_t *filter, const int16_t **src, | |
+ uint8_t *dest, int dstW, const uint8_t *dither, int offset) | |
+{ | |
+ int i; | |
+ int32x4_t valsA, valsB; | |
+ int16x4_t destA_t, destB_t; | |
+ int16x8_t dest_t; | |
+ uint8x8_t destVec; | |
+ int32x4_t ditherA, ditherB; | |
+ int16x4_t filterVecA; | |
+ int16x8_t srcLoader0_16x8; | |
+ int16x8_t srcLoader1_16x8; | |
+ int16x8_t srcLoader2_16x8; | |
+ | |
+ //preload to hide load latency | |
+ srcLoader0_16x8 = vld1q_s16(src[0]); | |
+ srcLoader1_16x8 = vld1q_s16(src[1]); | |
+ srcLoader2_16x8 = vld1q_s16(src[2]); | |
+ | |
+ ditherA = vdupq_n_u32(0); | |
+ ditherB = vdupq_n_u32(0); | |
+ ditherA = vsetq_lane_s16(dither[(offset + 0) & 7] << 12, ditherA, 0); | |
+ ditherA = vsetq_lane_s16(dither[(offset + 1) & 7] << 12, ditherA, 1); | |
+ ditherA = vsetq_lane_s16(dither[(offset + 2) & 7] << 12, ditherA, 2); | |
+ ditherA = vsetq_lane_s16(dither[(offset + 3) & 7] << 12, ditherA, 3); | |
+ ditherB = vsetq_lane_s16(dither[(offset + 4) & 7] << 12, ditherB, 0); | |
+ ditherB = vsetq_lane_s16(dither[(offset + 5) & 7] << 12, ditherB, 1); | |
+ ditherB = vsetq_lane_s16(dither[(offset + 6) & 7] << 12, ditherB, 2); | |
+ ditherB = vsetq_lane_s16(dither[(offset + 7) & 7] << 12, ditherB, 3); | |
+ | |
+ //load the filter | |
+ filterVecA = vld1_s16(filter); | |
+ filterVecA = vset_lane_s16(0, filterVecA, 3); | |
+ | |
+ for (i = 0; i < dstW; i += 8) { | |
+ | |
+ valsA = vmlal_lane_s16(ditherA, vget_low_s16(srcLoader0_16x8), filterVecA, 0); | |
+ valsB = vmlal_lane_s16(ditherB, vget_high_s16(srcLoader0_16x8), filterVecA, 0); | |
+ valsA = vmlal_lane_s16(valsA, vget_low_s16(srcLoader1_16x8), filterVecA, 1); | |
+ valsB = vmlal_lane_s16(valsB, vget_high_s16(srcLoader1_16x8), filterVecA, 1); | |
+ valsA = vmlal_lane_s16(valsA, vget_low_s16(srcLoader2_16x8), filterVecA, 2); | |
+ valsB = vmlal_lane_s16(valsB, vget_high_s16(srcLoader2_16x8), filterVecA, 2); | |
+ | |
+ //load for next loop iteration | |
+ srcLoader0_16x8 = vld1q_s16(src[0] + i + 8); | |
+ srcLoader1_16x8 = vld1q_s16(src[1] + i + 8); | |
+ srcLoader2_16x8 = vld1q_s16(src[2] + i + 8); | |
+ | |
+ //this looks good | |
+ valsA = vshrq_n_s32(valsA, 16); | |
+ valsB = vshrq_n_s32(valsB, 16); | |
+ destA_t = vshrn_n_s32(valsA, 3); | |
+ destB_t = vshrn_n_s32(valsB, 3); | |
+ dest_t = vcombine_s16(destA_t, destB_t); | |
+ destVec = vqmovun_s16(dest_t); | |
+ vst1_u8(dest + i, destVec); | |
+ } | |
+} | |
+ | |
+//bicubic - Vertical 6 taps filter 1080p -> 720p (1080 -> 720) | |
+static void yuv2planeX_8_neon_v6(const int16_t * filter, const int16_t ** src, | |
+ uint8_t * restrict dest, int dstW, const uint8_t *dither, int offset) | |
+{ | |
+ int i; | |
+ int32x4_t valsA, valsB; | |
+ int16x4_t destA_t, destB_t; | |
+ int16x8_t dest_t; | |
+ uint8x8_t destVec; | |
+ int32x4_t ditherA, ditherB; | |
+ int16x4_t filterVecA, filterVecB; | |
+ int16x8_t srcLoader0_16x8; | |
+ int16x8_t srcLoader1_16x8; | |
+ int16x8_t srcLoader2_16x8; | |
+ int16x8_t srcLoader3_16x8; | |
+ int16x8_t srcLoader4_16x8; | |
+ int16x8_t srcLoader5_16x8; | |
+ | |
+ ditherA = vdupq_n_u32(0); | |
+ ditherB = vdupq_n_u32(0); | |
+ ditherA = vsetq_lane_s16(dither[(offset + 0) & 7] << 12, ditherA, 0); | |
+ ditherA = vsetq_lane_s16(dither[(offset + 1) & 7] << 12, ditherA, 1); | |
+ ditherA = vsetq_lane_s16(dither[(offset + 2) & 7] << 12, ditherA, 2); | |
+ ditherA = vsetq_lane_s16(dither[(offset + 3) & 7] << 12, ditherA, 3); | |
+ ditherB = vsetq_lane_s16(dither[(offset + 4) & 7] << 12, ditherB, 0); | |
+ ditherB = vsetq_lane_s16(dither[(offset + 5) & 7] << 12, ditherB, 1); | |
+ ditherB = vsetq_lane_s16(dither[(offset + 6) & 7] << 12, ditherB, 2); | |
+ ditherB = vsetq_lane_s16(dither[(offset + 7) & 7] << 12, ditherB, 3); | |
+ | |
+ //load the filters | |
+ filterVecA = vld1_s16(filter); | |
+ filterVecB = vld1_s16(filter + 4); | |
+ filterVecB = vset_lane_s16(0, filterVecB, 3); | |
+ filterVecB = vset_lane_s16(0, filterVecB, 2); | |
+ | |
+ //preload to hide load latency | |
+ srcLoader0_16x8 = vld1q_s16(src[0]); | |
+ srcLoader1_16x8 = vld1q_s16(src[1]); | |
+ srcLoader2_16x8 = vld1q_s16(src[2]); | |
+ srcLoader3_16x8 = vld1q_s16(src[3]); | |
+ srcLoader4_16x8 = vld1q_s16(src[4]); | |
+ srcLoader5_16x8 = vld1q_s16(src[5]); | |
+ | |
+ for (i = 0; i < dstW; i += 8) { | |
+ valsA = vmlal_lane_s16(ditherA, vget_low_s16(srcLoader0_16x8), filterVecA, 0); | |
+ valsB = vmlal_lane_s16(ditherB, vget_high_s16(srcLoader0_16x8), filterVecA, 0); | |
+ valsA = vmlal_lane_s16(valsA, vget_low_s16(srcLoader1_16x8), filterVecA, 1); | |
+ valsB = vmlal_lane_s16(valsB, vget_high_s16(srcLoader1_16x8), filterVecA, 1); | |
+ srcLoader0_16x8 = vld1q_s16(src[0] + i + 8); //load for next loop iteration, hides latency | |
+ srcLoader1_16x8 = vld1q_s16(src[1] + i + 8); | |
+ | |
+ valsA = vmlal_lane_s16(valsA, vget_low_s16(srcLoader2_16x8), filterVecA, 2); | |
+ valsB = vmlal_lane_s16(valsB, vget_high_s16(srcLoader2_16x8), filterVecA, 2); | |
+ valsA = vmlal_lane_s16(valsA, vget_low_s16(srcLoader3_16x8), filterVecA, 3); | |
+ valsB = vmlal_lane_s16(valsB, vget_high_s16(srcLoader3_16x8), filterVecA, 3); | |
+ srcLoader2_16x8 = vld1q_s16(src[2] + i + 8); | |
+ srcLoader3_16x8 = vld1q_s16(src[3] + i + 8); | |
+ | |
+ valsA = vmlal_lane_s16(valsA, vget_low_s16(srcLoader4_16x8), filterVecB, 0); | |
+ valsB = vmlal_lane_s16(valsB, vget_high_s16(srcLoader4_16x8), filterVecB, 0); | |
+ valsA = vmlal_lane_s16(valsA, vget_low_s16(srcLoader5_16x8), filterVecB, 1); | |
+ valsB = vmlal_lane_s16(valsB, vget_high_s16(srcLoader5_16x8), filterVecB, 1); | |
+ srcLoader4_16x8 = vld1q_s16(src[4] + i + 8); | |
+ srcLoader5_16x8 = vld1q_s16(src[5] + i + 8); | |
+ | |
+ valsA = vshrq_n_s32(valsA, 16); | |
+ valsB = vshrq_n_s32(valsB, 16); | |
+ destA_t = vshrn_n_s32(valsA, 3); | |
+ destB_t = vshrn_n_s32(valsB, 3); | |
+ dest_t = vcombine_s16(destA_t, destB_t); | |
+ destVec = vqmovun_s16(dest_t); | |
+ vst1_u8(dest + i, destVec); | |
+ } | |
+} | |
+ | |
+//bicubic - Vertical 9 taps filter 1080p -> 480p (1080 -> 480) | |
+static void yuv2planeX_8_neon_v9(const int16_t * filter, const int16_t ** src, | |
+ uint8_t * restrict dest, int dstW, const uint8_t *dither, int offset) | |
+{ | |
+ int i; | |
+ int32x4_t valsA, valsB; | |
+ int16x4_t destA_t, destB_t; | |
+ int16x8_t dest_t; | |
+ uint8x8_t destVec; | |
+ int32x4_t ditherA, ditherB; | |
+ int16x4_t filterVecA, filterVecB, filterVecC; | |
+ int16x8_t srcLoader0_16x8; | |
+ int16x8_t srcLoader1_16x8; | |
+ int16x8_t srcLoader2_16x8; | |
+ int16x8_t srcLoader3_16x8; | |
+ int16x8_t srcLoader4_16x8; | |
+ int16x8_t srcLoader5_16x8; | |
+ int16x8_t srcLoader6_16x8; | |
+ int16x8_t srcLoader7_16x8; | |
+ int16x8_t srcLoader8_16x8; | |
+ | |
+ ditherA = vdupq_n_u32(0); | |
+ ditherB = vdupq_n_u32(0); | |
+ ditherA = vsetq_lane_s16(dither[(offset + 0) & 7] << 12, ditherA, 0); | |
+ ditherA = vsetq_lane_s16(dither[(offset + 1) & 7] << 12, ditherA, 1); | |
+ ditherA = vsetq_lane_s16(dither[(offset + 2) & 7] << 12, ditherA, 2); | |
+ ditherA = vsetq_lane_s16(dither[(offset + 3) & 7] << 12, ditherA, 3); | |
+ ditherB = vsetq_lane_s16(dither[(offset + 4) & 7] << 12, ditherB, 0); | |
+ ditherB = vsetq_lane_s16(dither[(offset + 5) & 7] << 12, ditherB, 1); | |
+ ditherB = vsetq_lane_s16(dither[(offset + 6) & 7] << 12, ditherB, 2); | |
+ ditherB = vsetq_lane_s16(dither[(offset + 7) & 7] << 12, ditherB, 3); | |
+ | |
+ filterVecA = vld1_s16(filter); | |
+ filterVecB = vld1_s16(filter + 4); | |
+ filterVecC = vld1_s16(filter + 8); | |
+ filterVecC = vset_lane_s16(0, filterVecC, 3); | |
+ filterVecC = vset_lane_s16(0, filterVecC, 2); | |
+ filterVecC = vset_lane_s16(0, filterVecC, 1); | |
+ | |
+ //preload to hide load latency | |
+ srcLoader0_16x8 = vld1q_s16(src[0]); | |
+ srcLoader1_16x8 = vld1q_s16(src[1]); | |
+ srcLoader2_16x8 = vld1q_s16(src[2]); | |
+ srcLoader3_16x8 = vld1q_s16(src[3]); | |
+ srcLoader4_16x8 = vld1q_s16(src[4]); | |
+ srcLoader5_16x8 = vld1q_s16(src[5]); | |
+ srcLoader6_16x8 = vld1q_s16(src[6]); | |
+ srcLoader7_16x8 = vld1q_s16(src[7]); | |
+ srcLoader8_16x8 = vld1q_s16(src[8]); | |
+ | |
+ for (i = 0; i < dstW; i += 8) { | |
+ valsA = vmlal_lane_s16(ditherA, vget_low_s16(srcLoader0_16x8), filterVecA, 0); | |
+ valsB = vmlal_lane_s16(ditherB, vget_high_s16(srcLoader0_16x8), filterVecA, 0); | |
+ valsA = vmlal_lane_s16(valsA, vget_low_s16(srcLoader1_16x8), filterVecA, 1); | |
+ valsB = vmlal_lane_s16(valsB, vget_high_s16(srcLoader1_16x8), filterVecA, 1); | |
+ srcLoader0_16x8 = vld1q_s16(src[0] + i + 8); //load for next loop iteration, hides latency | |
+ srcLoader1_16x8 = vld1q_s16(src[1] + i + 8); | |
+ | |
+ valsA = vmlal_lane_s16(valsA, vget_low_s16(srcLoader2_16x8), filterVecA, 2); | |
+ valsB = vmlal_lane_s16(valsB, vget_high_s16(srcLoader2_16x8), filterVecA, 2); | |
+ valsA = vmlal_lane_s16(valsA, vget_low_s16(srcLoader3_16x8), filterVecA, 3); | |
+ valsB = vmlal_lane_s16(valsB, vget_high_s16(srcLoader3_16x8), filterVecA, 3); | |
+ srcLoader2_16x8 = vld1q_s16(src[2] + i + 8); | |
+ srcLoader3_16x8 = vld1q_s16(src[3] + i + 8); | |
+ | |
+ valsA = vmlal_lane_s16(valsA, vget_low_s16(srcLoader4_16x8), filterVecB, 0); | |
+ valsB = vmlal_lane_s16(valsB, vget_high_s16(srcLoader4_16x8), filterVecB, 0); | |
+ valsA = vmlal_lane_s16(valsA, vget_low_s16(srcLoader5_16x8), filterVecB, 1); | |
+ valsB = vmlal_lane_s16(valsB, vget_high_s16(srcLoader5_16x8), filterVecB, 1); | |
+ srcLoader4_16x8 = vld1q_s16(src[4] + i + 8); | |
+ srcLoader5_16x8 = vld1q_s16(src[5] + i + 8); | |
+ | |
+ valsA = vmlal_lane_s16(valsA, vget_low_s16(srcLoader6_16x8), filterVecB, 2); | |
+ valsB = vmlal_lane_s16(valsB, vget_high_s16(srcLoader6_16x8), filterVecB, 2); | |
+ valsA = vmlal_lane_s16(valsA, vget_low_s16(srcLoader7_16x8), filterVecB, 3); | |
+ valsB = vmlal_lane_s16(valsB, vget_high_s16(srcLoader7_16x8), filterVecB, 3); | |
+ srcLoader6_16x8 = vld1q_s16(src[6] + i + 8); | |
+ srcLoader7_16x8 = vld1q_s16(src[7] + i + 8); | |
+ | |
+ valsA = vmlal_lane_s16(valsA, vget_low_s16(srcLoader8_16x8), filterVecC, 0); | |
+ valsB = vmlal_lane_s16(valsB, vget_high_s16(srcLoader8_16x8), filterVecC, 0); | |
+ srcLoader8_16x8 = vld1q_s16(src[8] + i + 8); | |
+ | |
+ valsA = vshrq_n_s32(valsA, 16); | |
+ valsB = vshrq_n_s32(valsB, 16); | |
+ destA_t = vshrn_n_s32(valsA, 3); | |
+ destB_t = vshrn_n_s32(valsB, 3); | |
+ dest_t = vcombine_s16(destA_t, destB_t); | |
+ destVec = vqmovun_s16(dest_t); | |
+ vst1_u8(dest + i, destVec); | |
+ } | |
+} | |
+ | |
/*
 * Dispatch on filterSize to the specialized NEON vertical scalers, falling
 * back to the generic C implementation for unsupported sizes.  Reports once
 * on stderr whether a NEON path was taken (debug aid).
 */
static void yuv2planeX_8_neon(const int16_t *filter, int filterSize, const int16_t **src,
                              uint8_t *dest, int dstW, const uint8_t *dither, int offset)
{
    static int alreadyPrinted = 0; /* NOTE(review): plain static, not thread-safe */
    int optFuncFound = 1;

    switch (filterSize) {
    case 1:  yuv2planeX_8_neon_template(filter,  1, src, dest, dstW, dither, offset); break;
    case 2:  yuv2planeX_8_neon_template(filter,  2, src, dest, dstW, dither, offset); break;
    case 3:  yuv2planeX_8_neon_v3(filter, src, dest, dstW, dither, offset);           break;
    case 4:  yuv2planeX_8_neon_template(filter,  4, src, dest, dstW, dither, offset); break;
    case 5:  yuv2planeX_8_neon_template(filter,  5, src, dest, dstW, dither, offset); break;
    case 6:  yuv2planeX_8_neon_v6(filter, src, dest, dstW, dither, offset);           break;
    case 7:  yuv2planeX_8_neon_template(filter,  7, src, dest, dstW, dither, offset); break;
    case 8:  yuv2planeX_8_neon_template(filter,  8, src, dest, dstW, dither, offset); break;
    case 9:  yuv2planeX_8_neon_v9(filter, src, dest, dstW, dither, offset);           break;
    case 10: yuv2planeX_8_neon_template(filter, 10, src, dest, dstW, dither, offset); break;
    case 11: yuv2planeX_8_neon_template(filter, 11, src, dest, dstW, dither, offset); break;
    case 12: yuv2planeX_8_neon_template(filter, 12, src, dest, dstW, dither, offset); break;
    default: optFuncFound = 0;                                                        break;
    }

    if (!optFuncFound)
        yuv2planeX_8_c(filter, filterSize, src, dest, dstW, dither, offset);

    if (!alreadyPrinted) {
        if (optFuncFound)
            fprintf(stderr, "filtersize supported in yuv2planeX_8_neon! (%d)\n", filterSize);
        else
            fprintf(stderr, "filtersize not supported in yuv2planeX_8_neon! (%d)\n", filterSize);
        alreadyPrinted = 1;
    }
}
+ | |
+void ff_sws_init_output_funcs_neon(SwsContext *c, yuv2planarX_fn *yuv2planeX) | |
+{ | |
+ enum AVPixelFormat dstFormat = c->dstFormat; | |
+ | |
+ if (is16BPS(dstFormat)) | |
+ fprintf(stderr, "Not using yuv2planeX_8_neon is16BPS \n"); | |
+ else if (is9_OR_10BPS(dstFormat)) | |
+ fprintf(stderr, "Not using yuv2planeX_8_neon is9_OR_10BPS \n"); | |
+ else | |
+ *yuv2planeX = yuv2planeX_8_neon; | |
+} | |
+ | |
diff --git a/libswscale/neon/swscale_neon.c b/libswscale/neon/swscale_neon.c | |
new file mode 100644 | |
index 0000000..2114143 | |
--- /dev/null | |
+++ b/libswscale/neon/swscale_neon.c | |
@@ -0,0 +1,892 @@ | |
+/* | |
+ * Neon-enhanced hScale | |
+ * | |
+ * based on the equivalent C code in libswscale | |
+ * | |
+ * This file is part of FFmpeg. | |
+ * | |
+ * FFmpeg is free software; you can redistribute it and/or | |
+ * modify it under the terms of the GNU Lesser General Public | |
+ * License as published by the Free Software Foundation; either | |
+ * version 2.1 of the License, or (at your option) any later version. | |
+ * | |
+ * FFmpeg is distributed in the hope that it will be useful, | |
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
+ * Lesser General Public License for more details. | |
+ * | |
+ * You should have received a copy of the GNU Lesser General Public | |
+ * License along with FFmpeg; if not, write to the Free Software | |
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | |
+ */ | |
+ | |
+//Arm reference | |
+//http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.dui0491c/CIHJBEFE.html | |
+#include <inttypes.h> | |
+ | |
+#include "config.h" | |
+#include "libswscale/swscale.h" | |
+#include "libswscale/swscale_internal.h" | |
+#include "libavutil/attributes.h" | |
+#include "libavutil/cpu.h" | |
+#include <arm_neon.h> | |
+#include <stdio.h> | |
+ | |
/* Prefetch the cache line ~256 bytes ahead of 'pointer' to hide memory
 * latency in the scaling loops; expands to nothing on other arches. */
#if ARCH_ARM
#define PLD_NEXT(pointer) \
    __asm__( "pld [%0, #256] \n\t" : : "r"(pointer));
#elif ARCH_AARCH64
#define PLD_NEXT(pointer) \
    __asm__( "prfm pldl1keep, %a0 \n\t" : : "p"(pointer + 256));
#else
#warning __FILE__ should be used with arm/aarch64
#define PLD_NEXT(pointer)
#endif

/* WARNING: deliberately expands to 'a' alone, discarding FFMIN's clamp
 * against 'b' (speed hack) — callers get NO saturation from this macro. */
#define FFMIN_ARM(a,b) a
+ | |
+// Optimized Filter Functions | |
+//*********************************************************************************************************************** | |
+//*********************************************************************************************************************** | |
+//*********************************************************************************************************************** | |
+ | |
+//Horizontal 3 tap filter 1080 -> 720p | |
+static void h3_1080p_1280x720_neon_bilinear(int16_t * restrict dst, int dstW, | |
+ const uint8_t * restrict src) | |
+ | |
+{ | |
+ | |
+ int16x8_t filter_a; | |
+ uint8x16_t in_8x16; | |
+ int srcPos; | |
+ int i = 0; | |
+ int left_val = 0; | |
+ int val = 0; | |
+ int val2 = 0; | |
+ int16x4_t unpack; | |
+ uint8x8_t in; | |
+ uint8x8_t lo; | |
+ uint8x8_t hi; | |
+ int16x4_t out_16x4; | |
+ int16x8_t src_unpack; | |
+ int32x4_t accum; | |
+ int32x2_t aggregate; | |
+ int16x4_t filter_lo; | |
+ int16x4_t filter_hi; | |
+ | |
+ //set up the bilinear filter | |
+ filter_a = vdupq_n_s16(0); | |
+ filter_a = vsetq_lane_s16(5461, filter_a, 0); | |
+ filter_a = vsetq_lane_s16(9103, filter_a, 1); | |
+ filter_a = vsetq_lane_s16(1820, filter_a, 2); | |
+ filter_a = vsetq_lane_s16(1820, filter_a, 5); | |
+ filter_a = vsetq_lane_s16(9103, filter_a, 6); | |
+ filter_a = vsetq_lane_s16(5461, filter_a, 7); | |
+ filter_lo = vget_low_s16(filter_a); | |
+ filter_hi = vget_high_s16(filter_a); | |
+ | |
+ //right edge | |
+ left_val = ((int) src[0]) * 10240; | |
+ left_val += ((int) src[1]) * 6144; | |
+ left_val >>= 7; | |
+ | |
+ //****************************************************************************************** | |
+ //****************************************************************************************** | |
+ //****************************************************************************************** | |
+ //this is WEIRD OFFSET LOADS but the unalignement get fine performance | |
+ srcPos = -5; | |
+ in_8x16 = vld1q_u8(&src[srcPos + 6]); //x1-x16 | |
+ | |
+ // middle values ---------------------------------------------------------------------------- | |
+ for (i = 0; i < dstW - 1; i += 4) { | |
+ | |
+ srcPos += 6; | |
+ lo = vget_low_u8(in_8x16); //x1-x8 | |
+ hi = vget_high_u8(in_8x16); //x9-x16 | |
+ in_8x16 = vld1q_u8(&src[srcPos + 6]); //x7-x23 for next loop iteration | |
+ | |
+ //--------------------------------------------------------------- | |
+ src_unpack = vmovl_u8(lo); | |
+ unpack = vget_low_s16(src_unpack); //promote to 16-bits | |
+ | |
+ accum = vmull_s16(filter_lo, unpack); | |
+ aggregate = vpadd_s32(vget_low_s32(accum), vget_high_s32(accum)); | |
+ aggregate = vpadd_s32(aggregate, aggregate); | |
+ val = vget_lane_s32(aggregate, 0) >> 7; | |
+ | |
+ accum = vmull_s16(filter_hi, unpack); | |
+ aggregate = vpadd_s32(vget_low_s32(accum), vget_high_s32(accum)); | |
+ aggregate = vpadd_s32(aggregate, aggregate); | |
+ val2 = vget_lane_s32(aggregate, 0) >> 7; | |
+ | |
+ //store the values | |
+ out_16x4 = vset_lane_s16(left_val, out_16x4, 0); | |
+ out_16x4 = vset_lane_s16(val, out_16x4, 1); | |
+ out_16x4 = vset_lane_s16(val2, out_16x4, 2); | |
+ | |
+ //next 2 values ------------------------------------------------------------------------- | |
+ in = vext_u8(lo, hi, 3); //x4-x11 | |
+ src_unpack = vmovl_u8(in); | |
+ unpack = vget_low_s16(src_unpack); //promote to 16-bits | |
+ | |
+ accum = vmull_s16(filter_lo, unpack); | |
+ aggregate = vpadd_s32(vget_low_s32(accum), vget_high_s32(accum)); | |
+ aggregate = vpadd_s32(aggregate, aggregate); | |
+ val = vget_lane_s32(aggregate, 0) >> 7; | |
+ | |
+ accum = vmull_s16(filter_hi, unpack); | |
+ aggregate = vpadd_s32(vget_low_s32(accum), vget_high_s32(accum)); | |
+ aggregate = vpadd_s32(aggregate, aggregate); | |
+ left_val = vget_lane_s32(aggregate, 0) >> 7; //for next loop iteration ... | |
+ | |
+ //store the values out | |
+ out_16x4 = vset_lane_s16(val, out_16x4, 3); | |
+ vst1_s16(&dst[i], out_16x4); //store out 64-bits | |
+ | |
+ } | |
+ | |
+ //****************************************************************************************** | |
+ //****************************************************************************************** | |
+ //****************************************************************************************** | |
+ //right edge ---------------------------------------------------------- | |
+ i = dstW - 3; | |
+ | |
+ //right edge | |
+ val = ((int) src[srcPos + 0]) * 5461; | |
+ val += ((int) src[srcPos + 1]) * 9103; | |
+ val += ((int) src[srcPos + 2]) * 1820; | |
+ dst[i] = val >> 7; | |
+ | |
+ val = ((int) src[srcPos + 1]) * 1820; | |
+ val += ((int) src[srcPos + 2]) * 9103; | |
+ val += ((int) src[srcPos + 3]) * 5461; | |
+ dst[i + 1] = val >> 7; | |
+ | |
+ val = ((int) src[srcPos + 3]) * 5461; | |
+ val += ((int) src[srcPos + 4]) * 10923; | |
+ dst[i + 2] = val >> 7; | |
+ | |
+} | |
+ | |
+//Horizontal 6 tap filter 1080 -> 720p | |
+static void h6_1080p_1280x720_neon_bicubic(int16_t * restrict dst, int dstW, | |
+ const uint8_t * restrict src, const int32_t *filterPos) | |
+{ | |
+ | |
+ int i; | |
+ int32_t val; | |
+ int32_t val2; | |
+ int srcPos; | |
+ int16x8_t filter_a; | |
+ int16x8_t filter_b; | |
+ uint8x16_t in_8x16 = vld1q_u8(src); | |
+ | |
+ //Left edge filter --------------------------------------------------- | |
+ val = ((int) src[0]) * 11014; | |
+ val += ((int) src[1]) * 6280; | |
+ val += ((int) src[2]) * -758; | |
+ val += ((int) src[3]) * -152; | |
+ dst[0] = val >> 7; | |
+ | |
+ val2 = ((int) src[0]) * -752; | |
+ val2 += ((int) src[1]) * 6223; | |
+ val2 += ((int) src[2]) * 10171; | |
+ val2 += ((int) src[3]) * 1554; | |
+ val2 += ((int) src[4]) * -812; | |
+ dst[1] = val2 >> 7; | |
+ | |
+ //middle values ----------------------------------------------------- | |
+ filter_a = vdupq_n_s16(0); | |
+ filter_b = vdupq_n_s16(0); | |
+ filter_a = vsetq_lane_s16(-819, filter_a, 1); | |
+ filter_a = vsetq_lane_s16(1567, filter_a, 2); | |
+ filter_a = vsetq_lane_s16(10266, filter_a, 3); | |
+ filter_a = vsetq_lane_s16(6280, filter_a, 4); | |
+ filter_a = vsetq_lane_s16(-758, filter_a, 5); | |
+ filter_a = vsetq_lane_s16(-152, filter_a, 6); | |
+ filter_b = vsetq_lane_s16(-152, filter_b, 2); | |
+ filter_b = vsetq_lane_s16(-758, filter_b, 3); | |
+ filter_b = vsetq_lane_s16(6280, filter_b, 4); | |
+ filter_b = vsetq_lane_s16(10266, filter_b, 5); | |
+ filter_b = vsetq_lane_s16(1567, filter_b, 6); | |
+ filter_b = vsetq_lane_s16(-819, filter_b, 7); | |
+ | |
+ srcPos = 0; | |
+ for (i = 0; i < dstW - 4; i += 4) { | |
+ | |
+ uint8x8_t in; | |
+ uint8x8_t lo; | |
+ uint8x8_t hi; | |
+ int16x4_t out_16x4; | |
+ int16x8_t src_unpack; | |
+ int32x4_t accum; | |
+ int32x2_t aggregate; | |
+ int32x4_t accum2; | |
+ int32x2_t aggregate2; | |
+ | |
+ //-------- loop 1 | |
+ lo = vget_low_u8(in_8x16); //x0-x7 | |
+ hi = vget_high_u8(in_8x16); //x8-x15 | |
+ in_8x16 = vld1q_u8(&src[srcPos + 6]); //x0-x15 | |
+ | |
+ //first filter | |
+ src_unpack = vmovl_u8(lo); | |
+ accum = vmull_s16(vget_low_s16(filter_a), vget_low_s16(src_unpack)); | |
+ accum = vmlal_s16(accum, vget_high_s16(filter_a), vget_high_s16(src_unpack)); | |
+ aggregate = vpadd_s32(vget_low_s32(accum), vget_high_s32(accum)); | |
+ aggregate = vpadd_s32(aggregate, aggregate); | |
+ val = vget_lane_s32(aggregate, 0) >> 7; | |
+ | |
+ //second filter | |
+ accum2 = vmull_s16(vget_low_s16(filter_b), vget_low_s16(src_unpack)); | |
+ accum2 = vmlal_s16(accum2, vget_high_s16(filter_b), vget_high_s16(src_unpack)); | |
+ aggregate2 = vpadd_s32(vget_low_s32(accum2), vget_high_s32(accum2)); | |
+ aggregate2 = vpadd_s32(aggregate2, aggregate2); | |
+ val2 = vget_lane_s32(aggregate2, 0) >> 7; | |
+ | |
+ //store the values | |
+ out_16x4 = vset_lane_s16(val, out_16x4, 0); | |
+ out_16x4 = vset_lane_s16(val2, out_16x4, 1); | |
+ | |
+ //-------- loop 2 | |
+ in = vext_u8(lo, hi, 3); //x3-x10 | |
+ src_unpack = vmovl_u8(in); | |
+ | |
+ //first filter | |
+ accum = vmull_s16(vget_low_s16(filter_a), vget_low_s16(src_unpack)); | |
+ accum = vmlal_s16(accum, vget_high_s16(filter_a), vget_high_s16(src_unpack)); | |
+ aggregate = vpadd_s32(vget_low_s32(accum), vget_high_s32(accum)); | |
+ aggregate = vpadd_s32(aggregate, aggregate); | |
+ val = vget_lane_s32(aggregate, 0) >> 7; | |
+ | |
+ //second filter | |
+ accum2 = vmull_s16(vget_low_s16(filter_b), vget_low_s16(src_unpack)); | |
+ accum2 = vmlal_s16(accum2, vget_high_s16(filter_b), vget_high_s16(src_unpack)); | |
+ aggregate2 = vpadd_s32(vget_low_s32(accum2), vget_high_s32(accum2)); | |
+ aggregate2 = vpadd_s32(aggregate2, aggregate2); | |
+ val2 = vget_lane_s32(aggregate2, 0) >> 7; | |
+ | |
+ //store the values | |
+ out_16x4 = vset_lane_s16(val, out_16x4, 2); | |
+ out_16x4 = vset_lane_s16(val2, out_16x4, 3); | |
+ vst1_s16(&dst[i + 2], out_16x4); | |
+ | |
+ //increment the input pointer | |
+ srcPos += 6; | |
+ } | |
+ | |
+ //right edge filter ----------------------------------------------------------- | |
+ i = dstW - 2; | |
+ srcPos = filterPos[i]; | |
+ val = ((int) src[srcPos + 1]) * -819; | |
+ val += ((int) src[srcPos + 2]) * 1567; | |
+ val += ((int) src[srcPos + 3]) * 10266; | |
+ val += ((int) src[srcPos + 4]) * 6280; | |
+ val += ((int) src[srcPos + 5]) * -910; | |
+ dst[i] = val >> 7; | |
+ | |
+ val2 = ((int) src[srcPos + 2]) * -152; | |
+ val2 += ((int) src[srcPos + 3]) * -758; | |
+ val2 += ((int) src[srcPos + 4]) * 6280; | |
+ val2 += ((int) src[srcPos + 5]) * 11014; | |
+ dst[i + 1] = val2 >> 7; | |
+ | |
+} | |
+ | |
+//Horizontal 8 tap filter 4k -> 1080p
+//Exact 2:1 horizontal downscale with a fixed symmetric 8-tap bicubic
+//kernel {-230, -692, 1972, 7142, 7142, 1972, -692, -230}; every kernel
+//below sums to 16384 (14-bit normalization). Each result is >>7 and
+//clipped to INT16_MAX, matching the generic hScale8To15 reference.
+static void h8_3840x2160_1920x1080_c_bicubic(int16_t * restrict dst, int dstW,
+        const uint8_t * restrict src, const int32_t *filterPos)
+{
+
+    int i;
+    int srcPos;
+    int val;
+
+    //left edge filters --------------------------------------------
+    //dst[0] and dst[1] use boundary-adjusted kernels: taps that would read
+    //left of the image are folded into the border samples (sums stay 16384).
+    val = ((int) src[0]) * 8192;
+    val += ((int) src[1]) * 7142;
+    val += ((int) src[2]) * 1972;
+    val += ((int) src[3]) * -692;
+    val += ((int) src[4]) * -230;
+    dst[0] = FFMIN(val >> 7, (1 << 15) - 1);
+
+    val = ((int) src[0]) * -922;
+    val += ((int) src[1]) * 1972;
+    val += ((int) src[2]) * 7142;
+    val += ((int) src[3]) * 7142;
+    val += ((int) src[4]) * 1972;
+    val += ((int) src[5]) * -692;
+    val += ((int) src[6]) * -230;
+    dst[1] = FFMIN(val >> 7, (1 << 15) - 1);
+
+    //middle values
+    //The kernel is symmetric, so mirrored samples are paired first to halve
+    //the multiplies; srcPos advances 2 input pixels per output (2:1 ratio).
+    srcPos = filterPos[2];
+    for (i = 2; i < dstW - 2; i++) {
+        val = (int) (((int) src[srcPos + 0]) + ((int) src[srcPos + 7])) * -230;
+        val += (int) (((int) src[srcPos + 1]) + ((int) src[srcPos + 6])) * -692;
+        val += (int) (((int) src[srcPos + 2]) + ((int) src[srcPos + 5])) * 1972;
+        val += (int) (((int) src[srcPos + 3]) + ((int) src[srcPos + 4])) * 7142;
+        dst[i] = FFMIN(val >> 7, (1 << 15) - 1);
+        srcPos += 2;
+    }
+
+    //right edge filter
+    //Mirror images of the two left-edge kernels for the last two outputs.
+    srcPos = filterPos[i];
+    val = ((int) src[srcPos + 1]) * -230;
+    val += ((int) src[srcPos + 2]) * -692;
+    val += ((int) src[srcPos + 3]) * 1972;
+    val += ((int) src[srcPos + 4]) * 7142;
+    val += ((int) src[srcPos + 5]) * 7142;
+    val += ((int) src[srcPos + 6]) * 1972;
+    val += ((int) src[srcPos + 7]) * -922;
+    dst[i] = FFMIN(val >> 7, (1 << 15) - 1);
+
+    val = ((int) src[srcPos + 3]) * -230;
+    val += ((int) src[srcPos + 4]) * -692;
+    val += ((int) src[srcPos + 5]) * 1972;
+    val += ((int) src[srcPos + 6]) * 7142;
+    val += ((int) src[srcPos + 7]) * 8192;
+    dst[i + 1] = FFMIN(val >> 7, (1 << 15) - 1);
+}
+ | |
+//Horizontal 9 tap filter 1080p -> 480p (1920 -> 852)
+//Generic per-output 9-tap horizontal scaler: unlike the other kernels in
+//this file it reads its coefficients from the swscale-generated 'filter'
+//array (stride 9). Taps 0-7 come from a full q-register, tap 8 from a
+//second vector with only lane 0 populated. The accumulator is >>7 and
+//clipped to INT16_MAX, matching the scalar hScale8To15 reference.
+static void h9_1080p_852x480_neon_bicubic(int16_t * restrict dst, int dstW,
+        const uint8_t * restrict src, const int16_t * restrict filter,
+        const int32_t *filterPos)
+{
+    int i;
+    int val;
+    int16x8_t zero_16x8 = vdupq_n_s16(0);
+    uint8x8_t lo;
+    uint8x8_t hi;
+    uint8x16_t in_8x16; /* was int8x16_t: vld1q_u8() returns uint8x16_t */
+    int16x8_t lo_unpack;
+    int16x8_t hi_unpack;
+    int32x4_t accum;
+    int32x2_t aggregate;
+    int16x8_t filter_a;
+    int16x8_t filter_b;
+    int j = 0; //running index into the flat filter array (9 taps per output)
+
+    //preload to hide load latency
+    in_8x16 = vld1q_u8(src);
+
+    for (i = 0; i < dstW; i++) {
+        /* Prefetch the source position of the NEXT output. Clamp the index
+         * so the final iteration does not read filterPos[dstW], one entry
+         * past the per-output data; the prefetched load is discarded on the
+         * last iteration anyway, so the clamp does not change any result. */
+        int srcPos = filterPos[i + 1 < dstW ? i + 1 : i];
+
+        //load filter values
+        filter_a = vld1q_s16(&filter[j]);
+        filter_b = vsetq_lane_s16(filter[j + 8], zero_16x8, 0);
+
+        lo = vget_low_u8(in_8x16); //x0-x7
+        hi = vget_high_u8(in_8x16); //x8-x15
+        in_8x16 = vld1q_u8(&src[srcPos]); //x0-x15 load for next loop iteration
+
+        /* widen u8 -> u16; pixel values fit in 15 bits, so reinterpreting
+         * as s16 for the signed multiplies below is lossless (previously
+         * relied on -flax-vector-conversions) */
+        lo_unpack = vreinterpretq_s16_u16(vmovl_u8(lo));
+        hi_unpack = vreinterpretq_s16_u16(vmovl_u8(hi));
+        accum = vmull_s16(vget_low_s16(filter_a), vget_low_s16(lo_unpack));
+        accum = vmlal_s16(accum, vget_high_s16(filter_a), vget_high_s16(lo_unpack));
+        accum = vmlal_s16(accum, vget_low_s16(filter_b), vget_low_s16(hi_unpack));
+        aggregate = vpadd_s32(vget_low_s32(accum), vget_high_s32(accum));
+        aggregate = vpadd_s32(aggregate, aggregate);
+        val = vget_lane_s32(aggregate, 0);
+        dst[i] = FFMIN_ARM(val >> 7, (1 << 15) - 1);
+        j += 9;
+    }
+}
+ | |
+ | |
+//Horizontal 11 tap filter //4K -> 720p and 1080p -> 480p (640x480) also 1080 -> 720p
+//3:1 horizontal downscale with a symmetric 11-tap bicubic kernel
+//{-243, -485, 0, 1901, 4288, 5462, 4288, 1901, 0, -485, -243} (sum 16384).
+//Mirrored taps are paired with vaddl_u8 so each output needs only one
+//vmull/vmlal pair; two outputs are produced per loop iteration.
+static void h11_1080p_640x480_neon_bicubic(int16_t * restrict dst, int dstW,
+        const uint8_t * restrict src, const int32_t *filterPos)
+{
+    int i;
+    int32_t val;
+    int srcPos;
+    uint8x16_t in_8x16 = vld1q_u8(src); //preload x0-x15 to hide load latency
+    uint8x8_t zero_8x8 = vdup_n_u8(0);
+    int16x8_t filter_a;
+
+    //left edge filters --------------------------------------------
+    //boundary-adjusted kernels for dst[0] / dst[1]; zero-valued taps are
+    //simply skipped (note src[4] resp. src[1]/src[7] have coefficient 0)
+    val = ((int) src[0]) * 5461;
+    val += ((int) src[1]) * 5462;
+    val += ((int) src[2]) * 4288;
+    val += ((int) src[3]) * 1901;
+    val += ((int) src[5]) * -485;
+    val += ((int) src[6]) * -243;
+    dst[0] = FFMIN(val >> 7, (1 << 15) - 1);
+
+    val = ((int) src[0]) * -728;
+    val += ((int) src[2]) * 1901;
+    val += ((int) src[3]) * 4288;
+    val += ((int) src[4]) * 5462;
+    val += ((int) src[5]) * 4288;
+    val += ((int) src[6]) * 1901;
+    val += ((int) src[8]) * -485;
+    val += ((int) src[9]) * -243;
+    dst[1] = FFMIN(val >> 7, (1 << 15) - 1);
+
+    //middle values --------------------------------------------
+    //lane k of filter_a multiplies the paired sum (x_k + x_(14-k)); only
+    //half the kernel is stored, the center tap lands on lane 7
+    filter_a = vdupq_n_s16(0);
+    filter_a = vsetq_lane_s16(-243, filter_a, 2);
+    filter_a = vsetq_lane_s16(-485, filter_a, 3);
+    filter_a = vsetq_lane_s16(1901, filter_a, 5);
+    filter_a = vsetq_lane_s16(4288, filter_a, 6);
+    filter_a = vsetq_lane_s16(5462, filter_a, 7);
+
+    srcPos = 0;
+    for (i = 0; i < dstW - 4; i += 2) {
+        uint8x8_t in_rev;
+        uint8x8_t lo;
+        uint8x8_t hi;
+        int32x4_t accum;
+        int32x2_t aggregate;
+        uint16x8_t tmp_16x8;
+        uint8x8_t tmp_8x8;
+
+        //load the 16 samples
+        lo = vget_low_u8(in_8x16); //x0-x7
+        hi = vget_high_u8(in_8x16); //x8-x15
+        in_8x16 = vld1q_u8(&src[srcPos + 6]); //preload for the next iteration (3:1 -> 6 pixels / 2 outputs)
+
+        //unroll 1
+        in_rev = vext_u8(zero_8x8, hi, 7); //{0, x8-x14}
+        in_rev = vrev64_u8(in_rev); //{x14-x8, 0}
+
+        tmp_16x8 = vaddl_u8(lo, in_rev); //x0+x14 | x1+x13 | ... | x6+x8 | x7
+        accum = vmull_s16(vget_low_s16(filter_a), vget_low_s16(tmp_16x8));
+        accum = vmlal_s16(accum, vget_high_s16(filter_a), vget_high_s16(tmp_16x8));
+        aggregate = vpadd_s32(vget_low_s32(accum), vget_high_s32(accum));
+        aggregate = vpadd_s32(aggregate, aggregate);
+        val = vget_lane_s32(aggregate, 0);
+        dst[i + 2] = FFMIN(val >> 7, (1 << 15) - 1);
+
+        //unroll 2
+        //shift the input over 3 bytes and do the same thing
+        tmp_8x8 = vext_u8(lo, hi, 3); // x3-x10
+        hi = vext_u8(hi, zero_8x8, 3); //x11-x15 (then zero padded)
+        lo = tmp_8x8;
+
+        in_rev = vext_u8(zero_8x8, hi, 7); //{0, x11-x15, 0, 0}
+        in_rev = vrev64_u8(in_rev); //{0, 0, x15-x11, 0}
+        tmp_16x8 = vaddl_u8(lo, in_rev);
+        accum = vmull_s16(vget_low_s16(filter_a), vget_low_s16(tmp_16x8));
+        accum = vmlal_s16(accum, vget_high_s16(filter_a), vget_high_s16(tmp_16x8));
+        aggregate = vpadd_s32(vget_low_s32(accum), vget_high_s32(accum));
+        aggregate = vpadd_s32(aggregate, aggregate);
+        val = vget_lane_s32(aggregate, 0);
+        dst[i + 3] = FFMIN(val >> 7, (1 << 15) - 1);
+        srcPos += 6;
+
+    }
+
+    //right edge values
+    //boundary-adjusted kernels for the last two outputs, scalar code
+    i = dstW - 2;
+    srcPos = filterPos[i];
+    val = ((int) src[srcPos + 1]) * -243;
+    val += ((int) src[srcPos + 2]) * -485;
+    val += ((int) src[srcPos + 4]) * 1901;
+    val += ((int) src[srcPos + 5]) * 4288;
+    val += ((int) src[srcPos + 6]) * 5462;
+    val += ((int) src[srcPos + 7]) * 4288;
+    val += ((int) src[srcPos + 8]) * 1901;
+    val += ((int) src[srcPos + 10]) * -728;
+    dst[i] = FFMIN(val >> 7, (1 << 15) - 1);
+
+    srcPos = filterPos[i + 1];
+    val = ((int) src[srcPos + 4]) * -243;
+    val += ((int) src[srcPos + 5]) * -485;
+    val += ((int) src[srcPos + 7]) * 1901;
+    val += ((int) src[srcPos + 8]) * 4288;
+    val += ((int) src[srcPos + 9]) * 5462;
+    val += ((int) src[srcPos + 10]) * 5461;
+    dst[i + 1] = FFMIN(val >> 7, (1 << 15) - 1);
+
+}
+ | |
+//Horizontal 11 tap filter // 1080p -> 480p (720x480) also 1080 -> 720p
+//8:3 horizontal downscale (1920 -> 720): every 8 input pixels produce 3
+//outputs, each with its own phase-specific 11-tap kernel. The three
+//kernels are split across register pairs (a/aa, b/bb, c/cc) so that each
+//output is one vmull plus vmlal chain over the 24 preloaded samples.
+//Results are >>7 and clipped to INT16_MAX like the scalar reference.
+//(The previously present variable 'j' was dead code and has been removed.)
+static void h11_1080p_720x480_neon_bicubic(int16_t * restrict dst, int dstW,
+        const uint8_t * restrict src, const int32_t * restrict filterPos)
+{
+
+    int i;
+    int val = 0;
+    int16x8_t filter_a;
+    int16x8_t filter_aa;
+    int16x8_t filter_b;
+    int16x8_t filter_bb;
+    int16x8_t filter_c;
+    int16x8_t filter_cc;
+    uint8x16_t in1_8x16;
+    uint8x16_t in2_8x16;
+
+    //left edge filter (boundary-adjusted scalar kernels for dst[0]/dst[1])
+    int srcPos = filterPos[0];
+    val = ((int) src[srcPos + 0]) * 6110;
+    val += ((int) src[srcPos + 1]) * 6081;
+    val += ((int) src[srcPos + 2]) * 4037;
+    val += ((int) src[srcPos + 3]) * 1022;
+    val += ((int) src[srcPos + 4]) * -456;
+    val += ((int) src[srcPos + 5]) * -397;
+    val += ((int) src[srcPos + 6]) * -13;
+    dst[0] = FFMIN_ARM(val >> 7, (1 << 15) - 1);
+
+    srcPos = filterPos[1];
+    val = ((int) src[srcPos + 0]) * -794;
+    val += ((int) src[srcPos + 1]) * 272;
+    val += ((int) src[srcPos + 2]) * 3017;
+    val += ((int) src[srcPos + 3]) * 5697;
+    val += ((int) src[srcPos + 4]) * 5697;
+    val += ((int) src[srcPos + 5]) * 3017;
+    val += ((int) src[srcPos + 6]) * 272;
+    val += ((int) src[srcPos + 7]) * -546;
+    val += ((int) src[srcPos + 8]) * -248;
+    dst[1] = FFMIN_ARM(val >> 7, (1 << 15) - 1);
+
+    //set the filter taps
+    //phase 0 kernel: filter_a covers samples x0-x7, filter_aa x8-x11
+    filter_a = vdupq_n_s16(0);
+    filter_a = vsetq_lane_s16(-13, filter_a, 1);
+    filter_a = vsetq_lane_s16(-395, filter_a, 2);
+    filter_a = vsetq_lane_s16(-457, filter_a, 3);
+    filter_a = vsetq_lane_s16(1013, filter_a, 4);
+    filter_a = vsetq_lane_s16(4026, filter_a, 5);
+    filter_a = vsetq_lane_s16(6079, filter_a, 6);
+    filter_a = vsetq_lane_s16(4969, filter_a, 7);
+    filter_aa = vdupq_n_s16(0);
+    filter_aa = vsetq_lane_s16(1978, filter_aa, 0);
+    filter_aa = vsetq_lane_s16(-199, filter_aa, 1);
+    filter_aa = vsetq_lane_s16(-511, filter_aa, 2);
+    filter_aa = vsetq_lane_s16(-106, filter_aa, 3);
+
+    //phase 1 kernel: filter_b covers samples x4-x11, filter_bb x12-x14
+    filter_b = vdupq_n_s16(0);
+    filter_b = vsetq_lane_s16(-105, filter_b, 0);
+    filter_b = vsetq_lane_s16(-509, filter_b, 1);
+    filter_b = vsetq_lane_s16(-203, filter_b, 2);
+    filter_b = vsetq_lane_s16(1964, filter_b, 3);
+    filter_b = vsetq_lane_s16(4958, filter_b, 4);
+    filter_b = vsetq_lane_s16(6081, filter_b, 5);
+    filter_b = vsetq_lane_s16(4039, filter_b, 6);
+    filter_b = vsetq_lane_s16(1025, filter_b, 7);
+    filter_bb = vdupq_n_s16(0);
+    filter_bb = vsetq_lane_s16(-451, filter_bb, 0);
+    filter_bb = vsetq_lane_s16(-396, filter_bb, 1);
+    filter_bb = vsetq_lane_s16(-14, filter_bb, 2);
+
+    //phase 2 kernel: filter_c covers x7 and x16-x17, filter_cc x8-x15
+    filter_c = vdupq_n_s16(0);
+    filter_c = vsetq_lane_s16(-248, filter_c, 3);
+    filter_c = vsetq_lane_s16(-249, filter_c, 4);
+    filter_cc = vdupq_n_s16(0);
+    filter_cc = vsetq_lane_s16(-546, filter_cc, 0);
+    filter_cc = vsetq_lane_s16(271, filter_cc, 1);
+    filter_cc = vsetq_lane_s16(3013, filter_cc, 2);
+    filter_cc = vsetq_lane_s16(5695, filter_cc, 3);
+    filter_cc = vsetq_lane_s16(5699, filter_cc, 4);
+    filter_cc = vsetq_lane_s16(3020, filter_cc, 5);
+    filter_cc = vsetq_lane_s16(274, filter_cc, 6);
+    filter_cc = vsetq_lane_s16(-545, filter_cc, 7);
+
+    //preloads
+    srcPos = 0;
+    in1_8x16 = vld1q_u8(&src[srcPos]); //x0->x15
+    in2_8x16 = vld1q_u8(&src[srcPos + 16]); //x16->x31
+
+    //NOTE(review): near the end of the loop the preload below reads up to
+    //~40 bytes past the last consumed source pixel; this relies on the
+    //caller's line padding — verify swscale's buffer guarantees.
+    for (i = 2; i < dstW - 3; i += 3) {
+        uint8x8_t lo;
+        uint8x8_t lo2;
+        uint8x8_t hi;
+        int16x8_t lo_unpack;
+        int16x8_t hi_unpack;
+        int16x8_t lo2_unpack;
+        int32x2_t aggregate;
+        int32x4_t accum;
+
+        //separate them
+        lo = vget_low_u8(in1_8x16); //x0-x7
+        hi = vget_high_u8(in1_8x16); //x8-x15
+        lo2 = vget_low_u8(in2_8x16); //x16-x23
+
+        //load for next loop iteration -< overlap here if we unroll the loop
+        //could get rid of the extra load.
+        in1_8x16 = vcombine_u8(hi, lo2); //next iteration's x0->x15 (== current x8-x23)
+        in2_8x16 = vld1q_u8(&src[srcPos + 24]); //next iteration's x16->x31
+
+        //unpack and promote
+        lo_unpack = vmovl_u8(lo); // x0-x7 -> sint16_t
+        hi_unpack = vmovl_u8(hi); // x8-x15 -> sint16_t
+        lo2_unpack = vmovl_u8(lo2); // x16-x23 -> sint16_t
+
+        //unroll 1: phase-0 output
+        accum = vmull_s16(vget_low_s16(filter_a), vget_low_s16(lo_unpack));
+        accum = vmlal_s16(accum, vget_high_s16(filter_a), vget_high_s16(lo_unpack));
+        accum = vmlal_s16(accum, vget_low_s16(filter_aa), vget_low_s16(hi_unpack));
+        aggregate = vpadd_s32(vget_low_s32(accum), vget_high_s32(accum));
+        aggregate = vpadd_s32(aggregate, aggregate);
+        val = vget_lane_s32(aggregate, 0);
+        dst[i] = FFMIN_ARM(val >> 7, (1 << 15) - 1);
+
+        //unroll 2: phase-1 output, shifted ~2.67 source pixels right
+        accum = vmull_s16(vget_low_s16(filter_b), vget_high_s16(lo_unpack));
+        accum = vmlal_s16(accum, vget_high_s16(filter_b), vget_low_s16(hi_unpack));
+        accum = vmlal_s16(accum, vget_low_s16(filter_bb), vget_high_s16(hi_unpack));
+        aggregate = vpadd_s32(vget_low_s32(accum), vget_high_s32(accum));
+        aggregate = vpadd_s32(aggregate, aggregate);
+        val = vget_lane_s32(aggregate, 0);
+        dst[i + 1] = FFMIN_ARM(val >> 7, (1 << 15) - 1);
+
+        //unroll 3: phase-2 output
+        accum = vmull_s16(vget_low_s16(filter_c), vget_high_s16(lo_unpack));
+        accum = vmlal_s16(accum, vget_low_s16(filter_cc), vget_low_s16(hi_unpack));
+        accum = vmlal_s16(accum, vget_high_s16(filter_cc), vget_high_s16(hi_unpack));
+        accum = vmlal_s16(accum, vget_high_s16(filter_c), vget_low_s16(lo2_unpack));
+        aggregate = vpadd_s32(vget_low_s32(accum), vget_high_s32(accum));
+        aggregate = vpadd_s32(aggregate, aggregate);
+        val = vget_lane_s32(aggregate, 0);
+        dst[i + 2] = FFMIN_ARM(val >> 7, (1 << 15) - 1);
+        srcPos += 8; //8 source pixels consumed per 3 outputs (8:3 ratio)
+    }
+
+    //right edge filters
+    //scalar boundary kernels; these also overwrite any partially-edge
+    //outputs written by the last vector iteration
+    i = dstW - 3;
+    srcPos = filterPos[i];
+    val = ((int) src[srcPos + 0]) * 104;
+    val += ((int) src[srcPos + 1]) * -509;
+    val += ((int) src[srcPos + 2]) * -206;
+    val += ((int) src[srcPos + 3]) * 1956;
+    val += ((int) src[srcPos + 4]) * 4951;
+    val += ((int) src[srcPos + 5]) * 6083;
+    val += ((int) src[srcPos + 6]) * 4047;
+    val += ((int) src[srcPos + 7]) * 1032;
+    val += ((int) src[srcPos + 8]) * -454;
+    val += ((int) src[srcPos + 9]) * -398;
+    val += ((int) src[srcPos + 10]) * -14;
+    dst[i] = FFMIN_ARM(val >> 7, (1 << 15) - 1);
+
+    i++;
+    srcPos = filterPos[i];
+    val = ((int) src[srcPos + 2]) * -246;
+    val += ((int) src[srcPos + 3]) * -547;
+    val += ((int) src[srcPos + 4]) * 266;
+    val += ((int) src[srcPos + 5]) * 3005;
+    val += ((int) src[srcPos + 6]) * 5691;
+    val += ((int) src[srcPos + 7]) * 5703;
+    val += ((int) src[srcPos + 8]) * 3028;
+    val += ((int) src[srcPos + 9]) * 279;
+    val += ((int) src[srcPos + 10]) * -795;
+    dst[i] = FFMIN_ARM(val >> 7, (1 << 15) - 1);
+    i++;
+
+    srcPos = filterPos[i];
+    val = ((int) src[srcPos + 4]) * -13;
+    val += ((int) src[srcPos + 5]) * -396;
+    val += ((int) src[srcPos + 6]) * -456;
+    val += ((int) src[srcPos + 7]) * 1017;
+    val += ((int) src[srcPos + 8]) * 4031;
+    val += ((int) src[srcPos + 9]) * 6080;
+    val += ((int) src[srcPos + 10]) * 6121;
+    dst[i] = FFMIN_ARM(val >> 7, (1 << 15) - 1);
+}
+ | |
+//Horizontal 11 tap filter 4k -> 720p
+//3840 -> 1280 is the same 3:1 ratio as 1920 -> 640, so this path simply
+//delegates to the shared 11-tap 3:1 kernel above.
+static void h11_3840x2160_1280x720_neon_bicubic(int16_t * restrict dst, int dstW,
+        const uint8_t * restrict src, const int32_t *filterPos)
+{
+    h11_1080p_640x480_neon_bicubic(dst, dstW, src, filterPos);
+}
+ | |
+//Horizontal 24 tap filter 4k -> 480p
+//6:1 horizontal downscale with a fixed symmetric 24-tap bicubic kernel
+//{-10, -77, -166, -232, -231, -115, 169, 657, 1259, 1870, 2381, 2687,
+// 2687, 2381, 1870, 1259, 657, 169, -115, -231, -232, -166, -77, -10};
+//all kernels below sum to 16384. Plain C (no NEON) version; results are
+//>>7 and clipped to INT16_MAX like the generic hScale8To15 reference.
+static void h24_3840x2160_640x480_c_bicubic(int16_t * restrict dst, int dstW,
+        const uint8_t * restrict src, const int32_t *filterPos)
+{
+    int i;
+    int val;
+    int srcPos;
+
+    //left edge filter
+    //dst[0] and dst[1] use boundary-adjusted kernels (out-of-image taps
+    //folded into the border samples; sums stay 16384)
+    val = ((int) src[0]) * 3124;
+    val += ((int) src[1]) * 2381;
+    val += ((int) src[2]) * 2687;
+    val += ((int) src[3]) * 2687;
+    val += ((int) src[4]) * 2381;
+    val += ((int) src[5]) * 1870;
+    val += ((int) src[6]) * 1259;
+    val += ((int) src[7]) * 657;
+    val += ((int) src[8]) * 169;
+    val += ((int) src[9]) * -115;
+    val += ((int) src[10]) * -231;
+    val += ((int) src[11]) * -232;
+    val += ((int) src[12]) * -166;
+    val += ((int) src[13]) * -77;
+    val += ((int) src[14]) * -10;
+    dst[0] = FFMIN(val >> 7, (1 << 15) - 1);
+
+    val = ((int) src[0]) * -485;
+    val += ((int) src[1]) * -231;
+    val += ((int) src[2]) * -115;
+    val += ((int) src[3]) * 169;
+    val += ((int) src[4]) * 657;
+    val += ((int) src[5]) * 1259;
+    val += ((int) src[6]) * 1870;
+    val += ((int) src[7]) * 2381;
+    val += ((int) src[8]) * 2687;
+    val += ((int) src[9]) * 2687;
+    val += ((int) src[10]) * 2381;
+    val += ((int) src[11]) * 1870;
+    val += ((int) src[12]) * 1259;
+    val += ((int) src[13]) * 657;
+    val += ((int) src[14]) * 169;
+    val += ((int) src[15]) * -115;
+    val += ((int) src[16]) * -231;
+    val += ((int) src[17]) * -232;
+    val += ((int) src[18]) * -166;
+    val += ((int) src[19]) * -77;
+    val += ((int) src[20]) * -10;
+    dst[1] = FFMIN(val >> 7, (1 << 15) - 1);
+
+    //middle values
+    //symmetric kernel: pair mirrored samples to halve the multiplies;
+    //srcPos advances 6 input pixels per output (6:1 ratio)
+    srcPos = filterPos[2];
+    for (i = 2; i < dstW - 2; i++) {
+        val = (int) (((int) src[srcPos + 0]) + ((int) src[srcPos + 23])) * -10;
+        val += (int) (((int) src[srcPos + 1]) + ((int) src[srcPos + 22])) * -77;
+        val += (int) (((int) src[srcPos + 2]) + ((int) src[srcPos + 21])) * -166;
+        val += (int) (((int) src[srcPos + 3]) + ((int) src[srcPos + 20])) * -232;
+        val += (int) (((int) src[srcPos + 4]) + ((int) src[srcPos + 19])) * -231;
+        val += (int) (((int) src[srcPos + 5]) + ((int) src[srcPos + 18])) * -115;
+        val += (int) (((int) src[srcPos + 6]) + ((int) src[srcPos + 17])) * 169;
+        val += (int) (((int) src[srcPos + 7]) + ((int) src[srcPos + 16])) * 657;
+        val += (int) (((int) src[srcPos + 8]) + ((int) src[srcPos + 15])) * 1259;
+        val += (int) (((int) src[srcPos + 9]) + ((int) src[srcPos + 14])) * 1870;
+        val += (int) (((int) src[srcPos + 10]) + ((int) src[srcPos + 13])) * 2381;
+        val += (int) (((int) src[srcPos + 11]) + ((int) src[srcPos + 12])) * 2687;
+        dst[i] = FFMIN(val >> 7, (1 << 15) - 1);
+        srcPos += 6;
+    }
+
+    //right edge filter
+    //mirror images of the two left-edge kernels for the last two outputs
+    srcPos = filterPos[i];
+    val = ((int) src[srcPos + 3]) * -10;
+    val += ((int) src[srcPos + 4]) * -77;
+    val += ((int) src[srcPos + 5]) * -166;
+    val += ((int) src[srcPos + 6]) * -232;
+    val += ((int) src[srcPos + 7]) * -231;
+    val += ((int) src[srcPos + 8]) * -115;
+    val += ((int) src[srcPos + 9]) * 169;
+    val += ((int) src[srcPos + 10]) * 657;
+    val += ((int) src[srcPos + 11]) * 1259;
+    val += ((int) src[srcPos + 12]) * 1870;
+    val += ((int) src[srcPos + 13]) * 2381;
+    val += ((int) src[srcPos + 14]) * 2687;
+    val += ((int) src[srcPos + 15]) * 2687;
+    val += ((int) src[srcPos + 16]) * 2381;
+    val += ((int) src[srcPos + 17]) * 1870;
+    val += ((int) src[srcPos + 18]) * 1259;
+    val += ((int) src[srcPos + 19]) * 657;
+    val += ((int) src[srcPos + 20]) * 169;
+    val += ((int) src[srcPos + 21]) * -115;
+    val += ((int) src[srcPos + 22]) * -231;
+    val += ((int) src[srcPos + 23]) * -485;
+    dst[i] = FFMIN(val >> 7, (1 << 15) - 1);
+
+    val = ((int) src[srcPos + 9]) * -10;
+    val += ((int) src[srcPos + 10]) * -77;
+    val += ((int) src[srcPos + 11]) * -166;
+    val += ((int) src[srcPos + 12]) * -232;
+    val += ((int) src[srcPos + 13]) * -231;
+    val += ((int) src[srcPos + 14]) * -115;
+    val += ((int) src[srcPos + 15]) * 169;
+    val += ((int) src[srcPos + 16]) * 657;
+    val += ((int) src[srcPos + 17]) * 1259;
+    val += ((int) src[srcPos + 18]) * 1870;
+    val += ((int) src[srcPos + 19]) * 2381;
+    val += ((int) src[srcPos + 20]) * 2687;
+    val += ((int) src[srcPos + 21]) * 2687;
+    val += ((int) src[srcPos + 22]) * 2381;
+    val += ((int) src[srcPos + 23]) * 3124;
+    dst[i + 1] = FFMIN(val >> 7, (1 << 15) - 1);
+}
+ | |
+//******************************************************************************************************************
+//******************************************************************************************************************
+//******************************************************************************************************************
+//Runtime dispatcher installed as hyScale/hcScale. When the (srcW, dstW,
+//filterSize) triple matches a known fixed-ratio configuration it calls the
+//corresponding hand-written kernel; otherwise it falls back to the generic
+//scalar loop (identical to the C reference hScale8To15).
+//NOTE(review): most fixed kernels ignore the 'filter' coefficients that
+//swscale computed and use hard-coded taps instead — this assumes the
+//generated coefficients for these exact geometries match the hard-coded
+//ones; verify against ff_init_filters output.
+static void hScale8To15_neon(SwsContext *c, int16_t *dst, int dstW, const uint8_t * src,
+        const int16_t * filter, const int32_t *filterPos, int filterSize)
+{
+
+    //use the original filter if we are not using the optimized bicubic methods
+    if (!(c->flags & SWS_BICUBIC)) {
+
+        //should maybe put a check for bilinear ....
+        //only one bilinear geometry is special-cased (1080p -> 720p, 3 taps);
+        //the halved dstW (640) is the 4:2:0 chroma plane of the same scale
+        if (((c->srcW == 1920 && dstW == 1280) || (c->srcW == 1920 && dstW == 640))
+                && (filterSize == 3) && (c->flags & SWS_BILINEAR)) {
+            h3_1080p_1280x720_neon_bilinear(dst, dstW, src);
+        } else {
+            //original code default
+            int i;
+            for (i = 0; i < dstW; i++) {
+                int j;
+                int srcPos = filterPos[i];
+                int val = 0;
+                for (j = 0; j < filterSize; j++) {
+                    val += ((int) src[srcPos + j]) * filter[filterSize * i + j];
+                }
+                dst[i] = FFMIN_ARM(val >> 7, (1 << 15) - 1);
+
+            }
+        }
+
+    } else { //use the bicubic optimized methods ----------------------------------------
+
+        //this will prefetch 2 chroma lines and 1 luma line
+        //(PLD_NEXT is defined elsewhere; 64-byte stride per prefetch hint)
+        for (int i = 0; i < c->srcW; i += 64)
+            PLD_NEXT(&src[i]);
+
+        // c->srcW is luma width : dstW can be either luma or chroma width
+        //(each pair of widths below is luma || 4:2:0 chroma of the same scale)
+        if (((c->srcW == 1920 && dstW == 1280) || (c->srcW == 1920 && dstW == 640))
+                && (filterSize == 6)) {
+            h6_1080p_1280x720_neon_bicubic(dst, dstW, src, filterPos);
+        } else if (((c->srcW == 1920 && dstW == 852) || (c->srcW == 1920 && dstW == 426))
+                && (filterSize == 9)) {
+            h9_1080p_852x480_neon_bicubic(dst, dstW, src, filter, filterPos);
+        } else if (((c->srcW == 3840 && dstW == 1280) || (c->srcW == 3840 && dstW == 640))
+                && (filterSize == 11)) {
+            h11_3840x2160_1280x720_neon_bicubic(dst, dstW, src, filterPos);
+        } else if (((c->srcW == 1920 && dstW == 640) || (c->srcW == 1920 && dstW == 320))
+                && (filterSize == 11)) {
+            h11_1080p_640x480_neon_bicubic(dst, dstW, src, filterPos);
+        } else if (((c->srcW == 1920 && dstW == 720) || (c->srcW == 1920 && dstW == 360))
+                && (filterSize == 11)) {
+            h11_1080p_720x480_neon_bicubic(dst, dstW, src, filterPos);
+        } else if (((c->srcW == 3840 && dstW == 640) || (c->srcW == 3840 && dstW == 320))
+                && (filterSize == 24)) {
+            h24_3840x2160_640x480_c_bicubic(dst, dstW, src, filterPos);
+        } else if (((c->srcW == 3840 && dstW == 1920) || (c->srcW == 3840 && dstW == 960))
+                && (filterSize == 8)) {
+            h8_3840x2160_1920x1080_c_bicubic(dst, dstW, src, filterPos);
+        } else {
+            //original code default (generic scalar fallback)
+            int i;
+            for (i = 0; i < dstW; i++) {
+                int j;
+                int srcPos = filterPos[i];
+                int val = 0;
+                for (j = 0; j < filterSize; j++) {
+                    val += ((int) src[srcPos + j]) * filter[filterSize * i + j];
+                }
+                dst[i] = FFMIN_ARM(val >> 7, (1 << 15) - 1);
+            }
+        }
+    } //else
+} //function
+ | |
+//Install the NEON horizontal scaler for the supported bit depths
+//(8-bit source, <=14-bit intermediate); otherwise leave the default
+//scalers in place and log why.
+av_cold void ff_sws_init_swscale_neon(SwsContext *c)
+{
+    if ((c->srcBpc == 8) && (c->dstBpc <= 14)) {
+        c->hyScale = c->hcScale = hScale8To15_neon;
+    } else {
+        /* a library must not write to stderr directly; route the
+         * diagnostic through FFmpeg's logging system instead */
+        av_log(c, AV_LOG_VERBOSE,
+               "Not using hScale8To15_neon c->srcBpc: %d, c->dstBpc:%d\n",
+               c->srcBpc, c->dstBpc);
+    }
+}
+ | |
diff --git a/libswscale/output.c b/libswscale/output.c | |
index 4b70626..7865751 100644 | |
--- a/libswscale/output.c | |
+++ b/libswscale/output.c | |
@@ -250,7 +250,7 @@ yuv2NBPS(14, LE, 0, 10, int16_t) | |
yuv2NBPS(16, BE, 1, 16, int32_t) | |
yuv2NBPS(16, LE, 0, 16, int32_t) | |
-static void yuv2planeX_8_c(const int16_t *filter, int filterSize, | |
+void yuv2planeX_8_c(const int16_t *filter, int filterSize, | |
const int16_t **src, uint8_t *dest, int dstW, | |
const uint8_t *dither, int offset) | |
{ | |
diff --git a/libswscale/swscale.c b/libswscale/swscale.c | |
index 1769348..a722957 100644 | |
--- a/libswscale/swscale.c | |
+++ b/libswscale/swscale.c | |
@@ -676,6 +676,9 @@ static int swscale(SwsContext *c, const uint8_t *src[], | |
* this array's tail */ | |
ff_sws_init_output_funcs(c, &yuv2plane1, &yuv2planeX, &yuv2nv12cX, | |
&yuv2packed1, &yuv2packed2, &yuv2packedX, &yuv2anyX); | |
+ if (HAVE_INTRINSICS_NEON) | |
+ ff_sws_init_output_funcs_neon(c, &yuv2planeX); | |
+ | |
use_mmx_vfilter= 0; | |
ff_init_vscale_pfn(c, yuv2plane1, yuv2planeX, yuv2nv12cX, | |
yuv2packed1, yuv2packed2, yuv2packedX, yuv2anyX, use_mmx_vfilter); | |
@@ -856,6 +859,9 @@ static av_cold void sws_init_swscale(SwsContext *c) | |
&c->yuv2nv12cX, &c->yuv2packed1, | |
&c->yuv2packed2, &c->yuv2packedX, &c->yuv2anyX); | |
+ if (HAVE_INTRINSICS_NEON) | |
+ ff_sws_init_output_funcs_neon(c, &c->yuv2planeX); | |
+ | |
ff_sws_init_input_funcs(c); | |
@@ -889,6 +895,8 @@ SwsFunc ff_getSwsFunc(SwsContext *c) | |
ff_sws_init_swscale_ppc(c); | |
if (ARCH_X86) | |
ff_sws_init_swscale_x86(c); | |
+ if (HAVE_INTRINSICS_NEON) | |
+ ff_sws_init_swscale_neon(c); | |
return swscale; | |
} | |
diff --git a/libswscale/swscale_internal.h b/libswscale/swscale_internal.h | |
index a53fdc4..c81f481 100644 | |
--- a/libswscale/swscale_internal.h | |
+++ b/libswscale/swscale_internal.h | |
@@ -117,6 +117,12 @@ typedef void (*yuv2planar1_fn)(const int16_t *src, uint8_t *dest, int dstW, | |
typedef void (*yuv2planarX_fn)(const int16_t *filter, int filterSize, | |
const int16_t **src, uint8_t *dest, int dstW, | |
const uint8_t *dither, int offset); | |
+/** | |
+ * Default/generic c-implementation of yuv2planarX_fn for 8bPS | |
+ */ | |
+void yuv2planeX_8_c(const int16_t *filter, int filterSize, const int16_t **src, | |
+ uint8_t *dest, int dstW, const uint8_t *dither, int offset); | |
+ | |
/** | |
* Write one line of horizontally scaled chroma to interleaved output | |
@@ -891,8 +897,10 @@ void ff_sws_init_output_funcs(SwsContext *c, | |
yuv2packed2_fn *yuv2packed2, | |
yuv2packedX_fn *yuv2packedX, | |
yuv2anyX_fn *yuv2anyX); | |
+void ff_sws_init_output_funcs_neon(SwsContext *c, yuv2planarX_fn *yuv2planeX); | |
void ff_sws_init_swscale_ppc(SwsContext *c); | |
void ff_sws_init_swscale_x86(SwsContext *c); | |
+void ff_sws_init_swscale_neon(SwsContext *c); | |
void ff_hyscale_fast_c(SwsContext *c, int16_t *dst, int dstWidth, | |
const uint8_t *src, int srcW, int xInc); | |
-- | |
1.7.9.5 | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment