@robin-raymond
Created June 29, 2017 20:41
diff --git a/.gitignore b/.gitignore
index b0e9574..5360877 100644
--- a/.gitignore
+++ b/.gitignore
@@ -33,3 +33,6 @@ source/*.o
# Files generated by perf
perf.data
perf.data.old
+*.vcxproj.filters
+*.vcxproj
+libyuv*.sln
\ No newline at end of file
diff --git a/BUILD.gn b/BUILD.gn
index 4f56cdc..1c02a46 100644
--- a/BUILD.gn
+++ b/BUILD.gn
@@ -40,9 +40,15 @@ group("libyuv") {
public_configs = [ ":libyuv_config" ]
if (is_win && target_cpu == "x64") {
- public_deps = [
- ":libyuv_internal(//build/toolchain/win:clang_x64)",
- ]
+ if (is_winuwp) {
+ public_deps = [
+ ":libyuv_internal",
+ ]
+ } else {
+ public_deps = [
+ ":libyuv_internal(//build/toolchain/win:clang_x64)",
+ ]
+ }
} else {
public_deps = [
":libyuv_internal",
@@ -119,6 +125,9 @@ static_library("libyuv_internal") {
defines += [ "HAVE_JPEG" ]
deps += [ "//third_party:jpeg" ]
}
+ if (is_winuwp) {
+ deps += [ "//third_party/winuwp_compat:force_include_std" ]
+ }
if (libyuv_use_neon) {
deps += [ ":libyuv_neon" ]
@@ -257,6 +266,9 @@ if (libyuv_include_tests) {
if (is_android) {
deps += [ "//testing/android/native_test:native_test_native_code" ]
}
+ if (is_winuwp) {
+ deps += [ "//third_party/winuwp_compat:force_include_std" ]
+ }
# TODO(YangZhang): These lines can be removed when high accuracy
# YUV to RGB to Neon is ported.
@@ -287,6 +299,9 @@ if (libyuv_include_tests) {
if (is_linux) {
cflags = [ "-fexceptions" ]
}
+ if (is_winuwp) {
+ deps += [ "//third_party/winuwp_compat:wrap_main_utf8_cc" ]
+ }
}
executable("convert") {
@@ -300,6 +315,9 @@ if (libyuv_include_tests) {
if (is_linux) {
cflags = [ "-fexceptions" ]
}
+ if (is_winuwp) {
+ deps += [ "//third_party/winuwp_compat:wrap_main_utf8_cc" ]
+ }
}
executable("psnr") {
@@ -316,6 +334,9 @@ if (libyuv_include_tests) {
if (!is_ios && !libyuv_disable_jpeg) {
defines = [ "HAVE_JPEG" ]
}
+ if (is_winuwp) {
+ deps += [ "//third_party/winuwp_compat:wrap_main_utf8_cc" ]
+ }
}
executable("cpuid") {
diff --git a/armasm_ms.config b/armasm_ms.config
new file mode 100644
index 0000000..b617231
--- /dev/null
+++ b/armasm_ms.config
@@ -0,0 +1 @@
+-I src -oldit
diff --git a/include/libyuv/row.h b/include/libyuv/row.h
index 3e5dd20..a1e4722 100644
--- a/include/libyuv/row.h
+++ b/include/libyuv/row.h
@@ -625,7 +625,7 @@ extern const struct YuvConstants SIMD_ALIGNED(kYvuH709Constants); // BT.709
#op " $" #sel ",%%" #reg "," #offset "(%" #base ",%" #index "," #scale ")\n"
#endif // defined(__native_client__) && defined(__x86_64__)
-#if defined(__arm__) || defined(__aarch64__)
+#if defined(__arm__) || defined(__aarch64__) || defined(_M_ARM)
#undef MEMACCESS
#if defined(__native_client__)
#define MEMACCESS(base) ".p2align 3\nbic %" #base ", #0xc0000000\n"
diff --git a/libyuv.gyp b/libyuv.gyp
index f73a1a4..6c72007 100644
--- a/libyuv.gyp
+++ b/libyuv.gyp
@@ -30,7 +30,7 @@
'build_neon': 0,
'build_msa': 0,
'conditions': [
- ['(target_arch == "armv7" or target_arch == "armv7s" or \
+ ['(OS_RUNTIME=="winuwp" and (winuwp_platform=="win_phone" or winuwp_platform=="win10_arm")) or (target_arch == "armv7" or target_arch == "armv7s" or \
(target_arch == "arm" and arm_version >= 7) or target_arch == "arm64")\
and (arm_neon == 1 or arm_neon_optional == 1)', {
'build_neon': 1,
@@ -85,6 +85,35 @@
# '-marm', # arm32 not thumb
],
}],
+ ['OS_RUNTIME=="winuwp" and (winuwp_platform=="win_phone" or winuwp_platform=="win10_arm")', {
+ 'defines': [
+ 'WINUWP',
+ '__ARM_NEON__',
+ ],
+ 'sources': [
+ # sources.
+ 'source/arm_asm_macros.in',
+ 'source/compare_neon.asm',
+ 'source/rotate_neon.asm',
+ 'source/scale_neon.asm',
+ 'source/row_neon.asm'
+ ],
+ 'sources!': [
+ # sources.
+ 'source/compare_neon.cc',
+ 'source/compare_neon64.cc',
+ 'source/rotate_neon.cc',
+ 'source/rotate_neon64.cc',
+ 'source/row_neon.cc',
+ 'source/row_neon64.cc',
+ 'source/scale_neon.cc',
+ 'source/scale_neon64.cc',
+ ],
+ }],
+ ],
+ 'include_dirs': [
+ 'include',
+ '.',
],
}],
['build_msa != 0', {
diff --git a/libyuv_test.gyp b/libyuv_test.gyp
index 88860f5..abb3c89 100644
--- a/libyuv_test.gyp
+++ b/libyuv_test.gyp
@@ -18,8 +18,8 @@
'type': '<(gtest_target_type)',
'dependencies': [
'libyuv.gyp:libyuv',
- 'testing/gtest.gyp:gtest',
- 'third_party/gflags/gflags.gyp:gflags',
+ '<(DEPTH)/testing/gtest.gyp:gtest',
+ '<(DEPTH)/third_party/gflags/gflags.gyp:gflags',
],
'direct_dependent_settings': {
'defines': [
@@ -49,6 +49,11 @@
'unit_test/video_common_test.cc',
],
'conditions': [
+ ['OS=="win" and OS_RUNTIME=="winuwp"', {
+ 'defines': [
+ 'WINUWP',
+ ],
+ }],
['OS=="linux"', {
'cflags': [
'-fexceptions',
@@ -83,6 +88,7 @@
[ '(target_arch == "armv7" or target_arch == "armv7s" \
or (target_arch == "arm" and arm_version >= 7) \
or target_arch == "arm64") \
+ or winuwp_platform=="win_phone" or winuwp_platform=="win10_arm" \
and (arm_neon == 1 or arm_neon_optional == 1)', {
'defines': [
'LIBYUV_NEON'
@@ -185,7 +191,7 @@
'input_shlib_path': '<(SHARED_LIB_DIR)/(SHARED_LIB_PREFIX)libyuv_unittest<(SHARED_LIB_SUFFIX)',
},
'includes': [
- 'build/apk_test.gypi',
+ # 'build/apk_test.gypi',
],
'dependencies': [
'libyuv_unittest',
diff --git a/source/arm_asm_macros.in b/source/arm_asm_macros.in
new file mode 100644
index 0000000..eb54c4b
--- /dev/null
+++ b/source/arm_asm_macros.in
@@ -0,0 +1,22 @@
+;
+; Copyright 2012 The LibYuv Project Authors. All rights reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+ AREA |.text|, CODE, READONLY
+
+ MACRO
+ MEMACCESS $base
+ ; Assembler counterpart of the MEMACCESS macro defined in row.h.
+ ; Currently the assembler source files are used only for Windows Phone (MS armasm compiler),
+ ; so this macro is empty; it exists only for source compatibility.
+ ; Eventually the asm source files should be used for all platforms, and some of them may then
+ ; require a real implementation. See the MEMACCESS define in row.h for details.
+ MEND
+
+ END
diff --git a/source/compare_neon.asm b/source/compare_neon.asm
new file mode 100644
index 0000000..85251f1
--- /dev/null
+++ b/source/compare_neon.asm
@@ -0,0 +1,56 @@
+;
+; Copyright 2012 The LibYuv Project Authors. All rights reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+ AREA |.text|, CODE, READONLY, ALIGN=2
+
+ GET source/arm_asm_macros.in
+
+ EXPORT SumSquareError_NEON
+
+SumSquareError_NEON PROC
+ ; input
+ ; r0 = uint8* src_a
+ ; r1 = uint8* src_b
+ ; r2 = int count
+ ; output
+ ; r0 = int
+ vpush {q0, q1, q2, q3}
+ vpush {q8, q9, q10, q11}
+
+ vmov.u8 q8, #0
+ vmov.u8 q10, #0
+ vmov.u8 q9, #0
+ vmov.u8 q11, #0
+loop
+ MEMACCESS 0
+ vld1.8 {q0}, [r0]!
+ MEMACCESS 1
+ vld1.8 {q1}, [r1]!
+ subs r2, r2, #16
+ vsubl.u8 q2, d0, d2
+ vsubl.u8 q3, d1, d3
+ vmlal.s16 q8, d4, d4
+ vmlal.s16 q9, d6, d6
+ vmlal.s16 q10, d5, d5
+ vmlal.s16 q11, d7, d7
+ bgt loop
+
+ vadd.u32 q8, q8, q9
+ vadd.u32 q10, q10, q11
+ vadd.u32 q11, q8, q10
+ vpaddl.u32 q1, q11
+ vadd.u64 d0, d2, d3
+ vmov.32 r0, d0[0]
+ vpop {q8, q9, q10, q11}
+ vpop {q0, q1, q2, q3}
+ bx lr
+ ENDP
+
+ END
diff --git a/source/compare_neon.cc b/source/compare_neon.cc
index 49aa3b4..3148260 100644
--- a/source/compare_neon.cc
+++ b/source/compare_neon.cc
@@ -21,6 +21,15 @@ extern "C" {
#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \
!defined(__aarch64__)
+ /* !!! IMPORTANT: The following method has been ported to pure assembler in compare_neon.asm,
+ * because MS Visual Studio doesn't support inline assembly for ARM.
+ *
+ * ANY CHANGE TO THIS METHOD'S IMPLEMENTATION MUST ALSO BE MADE IN compare_neon.asm.
+ *
+ * Eventually, only the pure assembler implementation should be used on all platforms
+ * to avoid code duplication.
+ */
+
uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count) {
volatile uint32 sse;
asm volatile (
diff --git a/source/cpu_id.cc b/source/cpu_id.cc
index afb5d28..fb462be 100644
--- a/source/cpu_id.cc
+++ b/source/cpu_id.cc
@@ -29,6 +29,10 @@
#include "libyuv/basic_types.h" // For CPU_X86
+#if defined(WINUWP) && defined(_M_ARM)
+ #include <windows.h>
+#endif
+
#ifdef __cplusplus
namespace libyuv {
extern "C" {
@@ -317,6 +321,13 @@ LIBYUV_API SAFEBUFFERS int InitCpuFlags(void) {
cpu_info &= ~kCpuHasNEON;
}
#endif // __arm__
+#if defined(WINUWP) && defined(_M_ARM)
+ // Windows Runtime on ARM
+ if (IsProcessorFeaturePresent(PF_ARM_NEON_INSTRUCTIONS_AVAILABLE))
+ {
+   cpu_info |= kCpuHasNEON;
+ }
+#endif
if (TestEnv("LIBYUV_DISABLE_ASM")) {
cpu_info = 0;
}
diff --git a/source/rotate_neon.asm b/source/rotate_neon.asm
new file mode 100644
index 0000000..4d53eed
--- /dev/null
+++ b/source/rotate_neon.asm
@@ -0,0 +1,522 @@
+;
+; Copyright 2012 The LibYuv Project Authors. All rights reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+ AREA |.text|, CODE, READONLY, ALIGN=2
+
+ GET source/arm_asm_macros.in
+
+ EXPORT TransposeWx8_NEON
+ EXPORT TransposeUVWx8_NEON
+
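+; vtbl index tables used by the 4x8 residual paths of the transpose routines below.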
+kVTbl4x4Transpose DCB 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15
+kVTbl4x4TransposeDi DCB 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15
+
+TransposeWx8_NEON PROC
+ ; input
+ ; r0 = uint8* src
+ ; r1 = int src_stride
+ ; r2 = uint8* dst
+ ; r3 = int dst_stride
+
+ push {r4-r6}
+ ldr r4, [sp, #12] ; load parameter int width
+ adr R6, kVTbl4x4Transpose
+ vpush {q0, q1, q2, q3}
+
+ ; loops are on blocks of 8. loop will stop when
+ ; counter gets to or below 0. starting the counter
+ ; at w-8 allows for this
+ sub r4, #8
+
+ ; handle 8x8 blocks. this should be the majority of the plane
+1
+ mov r5, r0
+
+ MEMACCESS 0
+ vld1.8 {d0}, [r5], r1
+ MEMACCESS 0
+ vld1.8 {d1}, [r5], r1
+ MEMACCESS 0
+ vld1.8 {d2}, [r5], r1
+ MEMACCESS 0
+ vld1.8 {d3}, [r5], r1
+ MEMACCESS 0
+ vld1.8 {d4}, [r5], r1
+ MEMACCESS 0
+ vld1.8 {d5}, [r5], r1
+ MEMACCESS 0
+ vld1.8 {d6}, [r5], r1
+ MEMACCESS 0
+ vld1.8 {d7}, [r5]
+
+ vtrn.8 d1, d0
+ vtrn.8 d3, d2
+ vtrn.8 d5, d4
+ vtrn.8 d7, d6
+
+ vtrn.16 d1, d3
+ vtrn.16 d0, d2
+ vtrn.16 d5, d7
+ vtrn.16 d4, d6
+
+ vtrn.32 d1, d5
+ vtrn.32 d0, d4
+ vtrn.32 d3, d7
+ vtrn.32 d2, d6
+
+ vrev16.8 q0, q0
+ vrev16.8 q1, q1
+ vrev16.8 q2, q2
+ vrev16.8 q3, q3
+
+ mov r5, r2
+
+ MEMACCESS 0
+ vst1.8 {d1}, [r5], r3
+ MEMACCESS 0
+ vst1.8 {d0}, [r5], r3
+ MEMACCESS 0
+ vst1.8 {d3}, [r5], r3
+ MEMACCESS 0
+ vst1.8 {d2}, [r5], r3
+ MEMACCESS 0
+ vst1.8 {d5}, [r5], r3
+ MEMACCESS 0
+ vst1.8 {d4}, [r5], r3
+ MEMACCESS 0
+ vst1.8 {d7}, [r5], r3
+ MEMACCESS 0
+ vst1.8 {d6}, [r5]
+
+ add r0, #8 ; src += 8
+ add r2, r2, r3, lsl #3 ; dst += 8 * dst_stride
+ subs r4, #8 ; -= 8
+ bge %b1
+
+ ; add 8 back to counter. if the result is 0 there are
+ ; no residuals.
+ adds r4, #8
+ beq %f4
+
+ ; some residual, so between 1 and 7 lines left to transpose
+ cmp r4, #2
+ blt %f3
+
+ cmp r4, #4
+ blt %f2
+
+ ; 4x8 block
+ mov r5, r0
+ MEMACCESS 0
+ vld1.32 {d0[0]}, [r5], r1
+ MEMACCESS 0
+ vld1.32 {d0[1]}, [r5], r1
+ MEMACCESS 0
+ vld1.32 {d1[0]}, [r5], r1
+ MEMACCESS 0
+ vld1.32 {d1[1]}, [r5], r1
+ MEMACCESS 0
+ vld1.32 {d2[0]}, [r5], r1
+ MEMACCESS 0
+ vld1.32 {d2[1]}, [r5], r1
+ MEMACCESS 0
+ vld1.32 {d3[0]}, [r5], r1
+ MEMACCESS 0
+ vld1.32 {d3[1]}, [r5]
+
+ mov r5, r2
+
+ MEMACCESS 6
+ vld1.8 {q3}, [r6]
+
+ vtbl.8 d4, {d0, d1}, d6
+ vtbl.8 d5, {d0, d1}, d7
+ vtbl.8 d0, {d2, d3}, d6
+ vtbl.8 d1, {d2, d3}, d7
+
+ ; TODO(frkoenig): Rework shuffle above to
+ ; write out with 4 instead of 8 writes.
+ MEMACCESS 0
+ vst1.32 {d4[0]}, [r5], r3
+ MEMACCESS 0
+ vst1.32 {d4[1]}, [r5], r3
+ MEMACCESS 0
+ vst1.32 {d5[0]}, [r5], r3
+ MEMACCESS 0
+ vst1.32 {d5[1]}, [r5]
+
+ add r5, r2, #4
+ MEMACCESS 0
+ vst1.32 {d0[0]}, [r5], r3
+ MEMACCESS 0
+ vst1.32 {d0[1]}, [r5], r3
+ MEMACCESS 0
+ vst1.32 {d1[0]}, [r5], r3
+ MEMACCESS 0
+ vst1.32 {d1[1]}, [r5]
+
+ add r0, #4 ; src += 4
+ add r2, r2, r3, lsl #2 ; dst += 4 * dst_stride
+ subs r4, #4 ; w -= 4
+ beq %f4
+
+ ; some residual, check to see if it includes a 2x8 block,
+ ; or less
+ cmp r4, #2
+ blt %f3
+
+ ; 2x8 block
+2
+ mov r5, r0
+ MEMACCESS 0
+ vld1.16 {d0[0]}, [r5], r1
+ MEMACCESS 0
+ vld1.16 {d1[0]}, [r5], r1
+ MEMACCESS 0
+ vld1.16 {d0[1]}, [r5], r1
+ MEMACCESS 0
+ vld1.16 {d1[1]}, [r5], r1
+ MEMACCESS 0
+ vld1.16 {d0[2]}, [r5], r1
+ MEMACCESS 0
+ vld1.16 {d1[2]}, [r5], r1
+ MEMACCESS 0
+ vld1.16 {d0[3]}, [r5], r1
+ MEMACCESS 0
+ vld1.16 {d1[3]}, [r5]
+
+ vtrn.8 d0, d1
+
+ mov r5, r2
+
+ MEMACCESS 0
+ vst1.64 {d0}, [r5], r3
+ MEMACCESS 0
+ vst1.64 {d1}, [r5]
+
+ add r0, #2 ; src += 2
+ add r2, r2, r3, lsl #1 ; dst += 2 * dst_stride
+ subs r4, #2 ; w -= 2
+ beq %f4
+
+ ; 1x8 block
+3
+ MEMACCESS 1
+ vld1.8 {d0[0]}, [r0], r1
+ MEMACCESS 1
+ vld1.8 {d0[1]}, [r0], r1
+ MEMACCESS 1
+ vld1.8 {d0[2]}, [r0], r1
+ MEMACCESS 1
+ vld1.8 {d0[3]}, [r0], r1
+ MEMACCESS 1
+ vld1.8 {d0[4]}, [r0], r1
+ MEMACCESS 1
+ vld1.8 {d0[5]}, [r0], r1
+ MEMACCESS 1
+ vld1.8 {d0[6]}, [r0], r1
+ MEMACCESS 1
+ vld1.8 {d0[7]}, [r0]
+
+ MEMACCESS 3
+ vst1.64 {d0}, [r2]
+
+4
+ vpop {q0, q1, q2, q3}
+ pop {r4-r6}
+ bx lr
+ ENDP
+
+TransposeUVWx8_NEON PROC
+ ; input
+ ; r0 = uint8* src
+ ; r1 = int src_stride
+ ; r2 = uint8* dst_a
+ ; r3 = int dst_stride_a
+ push {r4-r8}
+ ldr r5, [sp, #20] ; load uint8* dst_b
+ ldr r6, [sp, #24] ; int dst_stride_b
+ ldr r7, [sp, #28] ; int width
+ adr R8, kVTbl4x4TransposeDi
+ vpush {q0, q1, q2, q3}
+ vpush {q8, q9, q10, q11}
+
+ ; loops are on blocks of 8. loop will stop when
+ ; counter gets to or below 0. starting the counter
+ ; at w-8 allows for this
+ sub r7, #8
+
+ ; handle 8x8 blocks. this should be the majority of the plane
+1
+ mov r4, r0
+
+ MEMACCESS 0
+ vld2.8 {d0, d1}, [r4], r1
+ MEMACCESS 0
+ vld2.8 {d2, d3}, [r4], r1
+ MEMACCESS 0
+ vld2.8 {d4, d5}, [r4], r1
+ MEMACCESS 0
+ vld2.8 {d6, d7}, [r4], r1
+ MEMACCESS 0
+ vld2.8 {d16, d17}, [r4], r1
+ MEMACCESS 0
+ vld2.8 {d18, d19}, [r4], r1
+ MEMACCESS 0
+ vld2.8 {d20, d21}, [r4], r1
+ MEMACCESS 0
+ vld2.8 {d22, d23}, [r4]
+
+ vtrn.8 q1, q0
+ vtrn.8 q3, q2
+ vtrn.8 q9, q8
+ vtrn.8 q11, q10
+
+ vtrn.16 q1, q3
+ vtrn.16 q0, q2
+ vtrn.16 q9, q11
+ vtrn.16 q8, q10
+
+ vtrn.32 q1, q9
+ vtrn.32 q0, q8
+ vtrn.32 q3, q11
+ vtrn.32 q2, q10
+
+ vrev16.8 q0, q0
+ vrev16.8 q1, q1
+ vrev16.8 q2, q2
+ vrev16.8 q3, q3
+ vrev16.8 q8, q8
+ vrev16.8 q9, q9
+ vrev16.8 q10, q10
+ vrev16.8 q11, q11
+
+ mov r4, r2
+
+ MEMACCESS 0
+ vst1.8 {d2}, [r4], r3
+ MEMACCESS 0
+ vst1.8 {d0}, [r4], r3
+ MEMACCESS 0
+ vst1.8 {d6}, [r4], r3
+ MEMACCESS 0
+ vst1.8 {d4}, [r4], r3
+ MEMACCESS 0
+ vst1.8 {d18}, [r4], r3
+ MEMACCESS 0
+ vst1.8 {d16}, [r4], r3
+ MEMACCESS 0
+ vst1.8 {d22}, [r4], r3
+ MEMACCESS 0
+ vst1.8 {d20}, [r4]
+
+ mov r4, r5
+
+ MEMACCESS 0
+ vst1.8 {d3}, [r4], r6
+ MEMACCESS 0
+ vst1.8 {d1}, [r4], r6
+ MEMACCESS 0
+ vst1.8 {d7}, [r4], r6
+ MEMACCESS 0
+ vst1.8 {d5}, [r4], r6
+ MEMACCESS 0
+ vst1.8 {d19}, [r4], r6
+ MEMACCESS 0
+ vst1.8 {d17}, [r4], r6
+ MEMACCESS 0
+ vst1.8 {d23}, [r4], r6
+ MEMACCESS 0
+ vst1.8 {d21}, [r4]
+
+ add r0, #8*2 ; src += 8*2
+ add r2, r2, r3, lsl #3 ; dst_a += 8 * dst_stride_a
+ add r5, r5, r6, lsl #3 ; dst_b += 8 * dst_stride_b
+ subs r7, #8 ; w -= 8
+ bge %b1
+
+ ; add 8 back to counter. if the result is 0 there are
+ ; no residuals.
+ adds r7, #8
+ beq %f4
+
+ ; some residual, so between 1 and 7 lines left to transpose
+ cmp r7, #2
+ blt %f3
+
+ cmp r7, #4
+ blt %f2
+
+ ; TODO(frkoenig): Clean this up
+ ; 4x8 block
+ mov r4, r0
+ MEMACCESS 0
+ vld1.64 {d0}, [r4], r1
+ MEMACCESS 0
+ vld1.64 {d1}, [r4], r1
+ MEMACCESS 0
+ vld1.64 {d2}, [r4], r1
+ MEMACCESS 0
+ vld1.64 {d3}, [r4], r1
+ MEMACCESS 0
+ vld1.64 {d4}, [r4], r1
+ MEMACCESS 0
+ vld1.64 {d5}, [r4], r1
+ MEMACCESS 0
+ vld1.64 {d6}, [r4], r1
+ MEMACCESS 0
+ vld1.64 {d7}, [r4]
+
+ MEMACCESS 8
+ vld1.8 {q15}, [r8]
+
+ vtrn.8 q0, q1
+ vtrn.8 q2, q3
+
+ vtbl.8 d16, {d0, d1}, d30
+ vtbl.8 d17, {d0, d1}, d31
+ vtbl.8 d18, {d2, d3}, d30
+ vtbl.8 d19, {d2, d3}, d31
+ vtbl.8 d20, {d4, d5}, d30
+ vtbl.8 d21, {d4, d5}, d31
+ vtbl.8 d22, {d6, d7}, d30
+ vtbl.8 d23, {d6, d7}, d31
+
+ mov r4, r2
+
+ MEMACCESS 0
+ vst1.32 {d16[0]}, [r4], r3
+ MEMACCESS 0
+ vst1.32 {d16[1]}, [r4], r3
+ MEMACCESS 0
+ vst1.32 {d17[0]}, [r4], r3
+ MEMACCESS 0
+ vst1.32 {d17[1]}, [r4], r3
+
+ add r4, r2, #4
+ MEMACCESS 0
+ vst1.32 {d20[0]}, [r4], r3
+ MEMACCESS 0
+ vst1.32 {d20[1]}, [r4], r3
+ MEMACCESS 0
+ vst1.32 {d21[0]}, [r4], r3
+ MEMACCESS 0
+ vst1.32 {d21[1]}, [r4]
+
+ mov r4, r5
+
+ MEMACCESS 0
+ vst1.32 {d18[0]}, [r4], r6
+ MEMACCESS 0
+ vst1.32 {d18[1]}, [r4], r6
+ MEMACCESS 0
+ vst1.32 {d19[0]}, [r4], r6
+ MEMACCESS 0
+ vst1.32 {d19[1]}, [r4], r6
+
+ add r4, r5, #4
+ MEMACCESS 0
+ vst1.32 {d22[0]}, [r4], r6
+ MEMACCESS 0
+ vst1.32 {d22[1]}, [r4], r6
+ MEMACCESS 0
+ vst1.32 {d23[0]}, [r4], r6
+ MEMACCESS 0
+ vst1.32 {d23[1]}, [r4]
+
+ add r0, #4*2 ; src += 4 * 2
+ add r2, r2, r3, lsl #2 ; dst_a += 4 * dst_stride_a
+ add r5, r5, r6, lsl #2 ; dst_b += 4 * dst_stride_b
+ subs r7, #4 ; w -= 4
+ beq %f4
+
+ ; some residual, check to see if it includes a 2x8 block,
+ ; or less
+ cmp r7, #2
+ blt %f3
+
+ ; 2x8 block
+2
+ mov r4, r0
+ MEMACCESS 0
+ vld2.16 {d0[0], d2[0]}, [r4], r1
+ MEMACCESS 0
+ vld2.16 {d1[0], d3[0]}, [r4], r1
+ MEMACCESS 0
+ vld2.16 {d0[1], d2[1]}, [r4], r1
+ MEMACCESS 0
+ vld2.16 {d1[1], d3[1]}, [r4], r1
+ MEMACCESS 0
+ vld2.16 {d0[2], d2[2]}, [r4], r1
+ MEMACCESS 0
+ vld2.16 {d1[2], d3[2]}, [r4], r1
+ MEMACCESS 0
+ vld2.16 {d0[3], d2[3]}, [r4], r1
+ MEMACCESS 0
+ vld2.16 {d1[3], d3[3]}, [r4]
+
+ vtrn.8 d0, d1
+ vtrn.8 d2, d3
+
+ mov r4, r2
+
+ MEMACCESS 0
+ vst1.64 {d0}, [r4], r3
+ MEMACCESS 0
+ vst1.64 {d2}, [r4]
+
+ mov r4, r5
+
+ MEMACCESS 0
+ vst1.64 {d1}, [r4], r6
+ MEMACCESS 0
+ vst1.64 {d3}, [r4]
+
+ add r0, #2*2 ; src += 2 * 2
+ add r2, r2, r3, lsl #1 ; dst_a += 2 * dst_stride_a
+ add r5, r5, r6, lsl #1 ; dst_b += 2 * dst_stride_b
+ subs r7, #2 ; w -= 2
+ beq %f4
+
+ ; 1x8 block
+3
+ MEMACCESS 1
+ vld2.8 {d0[0], d1[0]}, [r0], r1
+ MEMACCESS 1
+ vld2.8 {d0[1], d1[1]}, [r0], r1
+ MEMACCESS 1
+ vld2.8 {d0[2], d1[2]}, [r0], r1
+ MEMACCESS 1
+ vld2.8 {d0[3], d1[3]}, [r0], r1
+ MEMACCESS 1
+ vld2.8 {d0[4], d1[4]}, [r0], r1
+ MEMACCESS 1
+ vld2.8 {d0[5], d1[5]}, [r0], r1
+ MEMACCESS 1
+ vld2.8 {d0[6], d1[6]}, [r0], r1
+ MEMACCESS 1
+ vld2.8 {d0[7], d1[7]}, [r0]
+
+ MEMACCESS 3
+ vst1.64 {d0}, [r2]
+ MEMACCESS 5
+ vst1.64 {d1}, [r5]
+4
+
+ vpop {q8, q9, q10, q11}
+ vpop {q0, q1, q2, q3}
+ pop {r4-r8}
+ bx lr
+ ENDP
+
+ END
+
+
+
diff --git a/source/rotate_neon.cc b/source/rotate_neon.cc
index 41ec34e..5c6f938 100644
--- a/source/rotate_neon.cc
+++ b/source/rotate_neon.cc
@@ -18,6 +18,15 @@ namespace libyuv {
extern "C" {
#endif
+/* !!! IMPORTANT: The following methods have been ported to pure assembler in rotate_neon.asm,
+* because MS Visual Studio doesn't support inline assembly for ARM.
+*
+* ANY CHANGE TO THESE METHODS' IMPLEMENTATIONS MUST ALSO BE MADE IN rotate_neon.asm.
+*
+* Eventually, only the pure assembler implementation should be used on all platforms
+* to avoid code duplication.
+*/
+
#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \
!defined(__aarch64__)
diff --git a/source/row_neon.asm b/source/row_neon.asm
new file mode 100644
index 0000000..b623358
--- /dev/null
+++ b/source/row_neon.asm
@@ -0,0 +1,3435 @@
+;
+; Copyright 2012 The LibYuv Project Authors. All rights reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+ AREA |.text|, CODE, READONLY, ALIGN=2
+
+ GET source/arm_asm_macros.in
+
+ EXPORT I444ToARGBRow_NEON
+ EXPORT I422ToARGBRow_NEON
+ EXPORT I411ToARGBRow_NEON
+ EXPORT I422ToBGRARow_NEON
+ EXPORT I422ToABGRRow_NEON
+ EXPORT I422AlphaToARGBRow_NEON
+ EXPORT I422ToRGB24Row_NEON
+ EXPORT I422ToRAWRow_NEON
+ EXPORT I422ToRGBARow_NEON
+ EXPORT I422ToARGB4444Row_NEON
+ EXPORT I422ToARGB1555Row_NEON
+ EXPORT I422ToRGB565Row_NEON
+ EXPORT I400ToARGBRow_NEON
+ EXPORT J400ToARGBRow_NEON
+ EXPORT ARGBToRGB24Row_NEON
+ EXPORT RAWToRGB24Row_NEON
+ EXPORT ARGBToRAWRow_NEON
+ EXPORT ARGBToRGB565Row_NEON
+ EXPORT ARGBToARGB1555Row_NEON
+ EXPORT YUY2ToARGBRow_NEON
+ EXPORT UYVYToARGBRow_NEON
+ EXPORT ARGBToARGB4444Row_NEON
+ EXPORT NV12ToARGBRow_NEON
+ EXPORT NV21ToARGBRow_NEON
+ EXPORT NV12ToRGB565Row_NEON
+ EXPORT NV21ToRGB565Row_NEON
+ EXPORT SplitUVRow_NEON
+ EXPORT MergeUVRow_NEON
+ EXPORT SetRow_NEON
+ EXPORT CopyRow_NEON
+ EXPORT ARGBSetRow_NEON
+ EXPORT MirrorRow_NEON
+ EXPORT MirrorUVRow_NEON
+ EXPORT ARGBMirrorRow_NEON
+ EXPORT RGB24ToARGBRow_NEON
+ EXPORT RAWToARGBRow_NEON
+ EXPORT RGB565ToARGBRow_NEON
+ EXPORT ARGB1555ToARGBRow_NEON
+ EXPORT ARGB4444ToARGBRow_NEON
+ EXPORT RGBAToUVRow_NEON
+ EXPORT ABGRToUVRow_NEON
+ EXPORT ABGRToYRow_NEON
+ EXPORT RGBAToYRow_NEON
+ EXPORT RGB24ToYRow_NEON
+ EXPORT ARGB1555ToUVRow_NEON
+ EXPORT ARGB4444ToUVRow_NEON
+ EXPORT RGB565ToYRow_NEON
+ EXPORT RGB565ToUVRow_NEON
+ EXPORT ARGB1555ToYRow_NEON
+ EXPORT ARGB4444ToYRow_NEON
+ EXPORT BGRAToYRow_NEON
+ EXPORT ARGBToUV411Row_NEON
+ EXPORT ARGBToUV422Row_NEON
+ EXPORT ARGBToUV444Row_NEON
+ EXPORT YUY2ToUV422Row_NEON
+ EXPORT UYVYToUV422Row_NEON
+ EXPORT ARGBToBayerGGRow_NEON
+ EXPORT ARGBShuffleRow_NEON
+ EXPORT ARGBToUVJRow_NEON
+ EXPORT BGRAToUVRow_NEON
+ EXPORT ABGRToUVRow_NEON
+ EXPORT RGBAToUVRow_NEON
+ EXPORT ARGBExtractAlphaRow_NEON
+ EXPORT ARGBToYJRow_NEON
+ EXPORT I422ToUYVYRow_NEON
+ EXPORT I422ToYUY2Row_NEON
+ EXPORT ARGBToUVRow_NEON
+ EXPORT ARGBToYRow_NEON
+ EXPORT RAWToUVRow_NEON
+ EXPORT RAWToYRow_NEON
+ EXPORT RGB24ToUVRow_NEON
+ EXPORT UYVYToUVRow_NEON
+ EXPORT UYVYToYRow_NEON
+ EXPORT ARGBToRGB565DitherRow_NEON
+ EXPORT YUY2ToYRow_NEON
+ EXPORT YUY2ToUVRow_NEON
+ EXPORT SobelToPlaneRow_NEON
+ EXPORT SobelRow_NEON
+ EXPORT ARGBSubtractRow_NEON
+ EXPORT ARGBAddRow_NEON
+ EXPORT ARGBAttenuateRow_NEON
+ EXPORT ARGBQuantizeRow_NEON
+ EXPORT ARGBShadeRow_NEON
+ EXPORT ARGBGrayRow_NEON
+ EXPORT ARGBSepiaRow_NEON
+ EXPORT ARGBColorMatrixRow_NEON
+ EXPORT ARGBBlendRow_NEON
+ EXPORT InterpolateRow_NEON
+ EXPORT ARGBMultiplyRow_NEON
+ EXPORT SobelXRow_NEON
+ EXPORT SobelYRow_NEON
+ EXPORT SobelXYRow_NEON
+
+
+; ------- CONSTANTS ---------------------
+
+; YUV to RGB conversion constants.
+; Y contribution to R,G,B. Scale and bias.
+YG EQU 18997 ; round(1.164 * 64 * 256 * 256 / 257)
+YGB EQU 1160 ; 1.164 * 64 * 16 - adjusted for even error distribution
+
+; U and V contributions to R,G,B
+UB EQU -128 ; -min(128, round(2.018 * 64))
+UG EQU 25 ; -round(-0.391 * 64)
+VG EQU 52 ; -round(-0.813 * 64)
+VR EQU -102 ; -round(1.596 * 64)
+
+; Bias values to subtract 16 from Y and 128 from U and V.
+BB EQU UB * 128 - YGB
+BG EQU UG * 128 + VG * 128 - YGB
+BR EQU VR * 128 - YGB
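+; With the values above these evaluate to:
+;   BB = -128*128 - 1160          = -17544
+;   BG =  25*128 + 52*128 - 1160  =   8696
+;   BR = -102*128 - 1160          = -14216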
+
+
+; ------- ARRAYS ------------------------
+
+kUVToRB DCB 128, 128, 128, 128, 102, 102, 102, 102, 0, 0, 0, 0, 0, 0, 0, 0
+kUVToG DCB 25, 25, 25, 25, 52, 52, 52, 52, 0, 0, 0, 0, 0, 0, 0, 0
+kUVBiasBGR DCW BB, BG, BR, 0, 0, 0, 0, 0
+kYToRgb DCD 0x0101 * YG, 0, 0, 0
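+; Note: 0x0101 = 257, so 0x0101 * YG cancels the /257 in the YG definition above,
+; giving roughly 1.164 * 64 * 65536 as the 32-bit Y multiplier loaded into q15.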
+
+; ------- MACROS ------------------------
+
+ MACRO
+ YUV422TORGB_SETUP_REG
+ adr r5, kUVToRB
+ vld1.8 {d24}, [r5]
+ adr r5, kUVToG
+ vld1.8 {d25}, [r5]
+ adr r5, kUVBiasBGR
+ vld1.16 {d26[], d27[]}, [r5]!
+ vld1.16 {d8[], d9[]}, [r5]!
+ vld1.16 {d28[], d29[]}, [r5]
+ adr r5, kYToRgb
+ vld1.32 {d30[], d31[]}, [r5]
+ MEND
+
+ ; Read 8 Y, 4 U and 4 V from 422
+ MACRO
+ READYUV422
+ MEMACCESS 0
+ vld1.8 {d0}, [r0]!
+ MEMACCESS 1
+ vld1.32 {d2[0]}, [r1]!
+ MEMACCESS 2
+ vld1.32 {d2[1]}, [r2]!
+ MEND
+
+ ; Read 8 Y, 2 U and 2 V from 411
+ MACRO
+ READYUV411
+ MEMACCESS 0
+ vld1.8 {d0}, [r0]!
+ MEMACCESS 1
+ vld1.16 {d2[0]}, [r1]!
+ MEMACCESS 2
+ vld1.16 {d2[1]}, [r2]!
+ vmov.u8 d3, d2
+ vzip.u8 d2, d3
+ MEND
+
+ ; Read 8 Y, 8 U and 8 V from 444
+ MACRO
+ READYUV444
+ MEMACCESS 0
+ vld1.8 {d0}, [r0]!
+ MEMACCESS 1
+ vld1.8 {d2}, [r1]!
+ MEMACCESS 2
+ vld1.8 {d3}, [r2]!
+ vpaddl.u8 q1, q1
+ vrshrn.u16 d2, q1, #1
+ MEND
+
+ ; Read 8 Y and 4 VU from NV21
+ MACRO
+ READNV21
+ MEMACCESS 0
+ vld1.8 {d0}, [r0]!
+ MEMACCESS 1
+ vld1.8 {d2}, [r1]!
+ vmov.u8 d3, d2 ; split odd/even uv apart
+ vuzp.u8 d3, d2
+ vtrn.u32 d2, d3
+ MEND
+
+ ; Read 8 Y, and set 4 U and 4 V to 128
+ MACRO
+ READYUV400
+ MEMACCESS 0
+ vld1.8 {d0}, [r0]!
+ vmov.u8 d2, #128
+ MEND
+
+ ; Read 8 Y and 4 UV from NV12
+ MACRO
+ READNV12
+ MEMACCESS 0
+ vld1.8 {d0}, [r0]!
+ MEMACCESS 1
+ vld1.8 {d2}, [r1]!
+ vmov.u8 d3, d2 ; split odd/even uv apart
+ vuzp.u8 d2, d3
+ vtrn.u32 d2, d3
+ MEND
+
+ ; Read 8 YUY2
+ MACRO
+ READYUY2
+ MEMACCESS 0
+ vld2.8 {d0, d2}, [r0]!
+ vmov.u8 d3, d2
+ vuzp.u8 d2, d3
+ vtrn.u32 d2, d3
+ MEND
+
+ ; Read 8 UYVY
+ MACRO
+ READUYVY
+ MEMACCESS 0
+ vld2.8 {d2, d3}, [r0]!
+ vmov.u8 d0, d3
+ vmov.u8 d3, d2
+ vuzp.u8 d2, d3
+ vtrn.u32 d2, d3
+ MEND
+
+ MACRO
+ ARGBTOARGB4444
+ vshr.u8 d20, d20, #4 ; B
+ vbic.32 d21, d21, d4 ; G
+ vshr.u8 d22, d22, #4 ; R
+ vbic.32 d23, d23, d4 ; A
+ vorr d0, d20, d21 ; BG
+ vorr d1, d22, d23 ; RA
+ vzip.u8 d0, d1 ; BGRA
+ MEND
+
+ MACRO
+ ARGBTOARGB1555
+ vshll.u8 q0, d23, #8 ; A
+ vshll.u8 q8, d22, #8 ; R
+ vshll.u8 q9, d21, #8 ; G
+ vshll.u8 q10, d20, #8 ; B
+ vsri.16 q0, q8, #1 ; AR
+ vsri.16 q0, q9, #6 ; ARG
+ vsri.16 q0, q10, #11 ; ARGB
+ MEND
+
+ MACRO
+ ARGBTORGB565
+ vshll.u8 q0, d22, #8 ; R
+ vshll.u8 q8, d21, #8 ; G
+ vshll.u8 q9, d20, #8 ; B
+ vsri.16 q0, q8, #5 ; RG
+ vsri.16 q0, q9, #11 ; RGB
+ MEND
+
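+ ; Core conversion step: expects 8 Y bytes in d0 and the corresponding U/V bytes in d2
+ ; (as arranged by the READ* macros above), plus the constants loaded by
+ ; YUV422TORGB_SETUP_REG; produces 8 B/G/R bytes in d20/d21/d22.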
+ MACRO
+ YUV422TORGB
+ vmull.u8 q8, d2, d24 ; u/v B/R component
+ vmull.u8 q9, d2, d25 ; u/v G component
+ vmovl.u8 q0, d0 ; Y
+ vmovl.s16 q10, d1
+ vmovl.s16 q0, d0
+ vmul.s32 q10, q10, q15
+ vmul.s32 q0, q0, q15
+ vqshrun.s32 d0, q0, #16
+ vqshrun.s32 d1, q10, #16 ; Y
+ vadd.s16 d18, d19
+ vshll.u16 q1, d16, #16 ; Replicate u * UB
+ vshll.u16 q10, d17, #16 ; Replicate v * VR
+ vshll.u16 q3, d18, #16 ; Replicate (v*VG + u*UG)
+ vaddw.u16 q1, q1, d16
+ vaddw.u16 q10, q10, d17
+ vaddw.u16 q3, q3, d18
+ vqadd.s16 q8, q0, q13 ; B */
+ vqadd.s16 q9, q0, q14 ; R */
+ vqadd.s16 q0, q0, q4 ; G */
+ vqadd.s16 q8, q8, q1 ; B */
+ vqadd.s16 q9, q9, q10 ; R */
+ vqsub.s16 q0, q0, q3 ; G */
+ vqshrun.s16 d20, q8, #6 ; B */
+ vqshrun.s16 d22, q9, #6 ; R */
+ vqshrun.s16 d21, q0, #6 ; G */
+ MEND
+
+ MACRO
+ RGB565TOARGB
+ vshrn.u16 d6, q0, #5 ; G xxGGGGGG
+ vuzp.u8 d0, d1 ; d0 xxxBBBBB RRRRRxxx
+ vshl.u8 d6, d6, #2 ; G GGGGGG00 upper 6
+ vshr.u8 d1, d1, #3 ; R 000RRRRR lower 5
+ vshl.u8 q0, q0, #3 ; B,R BBBBB000 upper 5
+ vshr.u8 q2, q0, #5 ; B,R 00000BBB lower 3
+ vorr.u8 d0, d0, d4 ; B
+ vshr.u8 d4, d6, #6 ; G 000000GG lower 2
+ vorr.u8 d2, d1, d5 ; R
+ vorr.u8 d1, d4, d6 ; G
+ MEND
+
+ MACRO
+ ARGB1555TOARGB
+ vshrn.u16 d7, q0, #8 ; A Arrrrrxx
+ vshr.u8 d6, d7, #2 ; R xxxRRRRR
+ vshrn.u16 d5, q0, #5 ; G xxxGGGGG
+ vmovn.u16 d4, q0 ; B xxxBBBBB
+ vshr.u8 d7, d7, #7 ; A 0000000A
+ vneg.s8 d7, d7 ; A AAAAAAAA upper 8
+ vshl.u8 d6, d6, #3 ; R RRRRR000 upper 5
+ vshr.u8 q1, q3, #5 ; R,A 00000RRR lower 3
+ vshl.u8 q0, q2, #3 ; B,G BBBBB000 upper 5
+ vshr.u8 q2, q0, #5 ; B,G 00000BBB lower 3
+ vorr.u8 q1, q1, q3 ; R,A
+ vorr.u8 q0, q0, q2 ; B,G
+ MEND
+
+ MACRO
+ ARGB4444TOARGB
+ vuzp.u8 d0, d1 ; d0 BG, d1 RA
+ vshl.u8 q2, q0, #4 ; B,R BBBB0000
+ vshr.u8 q1, q0, #4 ; G,A 0000GGGG
+ vshr.u8 q0, q2, #4 ; B,R 0000BBBB
+ vorr.u8 q0, q0, q2 ; B,R BBBBBBBB
+ vshl.u8 q2, q1, #4 ; G,A GGGG0000
+ vorr.u8 q1, q1, q2 ; G,A GGGGGGGG
+ vswp.u8 d1, d2 ; B,R,G,A -> B,G,R,A
+ MEND
+
+ ; 16x2 pixels -> 8x1. pix is number of argb pixels. e.g. 16.
+ MACRO
+ RGBTOUV $QB, $QG, $QR
+ vmul.s16 q8, $QB , q10 ; B
+ vmls.s16 q8, $QG , q11 ; G
+ vmls.s16 q8, $QR , q12 ; R
+ vadd.u16 q8, q8, q15 ; +128 -> unsigned
+ vmul.s16 q9, $QR , q10 ; R
+ vmls.s16 q9, $QG , q14 ; G
+ vmls.s16 q9, $QB , q13 ; B
+ vadd.u16 q9, q9, q15 ; +128 -> unsigned
+ vqshrn.u16 d0, q8, #8 ; 16 bit to 8 bit U
+ vqshrn.u16 d1, q9, #8 ; 16 bit to 8 bit V
+ MEND
+
+ ; RGB555TOARGB is the same as ARGB1555TOARGB but ignores alpha.
+ MACRO
+ RGB555TOARGB
+ vshrn.u16 d6, q0, #5 ; G xxxGGGGG
+ vuzp.u8 d0, d1 ; d0 xxxBBBBB xRRRRRxx
+ vshl.u8 d6, d6, #3 ; G GGGGG000 upper 5
+ vshr.u8 d1, d1, #2 ; R 00xRRRRR lower 5
+ vshl.u8 q0, q0, #3 ; B,R BBBBB000 upper 5
+ vshr.u8 q2, q0, #5 ; B,R 00000BBB lower 3
+ vorr.u8 d0, d0, d4 ; B
+ vshr.u8 d4, d6, #5 ; G 00000GGG lower 3
+ vorr.u8 d2, d1, d5 ; R
+ vorr.u8 d1, d4, d6 ; G
+ MEND
+
+
+; ----- METHODS ---------------------------------------
+
+I444ToARGBRow_NEON PROC
+ ; input
+ ; r0 = const uint8* src_y
+ ; r1 = const uint8* src_u
+ ; r2 = const uint8* src_v
+ ; r3 = uint8* dst_argb
+ push {r4, r5}
+ ldr r4, [sp,#8] ; int width
+ vpush {q0 - q4}
+ vpush {q8 - q15}
+
+ YUV422TORGB_SETUP_REG
+1
+ READYUV444
+ YUV422TORGB
+ subs r4, r4, #8
+ vmov.u8 d23, #255
+ MEMACCESS 3
+ vst4.8 {d20, d21, d22, d23}, [r3]!
+ bgt %b1
+
+ vpop {q8 - q15}
+ vpop {q0 - q4}
+ pop {r4, r5}
+ bx lr
+ ENDP
+
+I422ToARGBRow_NEON PROC
+ ; input
+ ; r0 = const uint8* src_y
+ ; r1 = const uint8* src_u
+ ; r2 = const uint8* src_v
+ ; r3 = uint8* dst_argb
+ push {r4, r5}
+ ldr r4, [sp,#8] ; int width
+ vpush {q0 - q4}
+ vpush {q8 - q15}
+
+ YUV422TORGB_SETUP_REG
+1
+ READYUV422
+ YUV422TORGB
+ subs r4, r4, #8
+ vmov.u8 d23, #255
+ MEMACCESS 3
+ vst4.8 {d20, d21, d22, d23}, [r3]!
+ bgt %b1
+
+ vpop {q8 - q15}
+ vpop {q0 - q4}
+ pop {r4, r5}
+ bx lr
+ ENDP
+
+I422AlphaToARGBRow_NEON PROC
+ ; input
+ ; r0 = const uint8* src_y
+ ; r1 = const uint8* src_u
+ ; r2 = const uint8* src_v
+ ; r3 = const uint8* src_a
+ ; r4 = uint8* dst_argb
+ push {r5, r6}
+ ldr r5, [sp,#8] ; int width
+ vpush {q0 - q4}
+ vpush {q8 - q15}
+
+ YUV422TORGB_SETUP_REG
+1
+ READYUV422
+ YUV422TORGB
+ MEMACCESS 3
+ vld1.8 {d23}, [r3]!
+ subs r5, r5, #8
+ MEMACCESS 4
+ vst4.8 {d20, d21, d22, d23}, [r4]!
+ bgt %b1
+
+ vpop {q8 - q15}
+ vpop {q0 - q4}
+ pop {r5, r6}
+ bx lr
+ ENDP
+
+I422ToRGBARow_NEON PROC
+ ; input
+ ; r0 = const uint8* src_y
+ ; r1 = const uint8* src_u
+ ; r2 = const uint8* src_v
+ ; r3 = uint8* dst_argb
+ push {r4, r5}
+ ldr r4, [sp,#8] ; int width
+ vpush {q0 - q4}
+ vpush {q8 - q15}
+
+ YUV422TORGB_SETUP_REG
+1
+ READYUV422
+ YUV422TORGB
+ subs r4, r4, #8
+ vmov.u8 d19, #255
+ MEMACCESS 3
+ vst4.8 {d19, d20, d21, d22}, [r3]!
+ bgt %b1
+
+ vpop {q8 - q15}
+ vpop {q0 - q4}
+ pop {r4, r5}
+ bx lr
+ ENDP
+
+
+I411ToARGBRow_NEON PROC
+ ; input
+ ; r0 = const uint8* src_y
+ ; r1 = const uint8* src_u
+ ; r2 = const uint8* src_v
+ ; r3 = uint8* dst_argb
+ push {r4, r5}
+ ldr r4, [sp,#8] ; int width
+ vpush {q0 - q4}
+ vpush {q8 - q15}
+
+ YUV422TORGB_SETUP_REG
+1
+ READYUV411
+ YUV422TORGB
+ subs r4, r4, #8
+ vmov.u8 d23, #255
+ MEMACCESS 3
+ vst4.8 {d20, d21, d22, d23}, [r3]!
+ bgt %b1
+
+ vpop {q8 - q15}
+ vpop {q0 - q4}
+ pop {r4, r5}
+ bx lr
+ ENDP
+
+
+I422ToBGRARow_NEON PROC
+ ; input
+ ; r0 = const uint8* src_y
+ ; r1 = const uint8* src_u
+ ; r2 = const uint8* src_v
+ ; r3 = uint8* dst_bgra
+ push {r4, r5}
+ ldr r4, [sp,#8] ; int width
+ vpush {q0 - q4}
+ vpush {q8 - q15}
+
+ YUV422TORGB_SETUP_REG
+1
+ READYUV422
+ YUV422TORGB
+ subs r4, r4, #8
+ vswp.u8 d20, d22
+ vmov.u8 d19, #255
+ MEMACCESS 3
+ vst4.8 {d19, d20, d21, d22}, [r3]!
+ bgt %b1
+
+ vpop {q8 - q15}
+ vpop {q0 - q4}
+ pop {r4, r5}
+ bx lr
+ ENDP
+
+
+I422ToABGRRow_NEON PROC
+ ; input
+ ; r0 = const uint8* src_y
+ ; r1 = const uint8* src_u
+ ; r2 = const uint8* src_v
+ ; r3 = uint8* dst_abgr
+ push {r4, r5}
+ ldr r4, [sp,#8] ; int width
+ vpush {q0 - q4}
+ vpush {q8 - q15}
+
+ YUV422TORGB_SETUP_REG
+1
+ READYUV422
+ YUV422TORGB
+ subs r4, r4, #8
+ vswp.u8 d20, d22
+ vmov.u8 d23, #255
+ MEMACCESS 3
+ vst4.8 {d20, d21, d22, d23}, [r3]!
+ bgt %b1
+
+ vpop {q8 - q15}
+ vpop {q0 - q4}
+ pop {r4, r5}
+ bx lr
+ ENDP
+
+I422ToRGB24Row_NEON PROC
+ ; input
+ ; r0 = const uint8* src_y
+ ; r1 = const uint8* src_u
+ ; r2 = const uint8* src_v
+ ; r3 = uint8* dst_rgb24
+ push {r4, r5}
+ ldr r4, [sp,#8] ; int width
+ vpush {q0 - q4}
+ vpush {q8 - q15}
+
+ YUV422TORGB_SETUP_REG
+1
+ READYUV422
+ YUV422TORGB
+ subs r4, r4, #8
+ MEMACCESS 3
+ vst3.8 {d20, d21, d22}, [r3]!
+ bgt %b1
+
+ vpop {q8 - q15}
+ vpop {q0 - q4}
+ pop {r4, r5}
+ bx lr
+ ENDP
+
+I422ToRAWRow_NEON PROC
+ ; input
+ ; r0 = const uint8* src_y
+ ; r1 = const uint8* src_u
+ ; r2 = const uint8* src_v
+ ; r3 = uint8* dst_raw
+ push {r4, r5}
+ ldr r4, [sp,#8] ; int width
+ vpush {q0 - q4}
+ vpush {q8 - q15}
+
+ YUV422TORGB_SETUP_REG
+1
+ READYUV422
+ YUV422TORGB
+ subs r4, r4, #8
+ vswp.u8 d20, d22
+ MEMACCESS 3
+ vst3.8 {d20, d21, d22}, [r3]!
+ bgt %b1
+
+ vpop {q8 - q15}
+ vpop {q0 - q4}
+ pop {r4, r5}
+ bx lr
+ ENDP
+
+I422ToARGB4444Row_NEON PROC
+ ; input
+ ; r0 = const uint8* src_y
+ ; r1 = const uint8* src_u
+ ; r2 = const uint8* src_v
+ ; r3 = uint8* dst_argb4444
+ push {r4, r5}
+ ldr r4, [sp,#8] ; int width
+ vpush {q0 - q4}
+ vpush {q8 - q15}
+
+ YUV422TORGB_SETUP_REG
+ vmov.u8 d4, #0x0f ; bits to clear with vbic.
+1
+ READYUV422
+ YUV422TORGB
+ subs r4, r4, #8
+ vmov.u8 d23, #255
+ ARGBTOARGB4444
+ MEMACCESS 3
+ vst1.8 {q0}, [r3]! ; store 8 pixels ARGB4444.
+ bgt %b1
+
+ vpop {q8 - q15}
+ vpop {q0 - q4}
+ pop {r4, r5}
+ bx lr
+ ENDP
+
+I422ToARGB1555Row_NEON PROC
+ ; input
+ ; r0 = const uint8* src_y
+ ; r1 = const uint8* src_u
+ ; r2 = const uint8* src_v
+ ; r3 = uint8* dst_argb1555
+ push {r4, r5}
+ ldr r4, [sp,#8] ; int width
+ vpush {q0 - q4}
+ vpush {q8 - q15}
+
+ YUV422TORGB_SETUP_REG
+1
+ READYUV422
+ YUV422TORGB
+ subs r4, r4, #8
+ vmov.u8 d23, #255
+ ARGBTOARGB1555
+ MEMACCESS 3
+ vst1.8 {q0}, [r3]! ; store 8 pixels ARGB1555.
+ bgt %b1
+
+ vpop {q8 - q15}
+ vpop {q0 - q4}
+ pop {r4, r5}
+ bx lr
+ ENDP
+
+I422ToRGB565Row_NEON PROC
+ ; input
+ ; r0 = const uint8* src_y
+ ; r1 = const uint8* src_u
+ ; r2 = const uint8* src_v
+ ; r3 = uint8* dst_rgb565
+ push {r4, r5}
+ ldr r4, [sp,#8] ; int width
+ vpush {q0 - q4}
+ vpush {q8 - q15}
+
+ YUV422TORGB_SETUP_REG
+1
+ READYUV422
+ YUV422TORGB
+ subs r4, r4, #8
+ ARGBTORGB565
+ MEMACCESS 3
+ vst1.8 {q0}, [r3]! ; store 8 pixels RGB565.
+ bgt %b1
+
+ vpop {q8 - q15}
+ vpop {q0 - q4}
+ pop {r4, r5}
+ bx lr
+ ENDP
+
+I400ToARGBRow_NEON PROC
+ ; input
+ ; r0 = const uint8* src_y
+ ; r1 = uint8* dst_argb
+ ; r2 = int width
+ push {r5}
+ vpush {q0 - q4}
+ vpush {q8 - q15}
+
+ YUV422TORGB_SETUP_REG
+1
+ READYUV400
+ YUV422TORGB
+ subs r2, r2, #8
+ vmov.u8 d23, #255
+ MEMACCESS 1
+ vst4.8 {d20, d21, d22, d23}, [r1]!
+ bgt %b1
+
+
+ vpop {q8 - q15}
+ vpop {q0 - q4}
+ pop {r5}
+ bx lr
+ ENDP
+
+J400ToARGBRow_NEON PROC
+ ; input
+ ; r0 = const uint8* src_y
+ ; r1 = uint8* dst_argb
+ ; r2 = width
+ push {r5}
+ vpush {d20 - d23}
+
+ vmov.u8 d23, #255
+1
+ MEMACCESS 0
+ vld1.8 {d20}, [r0]!
+ vmov d21, d20
+ vmov d22, d20
+ subs r2, r2, #8
+ MEMACCESS 1
+ vst4.8 {d20, d21, d22, d23}, [r1]!
+ bgt %b1
+
+
+ vpop {d20 - d23}
+ pop {r5}
+ bx lr
+ ENDP
+
+
+ARGBToRGB24Row_NEON PROC
+ ; input
+ ; r0 = const uint8* src_argb
+ ; r1 = uint8* dst_rgb24
+ ; r2 = pix
+ vpush {d1 - d4}
+
+1
+ MEMACCESS 0
+ vld4.8 {d1, d2, d3, d4}, [r0]! ; load 8 pixels of ARGB.
+ subs r2, r2, #8 ; 8 processed per loop.
+ MEMACCESS 1
+ vst3.8 {d1, d2, d3}, [r1]! ; store 8 pixels of RGB24.
+ bgt %b1
+
+ vpop {d1 - d4}
+ bx lr
+ ENDP
+
+ARGBToRAWRow_NEON PROC
+ ; input
+ ; r0 = const uint8* src_argb
+ ; r1 = uint8* dst_raw
+ ; r2 = pix
+ vpush {d1 - d4}
+
+1
+ MEMACCESS 0
+ vld4.8 {d1, d2, d3, d4}, [r0]! ; load 8 pixels of ARGB.
+ subs r2, r2, #8 ; 8 processed per loop.
+ vswp.u8 d1, d3 ; swap R, B
+ MEMACCESS 1
+ vst3.8 {d1, d2, d3}, [r1]! ; store 8 pixels of RAW.
+ bgt %b1
+
+ vpop {d1 - d4}
+ bx lr
+ ENDP
+
+ARGBToRGB565Row_NEON PROC
+ ; input
+ ; r0 = const uint8* src_argb
+ ; r1 = uint8* dst_rgb565
+ ; r2 = pix
+ vpush {q0}
+ vpush {q8 - q11}
+
+1
+ MEMACCESS 0
+ vld4.8 {d20, d21, d22, d23}, [r0]! ; load 8 pixels of ARGB.
+ subs r2, r2, #8 ; 8 processed per loop.
+ ARGBTORGB565
+ MEMACCESS 1
+ vst1.8 {q0}, [r1]! ; store 8 pixels RGB565.
+ bgt %b1
+
+ vpop {q8 - q11}
+ vpop {q0}
+ bx lr
+ ENDP
+
+ARGBToARGB1555Row_NEON PROC
+ ; input
+ ; r0 = const uint8* src_argb
+ ; r1 = uint8* dst_argb1555
+ ; r2 = pix
+ vpush {q0}
+ vpush {q8 - q11}
+
+1
+ MEMACCESS 0
+ vld4.8 {d20, d21, d22, d23}, [r0]! ; load 8 pixels of ARGB.
+ subs r2, r2, #8 ; 8 processed per loop.
+ ARGBTOARGB1555
+ MEMACCESS 1
+ vst1.8 {q0}, [r1]! ; store 8 pixels ARGB1555.
+ bgt %b1
+
+ vpop {q8 - q11}
+ vpop {q0}
+ bx lr
+ ENDP
+
+ARGBToARGB4444Row_NEON PROC
+ ; input
+ ; r0 = const uint8* src_argb
+ ; r1 = uint8* dst_argb4444
+ ; r2 = pix
+ vpush {q0}
+ vpush {q8 - q11}
+
+ vmov.u8 d4, #0x0f ; bits to clear with vbic.
+1
+ MEMACCESS 0
+ vld4.8 {d20, d21, d22, d23}, [r0]! ; load 8 pixels of ARGB.
+ subs r2, r2, #8 ; 8 processed per loop.
+ ARGBTOARGB4444
+ MEMACCESS 1
+ vst1.8 {q0}, [r1]! ; store 8 pixels ARGB4444.
+ bgt %b1
+
+ vpop {q8 - q11}
+ vpop {q0}
+ bx lr
+ ENDP
+
+NV12ToARGBRow_NEON PROC
+ ; input
+ ; r0 = const uint8* src_y
+ ; r1 = const uint8* src_uv
+ ; r2 = uint8* dst_argb
+ ; r3 = int width
+ push {r5}
+ vpush {q0 - q4}
+ vpush {q8 - q15}
+
+ YUV422TORGB_SETUP_REG
+
+1
+ READNV12
+ YUV422TORGB
+ subs r3, r3, #8
+ vmov.u8 d23, #255
+ MEMACCESS 2
+ vst4.8 {d20, d21, d22, d23}, [r2]!
+ bgt %b1
+
+ vpop {q8 - q15}
+ vpop {q0 - q4}
+ pop {r5}
+ bx lr
+ ENDP
+
+NV21ToARGBRow_NEON PROC
+ ; input
+ ; r0 = const uint8* src_y
+ ; r1 = const uint8* src_uv
+ ; r2 = uint8* dst_argb
+ ; r3 = int width
+ push {r5}
+ vpush {q0 - q4}
+ vpush {q8 - q15}
+
+ YUV422TORGB_SETUP_REG
+
+1
+ READNV21
+ YUV422TORGB
+ subs r3, r3, #8
+ vmov.u8 d23, #255
+ MEMACCESS 2
+ vst4.8 {d20, d21, d22, d23}, [r2]!
+ bgt %b1
+
+ vpop {q8 - q15}
+ vpop {q0 - q4}
+ pop {r5}
+ bx lr
+ ENDP
+
+NV12ToRGB565Row_NEON PROC
+ ; input
+ ; r0 = const uint8* src_y
+ ; r1 = const uint8* src_uv
+ ; r2 = uint8* dst_rgb565
+ ; r3 = int width
+ push {r5}
+ vpush {q0 - q4}
+ vpush {q8 - q15}
+
+ YUV422TORGB_SETUP_REG
+
+1
+ READNV12
+ YUV422TORGB
+ subs r3, r3, #8
+ ARGBTORGB565
+ MEMACCESS 2
+ vst1.8 {q0}, [r2]! ; store 8 pixels RGB565.
+ bgt %b1
+
+ vpop {q8 - q15}
+ vpop {q0 - q4}
+ pop {r5}
+ bx lr
+ ENDP
+
+NV21ToRGB565Row_NEON PROC
+ ; input
+ ; r0 = const uint8* src_y
+ ; r1 = const uint8* src_uv
+ ; r2 = uint8* dst_rgb565
+ ; r3 = int width
+ push {r5}
+ vpush {q0 - q4}
+ vpush {q8 - q15}
+
+ YUV422TORGB_SETUP_REG
+
+1
+ READNV21
+ YUV422TORGB
+ subs r3, r3, #8
+ ARGBTORGB565
+ MEMACCESS 2
+ vst1.8 {q0}, [r2]! ; store 8 pixels RGB565.
+ bgt %b1
+
+ vpop {q8 - q15}
+ vpop {q0 - q4}
+ pop {r5}
+ bx lr
+ ENDP
+
+YUY2ToARGBRow_NEON PROC
+ ; input
+ ; r0 = const uint8* src_yuy2
+ ; r1 = uint8* dst_argb
+ ; r2 = width
+ push {r5}
+ vpush {q0 - q4}
+ vpush {q8 - q15}
+
+ YUV422TORGB_SETUP_REG
+1
+ READYUY2
+ YUV422TORGB
+ subs r2, r2, #8
+ vmov.u8 d23, #255
+ MEMACCESS 1
+ vst4.8 {d20, d21, d22, d23}, [r1]!
+ bgt %b1
+
+ vpop {q8 - q15}
+ vpop {q0 - q4}
+ pop {r5}
+ bx lr
+ ENDP
+
+UYVYToARGBRow_NEON PROC
+ ; input
+ ; r0 = const uint8* src_uyvy
+ ; r1 = uint8* dst_argb
+ ; r2 = width
+ push {r5}
+ vpush {q0 - q4}
+ vpush {q8 - q15}
+
+ YUV422TORGB_SETUP_REG
+1
+ READUYVY
+ YUV422TORGB
+ subs r2, r2, #8
+ vmov.u8 d23, #255
+ MEMACCESS 1
+ vst4.8 {d20, d21, d22, d23}, [r1]!
+ bgt %b1
+
+ vpop {q8 - q15}
+ vpop {q0 - q4}
+ pop {r5}
+ bx lr
+ ENDP
+
+; Reads 16 pairs of UV and writes the U bytes to dst_u and the V bytes to dst_v.
+SplitUVRow_NEON PROC
+ ; input
+ ; r0 = const uint8* src_uv
+ ; r1 = uint8* dst_u
+ ; r2 = uint8* dst_v
+ ; r3 = int width
+ vpush {q0, q1}
+
+1
+ MEMACCESS 0
+ vld2.8 {q0, q1}, [r0]! ; load 16 pairs of UV
+ subs r3, r3, #16 ; 16 processed per loop
+ MEMACCESS 1
+ vst1.8 {q0}, [r1]! ; store U
+ MEMACCESS 2
+ vst1.8 {q1}, [r2]! ; store V
+ bgt %b1
+
+ vpop {q0, q1}
+ bx lr
+ ENDP
+
+; Reads 16 U's and V's and writes out 16 pairs of UV
+MergeUVRow_NEON PROC
+ ; input
+ ; r0 = const uint8* src_u
+ ; r1 = uint8* src_v
+ ; r2 = uint8* dst_uv
+ ; r3 = int width
+ vpush {q0, q1}
+
+1
+ MEMACCESS 0
+ vld1.8 {q0}, [r0]! ; load U
+ MEMACCESS 1
+ vld1.8 {q1}, [r1]! ; load V
+ subs r3, r3, #16 ; 16 processed per loop
+ MEMACCESS 2
+ vst2.u8 {q0, q1}, [r2]! ; store 16 pairs of UV
+ bgt %b1
+
+ vpop {q0, q1}
+ bx lr
+ ENDP
+
+; Copy multiples of 32 bytes. vld1.8 allows unaligned access and is fastest on a15.
+CopyRow_NEON PROC
+ ; input
+ ; r0 = const uint8* src
+ ; r1 = uint8* dst
+ ; r2 = int count
+ vpush {q0, q1}
+
+1
+ MEMACCESS 0
+ vld1.8 {d0, d1, d2, d3}, [r0]! ; load 32
+ subs r2, r2, #32 ; 32 processed per loop
+ MEMACCESS 1
+ vst1.8 {d0, d1, d2, d3}, [r1]! ; store 32
+ bgt %b1
+
+ vpop {q0, q1}
+ bx lr
+ ENDP
+
+; SetRow writes 'count' bytes using an 8 bit value repeated
+SetRow_NEON PROC
+ ; input
+ ; r0 = uint8* dst
+ ; r1 = uint8 v8
+ ; r2 = int count
+ vpush {q0}
+
+ vdup.8 q0, r1 ; duplicate 16 bytes
+1
+ subs r2, r2, #16 ; 16 bytes per loop
+ MEMACCESS 0
+ vst1.8 {q0}, [r0]! ; store
+ bgt %b1
+
+ vpop {q0}
+ bx lr
+ ENDP
+
+; ARGBSetRow writes 'count' pixels using a 32-bit value repeated.
+ARGBSetRow_NEON PROC
+ ; input
+ ; r0 = uint8* dst_argb
+ ; r1 = uint32 v32
+ ; r2 = int count
+ vpush {q0}
+
+ vdup.u32 q0, r1 ; duplicate 4 ints
+1
+ subs r2, r2, #4 ; 4 pixels per loop
+ MEMACCESS 0
+ vst1.8 {q0}, [r0]! ; store
+ bgt %b1
+
+ vpop {q0}
+ bx lr
+ ENDP
+
+MirrorRow_NEON PROC
+ ; input
+ ; r0 = const uint8* src
+ ; r1 = uint8* dst
+ ; r2 = int width
+ push {r3}
+ vpush {q0}
+ ; Start at end of source row.
+ mov r3, #-16
+ add r0, r0, r2
+ sub r0, #16
+
+1
+ MEMACCESS 0
+ vld1.8 {q0}, [r0], r3 ; src -= 16
+ subs r2, #16 ; 16 pixels per loop.
+ vrev64.8 q0, q0
+ MEMACCESS 1
+ vst1.8 {d1}, [r1]! ; dst += 16
+ MEMACCESS 1
+ vst1.8 {d0}, [r1]!
+ bgt %b1
+
+ vpop {q0}
+ pop {r3}
+ bx lr
+ ENDP
+
+
+MirrorUVRow_NEON PROC
+ ; input
+ ; r0 = const uint8* src_uv
+ ; r1 = uint8* dst_u
+ ; r2 = uint8* dst_v
+ ; r3 = int width
+ push {r12}
+ vpush {q0}
+ ; Start at end of source row.
+ mov r12, #-16
+ add r0, r0, r3, lsl #1
+ sub r0, #16
+
+1
+ MEMACCESS 0
+ vld2.8 {d0, d1}, [r0], r12 ; src -= 16
+ subs r3, #8 ; 8 pixels per loop.
+ vrev64.8 q0, q0
+ MEMACCESS 1
+ vst1.8 {d0}, [r1]! ; dst += 8
+ MEMACCESS 2
+ vst1.8 {d1}, [r2]!
+ bgt %b1
+
+ vpop {q0}
+ pop {r12}
+ bx lr
+ ENDP
+
+ARGBMirrorRow_NEON PROC
+ ; input
+ ; r0 = const uint8* src
+ ; r1 = uint8* dst
+ ; r2 = int width
+ push {r3}
+ vpush {q0}
+
+ ; Start at end of source row.
+ mov r3, #-16
+ add r0, r0, r2, lsl #2
+ sub r0, #16
+
+1
+ MEMACCESS 0
+ vld1.8 {q0}, [r0], r3 ; src -= 16
+ subs r2, #4 ; 4 pixels per loop.
+ vrev64.32 q0, q0
+ MEMACCESS 1
+ vst1.8 {d1}, [r1]! ; dst += 16
+ MEMACCESS 1
+ vst1.8 {d0}, [r1]!
+ bgt %b1
+
+ vpop {q0}
+ pop {r3}
+ bx lr
+ ENDP
+
+
+RGB24ToARGBRow_NEON PROC
+ ; input
+ ; r0 = const uint8* src_rgb24
+ ; r1 = uint8* dst_argb
+ ; r2 = int pix
+ vpush {d1 - d4}
+ vmov.u8 d4, #255 ; Alpha
+
+1
+ MEMACCESS 0
+ vld3.8 {d1, d2, d3}, [r0]! ; load 8 pixels of RGB24.
+ subs r2, r2, #8 ; 8 processed per loop.
+ MEMACCESS 1
+ vst4.8 {d1, d2, d3, d4}, [r1]! ; store 8 pixels of ARGB.
+ bgt %b1
+
+ vpop {d1 - d4}
+ bx lr
+ ENDP
+
+RAWToARGBRow_NEON PROC
+ ; input
+ ; r0 = const uint8* src_raw
+ ; r1 = uint8* dst_argb
+ ; r2 = int pix
+ vpush {d1 - d4}
+ vmov.u8 d4, #255 ; Alpha
+
+1
+ MEMACCESS 0
+ vld3.8 {d1, d2, d3}, [r0]! ; load 8 pixels of RAW.
+ subs r2, r2, #8 ; 8 processed per loop.
+ vswp.u8 d1, d3 ; swap R, B
+ MEMACCESS 1
+ vst4.8 {d1, d2, d3, d4}, [r1]! ; store 8 pixels of ARGB.
+ bgt %b1
+
+ vpop {d1 - d4}
+ bx lr
+ ENDP
+
+RAWToRGB24Row_NEON PROC
+ ; input
+ ; r0 = const uint8* src_raw
+ ; r1 = uint8* dst_rgb24
+ ; r2 = int width
+ vpush {d1 - d4}
+
+1
+ MEMACCESS 0
+ vld3.8 {d1, d2, d3}, [r0]! ; load 8 pixels of RAW.
+ subs r2, r2, #8 ; 8 processed per loop.
+ vswp.u8 d1, d3 ; swap R, B
+ MEMACCESS 1
+ vst3.8 {d1, d2, d3}, [r1]! ; store 8 pixels of b g r.
+ bgt %b1
+
+ vpop {d1 - d4}
+ bx lr
+ ENDP
+
+RGB565ToARGBRow_NEON PROC
+ ; input
+ ; r0 = const uint8* src_rgb565
+ ; r1 = uint8* dst_argb
+ ; r2 = int pix
+ vpush {q0 - q3}
+ vmov.u8 d3, #255 ; Alpha
+
+1
+ MEMACCESS 0
+ vld1.8 {q0}, [r0]! ; load 8 RGB565 pixels.
+ subs r2, r2, #8 ; 8 processed per loop.
+ RGB565TOARGB
+ MEMACCESS 1
+ vst4.8 {d0, d1, d2, d3}, [r1]! ; store 8 pixels of ARGB.
+ bgt %b1
+
+ vpop {q0 - q3}
+ bx lr
+ ENDP
+
+ARGB1555ToARGBRow_NEON PROC
+ ; input
+ ; r0 = const uint8* src_argb1555
+ ; r1 = uint8* dst_argb
+ ; r2 = int pix
+ vpush {q0 - q3}
+ vmov.u8 d3, #255 ; Alpha
+
+1
+ MEMACCESS 0
+ vld1.8 {q0}, [r0]! ; load 8 ARGB1555 pixels.
+ subs r2, r2, #8 ; 8 processed per loop.
+ ARGB1555TOARGB
+ MEMACCESS 1
+ vst4.8 {d0, d1, d2, d3}, [r1]! ; store 8 pixels of ARGB.
+ bgt %b1
+
+ vpop {q0 - q3}
+ bx lr
+ ENDP
+
+ARGB4444ToARGBRow_NEON PROC
+ ; input
+ ; r0 = const uint8* src_argb4444
+ ; r1 = uint8* dst_argb
+ ; r2 = int pix
+ vpush {q0 - q2}
+ vmov.u8 d3, #255 ; Alpha
+
+1
+ MEMACCESS 0
+ vld1.8 {q0}, [r0]! ; load 8 ARGB4444 pixels.
+ subs r2, r2, #8 ; 8 processed per loop.
+ ARGB4444TOARGB
+ MEMACCESS 1
+ vst4.8 {d0, d1, d2, d3}, [r1]! ; store 8 pixels of ARGB.
+ bgt %b1
+
+ vpop {q0 - q2}
+ bx lr
+ ENDP
+
+ABGRToUVRow_NEON PROC
+ ; input
+ ; r0 = const uint8* src_abgr
+ ; r1 = int src_stride_abgr
+ ; r2 = uint8* dst_u
+ ; r3 = uint8* dst_v
+ push {r4}
+ ldr r4, [sp,#4] ; int width
+ vpush {q0 - q7}
+ vpush {q7 - q14}
+ vpush {q15}
+
+ add r1, r0, r1 ; src_stride + src_abgr
+ vmov.s16 q10, #112 / 2 ; UB / VR 0.875 coefficient
+ vmov.s16 q11, #74 / 2 ; UG -0.5781 coefficient
+ vmov.s16 q12, #38 / 2 ; UR -0.2969 coefficient
+ vmov.s16 q13, #18 / 2 ; VB -0.1406 coefficient
+ vmov.s16 q14, #94 / 2 ; VG -0.7344 coefficient
+ vmov.u16 q15, #0x8080 ; 128.5
+
+1
+ MEMACCESS 0
+ vld4.8 {d0, d2, d4, d6}, [r0]! ; load 8 ABGR pixels.
+ MEMACCESS 0
+ vld4.8 {d1, d3, d5, d7}, [r0]! ; load next 8 ABGR pixels.
+ vpaddl.u8 q2, q2 ; B 16 bytes -> 8 shorts.
+ vpaddl.u8 q1, q1 ; G 16 bytes -> 8 shorts.
+ vpaddl.u8 q0, q0 ; R 16 bytes -> 8 shorts.
+ MEMACCESS 1
+ vld4.8 {d8, d10, d12, d14}, [r1]! ; load 8 more ABGR pixels.
+ MEMACCESS 1
+ vld4.8 {d9, d11, d13, d15}, [r1]! ; load last 8 ABGR pixels.
+ vpadal.u8 q2, q6 ; B 16 bytes -> 8 shorts.
+ vpadal.u8 q1, q5 ; G 16 bytes -> 8 shorts.
+ vpadal.u8 q0, q4 ; R 16 bytes -> 8 shorts.
+
+ vrshr.u16 q0, q0, #1 ; 2x average
+ vrshr.u16 q1, q1, #1
+ vrshr.u16 q2, q2, #1
+
+ subs r4, r4, #16 ; 32 processed per loop.
+ RGBTOUV q2, q1, q0
+ MEMACCESS 2
+ vst1.8 {d0}, [r2]! ; store 8 pixels U.
+ MEMACCESS 3
+ vst1.8 {d1}, [r3]! ; store 8 pixels V.
+ bgt %b1
+
+ vpop {q15}
+ vpop {q7 - q14}
+ vpop {q0 - q7}
+ pop {r4}
+ bx lr
+ ENDP
+
+RGBAToUVRow_NEON PROC
+ ; input
+ ; r0 = const uint8* src_rgba
+ ; r1 = int src_stride_rgba
+ ; r2 = uint8* dst_u
+ ; r3 = uint8* dst_v
+ push {r4}
+ ldr r4, [sp,#4] ; int pix
+ vpush {q0 - q7}
+ vpush {q7 - q14}
+ vpush {q15}
+
+ add r1, r0, r1 ; src_stride + src_rgba
+ vmov.s16 q10, #112 / 2 ; UB / VR 0.875 coefficient
+ vmov.s16 q11, #74 / 2 ; UG -0.5781 coefficient
+ vmov.s16 q12, #38 / 2 ; UR -0.2969 coefficient
+ vmov.s16 q13, #18 / 2 ; VB -0.1406 coefficient
+ vmov.s16 q14, #94 / 2 ; VG -0.7344 coefficient
+ vmov.u16 q15, #0x8080 ; 128.5
+
+1
+ MEMACCESS 0
+ vld4.8 {d0, d2, d4, d6}, [r0]! ; load 8 RGBA pixels.
+ MEMACCESS 0
+ vld4.8 {d1, d3, d5, d7}, [r0]! ; load next 8 RGBA pixels.
+ vpaddl.u8 q0, q1 ; B 16 bytes -> 8 shorts.
+ vpaddl.u8 q1, q2 ; G 16 bytes -> 8 shorts.
+ vpaddl.u8 q2, q3 ; R 16 bytes -> 8 shorts.
+ MEMACCESS 1
+ vld4.8 {d8, d10, d12, d14}, [r1]! ; load 8 more RGBA pixels.
+ MEMACCESS 1
+ vld4.8 {d9, d11, d13, d15}, [r1]! ; load last 8 RGBA pixels.
+ vpadal.u8 q0, q5 ; B 16 bytes -> 8 shorts.
+ vpadal.u8 q1, q6 ; G 16 bytes -> 8 shorts.
+ vpadal.u8 q2, q7 ; R 16 bytes -> 8 shorts.
+
+ vrshr.u16 q0, q0, #1 ; 2x average
+ vrshr.u16 q1, q1, #1
+ vrshr.u16 q2, q2, #1
+
+ subs r4, r4, #16 ; 32 processed per loop.
+ RGBTOUV q0, q1, q2
+ MEMACCESS 2
+ vst1.8 {d0}, [r2]! ; store 8 pixels U.
+ MEMACCESS 3
+ vst1.8 {d1}, [r3]! ; store 8 pixels V.
+ bgt %b1
+
+ vpop {q15}
+ vpop {q7 - q14}
+ vpop {q0 - q7}
+ pop {r4}
+ bx lr
+ ENDP
+
+ABGRToYRow_NEON PROC
+ ; input
+ ; r0 = const uint8* src_abgr
+ ; r1 = uint8* dst_y
+ ; r2 = int pix
+ vpush {d0 - d7}
+ vpush {q8}
+
+ vmov.u8 d4, #33 ; R * 0.2578 coefficient
+ vmov.u8 d5, #65 ; G * 0.5078 coefficient
+ vmov.u8 d6, #13 ; B * 0.1016 coefficient
+ vmov.u8 d7, #16 ; Add 16 constant
+
+1
+ MEMACCESS 0
+ vld4.8 {d0, d1, d2, d3}, [r0]! ; load 8 pixels of ABGR.
+ subs r2, r2, #8 ; 8 processed per loop.
+ vmull.u8 q8, d0, d4 ; R
+ vmlal.u8 q8, d1, d5 ; G
+ vmlal.u8 q8, d2, d6 ; B
+ vqrshrun.s16 d0, q8, #7 ; 16 bit to 8 bit Y
+ vqadd.u8 d0, d7
+ MEMACCESS 1
+ vst1.8 {d0}, [r1]! ; store 8 pixels Y.
+ bgt %b1
+
+ vpop {q8}
+ vpop {d0 - d7}
+ bx lr
+ ENDP
+
+RGBAToYRow_NEON PROC
+ ; input
+ ; r0 = const uint8* src_rgba
+ ; r1 = uint8* dst_y
+ ; r2 = int pix
+ vpush {d0 - d7}
+ vpush {q8}
+
+ vmov.u8 d4, #13 ; B * 0.1016 coefficient
+ vmov.u8 d5, #65 ; G * 0.5078 coefficient
+ vmov.u8 d6, #33 ; R * 0.2578 coefficient
+ vmov.u8 d7, #16 ; Add 16 constant
+
+1
+ MEMACCESS 0
+ vld4.8 {d0, d1, d2, d3}, [r0]! ; load 8 pixels of RGBA.
+ subs r2, r2, #8 ; 8 processed per loop.
+ vmull.u8 q8, d1, d4 ; B
+ vmlal.u8 q8, d2, d5 ; G
+ vmlal.u8 q8, d3, d6 ; R
+ vqrshrun.s16 d0, q8, #7 ; 16 bit to 8 bit Y
+ vqadd.u8 d0, d7
+ MEMACCESS 1
+ vst1.8 {d0}, [r1]! ; store 8 pixels Y.
+ bgt %b1
+
+ vpop {q8}
+ vpop {d0 - d7}
+ bx lr
+ ENDP
+
+RGB24ToYRow_NEON PROC
+ ; input
+ ; r0 = const uint8* src_rgb24
+ ; r1 = uint8* dst_y
+ ; r2 = int pix
+ vpush {d0 - d7}
+ vpush {q8}
+
+ vmov.u8 d4, #13 ; B * 0.1016 coefficient
+ vmov.u8 d5, #65 ; G * 0.5078 coefficient
+ vmov.u8 d6, #33 ; R * 0.2578 coefficient
+ vmov.u8 d7, #16 ; Add 16 constant
+
+1
+ MEMACCESS 0
+ vld3.8 {d0, d1, d2}, [r0]! ; load 8 pixels of RGB24.
+ subs r2, r2, #8 ; 8 processed per loop.
+ vmull.u8 q8, d0, d4 ; B
+ vmlal.u8 q8, d1, d5 ; G
+ vmlal.u8 q8, d2, d6 ; R
+ vqrshrun.s16 d0, q8, #7 ; 16 bit to 8 bit Y
+ vqadd.u8 d0, d7
+ MEMACCESS 1
+ vst1.8 {d0}, [r1]! ; store 8 pixels Y.
+ bgt %b1
+
+ vpop {q8}
+ vpop {d0 - d7}
+ bx lr
+ ENDP
+
+ ; 16x2 pixels -> 8x1. pix is number of argb pixels. e.g. 16.
+ARGB1555ToUVRow_NEON PROC
+ ; input
+ ; r0 = const uint8* src_argb1555
+ ; r1 = int src_stride_argb1555
+ ; r2 = uint8* dst_u
+ ; r3 = uint8* dst_v
+ push {r4}
+ ldr r4, [sp,#4] ; int pix
+ vpush {q0 - q7}
+ vpush {q8 - q14}
+ vpush {q15}
+
+ add r1, r0, r1 ; src_stride + src_argb
+ vmov.s16 q10, #112 / 2 ; UB / VR 0.875 coefficient
+ vmov.s16 q11, #74 / 2 ; UG -0.5781 coefficient
+ vmov.s16 q12, #38 / 2 ; UR -0.2969 coefficient
+ vmov.s16 q13, #18 / 2 ; VB -0.1406 coefficient
+ vmov.s16 q14, #94 / 2 ; VG -0.7344 coefficient
+ vmov.u16 q15, #0x8080 ; 128.5
+
+1
+ MEMACCESS 0
+ vld1.8 {q0}, [r0]! ; load 8 ARGB1555 pixels.
+ RGB555TOARGB
+ vpaddl.u8 d8, d0 ; B 8 bytes -> 4 shorts.
+ vpaddl.u8 d10, d1 ; G 8 bytes -> 4 shorts.
+ vpaddl.u8 d12, d2 ; R 8 bytes -> 4 shorts.
+ MEMACCESS 0
+ vld1.8 {q0}, [r0]! ; next 8 ARGB1555 pixels.
+ RGB555TOARGB
+ vpaddl.u8 d9, d0 ; B 8 bytes -> 4 shorts.
+ vpaddl.u8 d11, d1 ; G 8 bytes -> 4 shorts.
+ vpaddl.u8 d13, d2 ; R 8 bytes -> 4 shorts.
+
+ MEMACCESS 1
+ vld1.8 {q0}, [r1]! ; load 8 ARGB1555 pixels.
+ RGB555TOARGB
+ vpadal.u8 d8, d0 ; B 8 bytes -> 4 shorts.
+ vpadal.u8 d10, d1 ; G 8 bytes -> 4 shorts.
+ vpadal.u8 d12, d2 ; R 8 bytes -> 4 shorts.
+ MEMACCESS 1
+ vld1.8 {q0}, [r1]! ; next 8 ARGB1555 pixels.
+ RGB555TOARGB
+ vpadal.u8 d9, d0 ; B 8 bytes -> 4 shorts.
+ vpadal.u8 d11, d1 ; G 8 bytes -> 4 shorts.
+ vpadal.u8 d13, d2 ; R 8 bytes -> 4 shorts.
+
+ vrshr.u16 q4, q4, #1 ; 2x average
+ vrshr.u16 q5, q5, #1
+ vrshr.u16 q6, q6, #1
+
+ subs r4, r4, #16 ; 16 processed per loop.
+ vmul.s16 q8, q4, q10 ; B
+ vmls.s16 q8, q5, q11 ; G
+ vmls.s16 q8, q6, q12 ; R
+ vadd.u16 q8, q8, q15 ; +128 -> unsigned
+ vmul.s16 q9, q6, q10 ; R
+ vmls.s16 q9, q5, q14 ; G
+ vmls.s16 q9, q4, q13 ; B
+ vadd.u16 q9, q9, q15 ; +128 -> unsigned
+ vqshrn.u16 d0, q8, #8 ; 16 bit to 8 bit U
+ vqshrn.u16 d1, q9, #8 ; 16 bit to 8 bit V
+ MEMACCESS 2
+ vst1.8 {d0}, [r2]! ; store 8 pixels U.
+ MEMACCESS 3
+ vst1.8 {d1}, [r3]! ; store 8 pixels V.
+ bgt %b1
+
+ vpop {q15}
+ vpop {q8 - q14}
+ vpop {q0 - q7}
+ pop {r4}
+ bx lr
+ ENDP
+
+
+; 16x2 pixels -> 8x1. pix is number of argb pixels. e.g. 16.
+ARGB4444ToUVRow_NEON PROC
+ ; input
+ ; r0 = const uint8* src_argb4444
+ ; r1 = int src_stride_argb4444
+ ; r2 = uint8* dst_u
+ ; r3 = uint8* dst_v
+ push {r4}
+ ldr r4, [sp,#4] ; int pix
+ vpush {q0 - q7}
+ vpush {q8 - q14}
+ vpush {q15}
+
+ add r1, r0, r1 ; src_stride + src_argb
+ vmov.s16 q10, #112 / 2 ; UB / VR 0.875 coefficient
+ vmov.s16 q11, #74 / 2 ; UG -0.5781 coefficient
+ vmov.s16 q12, #38 / 2 ; UR -0.2969 coefficient
+ vmov.s16 q13, #18 / 2 ; VB -0.1406 coefficient
+ vmov.s16 q14, #94 / 2 ; VG -0.7344 coefficient
+ vmov.u16 q15, #0x8080 ; 128.5
+
+1
+ MEMACCESS 0
+ vld1.8 {q0}, [r0]! ; load 8 ARGB4444 pixels.
+ ARGB4444TOARGB
+ vpaddl.u8 d8, d0 ; B 8 bytes -> 4 shorts.
+ vpaddl.u8 d10, d1 ; G 8 bytes -> 4 shorts.
+ vpaddl.u8 d12, d2 ; R 8 bytes -> 4 shorts.
+ MEMACCESS 0
+ vld1.8 {q0}, [r0]! ; next 8 ARGB4444 pixels.
+ ARGB4444TOARGB
+ vpaddl.u8 d9, d0 ; B 8 bytes -> 4 shorts.
+ vpaddl.u8 d11, d1 ; G 8 bytes -> 4 shorts.
+ vpaddl.u8 d13, d2 ; R 8 bytes -> 4 shorts.
+
+ MEMACCESS 1
+ vld1.8 {q0}, [r1]! ; load 8 ARGB4444 pixels.
+ ARGB4444TOARGB
+ vpadal.u8 d8, d0 ; B 8 bytes -> 4 shorts.
+ vpadal.u8 d10, d1 ; G 8 bytes -> 4 shorts.
+ vpadal.u8 d12, d2 ; R 8 bytes -> 4 shorts.
+ MEMACCESS 1
+ vld1.8 {q0}, [r1]! ; next 8 ARGB4444 pixels.
+ ARGB4444TOARGB
+ vpadal.u8 d9, d0 ; B 8 bytes -> 4 shorts.
+ vpadal.u8 d11, d1 ; G 8 bytes -> 4 shorts.
+ vpadal.u8 d13, d2 ; R 8 bytes -> 4 shorts.
+
+ vrshr.u16 q4, q4, #1 ; 2x average
+ vrshr.u16 q5, q5, #1
+ vrshr.u16 q6, q6, #1
+
+ subs r4, r4, #16 ; 16 processed per loop.
+ vmul.s16 q8, q4, q10 ; B
+ vmls.s16 q8, q5, q11 ; G
+ vmls.s16 q8, q6, q12 ; R
+ vadd.u16 q8, q8, q15 ; +128 -> unsigned
+ vmul.s16 q9, q6, q10 ; R
+ vmls.s16 q9, q5, q14 ; G
+ vmls.s16 q9, q4, q13 ; B
+ vadd.u16 q9, q9, q15 ; +128 -> unsigned
+ vqshrn.u16 d0, q8, #8 ; 16 bit to 8 bit U
+ vqshrn.u16 d1, q9, #8 ; 16 bit to 8 bit V
+ MEMACCESS 2
+ vst1.8 {d0}, [r2]! ; store 8 pixels U.
+ MEMACCESS 3
+ vst1.8 {d1}, [r3]! ; store 8 pixels V.
+ bgt %b1
+
+ vpop {q15}
+ vpop {q8 - q14}
+ vpop {q0 - q7}
+ pop {r4}
+ bx lr
+ ENDP
+
+RGB565ToYRow_NEON PROC
+ ; input
+ ; r0 = const uint8* src_rgb565
+ ; r1 = uint8* dst_y
+ ; r2 = int pix
+ vpush {q0 - q3}
+ vpush {q12 - q13}
+
+ vmov.u8 d24, #13 ; B * 0.1016 coefficient
+ vmov.u8 d25, #65 ; G * 0.5078 coefficient
+ vmov.u8 d26, #33 ; R * 0.2578 coefficient
+ vmov.u8 d27, #16 ; Add 16 constant
+
+1
+ MEMACCESS 0
+ vld1.8 {q0}, [r0]! ; load 8 RGB565 pixels.
+ subs r2, r2, #8 ; 8 processed per loop.
+ RGB565TOARGB
+ vmull.u8 q2, d0, d24 ; B
+ vmlal.u8 q2, d1, d25 ; G
+ vmlal.u8 q2, d2, d26 ; R
+ vqrshrun.s16 d0, q2, #7 ; 16 bit to 8 bit Y
+ vqadd.u8 d0, d27
+ MEMACCESS 1
+ vst1.8 {d0}, [r1]! ; store 8 pixels Y.
+ bgt %b1
+
+ vpop {q12 - q13}
+ vpop {q0 - q3}
+ bx lr
+ ENDP
+
+ ; 16x2 pixels -> 8x1. pix is number of argb pixels. e.g. 16.
+RGB565ToUVRow_NEON PROC
+ ; input
+ ; r0 = const uint8* src_rgb565
+ ; r1 = int src_stride_rgb565
+ ; r2 = uint8* dst_u
+ ; r3 = uint8* dst_v
+ push {r4}
+ ldr r4, [sp,#4] ; int pix
+ vpush {q0 - q7}
+ vpush {q8 - q14}
+ vpush {q15}
+
+ add r1, r0, r1 ; src_stride + src_argb
+ vmov.s16 q10, #112 / 2 ; UB / VR 0.875 coefficient
+ vmov.s16 q11, #74 / 2 ; UG -0.5781 coefficient
+ vmov.s16 q12, #38 / 2 ; UR -0.2969 coefficient
+ vmov.s16 q13, #18 / 2 ; VB -0.1406 coefficient
+ vmov.s16 q14, #94 / 2 ; VG -0.7344 coefficient
+ vmov.u16 q15, #0x8080 ; 128.5
+
+1
+ MEMACCESS 0
+ vld1.8 {q0}, [r0]! ; load 8 RGB565 pixels.
+ RGB565TOARGB
+ vpaddl.u8 d8, d0 ; B 8 bytes -> 4 shorts.
+ vpaddl.u8 d10, d1 ; G 8 bytes -> 4 shorts.
+ vpaddl.u8 d12, d2 ; R 8 bytes -> 4 shorts.
+ MEMACCESS 0
+ vld1.8 {q0}, [r0]! ; next 8 RGB565 pixels.
+ RGB565TOARGB
+ vpaddl.u8 d9, d0 ; B 8 bytes -> 4 shorts.
+ vpaddl.u8 d11, d1 ; G 8 bytes -> 4 shorts.
+ vpaddl.u8 d13, d2 ; R 8 bytes -> 4 shorts.
+
+ MEMACCESS 1
+ vld1.8 {q0}, [r1]! ; load 8 RGB565 pixels.
+ RGB565TOARGB
+ vpadal.u8 d8, d0 ; B 8 bytes -> 4 shorts.
+ vpadal.u8 d10, d1 ; G 8 bytes -> 4 shorts.
+ vpadal.u8 d12, d2 ; R 8 bytes -> 4 shorts.
+ MEMACCESS 1
+ vld1.8 {q0}, [r1]! ; next 8 RGB565 pixels.
+ RGB565TOARGB
+ vpadal.u8 d9, d0 ; B 8 bytes -> 4 shorts.
+ vpadal.u8 d11, d1 ; G 8 bytes -> 4 shorts.
+ vpadal.u8 d13, d2 ; R 8 bytes -> 4 shorts.
+
+ vrshr.u16 q4, q4, #1 ; 2x average
+ vrshr.u16 q5, q5, #1
+ vrshr.u16 q6, q6, #1
+
+ subs r4, r4, #16 ; 16 processed per loop.
+ vmul.s16 q8, q4, q10 ; B
+ vmls.s16 q8, q5, q11 ; G
+ vmls.s16 q8, q6, q12 ; R
+ vadd.u16 q8, q8, q15 ; +128 -> unsigned
+ vmul.s16 q9, q6, q10 ; R
+ vmls.s16 q9, q5, q14 ; G
+ vmls.s16 q9, q4, q13 ; B
+ vadd.u16 q9, q9, q15 ; +128 -> unsigned
+ vqshrn.u16 d0, q8, #8 ; 16 bit to 8 bit U
+ vqshrn.u16 d1, q9, #8 ; 16 bit to 8 bit V
+ MEMACCESS 2
+ vst1.8 {d0}, [r2]! ; store 8 pixels U.
+ MEMACCESS 3
+ vst1.8 {d1}, [r3]! ; store 8 pixels V.
+ bgt %b1
+
+ vpop {q15}
+ vpop {q8 - q14}
+ vpop {q0 - q7}
+ pop {r4}
+ bx lr
+ ENDP
+
+ARGB1555ToYRow_NEON PROC
+ ; input
+ ; r0 = const uint8* src_argb1555
+ ; r1 = uint8* dst_y
+ ; r2 = int pix
+ vpush {q0 - q3}
+ vpush {q12 - q13}
+
+ vmov.u8 d24, #13 ; B * 0.1016 coefficient
+ vmov.u8 d25, #65 ; G * 0.5078 coefficient
+ vmov.u8 d26, #33 ; R * 0.2578 coefficient
+ vmov.u8 d27, #16 ; Add 16 constant
+
+1
+ MEMACCESS 0
+ vld1.8 {q0}, [r0]! ; load 8 ARGB1555 pixels.
+ subs r2, r2, #8 ; 8 processed per loop.
+ ARGB1555TOARGB
+ vmull.u8 q2, d0, d24 ; B
+ vmlal.u8 q2, d1, d25 ; G
+ vmlal.u8 q2, d2, d26 ; R
+ vqrshrun.s16 d0, q2, #7 ; 16 bit to 8 bit Y
+ vqadd.u8 d0, d27
+ MEMACCESS 1
+ vst1.8 {d0}, [r1]! ; store 8 pixels Y.
+ bgt %b1
+
+ vpop {q12 - q13}
+ vpop {q0 - q3}
+ bx lr
+ ENDP
+
+ARGB4444ToYRow_NEON PROC
+ ; input
+ ; r0 = const uint8* src_argb4444
+ ; r1 = uint8* dst_y
+ ; r2 = int pix
+ vpush {q0 - q3}
+ vpush {q12 - q13}
+
+ vmov.u8 d24, #13 ; B * 0.1016 coefficient
+ vmov.u8 d25, #65 ; G * 0.5078 coefficient
+ vmov.u8 d26, #33 ; R * 0.2578 coefficient
+ vmov.u8 d27, #16 ; Add 16 constant
+
+1
+ MEMACCESS 0
+ vld1.8 {q0}, [r0]! ; load 8 ARGB4444 pixels.
+ subs r2, r2, #8 ; 8 processed per loop.
+ ARGB4444TOARGB
+ vmull.u8 q2, d0, d24 ; B
+ vmlal.u8 q2, d1, d25 ; G
+ vmlal.u8 q2, d2, d26 ; R
+ vqrshrun.s16 d0, q2, #7 ; 16 bit to 8 bit Y
+ vqadd.u8 d0, d27
+ MEMACCESS 1
+ vst1.8 {d0}, [r1]! ; store 8 pixels Y.
+ bgt %b1
+
+ vpop {q12 - q13}
+ vpop {q0 - q3}
+ bx lr
+ ENDP
+
+BGRAToYRow_NEON PROC
+ ; input
+ ; r0 = const uint8* src_bgra
+ ; r1 = uint8* dst_y
+ ; r2 = int pix
+ vpush {q0 - q3}
+ vpush {q12 - q13}
+
+ vmov.u8 d4, #33 ; R * 0.2578 coefficient
+ vmov.u8 d5, #65 ; G * 0.5078 coefficient
+ vmov.u8 d6, #13 ; B * 0.1016 coefficient
+ vmov.u8 d7, #16 ; Add 16 constant
+
+1
+ MEMACCESS 0
+ vld4.8 {d0, d1, d2, d3}, [r0]! ; load 8 pixels of BGRA.
+ subs r2, r2, #8 ; 8 processed per loop.
+ vmull.u8 q8, d1, d4 ; R
+ vmlal.u8 q8, d2, d5 ; G
+ vmlal.u8 q8, d3, d6 ; B
+ vqrshrun.s16 d0, q8, #7 ; 16 bit to 8 bit Y
+ vqadd.u8 d0, d7
+ MEMACCESS 1
+ vst1.8 {d0}, [r1]! ; store 8 pixels Y.
+ bgt %b1
+
+ vpop {q12 - q13}
+ vpop {q0 - q3}
+ bx lr
+ ENDP
+
+; 32x1 pixels -> 8x1. pix is number of argb pixels. e.g. 32.
+ARGBToUV411Row_NEON PROC
+ ; input
+ ; r0 = const uint8* src_argb
+ ; r1 = uint8* dst_u
+ ; r2 = uint8* dst_v
+ ; r3 = int pix
+ vpush {q0 - q7}
+ vpush {q8 - q14}
+ vpush {q15}
+
+ vmov.s16 q10, #112 / 2 ; UB / VR 0.875 coefficient
+ vmov.s16 q11, #74 / 2 ; UG -0.5781 coefficient
+ vmov.s16 q12, #38 / 2 ; UR -0.2969 coefficient
+ vmov.s16 q13, #18 / 2 ; VB -0.1406 coefficient
+ vmov.s16 q14, #94 / 2 ; VG -0.7344 coefficient
+ vmov.u16 q15, #0x8080 ; 128.5
+
+1
+ MEMACCESS 0
+ vld4.8 {d0, d2, d4, d6}, [r0]! ; load 8 ARGB pixels.
+ MEMACCESS 0
+ vld4.8 {d1, d3, d5, d7}, [r0]! ; load next 8 ARGB pixels.
+ vpaddl.u8 q0, q0 ; B 16 bytes -> 8 shorts.
+ vpaddl.u8 q1, q1 ; G 16 bytes -> 8 shorts.
+ vpaddl.u8 q2, q2 ; R 16 bytes -> 8 shorts.
+ MEMACCESS 0
+ vld4.8 {d8, d10, d12, d14}, [r0]! ; load 8 more ARGB pixels.
+ MEMACCESS 0
+ vld4.8 {d9, d11, d13, d15}, [r0]! ; load last 8 ARGB pixels.
+ vpaddl.u8 q4, q4 ; B 16 bytes -> 8 shorts.
+ vpaddl.u8 q5, q5 ; G 16 bytes -> 8 shorts.
+ vpaddl.u8 q6, q6 ; R 16 bytes -> 8 shorts.
+
+ vpadd.u16 d0, d0, d1 ; B 16 shorts -> 8 shorts.
+ vpadd.u16 d1, d8, d9 ; B
+ vpadd.u16 d2, d2, d3 ; G 16 shorts -> 8 shorts.
+ vpadd.u16 d3, d10, d11 ; G
+ vpadd.u16 d4, d4, d5 ; R 16 shorts -> 8 shorts.
+ vpadd.u16 d5, d12, d13 ; R
+
+ vrshr.u16 q0, q0, #1 ; 2x average
+ vrshr.u16 q1, q1, #1
+ vrshr.u16 q2, q2, #1
+
+ subs r3, r3, #32 ; 32 processed per loop.
+ vmul.s16 q8, q0, q10 ; B
+ vmls.s16 q8, q1, q11 ; G
+ vmls.s16 q8, q2, q12 ; R
+ vadd.u16 q8, q8, q15 ; +128 -> unsigned
+ vmul.s16 q9, q2, q10 ; R
+ vmls.s16 q9, q1, q14 ; G
+ vmls.s16 q9, q0, q13 ; B
+ vadd.u16 q9, q9, q15 ; +128 -> unsigned
+ vqshrn.u16 d0, q8, #8 ; 16 bit to 8 bit U
+ vqshrn.u16 d1, q9, #8 ; 16 bit to 8 bit V
+ MEMACCESS 1
+ vst1.8 {d0}, [r1]! ; store 8 pixels U.
+ MEMACCESS 2
+ vst1.8 {d1}, [r2]! ; store 8 pixels V.
+ bgt %b1
+
+ vpop {q15}
+ vpop {q8 - q14}
+ vpop {q0 - q7}
+ bx lr
+ ENDP
+
+ARGBToUV422Row_NEON PROC
+ ; input
+ ; r0 = const uint8* src_argb
+ ; r1 = uint8* dst_u
+ ; r2 = uint8* dst_v
+ ; r3 = int pix
+ vpush {q0 - q7}
+ vpush {q8 - q14}
+ vpush {q15}
+
+ vmov.s16 q10, #112 / 2 ; UB / VR 0.875 coefficient
+ vmov.s16 q11, #74 / 2 ; UG -0.5781 coefficient
+ vmov.s16 q12, #38 / 2 ; UR -0.2969 coefficient
+ vmov.s16 q13, #18 / 2 ; VB -0.1406 coefficient
+ vmov.s16 q14, #94 / 2 ; VG -0.7344 coefficient
+ vmov.u16 q15, #0x8080 ; 128.5
+
+1
+ MEMACCESS 0
+ vld4.8 {d0, d2, d4, d6}, [r0]! ; load 8 ARGB pixels.
+ MEMACCESS 0
+ vld4.8 {d1, d3, d5, d7}, [r0]! ; load next 8 ARGB pixels.
+
+ vpaddl.u8 q0, q0 ; B 16 bytes -> 8 shorts.
+ vpaddl.u8 q1, q1 ; G 16 bytes -> 8 shorts.
+ vpaddl.u8 q2, q2 ; R 16 bytes -> 8 shorts.
+
+ subs r3, r3, #16 ; 16 processed per loop.
+ vmul.s16 q8, q0, q10 ; B
+ vmls.s16 q8, q1, q11 ; G
+ vmls.s16 q8, q2, q12 ; R
+ vadd.u16 q8, q8, q15 ; +128 -> unsigned
+
+ vmul.s16 q9, q2, q10 ; R
+ vmls.s16 q9, q1, q14 ; G
+ vmls.s16 q9, q0, q13 ; B
+ vadd.u16 q9, q9, q15 ; +128 -> unsigned
+
+ vqshrn.u16 d0, q8, #8 ; 16 bit to 8 bit U
+ vqshrn.u16 d1, q9, #8 ; 16 bit to 8 bit V
+
+ MEMACCESS 1
+ vst1.8 {d0}, [r1]! ; store 8 pixels U.
+ MEMACCESS 2
+ vst1.8 {d1}, [r2]! ; store 8 pixels V.
+ bgt %b1
+
+ vpop {q15}
+ vpop {q8 - q14}
+ vpop {q0 - q7}
+ bx lr
+ ENDP
+
+ ; 8x1 pixels.
+ARGBToUV444Row_NEON PROC
+ ; input
+ ; r0 = const uint8* src_argb
+ ; r1 = uint8* dst_u
+ ; r2 = uint8* dst_v
+ ; r3 = int pix
+ vpush {q0 - q4}
+ vpush {q12 - q15}
+
+ vmov.u8 d24, #112 ; UB / VR 0.875 coefficient
+ vmov.u8 d25, #74 ; UG -0.5781 coefficient
+ vmov.u8 d26, #38 ; UR -0.2969 coefficient
+ vmov.u8 d27, #18 ; VB -0.1406 coefficient
+ vmov.u8 d28, #94 ; VG -0.7344 coefficient
+ vmov.u16 q15, #0x8080 ; 128.5
+
+1
+ MEMACCESS 0
+ vld4.8 {d0, d1, d2, d3}, [r0]! ; load 8 ARGB pixels.
+ subs r3, r3, #8 ; 8 processed per loop.
+ vmull.u8 q2, d0, d24 ; B
+ vmlsl.u8 q2, d1, d25 ; G
+ vmlsl.u8 q2, d2, d26 ; R
+ vadd.u16 q2, q2, q15 ; +128 -> unsigned
+
+ vmull.u8 q3, d2, d24 ; R
+ vmlsl.u8 q3, d1, d28 ; G
+ vmlsl.u8 q3, d0, d27 ; B
+ vadd.u16 q3, q3, q15 ; +128 -> unsigned
+
+ vqshrn.u16 d0, q2, #8 ; 16 bit to 8 bit U
+ vqshrn.u16 d1, q3, #8 ; 16 bit to 8 bit V
+
+ MEMACCESS 1
+ vst1.8 {d0}, [r1]! ; store 8 pixels U.
+ MEMACCESS 2
+ vst1.8 {d1}, [r2]! ; store 8 pixels V.
+ bgt %b1
+
+ vpop {q12 - q15}
+ vpop {q0 - q4}
+ bx lr
+ ENDP
+
+YUY2ToUV422Row_NEON PROC
+ ; input
+ ; r0 = const uint8* src_yuy2
+ ; r1 = uint8* dst_u
+ ; r2 = uint8* dst_v
+ ; r3 = int pix
+ vpush {d0 - d3}
+
+1
+ MEMACCESS 0
+ vld4.8 {d0, d1, d2, d3}, [r0]! ; load 16 pixels of YUY2.
+ subs r3, r3, #16 ; 16 pixels = 8 UVs.
+ MEMACCESS 1
+ vst1.8 {d1}, [r1]! ; store 8 U.
+ MEMACCESS 2
+ vst1.8 {d3}, [r2]! ; store 8 V.
+ bgt %b1
+
+ vpop {d0 - d3}
+ bx lr
+ ENDP
+
+
+UYVYToUV422Row_NEON PROC
+ ; input
+ ; r0 = const uint8* src_uyvy
+ ; r1 = uint8* dst_u
+ ; r2 = uint8* dst_v
+ ; r3 = int pix
+ vpush {d0 - d3}
+
+1
+ MEMACCESS 0
+ vld4.8 {d0, d1, d2, d3}, [r0]! ; load 16 pixels of UYVY.
+ subs r3, r3, #16 ; 16 pixels = 8 UVs.
+ MEMACCESS 1
+ vst1.8 {d0}, [r1]! ; store 8 U.
+ MEMACCESS 2
+ vst1.8 {d2}, [r2]! ; store 8 V.
+ bgt %b1
+
+ vpop {d0 - d3}
+ bx lr
+ ENDP
+
+ ; Select G channels from ARGB. e.g. GGGGGGGG
+ARGBToBayerGGRow_NEON PROC
+ ; input
+ ; r0 = const uint8* src_argb
+ ; r1 = uint8* dst_bayer
+ ; r2 = uint32 selector
+ ; r3 = int pix
+ vpush {q0, q1}
+
+1
+ MEMACCESS 0
+ vld4.8 {d0, d1, d2, d3}, [r0]! ; load row 8 pixels.
+ subs r3, r3, #8 ; 8 processed per loop
+ MEMACCESS 1
+ vst1.8 {d1}, [r1]! ; store 8 G's.
+ bgt %b1
+
+ vpop {q0, q1}
+ bx lr
+ ENDP
+
+; For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
+ARGBShuffleRow_NEON PROC
+ ; input
+ ; r0 = const uint8* src_argb
+ ; r1 = uint8* dst_argb
+ ; r2 = const uint8* shuffler
+ ; r3 = int pix
+ vpush {q0 - q2}
+
+ MEMACCESS 3
+ vld1.8 {q2}, [r2] ; shuffler
+1
+ MEMACCESS 0
+ vld1.8 {q0}, [r0]! ; load 4 pixels.
+ subs r3, r3, #4 ; 4 processed per loop
+ vtbl.8 d2, {d0, d1}, d4 ; look up 2 first pixels
+ vtbl.8 d3, {d0, d1}, d5 ; look up 2 next pixels
+ MEMACCESS 1
+ vst1.8 {q1}, [r1]! ; store 4.
+ bgt %b1
+
+ vpop {q0 - q2}
+ bx lr
+ ENDP
+
+ ; TODO(fbarchard): Subsample match C code.
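+ ; ARGBToUVJRow uses the full-range (JPEG) constants instead, roughly:
+ ;   u = (127 * b - 84 * g - 43 * r + 0x8080) >> 8
+ ;   v = (127 * r - 107 * g - 20 * b + 0x8080) >> 8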
+ARGBToUVJRow_NEON PROC
+ ; input
+ ; r0 = const uint8* src_argb
+ ; r1 = int src_stride_argb
+ ; r2 = uint8* dst_u
+ ; r3 = uint8* dst_v
+ push {r4}
+ ldr r4, [sp,#4] ; int pix
+ vpush {q0 - q7}
+ vpush {q8 - q14}
+ vpush {q15}
+
+ add r1, r0, r1 ; src_stride + src_argb
+ vmov.s16 q10, #127 / 2 ; UB / VR 0.500 coefficient
+ vmov.s16 q11, #84 / 2 ; UG -0.33126 coefficient
+ vmov.s16 q12, #43 / 2 ; UR -0.16874 coefficient
+ vmov.s16 q13, #20 / 2 ; VB -0.08131 coefficient
+ vmov.s16 q14, #107 / 2 ; VG -0.41869 coefficient
+ vmov.u16 q15, #0x8080 ; 128.5
+
+1
+ MEMACCESS 0
+ vld4.8 {d0, d2, d4, d6}, [r0]! ; load 8 ARGB pixels.
+ MEMACCESS 0
+ vld4.8 {d1, d3, d5, d7}, [r0]! ; load next 8 ARGB pixels.
+ vpaddl.u8 q0, q0 ; B 16 bytes -> 8 shorts.
+ vpaddl.u8 q1, q1 ; G 16 bytes -> 8 shorts.
+ vpaddl.u8 q2, q2 ; R 16 bytes -> 8 shorts.
+ MEMACCESS 1
+ vld4.8 {d8, d10, d12, d14}, [r1]! ; load 8 more ARGB pixels.
+ MEMACCESS 1
+ vld4.8 {d9, d11, d13, d15}, [r1]! ; load last 8 ARGB pixels.
+ vpadal.u8 q0, q4 ; B 16 bytes -> 8 shorts.
+ vpadal.u8 q1, q5 ; G 16 bytes -> 8 shorts.
+ vpadal.u8 q2, q6 ; R 16 bytes -> 8 shorts.
+
+ vrshr.u16 q0, q0, #1 ; 2x average
+ vrshr.u16 q1, q1, #1
+ vrshr.u16 q2, q2, #1
+
+ subs r4, r4, #16 ; 32 processed per loop.
+ RGBTOUV q0, q1, q2
+ MEMACCESS 2
+ vst1.8 {d0}, [r2]! ; store 8 pixels U.
+ MEMACCESS 3
+ vst1.8 {d1}, [r3]! ; store 8 pixels V.
+ bgt %b1
+
+ vpop {q15}
+ vpop {q8 - q14}
+ vpop {q0 - q7}
+ pop {r4}
+ bx lr
+ ENDP
+
+
+BGRAToUVRow_NEON PROC
+ ; input
+ ; r0 = const uint8* src_bgra
+ ; r1 = int src_stride_bgra
+ ; r2 = uint8* dst_u
+ ; r3 = uint8* dst_v
+ push {r4}
+ ldr r4, [sp,#4] ; int pix
+ vpush {q0 - q7}
+ vpush {q8 - q14}
+ vpush {q15}
+
+ add r1, r0, r1 ; src_stride + src_bgra
+ vmov.s16 q10, #112 / 2 ; UB / VR 0.875 coefficient
+ vmov.s16 q11, #74 / 2 ; UG -0.5781 coefficient
+ vmov.s16 q12, #38 / 2 ; UR -0.2969 coefficient
+ vmov.s16 q13, #18 / 2 ; VB -0.1406 coefficient
+ vmov.s16 q14, #94 / 2 ; VG -0.7344 coefficient
+ vmov.u16 q15, #0x8080 ; 128.5
+
+1
+ MEMACCESS 0
+ vld4.8 {d0, d2, d4, d6}, [r0]! ; load 8 BGRA pixels.
+ MEMACCESS 0
+ vld4.8 {d1, d3, d5, d7}, [r0]! ; load next 8 BGRA pixels.
+ vpaddl.u8 q3, q3 ; B 16 bytes -> 8 shorts.
+ vpaddl.u8 q2, q2 ; G 16 bytes -> 8 shorts.
+ vpaddl.u8 q1, q1 ; R 16 bytes -> 8 shorts.
+ MEMACCESS 1
+ vld4.8 {d8, d10, d12, d14}, [r1]! ; load 8 more BGRA pixels.
+ MEMACCESS 1
+ vld4.8 {d9, d11, d13, d15}, [r1]! ; load last 8 BGRA pixels.
+ vpadal.u8 q3, q7 ; B 16 bytes -> 8 shorts.
+ vpadal.u8 q2, q6 ; G 16 bytes -> 8 shorts.
+ vpadal.u8 q1, q5 ; R 16 bytes -> 8 shorts.
+
+ vrshr.u16 q1, q1, #1 ; 2x average
+ vrshr.u16 q2, q2, #1
+ vrshr.u16 q3, q3, #1
+
+ subs r4, r4, #16 ; 32 processed per loop.
+ RGBTOUV q3, q2, q1
+ MEMACCESS 2
+ vst1.8 {d0}, [r2]! ; store 8 pixels U.
+ MEMACCESS 3
+ vst1.8 {d1}, [r3]! ; store 8 pixels V.
+ bgt %b1
+
+ vpop {q15}
+ vpop {q8 - q14}
+ vpop {q0 - q7}
+ pop {r4}
+ bx lr
+ ENDP
+
+ARGBExtractAlphaRow_NEON PROC
+ ; input
+ ; r0 = const uint8* src_argb
+ ; r1 = uint8* dst_y
+ ; r2 = int pix
+ vpush {q0 - q3}
+
+1
+ MEMACCESS 0
+ vld4.8 {d0, d2, d4, d6}, [r0]! ; load 8 ARGB pixels
+ vld4.8 {d1, d3, d5, d7}, [r0]! ; load next 8 ARGB pixels
+ subs r2, r2, #16 ; 16 processed per loop
+ MEMACCESS 1
+ vst1.8 {q3}, [r1]! ; store 16 A's.
+ bgt %b1
+
+ vpop {q0 - q3}
+ bx lr
+ ENDP
+
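+; ARGBToYJRow computes the full-range (JPEG) luma, roughly:
+;   y = (15 * b + 75 * g + 38 * r + 64) >> 7
+; i.e. the same formula as ARGBGrayRow but stored as a single Y plane.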
+ARGBToYJRow_NEON PROC
+ ; input
+ ; r0 = const uint8* src_argb
+ ; r1 = uint8* dst_y
+ ; r2 = int pix
+ vpush {q0 - q2}
+ vpush {q12 - q13}
+
+ vmov.u8 d24, #15 ; B * 0.11400 coefficient
+ vmov.u8 d25, #75 ; G * 0.58700 coefficient
+ vmov.u8 d26, #38 ; R * 0.29900 coefficient
+
+1
+ MEMACCESS 0
+ vld4.8 {d0, d1, d2, d3}, [r0]! ; load 8 ARGB pixels.
+ subs r2, r2, #8 ; 8 processed per loop.
+ vmull.u8 q2, d0, d24 ; B
+ vmlal.u8 q2, d1, d25 ; G
+ vmlal.u8 q2, d2, d26 ; R
+ vqrshrun.s16 d0, q2, #7 ; 15 bit to 8 bit Y
+ MEMACCESS 1
+ vst1.8 {d0}, [r1]! ; store 8 pixels Y.
+ bgt %b1
+
+ vpop {q12 - q13}
+ vpop {q0 - q2}
+ bx lr
+ ENDP
+
+I422ToYUY2Row_NEON PROC
+ ; input
+ ; r0 = const uint8* src_y
+ ; r1 = const uint8* src_u
+ ; r2 = const uint8* src_v
+ ; r3 = uint8* dst_yuy2
+ push {r4}
+ ldr r4, [sp,#4] ; int width
+ vpush {d0 - d3}
+
+1
+ MEMACCESS 0
+ vld2.8 {d0, d2}, [r0]! ; load 16 Ys
+ MEMACCESS 1
+ vld1.8 {d1}, [r1]! ; load 8 Us
+ MEMACCESS 2
+ vld1.8 {d3}, [r2]! ; load 8 Vs
+ subs r4, r4, #16 ; 16 pixels
+ MEMACCESS 3
+ vst4.8 {d0, d1, d2, d3}, [r3]! ; Store 8 YUY2/16 pixels.
+ bgt %b1
+
+ vpop {d0 - d3}
+ pop {r4}
+ bx lr
+ ENDP
+
+I422ToUYVYRow_NEON PROC
+ ; input
+ ; r0 = const uint8* src_y
+ ; r1 = const uint8* src_u
+ ; r2 = const uint8* src_v
+ ; r3 = uint8* dst_uyvy
+ push {r4}
+ ldr r4, [sp,#4] ; int width
+ vpush {d0 - d3}
+
+1
+ MEMACCESS 0
+ vld2.8 {d1, d3}, [r0]! ; load 16 Ys
+ MEMACCESS 1
+ vld1.8 {d0}, [r1]! ; load 8 Us
+ MEMACCESS 2
+ vld1.8 {d2}, [r2]! ; load 8 Vs
+ subs r4, r4, #16 ; 16 pixels
+ MEMACCESS 3
+ vst4.8 {d0, d1, d2, d3}, [r3]! ; Store 8 UYVY/16 pixels.
+ bgt %b1
+
+ vpop {d0 - d3}
+ pop {r4}
+ bx lr
+ ENDP
+
+ ; TODO(fbarchard): Consider vhadd vertical, then vpaddl horizontal, ashr.
+ARGBToUVRow_NEON PROC
+ ; input
+ ; r0 = const uint8* src_argb
+ ; r1 = int src_stride_argb
+ ; r2 = uint8* dst_u
+ ; r3 = uint8* dst_v
+ push {r4}
+ ldr r4, [sp,#4] ; int pix
+ vpush {q0 - q7}
+ vpush {q8 - q14}
+ vpush {q15}
+
+ add r1, r0, r1 ; src_stride + src_argb
+ vmov.s16 q10, #112 / 2 ; UB / VR 0.875 coefficient
+ vmov.s16 q11, #74 / 2 ; UG -0.5781 coefficient
+ vmov.s16 q12, #38 / 2 ; UR -0.2969 coefficient
+ vmov.s16 q13, #18 / 2 ; VB -0.1406 coefficient
+ vmov.s16 q14, #94 / 2 ; VG -0.7344 coefficient
+ vmov.u16 q15, #0x8080 ; 128.5
+
+1
+ MEMACCESS 0
+ vld4.8 {d0, d2, d4, d6}, [r0]! ; load 8 ARGB pixels.
+ MEMACCESS 0
+ vld4.8 {d1, d3, d5, d7}, [r0]! ; load next 8 ARGB pixels.
+ vpaddl.u8 q0, q0 ; B 16 bytes -> 8 shorts.
+ vpaddl.u8 q1, q1 ; G 16 bytes -> 8 shorts.
+ vpaddl.u8 q2, q2 ; R 16 bytes -> 8 shorts.
+ MEMACCESS 1
+ vld4.8 {d8, d10, d12, d14}, [r1]! ; load 8 more ARGB pixels.
+ MEMACCESS 1
+ vld4.8 {d9, d11, d13, d15}, [r1]! ; load last 8 ARGB pixels.
+ vpadal.u8 q0, q4 ; B 16 bytes -> 8 shorts.
+ vpadal.u8 q1, q5 ; G 16 bytes -> 8 shorts.
+ vpadal.u8 q2, q6 ; R 16 bytes -> 8 shorts.
+
+ vrshr.u16 q0, q0, #1 ; 2x average
+ vrshr.u16 q1, q1, #1
+ vrshr.u16 q2, q2, #1
+
+ subs r4, r4, #16 ; 32 processed per loop.
+ RGBTOUV q0, q1, q2
+ MEMACCESS 2
+ vst1.8 {d0}, [r2]! ; store 8 pixels U.
+ MEMACCESS 3
+ vst1.8 {d1}, [r3]! ; store 8 pixels V.
+ bgt %b1
+
+ vpop {q15}
+ vpop {q8 - q14}
+ vpop {q0 - q7}
+ pop {r4}
+ bx lr
+ ENDP
+
+ARGBToYRow_NEON PROC
+ ; input
+ ; r0 = const uint8* src_argb
+ ; r1 = uint8* dst_y
+ ; r2 = int pix
+ vpush {q0-q2}
+ vpush {q12,q13}
+
+ vmov.u8 d24, #13 ; B * 0.1016 coefficient
+ vmov.u8 d25, #65 ; G * 0.5078 coefficient
+ vmov.u8 d26, #33 ; R * 0.2578 coefficient
+ vmov.u8 d27, #16 ; Add 16 constant
+
+1
+ MEMACCESS 0
+ vld4.8 {d0, d1, d2, d3}, [r0]! ; load 8 ARGB pixels.
+ subs r2, r2, #8 ; 8 processed per loop.
+ vmull.u8 q2, d0, d24 ; B
+ vmlal.u8 q2, d1, d25 ; G
+ vmlal.u8 q2, d2, d26 ; R
+ vqrshrun.s16 d0, q2, #7 ; 16 bit to 8 bit Y
+ vqadd.u8 d0, d27
+ MEMACCESS 1
+ vst1.8 {d0}, [r1]! ; store 8 pixels Y.
+ bgt %b1
+
+ vpop {q12, q13}
+ vpop {q0-q2}
+ bx lr
+ ENDP
+
+RAWToUVRow_NEON PROC
+ ; input
+ ; r0 = const uint8* src_raw
+ ; r1 = int src_stride_raw
+ ; r2 = uint8* dst_u
+ ; r3 = uint8* dst_v
+ push {r4}
+ ldr r4, [sp,#4] ; int pix
+ vpush {q0 - q7}
+ vpush {q8 - q14}
+ vpush {q15}
+
+ add r1, r0, r1 ; src_stride + src_raw
+ vmov.s16 q10, #112 / 2 ; UB / VR 0.875 coefficient
+ vmov.s16 q11, #74 / 2 ; UG -0.5781 coefficient
+ vmov.s16 q12, #38 / 2 ; UR -0.2969 coefficient
+ vmov.s16 q13, #18 / 2 ; VB -0.1406 coefficient
+ vmov.s16 q14, #94 / 2 ; VG -0.7344 coefficient
+ vmov.u16 q15, #0x8080 ; 128.5
+
+1
+ MEMACCESS 0
+ vld3.8 {d0, d2, d4}, [r0]! ; load 8 RAW pixels.
+ MEMACCESS 0
+ vld3.8 {d1, d3, d5}, [r0]! ; load next 8 RAW pixels.
+ vpaddl.u8 q2, q2 ; B 16 bytes -> 8 shorts.
+ vpaddl.u8 q1, q1 ; G 16 bytes -> 8 shorts.
+ vpaddl.u8 q0, q0 ; R 16 bytes -> 8 shorts.
+ MEMACCESS 1
+ vld3.8 {d8, d10, d12}, [r1]! ; load 8 more RAW pixels.
+ MEMACCESS 1
+ vld3.8 {d9, d11, d13}, [r1]! ; load last 8 RAW pixels.
+ vpadal.u8 q2, q6 ; B 16 bytes -> 8 shorts.
+ vpadal.u8 q1, q5 ; G 16 bytes -> 8 shorts.
+ vpadal.u8 q0, q4 ; R 16 bytes -> 8 shorts.
+
+ vrshr.u16 q0, q0, #1 ; 2x average
+ vrshr.u16 q1, q1, #1
+ vrshr.u16 q2, q2, #1
+
+ subs r4, r4, #16 ; 32 processed per loop.
+ RGBTOUV q2, q1, q0
+ MEMACCESS 2
+ vst1.8 {d0}, [r2]! ; store 8 pixels U.
+ MEMACCESS 3
+ vst1.8 {d1}, [r3]! ; store 8 pixels V.
+ bgt %b1
+
+ vpop {q15}
+ vpop {q8 - q14}
+ vpop {q0 - q7}
+ pop {r4}
+ bx lr
+ ENDP
+
+RAWToYRow_NEON PROC
+ ; input
+ ; r0 = const uint8* src_raw
+ ; r1 = uint8* dst_y
+ ; r2 = int pix
+ vpush {d0 - d7}
+ vpush {q8}
+
+ vmov.u8 d4, #33 ; R * 0.2578 coefficient
+ vmov.u8 d5, #65 ; G * 0.5078 coefficient
+ vmov.u8 d6, #13 ; B * 0.1016 coefficient
+ vmov.u8 d7, #16 ; Add 16 constant
+
+1
+ MEMACCESS 0
+ vld3.8 {d0, d1, d2}, [r0]! ; load 8 pixels of RAW.
+ subs r2, r2, #8 ; 8 processed per loop.
+ vmull.u8 q8, d0, d4 ; B
+ vmlal.u8 q8, d1, d5 ; G
+ vmlal.u8 q8, d2, d6 ; R
+ vqrshrun.s16 d0, q8, #7 ; 16 bit to 8 bit Y
+ vqadd.u8 d0, d7
+ MEMACCESS 1
+ vst1.8 {d0}, [r1]! ; store 8 pixels Y.
+ bgt %b1
+
+ vpop {q8}
+ vpop {d0-d7}
+ bx lr
+ ENDP
+
+
+RGB24ToUVRow_NEON PROC
+ ; input
+ ; r0 = const uint8* src_rgb24
+ ; r1 = int src_stride_rgb24
+ ; r2 = uint8* dst_u
+ ; r3 = uint8* dst_v
+ push {r4}
+ ldr r4, [sp,#4] ; int pix
+ vpush {q0 - q7}
+ vpush {q8 - q14}
+ vpush {q15}
+
+ add r1, r0, r1 ; src_stride + src_rgb24
+ vmov.s16 q10, #112 / 2 ; UB / VR 0.875 coefficient
+ vmov.s16 q11, #74 / 2 ; UG -0.5781 coefficient
+ vmov.s16 q12, #38 / 2 ; UR -0.2969 coefficient
+ vmov.s16 q13, #18 / 2 ; VB -0.1406 coefficient
+ vmov.s16 q14, #94 / 2 ; VG -0.7344 coefficient
+ vmov.u16 q15, #0x8080 ; 128.5
+
+1
+ MEMACCESS 0
+ vld3.8 {d0, d2, d4}, [r0]! ; load 8 RGB24 pixels.
+ MEMACCESS 0
+ vld3.8 {d1, d3, d5}, [r0]! ; load next 8 RGB24 pixels.
+ vpaddl.u8 q0, q0 ; B 16 bytes -> 8 shorts.
+ vpaddl.u8 q1, q1 ; G 16 bytes -> 8 shorts.
+ vpaddl.u8 q2, q2 ; R 16 bytes -> 8 shorts.
+ MEMACCESS 1
+ vld3.8 {d8, d10, d12}, [r1]! ; load 8 more RGB24 pixels.
+ MEMACCESS 1
+ vld3.8 {d9, d11, d13}, [r1]! ; load last 8 RGB24 pixels.
+ vpadal.u8 q0, q4 ; B 16 bytes -> 8 shorts.
+ vpadal.u8 q1, q5 ; G 16 bytes -> 8 shorts.
+ vpadal.u8 q2, q6 ; R 16 bytes -> 8 shorts.
+
+ vrshr.u16 q0, q0, #1 ; 2x average
+ vrshr.u16 q1, q1, #1
+ vrshr.u16 q2, q2, #1
+
+ subs r4, r4, #16 ; 32 processed per loop.
+ RGBTOUV q0, q1, q2
+ MEMACCESS 2
+ vst1.8 {d0}, [r2]! ; store 8 pixels U.
+ MEMACCESS 3
+ vst1.8 {d1}, [r3]! ; store 8 pixels V.
+ bgt %b1
+
+ vpop {q15}
+ vpop {q8 - q14}
+ vpop {q0 - q7}
+ pop {r4}
+ bx lr
+ ENDP
+
+UYVYToUVRow_NEON PROC
+ ; input
+ ; r0 = const uint8* src_uyvy
+ ; r1 = int stride_uyvy
+ ; r2 = uint8* dst_u
+ ; r3 = uint8* dst_v
+ push {r4}
+ ldr r4, [sp,#4] ; int pix
+ vpush {q0 - q7}
+ vpush {q8 - q14}
+ vpush {q15}
+
+ add r1, r0, r1 ; stride + src_uyvy
+1
+ MEMACCESS 0
+ vld4.8 {d0, d1, d2, d3}, [r0]! ; load 16 pixels of UYVY.
+ subs r4, r4, #16 ; 16 pixels = 8 UVs.
+ MEMACCESS 1
+ vld4.8 {d4, d5, d6, d7}, [r1]! ; load next row UYVY.
+ vrhadd.u8 d0, d0, d4 ; average rows of U
+ vrhadd.u8 d2, d2, d6 ; average rows of V
+ MEMACCESS 2
+ vst1.8 {d0}, [r2]! ; store 8 U.
+ MEMACCESS 3
+ vst1.8 {d2}, [r3]! ; store 8 V.
+ bgt %b1
+
+ vpop {q15}
+ vpop {q8 - q14}
+ vpop {q0 - q7}
+ pop {r4}
+ bx lr
+ ENDP
+
+UYVYToYRow_NEON PROC
+ ; input
+ ; r0 = const uint8* src_uyvy
+ ; r1 = uint8* dst_y
+ ; r2 = int pix
+ vpush {q0, q1}
+
+1
+ MEMACCESS 0
+ vld2.8 {q0, q1}, [r0]! ; load 16 pixels of UYVY.
+ subs r2, r2, #16 ; 16 processed per loop.
+ MEMACCESS 1
+ vst1.8 {q1}, [r1]! ; store 16 pixels of Y.
+ bgt %b1
+
+ vpop {q0, q1}
+ bx lr
+ ENDP
+
+YUY2ToUVRow_NEON PROC
+ ; input
+ ; r0 = const uint8* src_yuy2
+ ; r1 = int stride_yuy2
+ ; r2 = uint8* dst_u
+ ; r3 = uint8* dst_v
+ push {r4}
+ ldr r4, [sp,#4] ; int pix
+ vpush {d0 - d7}
+
+ add r1, r0, r1 ; stride + src_yuy2
+
+1
+ MEMACCESS 0
+ vld4.8 {d0, d1, d2, d3}, [r0]! ; load 16 pixels of YUY2.
+ subs r4, r4, #16 ; 16 pixels = 8 UVs.
+ MEMACCESS 1
+ vld4.8 {d4, d5, d6, d7}, [r1]! ; load next row YUY2.
+ vrhadd.u8 d1, d1, d5 ; average rows of U
+ vrhadd.u8 d3, d3, d7 ; average rows of V
+ MEMACCESS 2
+ vst1.8 {d1}, [r2]! ; store 8 U.
+ MEMACCESS 3
+ vst1.8 {d3}, [r3]! ; store 8 V.
+ bgt %b1
+
+ vpop {d0 - d7}
+ pop {r4}
+ bx lr
+ ENDP
+
+YUY2ToYRow_NEON PROC
+ ; input
+ ; r0 = const uint8* src_yuy2
+ ; r1 = uint8* dst_y
+ ; r2 = int pix
+ vpush {q0, q1}
+
+1
+ MEMACCESS 0
+ vld2.8 {q0, q1}, [r0]! ; load 16 pixels of YUY2.
+ subs r2, r2, #16 ; 16 processed per loop.
+ MEMACCESS 1
+ vst1.8 {q0}, [r1]! ; store 16 pixels of Y.
+ bgt %b1
+
+ vpop {q0, q1}
+ bx lr
+ ENDP
+
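+; dither4 packs four per-pixel dither bytes; vdup + vqadd applies byte (j mod 4)
+; of the pattern to pixel j of each channel before the ARGBTORGB565 pack.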
+ARGBToRGB565DitherRow_NEON PROC
+ ; input
+ ; r0 = const uint8* src_argb
+ ; r1 = uint8* dst_rgb
+ ; r2 = const uint32 dither4
+ ; r3 = int width
+ vpush {q0, q1}
+ vpush {q8-q11}
+
+ vdup.32 d2, r2 ; dither4
+1
+ MEMACCESS 1
+ vld4.8 {d20, d21, d22, d23}, [r0]! ; load 8 pixels of ARGB.
+ subs r3, r3, #8 ; 8 processed per loop.
+ vqadd.u8 d20, d20, d2
+ vqadd.u8 d21, d21, d2
+ vqadd.u8 d22, d22, d2
+ ARGBTORGB565
+ MEMACCESS 0
+ vst1.8 {q0}, [r1]! ; store 8 pixels RGB565.
+ bgt %b1
+
+ vpop {q8-q11}
+ vpop {q0, q1}
+ bx lr
+ ENDP
+
+; Add 2 rows of ARGB pixels together, 8 pixels at a time.
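+; Per byte: dst = min(255, src_argb0 + src_argb1), since vqadd.u8 saturates.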
+ARGBAddRow_NEON PROC
+ ; input
+ ; r0 = const uint8* src_argb0
+ ; r1 = uint8* src_argb1
+ ; r2 = uint8* dst_argb
+ ; r3 = int width
+ vpush {q0 - q3}
+ ; 8 pixel loop.
+1
+ MEMACCESS 0
+ vld4.8 {d0, d1, d2, d3}, [r0]! ; load 8 ARGB pixels.
+ MEMACCESS 1
+ vld4.8 {d4, d5, d6, d7}, [r1]! ; load 8 more ARGB pixels.
+ subs r3, r3, #8 ; 8 processed per loop.
+ vqadd.u8 q0, q0, q2 ; add B, G
+ vqadd.u8 q1, q1, q3 ; add R, A
+ MEMACCESS 2
+ vst4.8 {d0, d1, d2, d3}, [r2]! ; store 8 ARGB pixels.
+ bgt %b1
+
+ vpop {q0 - q3}
+ bx lr
+ ENDP
+
+; Subtract 2 rows of ARGB pixels, 8 pixels at a time.
+ARGBSubtractRow_NEON PROC
+ ; input
+ ; r0 = const uint8* src_argb0
+ ; r1 = uint8* src_argb1
+ ; r2 = uint8* dst_argb
+ ; r3 = int width
+ vpush {q0 - q3}
+ ; 8 pixel loop.
+
+1
+ MEMACCESS 0
+ vld4.8 {d0, d1, d2, d3}, [r0]! ; load 8 ARGB pixels.
+ MEMACCESS 1
+ vld4.8 {d4, d5, d6, d7}, [r1]! ; load 8 more ARGB pixels.
+ subs r3, r3, #8 ; 8 processed per loop.
+ vqsub.u8 q0, q0, q2 ; subtract B, G
+ vqsub.u8 q1, q1, q3 ; subtract R, A
+ MEMACCESS 2
+ vst4.8 {d0, d1, d2, d3}, [r2]! ; store 8 ARGB pixels.
+ bgt %b1
+
+ vpop {q0 - q3}
+ bx lr
+ ENDP
+
+; Adds Sobel X and Sobel Y and stores Sobel into ARGB.
+; A = 255
+; R = Sobel
+; G = Sobel
+; B = Sobel
+SobelRow_NEON PROC
+ ; input
+ ; r0 = const uint8* src_sobelx
+ ; r1 = const uint8* src_sobely
+ ; r2 = uint8* dst_argb
+ ; r3 = int width
+ vpush {q0 - q1}
+ vmov.u8 d3, #255 ; alpha
+ ; 8 pixel loop.
+
+1
+ MEMACCESS 0
+ vld1.8 {d0}, [r0]! ; load 8 sobelx.
+ MEMACCESS 1
+ vld1.8 {d1}, [r1]! ; load 8 sobely.
+ subs r3, r3, #8 ; 8 processed per loop.
+ vqadd.u8 d0, d0, d1 ; add
+ vmov.u8 d1, d0
+ vmov.u8 d2, d0
+ MEMACCESS 2
+ vst4.8 {d0, d1, d2, d3}, [r2]! ; store 8 ARGB pixels.
+ bgt %b1
+
+ vpop {q0 - q1}
+ bx lr
+ ENDP
+
+; Adds Sobel X and Sobel Y and stores Sobel into plane.
+SobelToPlaneRow_NEON PROC
+ ; input
+ ; r0 = const uint8* src_sobelx
+ ; r1 = const uint8* src_sobely
+ ; r2 = uint8* dst_y
+ ; r3 = int width
+ vpush {q0 - q1}
+ ; 16 pixel loop.
+
+1
+ MEMACCESS 0
+ vld1.8 {q0}, [r0]! ; load 16 sobelx.
+ MEMACCESS 1
+ vld1.8 {q1}, [r1]! ; load 16 sobely.
+ subs r3, r3, #16 ; 16 processed per loop.
+ vqadd.u8 q0, q0, q1 ; add
+ MEMACCESS 2
+ vst1.8 {q0}, [r2]! ; store 16 pixels.
+ bgt %b1
+
+ vpop {q0 - q1}
+ bx lr
+ ENDP
+
+; Attenuate 8 pixels at a time.
+ARGBAttenuateRow_NEON PROC
+ ; input
+ ; r0 = const uint8* src_argb
+ ; r1 = uint8* dst_argb
+ ; r2 = int width
+ vpush {q0 - q1}
+ vpush {q10 - q12}
+
+ ; Attenuate 8 pixels.
+1
+ MEMACCESS 0
+ vld4.8 {d0, d1, d2, d3}, [r0]! ; load 8 pixels of ARGB.
+ subs r2, r2, #8 ; 8 processed per loop.
+ vmull.u8 q10, d0, d3 ; b * a
+ vmull.u8 q11, d1, d3 ; g * a
+ vmull.u8 q12, d2, d3 ; r * a
+ vqrshrn.u16 d0, q10, #8 ; b >>= 8
+ vqrshrn.u16 d1, q11, #8 ; g >>= 8
+ vqrshrn.u16 d2, q12, #8 ; r >>= 8
+ MEMACCESS 1
+ vst4.8 {d0, d1, d2, d3}, [r1]! ; store 8 pixels of ARGB.
+ bgt %b1
+
+ vpop {q10 - q12}
+ vpop {q0 - q1}
+ bx lr
+ ENDP
+
+; Quantize 8 ARGB pixels (32 bytes).
+; dst = (dst * scale >> 16) * interval_size + interval_offset;
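+; scale is pre-shifted right by one below because vqdmulh doubles its product
+; before taking the high 16 bits, so (x * (scale >> 1) * 2) >> 16 matches the
+; (x * scale) >> 16 in the formula above.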
+ARGBQuantizeRow_NEON PROC
+ ; input
+ ; r0 = uint8* dst_argb
+ ; r1 = int scale
+ ; r2 = int interval_size
+ ; r3 = int interval_offset
+ push {r2 - r4}
+ ldr r4, [sp,#12] ; int width
+ vpush {q0 - q3}
+ vpush {q8 - q10}
+
+ vdup.u16 q8, r1
+ vshr.u16 q8, q8, #1 ; scale >>= 1
+ vdup.u16 q9, r2 ; interval multiply.
+ vdup.u16 q10, r3 ; interval add
+
+ ; 8 pixel loop.
+1
+ MEMACCESS 0
+ vld4.8 {d0, d2, d4, d6}, [r0] ; load 8 pixels of ARGB.
+ subs r4, r4, #8 ; 8 processed per loop.
+ vmovl.u8 q0, d0 ; b (0 .. 255)
+ vmovl.u8 q1, d2
+ vmovl.u8 q2, d4
+ vqdmulh.s16 q0, q0, q8 ; b * scale
+ vqdmulh.s16 q1, q1, q8 ; g
+ vqdmulh.s16 q2, q2, q8 ; r
+ vmul.u16 q0, q0, q9 ; b * interval_size
+ vmul.u16 q1, q1, q9 ; g
+ vmul.u16 q2, q2, q9 ; r
+ vadd.u16 q0, q0, q10 ; b + interval_offset
+ vadd.u16 q1, q1, q10 ; g
+ vadd.u16 q2, q2, q10 ; r
+ vqmovn.u16 d0, q0
+ vqmovn.u16 d2, q1
+ vqmovn.u16 d4, q2
+ MEMACCESS 0
+ vst4.8 {d0, d2, d4, d6}, [r0]! ; store 8 pixels of ARGB.
+ bgt %b1
+
+ vpop {q8 - q10}
+ vpop {q0 - q3}
+ pop {r2 - r4}
+ bx lr
+ ENDP
+
+; Shade 8 pixels at a time by specified value.
+; NOTE vqrdmulh.s16 q10, q10, d0[0] must use a scalar register from 0 to 8.
+; Rounding in vqrdmulh does +1 to high if high bit of low s16 is set.
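+; The packed 'value' is expanded so each 16-bit lane holds one channel's scale
+; in both bytes (v * 0x0101) and halved, so vqrdmulh yields roughly
+; (x * v) / 255 per channel.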
+ARGBShadeRow_NEON PROC
+ ; input
+ ; r0 = const uint8* src_argb
+ ; r1 = uint8* dst_argb
+ ; r2 = int width
+ ; r3 = int value
+ vpush {q0}
+ vpush {q10 - q13}
+
+ vdup.u32 q0, r3 ; duplicate scale value.
+ vzip.u8 d0, d1 ; d0 aarrggbb.
+ vshr.u16 q0, q0, #1 ; scale / 2.
+
+ ; 8 pixel loop.
+1
+ MEMACCESS 0
+ vld4.8 {d20, d22, d24, d26}, [r0]! ; load 8 pixels of ARGB.
+ subs r2, r2, #8 ; 8 processed per loop.
+ vmovl.u8 q10, d20 ; b (0 .. 255)
+ vmovl.u8 q11, d22
+ vmovl.u8 q12, d24
+ vmovl.u8 q13, d26
+ vqrdmulh.s16 q10, q10, d0[0] ; b * scale * 2
+ vqrdmulh.s16 q11, q11, d0[1] ; g
+ vqrdmulh.s16 q12, q12, d0[2] ; r
+ vqrdmulh.s16 q13, q13, d0[3] ; a
+ vqmovn.u16 d20, q10
+ vqmovn.u16 d22, q11
+ vqmovn.u16 d24, q12
+ vqmovn.u16 d26, q13
+ MEMACCESS 1
+ vst4.8 {d20, d22, d24, d26}, [r1]! ; store 8 pixels of ARGB.
+ bgt %b1
+
+ vpop {q10 - q13}
+ vpop {q0}
+ bx lr
+ ENDP
+
+; Convert 8 ARGB pixels (64 bytes) to 8 Gray ARGB pixels
+; Similar to ARGBToYJ but stores ARGB.
+; C code is (15 * b + 75 * g + 38 * r + 64) >> 7;
+ARGBGrayRow_NEON PROC
+ ; input
+ ; r0 = const uint8* src_argb
+ ; r1 = uint8* dst_argb
+ ; r2 = int width
+ vpush {q0- q2}
+ vpush {q12 - q13}
+
+ vmov.u8 d24, #15 ; B * 0.11400 coefficient
+ vmov.u8 d25, #75 ; G * 0.58700 coefficient
+ vmov.u8 d26, #38 ; R * 0.29900 coefficient
+
+1
+ MEMACCESS 0
+ vld4.8 {d0, d1, d2, d3}, [r0]! ; load 8 ARGB pixels.
+ subs r2, r2, #8 ; 8 processed per loop.
+ vmull.u8 q2, d0, d24 ; B
+ vmlal.u8 q2, d1, d25 ; G
+ vmlal.u8 q2, d2, d26 ; R
+ vqrshrun.s16 d0, q2, #7 ; 15 bit to 8 bit B
+ vmov d1, d0 ; G
+ vmov d2, d0 ; R
+ MEMACCESS 1
+ vst4.8 {d0, d1, d2, d3}, [r1]! ; store 8 ARGB pixels.
+ bgt %b1
+
+ vpop {q12 - q13}
+ vpop {q0 - q2}
+ bx lr
+ ENDP
+
+; Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
+; b = (r * 35 + g * 68 + b * 17) >> 7
+; g = (r * 45 + g * 88 + b * 22) >> 7
+; r = (r * 50 + g * 98 + b * 24) >> 7
+ARGBSepiaRow_NEON PROC
+ ; input
+ ; r0 = uint8* dst_argb
+ ; r1 = int width
+ vpush {q0- q3}
+ vpush {q10 - q15}
+
+ vmov.u8 d20, #17 ; BB coefficient
+ vmov.u8 d21, #68 ; BG coefficient
+ vmov.u8 d22, #35 ; BR coefficient
+ vmov.u8 d24, #22 ; GB coefficient
+ vmov.u8 d25, #88 ; GG coefficient
+ vmov.u8 d26, #45 ; GR coefficient
+ vmov.u8 d28, #24 ; RB coefficient
+ vmov.u8 d29, #98 ; RG coefficient
+ vmov.u8 d30, #50 ; RR coefficient
+
+1
+ MEMACCESS 0
+ vld4.8 {d0, d1, d2, d3}, [r0] ; load 8 ARGB pixels.
+ subs r1, r1, #8 ; 8 processed per loop.
+ vmull.u8 q2, d0, d20 ; B to Sepia B
+ vmlal.u8 q2, d1, d21 ; G
+ vmlal.u8 q2, d2, d22 ; R
+ vmull.u8 q3, d0, d24 ; B to Sepia G
+ vmlal.u8 q3, d1, d25 ; G
+ vmlal.u8 q3, d2, d26 ; R
+ vmull.u8 q8, d0, d28 ; B to Sepia R
+ vmlal.u8 q8, d1, d29 ; G
+ vmlal.u8 q8, d2, d30 ; R
+ vqshrn.u16 d0, q2, #7 ; 16 bit to 8 bit B
+ vqshrn.u16 d1, q3, #7 ; 16 bit to 8 bit G
+ vqshrn.u16 d2, q8, #7 ; 16 bit to 8 bit R
+ MEMACCESS 0
+ vst4.8 {d0, d1, d2, d3}, [r0]! ; store 8 ARGB pixels.
+ bgt %b1
+
+ vpop {q10 - q15}
+ vpop {q0 - q3}
+ bx lr
+ ENDP
+
+; Transform 8 ARGB pixels (32 bytes) with color matrix.
+; TODO(fbarchard): Was same as Sepia except matrix is provided. This function
+; needs to saturate. Consider doing a non-saturating version.
+ARGBColorMatrixRow_NEON PROC
+ ; input
+ ; r0 = const uint8* src_argb
+ ; r1 = uint8* dst_argb
+ ; r2 = const int8* matrix_argb
+ ; r3 = int width
+ vpush {q0 - q7}
+ vpush {q8 - q14}
+ vpush {q15}
+
+ MEMACCESS 3
+ vld1.8 {q2}, [r2] ; load 3 ARGB vectors.
+ vmovl.s8 q0, d4 ; B,G coefficients s16.
+ vmovl.s8 q1, d5 ; R,A coefficients s16.
+
+1
+ MEMACCESS 0
+ vld4.8 {d16, d18, d20, d22}, [r0]! ; load 8 ARGB pixels.
+ subs r3, r3, #8 ; 8 processed per loop.
+ vmovl.u8 q8, d16 ; b (0 .. 255) 16 bit
+ vmovl.u8 q9, d18 ; g
+ vmovl.u8 q10, d20 ; r
+ vmovl.u8 q11, d22 ; a
+ vmul.s16 q12, q8, d0[0] ; B = B * Matrix B
+ vmul.s16 q13, q8, d1[0] ; G = B * Matrix G
+ vmul.s16 q14, q8, d2[0] ; R = B * Matrix R
+ vmul.s16 q15, q8, d3[0] ; A = B * Matrix A
+ vmul.s16 q4, q9, d0[1] ; B += G * Matrix B
+ vmul.s16 q5, q9, d1[1] ; G += G * Matrix G
+ vmul.s16 q6, q9, d2[1] ; R += G * Matrix R
+ vmul.s16 q7, q9, d3[1] ; A += G * Matrix A
+ vqadd.s16 q12, q12, q4 ; Accumulate B
+ vqadd.s16 q13, q13, q5 ; Accumulate G
+ vqadd.s16 q14, q14, q6 ; Accumulate R
+ vqadd.s16 q15, q15, q7 ; Accumulate A
+ vmul.s16 q4, q10, d0[2] ; B += R * Matrix B
+ vmul.s16 q5, q10, d1[2] ; G += R * Matrix G
+ vmul.s16 q6, q10, d2[2] ; R += R * Matrix R
+ vmul.s16 q7, q10, d3[2] ; A += R * Matrix A
+ vqadd.s16 q12, q12, q4 ; Accumulate B
+ vqadd.s16 q13, q13, q5 ; Accumulate G
+ vqadd.s16 q14, q14, q6 ; Accumulate R
+ vqadd.s16 q15, q15, q7 ; Accumulate A
+ vmul.s16 q4, q11, d0[3] ; B += A * Matrix B
+ vmul.s16 q5, q11, d1[3] ; G += A * Matrix G
+ vmul.s16 q6, q11, d2[3] ; R += A * Matrix R
+ vmul.s16 q7, q11, d3[3] ; A += A * Matrix A
+ vqadd.s16 q12, q12, q4 ; Accumulate B
+ vqadd.s16 q13, q13, q5 ; Accumulate G
+ vqadd.s16 q14, q14, q6 ; Accumulate R
+ vqadd.s16 q15, q15, q7 ; Accumulate A
+ vqshrun.s16 d16, q12, #6 ; 16 bit to 8 bit B
+ vqshrun.s16 d18, q13, #6 ; 16 bit to 8 bit G
+ vqshrun.s16 d20, q14, #6 ; 16 bit to 8 bit R
+ vqshrun.s16 d22, q15, #6 ; 16 bit to 8 bit A
+ MEMACCESS 1
+ vst4.8 {d16, d18, d20, d22}, [r1]! ; store 8 ARGB pixels.
+ bgt %b1
+
+ vpop {q15}
+ vpop {q8 - q14}
+ vpop {q0 - q7}
+ bx lr
+ ENDP
+
+ ; dr * (256 - sa) / 256 + sr = dr - dr * sa / 256 + sr
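+ ; Per channel this works out to roughly dst = sat(s + sat(d - ((d * sa + 128) >> 8))),
+ ; with s from src_argb0, d from src_argb1, and the output alpha forced to 255.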
+ARGBBlendRow_NEON PROC
+ ; input
+ ; r0 = const uint8* src_argb0
+ ; r1 = const uint8* src_argb1
+ ; r2 = uint8* dst_argb
+ ; r3 = int width
+ vpush {q0 - q3}
+ vpush {q10 - q12}
+
+ subs r3, #8
+ blt %f89
+ ; Blend 8 pixels.
+8
+ MEMACCESS 0
+ vld4.8 {d0, d1, d2, d3}, [r0]! ; load 8 pixels of ARGB0.
+ MEMACCESS 1
+ vld4.8 {d4, d5, d6, d7}, [r1]! ; load 8 pixels of ARGB1.
+ subs r3, r3, #8 ; 8 processed per loop.
+ vmull.u8 q10, d4, d3 ; db * a
+ vmull.u8 q11, d5, d3 ; dg * a
+ vmull.u8 q12, d6, d3 ; dr * a
+ vqrshrn.u16 d20, q10, #8 ; db >>= 8
+ vqrshrn.u16 d21, q11, #8 ; dg >>= 8
+ vqrshrn.u16 d22, q12, #8 ; dr >>= 8
+ vqsub.u8 q2, q2, q10 ; dbg - dbg * a / 256
+ vqsub.u8 d6, d6, d22 ; dr - dr * a / 256
+ vqadd.u8 q0, q0, q2 ; + sbg
+ vqadd.u8 d2, d2, d6 ; + sr
+ vmov.u8 d3, #255 ; a = 255
+ MEMACCESS 2
+ vst4.8 {d0, d1, d2, d3}, [r2]! ; store 8 pixels of ARGB.
+ bge %b8
+
+89
+ adds r3, #8-1
+ blt %f99
+
+ ; Blend 1 pixel at a time.
+1
+ MEMACCESS 0
+ vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r0]! ; load 1 pixel ARGB0.
+ MEMACCESS 1
+ vld4.8 {d4[0],d5[0],d6[0],d7[0]}, [r1]! ; load 1 pixel ARGB1.
+ subs r3, r3, #1 ; 1 processed per loop.
+ vmull.u8 q10, d4, d3 ; db * a
+ vmull.u8 q11, d5, d3 ; dg * a
+ vmull.u8 q12, d6, d3 ; dr * a
+ vqrshrn.u16 d20, q10, #8 ; db >>= 8
+ vqrshrn.u16 d21, q11, #8 ; dg >>= 8
+ vqrshrn.u16 d22, q12, #8 ; dr >>= 8
+ vqsub.u8 q2, q2, q10 ; dbg - dbg * a / 256
+ vqsub.u8 d6, d6, d22 ; dr - dr * a / 256
+ vqadd.u8 q0, q0, q2 ; + sbg
+ vqadd.u8 d2, d2, d6 ; + sr
+ vmov.u8 d3, #255 ; a = 255
+ MEMACCESS 2
+ vst4.8 {d0[0],d1[0],d2[0],d3[0]}, [r2]! ; store 1 pixel.
+ bge %b1
+
+99
+
+ vpop {q10 - q12}
+ vpop {q0 - q3}
+ bx lr
+ ENDP
+
+; Bilinear filter 16x2 -> 16x1
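+; dst = (row0 * (256 - f) + row1 * f + 128) >> 8, where row1 is src_ptr + src_stride
+; and f is source_y_fraction; f == 0, 64, 128 and 192 take vrhadd shortcuts instead.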
+InterpolateRow_NEON PROC
+ ; input
+ ; r0 = uint8* dst_ptr
+ ; r1 = const uint8* src_ptr
+ ; r2 = ptrdiff_t src_stride
+ ; r3 = int dst_width
+ push {r4}
+ ldr r4, [sp,#4] ; int source_y_fraction
+ vpush {q0 - q1}
+ vpush {d4 - d5}
+ vpush {q13 - q14}
+
+ cmp r4, #0
+ beq %f100
+ add r2, r1
+ cmp r4, #64
+ beq %f75
+ cmp r4, #128
+ beq %f50
+ cmp r4, #192
+ beq %f25
+
+ vdup.8 d5, r4
+ rsb r4, #256
+ vdup.8 d4, r4
+ ; General purpose row blend.
+1
+ MEMACCESS 1
+ vld1.8 {q0}, [r1]!
+ MEMACCESS 2
+ vld1.8 {q1}, [r2]!
+ subs r3, r3, #16
+ vmull.u8 q13, d0, d4
+ vmull.u8 q14, d1, d4
+ vmlal.u8 q13, d2, d5
+ vmlal.u8 q14, d3, d5
+ vrshrn.u16 d0, q13, #8
+ vrshrn.u16 d1, q14, #8
+ MEMACCESS 0
+ vst1.8 {q0}, [r0]!
+ bgt %b1
+ b %f99
+
+ ; Blend 25 / 75.
+25
+ MEMACCESS 1
+ vld1.8 {q0}, [r1]!
+ MEMACCESS 2
+ vld1.8 {q1}, [r2]!
+ subs r3, r3, #16
+ vrhadd.u8 q0, q1
+ vrhadd.u8 q0, q1
+ MEMACCESS 0
+ vst1.8 {q0}, [r0]!
+ bgt %b25
+ b %f99
+
+ ; Blend 50 / 50.
+50
+ MEMACCESS 1
+ vld1.8 {q0}, [r1]!
+ MEMACCESS 2
+ vld1.8 {q1}, [r2]!
+ subs r3, r3, #16
+ vrhadd.u8 q0, q1
+ MEMACCESS 0
+ vst1.8 {q0}, [r0]!
+ bgt %b50
+ b %f99
+
+ ; Blend 75 / 25.
+75
+ MEMACCESS 1
+ vld1.8 {q1}, [r1]!
+ MEMACCESS 2
+ vld1.8 {q0}, [r2]!
+ subs r3, r3, #16
+ vrhadd.u8 q0, q1
+ vrhadd.u8 q0, q1
+ MEMACCESS 0
+ vst1.8 {q0}, [r0]!
+ bgt %b75
+ b %f99
+
+ ; Blend 100 / 0 - Copy row unchanged.
+100
+ MEMACCESS 1
+ vld1.8 {q0}, [r1]!
+ subs r3, r3, #16
+ MEMACCESS 0
+ vst1.8 {q0}, [r0]!
+ bgt %b100
+
+99
+
+ vpop {q13 - q14}
+ vpop {d4 - d5}
+ vpop {q0 - q1}
+ pop {r4}
+ bx lr
+ ENDP
+
+; Multiply 2 rows of ARGB pixels together, 8 pixels at a time.
+ARGBMultiplyRow_NEON PROC
+ ; input
+ ; r0 = const uint8* src_argb0
+ ; r1 = const uint8* src_argb1
+ ; r2 = uint8* dst_argb
+ ; r3 = int width
+ vpush {q0 - q3}
+
+ ; 8 pixel loop.
+1
+ MEMACCESS 0
+ vld4.8 {d0, d2, d4, d6}, [r0]! ; load 8 ARGB pixels.
+ MEMACCESS 1
+ vld4.8 {d1, d3, d5, d7}, [r1]! ; load 8 more ARGB pixels.
+ subs r3, r3, #8 ; 8 processed per loop.
+ vmull.u8 q0, d0, d1 ; multiply B
+ vmull.u8 q1, d2, d3 ; multiply G
+ vmull.u8 q2, d4, d5 ; multiply R
+ vmull.u8 q3, d6, d7 ; multiply A
+ vrshrn.u16 d0, q0, #8 ; 16 bit to 8 bit B
+ vrshrn.u16 d1, q1, #8 ; 16 bit to 8 bit G
+ vrshrn.u16 d2, q2, #8 ; 16 bit to 8 bit R
+ vrshrn.u16 d3, q3, #8 ; 16 bit to 8 bit A
+ MEMACCESS 2
+ vst4.8 {d0, d1, d2, d3}, [r2]! ; store 8 ARGB pixels.
+ bgt %b1
+
+ vpop {q0 - q3}
+ bx lr
+ ENDP
+
+; SobelX as a matrix is
+; -1 0 1
+; -2 0 2
+; -1 0 1
+SobelXRow_NEON PROC
+ ; input
+ ; r0 = const uint8* src_y0
+ ; r1 = const uint8* src_y1
+ ; r2 = const uint8* src_y2
+ ; r3 = uint8* dst_sobelx
+ push {r4 - r6}
+ ldr r4, [sp,#12] ; int width
+ vpush {q0 - q1}
+ mov r5, 2
+ mov r6, 6
+
+1
+ MEMACCESS 0
+ vld1.8 {d0}, [r0],r5 ; top
+ MEMACCESS 0
+ vld1.8 {d1}, [r0],r6
+ vsubl.u8 q0, d0, d1
+ MEMACCESS 1
+ vld1.8 {d2}, [r1],r5 ; center * 2
+ MEMACCESS 1
+ vld1.8 {d3}, [r1],r6
+ vsubl.u8 q1, d2, d3
+ vadd.s16 q0, q0, q1
+ vadd.s16 q0, q0, q1
+ MEMACCESS 2
+ vld1.8 {d2}, [r2],r5 ; bottom
+ MEMACCESS 2
+ vld1.8 {d3}, [r2],r6
+ subs r4, r4, #8 ; 8 pixels
+ vsubl.u8 q1, d2, d3
+ vadd.s16 q0, q0, q1
+ vabs.s16 q0, q0
+ vqmovn.u16 d0, q0
+ MEMACCESS 3
+ vst1.8 {d0}, [r3]! ; store 8 sobelx
+ bgt %b1
+
+ vpop {q0 - q1}
+ pop {r4 - r6}
+ bx lr
+ ENDP
+
+; SobelY as a matrix is
+; -1 -2 -1
+; 0 0 0
+; 1 2 1
+SobelYRow_NEON PROC
+ ; input
+ ; r0 = const uint8* src_y0
+ ; r1 = const uint8* src_y1
+ ; r2 = uint8* dst_sobely
+ ; r3 = int width
+ vpush {q0 - q1}
+ push {r4 - r5}
+ mov r4, 1
+ mov r5, 6
+
+1
+ MEMACCESS 0
+ vld1.8 {d0}, [r0],r4 ; left
+ MEMACCESS 1
+ vld1.8 {d1}, [r1],r4
+ vsubl.u8 q0, d0, d1
+ MEMACCESS 0
+ vld1.8 {d2}, [r0],r4 ; center * 2
+ MEMACCESS 1
+ vld1.8 {d3}, [r1],r4
+ vsubl.u8 q1, d2, d3
+ vadd.s16 q0, q0, q1
+ vadd.s16 q0, q0, q1
+ MEMACCESS 0
+ vld1.8 {d2}, [r0],r5 ; right
+ MEMACCESS 1
+ vld1.8 {d3}, [r1],r5
+ subs r3, r3, #8 ; 8 pixels
+ vsubl.u8 q1, d2, d3
+ vadd.s16 q0, q0, q1
+ vabs.s16 q0, q0
+ vqmovn.u16 d0, q0
+ MEMACCESS 2
+ vst1.8 {d0}, [r2]! ; store 8 sobely
+ bgt %b1
+
+ pop {r4 - r5}
+ vpop {q0 - q1}
+ bx lr
+ ENDP
+
+; Mixes Sobel X, Sobel Y and Sobel into ARGB.
+; A = 255
+; R = Sobel X
+; G = Sobel
+; B = Sobel Y
+SobelXYRow_NEON PROC
+ ; input
+ ; r0 = const uint8* src_sobelx
+ ; r1 = const uint8* src_sobely
+ ; r2 = uint8* dst_argb
+ ; r3 = int width
+ vpush {q0 - q1}
+
+ vmov.u8 d3, #255 ; alpha
+ ; 8 pixel loop.
+
+1
+ MEMACCESS 0
+ vld1.8 {d2}, [r0]! ; load 8 sobelx.
+ MEMACCESS 1
+ vld1.8 {d0}, [r1]! ; load 8 sobely.
+ subs r3, r3, #8 ; 8 processed per loop.
+ vqadd.u8 d1, d0, d2 ; add
+ MEMACCESS 2
+ vst4.8 {d0, d1, d2, d3}, [r2]! ; store 8 ARGB pixels.
+ bgt %b1
+
+ vpop {q0 - q1}
+ bx lr
+ ENDP
+
+
+ END
+
+
diff --git a/source/row_neon.cc b/source/row_neon.cc
index bed14e0..3450302 100644
--- a/source/row_neon.cc
+++ b/source/row_neon.cc
@@ -17,6 +17,15 @@ namespace libyuv {
extern "C" {
#endif
+ /* !!! IMPORTANT: The following methods have been ported to pure assembler in row_neon.asm
+ * because MS Visual Studio doesn't support inline assembler for ARM.
+ *
+ * ANY CHANGE TO THE IMPLEMENTATION OF THESE METHODS MUST ALSO BE MADE IN row_neon.asm.
+ *
+ * Eventually, only the pure assembler implementation should be used on all platforms
+ * to avoid code duplication.
+ */
+
// This module is for GCC Neon
#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \
!defined(__aarch64__)
diff --git a/source/scale_neon.asm b/source/scale_neon.asm
new file mode 100644
index 0000000..8ede0ec
--- /dev/null
+++ b/source/scale_neon.asm
@@ -0,0 +1,970 @@
+;
+; Copyright 2012 The LibYuv Project Authors. All rights reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+ AREA |.text|, CODE, READONLY, ALIGN=2
+
+ GET source/arm_asm_macros.in
+
+ EXPORT ScaleRowDown2_NEON
+ EXPORT ScaleRowDown2Linear_NEON
+ EXPORT ScaleRowDown2Box_NEON
+ EXPORT ScaleRowDown4_NEON
+ EXPORT ScaleRowDown4Box_NEON
+ EXPORT ScaleRowDown34_NEON
+ EXPORT ScaleRowDown34_0_Box_NEON
+ EXPORT ScaleRowDown34_1_Box_NEON
+ EXPORT ScaleRowDown38_NEON
+ EXPORT ScaleRowDown38_3_Box_NEON
+ EXPORT ScaleRowDown38_2_Box_NEON
+ EXPORT ScaleAddRows_NEON
+ EXPORT ScaleFilterCols_NEON
+ EXPORT ScaleARGBRowDown2_NEON
+ EXPORT ScaleARGBRowDown2Linear_NEON
+ EXPORT ScaleARGBRowDown2Box_NEON
+ EXPORT ScaleARGBRowDownEven_NEON
+ EXPORT ScaleARGBRowDownEvenBox_NEON
+ EXPORT ScaleARGBCols_NEON
+ EXPORT ScaleARGBFilterCols_NEON
+
+kShuf38 DCB 0, 3, 6, 8, 11, 14, 16, 19, 22, 24, 27, 30, 0, 0, 0, 0
+kShuf38_2 DCB 0, 8, 16, 2, 10, 17, 4, 12, 18, 6, 14, 19, 0, 0, 0, 0
+;vec16 kMult38_Div6 = { 65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12 }
+kMult38_Div6 DCW 0x1555, 0x1555, 0x1555, 0x1555, 0x1555, 0x1555, 0x1555, 0x1555
+;vec16 kMult38_Div9 = { 65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18 };
+kMult38_Div9 DCW 0xe38, 0xe38, 0xe38, 0xe38, 0xe38, 0xe38, 0xe38, 0xe38
+
+
+; Read 32x1 throw away even pixels, and write 16x1
+ScaleRowDown2_NEON PROC
+ ; input
+ ; r0 = uint8* src_ptr
+ ; r1 = src_stride
+ ; r2 = uint8* dst
+ ; r3 = int dst_width
+ vpush {q0, q1}
+1
+ ; load even pixels into q0, odd into q1
+ MEMACCESS 0
+ vld2.8 {q0, q1}, [r0]!
+ subs r3, r3, #16 ; 16 processed per loop
+ MEMACCESS 1
+ vst1.8 {q1}, [r2]! ; store odd pixels
+ bgt %b1
+
+ vpop {q0, q1}
+
+ bx lr
+ ENDP
+
+; Read 32x1 average down and write 16x1.
+ScaleRowDown2Linear_NEON PROC
+ ; input
+ ; r0 = uint8* src_ptr
+ ; r1 = src_stride
+ ; r2 = uint8* dst
+ ; r3 = int dst_width
+ vpush {q0, q1}
+1
+ MEMACCESS 0
+ vld1.8 {q0, q1}, [r0]! ; load pixels and post inc
+ subs r3, r3, #16 ; 16 processed per loop
+ vpaddl.u8 q0, q0 ; add adjacent
+ vpaddl.u8 q1, q1
+ vrshrn.u16 d0, q0, #1 ; downshift, round and pack
+ vrshrn.u16 d1, q1, #1
+ MEMACCESS 1
+ vst1.8 {q0}, [r2]!
+ bgt %b1
+ vpop {q0, q1}
+
+ bx lr
+ ENDP
+
+; Read 32x2 average down and write 16x1
+ScaleRowDown2Box_NEON PROC
+ ; input
+ ; r0 = uint8* src_ptr
+ ; r1 = src_stride
+ ; r2 = uint8* dst
+ ; r3 = int dst_width
+ vpush {q0, q1, q2, q3}
+ add r1, r0
+1
+ MEMACCESS 0
+ vld1.8 {q0, q1}, [r0]! ; load row 1 and post inc
+ MEMACCESS 1
+ vld1.8 {q2, q3}, [r1]! ; load row 2 and post inc
+ subs r3, r3, #16 ; 16 processed per loop
+ vpaddl.u8 q0, q0 ; row 1 add adjacent
+ vpaddl.u8 q1, q1
+ vpadal.u8 q0, q2 ; row 2 add adjacent + row1
+ vpadal.u8 q1, q3
+ vrshrn.u16 d0, q0, #2 ; downshift, round and pack
+ vrshrn.u16 d1, q1, #2
+ MEMACCESS 2
+ vst1.8 {q0}, [r2]!
+ bgt %b1
+ vpop {q0, q1, q2, q3}
+
+ bx lr
+ ENDP
+
+ScaleRowDown4_NEON PROC
+ ; input
+ ; r0 = uint8* src_ptr
+ ; r1 = src_stride
+ ; r2 = uint8* dst_ptr
+ ; r3 = int dst_width
+ vpush {q0, q1}
+
+1
+ MEMACCESS 0
+ vld4.8 {d0, d1, d2, d3}, [r0]! ; src line 0
+ subs r3, r3, #8 ; 8 processed per loop
+ MEMACCESS 1
+ vst1.8 {d2}, [r2]!
+ bgt %b1
+
+ vpop {q0, q1}
+ bx lr
+ ENDP
+
+ScaleRowDown4Box_NEON PROC
+ ; input
+ ; r0 = uint8* src_ptr
+ ; r1 = src_stride
+ ; r2 = uint8* dst_ptr
+ ; r3 = int dst_width
+ push {r4-r6}
+ vpush {q0-q3}
+ add r4, r0, r1 ; src_ptr + src_stride
+ add r5, r4, r1 ; src_ptr + src_stride * 2
+ add r6, r5, r1 ; src_ptr + src_stride * 3
+
+1
+ MEMACCESS 0
+ vld1.8 {q0}, [r0]! ; load up 16x4
+ MEMACCESS 3
+ vld1.8 {q1}, [r4]!
+ MEMACCESS 4
+ vld1.8 {q2}, [r5]!
+ MEMACCESS 5
+ vld1.8 {q3}, [r6]!
+ subs r3, r3, #4
+ vpaddl.u8 q0, q0
+ vpadal.u8 q0, q1
+ vpadal.u8 q0, q2
+ vpadal.u8 q0, q3
+ vpaddl.u16 q0, q0
+ vrshrn.u32 d0, q0, #4 ; divide by 16 w/rounding
+ vmovn.u16 d0, q0
+ MEMACCESS 1
+ vst1.32 {d0[0]}, [r2]!
+ bgt %b1
+
+ vpop {q0-q3}
+ pop {r4-r6}
+ bx lr
+ ENDP
+
+ScaleRowDown34_NEON PROC
+ ; input
+ ; r0 = uint8* src_ptr
+ ; r1 = src_stride
+ ; r2 = uint8* dst_ptr
+ ; r3 = int dst_width
+ vpush {d0-d3}
+
+1
+ MEMACCESS 0
+ vld4.8 {d0, d1, d2, d3}, [r0]! ; src line 0
+ subs r3, r3, #24
+ vmov d2, d3 ; order d0, d1, d2
+ MEMACCESS 1
+ vst3.8 {d0, d1, d2}, [r2]!
+ bgt %b1
+
+ vpop {d0-d3}
+ bx lr
+ ENDP
+
+ScaleRowDown34_0_Box_NEON PROC
+ ; input
+ ; r0 = uint8* src_ptr
+ ; r1 = src_stride
+ ; r2 = uint8* dst_ptr
+ ; r3 = int dst_width
+ vpush {q0-q3}
+ vpush {q8-q11}
+ vpush {d24}
+
+ vmov.u8 d24, #3
+ add r1, r0
+1
+ MEMACCESS 0
+ vld4.8 {d0, d1, d2, d3}, [r0]! ; src line 0
+ MEMACCESS 3
+ vld4.8 {d4, d5, d6, d7}, [r1]! ; src line 1
+ subs r3, r3, #24
+
+ ; filter src line 0 with src line 1
+ ; expand chars to shorts to allow for room
+ ; when adding lines together
+ vmovl.u8 q8, d4
+ vmovl.u8 q9, d5
+ vmovl.u8 q10, d6
+ vmovl.u8 q11, d7
+
+ ; 3 * line_0 + line_1
+ vmlal.u8 q8, d0, d24
+ vmlal.u8 q9, d1, d24
+ vmlal.u8 q10, d2, d24
+ vmlal.u8 q11, d3, d24
+
+ ; (3 * line_0 + line_1) >> 2
+ vqrshrn.u16 d0, q8, #2
+ vqrshrn.u16 d1, q9, #2
+ vqrshrn.u16 d2, q10, #2
+ vqrshrn.u16 d3, q11, #2
+
+ ; a0 = (src[0] * 3 + s[1] * 1) >> 2
+ vmovl.u8 q8, d1
+ vmlal.u8 q8, d0, d24
+ vqrshrn.u16 d0, q8, #2
+
+ ; a1 = (src[1] * 1 + s[2] * 1) >> 1
+ vrhadd.u8 d1, d1, d2
+
+ ; a2 = (src[2] * 1 + s[3] * 3) >> 2
+ vmovl.u8 q8, d2
+ vmlal.u8 q8, d3, d24
+ vqrshrn.u16 d2, q8, #2
+
+ MEMACCESS 1
+ vst3.8 {d0, d1, d2}, [r2]!
+
+ bgt %b1
+
+
+ vpop {d24}
+ vpop {q8-q11}
+ vpop {q0-q3}
+ bx lr
+ ENDP
+
+ScaleRowDown34_1_Box_NEON PROC
+ ; input
+ ; r0 = uint8* src_ptr
+ ; r1 = src_stride
+ ; r2 = uint8* dst_ptr
+ ; r3 = int dst_width
+ vpush {q0-q3}
+ vpush {d24}
+ vmov.u8 d24, #3
+ add r1, r0
+1
+ MEMACCESS 0
+ vld4.8 {d0, d1, d2, d3}, [r0]! ; src line 0
+ MEMACCESS 3
+ vld4.8 {d4, d5, d6, d7}, [r1]! ; src line 1
+ subs r3, r3, #24
+ ; average src line 0 with src line 1
+ vrhadd.u8 q0, q0, q2
+ vrhadd.u8 q1, q1, q3
+
+ ; a0 = (src[0] * 3 + s[1] * 1) >> 2
+ vmovl.u8 q3, d1
+ vmlal.u8 q3, d0, d24
+ vqrshrn.u16 d0, q3, #2
+
+ ; a1 = (src[1] * 1 + s[2] * 1) >> 1
+ vrhadd.u8 d1, d1, d2
+
+ ; a2 = (src[2] * 1 + s[3] * 3) >> 2
+ vmovl.u8 q3, d2
+ vmlal.u8 q3, d3, d24
+ vqrshrn.u16 d2, q3, #2
+
+ MEMACCESS 1
+ vst3.8 {d0, d1, d2}, [r2]!
+ bgt %b1
+
+ vpop {d24}
+ vpop {q0-q3}
+ bx lr
+ ENDP
+
+ScaleRowDown38_NEON PROC
+ ; input
+ ; r0 = uint8* src_ptr
+ ; r1 = src_stride
+ ; r2 = uint8* dst_ptr
+ ; r3 = int dst_width
+ vpush {d0-d5}
+ push {r4}
+
+ adr r4, kShuf38
+
+ vld1.8 {q3}, [r4]
+1
+ MEMACCESS 0
+ vld1.8 {d0, d1, d2, d3}, [r0]!
+ subs r3, r3, #12
+ vtbl.u8 d4, {d0, d1, d2, d3}, d6
+ vtbl.u8 d5, {d0, d1, d2, d3}, d7
+ MEMACCESS 1
+ vst1.8 {d4}, [r2]!
+ MEMACCESS 1
+ vst1.32 {d5[0]}, [r2]!
+ bgt %b1
+
+ pop {r4}
+ vpop {d0-d5}
+ bx lr
+ ENDP
+
+ScaleRowDown38_3_Box_NEON PROC
+ ; input
+ ; r0 = uint8* src_ptr
+ ; r1 = src_stride
+ ; r2 = uint8* dst_ptr
+ ; r3 = int dst_width
+ vpush {q0-q3}
+ vpush {q8, q9}
+ vpush {q13-q15}
+ push {r4-r7}
+ add r4, r0, r1
+ add r4, r4, r1 ; src_ptr + src_stride * 2
+ adr r5, kMult38_Div6
+ adr r6, kShuf38_2
+ adr r7, kMult38_Div9
+
+ MEMACCESS 5
+ vld1.16 {q13}, [r5]
+ MEMACCESS 6
+ vld1.8 {q14}, [r6]
+ MEMACCESS 7
+ vld1.8 {q15}, [r7]
+ add r1, r0
+1
+ ; d0 = 00 40 01 41 02 42 03 43
+ ; d1 = 10 50 11 51 12 52 13 53
+ ; d2 = 20 60 21 61 22 62 23 63
+ ; d3 = 30 70 31 71 32 72 33 73
+ MEMACCESS 0
+ vld4.8 {d0, d1, d2, d3}, [r0]!
+ MEMACCESS 3
+ vld4.8 {d4, d5, d6, d7}, [r1]!
+ MEMACCESS 4
+ vld4.8 {d16, d17, d18, d19}, [r4]!
+ subs r3, r3, #12
+
+ ; Shuffle the input data around to align the data
+ ; so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
+ ; d0 = 00 10 01 11 02 12 03 13
+ ; d1 = 40 50 41 51 42 52 43 53
+ vtrn.u8 d0, d1
+ vtrn.u8 d4, d5
+ vtrn.u8 d16, d17
+
+ ; d2 = 20 30 21 31 22 32 23 33
+ ; d3 = 60 70 61 71 62 72 63 73
+ vtrn.u8 d2, d3
+ vtrn.u8 d6, d7
+ vtrn.u8 d18, d19
+
+ ; d0 = 00+10 01+11 02+12 03+13
+ ; d2 = 40+50 41+51 42+52 43+53
+ vpaddl.u8 q0, q0
+ vpaddl.u8 q2, q2
+ vpaddl.u8 q8, q8
+
+ ; d3 = 60+70 61+71 62+72 63+73
+ vpaddl.u8 d3, d3
+ vpaddl.u8 d7, d7
+ vpaddl.u8 d19, d19
+
+ ; combine source lines
+ vadd.u16 q0, q2
+ vadd.u16 q0, q8
+ vadd.u16 d4, d3, d7
+ vadd.u16 d4, d19
+
+ ; dst_ptr[3] = (s[6 + st * 0] + s[7 + st * 0]
+ ; + s[6 + st * 1] + s[7 + st * 1]
+ ; + s[6 + st * 2] + s[7 + st * 2]) / 6
+ vqrdmulh.s16 q2, q2, q13
+ vmovn.u16 d4, q2
+
+ ; Shuffle 2,3 reg around so that 2 can be added to the
+ ; 0,1 reg and 3 can be added to the 4,5 reg. This
+ ; requires expanding from u8 to u16 as the 0,1 and 4,5
+ ; registers are already expanded. Then do transposes
+ ; to get aligned.
+ ; q2 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
+ vmovl.u8 q1, d2
+ vmovl.u8 q3, d6
+ vmovl.u8 q9, d18
+
+ ; combine source lines
+ vadd.u16 q1, q3
+ vadd.u16 q1, q9
+
+ ; d4 = xx 20 xx 30 xx 22 xx 32
+ ; d5 = xx 21 xx 31 xx 23 xx 33
+ vtrn.u32 d2, d3
+
+ ; d4 = xx 20 xx 21 xx 22 xx 23
+ ; d5 = xx 30 xx 31 xx 32 xx 33
+ vtrn.u16 d2, d3
+
+ ; 0+1+2, 3+4+5
+ vadd.u16 q0, q1
+
+ ; Need to divide, but can't downshift as the value
+ ; isn't a power of 2. So multiply by 65536 / n
+ ; and take the upper 16 bits.
+ vqrdmulh.s16 q0, q0, q15
+
+ ; Align for table lookup, vtbl requires registers to
+ ; be adjacent
+ vmov.u8 d2, d4
+
+ vtbl.u8 d3, {d0, d1, d2}, d28
+ vtbl.u8 d4, {d0, d1, d2}, d29
+
+ MEMACCESS 1
+ vst1.8 {d3}, [r2]!
+ MEMACCESS 1
+ vst1.32 {d4[0]}, [r2]!
+ bgt %b1
+
+ pop {r4-r7}
+ vpop {q13-q15}
+ vpop {q8, q9}
+ vpop {q0-q3}
+ bx lr
+ ENDP
+
+ScaleRowDown38_2_Box_NEON PROC
+ ; input
+ ; r0 = uint8* src_ptr
+ ; r1 = src_stride
+ ; r2 = uint8* dst_ptr
+ ; r3 = int dst_width
+ vpush {q0-q3}
+ vpush {q13-q14}
+ push {r4, r5}
+ adr r4, kMult38_Div6
+ adr r5, kShuf38_2
+
+ MEMACCESS 4
+ vld1.16 {q13}, [r4]
+ MEMACCESS 5
+ vld1.8 {q14}, [r5]
+ add r1, r0
+1
+ ; d0 = 00 40 01 41 02 42 03 43
+ ; d1 = 10 50 11 51 12 52 13 53
+ ; d2 = 20 60 21 61 22 62 23 63
+ ; d3 = 30 70 31 71 32 72 33 73
+ MEMACCESS 0
+ vld4.8 {d0, d1, d2, d3}, [r0]!
+ MEMACCESS 3
+ vld4.8 {d4, d5, d6, d7}, [r1]!
+ subs r3, r3, #12
+
+ ; Shuffle the input data around to align the data
+ ; so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
+ ; d0 = 00 10 01 11 02 12 03 13
+ ; d1 = 40 50 41 51 42 52 43 53
+ vtrn.u8 d0, d1
+ vtrn.u8 d4, d5
+
+ ; d2 = 20 30 21 31 22 32 23 33
+ ; d3 = 60 70 61 71 62 72 63 73
+ vtrn.u8 d2, d3
+ vtrn.u8 d6, d7
+
+ ; d0 = 00+10 01+11 02+12 03+13
+ ; d2 = 40+50 41+51 42+52 43+53
+ vpaddl.u8 q0, q0
+ vpaddl.u8 q2, q2
+
+ ; d3 = 60+70 61+71 62+72 63+73
+ vpaddl.u8 d3, d3
+ vpaddl.u8 d7, d7
+
+ ; combine source lines
+ vadd.u16 q0, q2
+ vadd.u16 d4, d3, d7
+
+ ; dst_ptr[3] = (s[6] + s[7] + s[6+st] + s[7+st]) / 4
+ vqrshrn.u16 d4, q2, #2
+
+ ; Shuffle 2,3 reg around so that 2 can be added to the
+ ; 0,1 reg and 3 can be added to the 4,5 reg. This
+ ; requires expanding from u8 to u16 as the 0,1 and 4,5
+ ; registers are already expanded. Then do transposes
+ ; to get aligned.
+ ; q2 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
+ vmovl.u8 q1, d2
+ vmovl.u8 q3, d6
+
+ ; combine source lines
+ vadd.u16 q1, q3
+
+ ; d4 = xx 20 xx 30 xx 22 xx 32
+ ; d5 = xx 21 xx 31 xx 23 xx 33
+ vtrn.u32 d2, d3
+
+ ; d4 = xx 20 xx 21 xx 22 xx 23
+ ; d5 = xx 30 xx 31 xx 32 xx 33
+ vtrn.u16 d2, d3
+
+ ; 0+1+2, 3+4+5
+ vadd.u16 q0, q1
+
+ ; Need to divide, but can't downshift as the value
+ ; isn't a power of 2. So multiply by 65536 / n
+ ; and take the upper 16 bits.
+ vqrdmulh.s16 q0, q0, q13
+
+ ; Align for table lookup, vtbl requires registers to
+ ; be adjacent
+ vmov.u8 d2, d4
+
+ vtbl.u8 d3, {d0, d1, d2}, d28
+ vtbl.u8 d4, {d0, d1, d2}, d29
+
+ MEMACCESS 1
+ vst1.8 {d3}, [r2]!
+ MEMACCESS 1
+ vst1.32 {d4[0]}, [r2]!
+ bgt %b1
+
+ pop {r4, r5}
+ vpop {q13-q14}
+ vpop {q0-q3}
+ bx lr
+ ENDP
+
+ScaleAddRows_NEON PROC
+ ; input
+ ; r0 = uint8* src_ptr
+ ; r1 = src_stride
+ ; r2 = uint16* dst_ptr
+ ; r3 = int dst_width
+ push {r4, r5, r12}
+ ldr r4, [sp, #12] ; int src_height
+ mov r5, 0
+ vpush {q0-q3}
+
+1
+ mov r5, r0
+ mov r12, r4
+ veor q2, q2, q2
+ veor q3, q3, q3
+2
+ ; load 16 pixels into q0
+ MEMACCESS 0
+ vld1.8 {q0}, [r5], r1
+ vaddw.u8 q3, q3, d1
+ vaddw.u8 q2, q2, d0
+ subs r12, r12, #1
+ bgt %b2
+ MEMACCESS 2
+ vst1.16 {q2, q3}, [r2]! ; store pixels
+ add r0, r0, #16
+ subs r3, r3, #16 ; 16 processed per loop
+ bgt %b1
+
+ vpop {q0-q3}
+ pop {r4, r5, r12}
+ bx lr
+ ENDP
+
+; TODO(Yang Zhang): Investigate less load instructions for
+; the x/dx stepping
+ MACRO
+ LOAD2_DATA8_LANE $n
+ lsr r5, r3, #16
+ add r6, r1, r5
+ add r3, r3, r4
+ MEMACCESS 6
+ vld2.8 {d6[$n], d7[$n]}, [r6]
+ MEND
+
+dx_offset DCD 0, 1, 2, 3
+
+; The NEON version mimics this formula:
+; #define BLENDER(a, b, f) (uint8)((int)(a) +
+; ((int)(f) * ((int)(b) - (int)(a)) >> 16))
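+; The scalar x in r3 steps by dx to form each load address, while q1/q2 hold the
+; same eight x values (x + i * dx, i = 0..7) in 16.16 fixed point to supply the
+; per-lane fractions f; both advance by 8 * dx per iteration.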
+
+ScaleFilterCols_NEON PROC
+ ; input
+ ; r0 = uint8* dst_ptr
+ ; r1 = uint8* src_ptr
+ ; r2 = int dst_width
+ ; r3 = int x
+
+ push {r4-r6}
+
+ ldr r4, [sp, #12] ; int dx
+ adr r5, dx_offset
+ mov r6, r1
+
+ vpush {q0-q3}
+ vpush {q8-q13}
+
+ vdup.32 q0, r3 ; x
+ vdup.32 q1, r4 ; dx
+ vld1.32 {q2}, [r5] ; 0 1 2 3
+ vshl.i32 q3, q1, #2 ; 4 * dx
+ vmul.s32 q1, q1, q2
+ ; x , x + 1 * dx, x + 2 * dx, x + 3 * dx
+ vadd.s32 q1, q1, q0
+ ; x + 4 * dx, x + 5 * dx, x + 6 * dx, x + 7 * dx
+ vadd.s32 q2, q1, q3
+ vshl.i32 q0, q3, #1 ; 8 * dx
+1
+ LOAD2_DATA8_LANE 0
+ LOAD2_DATA8_LANE 1
+ LOAD2_DATA8_LANE 2
+ LOAD2_DATA8_LANE 3
+ LOAD2_DATA8_LANE 4
+ LOAD2_DATA8_LANE 5
+ LOAD2_DATA8_LANE 6
+ LOAD2_DATA8_LANE 7
+ vmov q10, q1
+ vmov q11, q2
+ vuzp.16 q10, q11
+ vmovl.u8 q8, d6
+ vmovl.u8 q9, d7
+ vsubl.s16 q11, d18, d16
+ vsubl.s16 q12, d19, d17
+ vmovl.u16 q13, d20
+ vmovl.u16 q10, d21
+ vmul.s32 q11, q11, q13
+ vmul.s32 q12, q12, q10
+ vrshrn.s32 d18, q11, #16
+ vrshrn.s32 d19, q12, #16
+ vadd.s16 q8, q8, q9
+ vmovn.s16 d6, q8
+
+ MEMACCESS 0
+ vst1.8 {d6}, [r0]! ; store pixels
+ vadd.s32 q1, q1, q0
+ vadd.s32 q2, q2, q0
+ subs r2, r2, #8 ; 8 processed per loop
+ bgt %b1
+
+ vpop {q8-q13}
+ vpop {q0-q3}
+ pop {r4-r6}
+ bx lr
+ ENDP
+
+ScaleARGBRowDown2_NEON PROC
+ ; input
+ ; r0 = uint8* src_ptr
+ ; r1 = ptrdiff_t src_stride
+ ; r2 = uint8* dst
+ ; r3 = int dst_width
+ vpush {q0 - q3}
+1
+ ; load even pixels into q0, odd into q1
+ MEMACCESS 0
+ vld2.32 {q0, q1}, [r0]!
+ MEMACCESS 0
+ vld2.32 {q2, q3}, [r0]!
+ subs r3, r3, #8 ; 8 processed per loop
+ MEMACCESS 1
+ vst1.8 {q1}, [r2]! ; store odd pixels
+ MEMACCESS 1
+ vst1.8 {q3}, [r2]!
+ bgt %b1
+ vpop {q0 - q3}
+ bx lr
+ ENDP
+
+
+
+ScaleARGBRowDown2Linear_NEON PROC
+ ; input
+ ; r0 = uint8* src_argb
+ ; r1 = ptrdiff_t src_stride
+ ; r2 = uint8* dst_argb
+ ; r3 = int dst_width
+ vpush {q0 - q3}
+1
+ MEMACCESS 0
+ vld4.8 {d0, d2, d4, d6}, [r0]! ; load 8 ARGB pixels.
+ MEMACCESS 0
+ vld4.8 {d1, d3, d5, d7}, [r0]! ; load next 8 ARGB pixels.
+ subs r3, r3, #8 ; 8 processed per loop
+ vpaddl.u8 q0, q0 ; B 16 bytes -> 8 shorts.
+ vpaddl.u8 q1, q1 ; G 16 bytes -> 8 shorts.
+ vpaddl.u8 q2, q2 ; R 16 bytes -> 8 shorts.
+ vpaddl.u8 q3, q3 ; A 16 bytes -> 8 shorts.
+ vrshrn.u16 d0, q0, #1 ; downshift, round and pack
+ vrshrn.u16 d1, q1, #1
+ vrshrn.u16 d2, q2, #1
+ vrshrn.u16 d3, q3, #1
+ MEMACCESS 1
+ vst4.8 {d0, d1, d2, d3}, [r2]!
+ bgt %b1
+
+
+ vpop {q0 - q3}
+ bx lr
+ ENDP
+
+ScaleARGBRowDown2Box_NEON PROC
+ ; input
+ ; r0 = uint8* src_ptr
+ ; r1 = ptrdiff_t src_stride
+ ; r2 = uint8* dst
+ ; r3 = int dst_width
+ vpush {q0 - q3}
+ vpush {q8 - q11}
+ ; change the stride to row 2 pointer
+ add r1, r1, r0
+
+1
+ MEMACCESS 0
+ vld4.8 {d0, d2, d4, d6}, [r0]! ; load 8 argb pixels.
+ MEMACCESS 0
+ vld4.8 {d1, d3, d5, d7}, [r0]! ; load next 8 argb pixels.
+ subs r3, r3, #8 ; 8 processed per loop.
+ vpaddl.u8 q0, q0 ; b 16 bytes -> 8 shorts.
+ vpaddl.u8 q1, q1 ; g 16 bytes -> 8 shorts.
+ vpaddl.u8 q2, q2 ; r 16 bytes -> 8 shorts.
+ vpaddl.u8 q3, q3 ; a 16 bytes -> 8 shorts.
+ MEMACCESS 1
+ vld4.8 {d16, d18, d20, d22}, [r1]! ; load 8 more argb pixels.
+ MEMACCESS 1
+ vld4.8 {d17, d19, d21, d23}, [r1]! ; load last 8 argb pixels.
+ vpadal.u8 q0, q8 ; b 16 bytes -> 8 shorts.
+ vpadal.u8 q1, q9 ; g 16 bytes -> 8 shorts.
+ vpadal.u8 q2, q10 ; r 16 bytes -> 8 shorts.
+ vpadal.u8 q3, q11 ; a 16 bytes -> 8 shorts.
+ vrshrn.u16 d0, q0, #2 ; downshift, round and pack
+ vrshrn.u16 d1, q1, #2
+ vrshrn.u16 d2, q2, #2
+ vrshrn.u16 d3, q3, #2
+ MEMACCESS 2
+ vst4.8 {d0, d1, d2, d3}, [r2]!
+ bgt %b1
+
+ vpop {q8 - q11}
+ vpop {q0 - q3}
+ bx lr
+ ENDP
+
+ScaleARGBRowDownEven_NEON PROC
+ ; input
+ ; r0 = uint8* src_argb
+ ; r1 = ptrdiff_t src_stride
+ ; r2 = int src_stepx
+ ; r3 = uint8* dst_argb
+ push {r4, r12}
+ ldr r4, [sp, #8] ;int dst_width
+ vpush {q0}
+
+ mov r12, r2, lsl #2
+1
+ MEMACCESS 0
+ vld1.32 {d0[0]}, [r0], r12
+ MEMACCESS 0
+ vld1.32 {d0[1]}, [r0], r12
+ MEMACCESS 0
+ vld1.32 {d1[0]}, [r0], r12
+ MEMACCESS 0
+ vld1.32 {d1[1]}, [r0], r12
+ subs r4, r4, #4 ; 4 pixels per loop.
+ MEMACCESS 1
+ vst1.8 {q0}, [r3]!
+ bgt %b1
+
+ vpop {q0}
+ pop {r4, r12}
+ bx lr
+ ENDP
+
+ScaleARGBRowDownEvenBox_NEON PROC
+ ; input
+ ; r0 = uint8* src_argb
+ ; r1 = ptrdiff_t src_stride
+ ; r2 = int src_stepx
+ ; r3 = uint8* dst_argb
+ push {r4, r12}
+ ldr r4, [sp, #8] ;int dst_width
+ vpush {q0 - q3}
+
+ mov r12, r2, lsl #2
+ add r1, r1, r0
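+ ; r1 now points at the second source row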
+1
+ MEMACCESS 0
+ vld1.8 {d0}, [r0], r12 ; Read 4 2x2 blocks -> 2x1
+ MEMACCESS 1
+ vld1.8 {d1}, [r1], r12
+ MEMACCESS 0
+ vld1.8 {d2}, [r0], r12
+ MEMACCESS 1
+ vld1.8 {d3}, [r1], r12
+ MEMACCESS 0
+ vld1.8 {d4}, [r0], r12
+ MEMACCESS 1
+ vld1.8 {d5}, [r1], r12
+ MEMACCESS 0
+ vld1.8 {d6}, [r0], r12
+ MEMACCESS 1
+ vld1.8 {d7}, [r1], r12
+ vaddl.u8 q0, d0, d1
+ vaddl.u8 q1, d2, d3
+ vaddl.u8 q2, d4, d5
+ vaddl.u8 q3, d6, d7
+ vswp.8 d1, d2 ; ab_cd -> ac_bd
+ vswp.8 d5, d6 ; ef_gh -> eg_fh
+ vadd.u16 q0, q0, q1 ; (a+b)_(c+d)
+ vadd.u16 q2, q2, q3 ; (e+f)_(g+h)
+ vrshrn.u16 d0, q0, #2 ; first 2 pixels.
+ vrshrn.u16 d1, q2, #2 ; next 2 pixels.
+ subs r4, r4, #4 ; 4 pixels per loop.
+ MEMACCESS 2
+ vst1.8 {q0}, [r3]!
+ bgt %b1
+
+ vpop {q0 - q3}
+ pop {r4, r12}
+ bx lr
+ ENDP
+
+ ; TODO(Yang Zhang): Investigate less load instructions for
+ ; the x/dx stepping
+ MACRO
+ LOAD1_DATA32_LANE $dn, $n
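+ ; r5 = x >> 16 (integer pixel index), r6 = src_argb + r5 * 4, x += dx,
+ ; then one 32-bit ARGB pixel is loaded into the requested d-register lane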
+ lsr r5, r3, #16
+ add r6, r1, r5, lsl #2
+ add r3, r3, r4
+ MEMACCESS 6
+ vld1.32 {$dn[$n]}, [r6]
+ MEND
+
+ScaleARGBCols_NEON PROC
+ ; input
+ ; r0 = uint8* dst_argb
+ ; r1 = const uint8* src_argb
+ ; r2 = int dst_width
+ ; r3 = int x
+ push {r4 - r6}
+ ldr r4, [sp,#12] ; int dx
+ mov r6, r1
+ vpush {q0, q1}
+
+1
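+ ; gather 8 ARGB pixels at x, x + dx, ..., x + 7 * dx (16.16 fixed point)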
+ LOAD1_DATA32_LANE d0, 0
+ LOAD1_DATA32_LANE d0, 1
+ LOAD1_DATA32_LANE d1, 0
+ LOAD1_DATA32_LANE d1, 1
+ LOAD1_DATA32_LANE d2, 0
+ LOAD1_DATA32_LANE d2, 1
+ LOAD1_DATA32_LANE d3, 0
+ LOAD1_DATA32_LANE d3, 1
+
+ MEMACCESS 0
+ vst1.32 {q0, q1}, [r0]! ; store pixels
+ subs r2, r2, #8 ; 8 processed per loop
+ bgt %b1
+
+ vpop {q0, q1}
+ pop {r4 - r6}
+ bx lr
+ ENDP
+
+ ; TODO(Yang Zhang): Investigate less load instructions for
+ ; the x/dx stepping
+ MACRO
+ LOAD2_DATA32_LANE $dn1, $dn2, $n
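+ ; same indexing as LOAD1_DATA32_LANE, but loads the pixel at x >> 16 and
+ ; its right neighbour (for filtering) into the two destination lanes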
+ lsr r5, r3, #16
+ add r6, r1, r5, lsl #2
+ add r3, r3, r4
+ MEMACCESS 6
+ vld2.32 {$dn1[$n], $dn2[$n]}, [r6]
+ MEND
+
+ScaleARGBFilterCols_NEON PROC
+ ; input
+ ; r0 = uint8* dst_argb
+ ; r1 = const uint8* src_argb
+ ; r2 = int dst_width
+ ; r3 = int x
+
+ push {r4 - r6}
+ ldr r4, [sp,#12] ;int dx
+ adr r5, dx_offset
+ mov r6, r1
+ vpush {q0 - q3}
+ vpush {q8 - q15}
+
+ vdup.32 q0, r3 ; x
+ vdup.32 q1, r4 ; dx
+ vld1.32 {q2}, [r5] ; 0 1 2 3
+ vshl.i32 q9, q1, #2 ; 4 * dx
+ vmul.s32 q1, q1, q2
+ vmov.i8 q3, #0x7f ; 0x7F
+ vmov.i16 q15, #0x7f ; 0x7F
+ ; x , x + 1 * dx, x + 2 * dx, x + 3 * dx
+ vadd.s32 q8, q1, q0
+1
+ ; d0, d1: a
+ ; d2, d3: b
+ LOAD2_DATA32_LANE d0, d2, 0
+ LOAD2_DATA32_LANE d0, d2, 1
+ LOAD2_DATA32_LANE d1, d3, 0
+ LOAD2_DATA32_LANE d1, d3, 1
+ vshrn.i32 d22, q8, #9
+ vand.16 d22, d22, d30
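+ ; d22 = 7-bit blend fraction f per pixel (top 7 bits of the x fraction)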
+ vdup.8 d24, d22[0]
+ vdup.8 d25, d22[2]
+ vdup.8 d26, d22[4]
+ vdup.8 d27, d22[6]
+ vext.8 d4, d24, d25, #4
+ vext.8 d5, d26, d27, #4 ; f
+ veor.8 q10, q2, q3 ; 0x7f ^ f
+ vmull.u8 q11, d0, d20
+ vmull.u8 q12, d1, d21
+ vmull.u8 q13, d2, d4
+ vmull.u8 q14, d3, d5
+ vadd.i16 q11, q11, q13
+ vadd.i16 q12, q12, q14
+ vshrn.i16 d0, q11, #7
+ vshrn.i16 d1, q12, #7
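+ ; q0 = (a * (0x7f ^ f) + b * f) >> 7, the filtered ARGB pixels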
+
+ MEMACCESS 0
+ vst1.32 {d0, d1}, [r0]! ; store pixels
+ vadd.s32 q8, q8, q9
+ subs r2, r2, #4 ; 4 processed per loop
+ bgt %b1
+
+ vpop {q8 - q15}
+ vpop {q0 - q3}
+ pop {r4 - r6}
+ bx lr
+ ENDP
+
+ END
+
diff --git a/source/scale_neon.cc b/source/scale_neon.cc
index 9b4dce3..f98c939 100644
--- a/source/scale_neon.cc
+++ b/source/scale_neon.cc
@@ -15,6 +15,16 @@ namespace libyuv {
extern "C" {
#endif
+ /* !!! IMPORTANT: The following methods have been ported to pure assembly in scale_neon.asm,
+ * because MS Visual Studio does not support inline assembly for ARM.
+ *
+ * ANY CHANGE TO THESE METHOD IMPLEMENTATIONS MUST ALSO BE MADE IN scale_neon.asm.
+ *
+ * Eventually, only the pure assembly implementation should be used on all platforms
+ * to avoid code duplication.
+ */
+
+
// This module is for GCC Neon.
#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \
!defined(__aarch64__)
diff --git a/unit_test/unit_test.cc b/unit_test/unit_test.cc
index 7f8bcf8..9dd7ae4 100644
--- a/unit_test/unit_test.cc
+++ b/unit_test/unit_test.cc
@@ -388,7 +388,7 @@ LibYUVBaseTest::LibYUVBaseTest()
1280.0);
}
-int main(int argc, char** argv) {
+int main(int argc, char* argv[]) {
::testing::InitGoogleTest(&argc, argv);
// AllowCommandLineParsing allows us to ignore flags passed on to us by
// Chromium build bots without having to explicitly disable them.
diff --git a/util/compare.cc b/util/compare.cc
index ef0beef..5066b61 100644
--- a/util/compare.cc
+++ b/util/compare.cc
@@ -17,7 +17,7 @@
#include "libyuv/compare.h"
#include "libyuv/version.h"
-int main(int argc, char** argv) {
+int main(int argc, char* argv[]) {
if (argc < 1) {
printf("libyuv compare v%d\n", LIBYUV_VERSION);
printf("compare file1.yuv file2.yuv\n");
@@ -62,4 +62,5 @@ int main(int argc, char** argv) {
fclose(fin2);
}
fclose(fin1);
+ return 0;
}
diff --git a/util/convert.cc b/util/convert.cc
index acaf43a..3cb0b51 100644
--- a/util/convert.cc
+++ b/util/convert.cc
@@ -78,7 +78,7 @@ void PrintHelp(const char* program) {
exit(0);
}
-void ParseOptions(int argc, const char* argv[]) {
+void ParseOptions(int argc, char* argv[]) {
if (argc <= 1)
PrintHelp(argv[0]);
for (int c = 1; c < argc; ++c) {
@@ -189,7 +189,7 @@ static int TileARGBScale(const uint8* src_argb,
return 0;
}
-int main(int argc, const char* argv[]) {
+int main(int argc, char* argv[]) {
ParseOptions(argc, argv);
// Open original file (first file argument)
diff --git a/util/psnr_main.cc b/util/psnr_main.cc
index 01e8777..2d412a3 100644
--- a/util/psnr_main.cc
+++ b/util/psnr_main.cc
@@ -149,7 +149,7 @@ void PrintHelp(const char* program) {
exit(0);
}
-void ParseOptions(int argc, const char* argv[]) {
+void ParseOptions(int argc, char* argv[]) {
if (argc <= 1)
PrintHelp(argv[0]);
for (int c = 1; c < argc; ++c) {
@@ -315,7 +315,7 @@ bool UpdateMetrics(uint8* ch_org,
return ismin;
}
-int main(int argc, const char* argv[]) {
+int main(int argc, char* argv[]) {
ParseOptions(argc, argv);
if (!do_psnr && !do_ssim) {
do_psnr = true;