Created
June 29, 2017 20:41
-
-
Save robin-raymond/d6c0afbcad7840131bf46e3f53d1543c to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
diff --git a/.gitignore b/.gitignore | |
index b0e9574..5360877 100644 | |
--- a/.gitignore | |
+++ b/.gitignore | |
@@ -33,3 +33,6 @@ source/*.o | |
# Files generated by perf | |
perf.data | |
perf.data.old | |
+*.vcxproj.filters | |
+*.vcxproj | |
+libyuv*.sln | |
\ No newline at end of file | |
diff --git a/BUILD.gn b/BUILD.gn | |
index 4f56cdc..1c02a46 100644 | |
--- a/BUILD.gn | |
+++ b/BUILD.gn | |
@@ -40,9 +40,15 @@ group("libyuv") { | |
public_configs = [ ":libyuv_config" ] | |
if (is_win && target_cpu == "x64") { | |
- public_deps = [ | |
- ":libyuv_internal(//build/toolchain/win:clang_x64)", | |
- ] | |
+ if (is_winuwp) { | |
+ public_deps = [ | |
+ ":libyuv_internal", | |
+ ] | |
+ } else { | |
+ public_deps = [ | |
+ ":libyuv_internal(//build/toolchain/win:clang_x64)", | |
+ ] | |
+ } | |
} else { | |
public_deps = [ | |
":libyuv_internal", | |
@@ -119,6 +125,9 @@ static_library("libyuv_internal") { | |
defines += [ "HAVE_JPEG" ] | |
deps += [ "//third_party:jpeg" ] | |
} | |
+ if (is_winuwp) { | |
+ deps += [ "//third_party/winuwp_compat:force_include_std" ] | |
+ } | |
if (libyuv_use_neon) { | |
deps += [ ":libyuv_neon" ] | |
@@ -257,6 +266,9 @@ if (libyuv_include_tests) { | |
if (is_android) { | |
deps += [ "//testing/android/native_test:native_test_native_code" ] | |
} | |
+ if (is_winuwp) { | |
+ deps += [ "//third_party/winuwp_compat:force_include_std" ] | |
+ } | |
# TODO(YangZhang): These lines can be removed when high accuracy | |
# YUV to RGB to Neon is ported. | |
@@ -287,6 +299,9 @@ if (libyuv_include_tests) { | |
if (is_linux) { | |
cflags = [ "-fexceptions" ] | |
} | |
+ if (is_winuwp) { | |
+ deps += [ "//third_party/winuwp_compat:wrap_main_utf8_cc" ] | |
+ } | |
} | |
executable("convert") { | |
@@ -300,6 +315,9 @@ if (libyuv_include_tests) { | |
if (is_linux) { | |
cflags = [ "-fexceptions" ] | |
} | |
+ if (is_winuwp) { | |
+ deps += [ "//third_party/winuwp_compat:wrap_main_utf8_cc" ] | |
+ } | |
} | |
executable("psnr") { | |
@@ -316,6 +334,9 @@ if (libyuv_include_tests) { | |
if (!is_ios && !libyuv_disable_jpeg) { | |
defines = [ "HAVE_JPEG" ] | |
} | |
+ if (is_winuwp) { | |
+ deps += [ "//third_party/winuwp_compat:wrap_main_utf8_cc" ] | |
+ } | |
} | |
executable("cpuid") { | |
diff --git a/armasm_ms.config b/armasm_ms.config | |
new file mode 100644 | |
index 0000000..b617231 | |
--- /dev/null | |
+++ b/armasm_ms.config | |
@@ -0,0 +1 @@ | |
+-I src -oldit | |
diff --git a/include/libyuv/row.h b/include/libyuv/row.h | |
index 3e5dd20..a1e4722 100644 | |
--- a/include/libyuv/row.h | |
+++ b/include/libyuv/row.h | |
@@ -625,7 +625,7 @@ extern const struct YuvConstants SIMD_ALIGNED(kYvuH709Constants); // BT.709 | |
#op " $" #sel ",%%" #reg "," #offset "(%" #base ",%" #index "," #scale ")\n" | |
#endif // defined(__native_client__) && defined(__x86_64__) | |
-#if defined(__arm__) || defined(__aarch64__) | |
+#if defined(__arm__) || defined(__aarch64__) || defined(_M_ARM) | |
#undef MEMACCESS | |
#if defined(__native_client__) | |
#define MEMACCESS(base) ".p2align 3\nbic %" #base ", #0xc0000000\n" | |
diff --git a/libyuv.gyp b/libyuv.gyp | |
index f73a1a4..6c72007 100644 | |
--- a/libyuv.gyp | |
+++ b/libyuv.gyp | |
@@ -30,7 +30,7 @@ | |
'build_neon': 0, | |
'build_msa': 0, | |
'conditions': [ | |
- ['(target_arch == "armv7" or target_arch == "armv7s" or \ | |
+ ['(OS_RUNTIME=="winuwp" and (winuwp_platform=="win_phone" or winuwp_platform=="win10_arm")) or (target_arch == "armv7" or target_arch == "armv7s" or \ | |
(target_arch == "arm" and arm_version >= 7) or target_arch == "arm64")\ | |
and (arm_neon == 1 or arm_neon_optional == 1)', { | |
'build_neon': 1, | |
@@ -85,6 +85,35 @@ | |
# '-marm', # arm32 not thumb | |
], | |
}], | |
+ ['OS_RUNTIME=="winuwp" and (winuwp_platform=="win_phone" or winuwp_platform=="win10_arm")', { | |
+ 'defines': [ | |
+ 'WINUWP', | |
+ '__ARM_NEON__', | |
+ ], | |
+ 'sources': [ | |
+ # sources. | |
+ 'source/arm_asm_macros.in', | |
+ 'source/compare_neon.asm', | |
+ 'source/rotate_neon.asm', | |
+ 'source/scale_neon.asm', | |
+ 'source/row_neon.asm' | |
+ ], | |
+ 'sources!': [ | |
+ # sources. | |
+ 'source/compare_neon.cc', | |
+ 'source/compare_neon64.cc', | |
+ 'source/rotate_neon.cc', | |
+ 'source/rotate_neon64.cc', | |
+ 'source/row_neon.cc', | |
+ 'source/row_neon64.cc', | |
+ 'source/scale_neon.cc', | |
+ 'source/scale_neon64.cc', | |
+ ], | |
+ }], | |
+ ], | |
+ 'include_dirs': [ | |
+ 'include', | |
+ '.', | |
], | |
}], | |
['build_msa != 0', { | |
diff --git a/libyuv_test.gyp b/libyuv_test.gyp | |
index 88860f5..abb3c89 100644 | |
--- a/libyuv_test.gyp | |
+++ b/libyuv_test.gyp | |
@@ -18,8 +18,8 @@ | |
'type': '<(gtest_target_type)', | |
'dependencies': [ | |
'libyuv.gyp:libyuv', | |
- 'testing/gtest.gyp:gtest', | |
- 'third_party/gflags/gflags.gyp:gflags', | |
+ '<(DEPTH)/testing/gtest.gyp:gtest', | |
+ '<(DEPTH)/third_party/gflags/gflags.gyp:gflags', | |
], | |
'direct_dependent_settings': { | |
'defines': [ | |
@@ -49,6 +49,11 @@ | |
'unit_test/video_common_test.cc', | |
], | |
'conditions': [ | |
+ ['OS=="win" and OS_RUNTIME=="winuwp"', { | |
+ 'defines': [ | |
+ 'WINUWP', | |
+ ], | |
+ }], | |
['OS=="linux"', { | |
'cflags': [ | |
'-fexceptions', | |
@@ -83,6 +88,7 @@ | |
[ '(target_arch == "armv7" or target_arch == "armv7s" \ | |
or (target_arch == "arm" and arm_version >= 7) \ | |
or target_arch == "arm64") \ | |
+ or ((winuwp_platform=="win_phone" or winuwp_platform=="win10_arm") and (arm_neon == 1 or arm_neon_optional == 1)) \ | |
and (arm_neon == 1 or arm_neon_optional == 1)', { | |
'defines': [ | |
'LIBYUV_NEON' | |
@@ -185,7 +191,7 @@ | |
'input_shlib_path': '<(SHARED_LIB_DIR)/(SHARED_LIB_PREFIX)libyuv_unittest<(SHARED_LIB_SUFFIX)', | |
}, | |
'includes': [ | |
- 'build/apk_test.gypi', | |
+ # 'build/apk_test.gypi', | |
], | |
'dependencies': [ | |
'libyuv_unittest', | |
diff --git a/source/arm_asm_macros.in b/source/arm_asm_macros.in | |
new file mode 100644 | |
index 0000000..eb54c4b | |
--- /dev/null | |
+++ b/source/arm_asm_macros.in | |
@@ -0,0 +1,22 @@ | |
+; | |
+; Copyright 2012 The LibYuv Project Authors. All rights reserved. | |
+; | |
+; Use of this source code is governed by a BSD-style license | |
+; that can be found in the LICENSE file in the root of the source | |
+; tree. An additional intellectual property rights grant can be found | |
+; in the file PATENTS. All contributing project authors may | |
+; be found in the AUTHORS file in the root of the source tree. | |
+; | |
+ | |
+ AREA |.text|, CODE, READONLY | |
+ | |
+ MACRO | |
+ MEMACCESS $base | |
+ ; Alternative of MEMACCESS macro defined in row.h. | |
+ ; Currently assembler source files are used only for Windows Phone (MS armasm compiler), | |
+ ; so this macro is empty. It is defined for code compatibility. | |
+ ; Eventually asm. source files should be used for all platforms, so some platforms might require | |
+ ; some implementation. See row.h (define MEMACCESS) for details. | |
+ MEND | |
+ | |
+ END | |
diff --git a/source/compare_neon.asm b/source/compare_neon.asm | |
new file mode 100644 | |
index 0000000..85251f1 | |
--- /dev/null | |
+++ b/source/compare_neon.asm | |
@@ -0,0 +1,56 @@ | |
+; | |
+; Copyright 2012 The LibYuv Project Authors. All rights reserved. | |
+; | |
+; Use of this source code is governed by a BSD-style license | |
+; that can be found in the LICENSE file in the root of the source | |
+; tree. An additional intellectual property rights grant can be found | |
+; in the file PATENTS. All contributing project authors may | |
+; be found in the AUTHORS file in the root of the source tree. | |
+; | |
+ | |
+ AREA |.text|, CODE, READONLY, ALIGN=2 | |
+ | |
+ GET source/arm_asm_macros.in | |
+ | |
+ EXPORT SumSquareError_NEON | |
+ | |
+SumSquareError_NEON PROC | |
+ ; input | |
+ ; r0 = uint8* src_a | |
+ ; r1 = uint8* src_b | |
+ ; r2 = int count | |
+ ; output | |
+ ; r0 = int | |
+ vpush {q0, q1, q2, q3} | |
+ vpush {q8, q9, q10, q11} | |
+ | |
+ vmov.u8 q8, #0 | |
+ vmov.u8 q10, #0 | |
+ vmov.u8 q9, #0 | |
+ vmov.u8 q11, #0 | |
+loop | |
+ MEMACCESS 0 | |
+ vld1.8 {q0}, [r0]! | |
+ MEMACCESS 1 | |
+ vld1.8 {q1}, [r1]! | |
+ subs r2, r2, #16 | |
+ vsubl.u8 q2, d0, d2 | |
+ vsubl.u8 q3, d1, d3 | |
+ vmlal.s16 q8, d4, d4 | |
+ vmlal.s16 q9, d6, d6 | |
+ vmlal.s16 q10, d5, d5 | |
+ vmlal.s16 q11, d7, d7 | |
+ bgt loop | |
+ | |
+ vadd.u32 q8, q8, q9 | |
+ vadd.u32 q10, q10, q11 | |
+ vadd.u32 q11, q8, q10 | |
+ vpaddl.u32 q1, q11 | |
+ vadd.u64 d0, d2, d3 | |
+ vmov.32 r0, d0[0] | |
+ vpop {q8, q9, q10, q11} | |
+ vpop {q0, q1, q2, q3} | |
+ bx lr | |
+ ENDP | |
+ | |
+ END | |
diff --git a/source/compare_neon.cc b/source/compare_neon.cc | |
index 49aa3b4..3148260 100644 | |
--- a/source/compare_neon.cc | |
+++ b/source/compare_neon.cc | |
@@ -21,6 +21,15 @@ extern "C" { | |
#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \ | |
!defined(__aarch64__) | |
+ /* !!! IMPORTANT: Following method has been ported to pure assembler to compare_neon.asm, | |
+ * because MS Visual Studio doesn't support inline assembler for ARM. | |
+ * | |
+ * ALL CHANGES IN METHOD IMPLEMENTATION HAS TO BE DONE ALSO IN compare_neon.asm | |
+ * | |
+ * Eventually, only pure assembler implementation should be used for all platforms | |
+ * to avoid code duplication. | |
+ */ | |
+ | |
uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count) { | |
volatile uint32 sse; | |
asm volatile ( | |
diff --git a/source/cpu_id.cc b/source/cpu_id.cc | |
index afb5d28..fb462be 100644 | |
--- a/source/cpu_id.cc | |
+++ b/source/cpu_id.cc | |
@@ -29,6 +29,10 @@ | |
#include "libyuv/basic_types.h" // For CPU_X86 | |
+#if defined(WINUWP) && defined(_M_ARM) | |
+ #include <windows.h> | |
+#endif | |
+ | |
#ifdef __cplusplus | |
namespace libyuv { | |
extern "C" { | |
@@ -317,6 +321,13 @@ LIBYUV_API SAFEBUFFERS int InitCpuFlags(void) { | |
cpu_info &= ~kCpuHasNEON; | |
} | |
#endif // __arm__ | |
+#if defined (WINUWP) && defined(_M_ARM) | |
+ // Windows Runtime on ARM | |
+ if (IsProcessorFeaturePresent(PF_ARM_NEON_INSTRUCTIONS_AVAILABLE)) | |
+ { | |
+ cpu_info |= kCpuHasNEON; | |
+ } | |
+#endif | |
if (TestEnv("LIBYUV_DISABLE_ASM")) { | |
cpu_info = 0; | |
} | |
diff --git a/source/rotate_neon.asm b/source/rotate_neon.asm | |
new file mode 100644 | |
index 0000000..4d53eed | |
--- /dev/null | |
+++ b/source/rotate_neon.asm | |
@@ -0,0 +1,522 @@ | |
+; | |
+; Copyright 2012 The LibYuv Project Authors. All rights reserved. | |
+; | |
+; Use of this source code is governed by a BSD-style license | |
+; that can be found in the LICENSE file in the root of the source | |
+; tree. An additional intellectual property rights grant can be found | |
+; in the file PATENTS. All contributing project authors may | |
+; be found in the AUTHORS file in the root of the source tree. | |
+; | |
+ | |
+ AREA |.text|, CODE, READONLY, ALIGN=2 | |
+ | |
+ GET source/arm_asm_macros.in | |
+ | |
+ EXPORT TransposeWx8_NEON | |
+ EXPORT TransposeUVWx8_NEON | |
+ | |
+kVTbl4x4Transpose DCB 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 | |
+kVTbl4x4TransposeDi DCB 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15 | |
+ | |
+TransposeWx8_NEON PROC | |
+ ; input | |
+ ; r0 = uint8* src | |
+ ; r1 = int src_stride | |
+ ; r2 = uint8* dst | |
+ ; r3 = int dst_stride | |
+ | |
+ push {r4-r6} | |
+ ldr r4, [sp, #12] ; load parameter int width | |
+ adr r6, kVTbl4x4Transpose | |
+ vpush {q0, q1, q2, q3} | |
+ | |
+ ; loops are on blocks of 8. loop will stop when | |
+ ; counter gets to or below 0. starting the counter | |
+ ; at w-8 allow for this | |
+ sub r4, #8 | |
+ | |
+ ; handle 8x8 blocks. this should be the majority of the plane | |
+1 | |
+ mov r5, r0 | |
+ | |
+ MEMACCESS 0 | |
+ vld1.8 {d0}, [r5], r1 | |
+ MEMACCESS 0 | |
+ vld1.8 {d1}, [r5], r1 | |
+ MEMACCESS 0 | |
+ vld1.8 {d2}, [r5], r1 | |
+ MEMACCESS 0 | |
+ vld1.8 {d3}, [r5], r1 | |
+ MEMACCESS 0 | |
+ vld1.8 {d4}, [r5], r1 | |
+ MEMACCESS 0 | |
+ vld1.8 {d5}, [r5], r1 | |
+ MEMACCESS 0 | |
+ vld1.8 {d6}, [r5], r1 | |
+ MEMACCESS 0 | |
+ vld1.8 {d7}, [r5] | |
+ | |
+ vtrn.8 d1, d0 | |
+ vtrn.8 d3, d2 | |
+ vtrn.8 d5, d4 | |
+ vtrn.8 d7, d6 | |
+ | |
+ vtrn.16 d1, d3 | |
+ vtrn.16 d0, d2 | |
+ vtrn.16 d5, d7 | |
+ vtrn.16 d4, d6 | |
+ | |
+ vtrn.32 d1, d5 | |
+ vtrn.32 d0, d4 | |
+ vtrn.32 d3, d7 | |
+ vtrn.32 d2, d6 | |
+ | |
+ vrev16.8 q0, q0 | |
+ vrev16.8 q1, q1 | |
+ vrev16.8 q2, q2 | |
+ vrev16.8 q3, q3 | |
+ | |
+ mov r5, r2 | |
+ | |
+ MEMACCESS 0 | |
+ vst1.8 {d1}, [r5], r3 | |
+ MEMACCESS 0 | |
+ vst1.8 {d0}, [r5], r3 | |
+ MEMACCESS 0 | |
+ vst1.8 {d3}, [r5], r3 | |
+ MEMACCESS 0 | |
+ vst1.8 {d2}, [r5], r3 | |
+ MEMACCESS 0 | |
+ vst1.8 {d5}, [r5], r3 | |
+ MEMACCESS 0 | |
+ vst1.8 {d4}, [r5], r3 | |
+ MEMACCESS 0 | |
+ vst1.8 {d7}, [r5], r3 | |
+ MEMACCESS 0 | |
+ vst1.8 {d6}, [r5] | |
+ | |
+ add r0, #8 ; src += 8 | |
+ add r2, r2, r3, lsl #3 ; dst += 8 * dst_stride | |
+ subs r4, #8 ; -= 8 | |
+ bge %b1 | |
+ | |
+ ; add 8 back to counter. if the result is 0 there are | |
+ ; no residuals. | |
+ adds r4, #8 | |
+ beq %f4 | |
+ | |
+ ; some residual, so between 1 and 7 lines left to transpose | |
+ cmp r4, #2 | |
+ blt %f3 | |
+ | |
+ cmp r4, #4 | |
+ blt %f2 | |
+ | |
+ ; 4x8 block | |
+ mov r5, r0 | |
+ MEMACCESS 0 | |
+ vld1.32 {d0[0]}, [r5], r1 | |
+ MEMACCESS 0 | |
+ vld1.32 {d0[1]}, [r5], r1 | |
+ MEMACCESS 0 | |
+ vld1.32 {d1[0]}, [r5], r1 | |
+ MEMACCESS 0 | |
+ vld1.32 {d1[1]}, [r5], r1 | |
+ MEMACCESS 0 | |
+ vld1.32 {d2[0]}, [r5], r1 | |
+ MEMACCESS 0 | |
+ vld1.32 {d2[1]}, [r5], r1 | |
+ MEMACCESS 0 | |
+ vld1.32 {d3[0]}, [r5], r1 | |
+ MEMACCESS 0 | |
+ vld1.32 {d3[1]}, [r5] | |
+ | |
+ mov r5, r2 | |
+ | |
+ MEMACCESS 6 | |
+ vld1.8 {q3}, [r6] | |
+ | |
+ vtbl.8 d4, {d0, d1}, d6 | |
+ vtbl.8 d5, {d0, d1}, d7 | |
+ vtbl.8 d0, {d2, d3}, d6 | |
+ vtbl.8 d1, {d2, d3}, d7 | |
+ | |
+ ; TODO(frkoenig): Rework shuffle above to | |
+ ; write out with 4 instead of 8 writes. | |
+ MEMACCESS 0 | |
+ vst1.32 {d4[0]}, [r5], r3 | |
+ MEMACCESS 0 | |
+ vst1.32 {d4[1]}, [r5], r3 | |
+ MEMACCESS 0 | |
+ vst1.32 {d5[0]}, [r5], r3 | |
+ MEMACCESS 0 | |
+ vst1.32 {d5[1]}, [r5] | |
+ | |
+ add r5, r2, #4 | |
+ MEMACCESS 0 | |
+ vst1.32 {d0[0]}, [r5], r3 | |
+ MEMACCESS 0 | |
+ vst1.32 {d0[1]}, [r5], r3 | |
+ MEMACCESS 0 | |
+ vst1.32 {d1[0]}, [r5], r3 | |
+ MEMACCESS 0 | |
+ vst1.32 {d1[1]}, [r5] | |
+ | |
+ add r0, #4 ; src += 4 | |
+ add r2, r2, r3, lsl #2 ; dst += 4 * dst_stride | |
+ subs r4, #4 ; w -= 4 | |
+ beq %f4 | |
+ | |
+ ; some residual, check to see if it includes a 2x8 block, | |
+ ; or less | |
+ cmp r4, #2 | |
+ blt %f3 | |
+ | |
+ ; 2x8 block | |
+2 | |
+ mov r5, r0 | |
+ MEMACCESS 0 | |
+ vld1.16 {d0[0]}, [r5], r1 | |
+ MEMACCESS 0 | |
+ vld1.16 {d1[0]}, [r5], r1 | |
+ MEMACCESS 0 | |
+ vld1.16 {d0[1]}, [r5], r1 | |
+ MEMACCESS 0 | |
+ vld1.16 {d1[1]}, [r5], r1 | |
+ MEMACCESS 0 | |
+ vld1.16 {d0[2]}, [r5], r1 | |
+ MEMACCESS 0 | |
+ vld1.16 {d1[2]}, [r5], r1 | |
+ MEMACCESS 0 | |
+ vld1.16 {d0[3]}, [r5], r1 | |
+ MEMACCESS 0 | |
+ vld1.16 {d1[3]}, [r5] | |
+ | |
+ vtrn.8 d0, d1 | |
+ | |
+ mov r5, r2 | |
+ | |
+ MEMACCESS 0 | |
+ vst1.64 {d0}, [r5], r3 | |
+ MEMACCESS 0 | |
+ vst1.64 {d1}, [r5] | |
+ | |
+ add r0, #2 ; src += 2 | |
+ add r2, r2, r3, lsl #1 ; dst += 2 * dst_stride | |
+ subs r4, #2 ; w -= 2 | |
+ beq %f4 | |
+ | |
+ ; 1x8 block | |
+3 | |
+ MEMACCESS 1 | |
+ vld1.8 {d0[0]}, [r0], r1 | |
+ MEMACCESS 1 | |
+ vld1.8 {d0[1]}, [r0], r1 | |
+ MEMACCESS 1 | |
+ vld1.8 {d0[2]}, [r0], r1 | |
+ MEMACCESS 1 | |
+ vld1.8 {d0[3]}, [r0], r1 | |
+ MEMACCESS 1 | |
+ vld1.8 {d0[4]}, [r0], r1 | |
+ MEMACCESS 1 | |
+ vld1.8 {d0[5]}, [r0], r1 | |
+ MEMACCESS 1 | |
+ vld1.8 {d0[6]}, [r0], r1 | |
+ MEMACCESS 1 | |
+ vld1.8 {d0[7]}, [r0] | |
+ | |
+ MEMACCESS 3 | |
+ vst1.64 {d0}, [r2] | |
+ | |
+4 | |
+ vpop {q0, q1, q2, q3} | |
+ pop {r4-r6} | |
+ bx lr | |
+ ENDP | |
+ | |
+TransposeUVWx8_NEON PROC | |
+ ; input | |
+ ; r0 = uint8* src | |
+ ; r1 = int src_stride | |
+ ; r2 = uint8* dst_a | |
+ ; r3 = int dst_stride_a | |
+ push {r4-r8} | |
+ ldr r5, [sp, #20] ; load uint8* dst_b | |
+ ldr r6, [sp, #24] ; int dst_stride_b | |
+ ldr r7, [sp, #28] ; int width | |
+ adr r8, kVTbl4x4TransposeDi | |
+ vpush {q0, q1, q2, q3} | |
+ vpush {q8, q9, q10, q11} | |
+ | |
+ ; loops are on blocks of 8. loop will stop when | |
+ ; counter gets to or below 0. starting the counter | |
+ ; at w-8 allow for this | |
+ sub r7, #8 | |
+ | |
+ ; handle 8x8 blocks. this should be the majority of the plane | |
+1 | |
+ mov r4, r0 | |
+ | |
+ MEMACCESS 0 | |
+ vld2.8 {d0, d1}, [r4], r1 | |
+ MEMACCESS 0 | |
+ vld2.8 {d2, d3}, [r4], r1 | |
+ MEMACCESS 0 | |
+ vld2.8 {d4, d5}, [r4], r1 | |
+ MEMACCESS 0 | |
+ vld2.8 {d6, d7}, [r4], r1 | |
+ MEMACCESS 0 | |
+ vld2.8 {d16, d17}, [r4], r1 | |
+ MEMACCESS 0 | |
+ vld2.8 {d18, d19}, [r4], r1 | |
+ MEMACCESS 0 | |
+ vld2.8 {d20, d21}, [r4], r1 | |
+ MEMACCESS 0 | |
+ vld2.8 {d22, d23}, [r4] | |
+ | |
+ vtrn.8 q1, q0 | |
+ vtrn.8 q3, q2 | |
+ vtrn.8 q9, q8 | |
+ vtrn.8 q11, q10 | |
+ | |
+ vtrn.16 q1, q3 | |
+ vtrn.16 q0, q2 | |
+ vtrn.16 q9, q11 | |
+ vtrn.16 q8, q10 | |
+ | |
+ vtrn.32 q1, q9 | |
+ vtrn.32 q0, q8 | |
+ vtrn.32 q3, q11 | |
+ vtrn.32 q2, q10 | |
+ | |
+ vrev16.8 q0, q0 | |
+ vrev16.8 q1, q1 | |
+ vrev16.8 q2, q2 | |
+ vrev16.8 q3, q3 | |
+ vrev16.8 q8, q8 | |
+ vrev16.8 q9, q9 | |
+ vrev16.8 q10, q10 | |
+ vrev16.8 q11, q11 | |
+ | |
+ mov r4, r2 | |
+ | |
+ MEMACCESS 0 | |
+ vst1.8 {d2}, [r4], r3 | |
+ MEMACCESS 0 | |
+ vst1.8 {d0}, [r4], r3 | |
+ MEMACCESS 0 | |
+ vst1.8 {d6}, [r4], r3 | |
+ MEMACCESS 0 | |
+ vst1.8 {d4}, [r4], r3 | |
+ MEMACCESS 0 | |
+ vst1.8 {d18}, [r4], r3 | |
+ MEMACCESS 0 | |
+ vst1.8 {d16}, [r4], r3 | |
+ MEMACCESS 0 | |
+ vst1.8 {d22}, [r4], r3 | |
+ MEMACCESS 0 | |
+ vst1.8 {d20}, [r4] | |
+ | |
+ mov r4, r5 | |
+ | |
+ MEMACCESS 0 | |
+ vst1.8 {d3}, [r4], r6 | |
+ MEMACCESS 0 | |
+ vst1.8 {d1}, [r4], r6 | |
+ MEMACCESS 0 | |
+ vst1.8 {d7}, [r4], r6 | |
+ MEMACCESS 0 | |
+ vst1.8 {d5}, [r4], r6 | |
+ MEMACCESS 0 | |
+ vst1.8 {d19}, [r4], r6 | |
+ MEMACCESS 0 | |
+ vst1.8 {d17}, [r4], r6 | |
+ MEMACCESS 0 | |
+ vst1.8 {d23}, [r4], r6 | |
+ MEMACCESS 0 | |
+ vst1.8 {d21}, [r4] | |
+ | |
+ add r0, #8*2 ; src += 8*2 | |
+ add r2, r2, r3, lsl #3 ; dst_a += 8 * dst_stride_a | |
+ add r5, r5, r6, lsl #3 ; dst_b += 8 * dst_stride_b | |
+ subs r7, #8 ; w -= 8 | |
+ bge %b1 | |
+ | |
+ ; add 8 back to counter. if the result is 0 there are | |
+ ; no residuals. | |
+ adds r7, #8 | |
+ beq %f4 | |
+ | |
+ ; some residual, so between 1 and 7 lines left to transpose | |
+ cmp r7, #2 | |
+ blt %f3 | |
+ | |
+ cmp r7, #4 | |
+ blt %f2 | |
+ | |
+ ; TODO(frkoenig): Clean this up | |
+ ; 4x8 block | |
+ mov r4, r0 | |
+ MEMACCESS 0 | |
+ vld1.64 {d0}, [r4], r1 | |
+ MEMACCESS 0 | |
+ vld1.64 {d1}, [r4], r1 | |
+ MEMACCESS 0 | |
+ vld1.64 {d2}, [r4], r1 | |
+ MEMACCESS 0 | |
+ vld1.64 {d3}, [r4], r1 | |
+ MEMACCESS 0 | |
+ vld1.64 {d4}, [r4], r1 | |
+ MEMACCESS 0 | |
+ vld1.64 {d5}, [r4], r1 | |
+ MEMACCESS 0 | |
+ vld1.64 {d6}, [r4], r1 | |
+ MEMACCESS 0 | |
+ vld1.64 {d7}, [r4] | |
+ | |
+ MEMACCESS 8 | |
+ vld1.8 {q15}, [r8] | |
+ | |
+ vtrn.8 q0, q1 | |
+ vtrn.8 q2, q3 | |
+ | |
+ vtbl.8 d16, {d0, d1}, d30 | |
+ vtbl.8 d17, {d0, d1}, d31 | |
+ vtbl.8 d18, {d2, d3}, d30 | |
+ vtbl.8 d19, {d2, d3}, d31 | |
+ vtbl.8 d20, {d4, d5}, d30 | |
+ vtbl.8 d21, {d4, d5}, d31 | |
+ vtbl.8 d22, {d6, d7}, d30 | |
+ vtbl.8 d23, {d6, d7}, d31 | |
+ | |
+ mov r4, r2 | |
+ | |
+ MEMACCESS 0 | |
+ vst1.32 {d16[0]}, [r4], r3 | |
+ MEMACCESS 0 | |
+ vst1.32 {d16[1]}, [r4], r3 | |
+ MEMACCESS 0 | |
+ vst1.32 {d17[0]}, [r4], r3 | |
+ MEMACCESS 0 | |
+ vst1.32 {d17[1]}, [r4], r3 | |
+ | |
+ add r4, r2, #4 | |
+ MEMACCESS 0 | |
+ vst1.32 {d20[0]}, [r4], r3 | |
+ MEMACCESS 0 | |
+ vst1.32 {d20[1]}, [r4], r3 | |
+ MEMACCESS 0 | |
+ vst1.32 {d21[0]}, [r4], r3 | |
+ MEMACCESS 0 | |
+ vst1.32 {d21[1]}, [r4] | |
+ | |
+ mov r4, r5 | |
+ | |
+ MEMACCESS 0 | |
+ vst1.32 {d18[0]}, [r4], r6 | |
+ MEMACCESS 0 | |
+ vst1.32 {d18[1]}, [r4], r6 | |
+ MEMACCESS 0 | |
+ vst1.32 {d19[0]}, [r4], r6 | |
+ MEMACCESS 0 | |
+ vst1.32 {d19[1]}, [r4], r6 | |
+ | |
+ add r4, r5, #4 | |
+ MEMACCESS 0 | |
+ vst1.32 {d22[0]}, [r4], r6 | |
+ MEMACCESS 0 | |
+ vst1.32 {d22[1]}, [r4], r6 | |
+ MEMACCESS 0 | |
+ vst1.32 {d23[0]}, [r4], r6 | |
+ MEMACCESS 0 | |
+ vst1.32 {d23[1]}, [r4] | |
+ | |
+ add r0, #4*2 ; src += 4 * 2 | |
+ add r2, r2, r3, lsl #2 ; dst_a += 4 * dst_stride_a | |
+ add r5, r5, r6, lsl #2 ; dst_b += 4 * dst_stride_b | |
+ subs r7, #4 ; w -= 4 | |
+ beq %f4 | |
+ | |
+ ; some residual, check to see if it includes a 2x8 block, | |
+ ; or less | |
+ cmp r7, #2 | |
+ blt %f3 | |
+ | |
+ ; 2x8 block | |
+2 | |
+ mov r4, r0 | |
+ MEMACCESS 0 | |
+ vld2.16 {d0[0], d2[0]}, [r4], r1 | |
+ MEMACCESS 0 | |
+ vld2.16 {d1[0], d3[0]}, [r4], r1 | |
+ MEMACCESS 0 | |
+ vld2.16 {d0[1], d2[1]}, [r4], r1 | |
+ MEMACCESS 0 | |
+ vld2.16 {d1[1], d3[1]}, [r4], r1 | |
+ MEMACCESS 0 | |
+ vld2.16 {d0[2], d2[2]}, [r4], r1 | |
+ MEMACCESS 0 | |
+ vld2.16 {d1[2], d3[2]}, [r4], r1 | |
+ MEMACCESS 0 | |
+ vld2.16 {d0[3], d2[3]}, [r4], r1 | |
+ MEMACCESS 0 | |
+ vld2.16 {d1[3], d3[3]}, [r4] | |
+ | |
+ vtrn.8 d0, d1 | |
+ vtrn.8 d2, d3 | |
+ | |
+ mov r4, r2 | |
+ | |
+ MEMACCESS 0 | |
+ vst1.64 {d0}, [r4], r3 | |
+ MEMACCESS 0 | |
+ vst1.64 {d2}, [r4] | |
+ | |
+ mov r4, r5 | |
+ | |
+ MEMACCESS 0 | |
+ vst1.64 {d1}, [r4], r6 | |
+ MEMACCESS 0 | |
+ vst1.64 {d3}, [r4] | |
+ | |
+ add r0, #2*2 ; src += 2 * 2 | |
+ add r2, r2, r3, lsl #1 ; dst_a += 2 * dst_stride_a | |
+ add r5, r5, r6, lsl #1 ; dst_b += 2 * dst_stride_b | |
+ subs r7, #2 ; w -= 2 | |
+ beq %f4 | |
+ | |
+ ; 1x8 block | |
+3 | |
+ MEMACCESS 1 | |
+ vld2.8 {d0[0], d1[0]}, [r0], r1 | |
+ MEMACCESS 1 | |
+ vld2.8 {d0[1], d1[1]}, [r0], r1 | |
+ MEMACCESS 1 | |
+ vld2.8 {d0[2], d1[2]}, [r0], r1 | |
+ MEMACCESS 1 | |
+ vld2.8 {d0[3], d1[3]}, [r0], r1 | |
+ MEMACCESS 1 | |
+ vld2.8 {d0[4], d1[4]}, [r0], r1 | |
+ MEMACCESS 1 | |
+ vld2.8 {d0[5], d1[5]}, [r0], r1 | |
+ MEMACCESS 1 | |
+ vld2.8 {d0[6], d1[6]}, [r0], r1 | |
+ MEMACCESS 1 | |
+ vld2.8 {d0[7], d1[7]}, [r0] | |
+ | |
+ MEMACCESS 3 | |
+ vst1.64 {d0}, [r2] | |
+ MEMACCESS 5 | |
+ vst1.64 {d1}, [r5] | |
+4 | |
+ | |
+ vpop {q8, q9, q10, q11} | |
+ vpop {q0, q1, q2, q3} | |
+ pop {r4-r8} | |
+ bx lr | |
+ ENDP | |
+ | |
+ END | |
+ | |
+ | |
+ | |
diff --git a/source/rotate_neon.cc b/source/rotate_neon.cc | |
index 41ec34e..5c6f938 100644 | |
--- a/source/rotate_neon.cc | |
+++ b/source/rotate_neon.cc | |
@@ -18,6 +18,15 @@ namespace libyuv { | |
extern "C" { | |
#endif | |
+/* !!! IMPORTANT: Following methods has been ported to pure assembler to rotate_neon.asm, | |
+* because MS Visual Studio doesn't support inline assembler for ARM. | |
+* | |
+* ALL CHANGES IN METHODS IMPLEMENTATION HAS TO BE DONE ALSO IN rotate_neon.asm | |
+* | |
+* Eventually, only pure assembler implementation should be used for all platforms | |
+* to avoid code duplication. | |
+*/ | |
+ | |
#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \ | |
!defined(__aarch64__) | |
diff --git a/source/row_neon.asm b/source/row_neon.asm | |
new file mode 100644 | |
index 0000000..b623358 | |
--- /dev/null | |
+++ b/source/row_neon.asm | |
@@ -0,0 +1,3435 @@ | |
+; | |
+; Copyright 2012 The LibYuv Project Authors. All rights reserved. | |
+; | |
+; Use of this source code is governed by a BSD-style license | |
+; that can be found in the LICENSE file in the root of the source | |
+; tree. An additional intellectual property rights grant can be found | |
+; in the file PATENTS. All contributing project authors may | |
+; be found in the AUTHORS file in the root of the source tree. | |
+; | |
+ | |
+ AREA |.text|, CODE, READONLY, ALIGN=2 | |
+ | |
+ GET source/arm_asm_macros.in | |
+ | |
+ EXPORT I444ToARGBRow_NEON | |
+ EXPORT I422ToARGBRow_NEON | |
+ EXPORT I411ToARGBRow_NEON | |
+ EXPORT I422ToBGRARow_NEON | |
+ EXPORT I422ToABGRRow_NEON | |
+ EXPORT I422AlphaToARGBRow_NEON | |
+ EXPORT I422ToRGB24Row_NEON | |
+ EXPORT I422ToRAWRow_NEON | |
+ EXPORT I422ToRGBARow_NEON | |
+ EXPORT I422ToARGB4444Row_NEON | |
+ EXPORT I422ToARGB1555Row_NEON | |
+ EXPORT I422ToRGB565Row_NEON | |
+ EXPORT I400ToARGBRow_NEON | |
+ EXPORT J400ToARGBRow_NEON | |
+ EXPORT ARGBToRGB24Row_NEON | |
+ EXPORT RAWToRGB24Row_NEON | |
+ EXPORT ARGBToRAWRow_NEON | |
+ EXPORT ARGBToRGB565Row_NEON | |
+ EXPORT ARGBToARGB1555Row_NEON | |
+ EXPORT YUY2ToARGBRow_NEON | |
+ EXPORT UYVYToARGBRow_NEON | |
+ EXPORT ARGBToARGB4444Row_NEON | |
+ EXPORT NV12ToARGBRow_NEON | |
+ EXPORT NV21ToARGBRow_NEON | |
+ EXPORT NV12ToRGB565Row_NEON | |
+ EXPORT NV21ToRGB565Row_NEON | |
+ EXPORT SplitUVRow_NEON | |
+ EXPORT MergeUVRow_NEON | |
+ EXPORT SetRow_NEON | |
+ EXPORT CopyRow_NEON | |
+ EXPORT ARGBSetRow_NEON | |
+ EXPORT MirrorRow_NEON | |
+ EXPORT MirrorUVRow_NEON | |
+ EXPORT ARGBMirrorRow_NEON | |
+ EXPORT RGB24ToARGBRow_NEON | |
+ EXPORT RAWToARGBRow_NEON | |
+ EXPORT RGB565ToARGBRow_NEON | |
+ EXPORT ARGB1555ToARGBRow_NEON | |
+ EXPORT ARGB4444ToARGBRow_NEON | |
+ EXPORT RGBAToUVRow_NEON | |
+ EXPORT ABGRToUVRow_NEON | |
+ EXPORT ABGRToYRow_NEON | |
+ EXPORT RGBAToYRow_NEON | |
+ EXPORT RGB24ToYRow_NEON | |
+ EXPORT ARGB1555ToUVRow_NEON | |
+ EXPORT ARGB4444ToUVRow_NEON | |
+ EXPORT RGB565ToYRow_NEON | |
+ EXPORT RGB565ToUVRow_NEON | |
+ EXPORT ARGB1555ToYRow_NEON | |
+ EXPORT ARGB4444ToYRow_NEON | |
+ EXPORT BGRAToYRow_NEON | |
+ EXPORT ARGBToUV411Row_NEON | |
+ EXPORT ARGBToUV422Row_NEON | |
+ EXPORT ARGBToUV444Row_NEON | |
+ EXPORT YUY2ToUV422Row_NEON | |
+ EXPORT UYVYToUV422Row_NEON | |
+ EXPORT ARGBToBayerGGRow_NEON | |
+ EXPORT ARGBShuffleRow_NEON | |
+ EXPORT ARGBToUVJRow_NEON | |
+ EXPORT BGRAToUVRow_NEON | |
+ EXPORT ABGRToUVRow_NEON | |
+ EXPORT RGBAToUVRow_NEON | |
+ EXPORT ARGBExtractAlphaRow_NEON | |
+ EXPORT ARGBToYJRow_NEON | |
+ EXPORT I422ToUYVYRow_NEON | |
+ EXPORT I422ToYUY2Row_NEON | |
+ EXPORT ARGBToUVRow_NEON | |
+ EXPORT ARGBToYRow_NEON | |
+ EXPORT RAWToUVRow_NEON | |
+ EXPORT RAWToYRow_NEON | |
+ EXPORT RGB24ToUVRow_NEON | |
+ EXPORT UYVYToUVRow_NEON | |
+ EXPORT UYVYToYRow_NEON | |
+ EXPORT ARGBToRGB565DitherRow_NEON | |
+ EXPORT YUY2ToYRow_NEON | |
+ EXPORT YUY2ToUVRow_NEON | |
+ EXPORT SobelToPlaneRow_NEON | |
+ EXPORT SobelRow_NEON | |
+ EXPORT ARGBSubtractRow_NEON | |
+ EXPORT ARGBAddRow_NEON | |
+ EXPORT ARGBAttenuateRow_NEON | |
+ EXPORT ARGBQuantizeRow_NEON | |
+ EXPORT ARGBShadeRow_NEON | |
+ EXPORT ARGBGrayRow_NEON | |
+ EXPORT ARGBSepiaRow_NEON | |
+ EXPORT ARGBColorMatrixRow_NEON | |
+ EXPORT ARGBBlendRow_NEON | |
+ EXPORT InterpolateRow_NEON | |
+ EXPORT ARGBMultiplyRow_NEON | |
+ EXPORT SobelXRow_NEON | |
+ EXPORT SobelYRow_NEON | |
+ EXPORT SobelXYRow_NEON | |
+ | |
+ | |
+; ------- CONSTANTS --------------------- | |
+ | |
+; YUV to RGB conversion constants. | |
+; Y contribution to R,G,B. Scale and bias. | |
+YG EQU 18997 ; round(1.164 * 64 * 256 * 256 / 257) | |
+YGB EQU 1160 ; 1.164 * 64 * 16 - adjusted for even error distribution | |
+ | |
+; U and V contributions to R,G,B | |
+UB EQU -128 ; -min(128, round(2.018 * 64)) | |
+UG EQU 25 ; -round(-0.391 * 64) | |
+VG EQU 52 ; -round(-0.813 * 64) | |
+VR EQU -102 ; -round(1.596 * 64) | |
+ | |
+; Bias values to subtract 16 from Y and 128 from U and V. | |
+BB EQU UB * 128 - YGB | |
+BG EQU UG * 128 + VG * 128 - YGB | |
+BR EQU VR * 128 - YGB | |
+ | |
+ | |
+; ------- ARRAYS ------------------------ | |
+ | |
+kUVToRB DCB 128, 128, 128, 128, 102, 102, 102, 102, 0, 0, 0, 0, 0, 0, 0, 0 | |
+kUVToG DCB 25, 25, 25, 25, 52, 52, 52, 52, 0, 0, 0, 0, 0, 0, 0, 0 | |
+kUVBiasBGR DCW BB, BG, BR, 0, 0, 0, 0, 0 | |
+kYToRgb DCD 0x0101 * YG, 0, 0, 0 | |
+ | |
+; ------- MACROS ------------------------ | |
+ | |
+ MACRO | |
+ YUV422TORGB_SETUP_REG | |
+ adr r5, kUVToRB | |
+ vld1.8 {d24}, [r5] | |
+ adr r5, kUVToG | |
+ vld1.8 {d25}, [r5] | |
+ adr r5, kUVBiasBGR | |
+ vld1.16 {d26[], d27[]}, [r5]! | |
+ vld1.16 {d8[], d9[]}, [r5]! | |
+ vld1.16 {d28[], d29[]}, [r5] | |
+ adr r5, kYToRgb | |
+ vld1.32 {d30[], d31[]}, [r5] | |
+ MEND | |
+ | |
+ ; Read 8 Y, 4 U and 4 V from 422 | |
+ MACRO | |
+ READYUV422 | |
+ MEMACCESS 0 | |
+ vld1.8 {d0}, [r0]! | |
+ MEMACCESS 1 | |
+ vld1.32 {d2[0]}, [r1]! | |
+ MEMACCESS 2 | |
+ vld1.32 {d2[1]}, [r2]! | |
+ MEND | |
+ | |
+ ; Read 8 Y, 2 U and 2 V from 422 | |
+ MACRO | |
+ READYUV411 | |
+ MEMACCESS 0 | |
+ vld1.8 {d0}, [r0]! | |
+ MEMACCESS 1 | |
+ vld1.16 {d2[0]}, [r1]! | |
+ MEMACCESS 2 | |
+ vld1.16 {d2[1]}, [r2]! | |
+ vmov.u8 d3, d2 | |
+ vzip.u8 d2, d3 | |
+ MEND | |
+ | |
+ ; Read 8 Y, 8 U and 8 V from 444 | |
+ MACRO | |
+ READYUV444 | |
+ MEMACCESS 0 | |
+ vld1.8 {d0}, [r0]! | |
+ MEMACCESS 1 | |
+ vld1.8 {d2}, [r1]! | |
+ MEMACCESS 2 | |
+ vld1.8 {d3}, [r2]! | |
+ vpaddl.u8 q1, q1 | |
+ vrshrn.u16 d2, q1, #1 | |
+ MEND | |
+ | |
+ ; Read 8 Y and 4 VU from NV21 | |
+ MACRO | |
+ READNV21 | |
+ MEMACCESS 0 | |
+ vld1.8 {d0}, [r0]! | |
+ MEMACCESS 1 | |
+ vld1.8 {d2}, [r1]! | |
+ vmov.u8 d3, d2 ; split odd/even uv apart | |
+ vuzp.u8 d3, d2 | |
+ vtrn.u32 d2, d3 | |
+ MEND | |
+ | |
+ ; Read 8 Y, and set 4 U and 4 V to 128 | |
+ MACRO | |
+ READYUV400 | |
+ MEMACCESS 0 | |
+ vld1.8 {d0}, [r0]! | |
+ vmov.u8 d2, #128 | |
+ MEND | |
+ | |
+ ; Read 8 Y and 4 UV from NV12 | |
+ MACRO | |
+ READNV12 | |
+ MEMACCESS 0 | |
+ vld1.8 {d0}, [r0]! | |
+ MEMACCESS 1 | |
+ vld1.8 {d2}, [r1]! | |
+ vmov.u8 d3, d2 ; split odd/even uv apart | |
+ vuzp.u8 d2, d3 | |
+ vtrn.u32 d2, d3 | |
+ MEND | |
+ | |
+ ; Read 8 YUY2 | |
+ MACRO | |
+ READYUY2 | |
+ MEMACCESS 0 | |
+ vld2.8 {d0, d2}, [r0]! | |
+ vmov.u8 d3, d2 | |
+ vuzp.u8 d2, d3 | |
+ vtrn.u32 d2, d3 | |
+ MEND | |
+ | |
+ ; Read 8 UYVY | |
+ MACRO | |
+ READUYVY | |
+ MEMACCESS 0 | |
+ vld2.8 {d2, d3}, [r0]! | |
+ vmov.u8 d0, d3 | |
+ vmov.u8 d3, d2 | |
+ vuzp.u8 d2, d3 | |
+ vtrn.u32 d2, d3 | |
+ MEND | |
+ | |
+ MACRO | |
+ ARGBTOARGB4444 | |
+ vshr.u8 d20, d20, #4 ; B | |
+ vbic.32 d21, d21, d4 ; G | |
+ vshr.u8 d22, d22, #4 ; R | |
+ vbic.32 d23, d23, d4 ; A | |
+ vorr d0, d20, d21 ; BG | |
+ vorr d1, d22, d23 ; RA | |
+ vzip.u8 d0, d1 ; BGRA | |
+ MEND | |
+ | |
+ MACRO | |
+ ARGBTOARGB1555 | |
+ vshll.u8 q0, d23, #8 ; A | |
+ vshll.u8 q8, d22, #8 ; R | |
+ vshll.u8 q9, d21, #8 ; G | |
+ vshll.u8 q10, d20, #8 ; B | |
+ vsri.16 q0, q8, #1 ; AR | |
+ vsri.16 q0, q9, #6 ; ARG | |
+ vsri.16 q0, q10, #11 ; ARGB | |
+ MEND | |
+ | |
+ MACRO | |
+ ARGBTORGB565 | |
+ vshll.u8 q0, d22, #8 ; R | |
+ vshll.u8 q8, d21, #8 ; G | |
+ vshll.u8 q9, d20, #8 ; B | |
+ vsri.16 q0, q8, #5 ; RG | |
+ vsri.16 q0, q9, #11 ; RGB | |
+ MEND | |
+ | |
+ MACRO | |
+ YUV422TORGB | |
+ vmull.u8 q8, d2, d24 ; u/v B/R component | |
+ vmull.u8 q9, d2, d25 ; u/v G component | |
+ vmovl.u8 q0, d0 ; Y | |
+ vmovl.s16 q10, d1 | |
+ vmovl.s16 q0, d0 | |
+ vmul.s32 q10, q10, q15 | |
+ vmul.s32 q0, q0, q15 | |
+ vqshrun.s32 d0, q0, #16 | |
+ vqshrun.s32 d1, q10, #16 ; Y | |
+ vadd.s16 d18, d19 | |
+ vshll.u16 q1, d16, #16 ; Replicate u * UB | |
+ vshll.u16 q10, d17, #16 ; Replicate v * VR | |
+ vshll.u16 q3, d18, #16 ; Replicate (v*VG + u*UG) | |
+ vaddw.u16 q1, q1, d16 | |
+ vaddw.u16 q10, q10, d17 | |
+ vaddw.u16 q3, q3, d18 | |
+ vqadd.s16 q8, q0, q13 ; B */ | |
+ vqadd.s16 q9, q0, q14 ; R */ | |
+ vqadd.s16 q0, q0, q4 ; G */ | |
+ vqadd.s16 q8, q8, q1 ; B */ | |
+ vqadd.s16 q9, q9, q10 ; R */ | |
+ vqsub.s16 q0, q0, q3 ; G */ | |
+ vqshrun.s16 d20, q8, #6 ; B */ | |
+ vqshrun.s16 d22, q9, #6 ; R */ | |
+ vqshrun.s16 d21, q0, #6 ; G */ | |
+ MEND | |
+ | |
+ MACRO | |
+ RGB565TOARGB | |
+ vshrn.u16 d6, q0, #5 ; G xxGGGGGG | |
+ vuzp.u8 d0, d1 ; d0 xxxBBBBB RRRRRxxx | |
+ vshl.u8 d6, d6, #2 ; G GGGGGG00 upper 6 | |
+ vshr.u8 d1, d1, #3 ; R 000RRRRR lower 5 | |
+ vshl.u8 q0, q0, #3 ; B,R BBBBB000 upper 5 | |
+ vshr.u8 q2, q0, #5 ; B,R 00000BBB lower 3 | |
+ vorr.u8 d0, d0, d4 ; B | |
+ vshr.u8 d4, d6, #6 ; G 000000GG lower 2 | |
+ vorr.u8 d2, d1, d5 ; R | |
+ vorr.u8 d1, d4, d6 ; G | |
+ MEND | |
+ | |
+ MACRO | |
+ ARGB1555TOARGB | |
+ vshrn.u16 d7, q0, #8 ; A Arrrrrxx | |
+ vshr.u8 d6, d7, #2 ; R xxxRRRRR | |
+ vshrn.u16 d5, q0, #5 ; G xxxGGGGG | |
+ vmovn.u16 d4, q0 ; B xxxBBBBB | |
+ vshr.u8 d7, d7, #7 ; A 0000000A | |
+ vneg.s8 d7, d7 ; A AAAAAAAA upper 8 | |
+ vshl.u8 d6, d6, #3 ; R RRRRR000 upper 5 | |
+ vshr.u8 q1, q3, #5 ; R,A 00000RRR lower 3 | |
+ vshl.u8 q0, q2, #3 ; B,G BBBBB000 upper 5 | |
+ vshr.u8 q2, q0, #5 ; B,G 00000BBB lower 3 | |
+ vorr.u8 q1, q1, q3 ; R,A | |
+ vorr.u8 q0, q0, q2 ; B,G | |
+ MEND | |
+ | |
+ MACRO | |
+ ARGB4444TOARGB | |
+ vuzp.u8 d0, d1 ; d0 BG, d1 RA | |
+ vshl.u8 q2, q0, #4 ; B,R BBBB0000 | |
+ vshr.u8 q1, q0, #4 ; G,A 0000GGGG | |
+ vshr.u8 q0, q2, #4 ; B,R 0000BBBB | |
+ vorr.u8 q0, q0, q2 ; B,R BBBBBBBB | |
+ vshl.u8 q2, q1, #4 ; G,A GGGG0000 | |
+ vorr.u8 q1, q1, q2 ; G,A GGGGGGGG | |
+ vswp.u8 d1, d2 ; B,R,G,A -> B,G,R,A | |
+ MEND | |
+ | |
+	; 16x2 pixels -> 8x1. pix is number of argb pixels. e.g. 16.
+	; Inputs:  $QB/$QG/$QR = 8 averaged 16-bit B/G/R sums.
+	;          Coefficient registers q10-q14 and the +128 bias in q15
+	;          (0x8080) must be preloaded by the caller.
+	; Outputs: d0 = 8 U bytes, d1 = 8 V bytes.  Clobbers q8, q9.
+	MACRO
+	RGBTOUV $QB, $QG, $QR
+	vmul.s16   q8, $QB , q10                  ; B
+	vmls.s16   q8, $QG , q11                  ; G
+	vmls.s16   q8, $QR , q12                  ; R
+	vadd.u16   q8, q8, q15                    ; +128 -> unsigned
+	vmul.s16   q9, $QR , q10                  ; R
+	vmls.s16   q9, $QG , q14                  ; G
+	vmls.s16   q9, $QB , q13                  ; B
+	vadd.u16   q9, q9, q15                    ; +128 -> unsigned
+	vqshrn.u16  d0, q8, #8                    ; 16 bit to 8 bit U
+	vqshrn.u16  d1, q9, #8                    ; 16 bit to 8 bit V
+	MEND
+ | |
+ ; RGB555TOARGB is same as ARGB1555TOARGB but ignores alpha. | |
+ MACRO | |
+ RGB555TOARGB | |
+ vshrn.u16 d6, q0, #5 ; G xxxGGGGG | |
+ vuzp.u8 d0, d1 ; d0 xxxBBBBB xRRRRRxx | |
+ vshl.u8 d6, d6, #3 ; G GGGGG000 upper 5 | |
+ vshr.u8 d1, d1, #2 ; R 00xRRRRR lower 5 | |
+ vshl.u8 q0, q0, #3 ; B,R BBBBB000 upper 5 | |
+ vshr.u8 q2, q0, #5 ; B,R 00000BBB lower 3 | |
+ vorr.u8 d0, d0, d4 ; B | |
+ vshr.u8 d4, d6, #5 ; G 00000GGG lower 3 | |
+ vorr.u8 d2, d1, d5 ; R | |
+ vorr.u8 d1, d4, d6 ; G | |
+ MEND | |
+ | |
+ | |
+; ----- METHODS --------------------------------------- | |
+ | |
+I444ToARGBRow_NEON PROC | |
+ ; input | |
+ ; r0 = const uint8* src_y | |
+ ; r1 = const uint8* src_u | |
+ ; r2 = const uint8* src_v | |
+ ; r3 = uint8* dst_argb | |
+ push {r4, r5} | |
+ ldr r4, [sp,#8] ; int width | |
+ vpush {q0 - q4} | |
+ vpush {q8 - q15} | |
+ | |
+ YUV422TORGB_SETUP_REG | |
+1 | |
+ READYUV444 | |
+ YUV422TORGB | |
+ subs r4, r4, #8 | |
+ vmov.u8 d23, #255 | |
+ MEMACCESS 3 | |
+ vst4.8 {d20, d21, d22, d23}, [r3]! | |
+ bgt %b1 | |
+ | |
+ vpop {q8 - q15} | |
+ vpop {q0 - q4} | |
+ pop {r4, r5} | |
+ bx lr | |
+ ENDP | |
+ | |
+I422ToARGBRow_NEON PROC | |
+ ; input | |
+ ; r0 = const uint8* src_y | |
+ ; r1 = const uint8* src_u | |
+ ; r2 = const uint8* src_v | |
+ ; r3 = uint8* dst_argb | |
+ push {r4, r5} | |
+ ldr r4, [sp,#8] ; int width | |
+ vpush {q0 - q4} | |
+ vpush {q8 - q15} | |
+ | |
+ YUV422TORGB_SETUP_REG | |
+1 | |
+ READYUV422 | |
+ YUV422TORGB | |
+ subs r4, r4, #8 | |
+ vmov.u8 d23, #255 | |
+ MEMACCESS 3 | |
+ vst4.8 {d20, d21, d22, d23}, [r3]! | |
+ bgt %b1 | |
+ | |
+ vpop {q8 - q15} | |
+ vpop {q0 - q4} | |
+ pop {r4, r5} | |
+ bx lr | |
+ ENDP | |
+ | |
+I422AlphaToARGBRow_NEON PROC
+	; Convert 8 I422 pixels plus a separate alpha plane to ARGB per loop.
+	; input
+	; r0 = const uint8* src_y
+	; r1 = const uint8* src_u
+	; r2 = const uint8* src_v
+	; r3 = const uint8* src_a
+	; r4 = uint8* dst_argb
+	; NOTE(review): r4 is used as dst_argb below but is never loaded in
+	; this function -- this assumes the caller passes dst_argb in r4,
+	; which is not standard AAPCS (the 5th argument would be on the
+	; stack, i.e. at [sp,#8] after the push, which is where 'width' is
+	; read from).  TODO confirm the calling shim used by this port.
+	push {r5, r6}
+	ldr r5, [sp,#8] ; int width
+	vpush {q0 - q4}
+	vpush {q8 - q15}
+
+	YUV422TORGB_SETUP_REG
+1
+	READYUV422
+	YUV422TORGB
+	MEMACCESS 3
+	vld1.8 {d23}, [r3]!             ; load 8 alpha bytes
+	subs r5, r5, #8
+	MEMACCESS 4
+	vst4.8 {d20, d21, d22, d23}, [r4]!
+	bgt %b1
+
+	vpop {q8 - q15}
+	vpop {q0 - q4}
+	pop {r5, r6}
+	bx lr
+	ENDP
+ | |
+I422ToRGBARow_NEON PROC | |
+ ; input | |
+ ; r0 = const uint8* src_y | |
+ ; r1 = const uint8* src_u | |
+ ; r2 = const uint8* src_v | |
+ ; r3 = uint8* dst_argb | |
+ push {r4, r5} | |
+ ldr r4, [sp,#8] ; int width | |
+ vpush {q0 - q4} | |
+ vpush {q8 - q15} | |
+ | |
+ YUV422TORGB_SETUP_REG | |
+1 | |
+ READYUV422 | |
+ YUV422TORGB | |
+ subs r4, r4, #8 | |
+ vmov.u8 d19, #255 | |
+ MEMACCESS 3 | |
+ vst4.8 {d19, d20, d21, d22}, [r3]! | |
+ bgt %b1 | |
+ | |
+ vpop {q8 - q15} | |
+ vpop {q0 - q4} | |
+ pop {r4, r5} | |
+ bx lr | |
+ ENDP | |
+ | |
+ | |
+I411ToARGBRow_NEON PROC | |
+ ; input | |
+ ; r0 = const uint8* src_y | |
+ ; r1 = const uint8* src_u | |
+ ; r2 = const uint8* src_v | |
+ ; r3 = uint8* dst_argb | |
+ push {r4, r5} | |
+ ldr r4, [sp,#8] ; int width | |
+ vpush {q0 - q4} | |
+ vpush {q8 - q15} | |
+ | |
+ YUV422TORGB_SETUP_REG | |
+1 | |
+ READYUV411 | |
+ YUV422TORGB | |
+ subs r4, r4, #8 | |
+ vmov.u8 d23, #255 | |
+ MEMACCESS 3 | |
+ vst4.8 {d20, d21, d22, d23}, [r3]! | |
+ bgt %b1 | |
+ | |
+ vpop {q8 - q15} | |
+ vpop {q0 - q4} | |
+ pop {r4, r5} | |
+ bx lr | |
+ ENDP | |
+ | |
+ | |
+I422ToBGRARow_NEON PROC | |
+ ; input | |
+ ; r0 = const uint8* src_y | |
+ ; r1 = const uint8* src_u | |
+ ; r2 = const uint8* src_v | |
+ ; r3 = uint8* dst_bgra | |
+ push {r4, r5} | |
+ ldr r4, [sp,#8] ; int width | |
+ vpush {q0 - q4} | |
+ vpush {q8 - q15} | |
+ | |
+ YUV422TORGB_SETUP_REG | |
+1 | |
+ READYUV422 | |
+ YUV422TORGB | |
+ subs r4, r4, #8 | |
+ vswp.u8 d20, d22 | |
+ vmov.u8 d19, #255 | |
+ MEMACCESS 3 | |
+ vst4.8 {d19, d20, d21, d22}, [r3]! | |
+ bgt %b1 | |
+ | |
+ vpop {q8 - q15} | |
+ vpop {q0 - q4} | |
+ pop {r4, r5} | |
+ bx lr | |
+ ENDP | |
+ | |
+ | |
+I422ToABGRRow_NEON PROC | |
+ ; input | |
+ ; r0 = const uint8* src_y | |
+ ; r1 = const uint8* src_u | |
+ ; r2 = const uint8* src_v | |
+ ; r3 = uint8* dst_abgr | |
+ push {r4, r5} | |
+ ldr r4, [sp,#8] ; int width | |
+ vpush {q0 - q4} | |
+ vpush {q8 - q15} | |
+ | |
+ YUV422TORGB_SETUP_REG | |
+1 | |
+ READYUV422 | |
+ YUV422TORGB | |
+ subs r4, r4, #8 | |
+ vswp.u8 d20, d22 | |
+ vmov.u8 d23, #255 | |
+ MEMACCESS 3 | |
+ vst4.8 {d20, d21, d22, d23}, [r3]! | |
+ bgt %b1 | |
+ | |
+ vpop {q8 - q15} | |
+ vpop {q0 - q4} | |
+ pop {r4, r5} | |
+ bx lr | |
+ ENDP | |
+ | |
+I422ToRGB24Row_NEON PROC | |
+ ; input | |
+ ; r0 = const uint8* src_y | |
+ ; r1 = const uint8* src_u | |
+ ; r2 = const uint8* src_v | |
+ ; r3 = uint8* dst_rgb24 | |
+ push {r4, r5} | |
+ ldr r4, [sp,#8] ; int width | |
+ vpush {q0 - q4} | |
+ vpush {q8 - q15} | |
+ | |
+ YUV422TORGB_SETUP_REG | |
+1 | |
+ READYUV422 | |
+ YUV422TORGB | |
+ subs r4, r4, #8 | |
+ MEMACCESS 3 | |
+ vst3.8 {d20, d21, d22}, [r3]! | |
+ bgt %b1 | |
+ | |
+ vpop {q8 - q15} | |
+ vpop {q0 - q4} | |
+ pop {r4, r5} | |
+ bx lr | |
+ ENDP | |
+ | |
+I422ToRAWRow_NEON PROC | |
+ ; input | |
+ ; r0 = const uint8* src_y | |
+ ; r1 = const uint8* src_u | |
+ ; r2 = const uint8* src_v | |
+ ; r3 = uint8* dst_raw | |
+ push {r4, r5} | |
+ ldr r4, [sp,#8] ; int width | |
+ vpush {q0 - q4} | |
+ vpush {q8 - q15} | |
+ | |
+ YUV422TORGB_SETUP_REG | |
+1 | |
+ READYUV422 | |
+ YUV422TORGB | |
+ subs r4, r4, #8 | |
+ vswp.u8 d20, d22 | |
+ MEMACCESS 3 | |
+ vst3.8 {d20, d21, d22}, [r3]! | |
+ bgt %b1 | |
+ | |
+ vpop {q8 - q15} | |
+ vpop {q0 - q4} | |
+ pop {r4, r5} | |
+ bx lr | |
+ ENDP | |
+ | |
+I422ToARGB4444Row_NEON PROC | |
+ ; input | |
+ ; r0 = const uint8* src_y | |
+ ; r1 = const uint8* src_u | |
+ ; r2 = const uint8* src_v | |
+ ; r3 = uint8* dst_argb4444 | |
+ push {r4, r5} | |
+ ldr r4, [sp,#8] ; int width | |
+ vpush {q0 - q4} | |
+ vpush {q8 - q15} | |
+ | |
+ YUV422TORGB_SETUP_REG | |
+ vmov.u8 d4, #0x0f ; bits to clear with vbic. | |
+1 | |
+ READYUV422 | |
+ YUV422TORGB | |
+ subs r4, r4, #8 | |
+ vmov.u8 d23, #255 | |
+ ARGBTOARGB4444 | |
+ MEMACCESS 3 | |
+ vst1.8 {q0}, [r3]! ; store 8 pixels ARGB4444. | |
+ bgt %b1 | |
+ | |
+ vpop {q8 - q15} | |
+ vpop {q0 - q4} | |
+ pop {r4, r5} | |
+ bx lr | |
+ ENDP | |
+ | |
+I422ToARGB1555Row_NEON PROC | |
+ ; input | |
+ ; r0 = const uint8* src_y | |
+ ; r1 = const uint8* src_u | |
+ ; r2 = const uint8* src_v | |
+ ; r3 = uint8* dst_argb1555 | |
+ push {r4, r5} | |
+ ldr r4, [sp,#8] ; int width | |
+ vpush {q0 - q4} | |
+ vpush {q8 - q15} | |
+ | |
+ YUV422TORGB_SETUP_REG | |
+1 | |
+ READYUV422 | |
+ YUV422TORGB | |
+ subs r4, r4, #8 | |
+ vmov.u8 d23, #255 | |
+ ARGBTOARGB1555 | |
+ MEMACCESS 3 | |
+ vst1.8 {q0}, [r3]! ; store 8 pixels ARGB1555. | |
+ bgt %b1 | |
+ | |
+ vpop {q8 - q15} | |
+ vpop {q0 - q4} | |
+ pop {r4, r5} | |
+ bx lr | |
+ ENDP | |
+ | |
+I422ToRGB565Row_NEON PROC
+	; Convert 8 I422 (YUV 4:2:2 planar) pixels per loop to RGB565.
+	; input
+	; r0 = const uint8* src_y
+	; r1 = const uint8* src_u
+	; r2 = const uint8* src_v
+	; r3 = uint8* dst_rgb565
+	push {r4, r5}
+	ldr r4, [sp,#8] ; int width
+	vpush {q0 - q4}
+	vpush {q8 - q15}
+
+	YUV422TORGB_SETUP_REG
+1
+	READYUV422
+	YUV422TORGB
+	subs r4, r4, #8
+	ARGBTORGB565
+	MEMACCESS 3
+	vst1.8 {q0}, [r3]! ; store 8 pixels RGB565.
+	bgt %b1
+
+	vpop {q8 - q15}
+	vpop {q0 - q4}
+	pop {r4, r5}
+	bx lr
+	ENDP
+ | |
+I400ToARGBRow_NEON PROC
+	; Convert 8 luma-only (I400) pixels per loop to ARGB with A = 255.
+	; input (original comments said src_argb/dst_rgb24, which did not
+	; match the code; corrected below)
+	; r0 = const uint8* src_y
+	; r1 = uint8* dst_argb
+	; r2 = int width
+	push {r5}                       ; r5 is not modified here -- presumably kept for symmetry with siblings; TODO confirm
+	vpush {q0 - q4}
+	vpush {q8 - q15}
+
+	YUV422TORGB_SETUP_REG
+1
+	READYUV400
+	YUV422TORGB
+	subs r2, r2, #8
+	vmov.u8 d23, #255
+	MEMACCESS 1
+	vst4.8 {d20, d21, d22, d23}, [r1]!
+	bgt %b1
+
+
+	vpop {q8 - q15}
+	vpop {q0 - q4}
+	pop {r5}
+	bx lr
+	ENDP
+ | |
+J400ToARGBRow_NEON PROC
+	; Convert 8 J400 (full-range luma) pixels per loop to ARGB by
+	; replicating Y into B, G and R, with A = 255.
+	; input
+	; r0 = const uint8* src_y
+	; r1 = uint8* dst_argb
+	; r2 = int width
+	push {r5}
+	vpush {d20 - d23}
+
+	vmov.u8 d23, #255               ; constant alpha
+1
+	MEMACCESS 0
+	vld1.8 {d20}, [r0]!             ; load 8 Y
+	vmov d21, d20                   ; G = Y
+	vmov d22, d20                   ; R = Y
+	subs r2, r2, #8
+	MEMACCESS 1
+	vst4.8 {d20, d21, d22, d23}, [r1]!
+	bgt %b1
+
+
+	vpop {d20 - d23}
+	pop {r5}
+	bx lr
+	ENDP
+ | |
+ | |
+ARGBToRGB24Row_NEON PROC
+	; Drop the alpha channel: 8 ARGB pixels -> 8 RGB24 pixels per loop.
+	; input
+	; r0 = const uint8* src_argb
+	; r1 = uint8* dst_rgb24
+	; r2 = int pix
+	vpush {d1 - d4}
+
+1
+	MEMACCESS 0
+	vld4.8 {d1, d2, d3, d4}, [r0]! ; load 8 pixels of ARGB.
+	subs r2, r2, #8 ; 8 processed per loop.
+	MEMACCESS 1
+	vst3.8 {d1, d2, d3}, [r1]! ; store 8 pixels of RGB24 (alpha in d4 dropped).
+	bgt %b1
+
+	vpop {d1 - d4}
+	bx lr
+	ENDP
+ | |
+ARGBToRAWRow_NEON PROC
+	; Convert 8 ARGB pixels -> 8 RAW (RGB byte order reversed, no alpha)
+	; pixels per loop.
+	; input
+	; r0 = const uint8* src_argb
+	; r1 = uint8* dst_raw
+	; r2 = int pix
+	vpush {d1 - d4}
+
+1
+	MEMACCESS 0
+	vld4.8 {d1, d2, d3, d4}, [r0]! ; load 8 pixels of ARGB.
+	subs r2, r2, #8 ; 8 processed per loop.
+	vswp.u8 d1, d3 ; swap R, B
+	MEMACCESS 1
+	vst3.8 {d1, d2, d3}, [r1]! ; store 8 pixels of RAW.
+	bgt %b1
+
+	vpop {d1 - d4}
+	bx lr
+	ENDP
+ | |
+ARGBToRGB565Row_NEON PROC | |
+ ; input | |
+ ; r0 = const uint8* src_argb | |
+ ; r1 = uint8* dst_rgb565 | |
+ ; r2 = pix | |
+ vpush {q0} | |
+ vpush {q8 - q11} | |
+ | |
+1 | |
+ MEMACCESS 0 | |
+ vld4.8 {d20, d21, d22, d23}, [r0]! ; load 8 pixels of ARGB. | |
+ subs r2, r2, #8 ; 8 processed per loop. | |
+ ARGBTORGB565 | |
+ MEMACCESS 1 | |
+ vst1.8 {q0}, [r1]! ; store 8 pixels RGB565. | |
+ bgt %b1 | |
+ | |
+ vpop {q8 - q11} | |
+ vpop {q0} | |
+ bx lr | |
+ ENDP | |
+ | |
+ARGBToARGB1555Row_NEON PROC | |
+ ; input | |
+ ; r0 = const uint8* src_argb | |
+ ; r1 = uint8* dst_argb1555 | |
+ ; r2 = pix | |
+ vpush {q0} | |
+ vpush {q8 - q11} | |
+ | |
+1 | |
+ MEMACCESS 0 | |
+ vld4.8 {d20, d21, d22, d23}, [r0]! ; load 8 pixels of ARGB. | |
+ subs r2, r2, #8 ; 8 processed per loop. | |
+ ARGBTOARGB1555 | |
+ MEMACCESS 1 | |
+ vst1.8 {q0}, [r1]! ; store 8 pixels ARGB1555. | |
+ bgt %b1 | |
+ | |
+ vpop {q8 - q11} | |
+ vpop {q0} | |
+ bx lr | |
+ ENDP | |
+ | |
+ARGBToARGB4444Row_NEON PROC | |
+ ; input | |
+ ; r0 = const uint8* src_argb | |
+ ; r1 = uint8* dst_argb4444 | |
+ ; r2 = pix | |
+ vpush {q0} | |
+ vpush {q8 - q11} | |
+ | |
+ vmov.u8 d4, #0x0f ; bits to clear with vbic. | |
+1 | |
+ MEMACCESS 0 | |
+ vld4.8 {d20, d21, d22, d23}, [r0]! ; load 8 pixels of ARGB. | |
+ subs r2, r2, #8 ; 8 processed per loop. | |
+ ARGBTOARGB4444 | |
+ MEMACCESS 1 | |
+ vst1.8 {q0}, [r1]! ; store 8 pixels ARGB4444. | |
+ bgt %b1 | |
+ | |
+ vpop {q8 - q11} | |
+ vpop {q0} | |
+ bx lr | |
+ ENDP | |
+ | |
+NV12ToARGBRow_NEON PROC | |
+ ; input | |
+ ; r0 = const uint8* src_y | |
+ ; r1 = const uint8* src_uv | |
+ ; r2 = uint8* dst_argb | |
+ ; r3 = int width | |
+ push {r5} | |
+ vpush {q0 - q4} | |
+ vpush {q8 - q15} | |
+ | |
+ YUV422TORGB_SETUP_REG | |
+ | |
+1 | |
+ READNV12 | |
+ YUV422TORGB | |
+ subs r3, r3, #8 | |
+ vmov.u8 d23, #255 | |
+ MEMACCESS 2 | |
+ vst4.8 {d20, d21, d22, d23}, [r2]! | |
+ bgt %b1 | |
+ | |
+ vpop {q8 - q15} | |
+ vpop {q0 - q4} | |
+ pop {r5} | |
+ bx lr | |
+ ENDP | |
+ | |
+NV21ToARGBRow_NEON PROC | |
+ ; input | |
+ ; r0 = const uint8* src_y | |
+ ; r1 = const uint8* src_uv | |
+ ; r2 = uint8* dst_argb | |
+ ; r3 = int width | |
+ push {r5} | |
+ vpush {q0 - q4} | |
+ vpush {q8 - q15} | |
+ | |
+ YUV422TORGB_SETUP_REG | |
+ | |
+1 | |
+ READNV21 | |
+ YUV422TORGB | |
+ subs r3, r3, #8 | |
+ vmov.u8 d23, #255 | |
+ MEMACCESS 2 | |
+ vst4.8 {d20, d21, d22, d23}, [r2]! | |
+ bgt %b1 | |
+ | |
+ vpop {q8 - q15} | |
+ vpop {q0 - q4} | |
+ pop {r5} | |
+ bx lr | |
+ ENDP | |
+ | |
+NV12ToRGB565Row_NEON PROC | |
+ ; input | |
+ ; r0 = const uint8* src_y | |
+ ; r1 = const uint8* src_uv | |
+ ; r2 = uint8* dst_rgb565 | |
+ ; r3 = int width | |
+ push {r5} | |
+ vpush {q0 - q4} | |
+ vpush {q8 - q15} | |
+ | |
+ YUV422TORGB_SETUP_REG | |
+ | |
+1 | |
+ READNV12 | |
+ YUV422TORGB | |
+ subs r3, r3, #8 | |
+ ARGBTORGB565 | |
+ MEMACCESS 2 | |
+ vst1.8 {q0}, [r2]! ; store 8 pixels RGB565. | |
+ bgt %b1 | |
+ | |
+ vpop {q8 - q15} | |
+ vpop {q0 - q4} | |
+ pop {r5} | |
+ bx lr | |
+ ENDP | |
+ | |
+NV21ToRGB565Row_NEON PROC | |
+ ; input | |
+ ; r0 = const uint8* src_y | |
+ ; r1 = const uint8* src_uv | |
+ ; r2 = uint8* dst_rgb565 | |
+ ; r3 = int width | |
+ push {r5} | |
+ vpush {q0 - q4} | |
+ vpush {q8 - q15} | |
+ | |
+ YUV422TORGB_SETUP_REG | |
+ | |
+1 | |
+ READNV21 | |
+ YUV422TORGB | |
+ subs r3, r3, #8 | |
+ ARGBTORGB565 | |
+ MEMACCESS 2 | |
+ vst1.8 {q0}, [r2]! ; store 8 pixels RGB565. | |
+ bgt %b1 | |
+ | |
+ vpop {q8 - q15} | |
+ vpop {q0 - q4} | |
+ pop {r5} | |
+ bx lr | |
+ ENDP | |
+ | |
+YUY2ToARGBRow_NEON PROC | |
+ ; input | |
+ ; r0 = const uint8* src_yuy2 | |
+ ; r1 = uint8* dst_argb | |
+ ; r2 = width | |
+ push {r5} | |
+ vpush {q0 - q4} | |
+ vpush {q8 - q15} | |
+ | |
+ YUV422TORGB_SETUP_REG | |
+1 | |
+ READYUY2 | |
+ YUV422TORGB | |
+ subs r2, r2, #8 | |
+ vmov.u8 d23, #255 | |
+ MEMACCESS 1 | |
+ vst4.8 {d20, d21, d22, d23}, [r1]! | |
+ bgt %b1 | |
+ | |
+ vpop {q8 - q15} | |
+ vpop {q0 - q4} | |
+ pop {r5} | |
+ bx lr | |
+ ENDP | |
+ | |
+UYVYToARGBRow_NEON PROC | |
+ ; input | |
+ ; r0 = const uint8* src_uyvy | |
+ ; r1 = uint8* dst_argb | |
+ ; r2 = width | |
+ push {r5} | |
+ vpush {q0 - q4} | |
+ vpush {q8 - q15} | |
+ | |
+ YUV422TORGB_SETUP_REG | |
+1 | |
+ READUYVY | |
+ YUV422TORGB | |
+ subs r2, r2, #8 | |
+ vmov.u8 d23, #255 | |
+ MEMACCESS 1 | |
+ vst4.8 {d20, d21, d22, d23}, [r1]! | |
+ bgt %b1 | |
+ | |
+ vpop {q8 - q15} | |
+ vpop {q0 - q4} | |
+ pop {r5} | |
+ bx lr | |
+ ENDP | |
+ | |
+; Reads 16 pairs of UV and write even values to dst_u and odd to dst_v. | |
+SplitUVRow_NEON PROC | |
+ ; input | |
+ ; r0 = const uint8* src_uv | |
+ ; r1 = uint8* dst_u | |
+ ; r2 = uint8* dst_v | |
+ ; r3 = int width | |
+ vpush {q0, q1} | |
+ | |
+1 | |
+ MEMACCESS 0 | |
+ vld2.8 {q0, q1}, [r0]! ; load 16 pairs of UV | |
+ subs r3, r3, #16 ; 16 processed per loop | |
+ MEMACCESS 1 | |
+ vst1.8 {q0}, [r1]! ; store U | |
+ MEMACCESS 2 | |
+ vst1.8 {q1}, [r2]! ; store V | |
+ bgt %b1 | |
+ | |
+ vpop {q0, q1} | |
+ bx lr | |
+ ENDP | |
+ | |
+; Reads 16 U's and V's and writes out 16 pairs of UV | |
+MergeUVRow_NEON PROC | |
+ ; input | |
+ ; r0 = const uint8* src_u | |
+ ; r1 = uint8* src_v | |
+ ; r2 = uint8* dst_uv | |
+ ; r3 = int width | |
+ vpush {q0, q1} | |
+ | |
+1 | |
+ MEMACCESS 0 | |
+ vld1.8 {q0}, [r0]! ; load U | |
+ MEMACCESS 1 | |
+ vld1.8 {q1}, [r1]! ; load V | |
+ subs r3, r3, #16 ; 16 processed per loop | |
+ MEMACCESS 2 | |
+ vst2.u8 {q0, q1}, [r2]! ; store 16 pairs of UV | |
+ bgt %b1 | |
+ | |
+ vpop {q0, q1} | |
+ bx lr | |
+ ENDP | |
+ | |
+; Copy multiple of 32. vld4.8 allow unaligned and is fastest on a15. | |
+CopyRow_NEON PROC | |
+ ; input | |
+ ; r0 = const uint8* src | |
+ ; r1 = uint8* dst | |
+ ; r2 = int count | |
+ vpush {q0, q1} | |
+ | |
+1 | |
+ MEMACCESS 0 | |
+ vld1.8 {d0, d1, d2, d3}, [r0]! ; load 32 | |
+ subs r2, r2, #32 ; 32 processed per loop | |
+ MEMACCESS 1 | |
+ vst1.8 {d0, d1, d2, d3}, [r1]! ; store 32 | |
+ bgt %b1 | |
+ | |
+ vpop {q0, q1} | |
+ bx lr | |
+ ENDP | |
+ | |
+; SetRow writes 'count' bytes using an 8 bit value repeated | |
+SetRow_NEON PROC | |
+ ; input | |
+ ; r0 = const uint8* src | |
+ ; r1 = uint8* v8 | |
+ ; r2 = int count | |
+ vpush {q0} | |
+ | |
+ vdup.8 q0, r1 ; duplicate 16 bytes | |
+1 | |
+ subs r2, r2, #16 ; 16 bytes per loop | |
+ MEMACCESS 0 | |
+ vst1.8 {q0}, [r0]! ; store | |
+ bgt %b1 | |
+ | |
+ vpop {q0} | |
+ bx lr | |
+ ENDP | |
+ | |
+; ARGBSetRow writes 'count' pixels using an 32 bit value repeated. | |
+ARGBSetRow_NEON PROC | |
+ ; input | |
+ ; r0 = const uint8* dst | |
+ ; r1 = uint8* v32 | |
+ ; r2 = int count | |
+ vpush {q0} | |
+ | |
+ vdup.u32 q0, r1 ; duplicate 4 ints | |
+1 | |
+ subs r2, r2, #4 ; 4 pixels per loop | |
+ MEMACCESS 0 | |
+ vst1.8 {q0}, [r0]! ; store | |
+ bgt %b1 | |
+ | |
+ vpop {q0} | |
+ bx lr | |
+ ENDP | |
+ | |
+MirrorRow_NEON PROC | |
+ ; input | |
+ ; r0 = const uint8* src | |
+ ; r1 = uint8* dst | |
+ ; r2 = int width | |
+ push {r3} | |
+ vpush {q0} | |
+ ; Start at end of source row. | |
+ mov r3, #-16 | |
+ add r0, r0, r2 | |
+ sub r0, #16 | |
+ | |
+1 | |
+ MEMACCESS 0 | |
+ vld1.8 {q0}, [r0], r3 ; src -= 16 | |
+ subs r2, #16 ; 16 pixels per loop. | |
+ vrev64.8 q0, q0 | |
+ MEMACCESS 1 | |
+ vst1.8 {d1}, [r1]! ; dst += 16 | |
+ MEMACCESS 1 | |
+ vst1.8 {d0}, [r1]! | |
+ bgt %b1 | |
+ | |
+ vpop {q0} | |
+ pop {r3} | |
+ bx lr | |
+ ENDP | |
+ | |
+ | |
+MirrorUVRow_NEON PROC
+	; Mirror an interleaved UV row horizontally, splitting it into
+	; separate U and V planes (8 UV pairs per loop).
+	; input (original comments were truncated/mistyped; corrected)
+	; r0 = const uint8* src_uv
+	; r1 = uint8* dst_u
+	; r2 = uint8* dst_v
+	; r3 = int width
+	push {r12}
+	vpush {q0}
+	; Start at end of source row.
+	mov r12, #-16
+	add r0, r0, r3, lsl #1          ; width UV pairs = 2*width bytes
+	sub r0, #16
+
+1
+	MEMACCESS 0
+	vld2.8 {d0, d1}, [r0], r12 ; src -= 16
+	subs r3, #8 ; 8 pixels per loop.
+	vrev64.8 q0, q0                 ; reverse bytes within each plane
+	MEMACCESS 1
+	vst1.8 {d0}, [r1]! ; dst += 8
+	MEMACCESS 2
+	vst1.8 {d1}, [r2]!
+	bgt %b1
+
+	vpop {q0}
+	pop {r12}
+	bx lr
+	ENDP
+ | |
+ARGBMirrorRow_NEON PROC | |
+ ; input | |
+ ; r0 = const uint8* src | |
+ ; r1 = uint8* dst | |
+ ; r2 = int width | |
+ push {r3} | |
+ vpush {q0} | |
+ | |
+ ; Start at end of source row. | |
+ mov r3, #-16 | |
+ add r0, r0, r2, lsl #2 | |
+ sub r0, #16 | |
+ | |
+1 | |
+ MEMACCESS 0 | |
+ vld1.8 {q0}, [r0], r3 ; src -= 16 | |
+ subs r2, #4 ; 4 pixels per loop. | |
+ vrev64.32 q0, q0 | |
+ MEMACCESS 1 | |
+ vst1.8 {d1}, [r1]! ; dst += 16 | |
+ MEMACCESS 1 | |
+ vst1.8 {d0}, [r1]! | |
+ bgt %b1 | |
+ | |
+ vpop {q0} | |
+ pop {r3} | |
+ bx lr | |
+ ENDP | |
+ | |
+ | |
+RGB24ToARGBRow_NEON PROC | |
+ ; input | |
+ ; r0 = const uint8* src_rgb24 | |
+ ; r1 = uint8* dst_argb | |
+ ; r2 = int pix | |
+ vpush {d1 - d4} | |
+ vmov.u8 d4, #255 ; Alpha | |
+ | |
+1 | |
+ MEMACCESS 0 | |
+ vld3.8 {d1, d2, d3}, [r0]! ; load 8 pixels of RGB24. | |
+ subs r2, r2, #8 ; 8 processed per loop. | |
+ MEMACCESS 1 | |
+ vst4.8 {d1, d2, d3, d4}, [r1]! ; store 8 pixels of ARGB. | |
+ bgt %b1 | |
+ | |
+ vpop {d1 - d4} | |
+ bx lr | |
+ ENDP | |
+ | |
+RAWToARGBRow_NEON PROC | |
+ ; input | |
+ ; r0 = const uint8* src_raw | |
+ ; r1 = uint8* dst_argb | |
+ ; r2 = int pix | |
+ vpush {d1 - d4} | |
+ vmov.u8 d4, #255 ; Alpha | |
+ | |
+1 | |
+ MEMACCESS 0 | |
+ vld3.8 {d1, d2, d3}, [r0]! ; load 8 pixels of RAW. | |
+ subs r2, r2, #8 ; 8 processed per loop. | |
+ vswp.u8 d1, d3 ; swap R, B | |
+ MEMACCESS 1 | |
+ vst4.8 {d1, d2, d3, d4}, [r1]! ; store 8 pixels of ARGB. | |
+ bgt %b1 | |
+ | |
+ vpop {d1 - d4} | |
+ bx lr | |
+ ENDP | |
+ | |
+RAWToRGB24Row_NEON PROC | |
+ ; input | |
+ ; r0 = const uint8* src_raw | |
+ ; r1 = uint8* dst_rgb24 | |
+ ; r2 = int width | |
+ vpush {d1 - d4} | |
+ | |
+1 | |
+ MEMACCESS 0 | |
+ vld3.8 {d1, d2, d3}, [r0]! ; load 8 pixels of RAW. | |
+ subs r2, r2, #8 ; 8 processed per loop. | |
+ vswp.u8 d1, d3 ; swap R, B | |
+ MEMACCESS 1 | |
+ vst3.8 {d1, d2, d3}, [r1]! ; store 8 pixels of b g r. | |
+ bgt %b1 | |
+ | |
+ vpop {d1 - d4} | |
+ bx lr | |
+ ENDP | |
+ | |
+RGB565ToARGBRow_NEON PROC | |
+ ; input | |
+ ; r0 = const uint8* src_rgb565 | |
+ ; r1 = uint8* dst_argb | |
+ ; r2 = int pix | |
+ vpush {q0 - q3} | |
+ vmov.u8 d3, #255 ; Alpha | |
+ | |
+1 | |
+ MEMACCESS 0 | |
+ vld1.8 {q0}, [r0]! ; load 8 RGB565 pixels. | |
+ subs r2, r2, #8 ; 8 processed per loop. | |
+ RGB565TOARGB | |
+ MEMACCESS 1 | |
+ vst4.8 {d0, d1, d2, d3}, [r1]! ; store 8 pixels of ARGB. | |
+ bgt %b1 | |
+ | |
+ vpop {q0 - q3} | |
+ bx lr | |
+ ENDP | |
+ | |
+ARGB1555ToARGBRow_NEON PROC | |
+ ; input | |
+ ; r0 = const uint8* src_argb1555 | |
+ ; r1 = uint8* dst_argb | |
+ ; r2 = int pix | |
+ vpush {q0 - q3} | |
+ vmov.u8 d3, #255 ; Alpha | |
+ | |
+1 | |
+ MEMACCESS 0 | |
+ vld1.8 {q0}, [r0]! ; load 8 ARGB1555 pixels. | |
+ subs r2, r2, #8 ; 8 processed per loop. | |
+ ARGB1555TOARGB | |
+ MEMACCESS 1 | |
+ vst4.8 {d0, d1, d2, d3}, [r1]! ; store 8 pixels of ARGB. | |
+ bgt %b1 | |
+ | |
+ vpop {q0 - q3} | |
+ bx lr | |
+ ENDP | |
+ | |
+ARGB4444ToARGBRow_NEON PROC | |
+ ; input | |
+ ; r0 = const uint8* src_argb4444 | |
+ ; r1 = uint8* dst_argb | |
+ ; r2 = int pix | |
+ vpush {q0 - q2} | |
+ vmov.u8 d3, #255 ; Alpha | |
+ | |
+1 | |
+ MEMACCESS 0 | |
+ vld1.8 {q0}, [r0]! ; load 8 ARGB4444 pixels. | |
+ subs r2, r2, #8 ; 8 processed per loop. | |
+ ARGB4444TOARGB | |
+ MEMACCESS 1 | |
+ vst4.8 {d0, d1, d2, d3}, [r1]! ; store 8 pixels of ARGB. | |
+ bgt %b1 | |
+ | |
+ vpop {q0 - q2} | |
+ bx lr | |
+ ENDP | |
+ | |
+ABGRToUVRow_NEON PROC
+	; 16x2 ABGR pixels -> 8x1 U and V (2x2 box average, then BT.601-style
+	; coefficients applied by the inline math below).
+	; input
+	; r0 = const uint8* src_abgr
+	; r1 = int src_stride_abgr
+	; r2 = uint8* dst_u
+	; r3 = uint8* dst_v
+	push {r4}
+	ldr r4, [sp,#4] ; int width
+	; Save q0-q7 (covers callee-saved d8-d15) plus scratch q8-q15.
+	; Previously q7 was pushed twice ({q0 - q7} then {q7 - q14});
+	; {q8 - q14} is sufficient and matches ARGB1555ToUVRow_NEON.
+	vpush {q0 - q7}
+	vpush {q8 - q14}
+	vpush {q15}
+
+	add r1, r0, r1 ; src_stride + src_abgr -> second row
+	vmov.s16 q10, #112 / 2 ; UB / VR 0.875 coefficient
+	vmov.s16 q11, #74 / 2 ; UG -0.5781 coefficient
+	vmov.s16 q12, #38 / 2 ; UR -0.2969 coefficient
+	vmov.s16 q13, #18 / 2 ; VB -0.1406 coefficient
+	vmov.s16 q14, #94 / 2 ; VG -0.7344 coefficient
+	vmov.u16 q15, #0x8080 ; 128.5
+
+1
+	MEMACCESS 0
+	vld4.8 {d0, d2, d4, d6}, [r0]! ; load 8 ABGR pixels.
+	MEMACCESS 0
+	vld4.8 {d1, d3, d5, d7}, [r0]! ; load next 8 ABGR pixels.
+	vpaddl.u8 q2, q2 ; B 16 bytes -> 8 shorts.
+	vpaddl.u8 q1, q1 ; G 16 bytes -> 8 shorts.
+	vpaddl.u8 q0, q0 ; R 16 bytes -> 8 shorts.
+	MEMACCESS 1
+	vld4.8 {d8, d10, d12, d14}, [r1]! ; load 8 more ABGR pixels.
+	MEMACCESS 1
+	vld4.8 {d9, d11, d13, d15}, [r1]! ; load last 8 ABGR pixels.
+	vpadal.u8 q2, q6 ; B 16 bytes -> 8 shorts.
+	vpadal.u8 q1, q5 ; G 16 bytes -> 8 shorts.
+	vpadal.u8 q0, q4 ; R 16 bytes -> 8 shorts.
+
+	vrshr.u16 q0, q0, #1 ; 2x average
+	vrshr.u16 q1, q1, #1
+	vrshr.u16 q2, q2, #1
+
+	subs r4, r4, #16 ; 16 pixels processed per loop.
+	RGBTOUV q2, q1, q0
+	MEMACCESS 2
+	vst1.8 {d0}, [r2]! ; store 8 pixels U.
+	MEMACCESS 3
+	vst1.8 {d1}, [r3]! ; store 8 pixels V.
+	bgt %b1
+
+	vpop {q15}
+	vpop {q8 - q14}
+	vpop {q0 - q7}
+	pop {r4}
+	bx lr
+	ENDP
+ | |
+RGBAToUVRow_NEON PROC
+	; 16x2 RGBA pixels -> 8x1 U and V (2x2 box average, then BT.601-style
+	; coefficients applied via the RGBTOUV macro).
+	; input
+	; r0 = const uint8* src_rgba
+	; r1 = int src_stride_rgba
+	; r2 = uint8* dst_u
+	; r3 = uint8* dst_v
+	push {r4}
+	ldr r4, [sp,#4] ; int width
+	; Save q0-q7 (covers callee-saved d8-d15) plus scratch q8-q15.
+	; Previously q7 was pushed twice ({q0 - q7} then {q7 - q14});
+	; {q8 - q14} is sufficient and matches ARGB1555ToUVRow_NEON.
+	vpush {q0 - q7}
+	vpush {q8 - q14}
+	vpush {q15}
+
+	add r1, r0, r1 ; src_stride + src_rgba -> second row
+	vmov.s16 q10, #112 / 2 ; UB / VR 0.875 coefficient
+	vmov.s16 q11, #74 / 2 ; UG -0.5781 coefficient
+	vmov.s16 q12, #38 / 2 ; UR -0.2969 coefficient
+	vmov.s16 q13, #18 / 2 ; VB -0.1406 coefficient
+	vmov.s16 q14, #94 / 2 ; VG -0.7344 coefficient
+	vmov.u16 q15, #0x8080 ; 128.5
+
+1
+	MEMACCESS 0
+	vld4.8 {d0, d2, d4, d6}, [r0]! ; load 8 RGBA pixels.
+	MEMACCESS 0
+	vld4.8 {d1, d3, d5, d7}, [r0]! ; load next 8 RGBA pixels.
+	vpaddl.u8 q0, q1 ; B 16 bytes -> 8 shorts.
+	vpaddl.u8 q1, q2 ; G 16 bytes -> 8 shorts.
+	vpaddl.u8 q2, q3 ; R 16 bytes -> 8 shorts.
+	MEMACCESS 1
+	vld4.8 {d8, d10, d12, d14}, [r1]! ; load 8 more RGBA pixels.
+	MEMACCESS 1
+	vld4.8 {d9, d11, d13, d15}, [r1]! ; load last 8 RGBA pixels.
+	vpadal.u8 q0, q5 ; B 16 bytes -> 8 shorts.
+	vpadal.u8 q1, q6 ; G 16 bytes -> 8 shorts.
+	vpadal.u8 q2, q7 ; R 16 bytes -> 8 shorts.
+
+	vrshr.u16 q0, q0, #1 ; 2x average
+	vrshr.u16 q1, q1, #1
+	vrshr.u16 q2, q2, #1
+
+	subs r4, r4, #16 ; 16 pixels processed per loop.
+	RGBTOUV q0, q1, q2
+	MEMACCESS 2
+	vst1.8 {d0}, [r2]! ; store 8 pixels U.
+	MEMACCESS 3
+	vst1.8 {d1}, [r3]! ; store 8 pixels V.
+	bgt %b1
+
+	vpop {q15}
+	vpop {q8 - q14}
+	vpop {q0 - q7}
+	pop {r4}
+	bx lr
+	ENDP
+ | |
+ABGRToYRow_NEON PROC | |
+ ; input | |
+ ; r0 = const uint8* src_abgr | |
+ ; r1 = uint8* dst_y | |
+ ; r2 = int pix | |
+ vpush {d0 - d7} | |
+ vpush {q8} | |
+ | |
+ vmov.u8 d4, #33 ; R * 0.2578 coefficient | |
+ vmov.u8 d5, #65 ; G * 0.5078 coefficient | |
+ vmov.u8 d6, #13 ; B * 0.1016 coefficient | |
+ vmov.u8 d7, #16 ; Add 16 constant | |
+ | |
+1 | |
+ MEMACCESS 0 | |
+ vld4.8 {d0, d1, d2, d3}, [r0]! ; load 8 pixels of ABGR. | |
+ subs r2, r2, #8 ; 8 processed per loop. | |
+ vmull.u8 q8, d0, d4 ; R | |
+ vmlal.u8 q8, d1, d5 ; G | |
+ vmlal.u8 q8, d2, d6 ; B | |
+ vqrshrun.s16 d0, q8, #7 ; 16 bit to 8 bit Y | |
+ vqadd.u8 d0, d7 | |
+ MEMACCESS 1 | |
+ vst1.8 {d0}, [r1]! ; store 8 pixels Y. | |
+ bgt %b1 | |
+ | |
+ vpop {q8} | |
+ vpop {d0 - d7} | |
+ bx lr | |
+ ENDP | |
+ | |
+RGBAToYRow_NEON PROC | |
+ ; input | |
+ ; r0 = const uint8* src_rgba | |
+ ; r1 = uint8* dst_y | |
+ ; r2 = int pix | |
+ vpush {d0 - d7} | |
+ vpush {q8} | |
+ | |
+ vmov.u8 d4, #13 ; B * 0.1016 coefficient | |
+ vmov.u8 d5, #65 ; G * 0.5078 coefficient | |
+ vmov.u8 d6, #33 ; R * 0.2578 coefficient | |
+ vmov.u8 d7, #16 ; Add 16 constant | |
+ | |
+1 | |
+ MEMACCESS 0 | |
+ vld4.8 {d0, d1, d2, d3}, [r0]! ; load 8 pixels of RGBA. | |
+ subs r2, r2, #8 ; 8 processed per loop. | |
+ vmull.u8 q8, d1, d4 ; B | |
+ vmlal.u8 q8, d2, d5 ; G | |
+ vmlal.u8 q8, d3, d6 ; R | |
+ vqrshrun.s16 d0, q8, #7 ; 16 bit to 8 bit Y | |
+ vqadd.u8 d0, d7 | |
+ MEMACCESS 1 | |
+ vst1.8 {d0}, [r1]! ; store 8 pixels Y. | |
+ bgt %b1 | |
+ | |
+ vpop {q8} | |
+ vpop {d0 - d7} | |
+ bx lr | |
+ ENDP | |
+ | |
+RGB24ToYRow_NEON PROC | |
+ ; input | |
+ ; r0 = const uint8* src_rgb24 | |
+ ; r1 = uint8* dst_y | |
+ ; r2 = int pix | |
+ vpush {d0 - d7} | |
+ vpush {q8} | |
+ | |
+ vmov.u8 d4, #13 ; B * 0.1016 coefficient | |
+ vmov.u8 d5, #65 ; G * 0.5078 coefficient | |
+ vmov.u8 d6, #33 ; R * 0.2578 coefficient | |
+ vmov.u8 d7, #16 ; Add 16 constant | |
+ | |
+1 | |
+ MEMACCESS 0 | |
+ vld3.8 {d0, d1, d2}, [r0]! ; load 8 pixels of RGB24. | |
+ subs r2, r2, #8 ; 8 processed per loop. | |
+ vmull.u8 q8, d0, d4 ; B | |
+ vmlal.u8 q8, d1, d5 ; G | |
+ vmlal.u8 q8, d2, d6 ; R | |
+ vqrshrun.s16 d0, q8, #7 ; 16 bit to 8 bit Y | |
+ vqadd.u8 d0, d7 | |
+ MEMACCESS 1 | |
+ vst1.8 {d0}, [r1]! ; store 8 pixels Y. | |
+ bgt %b1 | |
+ | |
+ vpop {q8} | |
+ vpop {d0 - d7} | |
+ bx lr | |
+ ENDP | |
+ | |
+ ; 16x2 pixels -> 8x1. pix is number of argb pixels. e.g. 16. | |
+ARGB1555ToUVRow_NEON PROC | |
+ ; input | |
+ ; r0 = const uint8* src_argb1555 | |
+ ; r1 = uint8* src_stride_argb1555 | |
+ ; r2 = uint8* dst_u | |
+ ; r3 = uint8* dst_v | |
+ push {r4} | |
+ ldr r4, [sp,#4] ; int pix | |
+ vpush {q0 - q7} | |
+ vpush {q8 - q14} | |
+ vpush {q15} | |
+ | |
+ add r1, r0, r1 ; src_stride + src_argb | |
+ vmov.s16 q10, #112 / 2 ; UB / VR 0.875 coefficient | |
+ vmov.s16 q11, #74 / 2 ; UG -0.5781 coefficient | |
+ vmov.s16 q12, #38 / 2 ; UR -0.2969 coefficient | |
+ vmov.s16 q13, #18 / 2 ; VB -0.1406 coefficient | |
+ vmov.s16 q14, #94 / 2 ; VG -0.7344 coefficient | |
+ vmov.u16 q15, #0x8080 ; 128.5 | |
+ | |
+1 | |
+ MEMACCESS 0 | |
+ vld1.8 {q0}, [r0]! ; load 8 ARGB1555 pixels. | |
+ RGB555TOARGB | |
+ vpaddl.u8 d8, d0 ; B 8 bytes -> 4 shorts. | |
+ vpaddl.u8 d10, d1 ; G 8 bytes -> 4 shorts. | |
+ vpaddl.u8 d12, d2 ; R 8 bytes -> 4 shorts. | |
+ MEMACCESS 0 | |
+ vld1.8 {q0}, [r0]! ; next 8 ARGB1555 pixels. | |
+ RGB555TOARGB | |
+ vpaddl.u8 d9, d0 ; B 8 bytes -> 4 shorts. | |
+ vpaddl.u8 d11, d1 ; G 8 bytes -> 4 shorts. | |
+ vpaddl.u8 d13, d2 ; R 8 bytes -> 4 shorts. | |
+ | |
+ MEMACCESS 1 | |
+ vld1.8 {q0}, [r1]! ; load 8 ARGB1555 pixels. | |
+ RGB555TOARGB | |
+ vpadal.u8 d8, d0 ; B 8 bytes -> 4 shorts. | |
+ vpadal.u8 d10, d1 ; G 8 bytes -> 4 shorts. | |
+ vpadal.u8 d12, d2 ; R 8 bytes -> 4 shorts. | |
+ MEMACCESS 1 | |
+ vld1.8 {q0}, [r1]! ; next 8 ARGB1555 pixels. | |
+ RGB555TOARGB | |
+ vpadal.u8 d9, d0 ; B 8 bytes -> 4 shorts. | |
+ vpadal.u8 d11, d1 ; G 8 bytes -> 4 shorts. | |
+ vpadal.u8 d13, d2 ; R 8 bytes -> 4 shorts. | |
+ | |
+ vrshr.u16 q4, q4, #1 ; 2x average | |
+ vrshr.u16 q5, q5, #1 | |
+ vrshr.u16 q6, q6, #1 | |
+ | |
+ subs r4, r4, #16 ; 16 processed per loop. | |
+ vmul.s16 q8, q4, q10 ; B | |
+ vmls.s16 q8, q5, q11 ; G | |
+ vmls.s16 q8, q6, q12 ; R | |
+ vadd.u16 q8, q8, q15 ; +128 -> unsigned | |
+ vmul.s16 q9, q6, q10 ; R | |
+ vmls.s16 q9, q5, q14 ; G | |
+ vmls.s16 q9, q4, q13 ; B | |
+ vadd.u16 q9, q9, q15 ; +128 -> unsigned | |
+ vqshrn.u16 d0, q8, #8 ; 16 bit to 8 bit U | |
+ vqshrn.u16 d1, q9, #8 ; 16 bit to 8 bit V | |
+ MEMACCESS 2 | |
+ vst1.8 {d0}, [r2]! ; store 8 pixels U. | |
+ MEMACCESS 3 | |
+ vst1.8 {d1}, [r3]! ; store 8 pixels V. | |
+ bgt %b1 | |
+ | |
+ vpop {q15} | |
+ vpop {q8 - q14} | |
+ vpop {q0 - q7} | |
+ pop {r4} | |
+ bx lr | |
+ ENDP | |
+ | |
+ | |
+; ARGB4444ToUVRow_NEON: average a 16x2 block of ARGB4444 pixels down to 8 U and
+; 8 V samples (BT.601 studio-swing coefficients, halved, with +128 bias).
+; 16x2 pixels -> 8x1. pix is number of argb pixels. e.g. 16.
+ARGB4444ToUVRow_NEON PROC
+    ; input
+    ; r0 = const uint8* src_argb4444
+    ; r1 = int src_stride_argb4444 (byte stride; added to r0 to reach row 2)
+    ; r2 = uint8* dst_u
+    ; r3 = uint8* dst_v
+    ; [sp] = int pix ; NOTE(review): loop assumes pix is a multiple of 16 -- confirm
+    push {r4}
+    ldr r4, [sp,#4] ; int pix
+    vpush {q0 - q7}
+    vpush {q8 - q14}
+    vpush {q15}
+
+    add r1, r0, r1 ; src_stride + src_argb
+    vmov.s16 q10, #112 / 2 ; UB / VR 0.875 coefficient
+    vmov.s16 q11, #74 / 2 ; UG -0.5781 coefficient
+    vmov.s16 q12, #38 / 2 ; UR -0.2969 coefficient
+    vmov.s16 q13, #18 / 2 ; VB -0.1406 coefficient
+    vmov.s16 q14, #94 / 2 ; VG -0.7344 coefficient
+    vmov.u16 q15, #0x8080 ; 128.5
+
+1
+    MEMACCESS 0
+    vld1.8 {q0}, [r0]! ; load 8 ARGB4444 pixels (row 1).
+    ARGB4444TOARGB
+    vpaddl.u8 d8, d0 ; B 8 bytes -> 4 shorts.
+    vpaddl.u8 d10, d1 ; G 8 bytes -> 4 shorts.
+    vpaddl.u8 d12, d2 ; R 8 bytes -> 4 shorts.
+    MEMACCESS 0
+    vld1.8 {q0}, [r0]! ; next 8 ARGB4444 pixels (row 1).
+    ARGB4444TOARGB
+    vpaddl.u8 d9, d0 ; B 8 bytes -> 4 shorts.
+    vpaddl.u8 d11, d1 ; G 8 bytes -> 4 shorts.
+    vpaddl.u8 d13, d2 ; R 8 bytes -> 4 shorts.
+
+    MEMACCESS 1
+    vld1.8 {q0}, [r1]! ; load 8 ARGB4444 pixels (row 2).
+    ARGB4444TOARGB
+    vpadal.u8 d8, d0 ; B 8 bytes -> 4 shorts, accumulated onto row 1.
+    vpadal.u8 d10, d1 ; G 8 bytes -> 4 shorts.
+    vpadal.u8 d12, d2 ; R 8 bytes -> 4 shorts.
+    MEMACCESS 1
+    vld1.8 {q0}, [r1]! ; next 8 ARGB4444 pixels (row 2).
+    ARGB4444TOARGB
+    vpadal.u8 d9, d0 ; B 8 bytes -> 4 shorts.
+    vpadal.u8 d11, d1 ; G 8 bytes -> 4 shorts.
+    vpadal.u8 d13, d2 ; R 8 bytes -> 4 shorts.
+
+    vrshr.u16 q4, q4, #1 ; 2x average
+    vrshr.u16 q5, q5, #1
+    vrshr.u16 q6, q6, #1
+
+    subs r4, r4, #16 ; 16 source columns consumed per iteration.
+    vmul.s16 q8, q4, q10 ; B
+    vmls.s16 q8, q5, q11 ; G
+    vmls.s16 q8, q6, q12 ; R
+    vadd.u16 q8, q8, q15 ; +128 -> unsigned
+    vmul.s16 q9, q6, q10 ; R
+    vmls.s16 q9, q5, q14 ; G
+    vmls.s16 q9, q4, q13 ; B
+    vadd.u16 q9, q9, q15 ; +128 -> unsigned
+    vqshrn.u16 d0, q8, #8 ; 16 bit to 8 bit U
+    vqshrn.u16 d1, q9, #8 ; 16 bit to 8 bit V
+    MEMACCESS 2
+    vst1.8 {d0}, [r2]! ; store 8 pixels U.
+    MEMACCESS 3
+    vst1.8 {d1}, [r3]! ; store 8 pixels V.
+    bgt %b1
+
+    vpop {q15}
+    vpop {q8 - q14}
+    vpop {q0 - q7}
+    pop {r4}
+    bx lr
+    ENDP
+ | |
+; RGB565ToYRow_NEON: convert a row of RGB565 pixels to 8-bit luma
+; (Y = 13B + 65G + 33R >> 7, then +16), 8 pixels per iteration.
+RGB565ToYRow_NEON PROC
+    ; input
+    ; r0 = const uint8* src_rgb565
+    ; r1 = uint8* dst_y
+    ; r2 = int pix ; NOTE(review): loop assumes pix is a multiple of 8 -- confirm
+    vpush {q0 - q3}
+    vpush {q12 - q13}
+
+    vmov.u8 d24, #13 ; B * 0.1016 coefficient
+    vmov.u8 d25, #65 ; G * 0.5078 coefficient
+    vmov.u8 d26, #33 ; R * 0.2578 coefficient
+    vmov.u8 d27, #16 ; Add 16 constant
+
+1
+    MEMACCESS 0
+    vld1.8 {q0}, [r0]! ; load 8 RGB565 pixels.
+    subs r2, r2, #8 ; 8 processed per loop.
+    RGB565TOARGB
+    vmull.u8 q2, d0, d24 ; B
+    vmlal.u8 q2, d1, d25 ; G
+    vmlal.u8 q2, d2, d26 ; R
+    vqrshrun.s16 d0, q2, #7 ; 16 bit to 8 bit Y
+    vqadd.u8 d0, d27 ; add 16 offset (saturating)
+    MEMACCESS 1
+    vst1.8 {d0}, [r1]! ; store 8 pixels Y.
+    bgt %b1
+
+    vpop {q12 - q13}
+    vpop {q0 - q3}
+    bx lr
+    ENDP
+ | |
+    ; RGB565ToUVRow_NEON: average a 16x2 block of RGB565 pixels down to 8 U and
+    ; 8 V samples. 16x2 pixels -> 8x1. pix is number of argb pixels. e.g. 16.
+RGB565ToUVRow_NEON PROC
+    ; input
+    ; r0 = const uint8* src_rgb565
+    ; r1 = int src_stride_rgb565 (byte stride; added to r0 to reach row 2)
+    ; r2 = uint8* dst_u
+    ; r3 = uint8* dst_v
+    ; [sp] = int pix
+    push {r4}
+    ldr r4, [sp,#4] ; int pix
+    vpush {q0 - q7}
+    vpush {q8 - q14}
+    vpush {q15}
+
+    add r1, r0, r1 ; src_stride + src_rgb565
+    vmov.s16 q10, #112 / 2 ; UB / VR 0.875 coefficient
+    vmov.s16 q11, #74 / 2 ; UG -0.5781 coefficient
+    vmov.s16 q12, #38 / 2 ; UR -0.2969 coefficient
+    vmov.s16 q13, #18 / 2 ; VB -0.1406 coefficient
+    vmov.s16 q14, #94 / 2 ; VG -0.7344 coefficient
+    vmov.u16 q15, #0x8080 ; 128.5
+
+1
+    MEMACCESS 0
+    vld1.8 {q0}, [r0]! ; load 8 RGB565 pixels (row 1).
+    RGB565TOARGB
+    vpaddl.u8 d8, d0 ; B 8 bytes -> 4 shorts.
+    vpaddl.u8 d10, d1 ; G 8 bytes -> 4 shorts.
+    vpaddl.u8 d12, d2 ; R 8 bytes -> 4 shorts.
+    MEMACCESS 0
+    vld1.8 {q0}, [r0]! ; next 8 RGB565 pixels (row 1).
+    RGB565TOARGB
+    vpaddl.u8 d9, d0 ; B 8 bytes -> 4 shorts.
+    vpaddl.u8 d11, d1 ; G 8 bytes -> 4 shorts.
+    vpaddl.u8 d13, d2 ; R 8 bytes -> 4 shorts.
+
+    MEMACCESS 1
+    vld1.8 {q0}, [r1]! ; load 8 RGB565 pixels (row 2).
+    RGB565TOARGB
+    vpadal.u8 d8, d0 ; B 8 bytes -> 4 shorts, accumulated onto row 1.
+    vpadal.u8 d10, d1 ; G 8 bytes -> 4 shorts.
+    vpadal.u8 d12, d2 ; R 8 bytes -> 4 shorts.
+    MEMACCESS 1
+    vld1.8 {q0}, [r1]! ; next 8 RGB565 pixels (row 2).
+    RGB565TOARGB
+    vpadal.u8 d9, d0 ; B 8 bytes -> 4 shorts.
+    vpadal.u8 d11, d1 ; G 8 bytes -> 4 shorts.
+    vpadal.u8 d13, d2 ; R 8 bytes -> 4 shorts.
+
+    vrshr.u16 q4, q4, #1 ; 2x average
+    vrshr.u16 q5, q5, #1
+    vrshr.u16 q6, q6, #1
+
+    subs r4, r4, #16 ; 16 source columns consumed per iteration.
+    vmul.s16 q8, q4, q10 ; B
+    vmls.s16 q8, q5, q11 ; G
+    vmls.s16 q8, q6, q12 ; R
+    vadd.u16 q8, q8, q15 ; +128 -> unsigned
+    vmul.s16 q9, q6, q10 ; R
+    vmls.s16 q9, q5, q14 ; G
+    vmls.s16 q9, q4, q13 ; B
+    vadd.u16 q9, q9, q15 ; +128 -> unsigned
+    vqshrn.u16 d0, q8, #8 ; 16 bit to 8 bit U
+    vqshrn.u16 d1, q9, #8 ; 16 bit to 8 bit V
+    MEMACCESS 2
+    vst1.8 {d0}, [r2]! ; store 8 pixels U.
+    MEMACCESS 3
+    vst1.8 {d1}, [r3]! ; store 8 pixels V.
+    bgt %b1
+
+    vpop {q15}
+    vpop {q8 - q14}
+    vpop {q0 - q7}
+    pop {r4}
+    bx lr
+    ENDP
+ | |
+; ARGB1555ToYRow_NEON: convert a row of ARGB1555 pixels to 8-bit luma
+; (Y = 13B + 65G + 33R >> 7, then +16), 8 pixels per iteration.
+ARGB1555ToYRow_NEON PROC
+    ; input
+    ; r0 = const uint8* src_argb1555
+    ; r1 = uint8* dst_y
+    ; r2 = int pix
+    vpush {q0 - q3}
+    vpush {q12 - q13}
+
+    vmov.u8 d24, #13 ; B * 0.1016 coefficient
+    vmov.u8 d25, #65 ; G * 0.5078 coefficient
+    vmov.u8 d26, #33 ; R * 0.2578 coefficient
+    vmov.u8 d27, #16 ; Add 16 constant
+
+1
+    MEMACCESS 0
+    vld1.8 {q0}, [r0]! ; load 8 ARGB1555 pixels.
+    subs r2, r2, #8 ; 8 processed per loop.
+    ARGB1555TOARGB
+    vmull.u8 q2, d0, d24 ; B
+    vmlal.u8 q2, d1, d25 ; G
+    vmlal.u8 q2, d2, d26 ; R
+    vqrshrun.s16 d0, q2, #7 ; 16 bit to 8 bit Y
+    vqadd.u8 d0, d27 ; add 16 offset (saturating)
+    MEMACCESS 1
+    vst1.8 {d0}, [r1]! ; store 8 pixels Y.
+    bgt %b1
+
+    vpop {q12 - q13}
+    vpop {q0 - q3}
+    bx lr
+    ENDP
+ | |
+; ARGB4444ToYRow_NEON: convert a row of ARGB4444 pixels to 8-bit luma
+; (Y = 13B + 65G + 33R >> 7, then +16), 8 pixels per iteration.
+ARGB4444ToYRow_NEON PROC
+    ; input
+    ; r0 = const uint8* src_argb4444
+    ; r1 = uint8* dst_y
+    ; r2 = int pix
+    vpush {q0 - q3}
+    vpush {q12 - q13}
+
+    vmov.u8 d24, #13 ; B * 0.1016 coefficient
+    vmov.u8 d25, #65 ; G * 0.5078 coefficient
+    vmov.u8 d26, #33 ; R * 0.2578 coefficient
+    vmov.u8 d27, #16 ; Add 16 constant
+
+1
+    MEMACCESS 0
+    vld1.8 {q0}, [r0]! ; load 8 ARGB4444 pixels.
+    subs r2, r2, #8 ; 8 processed per loop.
+    ARGB4444TOARGB
+    vmull.u8 q2, d0, d24 ; B
+    vmlal.u8 q2, d1, d25 ; G
+    vmlal.u8 q2, d2, d26 ; R
+    vqrshrun.s16 d0, q2, #7 ; 16 bit to 8 bit Y
+    vqadd.u8 d0, d27 ; add 16 offset (saturating)
+    MEMACCESS 1
+    vst1.8 {d0}, [r1]! ; store 8 pixels Y.
+    bgt %b1
+
+    vpop {q12 - q13}
+    vpop {q0 - q3}
+    bx lr
+    ENDP
+ | |
+; BGRAToYRow_NEON: convert a row of BGRA pixels to 8-bit luma, 8 per iteration.
+; After vld4 de-interleave: d0 holds the first byte lane (unused here),
+; d1/d2/d3 are multiplied by the R/G/B coefficients respectively.
+BGRAToYRow_NEON PROC
+    ; input
+    ; r0 = const uint8* src_bgra
+    ; r1 = uint8* dst_y
+    ; r2 = int pix
+    vpush {q0 - q3}
+    vpush {q12 - q13}
+
+    vmov.u8 d4, #33 ; R * 0.2578 coefficient
+    vmov.u8 d5, #65 ; G * 0.5078 coefficient
+    vmov.u8 d6, #13 ; B * 0.1016 coefficient
+    vmov.u8 d7, #16 ; Add 16 constant
+
+1
+    MEMACCESS 0
+    vld4.8 {d0, d1, d2, d3}, [r0]! ; load 8 pixels of BGRA.
+    subs r2, r2, #8 ; 8 processed per loop.
+    vmull.u8 q8, d1, d4 ; R
+    vmlal.u8 q8, d2, d5 ; G
+    vmlal.u8 q8, d3, d6 ; B
+    vqrshrun.s16 d0, q8, #7 ; 16 bit to 8 bit Y
+    vqadd.u8 d0, d7 ; add 16 offset (saturating)
+    MEMACCESS 1
+    vst1.8 {d0}, [r1]! ; store 8 pixels Y.
+    bgt %b1
+
+    vpop {q12 - q13}
+    vpop {q0 - q3}
+    bx lr
+    ENDP
+ | |
+; ARGBToUV411Row_NEON: 4:1 horizontal subsample of one ARGB row to U and V.
+; 32x1 pixels -> 8x1. pix is number of argb pixels. e.g. 32.
+ARGBToUV411Row_NEON PROC
+    ; input
+    ; r0 = const uint8* src_argb
+    ; r1 = uint8* dst_u
+    ; r2 = uint8* dst_v
+    ; r3 = int pix ; NOTE(review): loop assumes pix is a multiple of 32 -- confirm
+    vpush {q0 - q7}
+    vpush {q8 - q14}
+    vpush {q15}
+
+    vmov.s16 q10, #112 / 2 ; UB / VR 0.875 coefficient
+    vmov.s16 q11, #74 / 2 ; UG -0.5781 coefficient
+    vmov.s16 q12, #38 / 2 ; UR -0.2969 coefficient
+    vmov.s16 q13, #18 / 2 ; VB -0.1406 coefficient
+    vmov.s16 q14, #94 / 2 ; VG -0.7344 coefficient
+    vmov.u16 q15, #0x8080 ; 128.5
+
+1
+    MEMACCESS 0
+    vld4.8 {d0, d2, d4, d6}, [r0]! ; load 8 ARGB pixels.
+    MEMACCESS 0
+    vld4.8 {d1, d3, d5, d7}, [r0]! ; load next 8 ARGB pixels.
+    vpaddl.u8 q0, q0 ; B 16 bytes -> 8 shorts.
+    vpaddl.u8 q1, q1 ; G 16 bytes -> 8 shorts.
+    vpaddl.u8 q2, q2 ; R 16 bytes -> 8 shorts.
+    MEMACCESS 0
+    vld4.8 {d8, d10, d12, d14}, [r0]! ; load 8 more ARGB pixels.
+    MEMACCESS 0
+    vld4.8 {d9, d11, d13, d15}, [r0]! ; load last 8 ARGB pixels.
+    vpaddl.u8 q4, q4 ; B 16 bytes -> 8 shorts.
+    vpaddl.u8 q5, q5 ; G 16 bytes -> 8 shorts.
+    vpaddl.u8 q6, q6 ; R 16 bytes -> 8 shorts.
+
+    vpadd.u16 d0, d0, d1 ; B 16 shorts -> 8 shorts (second pairwise add: 4x1).
+    vpadd.u16 d1, d8, d9 ; B
+    vpadd.u16 d2, d2, d3 ; G 16 shorts -> 8 shorts.
+    vpadd.u16 d3, d10, d11 ; G
+    vpadd.u16 d4, d4, d5 ; R 16 shorts -> 8 shorts.
+    vpadd.u16 d5, d12, d13 ; R
+
+    vrshr.u16 q0, q0, #1 ; 2x average
+    vrshr.u16 q1, q1, #1
+    vrshr.u16 q2, q2, #1
+
+    subs r3, r3, #32 ; 32 processed per loop.
+    vmul.s16 q8, q0, q10 ; B
+    vmls.s16 q8, q1, q11 ; G
+    vmls.s16 q8, q2, q12 ; R
+    vadd.u16 q8, q8, q15 ; +128 -> unsigned
+    vmul.s16 q9, q2, q10 ; R
+    vmls.s16 q9, q1, q14 ; G
+    vmls.s16 q9, q0, q13 ; B
+    vadd.u16 q9, q9, q15 ; +128 -> unsigned
+    vqshrn.u16 d0, q8, #8 ; 16 bit to 8 bit U
+    vqshrn.u16 d1, q9, #8 ; 16 bit to 8 bit V
+    MEMACCESS 1
+    vst1.8 {d0}, [r1]! ; store 8 pixels U.
+    MEMACCESS 2
+    vst1.8 {d1}, [r2]! ; store 8 pixels V.
+    bgt %b1
+
+    vpop {q15}
+    vpop {q8 - q14}
+    vpop {q0 - q7}
+    bx lr
+    ENDP
+ | |
+; ARGBToUV422Row_NEON: 2:1 horizontal subsample of one ARGB row to U and V.
+; 16x1 pixels -> 8x1 U and 8x1 V.
+ARGBToUV422Row_NEON PROC
+    ; input
+    ; r0 = const uint8* src_argb
+    ; r1 = uint8* dst_u
+    ; r2 = uint8* dst_v
+    ; r3 = int pix ; NOTE(review): loop assumes pix is a multiple of 16 -- confirm
+    vpush {q0 - q7}
+    vpush {q8 - q14}
+    vpush {q15}
+
+    vmov.s16 q10, #112 / 2 ; UB / VR 0.875 coefficient
+    vmov.s16 q11, #74 / 2 ; UG -0.5781 coefficient
+    vmov.s16 q12, #38 / 2 ; UR -0.2969 coefficient
+    vmov.s16 q13, #18 / 2 ; VB -0.1406 coefficient
+    vmov.s16 q14, #94 / 2 ; VG -0.7344 coefficient
+    vmov.u16 q15, #0x8080 ; 128.5
+
+1
+    MEMACCESS 0
+    vld4.8 {d0, d2, d4, d6}, [r0]! ; load 8 ARGB pixels.
+    MEMACCESS 0
+    vld4.8 {d1, d3, d5, d7}, [r0]! ; load next 8 ARGB pixels.
+
+    vpaddl.u8 q0, q0 ; B 16 bytes -> 8 shorts (pairwise sum of 2 pixels).
+    vpaddl.u8 q1, q1 ; G 16 bytes -> 8 shorts.
+    vpaddl.u8 q2, q2 ; R 16 bytes -> 8 shorts.
+
+    subs r3, r3, #16 ; 16 processed per loop.
+    vmul.s16 q8, q0, q10 ; B
+    vmls.s16 q8, q1, q11 ; G
+    vmls.s16 q8, q2, q12 ; R
+    vadd.u16 q8, q8, q15 ; +128 -> unsigned
+
+    vmul.s16 q9, q2, q10 ; R
+    vmls.s16 q9, q1, q14 ; G
+    vmls.s16 q9, q0, q13 ; B
+    vadd.u16 q9, q9, q15 ; +128 -> unsigned
+
+    vqshrn.u16 d0, q8, #8 ; 16 bit to 8 bit U
+    vqshrn.u16 d1, q9, #8 ; 16 bit to 8 bit V
+
+    MEMACCESS 1
+    vst1.8 {d0}, [r1]! ; store 8 pixels U.
+    MEMACCESS 2
+    vst1.8 {d1}, [r2]! ; store 8 pixels V.
+    bgt %b1
+
+    vpop {q15}
+    vpop {q8 - q14}
+    vpop {q0 - q7}
+    bx lr
+    ENDP
+ | |
+    ; ARGBToUV444Row_NEON: full-resolution (no subsample) ARGB row to U and V.
+    ; 8x1 pixels per iteration.
+ARGBToUV444Row_NEON PROC
+    ; input
+    ; r0 = const uint8* src_argb
+    ; r1 = uint8* dst_u
+    ; r2 = uint8* dst_v
+    ; r3 = int pix
+    vpush {q0 - q4}
+    vpush {q12 - q15}
+
+    vmov.u8 d24, #112 ; UB / VR 0.875 coefficient
+    vmov.u8 d25, #74 ; UG -0.5781 coefficient
+    vmov.u8 d26, #38 ; UR -0.2969 coefficient
+    vmov.u8 d27, #18 ; VB -0.1406 coefficient
+    vmov.u8 d28, #94 ; VG -0.7344 coefficient
+    vmov.u16 q15, #0x8080 ; 128.5
+
+1
+    MEMACCESS 0
+    vld4.8 {d0, d1, d2, d3}, [r0]! ; load 8 ARGB pixels.
+    subs r3, r3, #8 ; 8 processed per loop.
+    vmull.u8 q2, d0, d24 ; B
+    vmlsl.u8 q2, d1, d25 ; G
+    vmlsl.u8 q2, d2, d26 ; R
+    vadd.u16 q2, q2, q15 ; +128 -> unsigned
+
+    vmull.u8 q3, d2, d24 ; R
+    vmlsl.u8 q3, d1, d28 ; G
+    vmlsl.u8 q3, d0, d27 ; B
+    vadd.u16 q3, q3, q15 ; +128 -> unsigned
+
+    vqshrn.u16 d0, q2, #8 ; 16 bit to 8 bit U
+    vqshrn.u16 d1, q3, #8 ; 16 bit to 8 bit V
+
+    MEMACCESS 1
+    vst1.8 {d0}, [r1]! ; store 8 pixels U.
+    MEMACCESS 2
+    vst1.8 {d1}, [r2]! ; store 8 pixels V.
+    bgt %b1
+
+    vpop {q12 - q15}
+    vpop {q0 - q4}
+    bx lr
+    ENDP
+ | |
+; YUY2ToUV422Row_NEON: split the U and V bytes out of a packed YUY2 row
+; (Y0 U Y1 V ...); 16 source pixels -> 8 U + 8 V per iteration.
+YUY2ToUV422Row_NEON PROC
+    ; input
+    ; r0 = const uint8* src_yuy2
+    ; r1 = uint8* dst_u
+    ; r2 = uint8* dst_v
+    ; r3 = int pix
+    vpush {d0 - d3}
+
+1
+    MEMACCESS 0
+    vld4.8 {d0, d1, d2, d3}, [r0]! ; load 16 pixels of YUY2; d1 = U, d3 = V.
+    subs r3, r3, #16 ; 16 pixels = 8 UVs.
+    MEMACCESS 1
+    vst1.8 {d1}, [r1]! ; store 8 U.
+    MEMACCESS 2
+    vst1.8 {d3}, [r2]! ; store 8 V.
+    bgt %b1
+
+    vpop {d0 - d3}
+    bx lr
+    ENDP
+ | |
+ | |
+; UYVYToUV422Row_NEON: split the U and V bytes out of a packed UYVY row
+; (U Y0 V Y1 ...); 16 source pixels -> 8 U + 8 V per iteration.
+UYVYToUV422Row_NEON PROC
+    ; input
+    ; r0 = const uint8* src_uyvy
+    ; r1 = uint8* dst_u
+    ; r2 = uint8* dst_v
+    ; r3 = int pix
+    vpush {d0 - d3}
+
+1
+    MEMACCESS 0
+    vld4.8 {d0, d1, d2, d3}, [r0]! ; load 16 pixels of UYVY; d0 = U, d2 = V.
+    subs r3, r3, #16 ; 16 pixels = 8 UVs.
+    MEMACCESS 1
+    vst1.8 {d0}, [r1]! ; store 8 U.
+    MEMACCESS 2
+    vst1.8 {d2}, [r2]! ; store 8 V.
+    bgt %b1
+
+    vpop {d0 - d3}
+    bx lr
+    ENDP
+ | |
+    ; ARGBToBayerGGRow_NEON: Select G channels from ARGB. e.g. GGGGGGGG
+    ; NOTE(review): the selector argument in r2 is not read by this routine --
+    ; G is always extracted; confirm callers expect that.
+ARGBToBayerGGRow_NEON PROC
+    ; input
+    ; r0 = const uint8* src_argb
+    ; r1 = uint8* dst_bayer
+    ; r2 = uint32 selector (unused here)
+    ; r3 = int pix
+    vpush {q0, q1}
+
+1
+    MEMACCESS 0
+    vld4.8 {d0, d1, d2, d3}, [r0]! ; load row 8 pixels; d1 = G lane.
+    subs r3, r3, #8 ; 8 processed per loop
+    MEMACCESS 1
+    vst1.8 {d1}, [r1]! ; store 8 G's.
+    bgt %b1
+
+    vpop {q0, q1}
+    bx lr
+    ENDP
+ | |
+; ARGBShuffleRow_NEON: permute the 4 bytes of each pixel via a 16-byte table
+; (vtbl). For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
+; Fix: the shuffler is loaded through r2, so the annotation is MEMACCESS 2
+; (was MEMACCESS 3, which does not match any pointer used on that line;
+; every other routine in this file pairs MEMACCESS n with [rn]).
+ARGBShuffleRow_NEON PROC
+    ; input
+    ; r0 = const uint8* src_argb
+    ; r1 = uint8* dst_argb
+    ; r2 = const uint8* shuffler (16 lane indices)
+    ; r3 = int pix
+    vpush {q0 - q2}
+
+    MEMACCESS 2
+    vld1.8 {q2}, [r2] ; shuffler
+1
+    MEMACCESS 0
+    vld1.8 {q0}, [r0]! ; load 4 pixels.
+    subs r3, r3, #4 ; 4 processed per loop
+    vtbl.8 d2, {d0, d1}, d4 ; look up 2 first pixels
+    vtbl.8 d3, {d0, d1}, d5 ; look up 2 next pixels
+    MEMACCESS 1
+    vst1.8 {q1}, [r1]! ; store 4.
+    bgt %b1
+
+    vpop {q0 - q2}
+    bx lr
+    ENDP
+ | |
+    ; ARGBToUVJRow_NEON: 2x2 subsample of ARGB to U/V using JPEG (full-range)
+    ; coefficients. TODO(fbarchard): Subsample match C code.
+ARGBToUVJRow_NEON PROC
+    ; input
+    ; r0 = const uint8* src_argb
+    ; r1 = int src_stride_argb
+    ; r2 = uint8* dst_u
+    ; r3 = uint8* dst_v
+    ; [sp] = int pix
+    push {r4}
+    ldr r4, [sp,#4] ; int pix
+    vpush {q0 - q7}
+    vpush {q8 - q14}
+    vpush {q15}
+
+    add r1, r0, r1 ; src_stride + src_argb
+    vmov.s16 q10, #127 / 2 ; UB / VR 0.500 coefficient
+    vmov.s16 q11, #84 / 2 ; UG -0.33126 coefficient
+    vmov.s16 q12, #43 / 2 ; UR -0.16874 coefficient
+    vmov.s16 q13, #20 / 2 ; VB -0.08131 coefficient
+    vmov.s16 q14, #107 / 2 ; VG -0.41869 coefficient
+    vmov.u16 q15, #0x8080 ; 128.5
+
+1
+    MEMACCESS 0
+    vld4.8 {d0, d2, d4, d6}, [r0]! ; load 8 ARGB pixels.
+    MEMACCESS 0
+    vld4.8 {d1, d3, d5, d7}, [r0]! ; load next 8 ARGB pixels.
+    vpaddl.u8 q0, q0 ; B 16 bytes -> 8 shorts.
+    vpaddl.u8 q1, q1 ; G 16 bytes -> 8 shorts.
+    vpaddl.u8 q2, q2 ; R 16 bytes -> 8 shorts.
+    MEMACCESS 1
+    vld4.8 {d8, d10, d12, d14}, [r1]! ; load 8 more ARGB pixels (row 2).
+    MEMACCESS 1
+    vld4.8 {d9, d11, d13, d15}, [r1]! ; load last 8 ARGB pixels (row 2).
+    vpadal.u8 q0, q4 ; B 16 bytes -> 8 shorts, accumulated onto row 1.
+    vpadal.u8 q1, q5 ; G 16 bytes -> 8 shorts.
+    vpadal.u8 q2, q6 ; R 16 bytes -> 8 shorts.
+
+    vrshr.u16 q0, q0, #1 ; 2x average
+    vrshr.u16 q1, q1, #1
+    vrshr.u16 q2, q2, #1
+
+    subs r4, r4, #16 ; 16 columns per loop (reads 16 px from each of 2 rows).
+    RGBTOUV q0, q1, q2
+    MEMACCESS 2
+    vst1.8 {d0}, [r2]! ; store 8 pixels U.
+    MEMACCESS 3
+    vst1.8 {d1}, [r3]! ; store 8 pixels V.
+    bgt %b1
+
+    vpop {q15}
+    vpop {q8 - q14}
+    vpop {q0 - q7}
+    pop {r4}
+    bx lr
+    ENDP
+ | |
+ | |
+; BGRAToUVRow_NEON: 2x2 subsample of BGRA to U/V (BT.601 coefficients).
+; After vld4, the lanes used are q3 = B, q2 = G, q1 = R (q0 unused).
+BGRAToUVRow_NEON PROC
+    ; input
+    ; r0 = const uint8* src_bgra
+    ; r1 = int src_stride_bgra
+    ; r2 = uint8* dst_u
+    ; r3 = uint8* dst_v
+    ; [sp] = int pix
+    push {r4}
+    ldr r4, [sp,#4] ; int pix
+    vpush {q0 - q7}
+    vpush {q8 - q14}
+    vpush {q15}
+
+    add r1, r0, r1 ; src_stride + src_bgra
+    vmov.s16 q10, #112 / 2 ; UB / VR 0.875 coefficient
+    vmov.s16 q11, #74 / 2 ; UG -0.5781 coefficient
+    vmov.s16 q12, #38 / 2 ; UR -0.2969 coefficient
+    vmov.s16 q13, #18 / 2 ; VB -0.1406 coefficient
+    vmov.s16 q14, #94 / 2 ; VG -0.7344 coefficient
+    vmov.u16 q15, #0x8080 ; 128.5
+
+1
+    MEMACCESS 0
+    vld4.8 {d0, d2, d4, d6}, [r0]! ; load 8 BGRA pixels.
+    MEMACCESS 0
+    vld4.8 {d1, d3, d5, d7}, [r0]! ; load next 8 BGRA pixels.
+    vpaddl.u8 q3, q3 ; B 16 bytes -> 8 shorts.
+    vpaddl.u8 q2, q2 ; G 16 bytes -> 8 shorts.
+    vpaddl.u8 q1, q1 ; R 16 bytes -> 8 shorts.
+    MEMACCESS 1
+    vld4.8 {d8, d10, d12, d14}, [r1]! ; load 8 more BGRA pixels (row 2).
+    MEMACCESS 1
+    vld4.8 {d9, d11, d13, d15}, [r1]! ; load last 8 BGRA pixels (row 2).
+    vpadal.u8 q3, q7 ; B 16 bytes -> 8 shorts, accumulated onto row 1.
+    vpadal.u8 q2, q6 ; G 16 bytes -> 8 shorts.
+    vpadal.u8 q1, q5 ; R 16 bytes -> 8 shorts.
+
+    vrshr.u16 q1, q1, #1 ; 2x average
+    vrshr.u16 q2, q2, #1
+    vrshr.u16 q3, q3, #1
+
+    subs r4, r4, #16 ; 16 columns per loop (reads 16 px from each of 2 rows).
+    RGBTOUV q3, q2, q1
+    MEMACCESS 2
+    vst1.8 {d0}, [r2]! ; store 8 pixels U.
+    MEMACCESS 3
+    vst1.8 {d1}, [r3]! ; store 8 pixels V.
+    bgt %b1
+
+    vpop {q15}
+    vpop {q8 - q14}
+    vpop {q0 - q7}
+    pop {r4}
+    bx lr
+    ENDP
+ | |
+; ARGBExtractAlphaRow_NEON: copy the A channel of an ARGB row into a plane,
+; 16 pixels per iteration (q3 accumulates d6/d7 = the two alpha lanes).
+; Fix: annotate the second load with MEMACCESS 0 like every other paired
+; load through [r0] in this file (it was missing).
+ARGBExtractAlphaRow_NEON PROC
+    ; input
+    ; r0 = const uint8* src_argb
+    ; r1 = uint8* dst_y
+    ; r2 = int pix
+    vpush {q0 - q3}
+
+1
+    MEMACCESS 0
+    vld4.8 {d0, d2, d4, d6}, [r0]! ; load 8 ARGB pixels
+    MEMACCESS 0
+    vld4.8 {d1, d3, d5, d7}, [r0]! ; load next 8 ARGB pixels
+    subs r2, r2, #16 ; 16 processed per loop
+    MEMACCESS 1
+    vst1.8 {q3}, [r1]! ; store 16 A's.
+    bgt %b1
+
+    vpop {q0 - q3}
+    bx lr
+    ENDP
+ | |
+; ARGBToYJRow_NEON: convert a row of ARGB to JPEG (full-range) luma,
+; Y = (15B + 75G + 38R) >> 7 with rounding, no +16 offset; 8 px per iteration.
+ARGBToYJRow_NEON PROC
+    ; input
+    ; r0 = const uint8* src_argb
+    ; r1 = uint8* dst_y
+    ; r2 = int pix
+    vpush {q0 - q2}
+    vpush {q12 - q13}
+
+    vmov.u8 d24, #15 ; B * 0.11400 coefficient
+    vmov.u8 d25, #75 ; G * 0.58700 coefficient
+    vmov.u8 d26, #38 ; R * 0.29900 coefficient
+
+1
+    MEMACCESS 0
+    vld4.8 {d0, d1, d2, d3}, [r0]! ; load 8 ARGB pixels.
+    subs r2, r2, #8 ; 8 processed per loop.
+    vmull.u8 q2, d0, d24 ; B
+    vmlal.u8 q2, d1, d25 ; G
+    vmlal.u8 q2, d2, d26 ; R
+    vqrshrun.s16 d0, q2, #7 ; 15 bit to 8 bit Y
+    MEMACCESS 1
+    vst1.8 {d0}, [r1]! ; store 8 pixels Y.
+    bgt %b1
+
+    vpop {q12 - q13}
+    vpop {q0 - q2}
+    bx lr
+    ENDP
+ | |
+; I422ToYUY2Row_NEON: interleave planar I422 (Y, U, V) into packed YUY2
+; (Y0 U Y1 V ...); 16 pixels per iteration.
+I422ToYUY2Row_NEON PROC
+    ; input
+    ; r0 = const uint8* src_y
+    ; r1 = const uint8* src_u
+    ; r2 = const uint8* src_v
+    ; r3 = uint8* dst_yuy2
+    ; [sp] = int width
+    push {r4}
+    ldr r4, [sp,#4] ; int width
+    vpush {d0 - d3}
+
+1
+    MEMACCESS 0
+    vld2.8 {d0, d2}, [r0]! ; load 16 Ys (even -> d0, odd -> d2)
+    MEMACCESS 1
+    vld1.8 {d1}, [r1]! ; load 8 Us
+    MEMACCESS 2
+    vld1.8 {d3}, [r2]! ; load 8 Vs
+    subs r4, r4, #16 ; 16 pixels
+    MEMACCESS 3
+    vst4.8 {d0, d1, d2, d3}, [r3]! ; Store 8 YUY2/16 pixels.
+    bgt %b1
+
+    vpop {d0 - d3}
+    pop {r4}
+    bx lr
+    ENDP
+ | |
+; I422ToUYVYRow_NEON: interleave planar I422 (Y, U, V) into packed UYVY
+; (U Y0 V Y1 ...); 16 pixels per iteration.
+I422ToUYVYRow_NEON PROC
+    ; input
+    ; r0 = const uint8* src_y
+    ; r1 = const uint8* src_u
+    ; r2 = const uint8* src_v
+    ; r3 = uint8* dst_uyvy
+    ; [sp] = int width
+    push {r4}
+    ldr r4, [sp,#4] ; int width
+    vpush {d0 - d3}
+
+1
+    MEMACCESS 0
+    vld2.8 {d1, d3}, [r0]! ; load 16 Ys (even -> d1, odd -> d3)
+    MEMACCESS 1
+    vld1.8 {d0}, [r1]! ; load 8 Us
+    MEMACCESS 2
+    vld1.8 {d2}, [r2]! ; load 8 Vs
+    subs r4, r4, #16 ; 16 pixels
+    MEMACCESS 3
+    vst4.8 {d0, d1, d2, d3}, [r3]! ; Store 8 UYVY/16 pixels.
+    bgt %b1
+
+    vpop {d0 - d3}
+    pop {r4}
+    bx lr
+    ENDP
+ | |
+    ; ARGBToUVRow_NEON: 2x2 subsample of ARGB to U/V (BT.601 coefficients).
+    ; TODO(fbarchard): Consider vhadd vertical, then vpaddl horizontal, ashr.
+ARGBToUVRow_NEON PROC
+    ; input
+    ; r0 = const uint8* src_argb
+    ; r1 = int src_stride_argb
+    ; r2 = uint8* dst_u
+    ; r3 = uint8* dst_v
+    ; [sp] = int pix
+    push {r4}
+    ldr r4, [sp,#4] ; int pix
+    vpush {q0 - q7}
+    vpush {q8 - q14}
+    vpush {q15}
+
+    add r1, r0, r1 ; src_stride + src_argb
+    vmov.s16 q10, #112 / 2 ; UB / VR 0.875 coefficient
+    vmov.s16 q11, #74 / 2 ; UG -0.5781 coefficient
+    vmov.s16 q12, #38 / 2 ; UR -0.2969 coefficient
+    vmov.s16 q13, #18 / 2 ; VB -0.1406 coefficient
+    vmov.s16 q14, #94 / 2 ; VG -0.7344 coefficient
+    vmov.u16 q15, #0x8080 ; 128.5
+
+1
+    MEMACCESS 0
+    vld4.8 {d0, d2, d4, d6}, [r0]! ; load 8 ARGB pixels.
+    MEMACCESS 0
+    vld4.8 {d1, d3, d5, d7}, [r0]! ; load next 8 ARGB pixels.
+    vpaddl.u8 q0, q0 ; B 16 bytes -> 8 shorts.
+    vpaddl.u8 q1, q1 ; G 16 bytes -> 8 shorts.
+    vpaddl.u8 q2, q2 ; R 16 bytes -> 8 shorts.
+    MEMACCESS 1
+    vld4.8 {d8, d10, d12, d14}, [r1]! ; load 8 more ARGB pixels (row 2).
+    MEMACCESS 1
+    vld4.8 {d9, d11, d13, d15}, [r1]! ; load last 8 ARGB pixels (row 2).
+    vpadal.u8 q0, q4 ; B 16 bytes -> 8 shorts, accumulated onto row 1.
+    vpadal.u8 q1, q5 ; G 16 bytes -> 8 shorts.
+    vpadal.u8 q2, q6 ; R 16 bytes -> 8 shorts.
+
+    vrshr.u16 q0, q0, #1 ; 2x average
+    vrshr.u16 q1, q1, #1
+    vrshr.u16 q2, q2, #1
+
+    subs r4, r4, #16 ; 16 columns per loop (reads 16 px from each of 2 rows).
+    RGBTOUV q0, q1, q2
+    MEMACCESS 2
+    vst1.8 {d0}, [r2]! ; store 8 pixels U.
+    MEMACCESS 3
+    vst1.8 {d1}, [r3]! ; store 8 pixels V.
+    bgt %b1
+
+    vpop {q15}
+    vpop {q8 - q14}
+    vpop {q0 - q7}
+    pop {r4}
+    bx lr
+    ENDP
+ | |
+; ARGBToYRow_NEON: convert a row of ARGB to 8-bit luma
+; (Y = 13B + 65G + 33R >> 7, then +16), 8 pixels per iteration.
+ARGBToYRow_NEON PROC
+    ; input
+    ; r0 = const uint8* src_argb (was mislabelled src_bgra)
+    ; r1 = uint8* dst_y
+    ; r2 = int pix
+    vpush {q0-q2}
+    vpush {q12,q13}
+
+    vmov.u8 d24, #13 ; B * 0.1016 coefficient
+    vmov.u8 d25, #65 ; G * 0.5078 coefficient
+    vmov.u8 d26, #33 ; R * 0.2578 coefficient
+    vmov.u8 d27, #16 ; Add 16 constant
+
+1
+    MEMACCESS 0
+    vld4.8 {d0, d1, d2, d3}, [r0]! ; load 8 ARGB pixels.
+    subs r2, r2, #8 ; 8 processed per loop.
+    vmull.u8 q2, d0, d24 ; B
+    vmlal.u8 q2, d1, d25 ; G
+    vmlal.u8 q2, d2, d26 ; R
+    vqrshrun.s16 d0, q2, #7 ; 16 bit to 8 bit Y
+    vqadd.u8 d0, d27 ; add 16 offset (saturating)
+    MEMACCESS 1
+    vst1.8 {d0}, [r1]! ; store 8 pixels Y.
+    bgt %b1
+
+    vpop {q12, q13}
+    vpop {q0-q2}
+    bx lr
+    ENDP
+ | |
+; RAWToUVRow_NEON: 2x2 subsample of RAW (3 bytes/pixel, R first) to U/V.
+; After vld3: q0 = R, q1 = G, q2 = B; RGBTOUV takes (B, G, R).
+RAWToUVRow_NEON PROC
+    ; input
+    ; r0 = const uint8* src_raw
+    ; r1 = int src_stride_raw
+    ; r2 = uint8* dst_u
+    ; r3 = uint8* dst_v
+    ; [sp] = int pix
+    push {r4}
+    ldr r4, [sp,#4] ; int pix
+    vpush {q0 - q7}
+    vpush {q8 - q14}
+    vpush {q15}
+
+    add r1, r0, r1 ; src_stride + src_raw
+    vmov.s16 q10, #112 / 2 ; UB / VR 0.875 coefficient
+    vmov.s16 q11, #74 / 2 ; UG -0.5781 coefficient
+    vmov.s16 q12, #38 / 2 ; UR -0.2969 coefficient
+    vmov.s16 q13, #18 / 2 ; VB -0.1406 coefficient
+    vmov.s16 q14, #94 / 2 ; VG -0.7344 coefficient
+    vmov.u16 q15, #0x8080 ; 128.5
+
+1
+    MEMACCESS 0
+    vld3.8 {d0, d2, d4}, [r0]! ; load 8 RAW pixels.
+    MEMACCESS 0
+    vld3.8 {d1, d3, d5}, [r0]! ; load next 8 RAW pixels.
+    vpaddl.u8 q2, q2 ; B 16 bytes -> 8 shorts.
+    vpaddl.u8 q1, q1 ; G 16 bytes -> 8 shorts.
+    vpaddl.u8 q0, q0 ; R 16 bytes -> 8 shorts.
+    MEMACCESS 1
+    vld3.8 {d8, d10, d12}, [r1]! ; load 8 more RAW pixels (row 2).
+    MEMACCESS 1
+    vld3.8 {d9, d11, d13}, [r1]! ; load last 8 RAW pixels (row 2).
+    vpadal.u8 q2, q6 ; B 16 bytes -> 8 shorts, accumulated onto row 1.
+    vpadal.u8 q1, q5 ; G 16 bytes -> 8 shorts.
+    vpadal.u8 q0, q4 ; R 16 bytes -> 8 shorts.
+
+    vrshr.u16 q0, q0, #1 ; 2x average
+    vrshr.u16 q1, q1, #1
+    vrshr.u16 q2, q2, #1
+
+    subs r4, r4, #16 ; 16 columns per loop (reads 16 px from each of 2 rows).
+    RGBTOUV q2, q1, q0
+    MEMACCESS 2
+    vst1.8 {d0}, [r2]! ; store 8 pixels U.
+    MEMACCESS 3
+    vst1.8 {d1}, [r3]! ; store 8 pixels V.
+    bgt %b1
+
+    vpop {q15}
+    vpop {q8 - q14}
+    vpop {q0 - q7}
+    pop {r4}
+    bx lr
+    ENDP
+ | |
+; RAWToYRow_NEON: convert a row of RAW (3 bytes/pixel, R first) to 8-bit luma,
+; 8 pixels per iteration. vld3 de-interleaves: d0 = R, d1 = G, d2 = B,
+; matching the coefficient registers d4 (R), d5 (G), d6 (B) below.
+RAWToYRow_NEON PROC
+    ; input
+    ; r0 = const uint8* src_raw
+    ; r1 = uint8* dst_y
+    ; r2 = int pix
+    vpush {d0 - d7}
+    vpush {q8}
+
+    vmov.u8 d4, #33 ; R * 0.2578 coefficient
+    vmov.u8 d5, #65 ; G * 0.5078 coefficient
+    vmov.u8 d6, #13 ; B * 0.1016 coefficient
+    vmov.u8 d7, #16 ; Add 16 constant
+
+1
+    MEMACCESS 0
+    vld3.8 {d0, d1, d2}, [r0]! ; load 8 pixels of RAW.
+    subs r2, r2, #8 ; 8 processed per loop.
+    vmull.u8 q8, d0, d4 ; R (d0 carries R; was mislabelled B)
+    vmlal.u8 q8, d1, d5 ; G
+    vmlal.u8 q8, d2, d6 ; B (d2 carries B; was mislabelled R)
+    vqrshrun.s16 d0, q8, #7 ; 16 bit to 8 bit Y
+    vqadd.u8 d0, d7 ; add 16 offset (saturating)
+    MEMACCESS 1
+    vst1.8 {d0}, [r1]! ; store 8 pixels Y.
+    bgt %b1
+
+    vpop {q8}
+    vpop {d0-d7}
+    bx lr
+    ENDP
+ | |
+ | |
+; RGB24ToUVRow_NEON: 2x2 subsample of RGB24 (3 bytes/pixel, B first) to U/V.
+; After vld3: q0 = B, q1 = G, q2 = R; RGBTOUV takes (B, G, R).
+RGB24ToUVRow_NEON PROC
+    ; input
+    ; r0 = const uint8* src_rgb24
+    ; r1 = int src_stride_rgb24
+    ; r2 = uint8* dst_u
+    ; r3 = uint8* dst_v
+    ; [sp] = int pix
+    push {r4}
+    ldr r4, [sp,#4] ; int pix
+    vpush {q0 - q7}
+    vpush {q8 - q14}
+    vpush {q15}
+
+    add r1, r0, r1 ; src_stride + src_rgb24
+    vmov.s16 q10, #112 / 2 ; UB / VR 0.875 coefficient
+    vmov.s16 q11, #74 / 2 ; UG -0.5781 coefficient
+    vmov.s16 q12, #38 / 2 ; UR -0.2969 coefficient
+    vmov.s16 q13, #18 / 2 ; VB -0.1406 coefficient
+    vmov.s16 q14, #94 / 2 ; VG -0.7344 coefficient
+    vmov.u16 q15, #0x8080 ; 128.5
+
+1
+    MEMACCESS 0
+    vld3.8 {d0, d2, d4}, [r0]! ; load 8 RGB24 pixels.
+    MEMACCESS 0
+    vld3.8 {d1, d3, d5}, [r0]! ; load next 8 RGB24 pixels.
+    vpaddl.u8 q0, q0 ; B 16 bytes -> 8 shorts.
+    vpaddl.u8 q1, q1 ; G 16 bytes -> 8 shorts.
+    vpaddl.u8 q2, q2 ; R 16 bytes -> 8 shorts.
+    MEMACCESS 1
+    vld3.8 {d8, d10, d12}, [r1]! ; load 8 more RGB24 pixels (row 2).
+    MEMACCESS 1
+    vld3.8 {d9, d11, d13}, [r1]! ; load last 8 RGB24 pixels (row 2).
+    vpadal.u8 q0, q4 ; B 16 bytes -> 8 shorts, accumulated onto row 1.
+    vpadal.u8 q1, q5 ; G 16 bytes -> 8 shorts.
+    vpadal.u8 q2, q6 ; R 16 bytes -> 8 shorts.
+
+    vrshr.u16 q0, q0, #1 ; 2x average
+    vrshr.u16 q1, q1, #1
+    vrshr.u16 q2, q2, #1
+
+    subs r4, r4, #16 ; 16 columns per loop (reads 16 px from each of 2 rows).
+    RGBTOUV q0, q1, q2
+    MEMACCESS 2
+    vst1.8 {d0}, [r2]! ; store 8 pixels U.
+    MEMACCESS 3
+    vst1.8 {d1}, [r3]! ; store 8 pixels V.
+    bgt %b1
+
+    vpop {q15}
+    vpop {q8 - q14}
+    vpop {q0 - q7}
+    pop {r4}
+    bx lr
+    ENDP
+ | |
+; UYVYToUVRow_NEON: extract U/V from two UYVY rows, averaging vertically
+; (rounded halving add); 16 source pixels -> 8 U + 8 V per iteration.
+; NOTE(review): saves q0-q15 but only q0-q3 are used -- the extra vpush/vpop
+; pairs look unnecessary; confirm before trimming.
+UYVYToUVRow_NEON PROC
+    ; input
+    ; r0 = const uint8* src_uyvy
+    ; r1 = int stride_uyvy
+    ; r2 = uint8* dst_u
+    ; r3 = uint8* dst_v
+    ; [sp] = int pix
+    push {r4}
+    ldr r4, [sp,#4] ; int pix
+    vpush {q0 - q7}
+    vpush {q8 - q14}
+    vpush {q15}
+
+    add r1, r0, r1 ; stride + src_uyvy
+1
+    MEMACCESS 0
+    vld4.8 {d0, d1, d2, d3}, [r0]! ; load 16 pixels of UYVY; d0 = U, d2 = V.
+    subs r4, r4, #16 ; 16 pixels = 8 UVs.
+    MEMACCESS 1
+    vld4.8 {d4, d5, d6, d7}, [r1]! ; load next row UYVY.
+    vrhadd.u8 d0, d0, d4 ; average rows of U
+    vrhadd.u8 d2, d2, d6 ; average rows of V
+    MEMACCESS 2
+    vst1.8 {d0}, [r2]! ; store 8 U.
+    MEMACCESS 3
+    vst1.8 {d2}, [r3]! ; store 8 V.
+    bgt %b1
+
+    vpop {q15}
+    vpop {q8 - q14}
+    vpop {q0 - q7}
+    pop {r4}
+    bx lr
+    ENDP
+ | |
+; UYVYToYRow_NEON: extract the Y bytes (odd positions) from a packed UYVY row,
+; 16 pixels per iteration (vld2 puts Y into q1).
+UYVYToYRow_NEON PROC
+    ; input
+    ; r0 = const uint8* src_uyvy
+    ; r1 = uint8* dst_y
+    ; r2 = int pix
+    vpush {q0, q1}
+
+1
+    MEMACCESS 0
+    vld2.8 {q0, q1}, [r0]! ; load 16 pixels of UYVY; q1 = Y bytes.
+    subs r2, r2, #16 ; 16 processed per loop.
+    MEMACCESS 1
+    vst1.8 {q1}, [r1]! ; store 16 pixels of Y.
+    bgt %b1
+
+    vpop {q0, q1}
+    bx lr
+    ENDP
+ | |
+; YUY2ToUVRow_NEON: extract U/V from two YUY2 rows, averaging vertically
+; (rounded halving add); 16 source pixels -> 8 U + 8 V per iteration.
+YUY2ToUVRow_NEON PROC
+    ; input
+    ; r0 = const uint8* src_yuy2
+    ; r1 = int stride_yuy2
+    ; r2 = uint8* dst_u
+    ; r3 = uint8* dst_v
+    ; [sp] = int pix
+    push {r4}
+    ldr r4, [sp,#4] ; int pix
+    vpush {d0 - d7}
+
+    add r1, r0, r1 ; stride + src_yuy2
+
+1
+    MEMACCESS 0
+    vld4.8 {d0, d1, d2, d3}, [r0]! ; load 16 pixels of YUY2; d1 = U, d3 = V.
+    subs r4, r4, #16 ; 16 pixels = 8 UVs.
+    MEMACCESS 1
+    vld4.8 {d4, d5, d6, d7}, [r1]! ; load next row YUY2.
+    vrhadd.u8 d1, d1, d5 ; average rows of U
+    vrhadd.u8 d3, d3, d7 ; average rows of V
+    MEMACCESS 2
+    vst1.8 {d1}, [r2]! ; store 8 U.
+    MEMACCESS 3
+    vst1.8 {d3}, [r3]! ; store 8 V.
+    bgt %b1
+
+    vpop {d0 - d7}
+    pop {r4}
+    bx lr
+    ENDP
+ | |
+; YUY2ToYRow_NEON: extract the Y bytes (even positions) from a packed YUY2 row,
+; 16 pixels per iteration (vld2 puts Y into q0).
+YUY2ToYRow_NEON PROC
+    ; input
+    ; r0 = const uint8* src_yuy2
+    ; r1 = uint8* dst_y
+    ; r2 = int pix
+    vpush {q0, q1}
+
+1
+    MEMACCESS 0
+    vld2.8 {q0, q1}, [r0]! ; load 16 pixels of YUY2; q0 = Y bytes.
+    subs r2, r2, #16 ; 16 processed per loop.
+    MEMACCESS 1
+    vst1.8 {q0}, [r1]! ; store 16 pixels of Y.
+    bgt %b1
+
+    vpop {q0, q1}
+    bx lr
+    ENDP
+ | |
+; ARGBToRGB565DitherRow_NEON: add a per-column dither value (saturating) to
+; B/G/R, then pack 8 ARGB pixels to RGB565.
+; Fix: the MEMACCESS annotations were inverted -- the load reads through r0
+; (src) and the store writes through r1 (dst), so they are annotated
+; MEMACCESS 0 and MEMACCESS 1 respectively, matching every other routine
+; in this file.
+ARGBToRGB565DitherRow_NEON PROC
+    ; input
+    ; r0 = const uint8* src_argb
+    ; r1 = uint8* dst_rgb
+    ; r2 = const uint32 dither4 (4 dither bytes, replicated across d2)
+    ; r3 = int width
+    vpush {q0, q1}
+    vpush {q8-q11}
+
+    vdup.32 d2, r2 ; dither4
+1
+    MEMACCESS 0
+    vld4.8 {d20, d21, d22, d23}, [r0]! ; load 8 pixels of ARGB.
+    subs r3, r3, #8 ; 8 processed per loop.
+    vqadd.u8 d20, d20, d2 ; B + dither (saturating)
+    vqadd.u8 d21, d21, d2 ; G + dither
+    vqadd.u8 d22, d22, d2 ; R + dither
+    ARGBTORGB565
+    MEMACCESS 1
+    vst1.8 {q0}, [r1]! ; store 8 pixels RGB565.
+    bgt %b1
+
+    vpop {q8-q11}
+    vpop {q0, q1}
+    bx lr
+    ENDP
+ | |
+; ARGBAddRow_NEON: Add 2 rows of ARGB pixels together (per-channel saturating
+; add, alpha included), 8 pixels at a time.
+ARGBAddRow_NEON PROC
+    ; input
+    ; r0 = const uint8* src_argb0
+    ; r1 = uint8* src_argb1
+    ; r2 = uint8* dst_arg
+    ; r3 = int width
+    vpush {q0 - q3}
+    ; 8 pixel loop.
+1
+    MEMACCESS 0
+    vld4.8 {d0, d1, d2, d3}, [r0]! ; load 8 ARGB pixels.
+    MEMACCESS 1
+    vld4.8 {d4, d5, d6, d7}, [r1]! ; load 8 more ARGB pixels.
+    subs r3, r3, #8 ; 8 processed per loop.
+    vqadd.u8 q0, q0, q2 ; add B, G
+    vqadd.u8 q1, q1, q3 ; add R, A
+    MEMACCESS 2
+    vst4.8 {d0, d1, d2, d3}, [r2]! ; store 8 ARGB pixels.
+    bgt %b1
+
+    vpop {q0 - q3}
+    bx lr
+    ENDP
+ | |
+; ARGBSubtractRow_NEON: Subtract 2 rows of ARGB pixels (per-channel saturating
+; subtract, alpha included), 8 pixels at a time.
+ARGBSubtractRow_NEON PROC
+    ; input
+    ; r0 = const uint8* src_argb0
+    ; r1 = uint8* src_argb1
+    ; r2 = uint8* dst_arg
+    ; r3 = int width
+    vpush {q0 - q3}
+    ; 8 pixel loop.
+
+1
+    MEMACCESS 0
+    vld4.8 {d0, d1, d2, d3}, [r0]! ; load 8 ARGB pixels.
+    MEMACCESS 1
+    vld4.8 {d4, d5, d6, d7}, [r1]! ; load 8 more ARGB pixels.
+    subs r3, r3, #8 ; 8 processed per loop.
+    vqsub.u8 q0, q0, q2 ; subtract B, G
+    vqsub.u8 q1, q1, q3 ; subtract R, A
+    MEMACCESS 2
+    vst4.8 {d0, d1, d2, d3}, [r2]! ; store 8 ARGB pixels.
+    bgt %b1
+
+    vpop {q0 - q3}
+    bx lr
+    ENDP
+ | |
+; SobelRow_NEON: Adds Sobel X and Sobel Y (saturating) and stores Sobel into
+; ARGB, replicating the result into B, G and R:
+; A = 255
+; R = Sobel
+; G = Sobel
+; B = Sobel
+SobelRow_NEON PROC
+    ; input
+    ; r0 = const uint8* src_sobelx
+    ; r1 = const uint8* src_sobely
+    ; r2 = uint8* dst_argb
+    ; r3 = int width
+    vpush {q0 - q1}
+    vmov.u8 d3, #255 ; alpha
+    ; 8 pixel loop.
+
+1
+    MEMACCESS 0
+    vld1.8 {d0}, [r0]! ; load 8 sobelx.
+    MEMACCESS 1
+    vld1.8 {d1}, [r1]! ; load 8 sobely.
+    subs r3, r3, #8 ; 8 processed per loop.
+    vqadd.u8 d0, d0, d1 ; add (saturating)
+    vmov.u8 d1, d0 ; replicate into G
+    vmov.u8 d2, d0 ; replicate into R
+    MEMACCESS 2
+    vst4.8 {d0, d1, d2, d3}, [r2]! ; store 8 ARGB pixels.
+    bgt %b1
+
+    vpop {q0 - q1}
+    bx lr
+    ENDP
+ | |
+; Adds Sobel X and Sobel Y (saturating) and stores Sobel into a single plane. | |
+SobelToPlaneRow_NEON PROC | |
+ ; input | |
+ ; r0 = const uint8* src_sobelx | |
+ ; r1 = const uint8* src_sobely | |
+ ; r2 = uint8* dst_y | |
+ ; r3 = int width (loop consumes 16 pixels per iteration) | |
+ vpush {q0 - q1} | |
+ ; 16 pixel loop. | |
+ | |
+1 | |
+ MEMACCESS 0 | |
+ vld1.8 {q0}, [r0]! ; load 16 sobelx. | |
+ MEMACCESS 1 | |
+ vld1.8 {q1}, [r1]! ; load 16 sobely. | |
+ subs r3, r3, #16 ; 16 processed per loop. | |
+ vqadd.u8 q0, q0, q1 ; add (saturating) | |
+ MEMACCESS 2 | |
+ vst1.8 {q0}, [r2]! ; store 16 pixels. | |
+ bgt %b1 | |
+ | |
+ vpop {q0 - q1} | |
+ bx lr | |
+ ENDP | |
+ | |
+; Attenuate 8 pixels at a time: multiply B/G/R by alpha; alpha passes through. | |
+ARGBAttenuateRow_NEON PROC | |
+ ; input | |
+ ; r0 = const uint8* src_argb | |
+ ; r1 = uint8* dst_argb | |
+ ; r2 = int width | |
+ vpush {q0 - q1} | |
+ vpush {q10 - q12} | |
+ | |
+ ; Attenuate 8 pixels. | |
+1 | |
+ MEMACCESS 0 | |
+ vld4.8 {d0, d1, d2, d3}, [r0]! ; load 8 pixels of ARGB. | |
+ subs r2, r2, #8 ; 8 processed per loop. | |
+ vmull.u8 q10, d0, d3 ; b * a | |
+ vmull.u8 q11, d1, d3 ; g * a | |
+ vmull.u8 q12, d2, d3 ; r * a | |
+ vqrshrn.u16 d0, q10, #8 ; b >>= 8 | |
+ vqrshrn.u16 d1, q11, #8 ; g >>= 8 | |
+ vqrshrn.u16 d2, q12, #8 ; r >>= 8 | |
+ MEMACCESS 1 | |
+ vst4.8 {d0, d1, d2, d3}, [r1]! ; store 8 pixels of ARGB (d3 alpha unchanged). | |
+ bgt %b1 | |
+ | |
+ vpop {q10 - q12} | |
+ vpop {q0 - q1} | |
+ bx lr | |
+ ENDP | |
+ | |
+; Quantize 8 ARGB pixels (32 bytes) in place. | |
+; dst = (dst * scale >> 16) * interval_size + interval_offset; | |
+ARGBQuantizeRow_NEON PROC | |
+ ; input | |
+ ; r0 = uint8* dst_argb (read and written in place) | |
+ ; r1 = int scale | |
+ ; r2 = int interval_size | |
+ ; r3 = int interval_offset | |
+ push {r2 - r4} | |
+ ldr r4, [sp,#12] ; int width (5th arg; 12-byte push above shifts sp) | |
+ vpush {q0 - q3} | |
+ vpush {q8 - q10} | |
+ | |
+ vdup.u16 q8, r1 | |
+ vshr.u16 q8, q8, #1 ; scale >>= 1 (compensates vqdmulh doubling) | |
+ vdup.u16 q9, r2 ; interval multiply. | |
+ vdup.u16 q10, r3 ; interval add | |
+ | |
+ ; 8 pixel loop. | |
+1 | |
+ MEMACCESS 0 | |
+ vld4.8 {d0, d2, d4, d6}, [r0] ; load 8 pixels of ARGB (no writeback: in-place). | |
+ subs r4, r4, #8 ; 8 processed per loop. | |
+ vmovl.u8 q0, d0 ; b (0 .. 255) | |
+ vmovl.u8 q1, d2 | |
+ vmovl.u8 q2, d4 | |
+ vqdmulh.s16 q0, q0, q8 ; b * scale | |
+ vqdmulh.s16 q1, q1, q8 ; g | |
+ vqdmulh.s16 q2, q2, q8 ; r | |
+ vmul.u16 q0, q0, q9 ; b * interval_size | |
+ vmul.u16 q1, q1, q9 ; g | |
+ vmul.u16 q2, q2, q9 ; r | |
+ vadd.u16 q0, q0, q10 ; b + interval_offset | |
+ vadd.u16 q1, q1, q10 ; g | |
+ vadd.u16 q2, q2, q10 ; r | |
+ vqmovn.u16 d0, q0 | |
+ vqmovn.u16 d2, q1 | |
+ vqmovn.u16 d4, q2 | |
+ MEMACCESS 0 | |
+ vst4.8 {d0, d2, d4, d6}, [r0]! ; store 8 pixels of ARGB (alpha d6 untouched). | |
+ bgt %b1 | |
+ | |
+ vpop {q8 - q10} | |
+ vpop {q0 - q3} | |
+ pop {r2 - r4} | |
+ bx lr | |
+ ENDP | |
+ | |
+; Shade 8 pixels at a time by specified value. | |
+; NOTE vqrdmulh.s16 q10, q10, d0[0] must use a scalar register from 0 to 8. | |
+; Rounding in vqrdmulh does +1 to high if high bit of low s16 is set. | |
+ARGBShadeRow_NEON PROC | |
+ ; input | |
+ ; r0 = const uint8* src_argb | |
+ ; r1 = uint8* dst_argb | |
+ ; r2 = int width | |
+ ; r3 = int value (packed per-channel shade value) | |
+ vpush {q0} | |
+ vpush {q10 - q13} | |
+ | |
+ vdup.u32 q0, r3 ; duplicate scale value. | |
+ vzip.u8 d0, d1 ; d0 aarrggbb. | |
+ vshr.u16 q0, q0, #1 ; scale / 2. | |
+ | |
+ ; 8 pixel loop. | |
+1 | |
+ MEMACCESS 0 | |
+ vld4.8 {d20, d22, d24, d26}, [r0]! ; load 8 pixels of ARGB. | |
+ subs r2, r2, #8 ; 8 processed per loop. | |
+ vmovl.u8 q10, d20 ; b (0 .. 255) | |
+ vmovl.u8 q11, d22 | |
+ vmovl.u8 q12, d24 | |
+ vmovl.u8 q13, d26 | |
+ vqrdmulh.s16 q10, q10, d0[0] ; b * scale * 2 | |
+ vqrdmulh.s16 q11, q11, d0[1] ; g | |
+ vqrdmulh.s16 q12, q12, d0[2] ; r | |
+ vqrdmulh.s16 q13, q13, d0[3] ; a | |
+ vqmovn.u16 d20, q10 | |
+ vqmovn.u16 d22, q11 | |
+ vqmovn.u16 d24, q12 | |
+ vqmovn.u16 d26, q13 | |
+ MEMACCESS 1 | |
+ vst4.8 {d20, d22, d24, d26}, [r1]! ; store 8 pixels of ARGB. | |
+ bgt %b1 | |
+ | |
+ vpop {q10 - q13} | |
+ vpop {q0} | |
+ bx lr | |
+ ENDP | |
+ | |
+; Convert 8 ARGB pixels (32 bytes) to 8 Gray ARGB pixels | |
+; Similar to ARGBToYJ but stores ARGB. | |
+; C code is (15 * b + 75 * g + 38 * r + 64) >> 7; | |
+ARGBGrayRow_NEON PROC | |
+ ; input | |
+ ; r0 = const uint8* src_argb | |
+ ; r1 = uint8* dst_argb | |
+ ; r2 = int width | |
+ vpush {q0- q2} | |
+ vpush {q12 - q13} | |
+ | |
+ vmov.u8 d24, #15 ; B * 0.11400 coefficient | |
+ vmov.u8 d25, #75 ; G * 0.58700 coefficient | |
+ vmov.u8 d26, #38 ; R * 0.29900 coefficient | |
+ | |
+1 | |
+ MEMACCESS 0 | |
+ vld4.8 {d0, d1, d2, d3}, [r0]! ; load 8 ARGB pixels. | |
+ subs r2, r2, #8 ; 8 processed per loop. | |
+ vmull.u8 q2, d0, d24 ; B | |
+ vmlal.u8 q2, d1, d25 ; G | |
+ vmlal.u8 q2, d2, d26 ; R | |
+ vqrshrun.s16 d0, q2, #7 ; 15 bit to 8 bit B | |
+ vmov d1, d0 ; G (replicate gray value) | |
+ vmov d2, d0 ; R (replicate gray value) | |
+ MEMACCESS 1 | |
+ vst4.8 {d0, d1, d2, d3}, [r1]! ; store 8 ARGB pixels (alpha d3 passthrough). | |
+ bgt %b1 | |
+ | |
+ vpop {q12 - q13} | |
+ vpop {q0 - q2} | |
+ bx lr | |
+ ENDP | |
+ | |
+; Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels, in place. | |
+; b = (r * 35 + g * 68 + b * 17) >> 7 | |
+; g = (r * 45 + g * 88 + b * 22) >> 7 | |
+; r = (r * 50 + g * 98 + b * 24) >> 7 | |
+ARGBSepiaRow_NEON PROC | |
+ ; input | |
+ ; r0 = uint8* dst_argb (read and written in place) | |
+ ; r1 = int width | |
+ ; NOTE(review): q8 is clobbered in the loop but not saved; OK per AAPCS | |
+ ; (q8-q15 are caller-saved) though inconsistent with the saves below. | |
+ vpush {q0- q3} | |
+ vpush {q10 - q15} | |
+ | |
+ vmov.u8 d20, #17 ; BB coefficient | |
+ vmov.u8 d21, #68 ; BG coefficient | |
+ vmov.u8 d22, #35 ; BR coefficient | |
+ vmov.u8 d24, #22 ; GB coefficient | |
+ vmov.u8 d25, #88 ; GG coefficient | |
+ vmov.u8 d26, #45 ; GR coefficient | |
+ vmov.u8 d28, #24 ; RB coefficient | |
+ vmov.u8 d29, #98 ; RG coefficient | |
+ vmov.u8 d30, #50 ; RR coefficient | |
+ | |
+1 | |
+ MEMACCESS 0 | |
+ vld4.8 {d0, d1, d2, d3}, [r0] ; load 8 ARGB pixels (no writeback: in-place). | |
+ subs r1, r1, #8 ; 8 processed per loop. | |
+ vmull.u8 q2, d0, d20 ; B to Sepia B | |
+ vmlal.u8 q2, d1, d21 ; G | |
+ vmlal.u8 q2, d2, d22 ; R | |
+ vmull.u8 q3, d0, d24 ; B to Sepia G | |
+ vmlal.u8 q3, d1, d25 ; G | |
+ vmlal.u8 q3, d2, d26 ; R | |
+ vmull.u8 q8, d0, d28 ; B to Sepia R | |
+ vmlal.u8 q8, d1, d29 ; G | |
+ vmlal.u8 q8, d2, d30 ; R | |
+ vqshrn.u16 d0, q2, #7 ; 16 bit to 8 bit B | |
+ vqshrn.u16 d1, q3, #7 ; 16 bit to 8 bit G | |
+ vqshrn.u16 d2, q8, #7 ; 16 bit to 8 bit R | |
+ MEMACCESS 0 | |
+ vst4.8 {d0, d1, d2, d3}, [r0]! ; store 8 ARGB pixels (alpha d3 untouched). | |
+ bgt %b1 | |
+ | |
+ vpop {q10 - q15} | |
+ vpop {q0 - q3} | |
+ bx lr | |
+ ENDP | |
+ | |
+; Transform 8 ARGB pixels (32 bytes) with color matrix. | |
+; TODO(fbarchard): Was same as Sepia except matrix is provided. This function | |
+; needs to saturate. Consider doing a non-saturating version. | |
+ARGBColorMatrixRow_NEON PROC | |
+ ; input | |
+ ; r0 = const uint8* src_argb | |
+ ; r1 = uint8* dst_argb | |
+ ; r2 = const int8* matrix_argb | |
+ ; r3 = int width | |
+ ; q4-q7 (d8-d15) are callee-saved per AAPCS; saved via vpush {q0 - q7}. | |
+ vpush {q0 - q7} | |
+ vpush {q8 - q14} | |
+ vpush {q15} | |
+ | |
+ MEMACCESS 3 | |
+ vld1.8 {q2}, [r2] ; load 16 s8 matrix coefficients from matrix_argb. | |
+ vmovl.s8 q0, d4 ; B,G coefficients s16. | |
+ vmovl.s8 q1, d5 ; R,A coefficients s16. | |
+ | |
+1 | |
+ MEMACCESS 0 | |
+ vld4.8 {d16, d18, d20, d22}, [r0]! ; load 8 ARGB pixels. | |
+ subs r3, r3, #8 ; 8 processed per loop. | |
+ vmovl.u8 q8, d16 ; b (0 .. 255) 16 bit | |
+ vmovl.u8 q9, d18 ; g | |
+ vmovl.u8 q10, d20 ; r | |
+ vmovl.u8 q11, d22 ; a | |
+ vmul.s16 q12, q8, d0[0] ; B = B * Matrix B | |
+ vmul.s16 q13, q8, d1[0] ; G = B * Matrix G | |
+ vmul.s16 q14, q8, d2[0] ; R = B * Matrix R | |
+ vmul.s16 q15, q8, d3[0] ; A = B * Matrix A | |
+ vmul.s16 q4, q9, d0[1] ; B += G * Matrix B | |
+ vmul.s16 q5, q9, d1[1] ; G += G * Matrix G | |
+ vmul.s16 q6, q9, d2[1] ; R += G * Matrix R | |
+ vmul.s16 q7, q9, d3[1] ; A += G * Matrix A | |
+ vqadd.s16 q12, q12, q4 ; Accumulate B | |
+ vqadd.s16 q13, q13, q5 ; Accumulate G | |
+ vqadd.s16 q14, q14, q6 ; Accumulate R | |
+ vqadd.s16 q15, q15, q7 ; Accumulate A | |
+ vmul.s16 q4, q10, d0[2] ; B += R * Matrix B | |
+ vmul.s16 q5, q10, d1[2] ; G += R * Matrix G | |
+ vmul.s16 q6, q10, d2[2] ; R += R * Matrix R | |
+ vmul.s16 q7, q10, d3[2] ; A += R * Matrix A | |
+ vqadd.s16 q12, q12, q4 ; Accumulate B | |
+ vqadd.s16 q13, q13, q5 ; Accumulate G | |
+ vqadd.s16 q14, q14, q6 ; Accumulate R | |
+ vqadd.s16 q15, q15, q7 ; Accumulate A | |
+ vmul.s16 q4, q11, d0[3] ; B += A * Matrix B | |
+ vmul.s16 q5, q11, d1[3] ; G += A * Matrix G | |
+ vmul.s16 q6, q11, d2[3] ; R += A * Matrix R | |
+ vmul.s16 q7, q11, d3[3] ; A += A * Matrix A | |
+ vqadd.s16 q12, q12, q4 ; Accumulate B | |
+ vqadd.s16 q13, q13, q5 ; Accumulate G | |
+ vqadd.s16 q14, q14, q6 ; Accumulate R | |
+ vqadd.s16 q15, q15, q7 ; Accumulate A | |
+ vqshrun.s16 d16, q12, #6 ; 16 bit to 8 bit B | |
+ vqshrun.s16 d18, q13, #6 ; 16 bit to 8 bit G | |
+ vqshrun.s16 d20, q14, #6 ; 16 bit to 8 bit R | |
+ vqshrun.s16 d22, q15, #6 ; 16 bit to 8 bit A | |
+ MEMACCESS 1 | |
+ vst4.8 {d16, d18, d20, d22}, [r1]! ; store 8 ARGB pixels. | |
+ bgt %b1 | |
+ | |
+ vpop {q15} | |
+ vpop {q8 - q14} | |
+ vpop {q0 - q7} | |
+ bx lr | |
+ ENDP | |
+ | |
+ ; Alpha-blend src_argb0 over src_argb1: | |
+ ; dr * (256 - sa) / 256 + sr = dr - dr * sa / 256 + sr | |
+ARGBBlendRow_NEON PROC | |
+ ; input | |
+ ; r0 = const uint8* src_argb0 | |
+ ; r1 = const uint8* src_argb1 | |
+ ; r2 = uint8* dst_argb | |
+ ; r3 = int width | |
+ vpush {q0 - q3} | |
+ vpush {q10 - q12} | |
+ | |
+ subs r3, #8 | |
+ blt %f89 ; fewer than 8 pixels: go straight to 1-at-a-time loop | |
+ ; Blend 8 pixels. | |
+8 | |
+ MEMACCESS 0 | |
+ vld4.8 {d0, d1, d2, d3}, [r0]! ; load 8 pixels of ARGB0. | |
+ MEMACCESS 1 | |
+ vld4.8 {d4, d5, d6, d7}, [r1]! ; load 8 pixels of ARGB1. | |
+ subs r3, r3, #8 ; 8 processed per loop. | |
+ vmull.u8 q10, d4, d3 ; db * a | |
+ vmull.u8 q11, d5, d3 ; dg * a | |
+ vmull.u8 q12, d6, d3 ; dr * a | |
+ vqrshrn.u16 d20, q10, #8 ; db >>= 8 | |
+ vqrshrn.u16 d21, q11, #8 ; dg >>= 8 | |
+ vqrshrn.u16 d22, q12, #8 ; dr >>= 8 | |
+ vqsub.u8 q2, q2, q10 ; dbg - dbg * a / 256 | |
+ vqsub.u8 d6, d6, d22 ; dr - dr * a / 256 | |
+ vqadd.u8 q0, q0, q2 ; + sbg | |
+ vqadd.u8 d2, d2, d6 ; + sr | |
+ vmov.u8 d3, #255 ; a = 255 | |
+ MEMACCESS 2 | |
+ vst4.8 {d0, d1, d2, d3}, [r2]! ; store 8 pixels of ARGB. | |
+ bge %b8 | |
+ | |
+89 | |
+ adds r3, #8-1 ; undo the loop bias; remaining count - 1 | |
+ blt %f99 | |
+ | |
+ ; Blend 1 pixels. | |
+1 | |
+ MEMACCESS 0 | |
+ vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r0]! ; load 1 pixel ARGB0. | |
+ MEMACCESS 1 | |
+ vld4.8 {d4[0],d5[0],d6[0],d7[0]}, [r1]! ; load 1 pixel ARGB1. | |
+ subs r3, r3, #1 ; 1 processed per loop. | |
+ vmull.u8 q10, d4, d3 ; db * a | |
+ vmull.u8 q11, d5, d3 ; dg * a | |
+ vmull.u8 q12, d6, d3 ; dr * a | |
+ vqrshrn.u16 d20, q10, #8 ; db >>= 8 | |
+ vqrshrn.u16 d21, q11, #8 ; dg >>= 8 | |
+ vqrshrn.u16 d22, q12, #8 ; dr >>= 8 | |
+ vqsub.u8 q2, q2, q10 ; dbg - dbg * a / 256 | |
+ vqsub.u8 d6, d6, d22 ; dr - dr * a / 256 | |
+ vqadd.u8 q0, q0, q2 ; + sbg | |
+ vqadd.u8 d2, d2, d6 ; + sr | |
+ vmov.u8 d3, #255 ; a = 255 | |
+ MEMACCESS 2 | |
+ vst4.8 {d0[0],d1[0],d2[0],d3[0]}, [r2]! ; store 1 pixel. | |
+ bge %b1 | |
+ | |
+99 | |
+ | |
+ vpop {q10 - q12} | |
+ vpop {q0 - q3} | |
+ bx lr | |
+ ENDP | |
+ | |
+; Bilinear filter 16x2 -> 16x1 | |
+InterpolateRow_NEON PROC | |
+ ; input | |
+ ; r0 = uint8* dst_ptr | |
+ ; r1 = const uint8* src_ptr | |
+ ; r2 = ptrdiff_t src_stride (becomes pointer to second row below) | |
+ ; r3 = int dst_width | |
+ push {r4} | |
+ ldr r4, [sp,#4] ; int source_y_fraction (5th arg, 0..256) | |
+ vpush {q0 - q1} | |
+ vpush {d4 - d5} | |
+ vpush {q13 - q14} | |
+ | |
+ cmp r4, #0 ; fraction 0: plain copy of row 0 | |
+ beq %f100 | |
+ add r2, r1 ; r2 = src_ptr + src_stride (second source row) | |
+ cmp r4, #64 ; special-case 64/256 -> 75/25 blend | |
+ beq %f75 | |
+ cmp r4, #128 ; special-case 128/256 -> 50/50 blend | |
+ beq %f50 | |
+ cmp r4, #192 ; special-case 192/256 -> 25/75 blend | |
+ beq %f25 | |
+ | |
+ vdup.8 d5, r4 ; weight of row 1 | |
+ rsb r4, #256 ; r4 = 256 - fraction | |
+ vdup.8 d4, r4 ; weight of row 0 | |
+ ; General purpose row blend. | |
+1 | |
+ MEMACCESS 1 | |
+ vld1.8 {q0}, [r1]! | |
+ MEMACCESS 2 | |
+ vld1.8 {q1}, [r2]! | |
+ subs r3, r3, #16 | |
+ vmull.u8 q13, d0, d4 | |
+ vmull.u8 q14, d1, d4 | |
+ vmlal.u8 q13, d2, d5 | |
+ vmlal.u8 q14, d3, d5 | |
+ vrshrn.u16 d0, q13, #8 | |
+ vrshrn.u16 d1, q14, #8 | |
+ MEMACCESS 0 | |
+ vst1.8 {q0}, [r0]! | |
+ bgt %b1 | |
+ b %f99 | |
+ | |
+ ; Blend 25 / 75. | |
+25 | |
+ MEMACCESS 1 | |
+ vld1.8 {q0}, [r1]! | |
+ MEMACCESS 2 | |
+ vld1.8 {q1}, [r2]! | |
+ subs r3, r3, #16 | |
+ vrhadd.u8 q0, q1 ; two rounding half-adds approximate 1/4:3/4 | |
+ vrhadd.u8 q0, q1 | |
+ MEMACCESS 0 | |
+ vst1.8 {q0}, [r0]! | |
+ bgt %b25 | |
+ b %f99 | |
+ | |
+ ; Blend 50 / 50. | |
+50 | |
+ MEMACCESS 1 | |
+ vld1.8 {q0}, [r1]! | |
+ MEMACCESS 2 | |
+ vld1.8 {q1}, [r2]! | |
+ subs r3, r3, #16 | |
+ vrhadd.u8 q0, q1 | |
+ MEMACCESS 0 | |
+ vst1.8 {q0}, [r0]! | |
+ bgt %b50 | |
+ b %f99 | |
+ | |
+ ; Blend 75 / 25. | |
+75 | |
+ MEMACCESS 1 | |
+ vld1.8 {q1}, [r1]! | |
+ MEMACCESS 2 | |
+ vld1.8 {q0}, [r2]! | |
+ subs r3, r3, #16 | |
+ vrhadd.u8 q0, q1 | |
+ vrhadd.u8 q0, q1 | |
+ MEMACCESS 0 | |
+ vst1.8 {q0}, [r0]! | |
+ bgt %b75 | |
+ b %f99 | |
+ | |
+ ; Blend 100 / 0 - Copy row unchanged. | |
+100 | |
+ MEMACCESS 1 | |
+ vld1.8 {q0}, [r1]! | |
+ subs r3, r3, #16 | |
+ MEMACCESS 0 | |
+ vst1.8 {q0}, [r0]! | |
+ bgt %b100 | |
+ | |
+99 | |
+ | |
+ vpop {q13 - q14} | |
+ vpop {d4 - d5} | |
+ vpop {q0 - q1} | |
+ pop {r4} | |
+ bx lr | |
+ ENDP | |
+ | |
+; Multiply 2 rows of ARGB pixels together, 8 pixels at a time. | |
+ARGBMultiplyRow_NEON PROC | |
+ ; input | |
+ ; r0 = const uint8* src_argb0 | |
+ ; r1 = const uint8* src_argb1 | |
+ ; r2 = uint8* dst_argb | |
+ ; r3 = int width | |
+ vpush {q0 - q3} | |
+ | |
+ ; 8 pixel loop. | |
+1 | |
+ MEMACCESS 0 | |
+ vld4.8 {d0, d2, d4, d6}, [r0]! ; load 8 ARGB pixels. | |
+ MEMACCESS 1 | |
+ vld4.8 {d1, d3, d5, d7}, [r1]! ; load 8 more ARGB pixels. | |
+ subs r3, r3, #8 ; 8 processed per loop. | |
+ vmull.u8 q0, d0, d1 ; multiply B | |
+ vmull.u8 q1, d2, d3 ; multiply G | |
+ vmull.u8 q2, d4, d5 ; multiply R | |
+ vmull.u8 q3, d6, d7 ; multiply A | |
+ vrshrn.u16 d0, q0, #8 ; 16 bit to 8 bit B | |
+ vrshrn.u16 d1, q1, #8 ; 16 bit to 8 bit G | |
+ vrshrn.u16 d2, q2, #8 ; 16 bit to 8 bit R | |
+ vrshrn.u16 d3, q3, #8 ; 16 bit to 8 bit A | |
+ MEMACCESS 2 | |
+ vst4.8 {d0, d1, d2, d3}, [r2]! ; store 8 ARGB pixels. | |
+ bgt %b1 | |
+ | |
+ vpop {q0 - q3} | |
+ bx lr | |
+ ENDP | |
+ | |
+; SobelX as a matrix is | |
+; -1 0 1 | |
+; -2 0 2 | |
+; -1 0 1 | |
+SobelXRow_NEON PROC | |
+ ; input | |
+ ; r0 = const uint8* src_y0 | |
+ ; r1 = const uint8* src_y1 | |
+ ; r2 = const uint8* src_y2 | |
+ ; r3 = uint8* dst_sobelx | |
+ push {r4 - r6} | |
+ ldr r4, [sp,#12] ; int width (5th arg; 12-byte push above shifts sp) | |
+ vpush {q0 - q1} | |
+ mov r5, #2 ; post-increment: column 0 -> column 2 | |
+ mov r6, #6 ; post-increment: column 2 -> next 8-pixel group | |
+ | |
+1 | |
+ MEMACCESS 0 | |
+ vld1.8 {d0}, [r0],r5 ; top | |
+ MEMACCESS 0 | |
+ vld1.8 {d1}, [r0],r6 | |
+ vsubl.u8 q0, d0, d1 | |
+ MEMACCESS 1 | |
+ vld1.8 {d2}, [r1],r5 ; center * 2 | |
+ MEMACCESS 1 | |
+ vld1.8 {d3}, [r1],r6 | |
+ vsubl.u8 q1, d2, d3 | |
+ vadd.s16 q0, q0, q1 ; add center difference twice (weight 2) | |
+ vadd.s16 q0, q0, q1 | |
+ MEMACCESS 2 | |
+ vld1.8 {d2}, [r2],r5 ; bottom | |
+ MEMACCESS 2 | |
+ vld1.8 {d3}, [r2],r6 | |
+ subs r4, r4, #8 ; 8 pixels | |
+ vsubl.u8 q1, d2, d3 | |
+ vadd.s16 q0, q0, q1 | |
+ vabs.s16 q0, q0 ; magnitude only | |
+ vqmovn.u16 d0, q0 ; saturate to u8 | |
+ MEMACCESS 3 | |
+ vst1.8 {d0}, [r3]! ; store 8 sobelx | |
+ bgt %b1 | |
+ | |
+ vpop {q0 - q1} | |
+ pop {r4 - r6} | |
+ bx lr | |
+ ENDP | |
+ | |
+; SobelY as a matrix is | |
+; -1 -2 -1 | |
+; 0 0 0 | |
+; 1 2 1 | |
+SobelYRow_NEON PROC | |
+ ; input | |
+ ; r0 = const uint8* src_y0 | |
+ ; r1 = const uint8* src_y1 | |
+ ; r2 = uint8* dst_sobely | |
+ ; r3 = int width | |
+ vpush {q0 - q1} | |
+ push {r4 - r5} | |
+ mov r4, #1 ; post-increment: advance one column | |
+ mov r5, #6 ; post-increment: column 2 -> next 8-pixel group | |
+ | |
+1 | |
+ MEMACCESS 0 | |
+ vld1.8 {d0}, [r0],r4 ; left | |
+ MEMACCESS 1 | |
+ vld1.8 {d1}, [r1],r4 | |
+ vsubl.u8 q0, d0, d1 | |
+ MEMACCESS 0 | |
+ vld1.8 {d2}, [r0],r4 ; center * 2 | |
+ MEMACCESS 1 | |
+ vld1.8 {d3}, [r1],r4 | |
+ vsubl.u8 q1, d2, d3 | |
+ vadd.s16 q0, q0, q1 ; add center difference twice (weight 2) | |
+ vadd.s16 q0, q0, q1 | |
+ MEMACCESS 0 | |
+ vld1.8 {d2}, [r0],r5 ; right | |
+ MEMACCESS 1 | |
+ vld1.8 {d3}, [r1],r5 | |
+ subs r3, r3, #8 ; 8 pixels | |
+ vsubl.u8 q1, d2, d3 | |
+ vadd.s16 q0, q0, q1 | |
+ vabs.s16 q0, q0 ; magnitude only | |
+ vqmovn.u16 d0, q0 ; saturate to u8 | |
+ MEMACCESS 2 | |
+ vst1.8 {d0}, [r2]! ; store 8 sobely | |
+ bgt %b1 | |
+ | |
+ pop {r4 - r5} | |
+ vpop {q0 - q1} | |
+ bx lr | |
+ ENDP | |
+ | |
+; Mixes Sobel X, Sobel Y and Sobel into ARGB. | |
+; A = 255 | |
+; R = Sobel X | |
+; G = Sobel | |
+; B = Sobel Y | |
+SobelXYRow_NEON PROC | |
+ ; input | |
+ ; r0 = const uint8* src_sobelx | |
+ ; r1 = const uint8* src_sobely | |
+ ; r2 = uint8* dst_argb | |
+ ; r3 = int width | |
+ vpush {q0 - q1} | |
+ | |
+ vmov.u8 d3, #255 ; alpha | |
+ ; 8 pixel loop. | |
+ | |
+1 | |
+ MEMACCESS 0 | |
+ vld1.8 {d2}, [r0]! ; load 8 sobelx (stored as R). | |
+ MEMACCESS 1 | |
+ vld1.8 {d0}, [r1]! ; load 8 sobely (stored as B). | |
+ subs r3, r3, #8 ; 8 processed per loop. | |
+ vqadd.u8 d1, d0, d2 ; add (saturating); combined value stored as G. | |
+ MEMACCESS 2 | |
+ vst4.8 {d0, d1, d2, d3}, [r2]! ; store 8 ARGB pixels. | |
+ bgt %b1 | |
+ | |
+ vpop {q0 - q1} | |
+ bx lr | |
+ ENDP | |
+ | |
+ | |
+ END | |
+ | |
+ | |
diff --git a/source/row_neon.cc b/source/row_neon.cc | |
index bed14e0..3450302 100644 | |
--- a/source/row_neon.cc | |
+++ b/source/row_neon.cc | |
@@ -17,6 +17,15 @@ namespace libyuv { | |
extern "C" { | |
#endif | |
+ /* !!! IMPORTANT: The following methods have been ported to pure assembler in row_neon.asm, | |
+ * because MS Visual Studio does not support inline assembler for ARM. | |
+ * | |
+ * ANY CHANGE TO THE IMPLEMENTATION OF THESE METHODS MUST ALSO BE MADE IN row_neon.asm. | |
+ * | |
+ * Eventually, only the pure assembler implementation should be used on all platforms | |
+ * to avoid code duplication. | |
+ */ | |
+ | |
// This module is for GCC Neon | |
#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \ | |
!defined(__aarch64__) | |
diff --git a/source/scale_neon.asm b/source/scale_neon.asm | |
new file mode 100644 | |
index 0000000..8ede0ec | |
--- /dev/null | |
+++ b/source/scale_neon.asm | |
@@ -0,0 +1,970 @@ | |
+; | |
+; Copyright 2012 The LibYuv Project Authors. All rights reserved. | |
+; | |
+; Use of this source code is governed by a BSD-style license | |
+; that can be found in the LICENSE file in the root of the source | |
+; tree. An additional intellectual property rights grant can be found | |
+; in the file PATENTS. All contributing project authors may | |
+; be found in the AUTHORS file in the root of the source tree. | |
+; | |
+ | |
+ AREA |.text|, CODE, READONLY, ALIGN=2 | |
+ | |
+ GET source/arm_asm_macros.in | |
+ | |
+ EXPORT ScaleRowDown2_NEON | |
+ EXPORT ScaleRowDown2Linear_NEON | |
+ EXPORT ScaleRowDown2Box_NEON | |
+ EXPORT ScaleRowDown4_NEON | |
+ EXPORT ScaleRowDown4Box_NEON | |
+ EXPORT ScaleRowDown34_NEON | |
+ EXPORT ScaleRowDown34_0_Box_NEON | |
+ EXPORT ScaleRowDown34_1_Box_NEON | |
+ EXPORT ScaleRowDown38_NEON | |
+ EXPORT ScaleRowDown38_3_Box_NEON | |
+ EXPORT ScaleRowDown38_2_Box_NEON | |
+ EXPORT ScaleAddRows_NEON | |
+ EXPORT ScaleFilterCols_NEON | |
+ EXPORT ScaleARGBRowDown2_NEON | |
+ EXPORT ScaleARGBRowDown2Linear_NEON | |
+ EXPORT ScaleARGBRowDown2Box_NEON | |
+ EXPORT ScaleARGBRowDownEven_NEON | |
+ EXPORT ScaleARGBRowDownEvenBox_NEON | |
+ EXPORT ScaleARGBCols_NEON ; note: was listed twice; duplicate removed | |
+ EXPORT ScaleARGBFilterCols_NEON | |
+ | |
+; Constant tables, kept in .text so adr can reach them. | |
+kShuf38 DCB 0, 3, 6, 8, 11, 14, 16, 19, 22, 24, 27, 30, 0, 0, 0, 0 | |
+kShuf38_2 DCB 0, 8, 16, 2, 10, 17, 4, 12, 18, 6, 14, 19, 0, 0, 0, 0 | |
+;vec16 kMult38_Div6 = { 65536 / 12, ... } = 0x1555 per lane | |
+kMult38_Div6 DCW 0x1555, 0x1555, 0x1555, 0x1555, 0x1555, 0x1555, 0x1555, 0x1555 | |
+;vec16 kMult38_Div9 = { 65536 / 18, ... } = 0xe38 per lane | |
+kMult38_Div9 DCW 0xe38, 0xe38, 0xe38, 0xe38, 0xe38, 0xe38, 0xe38, 0xe38 | |
+ | |
+ | |
+; Read 32x1 throw away even pixels, and write 16x1 | |
+ScaleRowDown2_NEON PROC | |
+ ; input | |
+ ; r0 = uint8* src_ptr | |
+ ; r1 = src_stride (unused) | |
+ ; r2 = uint8* dst | |
+ ; r3 = int dst_width | |
+ vpush {q0, q1} | |
+1 | |
+ ; load even pixels into q0, odd into q1 | |
+ MEMACCESS 0 | |
+ vld2.8 {q0, q1}, [r0]! | |
+ subs r3, r3, #16 ; 16 processed per loop | |
+ MEMACCESS 1 | |
+ vst1.8 {q1}, [r2]! ; store odd pixels | |
+ bgt %b1 | |
+ | |
+ vpop {q0, q1} | |
+ | |
+ bx lr | |
+ ENDP | |
+ | |
+; Read 32x1 average down and write 16x1. | |
+ScaleRowDown2Linear_NEON PROC | |
+ ; input | |
+ ; r0 = uint8* src_ptr | |
+ ; r1 = src_stride (unused) | |
+ ; r2 = uint8* dst | |
+ ; r3 = int dst_width | |
+ vpush {q0, q1} | |
+1 | |
+ MEMACCESS 0 | |
+ vld1.8 {q0, q1}, [r0]! ; load pixels and post inc | |
+ subs r3, r3, #16 ; 16 processed per loop | |
+ vpaddl.u8 q0, q0 ; add adjacent | |
+ vpaddl.u8 q1, q1 | |
+ vrshrn.u16 d0, q0, #1 ; downshift, round and pack | |
+ vrshrn.u16 d1, q1, #1 | |
+ MEMACCESS 1 | |
+ vst1.8 {q0}, [r2]! | |
+ bgt %b1 | |
+ vpop {q0, q1} | |
+ | |
+ bx lr | |
+ ENDP | |
+ | |
+; Read 32x2 average down and write 16x1 | |
+ScaleRowDown2Box_NEON PROC | |
+ ; input | |
+ ; r0 = uint8* src_ptr | |
+ ; r1 = src_stride | |
+ ; r2 = uint8* dst | |
+ ; r3 = int dst_width | |
+ ; 2x2 box filter: average 4 source pixels per output pixel. | |
+ vpush {q0, q1, q2, q3} | |
+ add r1, r0 ; r1 = src_ptr + src_stride (second source row) | |
+1 | |
+ MEMACCESS 0 | |
+ vld1.8 {q0, q1}, [r0]! ; load row 1 and post inc | |
+ MEMACCESS 1 | |
+ vld1.8 {q2, q3}, [r1]! ; load row 2 and post inc | |
+ subs r3, r3, #16 ; 16 processed per loop | |
+ vpaddl.u8 q0, q0 ; row 1 add adjacent | |
+ vpaddl.u8 q1, q1 | |
+ vpadal.u8 q0, q2 ; row 2 add adjacent + row1 | |
+ vpadal.u8 q1, q3 | |
+ vrshrn.u16 d0, q0, #2 ; downshift, round and pack | |
+ vrshrn.u16 d1, q1, #2 | |
+ MEMACCESS 2 | |
+ vst1.8 {q0}, [r2]! | |
+ bgt %b1 | |
+ vpop {q0, q1, q2, q3} | |
+ | |
+ bx lr | |
+ ENDP | |
+ | |
+; Point-sample every 4th pixel: read 32x1, write 8x1. | |
+ScaleRowDown4_NEON PROC | |
+ ; input | |
+ ; r0 = uint8* src_ptr | |
+ ; r1 = src_stride (unused) | |
+ ; r2 = uint8* dst_ptr | |
+ ; r3 = int dst_width | |
+ vpush {q0, q1} | |
+ | |
+1 | |
+ MEMACCESS 0 | |
+ vld4.8 {d0, d1, d2, d3}, [r0]! ; src line 0 (deinterleave by 4) | |
+ subs r3, r3, #8 ; 8 processed per loop | |
+ MEMACCESS 1 | |
+ vst1.8 {d2}, [r2]! ; keep every 4th pixel (lane 2) | |
+ bgt %b1 | |
+ | |
+ vpop {q0, q1} | |
+ bx lr | |
+ ENDP | |
+ | |
+; 4x4 box filter: average 16 source pixels per output pixel. | |
+ScaleRowDown4Box_NEON PROC | |
+ ; input | |
+ ; r0 = uint8* src_ptr | |
+ ; r1 = src_stride | |
+ ; r2 = uint8* dst_ptr | |
+ ; r3 = int dst_width | |
+ push {r4-r6} | |
+ vpush {q0-q3} | |
+ add r4, r0, r1 ; src_ptr + src_stride | |
+ add r5, r4, r1 ; src_ptr + src_stride * 2 | |
+ add r6, r5, r1 ; src_ptr + src_stride * 3 | |
+ | |
+1 | |
+ MEMACCESS 0 | |
+ vld1.8 {q0}, [r0]! ; load up 16x4 | |
+ MEMACCESS 3 | |
+ vld1.8 {q1}, [r4]! | |
+ MEMACCESS 4 | |
+ vld1.8 {q2}, [r5]! | |
+ MEMACCESS 5 | |
+ vld1.8 {q3}, [r6]! | |
+ subs r3, r3, #4 | |
+ vpaddl.u8 q0, q0 ; pairwise add across the 4 rows | |
+ vpadal.u8 q0, q1 | |
+ vpadal.u8 q0, q2 | |
+ vpadal.u8 q0, q3 | |
+ vpaddl.u16 q0, q0 ; pairwise add columns | |
+ vrshrn.u32 d0, q0, #4 ; divide by 16 w/rounding | |
+ vmovn.u16 d0, q0 | |
+ MEMACCESS 1 | |
+ vst1.32 {d0[0]}, [r2]! ; store 4 output pixels | |
+ bgt %b1 | |
+ | |
+ vpop {q0-q3} | |
+ pop {r4-r6} | |
+ bx lr | |
+ ENDP | |
+ | |
+; 3/4 point sampler: read 32x1, write 24x1 (drop every 4th pixel). | |
+ScaleRowDown34_NEON PROC | |
+ ; input | |
+ ; r0 = uint8* src_ptr | |
+ ; r1 = src_stride (unused) | |
+ ; r2 = uint8* dst_ptr | |
+ ; r3 = int dst_width | |
+ vpush {d0-d3} | |
+ | |
+1 | |
+ MEMACCESS 0 | |
+ vld4.8 {d0, d1, d2, d3}, [r0]! ; src line 0 | |
+ subs r3, r3, #24 | |
+ vmov d2, d3 ; order d0, d1, d2 | |
+ MEMACCESS 1 | |
+ vst3.8 {d0, d1, d2}, [r2]! | |
+ bgt %b1 | |
+ | |
+ vpop {d0-d3} | |
+ bx lr | |
+ ENDP | |
+ | |
+; 3/4 box filter, first variant: weights rows 3:1 then filters columns. | |
+ScaleRowDown34_0_Box_NEON PROC | |
+ ; input | |
+ ; r0 = uint8* src_ptr | |
+ ; r1 = src_stride | |
+ ; r2 = uint8* dst_ptr | |
+ ; r3 = int dst_width | |
+ vpush {q0-q3} | |
+ vpush {q8-q11} | |
+ vpush {d24} | |
+ | |
+ vmov.u8 d24, #3 ; weight constant (saved above, so safe to clobber) | |
+ add r1, r0 ; r1 = src_ptr + src_stride (second row) | |
+1 | |
+ MEMACCESS 0 | |
+ vld4.8 {d0, d1, d2, d3}, [r0]! ; src line 0 | |
+ MEMACCESS 3 | |
+ vld4.8 {d4, d5, d6, d7}, [r1]! ; src line 1 | |
+ subs r3, r3, #24 | |
+ | |
+ ; filter src line 0 with src line 1 | |
+ ; expand chars to shorts to allow for room | |
+ ; when adding lines together | |
+ vmovl.u8 q8, d4 | |
+ vmovl.u8 q9, d5 | |
+ vmovl.u8 q10, d6 | |
+ vmovl.u8 q11, d7 | |
+ | |
+ ; 3 * line_0 + line_1 | |
+ vmlal.u8 q8, d0, d24 | |
+ vmlal.u8 q9, d1, d24 | |
+ vmlal.u8 q10, d2, d24 | |
+ vmlal.u8 q11, d3, d24 | |
+ | |
+ ; (3 * line_0 + line_1) >> 2 | |
+ vqrshrn.u16 d0, q8, #2 | |
+ vqrshrn.u16 d1, q9, #2 | |
+ vqrshrn.u16 d2, q10, #2 | |
+ vqrshrn.u16 d3, q11, #2 | |
+ | |
+ ; a0 = (src[0] * 3 + s[1] * 1) >> 2 | |
+ vmovl.u8 q8, d1 | |
+ vmlal.u8 q8, d0, d24 | |
+ vqrshrn.u16 d0, q8, #2 | |
+ | |
+ ; a1 = (src[1] * 1 + s[2] * 1) >> 1 | |
+ vrhadd.u8 d1, d1, d2 | |
+ | |
+ ; a2 = (src[2] * 1 + s[3] * 3) >> 2 | |
+ vmovl.u8 q8, d2 | |
+ vmlal.u8 q8, d3, d24 | |
+ vqrshrn.u16 d2, q8, #2 | |
+ | |
+ MEMACCESS 1 | |
+ vst3.8 {d0, d1, d2}, [r2]! | |
+ | |
+ bgt %b1 | |
+ | |
+ | |
+ vpop {d24} | |
+ vpop {q8-q11} | |
+ vpop {q0-q3} | |
+ bx lr | |
+ ENDP | |
+ | |
+; 3/4 box filter, second variant: averages rows 1:1 then filters columns. | |
+ScaleRowDown34_1_Box_NEON PROC | |
+ ; input | |
+ ; r0 = uint8* src_ptr | |
+ ; r1 = src_stride | |
+ ; r2 = uint8* dst_ptr | |
+ ; r3 = int dst_width | |
+ vpush {q0-q3} | |
+ vpush {d24} | |
+ vmov.u8 d24, #3 ; weight constant (saved above, so safe to clobber) | |
+ add r1, r0 ; r1 = src_ptr + src_stride (second row) | |
+1 | |
+ MEMACCESS 0 | |
+ vld4.8 {d0, d1, d2, d3}, [r0]! ; src line 0 | |
+ MEMACCESS 3 | |
+ vld4.8 {d4, d5, d6, d7}, [r1]! ; src line 1 | |
+ subs r3, r3, #24 | |
+ ; average src line 0 with src line 1 | |
+ vrhadd.u8 q0, q0, q2 | |
+ vrhadd.u8 q1, q1, q3 | |
+ | |
+ ; a0 = (src[0] * 3 + s[1] * 1) >> 2 | |
+ vmovl.u8 q3, d1 | |
+ vmlal.u8 q3, d0, d24 | |
+ vqrshrn.u16 d0, q3, #2 | |
+ | |
+ ; a1 = (src[1] * 1 + s[2] * 1) >> 1 | |
+ vrhadd.u8 d1, d1, d2 | |
+ | |
+ ; a2 = (src[2] * 1 + s[3] * 3) >> 2 | |
+ vmovl.u8 q3, d2 | |
+ vmlal.u8 q3, d3, d24 | |
+ vqrshrn.u16 d2, q3, #2 | |
+ | |
+ MEMACCESS 1 | |
+ vst3.8 {d0, d1, d2}, [r2]! | |
+ bgt %b1 | |
+ | |
+ vpop {d24} | |
+ vpop {q0-q3} | |
+ bx lr | |
+ ENDP | |
+ | |
+; 3/8 point sampler: read 32x1, write 12x1 via table shuffle. | |
+ScaleRowDown38_NEON PROC | |
+ ; input | |
+ ; r0 = uint8* src_ptr | |
+ ; r1 = src_stride (unused) | |
+ ; r2 = uint8* dst_ptr | |
+ ; r3 = int dst_width | |
+ ; Save/restore in strict LIFO order (the previous version popped d0-d5 | |
+ ; before r4 even though r4 was pushed last, corrupting the stack). | |
+ ; q2/q3 (d4-d7) are also clobbered below, so save all of d0-d7. | |
+ push {r4} | |
+ vpush {q0 - q3} | |
+ | |
+ adr r4, kShuf38 ; NOTE(review): adr has limited range; confirm kShuf38 is reachable | |
+ | |
+ vld1.8 {q3}, [r4] ; shuffle table -> d6/d7 | |
+1 | |
+ MEMACCESS 0 | |
+ vld1.8 {d0, d1, d2, d3}, [r0]! | |
+ subs r3, r3, #12 | |
+ vtbl.u8 d4, {d0, d1, d2, d3}, d6 | |
+ vtbl.u8 d5, {d0, d1, d2, d3}, d7 | |
+ MEMACCESS 1 | |
+ vst1.8 {d4}, [r2]! ; store first 8 output pixels | |
+ MEMACCESS 1 | |
+ vst1.32 {d5[0]}, [r2]! ; store remaining 4 output pixels | |
+ bgt %b1 | |
+ | |
+ vpop {q0 - q3} | |
+ pop {r4} | |
+ bx lr | |
+ ENDP | |
+ | |
+; 3/8 box filter over 3 source rows: read 32x3, write 12x1. | |
+ScaleRowDown38_3_Box_NEON PROC | |
+ ; input | |
+ ; r0 = uint8* src_ptr | |
+ ; r1 = src_stride | |
+ ; r2 = uint8* dst_ptr | |
+ ; r3 = int dst_width | |
+ vpush {q0-q3} | |
+ vpush {q8, q9} | |
+ vpush {q13-q15} | |
+ push {r4-r7} | |
+ add r4, r0, r1 | |
+ add r4, r4, r1 ; src_ptr + src_stride * 2 | |
+ adr r5, kMult38_Div6 ; NOTE(review): adr has limited range; confirm labels are reachable | |
+ adr r6, kShuf38_2 | |
+ adr r7, kMult38_Div9 | |
+ | |
+ MEMACCESS 5 | |
+ vld1.16 {q13}, [r5] ; 65536/12 multipliers | |
+ MEMACCESS 6 | |
+ vld1.8 {q14}, [r6] ; output shuffle table | |
+ MEMACCESS 7 | |
+ vld1.8 {q15}, [r7] ; 65536/18 multipliers | |
+ add r1, r0 ; r1 = src_ptr + src_stride (second row) | |
+1 | |
+ ; d0 = 00 40 01 41 02 42 03 43 | |
+ ; d1 = 10 50 11 51 12 52 13 53 | |
+ ; d2 = 20 60 21 61 22 62 23 63 | |
+ ; d3 = 30 70 31 71 32 72 33 73 | |
+ MEMACCESS 0 | |
+ vld4.8 {d0, d1, d2, d3}, [r0]! | |
+ MEMACCESS 3 | |
+ vld4.8 {d4, d5, d6, d7}, [r1]! | |
+ MEMACCESS 4 | |
+ vld4.8 {d16, d17, d18, d19}, [r4]! | |
+ subs r3, r3, #12 | |
+ | |
+ ; Shuffle the input data around to get align the data | |
+ ; so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7 | |
+ ; d0 = 00 10 01 11 02 12 03 13 | |
+ ; d1 = 40 50 41 51 42 52 43 53 | |
+ vtrn.u8 d0, d1 | |
+ vtrn.u8 d4, d5 | |
+ vtrn.u8 d16, d17 | |
+ | |
+ ; d2 = 20 30 21 31 22 32 23 33 | |
+ ; d3 = 60 70 61 71 62 72 63 73 | |
+ vtrn.u8 d2, d3 | |
+ vtrn.u8 d6, d7 | |
+ vtrn.u8 d18, d19 | |
+ | |
+ ; d0 = 00+10 01+11 02+12 03+13 | |
+ ; d2 = 40+50 41+51 42+52 43+53 | |
+ vpaddl.u8 q0, q0 | |
+ vpaddl.u8 q2, q2 | |
+ vpaddl.u8 q8, q8 | |
+ | |
+ ; d3 = 60+70 61+71 62+72 63+73 | |
+ vpaddl.u8 d3, d3 | |
+ vpaddl.u8 d7, d7 | |
+ vpaddl.u8 d19, d19 | |
+ | |
+ ; combine source lines | |
+ vadd.u16 q0, q2 | |
+ vadd.u16 q0, q8 | |
+ vadd.u16 d4, d3, d7 | |
+ vadd.u16 d4, d19 | |
+ | |
+ ; dst_ptr[3] = (s[6 + st * 0] + s[7 + st * 0] | |
+ ; + s[6 + st * 1] + s[7 + st * 1] | |
+ ; + s[6 + st * 2] + s[7 + st * 2]) / 6 | |
+ vqrdmulh.s16 q2, q2, q13 | |
+ vmovn.u16 d4, q2 | |
+ | |
+ ; Shuffle 2,3 reg around so that 2 can be added to the | |
+ ; 0,1 reg and 3 can be added to the 4,5 reg. This | |
+ ; requires expanding from u8 to u16 as the 0,1 and 4,5 | |
+ ; registers are already expanded. Then do transposes | |
+ ; to get aligned. | |
+ ; q2 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33 | |
+ vmovl.u8 q1, d2 | |
+ vmovl.u8 q3, d6 | |
+ vmovl.u8 q9, d18 | |
+ | |
+ ; combine source lines | |
+ vadd.u16 q1, q3 | |
+ vadd.u16 q1, q9 | |
+ | |
+ ; d4 = xx 20 xx 30 xx 22 xx 32 | |
+ ; d5 = xx 21 xx 31 xx 23 xx 33 | |
+ vtrn.u32 d2, d3 | |
+ | |
+ ; d4 = xx 20 xx 21 xx 22 xx 23 | |
+ ; d5 = xx 30 xx 31 xx 32 xx 33 | |
+ vtrn.u16 d2, d3 | |
+ | |
+ ; 0+1+2, 3+4+5 | |
+ vadd.u16 q0, q1 | |
+ | |
+ ; Need to divide, but can't downshift as the value | |
+ ; isn't a power of 2. So multiply by 65536 / n | |
+ ; and take the upper 16 bits. | |
+ vqrdmulh.s16 q0, q0, q15 | |
+ | |
+ ; Align for table lookup, vtbl requires registers to | |
+ ; be adjacent | |
+ vmov.u8 d2, d4 | |
+ | |
+ vtbl.u8 d3, {d0, d1, d2}, d28 | |
+ vtbl.u8 d4, {d0, d1, d2}, d29 | |
+ | |
+ MEMACCESS 1 | |
+ vst1.8 {d3}, [r2]! | |
+ MEMACCESS 1 | |
+ vst1.32 {d4[0]}, [r2]! | |
+ bgt %b1 | |
+ | |
+ pop {r4-r7} | |
+ vpop {q13-q15} | |
+ vpop {q8, q9} | |
+ vpop {q0-q3} | |
+ bx lr | |
+ ENDP | |
+ | |
+; 3/8 box filter over 2 source rows: read 32x2, write 12x1. | |
+ScaleRowDown38_2_Box_NEON PROC | |
+ ; input | |
+ ; r0 = uint8* src_ptr | |
+ ; r1 = src_stride | |
+ ; r2 = uint8* dst_ptr | |
+ ; r3 = int dst_width | |
+ vpush {q0-q3} | |
+ vpush {q13-q14} | |
+ push {r4, r5} | |
+ adr r4, kMult38_Div6 ; NOTE(review): adr has limited range; confirm labels are reachable | |
+ adr r5, kShuf38_2 | |
+ | |
+ MEMACCESS 4 | |
+ vld1.16 {q13}, [r4] ; 65536/12 multipliers | |
+ MEMACCESS 5 | |
+ vld1.8 {q14}, [r5] ; output shuffle table | |
+ add r1, r0 ; r1 = src_ptr + src_stride (second row) | |
+1 | |
+ ; d0 = 00 40 01 41 02 42 03 43 | |
+ ; d1 = 10 50 11 51 12 52 13 53 | |
+ ; d2 = 20 60 21 61 22 62 23 63 | |
+ ; d3 = 30 70 31 71 32 72 33 73 | |
+ MEMACCESS 0 | |
+ vld4.8 {d0, d1, d2, d3}, [r0]! | |
+ MEMACCESS 3 | |
+ vld4.8 {d4, d5, d6, d7}, [r1]! | |
+ subs r3, r3, #12 | |
+ | |
+ ; Shuffle the input data around to get align the data | |
+ ; so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7 | |
+ ; d0 = 00 10 01 11 02 12 03 13 | |
+ ; d1 = 40 50 41 51 42 52 43 53 | |
+ vtrn.u8 d0, d1 | |
+ vtrn.u8 d4, d5 | |
+ | |
+ ; d2 = 20 30 21 31 22 32 23 33 | |
+ ; d3 = 60 70 61 71 62 72 63 73 | |
+ vtrn.u8 d2, d3 | |
+ vtrn.u8 d6, d7 | |
+ | |
+ ; d0 = 00+10 01+11 02+12 03+13 | |
+ ; d2 = 40+50 41+51 42+52 43+53 | |
+ vpaddl.u8 q0, q0 | |
+ vpaddl.u8 q2, q2 | |
+ | |
+ ; d3 = 60+70 61+71 62+72 63+73 | |
+ vpaddl.u8 d3, d3 | |
+ vpaddl.u8 d7, d7 | |
+ | |
+ ; combine source lines | |
+ vadd.u16 q0, q2 | |
+ vadd.u16 d4, d3, d7 | |
+ | |
+ ; dst_ptr[3] = (s[6] + s[7] + s[6+st] + s[7+st]) / 4 | |
+ vqrshrn.u16 d4, q2, #2 | |
+ | |
+ ; Shuffle 2,3 reg around so that 2 can be added to the | |
+ ; 0,1 reg and 3 can be added to the 4,5 reg. This | |
+ ; requires expanding from u8 to u16 as the 0,1 and 4,5 | |
+ ; registers are already expanded. Then do transposes | |
+ ; to get aligned. | |
+ ; q2 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33 | |
+ vmovl.u8 q1, d2 | |
+ vmovl.u8 q3, d6 | |
+ | |
+ ; combine source lines | |
+ vadd.u16 q1, q3 | |
+ | |
+ ; d4 = xx 20 xx 30 xx 22 xx 32 | |
+ ; d5 = xx 21 xx 31 xx 23 xx 33 | |
+ vtrn.u32 d2, d3 | |
+ | |
+ ; d4 = xx 20 xx 21 xx 22 xx 23 | |
+ ; d5 = xx 30 xx 31 xx 32 xx 33 | |
+ vtrn.u16 d2, d3 | |
+ | |
+ ; 0+1+2, 3+4+5 | |
+ vadd.u16 q0, q1 | |
+ | |
+ ; Need to divide, but can't downshift as the value | |
+ ; isn't a power of 2. So multiply by 65536 / n | |
+ ; and take the upper 16 bits. | |
+ vqrdmulh.s16 q0, q0, q13 | |
+ | |
+ ; Align for table lookup, vtbl requires registers to | |
+ ; be adjacent | |
+ vmov.u8 d2, d4 | |
+ | |
+ vtbl.u8 d3, {d0, d1, d2}, d28 | |
+ vtbl.u8 d4, {d0, d1, d2}, d29 | |
+ | |
+ MEMACCESS 1 | |
+ vst1.8 {d3}, [r2]! | |
+ MEMACCESS 1 | |
+ vst1.32 {d4[0]}, [r2]! | |
+ bgt %b1 | |
+ | |
+ pop {r4, r5} | |
+ vpop {q13-q14} | |
+ vpop {q0-q3} | |
+ bx lr | |
+ ENDP | |
+ | |
+; ScaleAddRows_NEON: for each run of 16 source columns, sums | |
+; src_height rows of 8-bit pixels into 16-bit per-column totals | |
+; and stores them to dst_ptr. Outer loop (1) walks 16 columns at | |
+; a time; inner loop (2) walks the rows for those columns. | |
+ScaleAddRows_NEON PROC | |
+ ; input | |
+ ; r0 = uint8* src_ptr | |
+ ; r1 = src_stride | |
+ ; r2 = uint16* dst_ptr | |
+ ; r3 = int dst_width | |
+ push {r4, r5, r12} | |
+ ldr r4, [SP, #12] ; int src_height (5th arg, above 3 pushed regs) | |
+ vpush {q0-q3} | |
+ | |
+1 | |
+ mov r5, r0 ; r5 walks down the rows of this column group | |
+ mov r12, r4 ; r12 counts the remaining rows | |
+ veor q2, q2, q2 ; clear the 16-bit column accumulators | |
+ veor q3, q3, q3 | |
+2 | |
+ ; load 16 pixels into q0 | |
+ MEMACCESS 0 | |
+ vld1.8 {q0}, [r5], r1 | |
+ vaddw.u8 q3, q3, d1 ; widen-accumulate high 8 columns | |
+ vaddw.u8 q2, q2, d0 ; widen-accumulate low 8 columns | |
+ subs r12, r12, #1 | |
+ bgt %b2 | |
+ MEMACCESS 2 | |
+ vst1.16 {q2, q3}, [r2]! ; store pixels | |
+ add r0, r0, #16 ; advance to the next 16 columns | |
+ subs r3, r3, #16 ; 16 processed per loop | |
+ bgt %b1 | |
+ | |
+ vpop {q0-q3} | |
+ pop {r4, r5, r12} | |
+ bx lr | |
+ ENDP | |
+ | |
+; TODO(Yang Zhang): Investigate less load instructions for | |
+; the x/dx stepping | |
+ | |
+; LOAD2_DATA8_LANE n: loads the adjacent source byte pair at the | |
+; integer part of the 16.16 fixed-point x into lane n of d6/d7, | |
+; then steps x forward by dx. | |
+; Register contract (set by the calling PROC): | |
+; r1 = src_ptr, r3 = x (16.16 fixed point), r4 = dx | |
+; Clobbers r5 (x >> 16) and r6 (sample address). | |
+ MACRO | |
+ LOAD2_DATA8_LANE $n | |
+ lsr r5, r3, #16 | |
+ add r6, r1, r5 | |
+ add r3, r3, r4 | |
+ MEMACCESS 6 | |
+ vld2.8 {d6[$n], d7[$n]}, [r6] | |
+ MEND | |
+ | |
+; Lane offsets {0, 1, 2, 3}, loaded by the Cols functions to build | |
+; the x, x+dx, x+2*dx, x+3*dx step vectors. | |
+dx_offset DCD 0, 1, 2, 3 | |
+ | |
+; The NEON version mimics this formula: | |
+; #define BLENDER(a, b, f) (uint8)((int)(a) + | |
+; ((int)(f) * ((int)(b) - (int)(a)) >> 16)) | |
+ | |
+; ScaleFilterCols_NEON: horizontal bilinear scale of an 8-bit row, | |
+; implementing the BLENDER formula above with 16.16 fixed-point x/dx | |
+; stepping, 8 output pixels per iteration. | |
+ScaleFilterCols_NEON PROC | |
+ ; input | |
+ ; r0 = uint8* dst_ptr | |
+ ; r1 = uint8* src_ptr | |
+ ; r2 = int dst_width | |
+ ; r3 = int x | |
+ | |
+ push {r4-r6} | |
+ | |
+ ldr r4, [sp, #12] ; int dx | |
+ adr r5, dx_offset | |
+ ; NOTE(review): r6 is overwritten by LOAD2_DATA8_LANE before it is | |
+ ; ever read, so this mov looks redundant — confirm before removing. | |
+ mov r6, r1 | |
+ | |
+ vpush {q0-q3} | |
+ vpush {q8-q13} | |
+ | |
+ vdup.32 q0, r3 ; x | |
+ vdup.32 q1, r4 ; dx | |
+ vld1.32 {q2}, [r5] ; 0 1 2 3 | |
+ vshl.i32 q3, q1, #2 ; 4 * dx | |
+ vmul.s32 q1, q1, q2 | |
+ ; x , x + 1 * dx, x + 2 * dx, x + 3 * dx | |
+ vadd.s32 q1, q1, q0 | |
+ ; x + 4 * dx, x + 5 * dx, x + 6 * dx, x + 7 * dx | |
+ vadd.s32 q2, q1, q3 | |
+ vshl.i32 q0, q3, #1 ; 8 * dx | |
+1 | |
+ ; gather 8 (a, b) sample pairs into d6/d7, advancing x each time | |
+ LOAD2_DATA8_LANE 0 | |
+ LOAD2_DATA8_LANE 1 | |
+ LOAD2_DATA8_LANE 2 | |
+ LOAD2_DATA8_LANE 3 | |
+ LOAD2_DATA8_LANE 4 | |
+ LOAD2_DATA8_LANE 5 | |
+ LOAD2_DATA8_LANE 6 | |
+ LOAD2_DATA8_LANE 7 | |
+ ; deinterleave the 8 x values (q1, q2) to extract their fractions | |
+ vmov q10, q1 | |
+ vmov q11, q2 | |
+ vuzp.16 q10, q11 | |
+ vmovl.u8 q8, d6 ; a, widened to 16 bits | |
+ vmovl.u8 q9, d7 ; b, widened to 16 bits | |
+ vsubl.s16 q11, d18, d16 ; (b - a), widened to 32 bits | |
+ vsubl.s16 q12, d19, d17 | |
+ vmovl.u16 q13, d20 ; low 16 bits of x = the fraction f | |
+ vmovl.u16 q10, d21 | |
+ vmul.s32 q11, q11, q13 ; f * (b - a) | |
+ vmul.s32 q12, q12, q10 | |
+ vrshrn.s32 d18, q11, #16 ; >> 16, rounding | |
+ vrshrn.s32 d19, q12, #16 | |
+ vadd.s16 q8, q8, q9 ; a + ((f * (b - a)) >> 16) | |
+ vmovn.s16 d6, q8 ; narrow back to 8 bits | |
+ | |
+ MEMACCESS 0 | |
+ vst1.8 {d6}, [r0]! ; store pixels | |
+ vadd.s32 q1, q1, q0 ; step both x vectors by 8 * dx | |
+ vadd.s32 q2, q2, q0 | |
+ subs r2, r2, #8 ; 8 processed per loop | |
+ bgt %b1 | |
+ | |
+ vpop {q8-q13} | |
+ vpop {q0-q3} | |
+ pop {r4-r6} | |
+ bx lr | |
+ ENDP | |
+ | |
+; ScaleARGBRowDown2_NEON: halves an ARGB row by keeping every second | |
+; (odd) 32-bit pixel; 8 output pixels per iteration. src_stride is | |
+; unused here (point sampling, single row). | |
+ScaleARGBRowDown2_NEON PROC | |
+ ; input | |
+ ; r0 = uint8* src_ptr | |
+ ; r1 = ptrdiff_t src_stride | |
+ ; r2 = uint8* dst | |
+ ; r3 = int dst_width | |
+ vpush {q0 - q3} | |
+1 | |
+ ; load even pixels into q0, odd into q1 | |
+ MEMACCESS 0 | |
+ vld2.32 {q0, q1}, [r0]! | |
+ MEMACCESS 0 | |
+ vld2.32 {q2, q3}, [r0]! | |
+ subs r3, r3, #8 ; 8 processed per loop | |
+ MEMACCESS 1 | |
+ vst1.8 {q1}, [r2]! ; store odd pixels | |
+ MEMACCESS 1 | |
+ vst1.8 {q3}, [r2]! | |
+ bgt %b1 | |
+ vpop {q0 - q3} | |
+ bx lr | |
+ ENDP | |
+ | |
+ | |
+ | |
+; ScaleARGBRowDown2Linear_NEON: halves an ARGB row by averaging each | |
+; horizontal pair of pixels, per channel (single row, no vertical | |
+; filtering — src_stride is unused). 8 output pixels per iteration. | |
+ScaleARGBRowDown2Linear_NEON PROC | |
+ ; input | |
+ ; r0 = uint8* src_argb | |
+ ; r1 = ptrdiff_t src_stride | |
+ ; r2 = uint8* dst_argb | |
+ ; r3 = int dst_width | |
+ vpush {q0 - q3} | |
+1 | |
+ MEMACCESS 0 | |
+ vld4.8 {d0, d2, d4, d6}, [r0]! ; load 8 ARGB pixels. | |
+ MEMACCESS 0 | |
+ vld4.8 {d1, d3, d5, d7}, [r0]! ; load next 8 ARGB pixels. | |
+ subs r3, r3, #8 ; 8 processed per loop | |
+ vpaddl.u8 q0, q0 ; B 16 bytes -> 8 shorts. | |
+ vpaddl.u8 q1, q1 ; G 16 bytes -> 8 shorts. | |
+ vpaddl.u8 q2, q2 ; R 16 bytes -> 8 shorts. | |
+ vpaddl.u8 q3, q3 ; A 16 bytes -> 8 shorts. | |
+ vrshrn.u16 d0, q0, #1 ; downshift, round and pack | |
+ vrshrn.u16 d1, q1, #1 | |
+ vrshrn.u16 d2, q2, #1 | |
+ vrshrn.u16 d3, q3, #1 | |
+ MEMACCESS 1 | |
+ vst4.8 {d0, d1, d2, d3}, [r2]! | |
+ bgt %b1 | |
+ | |
+ | |
+ vpop {q0 - q3} | |
+ bx lr | |
+ ENDP | |
+ | |
+; ScaleARGBRowDown2Box_NEON: 2x2 box filter for ARGB — averages each | |
+; 2x2 block of pixels (two rows, two columns), per channel. | |
+; 8 output pixels per iteration. | |
+ScaleARGBRowDown2Box_NEON PROC | |
+ ; input | |
+ ; r0 = uint8* src_ptr | |
+ ; r1 = ptrdiff_t src_stride | |
+ ; r2 = uint8* dst | |
+ ; r3 = int dst_width | |
+ vpush {q0 - q3} | |
+ vpush {q8 - q11} | |
+ ; change the stride to row 2 pointer | |
+ add r1, r1, r0 | |
+ | |
+1 | |
+ MEMACCESS 0 | |
+ vld4.8 {d0, d2, d4, d6}, [r0]! ; load 8 argb pixels. | |
+ MEMACCESS 0 | |
+ vld4.8 {d1, d3, d5, d7}, [r0]! ; load next 8 argb pixels. | |
+ subs r3, r3, #8 ; 8 processed per loop. | |
+ vpaddl.u8 q0, q0 ; b 16 bytes -> 8 shorts. | |
+ vpaddl.u8 q1, q1 ; g 16 bytes -> 8 shorts. | |
+ vpaddl.u8 q2, q2 ; r 16 bytes -> 8 shorts. | |
+ vpaddl.u8 q3, q3 ; a 16 bytes -> 8 shorts. | |
+ MEMACCESS 1 | |
+ vld4.8 {d16, d18, d20, d22}, [r1]! ; load 8 more argb pixels. | |
+ MEMACCESS 1 | |
+ vld4.8 {d17, d19, d21, d23}, [r1]! ; load last 8 argb pixels. | |
+ vpadal.u8 q0, q8 ; b 16 bytes -> 8 shorts. | |
+ vpadal.u8 q1, q9 ; g 16 bytes -> 8 shorts. | |
+ vpadal.u8 q2, q10 ; r 16 bytes -> 8 shorts. | |
+ vpadal.u8 q3, q11 ; a 16 bytes -> 8 shorts. | |
+ vrshrn.u16 d0, q0, #2 ; downshift, round and pack | |
+ vrshrn.u16 d1, q1, #2 | |
+ vrshrn.u16 d2, q2, #2 | |
+ vrshrn.u16 d3, q3, #2 | |
+ MEMACCESS 2 | |
+ vst4.8 {d0, d1, d2, d3}, [r2]! | |
+ bgt %b1 | |
+ | |
+ vpop {q8 - q11} | |
+ vpop {q0 - q3} | |
+ bx lr | |
+ ENDP | |
+ | |
+; ScaleARGBRowDownEven_NEON: point-samples an ARGB row, taking one | |
+; pixel every src_stepx pixels; 4 output pixels per iteration. | |
+; src_stride is unused (single row). | |
+ScaleARGBRowDownEven_NEON PROC | |
+ ; input | |
+ ; r0 = uint8* src_argb | |
+ ; r1 = ptrdiff_t src_stride | |
+ ; r2 = int src_stepx | |
+ ; r3 = uint8* dst_argb | |
+ push {r4, r12} | |
+ ldr r4, [sp, #8] ;int dst_width | |
+ vpush {q0} | |
+ | |
+ mov r12, r2, lsl #2 ; byte step = src_stepx * 4 (ARGB = 4 bytes) | |
+1 | |
+ MEMACCESS 0 | |
+ vld1.32 {d0[0]}, [r0], r12 | |
+ MEMACCESS 0 | |
+ vld1.32 {d0[1]}, [r0], r12 | |
+ MEMACCESS 0 | |
+ vld1.32 {d1[0]}, [r0], r12 | |
+ MEMACCESS 0 | |
+ vld1.32 {d1[1]}, [r0], r12 | |
+ subs r4, r4, #4 ; 4 pixels per loop. | |
+ MEMACCESS 1 | |
+ vst1.8 {q0}, [r3]! | |
+ bgt %b1 | |
+ | |
+ vpop {q0} | |
+ pop {r4, r12} | |
+ bx lr | |
+ ENDP | |
+ | |
+; ScaleARGBRowDownEvenBox_NEON: like ScaleARGBRowDownEven, but each | |
+; output pixel is the rounded average of a 2x2 block (two adjacent | |
+; pixels on two rows) at each stepped position. 4 pixels per loop. | |
+ScaleARGBRowDownEvenBox_NEON PROC | |
+ ; input | |
+ ; r0 = uint8* src_argb | |
+ ; r1 = ptrdiff_t src_stride | |
+ ; r2 = int src_stepx | |
+ ; r3 = uint8* dst_argb | |
+ push {r4, r12} | |
+ ldr r4, [sp, #8] ;int dst_width | |
+ vpush {q0 - q3} | |
+ | |
+ mov r12, r2, lsl #2 ; byte step = src_stepx * 4 (ARGB = 4 bytes) | |
+ add r1, r1, r0 ; r1 -> second row | |
+1 | |
+ MEMACCESS 0 | |
+ vld1.8 {d0}, [r0], r12 ; Read 4 2x2 blocks -> 2x1 | |
+ MEMACCESS 1 | |
+ vld1.8 {d1}, [r1], r12 | |
+ MEMACCESS 0 | |
+ vld1.8 {d2}, [r0], r12 | |
+ MEMACCESS 1 | |
+ vld1.8 {d3}, [r1], r12 | |
+ MEMACCESS 0 | |
+ vld1.8 {d4}, [r0], r12 | |
+ MEMACCESS 1 | |
+ vld1.8 {d5}, [r1], r12 | |
+ MEMACCESS 0 | |
+ vld1.8 {d6}, [r0], r12 | |
+ MEMACCESS 1 | |
+ vld1.8 {d7}, [r1], r12 | |
+ vaddl.u8 q0, d0, d1 ; sum rows vertically, widening to 16 bits | |
+ vaddl.u8 q1, d2, d3 | |
+ vaddl.u8 q2, d4, d5 | |
+ vaddl.u8 q3, d6, d7 | |
+ vswp.8 d1, d2 ; ab_cd -> ac_bd | |
+ vswp.8 d5, d6 ; ef_gh -> eg_fh | |
+ vadd.u16 q0, q0, q1 ; (a+b)_(c+d) | |
+ vadd.u16 q2, q2, q3 ; (e+f)_(g+h) | |
+ vrshrn.u16 d0, q0, #2 ; first 2 pixels. | |
+ vrshrn.u16 d1, q2, #2 ; next 2 pixels. | |
+ subs r4, r4, #4 ; 4 pixels per loop. | |
+ MEMACCESS 2 | |
+ vst1.8 {q0}, [r3]! | |
+ bgt %b1 | |
+ | |
+ vpop {q0 - q3} | |
+ pop {r4, r12} | |
+ bx lr | |
+ ENDP | |
+ | |
+ ; TODO(Yang Zhang): Investigate less load instructions for | |
+ ; the x/dx stepping | |
+ | |
+ ; LOAD1_DATA32_LANE dn, n: loads one 32-bit ARGB pixel at index | |
+ ; (x >> 16) of the source into lane n of register dn, then steps | |
+ ; x forward by dx. | |
+ ; Register contract (set by the calling PROC): | |
+ ; r1 = src_argb, r3 = x (16.16 fixed point), r4 = dx | |
+ ; Clobbers r5 (x >> 16) and r6 (sample address, index scaled by 4). | |
+ MACRO | |
+ LOAD1_DATA32_LANE $dn, $n | |
+ lsr r5, r3, #16 | |
+ add r6, r1, r5, lsl #2 | |
+ add r3, r3, r4 | |
+ MEMACCESS 6 | |
+ vld1.32 {$dn[$n]}, [r6] | |
+ MEND | |
+ | |
+; ScaleARGBCols_NEON: horizontal point-sampled ARGB scale — picks the | |
+; pixel at each 16.16 fixed-point x position, stepping by dx; | |
+; 8 output pixels per iteration. | |
+ScaleARGBCols_NEON PROC | |
+ ; input | |
+ ; r0 = uint8* dst_argb | |
+ ; r1 = const uint8* src_argb | |
+ ; r2 = int dst_width | |
+ ; r3 = int x | |
+ push {r4 - r6} | |
+ ldr r4, [sp,#12] ; int dx | |
+ ; NOTE(review): r6 is overwritten by LOAD1_DATA32_LANE before it is | |
+ ; ever read, so this mov looks redundant — confirm before removing. | |
+ mov r6, r1 | |
+ vpush {q0, q1} | |
+ | |
+1 | |
+ ; gather 8 pixels, advancing x by dx each time | |
+ LOAD1_DATA32_LANE d0, 0 | |
+ LOAD1_DATA32_LANE d0, 1 | |
+ LOAD1_DATA32_LANE d1, 0 | |
+ LOAD1_DATA32_LANE d1, 1 | |
+ LOAD1_DATA32_LANE d2, 0 | |
+ LOAD1_DATA32_LANE d2, 1 | |
+ LOAD1_DATA32_LANE d3, 0 | |
+ LOAD1_DATA32_LANE d3, 1 | |
+ | |
+ MEMACCESS 0 | |
+ vst1.32 {q0, q1}, [r0]! ; store pixels | |
+ subs r2, r2, #8 ; 8 processed per loop | |
+ bgt %b1 | |
+ | |
+ | |
+ vpop {q0, q1} | |
+ pop {r4 - r6} | |
+ bx lr | |
+ ENDP | |
+ | |
+ ; TODO(Yang Zhang): Investigate less load instructions for | |
+ ; the x/dx stepping | |
+ | |
+ ; LOAD2_DATA32_LANE dn1, dn2, n: loads the adjacent 32-bit ARGB | |
+ ; pixel pair at index (x >> 16) into lane n of dn1 and dn2 | |
+ ; (deinterleaved), then steps x forward by dx. | |
+ ; Register contract (set by the calling PROC): | |
+ ; r1 = src_argb, r3 = x (16.16 fixed point), r4 = dx | |
+ ; Clobbers r5 (x >> 16) and r6 (sample address, index scaled by 4). | |
+ MACRO | |
+ LOAD2_DATA32_LANE $dn1, $dn2, $n | |
+ lsr r5, r3, #16 | |
+ add r6, r1, r5, lsl #2 | |
+ add r3, r3, r4 | |
+ MEMACCESS 6 | |
+ vld2.32 {$dn1[$n], $dn2[$n]}, [r6] | |
+ MEND | |
+ | |
+; ScaleARGBFilterCols_NEON: horizontal bilinear ARGB scale. For each | |
+; output pixel, blends the source pair (a, b) at 16.16 fixed-point x | |
+; using a 7-bit fraction f: result = (a * (0x7f ^ f) + b * f) >> 7. | |
+; 4 output pixels per iteration. | |
+ScaleARGBFilterCols_NEON PROC | |
+ ; input | |
+ ; r0 = uint8* dst_argb | |
+ ; r1 = const uint8* src_argb | |
+ ; r2 = int dst_width | |
+ ; r3 = int x | |
+ | |
+ push {r4 - r6} | |
+ ldr r4, [sp,#12] ;int dx | |
+ adr r5, dx_offset | |
+ ; NOTE(review): r6 is overwritten by LOAD2_DATA32_LANE before it is | |
+ ; ever read, so this mov looks redundant — confirm before removing. | |
+ mov r6, r1 | |
+ vpush {q0 - q3} | |
+ vpush {q8 - q15} | |
+ | |
+ vdup.32 q0, r3 ; x | |
+ vdup.32 q1, r4 ; dx | |
+ vld1.32 {q2}, [r5] ; 0 1 2 3 | |
+ vshl.i32 q9, q1, #2 ; 4 * dx | |
+ vmul.s32 q1, q1, q2 | |
+ vmov.i8 q3, #0x7f ; 0x7F | |
+ vmov.i16 q15, #0x7f ; 0x7F | |
+ ; x , x + 1 * dx, x + 2 * dx, x + 3 * dx | |
+ vadd.s32 q8, q1, q0 | |
+1 | |
+ ; d0, d1: a | |
+ ; d2, d3: b | |
+ LOAD2_DATA32_LANE d0, d2, 0 | |
+ LOAD2_DATA32_LANE d0, d2, 1 | |
+ LOAD2_DATA32_LANE d1, d3, 0 | |
+ LOAD2_DATA32_LANE d1, d3, 1 | |
+ vshrn.i32 d22, q8, #9 ; top of the fraction: (x << 16 >> 16) >> 9 | |
+ vand.16 d22, d22, d30 ; keep 7-bit fraction f per pixel | |
+ vdup.8 d24, d22[0] ; broadcast each pixel's f to all 4 channels | |
+ vdup.8 d25, d22[2] | |
+ vdup.8 d26, d22[4] | |
+ vdup.8 d27, d22[6] | |
+ vext.8 d4, d24, d25, #4 | |
+ vext.8 d5, d26, d27, #4 ; f | |
+ veor.8 q10, q2, q3 ; 0x7f ^ f | |
+ vmull.u8 q11, d0, d20 ; a * (0x7f ^ f) | |
+ vmull.u8 q12, d1, d21 | |
+ vmull.u8 q13, d2, d4 ; b * f | |
+ vmull.u8 q14, d3, d5 | |
+ vadd.i16 q11, q11, q13 | |
+ vadd.i16 q12, q12, q14 | |
+ vshrn.i16 d0, q11, #7 ; >> 7 and narrow back to 8 bits | |
+ vshrn.i16 d1, q12, #7 | |
+ | |
+ MEMACCESS 0 | |
+ vst1.32 {d0, d1}, [r0]! ; store pixels | |
+ vadd.s32 q8, q8, q9 ; step the 4 x values by 4 * dx | |
+ subs r2, r2, #4 ; 4 processed per loop | |
+ bgt %b1 | |
+ | |
+ vpop {q8 - q15} | |
+ vpop {q0 - q3} | |
+ pop {r4 - r6} | |
+ bx lr | |
+ ENDP | |
+ | |
+ END | |
+ | |
+ | |
+ | |
diff --git a/source/scale_neon.cc b/source/scale_neon.cc | |
index 9b4dce3..f98c939 100644 | |
--- a/source/scale_neon.cc | |
+++ b/source/scale_neon.cc | |
@@ -15,6 +15,16 @@ namespace libyuv { | |
extern "C" { | |
#endif | |
+ /* !!! IMPORTANT: The following methods have been ported to pure assembly in scale_neon.asm, | |
+ * because MS Visual Studio doesn't support inline assembly for ARM. | |
+ * | |
+ * ANY CHANGE TO THE IMPLEMENTATION OF THESE METHODS ALSO HAS TO BE MADE IN scale_neon.asm. | |
+ * | |
+ * Eventually, only the pure assembly implementation should be used on all platforms | |
+ * to avoid code duplication. | |
+ */ | |
+ | |
+ | |
// This module is for GCC Neon. | |
#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \ | |
!defined(__aarch64__) | |
diff --git a/unit_test/unit_test.cc b/unit_test/unit_test.cc | |
index 7f8bcf8..9dd7ae4 100644 | |
--- a/unit_test/unit_test.cc | |
+++ b/unit_test/unit_test.cc | |
@@ -388,7 +388,7 @@ LibYUVBaseTest::LibYUVBaseTest() | |
1280.0); | |
} | |
-int main(int argc, char** argv) { | |
+int main(int argc, char* argv[]) { | |
::testing::InitGoogleTest(&argc, argv); | |
// AllowCommandLineParsing allows us to ignore flags passed on to us by | |
// Chromium build bots without having to explicitly disable them. | |
diff --git a/util/compare.cc b/util/compare.cc | |
index ef0beef..5066b61 100644 | |
--- a/util/compare.cc | |
+++ b/util/compare.cc | |
@@ -17,7 +17,7 @@ | |
#include "libyuv/compare.h" | |
#include "libyuv/version.h" | |
-int main(int argc, char** argv) { | |
+int main(int argc, char* argv[]) { | |
if (argc < 1) { | |
printf("libyuv compare v%d\n", LIBYUV_VERSION); | |
printf("compare file1.yuv file2.yuv\n"); | |
@@ -62,4 +62,5 @@ int main(int argc, char** argv) { | |
fclose(fin2); | |
} | |
fclose(fin1); | |
+ return 0; | |
} | |
diff --git a/util/convert.cc b/util/convert.cc | |
index acaf43a..3cb0b51 100644 | |
--- a/util/convert.cc | |
+++ b/util/convert.cc | |
@@ -78,7 +78,7 @@ void PrintHelp(const char* program) { | |
exit(0); | |
} | |
-void ParseOptions(int argc, const char* argv[]) { | |
+void ParseOptions(int argc, char* argv[]) { | |
if (argc <= 1) | |
PrintHelp(argv[0]); | |
for (int c = 1; c < argc; ++c) { | |
@@ -189,7 +189,7 @@ static int TileARGBScale(const uint8* src_argb, | |
return 0; | |
} | |
-int main(int argc, const char* argv[]) { | |
+int main(int argc, char* argv[]) { | |
ParseOptions(argc, argv); | |
// Open original file (first file argument) | |
diff --git a/util/psnr_main.cc b/util/psnr_main.cc | |
index 01e8777..2d412a3 100644 | |
--- a/util/psnr_main.cc | |
+++ b/util/psnr_main.cc | |
@@ -149,7 +149,7 @@ void PrintHelp(const char* program) { | |
exit(0); | |
} | |
-void ParseOptions(int argc, const char* argv[]) { | |
+void ParseOptions(int argc, char* argv[]) { | |
if (argc <= 1) | |
PrintHelp(argv[0]); | |
for (int c = 1; c < argc; ++c) { | |
@@ -315,7 +315,7 @@ bool UpdateMetrics(uint8* ch_org, | |
return ismin; | |
} | |
-int main(int argc, const char* argv[]) { | |
+int main(int argc, char* argv[]) { | |
ParseOptions(argc, argv); | |
if (!do_psnr && !do_ssim) { | |
do_psnr = true; |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment