@robin-raymond
Created June 29, 2017 20:41
diff --git a/.gitignore b/.gitignore
index b0e9574..5360877 100644
--- a/.gitignore
+++ b/.gitignore
@@ -33,3 +33,6 @@ source/*.o
# Files generated by perf
perf.data
perf.data.old
+*.vcxproj.filters
+*.vcxproj
+libyuv*.sln
\ No newline at end of file
diff --git a/BUILD.gn b/BUILD.gn
index 4f56cdc..1c02a46 100644
--- a/BUILD.gn
+++ b/BUILD.gn
@@ -40,9 +40,15 @@ group("libyuv") {
public_configs = [ ":libyuv_config" ]
if (is_win && target_cpu == "x64") {
- public_deps = [
- ":libyuv_internal(//build/toolchain/win:clang_x64)",
- ]
+ if (is_winuwp) {
+ public_deps = [
+ ":libyuv_internal",
+ ]
+ } else {
+ public_deps = [
+ ":libyuv_internal(//build/toolchain/win:clang_x64)",
+ ]
+ }
} else {
public_deps = [
":libyuv_internal",
@@ -119,6 +125,9 @@ static_library("libyuv_internal") {
defines += [ "HAVE_JPEG" ]
deps += [ "//third_party:jpeg" ]
}
+ if (is_winuwp) {
+ deps += [ "//third_party/winuwp_compat:force_include_std" ]
+ }
if (libyuv_use_neon) {
deps += [ ":libyuv_neon" ]
@@ -257,6 +266,9 @@ if (libyuv_include_tests) {
if (is_android) {
deps += [ "//testing/android/native_test:native_test_native_code" ]
}
+ if (is_winuwp) {
+ deps += [ "//third_party/winuwp_compat:force_include_std" ]
+ }
# TODO(YangZhang): These lines can be removed when high accuracy
# YUV to RGB to Neon is ported.
@@ -287,6 +299,9 @@ if (libyuv_include_tests) {
if (is_linux) {
cflags = [ "-fexceptions" ]
}
+ if (is_winuwp) {
+ deps += [ "//third_party/winuwp_compat:wrap_main_utf8_cc" ]
+ }
}
executable("convert") {
@@ -300,6 +315,9 @@ if (libyuv_include_tests) {
if (is_linux) {
cflags = [ "-fexceptions" ]
}
+ if (is_winuwp) {
+ deps += [ "//third_party/winuwp_compat:wrap_main_utf8_cc" ]
+ }
}
executable("psnr") {
@@ -316,6 +334,9 @@ if (libyuv_include_tests) {
if (!is_ios && !libyuv_disable_jpeg) {
defines = [ "HAVE_JPEG" ]
}
+ if (is_winuwp) {
+ deps += [ "//third_party/winuwp_compat:wrap_main_utf8_cc" ]
+ }
}
executable("cpuid") {
diff --git a/armasm_ms.config b/armasm_ms.config
new file mode 100644
index 0000000..b617231
--- /dev/null
+++ b/armasm_ms.config
@@ -0,0 +1 @@
+-I src -oldit
diff --git a/include/libyuv/row.h b/include/libyuv/row.h
index 3e5dd20..a1e4722 100644
--- a/include/libyuv/row.h
+++ b/include/libyuv/row.h
@@ -625,7 +625,7 @@ extern const struct YuvConstants SIMD_ALIGNED(kYvuH709Constants); // BT.709
#op " $" #sel ",%%" #reg "," #offset "(%" #base ",%" #index "," #scale ")\n"
#endif // defined(__native_client__) && defined(__x86_64__)
-#if defined(__arm__) || defined(__aarch64__)
+#if defined(__arm__) || defined(__aarch64__) || defined(_M_ARM)
#undef MEMACCESS
#if defined(__native_client__)
#define MEMACCESS(base) ".p2align 3\nbic %" #base ", #0xc0000000\n"
diff --git a/libyuv.gyp b/libyuv.gyp
index f73a1a4..6c72007 100644
--- a/libyuv.gyp
+++ b/libyuv.gyp
@@ -30,7 +30,7 @@
'build_neon': 0,
'build_msa': 0,
'conditions': [
- ['(target_arch == "armv7" or target_arch == "armv7s" or \
+ ['(OS_RUNTIME=="winuwp" and (winuwp_platform=="win_phone" or winuwp_platform=="win10_arm")) or (target_arch == "armv7" or target_arch == "armv7s" or \
(target_arch == "arm" and arm_version >= 7) or target_arch == "arm64")\
and (arm_neon == 1 or arm_neon_optional == 1)', {
'build_neon': 1,
@@ -85,6 +85,35 @@
# '-marm', # arm32 not thumb
],
}],
+ ['OS_RUNTIME=="winuwp" and (winuwp_platform=="win_phone" or winuwp_platform=="win10_arm")', {
+ 'defines': [
+ 'WINUWP',
+ '__ARM_NEON__',
+ ],
+ 'sources': [
+ # sources.
+ 'source/arm_asm_macros.in',
+ 'source/compare_neon.asm',
+ 'source/rotate_neon.asm',
+ 'source/scale_neon.asm',
+ 'source/row_neon.asm'
+ ],
+ 'sources!': [
+ # sources.
+ 'source/compare_neon.cc',
+ 'source/compare_neon64.cc',
+ 'source/rotate_neon.cc',
+ 'source/rotate_neon64.cc',
+ 'source/row_neon.cc',
+ 'source/row_neon64.cc',
+ 'source/scale_neon.cc',
+ 'source/scale_neon64.cc',
+ ],
+ }],
+ ],
+ 'include_dirs': [
+ 'include',
+ '.',
],
}],
['build_msa != 0', {
diff --git a/libyuv_test.gyp b/libyuv_test.gyp
index 88860f5..abb3c89 100644
--- a/libyuv_test.gyp
+++ b/libyuv_test.gyp
@@ -18,8 +18,8 @@
'type': '<(gtest_target_type)',
'dependencies': [
'libyuv.gyp:libyuv',
- 'testing/gtest.gyp:gtest',
- 'third_party/gflags/gflags.gyp:gflags',
+ '<(DEPTH)/testing/gtest.gyp:gtest',
+ '<(DEPTH)/third_party/gflags/gflags.gyp:gflags',
],
'direct_dependent_settings': {
'defines': [
@@ -49,6 +49,11 @@
'unit_test/video_common_test.cc',
],
'conditions': [
+ ['OS=="win" and OS_RUNTIME=="winuwp"', {
+ 'defines': [
+ 'WINUWP',
+ ],
+ }],
['OS=="linux"', {
'cflags': [
'-fexceptions',
@@ -83,6 +88,7 @@
[ '(target_arch == "armv7" or target_arch == "armv7s" \
or (target_arch == "arm" and arm_version >= 7) \
or target_arch == "arm64") \
+ or winuwp_platform=="win_phone" or winuwp_platform=="win10_arm" \
and (arm_neon == 1 or arm_neon_optional == 1)', {
'defines': [
'LIBYUV_NEON'
@@ -185,7 +191,7 @@
'input_shlib_path': '<(SHARED_LIB_DIR)/(SHARED_LIB_PREFIX)libyuv_unittest<(SHARED_LIB_SUFFIX)',
},
'includes': [
- 'build/apk_test.gypi',
+ # 'build/apk_test.gypi',
],
'dependencies': [
'libyuv_unittest',
diff --git a/source/arm_asm_macros.in b/source/arm_asm_macros.in
new file mode 100644
index 0000000..eb54c4b
--- /dev/null
+++ b/source/arm_asm_macros.in
@@ -0,0 +1,22 @@
+;
+; Copyright 2012 The LibYuv Project Authors. All rights reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+ AREA |.text|, CODE, READONLY
+
+ MACRO
+ MEMACCESS $base
+ ; Assembler counterpart of the MEMACCESS macro defined in row.h.
+ ; Currently the assembler source files are used only for Windows Phone (MS armasm compiler),
+ ; so this macro is empty; it exists only for source compatibility.
+ ; Eventually the asm source files should be used for all platforms, and some of them may then
+ ; require a real implementation. See the MEMACCESS define in row.h for details.
+ MEND
+
+ END
diff --git a/source/compare_neon.asm b/source/compare_neon.asm
new file mode 100644
index 0000000..85251f1
--- /dev/null
+++ b/source/compare_neon.asm
@@ -0,0 +1,56 @@
+;
+; Copyright 2012 The LibYuv Project Authors. All rights reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+ AREA |.text|, CODE, READONLY, ALIGN=2
+
+ GET source/arm_asm_macros.in
+
+ EXPORT SumSquareError_NEON
+
+SumSquareError_NEON PROC
+ ; input
+ ; r0 = uint8* src_a
+ ; r1 = uint8* src_b
+ ; r2 = int count
+ ; output
+ ; r0 = int
+ vpush {q0, q1, q2, q3}
+ vpush {q8, q9, q10, q11}
+
+ vmov.u8 q8, #0
+ vmov.u8 q10, #0
+ vmov.u8 q9, #0
+ vmov.u8 q11, #0
+loop
+ MEMACCESS 0
+ vld1.8 {q0}, [r0]!
+ MEMACCESS 1
+ vld1.8 {q1}, [r1]!
+ subs r2, r2, #16
+ vsubl.u8 q2, d0, d2
+ vsubl.u8 q3, d1, d3
+ vmlal.s16 q8, d4, d4
+ vmlal.s16 q9, d6, d6
+ vmlal.s16 q10, d5, d5
+ vmlal.s16 q11, d7, d7
+ bgt loop
+
+ vadd.u32 q8, q8, q9
+ vadd.u32 q10, q10, q11
+ vadd.u32 q11, q8, q10
+ vpaddl.u32 q1, q11
+ vadd.u64 d0, d2, d3
+ vmov.32 r0, d0[0]
+ vpop {q8, q9, q10, q11}
+ vpop {q0, q1, q2, q3}
+ bx lr
+ ENDP
+
+ END
diff --git a/source/compare_neon.cc b/source/compare_neon.cc
index 49aa3b4..3148260 100644
--- a/source/compare_neon.cc
+++ b/source/compare_neon.cc
@@ -21,6 +21,15 @@ extern "C" {
#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \
!defined(__aarch64__)
+ /* !!! IMPORTANT: The following method has been ported to pure assembler in compare_neon.asm,
+ * because MS Visual Studio doesn't support inline assembly for ARM.
+ *
+ * ANY CHANGE TO THIS METHOD'S IMPLEMENTATION MUST ALSO BE MADE IN compare_neon.asm.
+ *
+ * Eventually, only the pure assembler implementation should be used on all platforms
+ * to avoid code duplication.
+ */
+
uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count) {
volatile uint32 sse;
asm volatile (
diff --git a/source/cpu_id.cc b/source/cpu_id.cc
index afb5d28..fb462be 100644
--- a/source/cpu_id.cc
+++ b/source/cpu_id.cc
@@ -29,6 +29,10 @@
#include "libyuv/basic_types.h" // For CPU_X86
+#if defined(WINUWP) && defined(_M_ARM)
+ #include <windows.h>
+#endif
+
#ifdef __cplusplus
namespace libyuv {
extern "C" {
@@ -317,6 +321,13 @@ LIBYUV_API SAFEBUFFERS int InitCpuFlags(void) {
cpu_info &= ~kCpuHasNEON;
}
#endif // __arm__
+#if defined(WINUWP) && defined(_M_ARM)
+ // Windows Runtime on ARM
+ if (IsProcessorFeaturePresent(PF_ARM_NEON_INSTRUCTIONS_AVAILABLE))
+ {
+   cpu_info |= kCpuHasNEON;
+ }
+#endif
if (TestEnv("LIBYUV_DISABLE_ASM")) {
cpu_info = 0;
}
diff --git a/source/rotate_neon.asm b/source/rotate_neon.asm
new file mode 100644
index 0000000..4d53eed
--- /dev/null
+++ b/source/rotate_neon.asm
@@ -0,0 +1,522 @@
+;
+; Copyright 2012 The LibYuv Project Authors. All rights reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+ AREA |.text|, CODE, READONLY, ALIGN=2
+
+ GET source/arm_asm_macros.in
+
+ EXPORT TransposeWx8_NEON
+ EXPORT TransposeUVWx8_NEON
+
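+; vtbl index tables used by the 4x8 residual paths of the transpose routines below.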
+kVTbl4x4Transpose DCB 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15
+kVTbl4x4TransposeDi DCB 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15
+
+TransposeWx8_NEON PROC
+ ; input
+ ; r0 = uint8* src
+ ; r1 = int src_stride
+ ; r2 = uint8* dst
+ ; r3 = int dst_stride
+
+ push {r4-r6}
+ ldr r4, [sp, #12] ; load parameter int width
+ adr R6, kVTbl4x4Transpose
+ vpush {q0, q1, q2, q3}
+
+ ; loops are on blocks of 8. loop will stop when
+ ; counter gets to or below 0. starting the counter
+ ; at w-8 allows for this
+ sub r4, #8
+
+ ; handle 8x8 blocks. this should be the majority of the plane
+1
+ mov r5, r0
+
+ MEMACCESS 0
+ vld1.8 {d0}, [r5], r1
+ MEMACCESS 0
+ vld1.8 {d1}, [r5], r1
+ MEMACCESS 0
+ vld1.8 {d2}, [r5], r1
+ MEMACCESS 0
+ vld1.8 {d3}, [r5], r1
+ MEMACCESS 0
+ vld1.8 {d4}, [r5], r1
+ MEMACCESS 0
+ vld1.8 {d5}, [r5], r1
+ MEMACCESS 0
+ vld1.8 {d6}, [r5], r1
+ MEMACCESS 0
+ vld1.8 {d7}, [r5]
+
+ vtrn.8 d1, d0
+ vtrn.8 d3, d2
+ vtrn.8 d5, d4
+ vtrn.8 d7, d6
+
+ vtrn.16 d1, d3
+ vtrn.16 d0, d2
+ vtrn.16 d5, d7
+ vtrn.16 d4, d6
+
+ vtrn.32 d1, d5
+ vtrn.32 d0, d4
+ vtrn.32 d3, d7
+ vtrn.32 d2, d6
+
+ vrev16.8 q0, q0
+ vrev16.8 q1, q1
+ vrev16.8 q2, q2
+ vrev16.8 q3, q3
+
+ mov r5, r2
+
+ MEMACCESS 0
+ vst1.8 {d1}, [r5], r3
+ MEMACCESS 0
+ vst1.8 {d0}, [r5], r3
+ MEMACCESS 0
+ vst1.8 {d3}, [r5], r3
+ MEMACCESS 0
+ vst1.8 {d2}, [r5], r3
+ MEMACCESS 0
+ vst1.8 {d5}, [r5], r3
+ MEMACCESS 0
+ vst1.8 {d4}, [r5], r3
+ MEMACCESS 0
+ vst1.8 {d7}, [r5], r3
+ MEMACCESS 0
+ vst1.8 {d6}, [r5]
+
+ add r0, #8 ; src += 8
+ add r2, r2, r3, lsl #3 ; dst += 8 * dst_stride
+ subs r4, #8 ; -= 8
+ bge %b1
+
+ ; add 8 back to counter. if the result is 0 there are
+ ; no residuals.
+ adds r4, #8
+ beq %f4
+
+ ; some residual, so between 1 and 7 lines left to transpose
+ cmp r4, #2
+ blt %f3
+
+ cmp r4, #4
+ blt %f2
+
+ ; 4x8 block
+ mov r5, r0
+ MEMACCESS 0
+ vld1.32 {d0[0]}, [r5], r1
+ MEMACCESS 0
+ vld1.32 {d0[1]}, [r5], r1
+ MEMACCESS 0
+ vld1.32 {d1[0]}, [r5], r1
+ MEMACCESS 0
+ vld1.32 {d1[1]}, [r5], r1
+ MEMACCESS 0
+ vld1.32 {d2[0]}, [r5], r1
+ MEMACCESS 0
+ vld1.32 {d2[1]}, [r5], r1
+ MEMACCESS 0
+ vld1.32 {d3[0]}, [r5], r1
+ MEMACCESS 0
+ vld1.32 {d3[1]}, [r5]
+
+ mov r5, r2
+
+ MEMACCESS 6
+ vld1.8 {q3}, [r6]
+
+ vtbl.8 d4, {d0, d1}, d6
+ vtbl.8 d5, {d0, d1}, d7
+ vtbl.8 d0, {d2, d3}, d6
+ vtbl.8 d1, {d2, d3}, d7
+
+ ; TODO(frkoenig): Rework shuffle above to
+ ; write out with 4 instead of 8 writes.
+ MEMACCESS 0
+ vst1.32 {d4[0]}, [r5], r3
+ MEMACCESS 0
+ vst1.32 {d4[1]}, [r5], r3
+ MEMACCESS 0
+ vst1.32 {d5[0]}, [r5], r3
+ MEMACCESS 0
+ vst1.32 {d5[1]}, [r5]
+
+ add r5, r2, #4
+ MEMACCESS 0
+ vst1.32 {d0[0]}, [r5], r3
+ MEMACCESS 0
+ vst1.32 {d0[1]}, [r5], r3
+ MEMACCESS 0
+ vst1.32 {d1[0]}, [r5], r3
+ MEMACCESS 0
+ vst1.32 {d1[1]}, [r5]
+
+ add r0, #4 ; src += 4
+ add r2, r2, r3, lsl #2 ; dst += 4 * dst_stride
+ subs r4, #4 ; w -= 4
+ beq %f4
+
+ ; some residual, check to see if it includes a 2x8 block,
+ ; or less
+ cmp r4, #2
+ blt %f3
+
+ ; 2x8 block
+2
+ mov r5, r0
+ MEMACCESS 0
+ vld1.16 {d0[0]}, [r5], r1
+ MEMACCESS 0
+ vld1.16 {d1[0]}, [r5], r1
+ MEMACCESS 0
+ vld1.16 {d0[1]}, [r5], r1
+ MEMACCESS 0
+ vld1.16 {d1[1]}, [r5], r1
+ MEMACCESS 0
+ vld1.16 {d0[2]}, [r5], r1
+ MEMACCESS 0
+ vld1.16 {d1[2]}, [r5], r1
+ MEMACCESS 0
+ vld1.16 {d0[3]}, [r5], r1
+ MEMACCESS 0
+ vld1.16 {d1[3]}, [r5]
+
+ vtrn.8 d0, d1
+
+ mov r5, r2
+
+ MEMACCESS 0
+ vst1.64 {d0}, [r5], r3
+ MEMACCESS 0
+ vst1.64 {d1}, [r5]
+
+ add r0, #2 ; src += 2
+ add r2, r2, r3, lsl #1 ; dst += 2 * dst_stride
+ subs r4, #2 ; w -= 2
+ beq %f4
+
+ ; 1x8 block
+3
+ MEMACCESS 1
+ vld1.8 {d0[0]}, [r0], r1
+ MEMACCESS 1
+ vld1.8 {d0[1]}, [r0], r1
+ MEMACCESS 1
+ vld1.8 {d0[2]}, [r0], r1
+ MEMACCESS 1
+ vld1.8 {d0[3]}, [r0], r1
+ MEMACCESS 1
+ vld1.8 {d0[4]}, [r0], r1
+ MEMACCESS 1
+ vld1.8 {d0[5]}, [r0], r1
+ MEMACCESS 1
+ vld1.8 {d0[6]}, [r0], r1
+ MEMACCESS 1
+ vld1.8 {d0[7]}, [r0]
+
+ MEMACCESS 3
+ vst1.64 {d0}, [r2]
+
+4
+ vpop {q0, q1, q2, q3}
+ pop {r4-r6}
+ bx lr
+ ENDP
+
+TransposeUVWx8_NEON PROC
+ ; input
+ ; r0 = uint8* src
+ ; r1 = int src_stride
+ ; r2 = uint8* dst_a
+ ; r3 = int dst_stride_a
+ push {r4-r8}
+ ldr r5, [sp, #20] ; load uint8* dst_b
+ ldr r6, [sp, #24] ; int dst_stride_b
+ ldr r7, [sp, #28] ; int width
+ adr R8, kVTbl4x4TransposeDi
+ vpush {q0, q1, q2, q3}
+ vpush {q8, q9, q10, q11}
+
+ ; loops are on blocks of 8. loop will stop when
+ ; counter gets to or below 0. starting the counter
+ ; at w-8 allows for this
+ sub r7, #8
+
+ ; handle 8x8 blocks. this should be the majority of the plane
+1
+ mov r4, r0
+
+ MEMACCESS 0
+ vld2.8 {d0, d1}, [r4], r1
+ MEMACCESS 0
+ vld2.8 {d2, d3}, [r4], r1
+ MEMACCESS 0
+ vld2.8 {d4, d5}, [r4], r1
+ MEMACCESS 0
+ vld2.8 {d6, d7}, [r4], r1
+ MEMACCESS 0
+ vld2.8 {d16, d17}, [r4], r1
+ MEMACCESS 0
+ vld2.8 {d18, d19}, [r4], r1
+ MEMACCESS 0
+ vld2.8 {d20, d21}, [r4], r1
+ MEMACCESS 0
+ vld2.8 {d22, d23}, [r4]
+
+ vtrn.8 q1, q0
+ vtrn.8 q3, q2
+ vtrn.8 q9, q8
+ vtrn.8 q11, q10
+
+ vtrn.16 q1, q3
+ vtrn.16 q0, q2
+ vtrn.16 q9, q11
+ vtrn.16 q8, q10
+
+ vtrn.32 q1, q9
+ vtrn.32 q0, q8
+ vtrn.32 q3, q11
+ vtrn.32 q2, q10
+
+ vrev16.8 q0, q0
+ vrev16.8 q1, q1
+ vrev16.8 q2, q2
+ vrev16.8 q3, q3
+ vrev16.8 q8, q8
+ vrev16.8 q9, q9
+ vrev16.8 q10, q10
+ vrev16.8 q11, q11
+
+ mov r4, r2
+
+ MEMACCESS 0
+ vst1.8 {d2}, [r4], r3
+ MEMACCESS 0
+ vst1.8 {d0}, [r4], r3
+ MEMACCESS 0
+ vst1.8 {d6}, [r4], r3
+ MEMACCESS 0
+ vst1.8 {d4}, [r4], r3
+ MEMACCESS 0
+ vst1.8 {d18}, [r4], r3
+ MEMACCESS 0
+ vst1.8 {d16}, [r4], r3
+ MEMACCESS 0
+ vst1.8 {d22}, [r4], r3
+ MEMACCESS 0
+ vst1.8 {d20}, [r4]
+
+ mov r4, r5
+
+ MEMACCESS 0
+ vst1.8 {d3}, [r4], r6
+ MEMACCESS 0
+ vst1.8 {d1}, [r4], r6
+ MEMACCESS 0
+ vst1.8 {d7}, [r4], r6
+ MEMACCESS 0
+ vst1.8 {d5}, [r4], r6
+ MEMACCESS 0
+ vst1.8 {d19}, [r4], r6
+ MEMACCESS 0
+ vst1.8 {d17}, [r4], r6
+ MEMACCESS 0
+ vst1.8 {d23}, [r4], r6
+ MEMACCESS 0
+ vst1.8 {d21}, [r4]
+
+ add r0, #8*2 ; src += 8*2
+ add r2, r2, r3, lsl #3 ; dst_a += 8 * dst_stride_a
+ add r5, r5, r6, lsl #3 ; dst_b += 8 * dst_stride_b
+ subs r7, #8 ; w -= 8
+ bge %b1
+
+ ; add 8 back to counter. if the result is 0 there are
+ ; no residuals.
+ adds r7, #8
+ beq %f4
+
+ ; some residual, so between 1 and 7 lines left to transpose
+ cmp r7, #2
+ blt %f3
+
+ cmp r7, #4
+ blt %f2
+
+ ; TODO(frkoenig): Clean this up
+ ; 4x8 block
+ mov r4, r0
+ MEMACCESS 0
+ vld1.64 {d0}, [r4], r1
+ MEMACCESS 0
+ vld1.64 {d1}, [r4], r1
+ MEMACCESS 0
+ vld1.64 {d2}, [r4], r1
+ MEMACCESS 0
+ vld1.64 {d3}, [r4], r1
+ MEMACCESS 0
+ vld1.64 {d4}, [r4], r1
+ MEMACCESS 0
+ vld1.64 {d5}, [r4], r1
+ MEMACCESS 0
+ vld1.64 {d6}, [r4], r1
+ MEMACCESS 0
+ vld1.64 {d7}, [r4]
+
+ MEMACCESS 8
+ vld1.8 {q15}, [r8]
+
+ vtrn.8 q0, q1
+ vtrn.8 q2, q3
+
+ vtbl.8 d16, {d0, d1}, d30
+ vtbl.8 d17, {d0, d1}, d31
+ vtbl.8 d18, {d2, d3}, d30
+ vtbl.8 d19, {d2, d3}, d31
+ vtbl.8 d20, {d4, d5}, d30
+ vtbl.8 d21, {d4, d5}, d31
+ vtbl.8 d22, {d6, d7}, d30
+ vtbl.8 d23, {d6, d7}, d31
+
+ mov r4, r2
+
+ MEMACCESS 0
+ vst1.32 {d16[0]}, [r4], r3
+ MEMACCESS 0
+ vst1.32 {d16[1]}, [r4], r3
+ MEMACCESS 0
+ vst1.32 {d17[0]}, [r4], r3
+ MEMACCESS 0
+ vst1.32 {d17[1]}, [r4], r3
+
+ add r4, r2, #4
+ MEMACCESS 0
+ vst1.32 {d20[0]}, [r4], r3
+ MEMACCESS 0
+ vst1.32 {d20[1]}, [r4], r3
+ MEMACCESS 0
+ vst1.32 {d21[0]}, [r4], r3
+ MEMACCESS 0
+ vst1.32 {d21[1]}, [r4]
+
+ mov r4, r5
+
+ MEMACCESS 0
+ vst1.32 {d18[0]}, [r4], r6
+ MEMACCESS 0
+ vst1.32 {d18[1]}, [r4], r6
+ MEMACCESS 0
+ vst1.32 {d19[0]}, [r4], r6
+ MEMACCESS 0
+ vst1.32 {d19[1]}, [r4], r6
+
+ add r4, r5, #4
+ MEMACCESS 0
+ vst1.32 {d22[0]}, [r4], r6
+ MEMACCESS 0
+ vst1.32 {d22[1]}, [r4], r6
+ MEMACCESS 0
+ vst1.32 {d23[0]}, [r4], r6
+ MEMACCESS 0
+ vst1.32 {d23[1]}, [r4]
+
+ add r0, #4*2 ; src += 4 * 2
+ add r2, r2, r3, lsl #2 ; dst_a += 4 * dst_stride_a
+ add r5, r5, r6, lsl #2 ; dst_b += 4 * dst_stride_b
+ subs r7, #4 ; w -= 4
+ beq %f4
+
+ ; some residual, check to see if it includes a 2x8 block,
+ ; or less
+ cmp r7, #2
+ blt %f3
+
+ ; 2x8 block
+2
+ mov r4, r0
+ MEMACCESS 0
+ vld2.16 {d0[0], d2[0]}, [r4], r1
+ MEMACCESS 0
+ vld2.16 {d1[0], d3[0]}, [r4], r1
+ MEMACCESS 0
+ vld2.16 {d0[1], d2[1]}, [r4], r1
+ MEMACCESS 0
+ vld2.16 {d1[1], d3[1]}, [r4], r1
+ MEMACCESS 0
+ vld2.16 {d0[2], d2[2]}, [r4], r1
+ MEMACCESS 0
+ vld2.16 {d1[2], d3[2]}, [r4], r1
+ MEMACCESS 0
+ vld2.16 {d0[3], d2[3]}, [r4], r1
+ MEMACCESS 0
+ vld2.16 {d1[3], d3[3]}, [r4]
+
+ vtrn.8 d0, d1
+ vtrn.8 d2, d3
+
+ mov r4, r2
+
+ MEMACCESS 0
+ vst1.64 {d0}, [r4], r3
+ MEMACCESS 0
+ vst1.64 {d2}, [r4]
+
+ mov r4, r5
+
+ MEMACCESS 0
+ vst1.64 {d1}, [r4], r6
+ MEMACCESS 0
+ vst1.64 {d3}, [r4]
+
+ add r0, #2*2 ; src += 2 * 2
+ add r2, r2, r3, lsl #1 ; dst_a += 2 * dst_stride_a
+ add r5, r5, r6, lsl #1 ; dst_b += 2 * dst_stride_b
+ subs r7, #2 ; w -= 2
+ beq %f4
+
+ ; 1x8 block
+3
+ MEMACCESS 1
+ vld2.8 {d0[0], d1[0]}, [r0], r1
+ MEMACCESS 1
+ vld2.8 {d0[1], d1[1]}, [r0], r1
+ MEMACCESS 1
+ vld2.8 {d0[2], d1[2]}, [r0], r1
+ MEMACCESS 1
+ vld2.8 {d0[3], d1[3]}, [r0], r1
+ MEMACCESS 1
+ vld2.8 {d0[4], d1[4]}, [r0], r1
+ MEMACCESS 1
+ vld2.8 {d0[5], d1[5]}, [r0], r1
+ MEMACCESS 1
+ vld2.8 {d0[6], d1[6]}, [r0], r1
+ MEMACCESS 1
+ vld2.8 {d0[7], d1[7]}, [r0]
+
+ MEMACCESS 3
+ vst1.64 {d0}, [r2]
+ MEMACCESS 5
+ vst1.64 {d1}, [r5]
+4
+
+ vpop {q8, q9, q10, q11}
+ vpop {q0, q1, q2, q3}
+ pop {r4-r8}
+ bx lr
+ ENDP
+
+ END
+
+
+
diff --git a/source/rotate_neon.cc b/source/rotate_neon.cc
index 41ec34e..5c6f938 100644
--- a/source/rotate_neon.cc
+++ b/source/rotate_neon.cc
@@ -18,6 +18,15 @@ namespace libyuv {
extern "C" {
#endif
+/* !!! IMPORTANT: The following methods have been ported to pure assembler in rotate_neon.asm,
+* because MS Visual Studio doesn't support inline assembly for ARM.
+*
+* ANY CHANGE TO THESE METHODS' IMPLEMENTATIONS MUST ALSO BE MADE IN rotate_neon.asm.
+*
+* Eventually, only the pure assembler implementation should be used on all platforms
+* to avoid code duplication.
+*/
+
#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \
!defined(__aarch64__)
diff --git a/source/row_neon.asm b/source/row_neon.asm
new file mode 100644
index 0000000..b623358
--- /dev/null
+++ b/source/row_neon.asm
@@ -0,0 +1,3435 @@
+;
+; Copyright 2012 The LibYuv Project Authors. All rights reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+ AREA |.text|, CODE, READONLY, ALIGN=2
+
+ GET source/arm_asm_macros.in
+
+ EXPORT I444ToARGBRow_NEON
+ EXPORT I422ToARGBRow_NEON
+ EXPORT I411ToARGBRow_NEON
+ EXPORT I422ToBGRARow_NEON
+ EXPORT I422ToABGRRow_NEON
+ EXPORT I422AlphaToARGBRow_NEON
+ EXPORT I422ToRGB24Row_NEON
+ EXPORT I422ToRAWRow_NEON
+ EXPORT I422ToRGBARow_NEON
+ EXPORT I422ToARGB4444Row_NEON
+ EXPORT I422ToARGB1555Row_NEON
+ EXPORT I422ToRGB565Row_NEON
+ EXPORT I400ToARGBRow_NEON
+ EXPORT J400ToARGBRow_NEON
+ EXPORT ARGBToRGB24Row_NEON
+ EXPORT RAWToRGB24Row_NEON
+ EXPORT ARGBToRAWRow_NEON
+ EXPORT ARGBToRGB565Row_NEON
+ EXPORT ARGBToARGB1555Row_NEON
+ EXPORT YUY2ToARGBRow_NEON
+ EXPORT UYVYToARGBRow_NEON
+ EXPORT ARGBToARGB4444Row_NEON
+ EXPORT NV12ToARGBRow_NEON
+ EXPORT NV21ToARGBRow_NEON
+ EXPORT NV12ToRGB565Row_NEON
+ EXPORT NV21ToRGB565Row_NEON
+ EXPORT SplitUVRow_NEON
+ EXPORT MergeUVRow_NEON
+ EXPORT SetRow_NEON
+ EXPORT CopyRow_NEON
+ EXPORT ARGBSetRow_NEON
+ EXPORT MirrorRow_NEON
+ EXPORT MirrorUVRow_NEON
+ EXPORT ARGBMirrorRow_NEON
+ EXPORT RGB24ToARGBRow_NEON
+ EXPORT RAWToARGBRow_NEON
+ EXPORT RGB565ToARGBRow_NEON
+ EXPORT ARGB1555ToARGBRow_NEON
+ EXPORT ARGB4444ToARGBRow_NEON
+ EXPORT RGBAToUVRow_NEON
+ EXPORT ABGRToUVRow_NEON
+ EXPORT ABGRToYRow_NEON
+ EXPORT RGBAToYRow_NEON
+ EXPORT RGB24ToYRow_NEON
+ EXPORT ARGB1555ToUVRow_NEON
+ EXPORT ARGB4444ToUVRow_NEON
+ EXPORT RGB565ToYRow_NEON
+ EXPORT RGB565ToUVRow_NEON
+ EXPORT ARGB1555ToYRow_NEON
+ EXPORT ARGB4444ToYRow_NEON
+ EXPORT BGRAToYRow_NEON
+ EXPORT ARGBToUV411Row_NEON
+ EXPORT ARGBToUV422Row_NEON
+ EXPORT ARGBToUV444Row_NEON
+ EXPORT YUY2ToUV422Row_NEON
+ EXPORT UYVYToUV422Row_NEON
+ EXPORT ARGBToBayerGGRow_NEON
+ EXPORT ARGBShuffleRow_NEON
+ EXPORT ARGBToUVJRow_NEON
+ EXPORT BGRAToUVRow_NEON
+ EXPORT ABGRToUVRow_NEON
+ EXPORT RGBAToUVRow_NEON
+ EXPORT ARGBExtractAlphaRow_NEON
+ EXPORT ARGBToYJRow_NEON
+ EXPORT I422ToUYVYRow_NEON
+ EXPORT I422ToYUY2Row_NEON
+ EXPORT ARGBToUVRow_NEON
+ EXPORT ARGBToYRow_NEON
+ EXPORT RAWToUVRow_NEON
+ EXPORT RAWToYRow_NEON
+ EXPORT RGB24ToUVRow_NEON
+ EXPORT UYVYToUVRow_NEON
+ EXPORT UYVYToYRow_NEON
+ EXPORT ARGBToRGB565DitherRow_NEON
+ EXPORT YUY2ToYRow_NEON
+ EXPORT YUY2ToUVRow_NEON
+ EXPORT SobelToPlaneRow_NEON
+ EXPORT SobelRow_NEON
+ EXPORT ARGBSubtractRow_NEON
+ EXPORT ARGBAddRow_NEON
+ EXPORT ARGBAttenuateRow_NEON
+ EXPORT ARGBQuantizeRow_NEON
+ EXPORT ARGBShadeRow_NEON
+ EXPORT ARGBGrayRow_NEON
+ EXPORT ARGBSepiaRow_NEON
+ EXPORT ARGBColorMatrixRow_NEON
+ EXPORT ARGBBlendRow_NEON
+ EXPORT InterpolateRow_NEON
+ EXPORT ARGBMultiplyRow_NEON
+ EXPORT SobelXRow_NEON
+ EXPORT SobelYRow_NEON
+ EXPORT SobelXYRow_NEON
+
+
+; ------- CONSTANTS ---------------------
+
+; YUV to RGB conversion constants.
+; Y contribution to R,G,B. Scale and bias.
+YG EQU 18997 ; round(1.164 * 64 * 256 * 256 / 257)
+YGB EQU 1160 ; 1.164 * 64 * 16 - adjusted for even error distribution
+
+; U and V contributions to R,G,B
+UB EQU -128 ; -min(128, round(2.018 * 64))
+UG EQU 25 ; -round(-0.391 * 64)
+VG EQU 52 ; -round(-0.813 * 64)
+VR EQU -102 ; -round(1.596 * 64)
+
+; Bias values to subtract 16 from Y and 128 from U and V.
+BB EQU UB * 128 - YGB
+BG EQU UG * 128 + VG * 128 - YGB
+BR EQU VR * 128 - YGB
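+; With the values above these evaluate to:
+;   BB = -128*128 - 1160          = -17544
+;   BG =  25*128 + 52*128 - 1160  =   8696
+;   BR = -102*128 - 1160          = -14216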
+
+
+; ------- ARRAYS ------------------------
+
+kUVToRB DCB 128, 128, 128, 128, 102, 102, 102, 102, 0, 0, 0, 0, 0, 0, 0, 0
+kUVToG DCB 25, 25, 25, 25, 52, 52, 52, 52, 0, 0, 0, 0, 0, 0, 0, 0
+kUVBiasBGR DCW BB, BG, BR, 0, 0, 0, 0, 0
+kYToRgb DCD 0x0101 * YG, 0, 0, 0
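+; Note: 0x0101 = 257, so 0x0101 * YG cancels the /257 in the YG definition above,
+; giving roughly 1.164 * 64 * 65536 as the 32-bit Y multiplier loaded into q15.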
+
+; ------- MACROS ------------------------
+
+ MACRO
+ YUV422TORGB_SETUP_REG
+ adr r5, kUVToRB
+ vld1.8 {d24}, [r5]
+ adr r5, kUVToG
+ vld1.8 {d25}, [r5]
+ adr r5, kUVBiasBGR
+ vld1.16 {d26[], d27[]}, [r5]!
+ vld1.16 {d8[], d9[]}, [r5]!
+ vld1.16 {d28[], d29[]}, [r5]
+ adr r5, kYToRgb
+ vld1.32 {d30[], d31[]}, [r5]
+ MEND
+
+ ; Read 8 Y, 4 U and 4 V from 422
+ MACRO
+ READYUV422
+ MEMACCESS 0
+ vld1.8 {d0}, [r0]!
+ MEMACCESS 1
+ vld1.32 {d2[0]}, [r1]!
+ MEMACCESS 2
+ vld1.32 {d2[1]}, [r2]!
+ MEND
+
+ ; Read 8 Y, 2 U and 2 V from 411
+ MACRO
+ READYUV411
+ MEMACCESS 0
+ vld1.8 {d0}, [r0]!
+ MEMACCESS 1
+ vld1.16 {d2[0]}, [r1]!
+ MEMACCESS 2
+ vld1.16 {d2[1]}, [r2]!
+ vmov.u8 d3, d2
+ vzip.u8 d2, d3
+ MEND
+
+ ; Read 8 Y, 8 U and 8 V from 444
+ MACRO
+ READYUV444
+ MEMACCESS 0
+ vld1.8 {d0}, [r0]!
+ MEMACCESS 1
+ vld1.8 {d2}, [r1]!
+ MEMACCESS 2
+ vld1.8 {d3}, [r2]!
+ vpaddl.u8 q1, q1
+ vrshrn.u16 d2, q1, #1
+ MEND
+
+ ; Read 8 Y and 4 VU from NV21
+ MACRO
+ READNV21
+ MEMACCESS 0
+ vld1.8 {d0}, [r0]!
+ MEMACCESS 1
+ vld1.8 {d2}, [r1]!
+ vmov.u8 d3, d2 ; split odd/even uv apart
+ vuzp.u8 d3, d2
+ vtrn.u32 d2, d3
+ MEND
+
+ ; Read 8 Y, and set 4 U and 4 V to 128
+ MACRO
+ READYUV400
+ MEMACCESS 0
+ vld1.8 {d0}, [r0]!
+ vmov.u8 d2, #128
+ MEND
+
+ ; Read 8 Y and 4 UV from NV12
+ MACRO
+ READNV12
+ MEMACCESS 0
+ vld1.8 {d0}, [r0]!
+ MEMACCESS 1
+ vld1.8 {d2}, [r1]!
+ vmov.u8 d3, d2 ; split odd/even uv apart
+ vuzp.u8 d2, d3
+ vtrn.u32 d2, d3
+ MEND
+
+ ; Read 8 YUY2
+ MACRO
+ READYUY2
+ MEMACCESS 0
+ vld2.8 {d0, d2}, [r0]!
+ vmov.u8 d3, d2
+ vuzp.u8 d2, d3
+ vtrn.u32 d2, d3
+ MEND
+
+ ; Read 8 UYVY
+ MACRO
+ READUYVY
+ MEMACCESS 0
+ vld2.8 {d2, d3}, [r0]!
+ vmov.u8 d0, d3
+ vmov.u8 d3, d2
+ vuzp.u8 d2, d3
+ vtrn.u32 d2, d3
+ MEND
+
+ MACRO
+ ARGBTOARGB4444
+ vshr.u8 d20, d20, #4 ; B
+ vbic.32 d21, d21, d4 ; G
+ vshr.u8 d22, d22, #4 ; R
+ vbic.32 d23, d23, d4 ; A
+ vorr d0, d20, d21 ; BG
+ vorr d1, d22, d23 ; RA
+ vzip.u8 d0, d1 ; BGRA
+ MEND
+
+ MACRO
+ ARGBTOARGB1555
+ vshll.u8 q0, d23, #8 ; A
+ vshll.u8 q8, d22, #8 ; R
+ vshll.u8 q9, d21, #8 ; G
+ vshll.u8 q10, d20, #8 ; B
+ vsri.16 q0, q8, #1 ; AR
+ vsri.16 q0, q9, #6 ; ARG
+ vsri.16 q0, q10, #11 ; ARGB
+ MEND
+
+ MACRO
+ ARGBTORGB565
+ vshll.u8 q0, d22, #8 ; R
+ vshll.u8 q8, d21, #8 ; G
+ vshll.u8 q9, d20, #8 ; B
+ vsri.16 q0, q8, #5 ; RG
+ vsri.16 q0, q9, #11 ; RGB
+ MEND
+
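+ ; Core conversion step: expects 8 Y bytes in d0 and the corresponding U/V bytes in d2
+ ; (as arranged by the READ* macros above), plus the constants loaded by
+ ; YUV422TORGB_SETUP_REG; produces 8 B/G/R bytes in d20/d21/d22.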
+ MACRO
+ YUV422TORGB
+ vmull.u8 q8, d2, d24 ; u/v B/R component
+ vmull.u8 q9, d2, d25 ; u/v G component
+ vmovl.u8 q0, d0 ; Y
+ vmovl.s16 q10, d1
+ vmovl.s16 q0, d0
+ vmul.s32 q10, q10, q15
+ vmul.s32 q0, q0, q15
+ vqshrun.s32 d0, q0, #16
+ vqshrun.s32 d1, q10, #16 ; Y
+ vadd.s16 d18, d19
+ vshll.u16 q1, d16, #16 ; Replicate u * UB
+ vshll.u16 q10, d17, #16 ; Replicate v * VR
+ vshll.u16 q3, d18, #16 ; Replicate (v*VG + u*UG)
+ vaddw.u16 q1, q1, d16
+ vaddw.u16 q10, q10, d17
+ vaddw.u16 q3, q3, d18
+ vqadd.s16 q8, q0, q13 ; B */
+ vqadd.s16 q9, q0, q14 ; R */
+ vqadd.s16 q0, q0, q4 ; G */
+ vqadd.s16 q8, q8, q1 ; B */
+ vqadd.s16 q9, q9, q10 ; R */
+ vqsub.s16 q0, q0, q3 ; G */
+ vqshrun.s16 d20, q8, #6 ; B */
+ vqshrun.s16 d22, q9, #6 ; R */
+ vqshrun.s16 d21, q0, #6 ; G */
+ MEND
+
+ MACRO
+ RGB565TOARGB
+ vshrn.u16 d6, q0, #5 ; G xxGGGGGG
+ vuzp.u8 d0, d1 ; d0 xxxBBBBB RRRRRxxx
+ vshl.u8 d6, d6, #2 ; G GGGGGG00 upper 6
+ vshr.u8 d1, d1, #3 ; R 000RRRRR lower 5
+ vshl.u8 q0, q0, #3 ; B,R BBBBB000 upper 5
+ vshr.u8 q2, q0, #5 ; B,R 00000BBB lower 3
+ vorr.u8 d0, d0, d4 ; B
+ vshr.u8 d4, d6, #6 ; G 000000GG lower 2
+ vorr.u8 d2, d1, d5 ; R
+ vorr.u8 d1, d4, d6 ; G
+ MEND
+
+ MACRO
+ ARGB1555TOARGB
+ vshrn.u16 d7, q0, #8 ; A Arrrrrxx
+ vshr.u8 d6, d7, #2 ; R xxxRRRRR
+ vshrn.u16 d5, q0, #5 ; G xxxGGGGG
+ vmovn.u16 d4, q0 ; B xxxBBBBB
+ vshr.u8 d7, d7, #7 ; A 0000000A
+ vneg.s8 d7, d7 ; A AAAAAAAA upper 8
+ vshl.u8 d6, d6, #3 ; R RRRRR000 upper 5
+ vshr.u8 q1, q3, #5 ; R,A 00000RRR lower 3
+ vshl.u8 q0, q2, #3 ; B,G BBBBB000 upper 5
+ vshr.u8 q2, q0, #5 ; B,G 00000BBB lower 3
+ vorr.u8 q1, q1, q3 ; R,A
+ vorr.u8 q0, q0, q2 ; B,G
+ MEND
+
+ MACRO
+ ARGB4444TOARGB
+ vuzp.u8 d0, d1 ; d0 BG, d1 RA
+ vshl.u8 q2, q0, #4 ; B,R BBBB0000
+ vshr.u8 q1, q0, #4 ; G,A 0000GGGG
+ vshr.u8 q0, q2, #4 ; B,R 0000BBBB
+ vorr.u8 q0, q0, q2 ; B,R BBBBBBBB
+ vshl.u8 q2, q1, #4 ; G,A GGGG0000
+ vorr.u8 q1, q1, q2 ; G,A GGGGGGGG
+ vswp.u8 d1, d2 ; B,R,G,A -> B,G,R,A
+ MEND
+
+ ; 16x2 pixels -> 8x1. pix is number of argb pixels. e.g. 16.
+ MACRO
+ RGBTOUV $QB, $QG, $QR
+ vmul.s16 q8, $QB , q10 ; B
+ vmls.s16 q8, $QG , q11 ; G
+ vmls.s16 q8, $QR , q12 ; R
+ vadd.u16 q8, q8, q15 ; +128 -> unsigned
+ vmul.s16 q9, $QR , q10 ; R
+ vmls.s16 q9, $QG , q14 ; G
+ vmls.s16 q9, $QB , q13 ; B
+ vadd.u16 q9, q9, q15 ; +128 -> unsigned
+ vqshrn.u16 d0, q8, #8 ; 16 bit to 8 bit U
+ vqshrn.u16 d1, q9, #8 ; 16 bit to 8 bit V
+ MEND
+
+ ; RGB555TOARGB is the same as ARGB1555TOARGB but ignores alpha.
+ MACRO
+ RGB555TOARGB
+ vshrn.u16 d6, q0, #5 ; G xxxGGGGG
+ vuzp.u8 d0, d1 ; d0 xxxBBBBB xRRRRRxx
+ vshl.u8 d6, d6, #3 ; G GGGGG000 upper 5
+ vshr.u8 d1, d1, #2 ; R 00xRRRRR lower 5
+ vshl.u8 q0, q0, #3 ; B,R BBBBB000 upper 5
+ vshr.u8 q2, q0, #5 ; B,R 00000BBB lower 3
+ vorr.u8 d0, d0, d4 ; B
+ vshr.u8 d4, d6, #5 ; G 00000GGG lower 3
+ vorr.u8 d2, d1, d5 ; R
+ vorr.u8 d1, d4, d6 ; G
+ MEND
+
+
+; ----- METHODS ---------------------------------------
+
+I444ToARGBRow_NEON PROC
+ ; input
+ ; r0 = const uint8* src_y
+ ; r1 = const uint8* src_u
+ ; r2 = const uint8* src_v
+ ; r3 = uint8* dst_argb
+ push {r4, r5}
+ ldr r4, [sp,#8] ; int width
+ vpush {q0 - q4}
+ vpush {q8 - q15}
+
+ YUV422TORGB_SETUP_REG
+1
+ READYUV444
+ YUV422TORGB
+ subs r4, r4, #8
+ vmov.u8 d23, #255
+ MEMACCESS 3
+ vst4.8 {d20, d21, d22, d23}, [r3]!
+ bgt %b1
+
+ vpop {q8 - q15}
+ vpop {q0 - q4}
+ pop {r4, r5}
+ bx lr
+ ENDP
+
+I422ToARGBRow_NEON PROC
+ ; input
+ ; r0 = const uint8* src_y
+ ; r1 = const uint8* src_u
+ ; r2 = const uint8* src_v
+ ; r3 = uint8* dst_argb
+ push {r4, r5}
+ ldr r4, [sp,#8] ; int width
+ vpush {q0 - q4}
+ vpush {q8 - q15}
+
+ YUV422TORGB_SETUP_REG
+1
+ READYUV422
+ YUV422TORGB
+ subs r4, r4, #8
+ vmov.u8 d23, #255
+ MEMACCESS 3
+ vst4.8 {d20, d21, d22, d23}, [r3]!
+ bgt %b1
+
+ vpop {q8 - q15}
+ vpop {q0 - q4}
+ pop {r4, r5}
+ bx lr
+ ENDP
+
+I422AlphaToARGBRow_NEON PROC
+ ; input
+ ; r0 = const uint8* src_y
+ ; r1 = const uint8* src_u
+ ; r2 = const uint8* src_v
+ ; r3 = const uint8* src_a
+ ; r4 = uint8* dst_argb
+ push {r5, r6}
+ ldr r5, [sp,#8] ; int width
+ vpush {q0 - q4}
+ vpush {q8 - q15}
+
+ YUV422TORGB_SETUP_REG
+1
+ READYUV422
+ YUV422TORGB
+ MEMACCESS 3
+ vld1.8 {d23}, [r3]!
+ subs r5, r5, #8
+ MEMACCESS 4
+ vst4.8 {d20, d21, d22, d23}, [r4]!
+ bgt %b1
+
+ vpop {q8 - q15}
+ vpop {q0 - q4}
+ pop {r5, r6}
+ bx lr
+ ENDP
+
+I422ToRGBARow_NEON PROC
+ ; input
+ ; r0 = const uint8* src_y
+ ; r1 = const uint8* src_u
+ ; r2 = const uint8* src_v
+ ; r3 = uint8* dst_argb
+ push {r4, r5}
+ ldr r4, [sp,#8] ; int width
+ vpush {q0 - q4}
+ vpush {q8 - q15}
+
+ YUV422TORGB_SETUP_REG
+1
+ READYUV422
+ YUV422TORGB
+ subs r4, r4, #8
+ vmov.u8 d19, #255
+ MEMACCESS 3
+ vst4.8 {d19, d20, d21, d22}, [r3]!
+ bgt %b1
+
+ vpop {q8 - q15}
+ vpop {q0 - q4}
+ pop {r4, r5}
+ bx lr
+ ENDP
+
+
+I411ToARGBRow_NEON PROC
+ ; input
+ ; r0 = const uint8* src_y
+ ; r1 = const uint8* src_u
+ ; r2 = const uint8* src_v
+ ; r3 = uint8* dst_argb
+ push {r4, r5}
+ ldr r4, [sp,#8] ; int width
+ vpush {q0 - q4}
+ vpush {q8 - q15}
+
+ YUV422TORGB_SETUP_REG
+1
+ READYUV411
+ YUV422TORGB
+ subs r4, r4, #8
+ vmov.u8 d23, #255
+ MEMACCESS 3
+ vst4.8 {d20, d21, d22, d23}, [r3]!
+ bgt %b1
+
+ vpop {q8 - q15}
+ vpop {q0 - q4}
+ pop {r4, r5}
+ bx lr
+ ENDP
+
+
+I422ToBGRARow_NEON PROC
+ ; input
+ ; r0 = const uint8* src_y
+ ; r1 = const uint8* src_u
+ ; r2 = const uint8* src_v
+ ; r3 = uint8* dst_bgra
+ push {r4, r5}
+ ldr r4, [sp,#8] ; int width
+ vpush {q0 - q4}
+ vpush {q8 - q15}
+
+ YUV422TORGB_SETUP_REG
+1
+ READYUV422
+ YUV422TORGB
+ subs r4, r4, #8
+ vswp.u8 d20, d22
+ vmov.u8 d19, #255
+ MEMACCESS 3
+ vst4.8 {d19, d20, d21, d22}, [r3]!
+ bgt %b1
+
+ vpop {q8 - q15}
+ vpop {q0 - q4}
+ pop {r4, r5}
+ bx lr
+ ENDP
+
+
+I422ToABGRRow_NEON PROC
+ ; input
+ ; r0 = const uint8* src_y
+ ; r1 = const uint8* src_u
+ ; r2 = const uint8* src_v
+ ; r3 = uint8* dst_abgr
+ push {r4, r5}
+ ldr r4, [sp,#8] ; int width
+ vpush {q0 - q4}
+ vpush {q8 - q15}
+
+ YUV422TORGB_SETUP_REG
+1
+ READYUV422
+ YUV422TORGB
+ subs r4, r4, #8
+ vswp.u8 d20, d22
+ vmov.u8 d23, #255
+ MEMACCESS 3
+ vst4.8 {d20, d21, d22, d23}, [r3]!
+ bgt %b1
+
+ vpop {q8 - q15}
+ vpop {q0 - q4}
+ pop {r4, r5}
+ bx lr
+ ENDP
+
+I422ToRGB24Row_NEON PROC
+ ; input
+ ; r0 = const uint8* src_y
+ ; r1 = const uint8* src_u
+ ; r2 = const uint8* src_v
+ ; r3 = uint8* dst_rgb24
+ push {r4, r5}
+ ldr r4, [sp,#8] ; int width
+ vpush {q0 - q4}
+ vpush {q8 - q15}
+
+ YUV422TORGB_SETUP_REG
+1
+ READYUV422
+ YUV422TORGB
+ subs r4, r4, #8
+ MEMACCESS 3
+ vst3.8 {d20, d21, d22}, [r3]!
+ bgt %b1
+
+ vpop {q8 - q15}
+ vpop {q0 - q4}
+ pop {r4, r5}
+ bx lr
+ ENDP
+
+I422ToRAWRow_NEON PROC
+ ; input
+ ; r0 = const uint8* src_y
+ ; r1 = const uint8* src_u
+ ; r2 = const uint8* src_v
+ ; r3 = uint8* dst_raw
+ push {r4, r5}
+ ldr r4, [sp,#8] ; int width
+ vpush {q0 - q4}
+ vpush {q8 - q15}
+
+ YUV422TORGB_SETUP_REG
+1
+ READYUV422
+ YUV422TORGB
+ subs r4, r4, #8
+ vswp.u8 d20, d22
+ MEMACCESS 3
+ vst3.8 {d20, d21, d22}, [r3]!
+ bgt %b1
+
+ vpop {q8 - q15}
+ vpop {q0 - q4}
+ pop {r4, r5}
+ bx lr
+ ENDP
+
+I422ToARGB4444Row_NEON PROC
+ ; input
+ ; r0 = const uint8* src_y
+ ; r1 = const uint8* src_u
+ ; r2 = const uint8* src_v
+ ; r3 = uint8* dst_argb4444
+ push {r4, r5}
+ ldr r4, [sp,#8] ; int width
+ vpush {q0 - q4}
+ vpush {q8 - q15}
+
+ YUV422TORGB_SETUP_REG
+ vmov.u8 d4, #0x0f ; bits to clear with vbic.
+1
+ READYUV422
+ YUV422TORGB
+ subs r4, r4, #8
+ vmov.u8 d23, #255
+ ARGBTOARGB4444
+ MEMACCESS 3
+ vst1.8 {q0}, [r3]! ; store 8 pixels ARGB4444.
+ bgt %b1
+
+ vpop {q8 - q15}
+ vpop {q0 - q4}
+ pop {r4, r5}
+ bx lr
+ ENDP
+
+I422ToARGB1555Row_NEON PROC
+ ; input
+ ; r0 = const uint8* src_y
+ ; r1 = const uint8* src_u
+ ; r2 = const uint8* src_v
+ ; r3 = uint8* dst_argb1555
+ push {r4, r5}
+ ldr r4, [sp,#8] ; int width
+ vpush {q0 - q4}
+ vpush {q8 - q15}
+
+ YUV422TORGB_SETUP_REG
+1
+ READYUV422
+ YUV422TORGB
+ subs r4, r4, #8
+ vmov.u8 d23, #255
+ ARGBTOARGB1555
+ MEMACCESS 3
+ vst1.8 {q0}, [r3]! ; store 8 pixels ARGB1555.
+ bgt %b1
+
+ vpop {q8 - q15}
+ vpop {q0 - q4}
+ pop {r4, r5}
+ bx lr
+ ENDP
+
+I422ToRGB565Row_NEON PROC
+ ; input
+ ; r0 = const uint8* src_y
+ ; r1 = const uint8* src_u
+ ; r2 = const uint8* src_v
+ ; r3 = uint8* dst_rgb565
+ push {r4, r5}
+ ldr r4, [sp,#8] ; int width
+ vpush {q0 - q4}
+ vpush {q8 - q15}
+
+ YUV422TORGB_SETUP_REG
+1
+ READYUV422
+ YUV422TORGB
+ subs r4, r4, #8
+ ARGBTORGB565
+ MEMACCESS 3
+ vst1.8 {q0}, [r3]! ; store 8 pixels RGB565.
+ bgt %b1
+
+ vpop {q8 - q15}
+ vpop {q0 - q4}
+ pop {r4, r5}
+ bx lr
+ ENDP
+
+I400ToARGBRow_NEON PROC
+ ; input
+ ; r0 = const uint8* src_y
+ ; r1 = uint8* dst_argb
+ ; r2 = int width
+ push {r5}
+ vpush {q0 - q4}
+ vpush {q8 - q15}
+
+ YUV422TORGB_SETUP_REG
+1
+ READYUV400
+ YUV422TORGB
+ subs r2, r2, #8
+ vmov.u8 d23, #255
+ MEMACCESS 1
+ vst4.8 {d20, d21, d22, d23}, [r1]!
+ bgt %b1
+
+
+ vpop {q8 - q15}
+ vpop {q0 - q4}
+ pop {r5}
+ bx lr
+ ENDP
+
+J400ToARGBRow_NEON PROC
+ ; input
+ ; r0 = const uint8* src_y
+ ; r1 = uint8* dst_argb
+ ; r2 = width
+ push {r5}
+ vpush {d20 - d23}
+
+ vmov.u8 d23, #255
+1
+ MEMACCESS 0
+ vld1.8 {d20}, [r0]!
+ vmov d21, d20
+ vmov d22, d20
+ subs r2, r2, #8
+ MEMACCESS 1
+ vst4.8 {d20, d21, d22, d23}, [r1]!
+ bgt %b1
+
+
+ vpop {d20 - d23}
+ pop {r5}
+ bx lr
+ ENDP
+
+
+ARGBToRGB24Row_NEON PROC
+ ; input
+ ; r0 = const uint8* src_argb
+ ; r1 = uint8* dst_rgb24
+ ; r2 = pix
+ vpush {d1 - d4}
+
+1
+ MEMACCESS 0
+ vld4.8 {d1, d2, d3, d4}, [r0]! ; load 8 pixels of ARGB.
+ subs r2, r2, #8 ; 8 processed per loop.
+ MEMACCESS 1
+ vst3.8 {d1, d2, d3}, [r1]! ; store 8 pixels of RGB24.
+ bgt %b1
+
+ vpop {d1 - d4}
+ bx lr
+ ENDP
+
+ARGBToRAWRow_NEON PROC
+ ; input
+ ; r0 = const uint8* src_argb
+ ; r1 = uint8* dst_raw
+ ; r2 = pix
+ vpush {d1 - d4}
+
+1
+ MEMACCESS 0
+ vld4.8 {d1, d2, d3, d4}, [r0]! ; load 8 pixels of ARGB.
+ subs r2, r2, #8 ; 8 processed per loop.
+ vswp.u8 d1, d3 ; swap R, B
+ MEMACCESS 1
+ vst3.8 {d1, d2, d3}, [r1]! ; store 8 pixels of RAW.
+ bgt %b1
+
+ vpop {d1 - d4}
+ bx lr
+ ENDP
+
+ARGBToRGB565Row_NEON PROC
+ ; input
+ ; r0 = const uint8* src_argb
+ ; r1 = uint8* dst_rgb565
+ ; r2 = pix
+ vpush {q0}
+ vpush {q8 - q11}
+
+1
+ MEMACCESS 0
+ vld4.8 {d20, d21, d22, d23}, [r0]! ; load 8 pixels of ARGB.
+ subs r2, r2, #8 ; 8 processed per loop.
+ ARGBTORGB565
+ MEMACCESS 1
+ vst1.8 {q0}, [r1]! ; store 8 pixels RGB565.
+ bgt %b1
+
+ vpop {q8 - q11}
+ vpop {q0}
+ bx lr
+ ENDP
+
+ARGBToARGB1555Row_NEON PROC
+ ; input
+ ; r0 = const uint8* src_argb
+ ; r1 = uint8* dst_argb1555
+ ; r2 = pix
+ vpush {q0}
+ vpush {q8 - q11}
+
+1
+ MEMACCESS 0
+ vld4.8 {d20, d21, d22, d23}, [r0]! ; load 8 pixels of ARGB.
+ subs r2, r2, #8 ; 8 processed per loop.
+ ARGBTOARGB1555
+ MEMACCESS 1
+ vst1.8 {q0}, [r1]! ; store 8 pixels ARGB1555.
+ bgt %b1
+
+ vpop {q8 - q11}
+ vpop {q0}
+ bx lr
+ ENDP
+
+ARGBToARGB4444Row_NEON PROC
+ ; input
+ ; r0 = const uint8* src_argb
+ ; r1 = uint8* dst_argb4444
+ ; r2 = pix
+ vpush {q0}
+ vpush {q8 - q11}
+
+ vmov.u8 d4, #0x0f ; bits to clear with vbic.
+1
+ MEMACCESS 0
+ vld4.8 {d20, d21, d22, d23}, [r0]! ; load 8 pixels of ARGB.
+ subs r2, r2, #8 ; 8 processed per loop.
+ ARGBTOARGB4444
+ MEMACCESS 1
+ vst1.8 {q0}, [r1]! ; store 8 pixels ARGB4444.
+ bgt %b1
+
+ vpop {q8 - q11}
+ vpop {q0}
+ bx lr
+ ENDP
+
+NV12ToARGBRow_NEON PROC
+ ; input
+ ; r0 = const uint8* src_y
+ ; r1 = const uint8* src_uv
+ ; r2 = uint8* dst_argb
+ ; r3 = int width
+ push {r5}
+ vpush {q0 - q4}
+ vpush {q8 - q15}
+
+ YUV422TORGB_SETUP_REG
+
+1
+ READNV12
+ YUV422TORGB
+ subs r3, r3, #8
+ vmov.u8 d23, #255
+ MEMACCESS 2
+ vst4.8 {d20, d21, d22, d23}, [r2]!
+ bgt %b1
+
+ vpop {q8 - q15}
+ vpop {q0 - q4}
+ pop {r5}
+ bx lr
+ ENDP
+
+NV21ToARGBRow_NEON PROC
+ ; input
+ ; r0 = const uint8* src_y
+ ; r1 = const uint8* src_uv
+ ; r2 = uint8* dst_argb
+ ; r3 = int width
+ push {r5}
+ vpush {q0 - q4}
+ vpush {q8 - q15}
+
+ YUV422TORGB_SETUP_REG
+
+1
+ READNV21
+ YUV422TORGB
+ subs r3, r3, #8
+ vmov.u8 d23, #255
+ MEMACCESS 2
+ vst4.8 {d20, d21, d22, d23}, [r2]!
+ bgt %b1
+
+ vpop {q8 - q15}
+ vpop {q0 - q4}
+ pop {r5}
+ bx lr
+ ENDP
+
+NV12ToRGB565Row_NEON PROC
+ ; input
+ ; r0 = const uint8* src_y
+ ; r1 = const uint8* src_uv
+ ; r2 = uint8* dst_rgb565
+ ; r3 = int width
+ push {r5}
+ vpush {q0 - q4}
+ vpush {q8 - q15}
+
+ YUV422TORGB_SETUP_REG
+
+1
+ READNV12
+ YUV422TORGB
+ subs r3, r3, #8
+ ARGBTORGB565
+ MEMACCESS 2
+ vst1.8 {q0}, [r2]! ; store 8 pixels RGB565.
+ bgt %b1
+
+ vpop {q8 - q15}
+ vpop {q0 - q4}
+ pop {r5}
+ bx lr
+ ENDP
+
+NV21ToRGB565Row_NEON PROC
+ ; input
+ ; r0 = const uint8* src_y
+ ; r1 = const uint8* src_uv
+ ; r2 = uint8* dst_rgb565
+ ; r3 = int width
+ push {r5}
+ vpush {q0 - q4}
+ vpush {q8 - q15}
+
+ YUV422TORGB_SETUP_REG
+
+1
+ READNV21
+ YUV422TORGB
+ subs r3, r3, #8
+ ARGBTORGB565
+ MEMACCESS 2
+ vst1.8 {q0}, [r2]! ; store 8 pixels RGB565.
+ bgt %b1
+
+ vpop {q8 - q15}
+ vpop {q0 - q4}
+ pop {r5}
+ bx lr
+ ENDP
+
+YUY2ToARGBRow_NEON PROC
+ ; input
+ ; r0 = const uint8* src_yuy2
+ ; r1 = uint8* dst_argb
+ ; r2 = width
+ push {r5}
+ vpush {q0 - q4}
+ vpush {q8 - q15}
+
+ YUV422TORGB_SETUP_REG
+1
+ READYUY2
+ YUV422TORGB
+ subs r2, r2, #8
+ vmov.u8 d23, #255
+ MEMACCESS 1
+ vst4.8 {d20, d21, d22, d23}, [r1]!
+ bgt %b1
+
+ vpop {q8 - q15}
+ vpop {q0 - q4}
+ pop {r5}
+ bx lr
+ ENDP
+
+UYVYToARGBRow_NEON PROC
+ ; input
+ ; r0 = const uint8* src_uyvy
+ ; r1 = uint8* dst_argb
+ ; r2 = width
+ push {r5}
+ vpush {q0 - q4}
+ vpush {q8 - q15}
+
+ YUV422TORGB_SETUP_REG
+1
+ READUYVY
+ YUV422TORGB
+ subs r2, r2, #8
+ vmov.u8 d23, #255
+ MEMACCESS 1
+ vst4.8 {d20, d21, d22, d23}, [r1]!
+ bgt %b1
+
+ vpop {q8 - q15}
+ vpop {q0 - q4}
+ pop {r5}
+ bx lr
+ ENDP
+
+; Reads 16 pairs of UV and writes the U bytes to dst_u and the V bytes to dst_v.
+SplitUVRow_NEON PROC
+ ; input
+ ; r0 = const uint8* src_uv
+ ; r1 = uint8* dst_u
+ ; r2 = uint8* dst_v
+ ; r3 = int width
+ vpush {q0, q1}
+
+1
+ MEMACCESS 0
+ vld2.8 {q0, q1}, [r0]! ; load 16 pairs of UV
+ subs r3, r3, #16 ; 16 processed per loop
+ MEMACCESS 1
+ vst1.8 {q0}, [r1]! ; store U
+ MEMACCESS 2
+ vst1.8 {q1}, [r2]! ; store V
+ bgt %b1
+
+ vpop {q0, q1}
+ bx lr
+ ENDP
+
+; Reads 16 U's and V's and writes out 16 pairs of UV
+MergeUVRow_NEON PROC
+ ; input
+ ; r0 = const uint8* src_u
+ ; r1 = uint8* src_v
+ ; r2 = uint8* dst_uv
+ ; r3 = int width
+ vpush {q0, q1}
+
+1
+ MEMACCESS 0
+ vld1.8 {q0}, [r0]! ; load U
+ MEMACCESS 1
+ vld1.8 {q1}, [r1]! ; load V
+ subs r3, r3, #16 ; 16 processed per loop
+ MEMACCESS 2
+ vst2.u8 {q0, q1}, [r2]! ; store 16 pairs of UV
+ bgt %b1
+
+ vpop {q0, q1}
+ bx lr
+ ENDP
+
+; Copy multiples of 32 bytes. vld1.8 allows unaligned access and is fastest on a15.
+CopyRow_NEON PROC
+ ; input
+ ; r0 = const uint8* src
+ ; r1 = uint8* dst
+ ; r2 = int count
+ vpush {q0, q1}
+
+1
+ MEMACCESS 0
+ vld1.8 {d0, d1, d2, d3}, [r0]! ; load 32
+ subs r2, r2, #32 ; 32 processed per loop
+ MEMACCESS 1
+ vst1.8 {d0, d1, d2, d3}, [r1]! ; store 32
+ bgt %b1
+
+ vpop {q0, q1}
+ bx lr
+ ENDP
+
+; SetRow writes 'count' bytes using an 8 bit value repeated
+SetRow_NEON PROC
+ ; input
+ ; r0 = uint8* dst
+ ; r1 = uint8 v8
+ ; r2 = int count
+ vpush {q0}
+
+ vdup.8 q0, r1 ; duplicate 16 bytes
+1
+ subs r2, r2, #16 ; 16 bytes per loop
+ MEMACCESS 0
+ vst1.8 {q0}, [r0]! ; store
+ bgt %b1
+
+ vpop {q0}
+ bx lr
+ ENDP
+
+; ARGBSetRow writes 'count' pixels using a 32-bit value repeated.
+ARGBSetRow_NEON PROC
+ ; input
+ ; r0 = uint8* dst_argb
+ ; r1 = uint32 v32
+ ; r2 = int count
+ vpush {q0}
+
+ vdup.u32 q0, r1 ; duplicate 4 ints
+1
+ subs r2, r2, #4 ; 4 pixels per loop
+ MEMACCESS 0
+ vst1.8 {q0}, [r0]! ; store
+ bgt %b1
+
+ vpop {q0}
+ bx lr
+ ENDP
+
+MirrorRow_NEON PROC
+ ; input
+ ; r0 = const uint8* src
+ ; r1 = uint8* dst
+ ; r2 = int width
+ push {r3}
+ vpush {q0}
+ ; Start at end of source row.
+ mov r3, #-16
+ add r0, r0, r2
+ sub r0, #16
+
+1
+ MEMACCESS 0
+ vld1.8 {q0}, [r0], r3 ; src -= 16
+ subs r2, #16 ; 16 pixels per loop.
+ vrev64.8 q0, q0
+ MEMACCESS 1
+ vst1.8 {d1}, [r1]! ; dst += 16
+ MEMACCESS 1
+ vst1.8 {d0}, [r1]!
+ bgt %b1
+
+ vpop {q0}
+ pop {r3}
+ bx lr
+ ENDP
+
+
+MirrorUVRow_NEON PROC
+ ; input
+ ; r0 = const uint8* src_uv
+ ; r1 = uint8* dst_u
+ ; r2 = uint8* dst_v
+ ; r3 = int width
+ push {r12}
+ vpush {q0}
+ ; Start at end of source row.
+ mov r12, #-16
+ add r0, r0, r3, lsl #1
+ sub r0, #16
+
+1
+ MEMACCESS 0
+ vld2.8 {d0, d1}, [r0], r12 ; src -= 16
+ subs r3, #8 ; 8 pixels per loop.
+ vrev64.8 q0, q0
+ MEMACCESS 1
+ vst1.8 {d0}, [r1]! ; dst += 8
+ MEMACCESS 2
+ vst1.8 {d1}, [r2]!
+ bgt %b1
+
+ vpop {q0}
+ pop {r12}
+ bx lr
+ ENDP
+
+ARGBMirrorRow_NEON PROC
+ ; input
+ ; r0 = const uint8* src
+ ; r1 = uint8* dst
+ ; r2 = int width
+ push {r3}
+ vpush {q0}
+
+ ; Start at end of source row.
+ mov r3, #-16
+ add r0, r0, r2, lsl #2
+ sub r0, #16
+
+1
+ MEMACCESS 0
+ vld1.8 {q0}, [r0], r3 ; src -= 16
+ subs r2, #4 ; 4 pixels per loop.
+ vrev64.32 q0, q0
+ MEMACCESS 1
+ vst1.8 {d1}, [r1]! ; dst += 16
+ MEMACCESS 1
+ vst1.8 {d0}, [r1]!
+ bgt %b1
+
+ vpop {q0}
+ pop {r3}
+ bx lr
+ ENDP
+
+
+RGB24ToARGBRow_NEON PROC
+ ; input
+ ; r0 = const uint8* src_rgb24
+ ; r1 = uint8* dst_argb
+ ; r2 = int pix
+ vpush {d1 - d4}
+ vmov.u8 d4, #255 ; Alpha
+
+1
+ MEMACCESS 0
+ vld3.8 {d1, d2, d3}, [r0]! ; load 8 pixels of RGB24.
+ subs r2, r2, #8 ; 8 processed per loop.
+ MEMACCESS 1
+ vst4.8 {d1, d2, d3, d4}, [r1]! ; store 8 pixels of ARGB.
+ bgt %b1
+
+ vpop {d1 - d4}
+ bx lr
+ ENDP
+
+RAWToARGBRow_NEON PROC
+ ; input
+ ; r0 = const uint8* src_raw
+ ; r1 = uint8* dst_argb
+ ; r2 = int pix
+ vpush {d1 - d4}
+ vmov.u8 d4, #255 ; Alpha
+
+1
+ MEMACCESS 0
+ vld3.8 {d1, d2, d3}, [r0]! ; load 8 pixels of RAW.
+ subs r2, r2, #8 ; 8 processed per loop.
+ vswp.u8 d1, d3 ; swap R, B
+ MEMACCESS 1
+ vst4.8 {d1, d2, d3, d4}, [r1]! ; store 8 pixels of ARGB.
+ bgt %b1
+
+ vpop {d1 - d4}
+ bx lr
+ ENDP
+
+RAWToRGB24Row_NEON PROC
+ ; input
+ ; r0 = const uint8* src_raw
+ ; r1 = uint8* dst_rgb24
+ ; r2 = int width
+ vpush {d1 - d4}
+
+1
+ MEMACCESS 0
+ vld3.8 {d1, d2, d3}, [r0]! ; load 8 pixels of RAW.
+ subs r2, r2, #8 ; 8 processed per loop.
+ vswp.u8 d1, d3 ; swap R, B
+ MEMACCESS 1
+ vst3.8 {d1, d2, d3}, [r1]! ; store 8 pixels of b g r.
+ bgt %b1
+
+ vpop {d1 - d4}
+ bx lr
+ ENDP
+
+RGB565ToARGBRow_NEON PROC
+ ; input
+ ; r0 = const uint8* src_rgb565
+ ; r1 = uint8* dst_argb
+ ; r2 = int pix
+ vpush {q0 - q3}
+ vmov.u8 d3, #255 ; Alpha
+
+1
+ MEMACCESS 0
+ vld1.8 {q0}, [r0]! ; load 8 RGB565 pixels.
+ subs r2, r2, #8 ; 8 processed per loop.
+ RGB565TOARGB
+ MEMACCESS 1
+ vst4.8 {d0, d1, d2, d3}, [r1]! ; store 8 pixels of ARGB.
+ bgt %b1
+
+ vpop {q0 - q3}
+ bx lr
+ ENDP
+
+ARGB1555ToARGBRow_NEON PROC
+ ; input
+ ; r0 = const uint8* src_argb1555
+ ; r1 = uint8* dst_argb
+ ; r2 = int pix
+ vpush {q0 - q3}
+ vmov.u8 d3, #255 ; Alpha
+
+1
+ MEMACCESS 0
+ vld1.8 {q0}, [r0]! ; load 8 ARGB1555 pixels.
+ subs r2, r2, #8 ; 8 processed per loop.
+ ARGB1555TOARGB
+ MEMACCESS 1
+ vst4.8 {d0, d1, d2, d3}, [r1]! ; store 8 pixels of ARGB.
+ bgt %b1
+
+ vpop {q0 - q3}
+ bx lr
+ ENDP
+
+ARGB4444ToARGBRow_NEON PROC
+ ; input
+ ; r0 = const uint8* src_argb4444
+ ; r1 = uint8* dst_argb
+ ; r2 = int pix
+ vpush {q0 - q2}
+ vmov.u8 d3, #255 ; Alpha
+
+1
+ MEMACCESS 0
+ vld1.8 {q0}, [r0]! ; load 8 ARGB4444 pixels.
+ subs r2, r2, #8 ; 8 processed per loop.
+ ARGB4444TOARGB
+ MEMACCESS 1
+ vst4.8 {d0, d1, d2, d3}, [r1]! ; store 8 pixels of ARGB.
+ bgt %b1
+
+ vpop {q0 - q2}
+ bx lr
+ ENDP
+
+ABGRToUVRow_NEON PROC
+ ; input
+ ; r0 = const uint8* src_abgr
+ ; r1 = int src_stride_abgr
+ ; r2 = uint8* dst_u
+ ; r3 = uint8* dst_v
+ push {r4}
+ ldr r4, [sp,#4] ; int width
+ vpush {q0 - q7}
+ vpush {q7 - q14}
+ vpush {q15}
+
+ add r1, r0, r1 ; src_stride + src_abgr
+ vmov.s16 q10, #112 / 2 ; UB / VR 0.875 coefficient
+ vmov.s16 q11, #74 / 2 ; UG -0.5781 coefficient
+ vmov.s16 q12, #38 / 2 ; UR -0.2969 coefficient
+ vmov.s16 q13, #18 / 2 ; VB -0.1406 coefficient
+ vmov.s16 q14, #94 / 2 ; VG -0.7344 coefficient
+ vmov.u16 q15, #0x8080 ; 128.5
+
+1
+ MEMACCESS 0
+ vld4.8 {d0, d2, d4, d6}, [r0]! ; load 8 ABGR pixels.
+ MEMACCESS 0
+ vld4.8 {d1, d3, d5, d7}, [r0]! ; load next 8 ABGR pixels.
+ vpaddl.u8 q2, q2 ; B 16 bytes -> 8 shorts.
+ vpaddl.u8 q1, q1 ; G 16 bytes -> 8 shorts.
+ vpaddl.u8 q0, q0 ; R 16 bytes -> 8 shorts.
+ MEMACCESS 1
+ vld4.8 {d8, d10, d12, d14}, [r1]! ; load 8 more ABGR pixels.
+ MEMACCESS 1
+ vld4.8 {d9, d11, d13, d15}, [r1]! ; load last 8 ABGR pixels.
+ vpadal.u8 q2, q6 ; B 16 bytes -> 8 shorts.
+ vpadal.u8 q1, q5 ; G 16 bytes -> 8 shorts.
+ vpadal.u8 q0, q4 ; R 16 bytes -> 8 shorts.
+
+ vrshr.u16 q0, q0, #1 ; 2x average
+ vrshr.u16 q1, q1, #1
+ vrshr.u16 q2, q2, #1
+
+ subs r4, r4, #16 ; 32 processed per loop.
+ RGBTOUV q2, q1, q0
+ MEMACCESS 2
+ vst1.8 {d0}, [r2]! ; store 8 pixels U.
+ MEMACCESS 3
+ vst1.8 {d1}, [r3]! ; store 8 pixels V.
+ bgt %b1
+
+ vpop {q15}
+ vpop {q7 - q14}
+ vpop {q0 - q7}
+ pop {r4}
+ bx lr
+ ENDP
+
+RGBAToUVRow_NEON PROC
+ ; input
+ ; r0 = const uint8* src_rgba
+ ; r1 = int src_stride_rgba
+ ; r2 = uint8* dst_u
+ ; r3 = uint8* dst_v
+ push {r4}
+ ldr r4, [sp,#4] ; int pix
+ vpush {q0 - q7}
+ vpush {q7 - q14}
+ vpush {q15}
+
+ add r1, r0, r1 ; src_stride + src_rgba
+ vmov.s16 q10, #112 / 2 ; UB / VR 0.875 coefficient
+ vmov.s16 q11, #74 / 2 ; UG -0.5781 coefficient
+ vmov.s16 q12, #38 / 2 ; UR -0.2969 coefficient
+ vmov.s16 q13, #18 / 2 ; VB -0.1406 coefficient
+ vmov.s16 q14, #94 / 2 ; VG -0.7344 coefficient
+ vmov.u16 q15, #0x8080 ; 128.5
+
+1
+ MEMACCESS 0
+ vld4.8 {d0, d2, d4, d6}, [r0]! ; load 8 RGBA pixels.
+ MEMACCESS 0
+ vld4.8 {d1, d3, d5, d7}, [r0]! ; load next 8 RGBA pixels.
+ vpaddl.u8 q0, q1 ; B 16 bytes -> 8 shorts.
+ vpaddl.u8 q1, q2 ; G 16 bytes -> 8 shorts.
+ vpaddl.u8 q2, q3 ; R 16 bytes -> 8 shorts.
+ MEMACCESS 1
+ vld4.8 {d8, d10, d12, d14}, [r1]! ; load 8 more RGBA pixels.
+ MEMACCESS 1
+ vld4.8 {d9, d11, d13, d15}, [r1]! ; load last 8 RGBA pixels.
+ vpadal.u8 q0, q5 ; B 16 bytes -> 8 shorts.
+ vpadal.u8 q1, q6 ; G 16 bytes -> 8 shorts.
+ vpadal.u8 q2, q7 ; R 16 bytes -> 8 shorts.
+
+ vrshr.u16 q0, q0, #1 ; 2x average
+ vrshr.u16 q1, q1, #1
+ vrshr.u16 q2, q2, #1
+
+ subs r4, r4, #16 ; 32 processed per loop.
+ RGBTOUV q0, q1, q2
+ MEMACCESS 2
+ vst1.8 {d0}, [r2]! ; store 8 pixels U.
+ MEMACCESS 3
+ vst1.8 {d1}, [r3]! ; store 8 pixels V.
+ bgt %b1
+
+ vpop {q15}
+ vpop {q7 - q14}
+ vpop {q0 - q7}
+ pop {r4}
+ bx lr
+ ENDP
+
+ABGRToYRow_NEON PROC
+ ; input
+ ; r0 = const uint8* src_abgr
+ ; r1 = uint8* dst_y
+ ; r2 = int pix
+ vpush {d0 - d7}
+ vpush {q8}
+
+ vmov.u8 d4, #33 ; R * 0.2578 coefficient
+ vmov.u8 d5, #65 ; G * 0.5078 coefficient
+ vmov.u8 d6, #13 ; B * 0.1016 coefficient
+ vmov.u8 d7, #16 ; Add 16 constant
+
+1
+ MEMACCESS 0
+ vld4.8 {d0, d1, d2, d3}, [r0]! ; load 8 pixels of ABGR.
+ subs r2, r2, #8 ; 8 processed per loop.
+ vmull.u8 q8, d0, d4 ; R
+ vmlal.u8 q8, d1, d5 ; G
+ vmlal.u8 q8, d2, d6 ; B
+ vqrshrun.s16 d0, q8, #7 ; 16 bit to 8 bit Y
+ vqadd.u8 d0, d7
+ MEMACCESS 1
+ vst1.8 {d0}, [r1]! ; store 8 pixels Y.
+ bgt %b1
+
+ vpop {q8}
+ vpop {d0 - d7}
+ bx lr
+ ENDP
+
+RGBAToYRow_NEON PROC
+ ; input
+ ; r0 = const uint8* src_rgba
+ ; r1 = uint8* dst_y
+ ; r2 = int pix
+ vpush {d0 - d7}
+ vpush {q8}
+
+ vmov.u8 d4, #13 ; B * 0.1016 coefficient
+ vmov.u8 d5, #65 ; G * 0.5078 coefficient
+ vmov.u8 d6, #33 ; R * 0.2578 coefficient
+ vmov.u8 d7, #16 ; Add 16 constant
+
+1
+ MEMACCESS 0
+ vld4.8 {d0, d1, d2, d3}, [r0]! ; load 8 pixels of RGBA.
+ subs r2, r2, #8 ; 8 processed per loop.
+ vmull.u8 q8, d1, d4 ; B
+ vmlal.u8 q8, d2, d5 ; G
+ vmlal.u8 q8, d3, d6 ; R
+ vqrshrun.s16 d0, q8, #7 ; 16 bit to 8 bit Y
+ vqadd.u8 d0, d7
+ MEMACCESS 1
+ vst1.8 {d0}, [r1]! ; store 8 pixels Y.
+ bgt %b1
+
+ vpop {q8}
+ vpop {d0 - d7}
+ bx lr
+ ENDP
+
+RGB24ToYRow_NEON PROC
+ ; input
+ ; r0 = const uint8* src_rgb24
+ ; r1 = uint8* dst_y
+ ; r2 = int pix
+ vpush {d0 - d7}
+ vpush {q8}
+
+ vmov.u8 d4, #13 ; B * 0.1016 coefficient
+ vmov.u8 d5, #65 ; G * 0.5078 coefficient
+ vmov.u8 d6, #33 ; R * 0.2578 coefficient
+ vmov.u8 d7, #16 ; Add 16 constant
+
+1
+ MEMACCESS 0
+ vld3.8 {d0, d1, d2}, [r0]! ; load 8 pixels of RGB24.
+ subs r2, r2, #8 ; 8 processed per loop.
+ vmull.u8 q8, d0, d4 ; B
+ vmlal.u8 q8, d1, d5 ; G
+ vmlal.u8 q8, d2, d6 ; R
+ vqrshrun.s16 d0, q8, #7 ; 16 bit to 8 bit Y
+ vqadd.u8 d0, d7
+ MEMACCESS 1
+ vst1.8 {d0}, [r1]! ; store 8 pixels Y.
+ bgt %b1
+
+ vpop {q8}
+ vpop {d0 - d7}
+ bx lr
+ ENDP
+
+ ; 16x2 pixels -> 8x1. pix is number of argb pixels. e.g. 16.
+ARGB1555ToUVRow_NEON PROC
+ ; input
+ ; r0 = const uint8* src_argb1555
+ ; r1 = int src_stride_argb1555
+ ; r2 = uint8* dst_u
+ ; r3 = uint8* dst_v
+ push {r4}
+ ldr r4, [sp,#4] ; int pix
+ vpush {q0 - q7}
+ vpush {q8 - q14}
+ vpush {q15}
+
+ add r1, r0, r1 ; src_stride + src_argb
+ vmov.s16 q10, #112 / 2 ; UB / VR 0.875 coefficient
+ vmov.s16 q11, #74 / 2 ; UG -0.5781 coefficient
+ vmov.s16 q12, #38 / 2 ; UR -0.2969 coefficient
+ vmov.s16 q13, #18 / 2 ; VB -0.1406 coefficient
+ vmov.s16 q14, #94 / 2 ; VG -0.7344 coefficient
+ vmov.u16 q15, #0x8080 ; 128.5
+
+1
+ MEMACCESS 0
+ vld1.8 {q0}, [r0]! ; load 8 ARGB1555 pixels.
+ RGB555TOARGB
+ vpaddl.u8 d8, d0 ; B 8 bytes -> 4 shorts.
+ vpaddl.u8 d10, d1 ; G 8 bytes -> 4 shorts.
+ vpaddl.u8 d12, d2 ; R 8 bytes -> 4 shorts.
+ MEMACCESS 0
+ vld1.8 {q0}, [r0]! ; next 8 ARGB1555 pixels.
+ RGB555TOARGB
+ vpaddl.u8 d9, d0 ; B 8 bytes -> 4 shorts.
+ vpaddl.u8 d11, d1 ; G 8 bytes -> 4 shorts.
+ vpaddl.u8 d13, d2 ; R 8 bytes -> 4 shorts.
+
+ MEMACCESS 1
+ vld1.8 {q0}, [r1]! ; load 8 ARGB1555 pixels.
+ RGB555TOARGB
+ vpadal.u8 d8, d0 ; B 8 bytes -> 4 shorts.
+ vpadal.u8 d10, d1 ; G 8 bytes -> 4 shorts.
+ vpadal.u8 d12, d2 ; R 8 bytes -> 4 shorts.
+ MEMACCESS 1
+ vld1.8 {q0}, [r1]! ; next 8 ARGB1555 pixels.
+ RGB555TOARGB
+ vpadal.u8 d9, d0 ; B 8 bytes -> 4 shorts.
+ vpadal.u8 d11, d1 ; G 8 bytes -> 4 shorts.
+ vpadal.u8 d13, d2 ; R 8 bytes -> 4 shorts.
+
+ vrshr.u16 q4, q4, #1 ; 2x average
+ vrshr.u16 q5, q5, #1
+ vrshr.u16 q6, q6, #1
+
+ subs r4, r4, #16 ; 16 processed per loop.
+ vmul.s16 q8, q4, q10 ; B
+ vmls.s16 q8, q5, q11 ; G
+ vmls.s16 q8, q6, q12 ; R
+ vadd.u16 q8, q8, q15 ; +128 -> unsigned
+ vmul.s16 q9, q6, q10 ; R
+ vmls.s16 q9, q5, q14 ; G
+ vmls.s16 q9, q4, q13 ; B
+ vadd.u16 q9, q9, q15 ; +128 -> unsigned
+ vqshrn.u16 d0, q8, #8 ; 16 bit to 8 bit U
+ vqshrn.u16 d1, q9, #8 ; 16 bit to 8 bit V
+ MEMACCESS 2
+ vst1.8 {d0}, [r2]! ; store 8 pixels U.
+ MEMACCESS 3
+ vst1.8 {d1}, [r3]! ; store 8 pixels V.
+ bgt %b1
+
+ vpop {q15}
+ vpop {q8 - q14}
+ vpop {q0 - q7}
+ pop {r4}
+ bx lr
+ ENDP
+
+
+; 16x2 pixels -> 8x1. pix is number of argb pixels. e.g. 16.
+ARGB4444ToUVRow_NEON PROC
+ ; input
+ ; r0 = const uint8* src_argb4444
+ ; r1 = int src_stride_argb4444
+ ; r2 = uint8* dst_u
+ ; r3 = uint8* dst_v
+ push {r4}
+ ldr r4, [sp,#4] ; int pix
+ vpush {q0 - q7}
+ vpush {q8 - q14}
+ vpush {q15}
+
+ add r1, r0, r1 ; src_stride + src_argb
+ vmov.s16 q10, #112 / 2 ; UB / VR 0.875 coefficient
+ vmov.s16 q11, #74 / 2 ; UG -0.5781 coefficient
+ vmov.s16 q12, #38 / 2 ; UR -0.2969 coefficient
+ vmov.s16 q13, #18 / 2 ; VB -0.1406 coefficient
+ vmov.s16 q14, #94 / 2 ; VG -0.7344 coefficient
+ vmov.u16 q15, #0x8080 ; 128.5
+
+1
+ MEMACCESS 0
+ vld1.8 {q0}, [r0]! ; load 8 ARGB4444 pixels.
+ ARGB4444TOARGB
+ vpaddl.u8 d8, d0 ; B 8 bytes -> 4 shorts.
+ vpaddl.u8 d10, d1 ; G 8 bytes -> 4 shorts.
+ vpaddl.u8 d12, d2 ; R 8 bytes -> 4 shorts.
+ MEMACCESS 0
+ vld1.8 {q0}, [r0]! ; next 8 ARGB4444 pixels.
+ ARGB4444TOARGB
+ vpaddl.u8 d9, d0 ; B 8 bytes -> 4 shorts.
+ vpaddl.u8 d11, d1 ; G 8 bytes -> 4 shorts.
+ vpaddl.u8 d13, d2 ; R 8 bytes -> 4 shorts.
+
+ MEMACCESS 1
+ vld1.8 {q0}, [r1]! ; load 8 ARGB4444 pixels.
+ ARGB4444TOARGB
+ vpadal.u8 d8, d0 ; B 8 bytes -> 4 shorts.
+ vpadal.u8 d10, d1 ; G 8 bytes -> 4 shorts.
+ vpadal.u8 d12, d2 ; R 8 bytes -> 4 shorts.
+ MEMACCESS 1
+ vld1.8 {q0}, [r1]! ; next 8 ARGB4444 pixels.
+ ARGB4444TOARGB
+ vpadal.u8 d9, d0 ; B 8 bytes -> 4 shorts.
+ vpadal.u8 d11, d1 ; G 8 bytes -> 4 shorts.
+ vpadal.u8 d13, d2 ; R 8 bytes -> 4 shorts.
+
+ vrshr.u16 q4, q4, #1 ; 2x average
+ vrshr.u16 q5, q5, #1
+ vrshr.u16 q6, q6, #1
+
+ subs r4, r4, #16 ; 16 processed per loop.
+ vmul.s16 q8, q4, q10 ; B
+ vmls.s16 q8, q5, q11 ; G
+ vmls.s16 q8, q6, q12 ; R
+ vadd.u16 q8, q8, q15 ; +128 -> unsigned
+ vmul.s16 q9, q6, q10 ; R
+ vmls.s16 q9, q5, q14 ; G
+ vmls.s16 q9, q4, q13 ; B
+ vadd.u16 q9, q9, q15 ; +128 -> unsigned
+ vqshrn.u16 d0, q8, #8 ; 16 bit to 8 bit U
+ vqshrn.u16 d1, q9, #8 ; 16 bit to 8 bit V
+ MEMACCESS 2
+ vst1.8 {d0}, [r2]! ; store 8 pixels U.
+ MEMACCESS 3
+ vst1.8 {d1}, [r3]! ; store 8 pixels V.
+ bgt %b1
+
+ vpop {q15}
+ vpop {q8 - q14}
+ vpop {q0 - q7}
+ pop {r4}
+ bx lr
+ ENDP
+
+RGB565ToYRow_NEON PROC
+ ; input
+ ; r0 = const uint8* src_rgb565
+ ; r1 = uint8* dst_y
+ ; r2 = int pix
+ vpush {q0 - q3}
+ vpush {q12 - q13}
+
+ vmov.u8 d24, #13 ; B * 0.1016 coefficient
+ vmov.u8 d25, #65 ; G * 0.5078 coefficient
+ vmov.u8 d26, #33 ; R * 0.2578 coefficient
+ vmov.u8 d27, #16 ; Add 16 constant
+
+1
+ MEMACCESS 0
+ vld1.8 {q0}, [r0]! ; load 8 RGB565 pixels.
+ subs r2, r2, #8 ; 8 processed per loop.
+ RGB565TOARGB
+ vmull.u8 q2, d0, d24 ; B
+ vmlal.u8 q2, d1, d25 ; G
+ vmlal.u8 q2, d2, d26 ; R
+ vqrshrun.s16 d0, q2, #7 ; 16 bit to 8 bit Y
+ vqadd.u8 d0, d27
+ MEMACCESS 1
+ vst1.8 {d0}, [r1]! ; store 8 pixels Y.
+ bgt %b1
+
+ vpop {q12 - q13}
+ vpop {q0 - q3}
+ bx lr
+ ENDP
+
+ ; 16x2 pixels -> 8x1. pix is number of argb pixels. e.g. 16.
+RGB565ToUVRow_NEON PROC
+ ; input
+ ; r0 = const uint8* src_rgb565
+ ; r1 = int src_stride_rgb565
+ ; r2 = uint8* dst_u
+ ; r3 = uint8* dst_v
+ push {r4}
+ ldr r4, [sp,#4] ; int pix
+ vpush {q0 - q7}
+ vpush {q8 - q14}
+ vpush {q15}
+
+ add r1, r0, r1 ; src_stride + src_argb
+ vmov.s16 q10, #112 / 2 ; UB / VR 0.875 coefficient
+ vmov.s16 q11, #74 / 2 ; UG -0.5781 coefficient
+ vmov.s16 q12, #38 / 2 ; UR -0.2969 coefficient
+ vmov.s16 q13, #18 / 2 ; VB -0.1406 coefficient
+ vmov.s16 q14, #94 / 2 ; VG -0.7344 coefficient
+ vmov.u16 q15, #0x8080 ; 128.5
+
+1
+ MEMACCESS 0
+ vld1.8 {q0}, [r0]! ; load 8 RGB565 pixels.
+ RGB565TOARGB
+ vpaddl.u8 d8, d0 ; B 8 bytes -> 4 shorts.
+ vpaddl.u8 d10, d1 ; G 8 bytes -> 4 shorts.
+ vpaddl.u8 d12, d2 ; R 8 bytes -> 4 shorts.
+ MEMACCESS 0
+ vld1.8 {q0}, [r0]! ; next 8 RGB565 pixels.
+ RGB565TOARGB
+ vpaddl.u8 d9, d0 ; B 8 bytes -> 4 shorts.
+ vpaddl.u8 d11, d1 ; G 8 bytes -> 4 shorts.
+ vpaddl.u8 d13, d2 ; R 8 bytes -> 4 shorts.
+
+ MEMACCESS 1
+ vld1.8 {q0}, [r1]! ; load 8 RGB565 pixels.
+ RGB565TOARGB
+ vpadal.u8 d8, d0 ; B 8 bytes -> 4 shorts.
+ vpadal.u8 d10, d1 ; G 8 bytes -> 4 shorts.
+ vpadal.u8 d12, d2 ; R 8 bytes -> 4 shorts.
+ MEMACCESS 1
+ vld1.8 {q0}, [r1]! ; next 8 RGB565 pixels.
+ RGB565TOARGB
+ vpadal.u8 d9, d0 ; B 8 bytes -> 4 shorts.
+ vpadal.u8 d11, d1 ; G 8 bytes -> 4 shorts.
+ vpadal.u8 d13, d2 ; R 8 bytes -> 4 shorts.
+
+ vrshr.u16 q4, q4, #1 ; 2x average
+ vrshr.u16 q5, q5, #1
+ vrshr.u16 q6, q6, #1
+
+ subs r4, r4, #16 ; 16 processed per loop.
+ vmul.s16 q8, q4, q10 ; B
+ vmls.s16 q8, q5, q11 ; G
+ vmls.s16 q8, q6, q12 ; R
+ vadd.u16 q8, q8, q15 ; +128 -> unsigned
+ vmul.s16 q9, q6, q10 ; R
+ vmls.s16 q9, q5, q14 ; G
+ vmls.s16 q9, q4, q13 ; B
+ vadd.u16 q9, q9, q15 ; +128 -> unsigned
+ vqshrn.u16 d0, q8, #8 ; 16 bit to 8 bit U
+ vqshrn.u16 d1, q9, #8 ; 16 bit to 8 bit V
+ MEMACCESS 2
+ vst1.8 {d0}, [r2]! ; store 8 pixels U.
+ MEMACCESS 3
+ vst1.8 {d1}, [r3]! ; store 8 pixels V.
+ bgt %b1
+
+ vpop {q15}
+ vpop {q8 - q14}
+ vpop {q0 - q7}
+ pop {r4}
+ bx lr
+ ENDP
+
+ARGB1555ToYRow_NEON PROC
+ ; input
+ ; r0 = const uint8* src_argb1555
+ ; r1 = uint8* dst_y
+ ; r2 = int pix
+ vpush {q0 - q3}
+ vpush {q12 - q13}
+
+ vmov.u8 d24, #13 ; B * 0.1016 coefficient
+ vmov.u8 d25, #65 ; G * 0.5078 coefficient
+ vmov.u8 d26, #33 ; R * 0.2578 coefficient
+ vmov.u8 d27, #16 ; Add 16 constant
+
+1
+ MEMACCESS 0
+ vld1.8 {q0}, [r0]! ; load 8 ARGB1555 pixels.
+ subs r2, r2, #8 ; 8 processed per loop.
+ ARGB1555TOARGB
+ vmull.u8 q2, d0, d24 ; B
+ vmlal.u8 q2, d1, d25 ; G
+ vmlal.u8 q2, d2, d26 ; R
+ vqrshrun.s16 d0, q2, #7 ; 16 bit to 8 bit Y
+ vqadd.u8 d0, d27
+ MEMACCESS 1
+ vst1.8 {d0}, [r1]! ; store 8 pixels Y.
+ bgt %b1
+
+ vpop {q12 - q13}
+ vpop {q0 - q3}
+ bx lr
+ ENDP
+
+ARGB4444ToYRow_NEON PROC
+ ; input
+ ; r0 = const uint8* src_argb4444
+ ; r1 = uint8* dst_y
+ ; r2 = int pix
+ vpush {q0 - q3}
+ vpush {q12 - q13}
+
+ vmov.u8 d24, #13 ; B * 0.1016 coefficient
+ vmov.u8 d25, #65 ; G * 0.5078 coefficient
+ vmov.u8 d26, #33 ; R * 0.2578 coefficient
+ vmov.u8 d27, #16 ; Add 16 constant
+
+1
+ MEMACCESS 0
+ vld1.8 {q0}, [r0]! ; load 8 ARGB4444 pixels.
+ subs r2, r2, #8 ; 8 processed per loop.
+ ARGB4444TOARGB
+ vmull.u8 q2, d0, d24 ; B
+ vmlal.u8 q2, d1, d25 ; G
+ vmlal.u8 q2, d2, d26 ; R
+ vqrshrun.s16 d0, q2, #7 ; 16 bit to 8 bit Y
+ vqadd.u8 d0, d27
+ MEMACCESS 1
+ vst1.8 {d0}, [r1]! ; store 8 pixels Y.
+ bgt %b1
+
+ vpop {q12 - q13}
+ vpop {q0 - q3}
+ bx lr
+ ENDP
+
+BGRAToYRow_NEON PROC
+ ; input
+ ; r0 = const uint8* src_bgra
+ ; r1 = uint8* dst_y
+ ; r2 = int pix
+ vpush {q0 - q3}
+ vpush {q12 - q13}
+
+ vmov.u8 d4, #33 ; R * 0.2578 coefficient
+ vmov.u8 d5, #65 ; G * 0.5078 coefficient
+ vmov.u8 d6, #13 ; B * 0.1016 coefficient
+ vmov.u8 d7, #16 ; Add 16 constant
+
+1
+ MEMACCESS 0
+ vld4.8 {d0, d1, d2, d3}, [r0]! ; load 8 pixels of BGRA.
+ subs r2, r2, #8 ; 8 processed per loop.
+ vmull.u8 q8, d1, d4 ; R
+ vmlal.u8 q8, d2, d5 ; G
+ vmlal.u8 q8, d3, d6 ; B
+ vqrshrun.s16 d0, q8, #7 ; 16 bit to 8 bit Y
+ vqadd.u8 d0, d7
+ MEMACCESS 1
+ vst1.8 {d0}, [r1]! ; store 8 pixels Y.
+ bgt %b1
+
+ vpop {q12 - q13}
+ vpop {q0 - q3}
+ bx lr
+ ENDP
+
+; 32x1 pixels -> 8x1. pix is number of argb pixels. e.g. 32.
+ARGBToUV411Row_NEON PROC
+ ; input
+ ; r0 = const uint8* src_argb
+ ; r1 = uint8* dst_u
+ ; r2 = uint8* dst_v
+ ; r3 = int pix
+ vpush {q0 - q7}
+ vpush {q8 - q14}
+ vpush {q15}
+
+ vmov.s16 q10, #112 / 2 ; UB / VR 0.875 coefficient
+ vmov.s16 q11, #74 / 2 ; UG -0.5781 coefficient
+ vmov.s16 q12, #38 / 2 ; UR -0.2969 coefficient
+ vmov.s16 q13, #18 / 2 ; VB -0.1406 coefficient
+ vmov.s16 q14, #94 / 2 ; VG -0.7344 coefficient
+ vmov.u16 q15, #0x8080 ; 128.5
+
+1
+ MEMACCESS 0
+ vld4.8 {d0, d2, d4, d6}, [r0]! ; load 8 ARGB pixels.
+ MEMACCESS 0
+ vld4.8 {d1, d3, d5, d7}, [r0]! ; load next 8 ARGB pixels.
+ vpaddl.u8 q0, q0 ; B 16 bytes -> 8 shorts.
+ vpaddl.u8 q1, q1 ; G 16 bytes -> 8 shorts.
+ vpaddl.u8 q2, q2 ; R 16 bytes -> 8 shorts.
+ MEMACCESS 0
+ vld4.8 {d8, d10, d12, d14}, [r0]! ; load 8 more ARGB pixels.
+ MEMACCESS 0
+ vld4.8 {d9, d11, d13, d15}, [r0]! ; load last 8 ARGB pixels.
+ vpaddl.u8 q4, q4 ; B 16 bytes -> 8 shorts.
+ vpaddl.u8 q5, q5 ; G 16 bytes -> 8 shorts.
+ vpaddl.u8 q6, q6 ; R 16 bytes -> 8 shorts.
+
+ vpadd.u16 d0, d0, d1 ; B 16 shorts -> 8 shorts.
+ vpadd.u16 d1, d8, d9 ; B
+ vpadd.u16 d2, d2, d3 ; G 16 shorts -> 8 shorts.
+ vpadd.u16 d3, d10, d11 ; G
+ vpadd.u16 d4, d4, d5 ; R 16 shorts -> 8 shorts.
+ vpadd.u16 d5, d12, d13 ; R
+
+ vrshr.u16 q0, q0, #1 ; 2x average
+ vrshr.u16 q1, q1, #1
+ vrshr.u16 q2, q2, #1
+
+ subs r3, r3, #32 ; 32 processed per loop.
+ vmul.s16 q8, q0, q10 ; B
+ vmls.s16 q8, q1, q11 ; G
+ vmls.s16 q8, q2, q12 ; R
+ vadd.u16 q8, q8, q15 ; +128 -> unsigned
+ vmul.s16 q9, q2, q10 ; R
+ vmls.s16 q9, q1, q14 ; G
+ vmls.s16 q9, q0, q13 ; B
+ vadd.u16 q9, q9, q15 ; +128 -> unsigned
+ vqshrn.u16 d0, q8, #8 ; 16 bit to 8 bit U
+ vqshrn.u16 d1, q9, #8 ; 16 bit to 8 bit V
+ MEMACCESS 1
+ vst1.8 {d0}, [r1]! ; store 8 pixels U.
+ MEMACCESS 2
+ vst1.8 {d1}, [r2]! ; store 8 pixels V.
+ bgt %b1
+
+ vpop {q15}
+ vpop {q8 - q14}
+ vpop {q0 - q7}
+ bx lr
+ ENDP
+
+ARGBToUV422Row_NEON PROC
+ ; input
+ ; r0 = const uint8* src_argb
+ ; r1 = uint8* dst_u
+ ; r2 = uint8* dst_v
+ ; r3 = int pix
+ vpush {q0 - q7}
+ vpush {q8 - q14}
+ vpush {q15}
+
+ vmov.s16 q10, #112 / 2 ; UB / VR 0.875 coefficient
+ vmov.s16 q11, #74 / 2 ; UG -0.5781 coefficient
+ vmov.s16 q12, #38 / 2 ; UR -0.2969 coefficient
+ vmov.s16 q13, #18 / 2 ; VB -0.1406 coefficient
+ vmov.s16 q14, #94 / 2 ; VG -0.7344 coefficient
+ vmov.u16 q15, #0x8080 ; 128.5
+
+1
+ MEMACCESS 0
+ vld4.8 {d0, d2, d4, d6}, [r0]! ; load 8 ARGB pixels.
+ MEMACCESS 0
+ vld4.8 {d1, d3, d5, d7}, [r0]! ; load next 8 ARGB pixels.
+
+ vpaddl.u8 q0, q0 ; B 16 bytes -> 8 shorts.
+ vpaddl.u8 q1, q1 ; G 16 bytes -> 8 shorts.
+ vpaddl.u8 q2, q2 ; R 16 bytes -> 8 shorts.
+
+ subs r3, r3, #16 ; 16 processed per loop.
+ vmul.s16 q8, q0, q10 ; B
+ vmls.s16 q8, q1, q11 ; G
+ vmls.s16 q8, q2, q12 ; R
+ vadd.u16 q8, q8, q15 ; +128 -> unsigned
+
+ vmul.s16 q9, q2, q10 ; R
+ vmls.s16 q9, q1, q14 ; G
+ vmls.s16 q9, q0, q13 ; B
+ vadd.u16 q9, q9, q15 ; +128 -> unsigned
+
+ vqshrn.u16 d0, q8, #8 ; 16 bit to 8 bit U
+ vqshrn.u16 d1, q9, #8 ; 16 bit to 8 bit V
+
+ MEMACCESS 1
+ vst1.8 {d0}, [r1]! ; store 8 pixels U.
+ MEMACCESS 2
+ vst1.8 {d1}, [r2]! ; store 8 pixels V.
+ bgt %b1
+
+ vpop {q15}
+ vpop {q8 - q14}
+ vpop {q0 - q7}
+ bx lr
+ ENDP
+
+ ; 8x1 pixels.
+ARGBToUV444Row_NEON PROC
+ ; input
+ ; r0 = const uint8* src_argb
+ ; r1 = uint8* dst_u
+ ; r2 = uint8* dst_v
+ ; r3 = int pix
+ vpush {q0 - q4}
+ vpush {q12 - q15}
+
+ vmov.u8 d24, #112 ; UB / VR 0.875 coefficient
+ vmov.u8 d25, #74 ; UG -0.5781 coefficient
+ vmov.u8 d26, #38 ; UR -0.2969 coefficient
+ vmov.u8 d27, #18 ; VB -0.1406 coefficient
+ vmov.u8 d28, #94 ; VG -0.7344 coefficient
+ vmov.u16 q15, #0x8080 ; 128.5
+
+1
+ MEMACCESS 0
+ vld4.8 {d0, d1, d2, d3}, [r0]! ; load 8 ARGB pixels.
+ subs r3, r3, #8 ; 8 processed per loop.
+ vmull.u8 q2, d0, d24 ; B
+ vmlsl.u8 q2, d1, d25 ; G
+ vmlsl.u8 q2, d2, d26 ; R
+ vadd.u16 q2, q2, q15 ; +128 -> unsigned
+
+ vmull.u8 q3, d2, d24 ; R
+ vmlsl.u8 q3, d1, d28 ; G
+ vmlsl.u8 q3, d0, d27 ; B
+ vadd.u16 q3, q3, q15 ; +128 -> unsigned
+
+ vqshrn.u16 d0, q2, #8 ; 16 bit to 8 bit U
+ vqshrn.u16 d1, q3, #8 ; 16 bit to 8 bit V
+
+ MEMACCESS 1
+ vst1.8 {d0}, [r1]! ; store 8 pixels U.
+ MEMACCESS 2
+ vst1.8 {d1}, [r2]! ; store 8 pixels V.
+ bgt %b1
+
+ vpop {q12 - q15}
+ vpop {q0 - q4}
+ bx lr
+ ENDP
+
+YUY2ToUV422Row_NEON PROC
+ ; input
+ ; r0 = const uint8* src_yuy2
+ ; r1 = uint8* dst_u
+ ; r2 = uint8* dst_v
+ ; r3 = int pix
+ vpush {d0 - d3}
+
+1
+ MEMACCESS 0
+ vld4.8 {d0, d1, d2, d3}, [r0]! ; load 16 pixels of YUY2.
+ subs r3, r3, #16 ; 16 pixels = 8 UVs.
+ MEMACCESS 1
+ vst1.8 {d1}, [r1]! ; store 8 U.
+ MEMACCESS 2
+ vst1.8 {d3}, [r2]! ; store 8 V.
+ bgt %b1
+
+ vpop {d0 - d3}
+ bx lr
+ ENDP
+
+
+UYVYToUV422Row_NEON PROC
+ ; input
+ ; r0 = const uint8* src_uyvy
+ ; r1 = uint8* dst_u
+ ; r2 = uint8* dst_v
+ ; r3 = int pix
+ vpush {d0 - d3}
+
+1
+ MEMACCESS 0
+ vld4.8 {d0, d1, d2, d3}, [r0]! ; load 16 pixels of UYVY.
+ subs r3, r3, #16 ; 16 pixels = 8 UVs.
+ MEMACCESS 1
+ vst1.8 {d0}, [r1]! ; store 8 U.
+ MEMACCESS 2
+ vst1.8 {d2}, [r2]! ; store 8 V.
+ bgt %b1
+
+ vpop {d0 - d3}
+ bx lr
+ ENDP
+
+ ; Select G channels from ARGB. e.g. GGGGGGGG
+ARGBToBayerGGRow_NEON PROC
+ ; input
+ ; r0 = const uint8* src_argb
+ ; r1 = uint8* dst_bayer
+ ; r2 = uint32 selector
+ ; r3 = int pix
+ vpush {q0, q1}
+
+1
+ MEMACCESS 0
+ vld4.8 {d0, d1, d2, d3}, [r0]! ; load row 8 pixels.
+ subs r3, r3, #8 ; 8 processed per loop
+ MEMACCESS 1
+ vst1.8 {d1}, [r1]! ; store 8 G's.
+ bgt %b1
+
+ vpop {q0, q1}
+ bx lr
+ ENDP
+
+; For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
+ARGBShuffleRow_NEON PROC
+ ; input
+ ; r0 = const uint8* src_argb
+ ; r1 = uint8* dst_argb
+ ; r2 = const uint8* shuffler
+ ; r3 = int pix
+ vpush {q0 - q2}
+
+ MEMACCESS 3
+ vld1.8 {q2}, [r2] ; shuffler
+1
+ MEMACCESS 0
+ vld1.8 {q0}, [r0]! ; load 4 pixels.
+ subs r3, r3, #4 ; 4 processed per loop
+ vtbl.8 d2, {d0, d1}, d4 ; look up 2 first pixels
+ vtbl.8 d3, {d0, d1}, d5 ; look up 2 next pixels
+ MEMACCESS 1
+ vst1.8 {q1}, [r1]! ; store 4.
+ bgt %b1
+
+ vpop {q0 - q2}
+ bx lr
+ ENDP
+
+ ; TODO(fbarchard): Subsample match C code.
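+ ; ARGBToUVJRow uses the full-range (JPEG) constants instead, roughly:
+ ;   u = (127 * b - 84 * g - 43 * r + 0x8080) >> 8
+ ;   v = (127 * r - 107 * g - 20 * b + 0x8080) >> 8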
+ARGBToUVJRow_NEON PROC
+ ; input
+ ; r0 = const uint8* src_argb
+ ; r1 = int src_stride_argb
+ ; r2 = uint8* dst_u
+ ; r3 = uint8* dst_v
+ push {r4}
+ ldr r4, [sp,#4] ; int pix
+ vpush {q0 - q7}
+ vpush {q8 - q14}
+ vpush {q15}
+
+ add r1, r0, r1 ; src_stride + src_argb
+ vmov.s16 q10, #127 / 2 ; UB / VR 0.500 coefficient
+ vmov.s16 q11, #84 / 2 ; UG -0.33126 coefficient
+ vmov.s16 q12, #43 / 2 ; UR -0.16874 coefficient
+ vmov.s16 q13, #20 / 2 ; VB -0.08131 coefficient
+ vmov.s16 q14, #107 / 2 ; VG -0.41869 coefficient
+ vmov.u16 q15, #0x8080 ; 128.5
+
+1
+ MEMACCESS 0
+ vld4.8 {d0, d2, d4, d6}, [r0]! ; load 8 ARGB pixels.
+ MEMACCESS 0
+ vld4.8 {d1, d3, d5, d7}, [r0]! ; load next 8 ARGB pixels.
+ vpaddl.u8 q0, q0 ; B 16 bytes -> 8 shorts.
+ vpaddl.u8 q1, q1 ; G 16 bytes -> 8 shorts.
+ vpaddl.u8 q2, q2 ; R 16 bytes -> 8 shorts.
+ MEMACCESS 1
+ vld4.8 {d8, d10, d12, d14}, [r1]! ; load 8 more ARGB pixels.
+ MEMACCESS 1
+ vld4.8 {d9, d11, d13, d15}, [r1]! ; load last 8 ARGB pixels.
+ vpadal.u8 q0, q4 ; B 16 bytes -> 8 shorts.
+ vpadal.u8 q1, q5 ; G 16 bytes -> 8 shorts.
+ vpadal.u8 q2, q6 ; R 16 bytes -> 8 shorts.
+
+ vrshr.u16 q0, q0, #1 ; 2x average
+ vrshr.u16 q1, q1, #1
+ vrshr.u16 q2, q2, #1
+
+ subs r4, r4, #16 ; 32 processed per loop.
+ RGBTOUV q0, q1, q2
+ MEMACCESS 2
+ vst1.8 {d0}, [r2]! ; store 8 pixels U.
+ MEMACCESS 3
+ vst1.8 {d1}, [r3]! ; store 8 pixels V.
+ bgt %b1
+
+ vpop {q15}
+ vpop {q8 - q14}
+ vpop {q0 - q7}
+ pop {r4}
+ bx lr
+ ENDP
+
+
+BGRAToUVRow_NEON PROC
+ ; input
+ ; r0 = const uint8* src_bgra
+ ; r1 = int src_stride_bgra
+ ; r2 = uint8* dst_u
+ ; r3 = uint8* dst_v
+ push {r4}
+ ldr r4, [sp,#4] ; int pix
+ vpush {q0 - q7}
+ vpush {q8 - q14}
+ vpush {q15}
+
+ add r1, r0, r1 ; src_stride + src_bgra
+ vmov.s16 q10, #112 / 2 ; UB / VR 0.875 coefficient
+ vmov.s16 q11, #74 / 2 ; UG -0.5781 coefficient
+ vmov.s16 q12, #38 / 2 ; UR -0.2969 coefficient
+ vmov.s16 q13, #18 / 2 ; VB -0.1406 coefficient
+ vmov.s16 q14, #94 / 2 ; VG -0.7344 coefficient
+ vmov.u16 q15, #0x8080 ; 128.5
+
+1
+ MEMACCESS 0
+ vld4.8 {d0, d2, d4, d6}, [r0]! ; load 8 BGRA pixels.
+ MEMACCESS 0
+ vld4.8 {d1, d3, d5, d7}, [r0]! ; load next 8 BGRA pixels.
+ vpaddl.u8 q3, q3 ; B 16 bytes -> 8 shorts.
+ vpaddl.u8 q2, q2 ; G 16 bytes -> 8 shorts.
+ vpaddl.u8 q1, q1 ; R 16 bytes -> 8 shorts.
+ MEMACCESS 1
+ vld4.8 {d8, d10, d12, d14}, [r1]! ; load 8 more BGRA pixels.
+ MEMACCESS 1
+ vld4.8 {d9, d11, d13, d15}, [r1]! ; load last 8 BGRA pixels.
+ vpadal.u8 q3, q7 ; B 16 bytes -> 8 shorts.
+ vpadal.u8 q2, q6 ; G 16 bytes -> 8 shorts.
+ vpadal.u8 q1, q5 ; R 16 bytes -> 8 shorts.
+
+ vrshr.u16 q1, q1, #1 ; 2x average
+ vrshr.u16 q2, q2, #1
+ vrshr.u16 q3, q3, #1
+
+ subs r4, r4, #16 ; 32 processed per loop.
+ RGBTOUV q3, q2, q1
+ MEMACCESS 2
+ vst1.8 {d0}, [r2]! ; store 8 pixels U.
+ MEMACCESS 3
+ vst1.8 {d1}, [r3]! ; store 8 pixels V.
+ bgt %b1
+
+ vpop {q15}
+ vpop {q8 - q14}
+ vpop {q0 - q7}
+ pop {r4}
+ bx lr
+ ENDP
+
+ARGBExtractAlphaRow_NEON PROC
+ ; input
+ ; r0 = const uint8* src_argb
+ ; r1 = uint8* dst_y
+ ; r2 = int pix
+ vpush {q0 - q3}
+
+1
+ MEMACCESS 0
+ vld4.8 {d0, d2, d4, d6}, [r0]! ; load 8 ARGB pixels
+ vld4.8 {d1, d3, d5, d7}, [r0]! ; load next 8 ARGB pixels
+ subs r2, r2, #16 ; 16 processed per loop
+ MEMACCESS 1
+ vst1.8 {q3}, [r1]! ; store 16 A's.
+ bgt %b1
+
+ vpop {q0 - q3}
+ bx lr
+ ENDP
+
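+; ARGBToYJRow computes the full-range (JPEG) luma, roughly:
+;   y = (15 * b + 75 * g + 38 * r + 64) >> 7
+; i.e. the same formula as ARGBGrayRow but stored as a single Y plane.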
+ARGBToYJRow_NEON PROC
+ ; input
+ ; r0 = const uint8* src_argb
+ ; r1 = uint8* dst_y
+ ; r2 = int pix
+ vpush {q0 - q2}
+ vpush {q12 - q13}
+
+ vmov.u8 d24, #15 ; B * 0.11400 coefficient
+ vmov.u8 d25, #75 ; G * 0.58700 coefficient
+ vmov.u8 d26, #38 ; R * 0.29900 coefficient
+
+1
+ MEMACCESS 0
+ vld4.8 {d0, d1, d2, d3}, [r0]! ; load 8 ARGB pixels.
+ subs r2, r2, #8 ; 8 processed per loop.
+ vmull.u8 q2, d0, d24 ; B
+ vmlal.u8 q2, d1, d25 ; G
+ vmlal.u8 q2, d2, d26 ; R
+ vqrshrun.s16 d0, q2, #7 ; 15 bit to 8 bit Y
+ MEMACCESS 1
+ vst1.8 {d0}, [r1]! ; store 8 pixels Y.
+ bgt %b1
+
+ vpop {q12 - q13}
+ vpop {q0 - q2}
+ bx lr
+ ENDP
+
+I422ToYUY2Row_NEON PROC
+ ; input
+ ; r0 = const uint8* src_y
+ ; r1 = const uint8* src_u
+ ; r2 = const uint8* src_v
+ ; r3 = uint8* dst_yuy2
+ push {r4}
+ ldr r4, [sp,#4] ; int width
+ vpush {d0 - d3}
+
+1
+ MEMACCESS 0
+ vld2.8 {d0, d2}, [r0]! ; load 16 Ys
+ MEMACCESS 1
+ vld1.8 {d1}, [r1]! ; load 8 Us
+ MEMACCESS 2
+ vld1.8 {d3}, [r2]! ; load 8 Vs
+ subs r4, r4, #16 ; 16 pixels
+ MEMACCESS 3
+ vst4.8 {d0, d1, d2, d3}, [r3]! ; Store 8 YUY2/16 pixels.
+ bgt %b1
+
+ vpop {d0 - d3}
+ pop {r4}
+ bx lr
+ ENDP
+
+I422ToUYVYRow_NEON PROC
+ ; input
+ ; r0 = const uint8* src_y
+ ; r1 = const uint8* src_u
+ ; r2 = const uint8* src_v
+ ; r3 = uint8* dst_uyvy
+ push {r4}
+ ldr r4, [sp,#4] ; int width
+ vpush {d0 - d3}
+
+1
+ MEMACCESS 0
+ vld2.8 {d1, d3}, [r0]! ; load 16 Ys
+ MEMACCESS 1
+ vld1.8 {d0}, [r1]! ; load 8 Us
+ MEMACCESS 2
+ vld1.8 {d2}, [r2]! ; load 8 Vs
+ subs r4, r4, #16 ; 16 pixels
+ MEMACCESS 3
+ vst4.8 {d0, d1, d2, d3}, [r3]! ; Store 8 UYVY/16 pixels.
+ bgt %b1
+
+ vpop {d0 - d3}
+ pop {r4}
+ bx lr
+ ENDP
+
+ ; TODO(fbarchard): Consider vhadd vertical, then vpaddl horizontal, ashr.
+ARGBToUVRow_NEON PROC
+ ; input
+ ; r0 = const uint8* src_argb
+ ; r1 = int src_stride_argb
+ ; r2 = uint8* dst_u
+ ; r3 = uint8* dst_v
+ push {r4}
+ ldr r4, [sp,#4] ; int pix
+ vpush {q0 - q7}
+ vpush {q8 - q14}
+ vpush {q15}
+
+ add r1, r0, r1 ; src_stride + src_argb
+ vmov.s16 q10, #112 / 2 ; UB / VR 0.875 coefficient
+ vmov.s16 q11, #74 / 2 ; UG -0.5781 coefficient
+ vmov.s16 q12, #38 / 2 ; UR -0.2969 coefficient
+ vmov.s16 q13, #18 / 2 ; VB -0.1406 coefficient
+ vmov.s16 q14, #94 / 2 ; VG -0.7344 coefficient
+ vmov.u16 q15, #0x8080 ; 128.5
+
+1
+ MEMACCESS 0
+ vld4.8 {d0, d2, d4, d6}, [r0]! ; load 8 ARGB pixels.
+ MEMACCESS 0
+ vld4.8 {d1, d3, d5, d7}, [r0]! ; load next 8 ARGB pixels.
+ vpaddl.u8 q0, q0 ; B 16 bytes -> 8 shorts.
+ vpaddl.u8 q1, q1 ; G 16 bytes -> 8 shorts.
+ vpaddl.u8 q2, q2 ; R 16 bytes -> 8 shorts.
+ MEMACCESS 1
+ vld4.8 {d8, d10, d12, d14}, [r1]! ; load 8 more ARGB pixels.
+ MEMACCESS 1
+ vld4.8 {d9, d11, d13, d15}, [r1]! ; load last 8 ARGB pixels.
+ vpadal.u8 q0, q4 ; B 16 bytes -> 8 shorts.
+ vpadal.u8 q1, q5 ; G 16 bytes -> 8 shorts.
+ vpadal.u8 q2, q6 ; R 16 bytes -> 8 shorts.
+
+ vrshr.u16 q0, q0, #1 ; 2x average
+ vrshr.u16 q1, q1, #1
+ vrshr.u16 q2, q2, #1
+
+ subs r4, r4, #16 ; 32 processed per loop.
+ RGBTOUV q0, q1, q2
+ MEMACCESS 2
+ vst1.8 {d0}, [r2]! ; store 8 pixels U.
+ MEMACCESS 3
+ vst1.8 {d1}, [r3]! ; store 8 pixels V.
+ bgt %b1
+
+ vpop {q15}
+ vpop {q8 - q14}
+ vpop {q0 - q7}
+ pop {r4}
+ bx lr
+ ENDP
+
+ARGBToYRow_NEON PROC
+ ; input
+ ; r0 = const uint8* src_argb
+ ; r1 = uint8* dst_y
+ ; r2 = int pix
+ vpush {q0-q2}
+ vpush {q12,q13}
+
+ vmov.u8 d24, #13 ; B * 0.1016 coefficient
+ vmov.u8 d25, #65 ; G * 0.5078 coefficient
+ vmov.u8 d26, #33 ; R * 0.2578 coefficient
+ vmov.u8 d27, #16 ; Add 16 constant
+
+1
+ MEMACCESS 0
+ vld4.8 {d0, d1, d2, d3}, [r0]! ; load 8 ARGB pixels.
+ subs r2, r2, #8 ; 8 processed per loop.
+ vmull.u8 q2, d0, d24 ; B
+ vmlal.u8 q2, d1, d25 ; G
+ vmlal.u8 q2, d2, d26 ; R
+ vqrshrun.s16 d0, q2, #7 ; 16 bit to 8 bit Y
+ vqadd.u8 d0, d27
+ MEMACCESS 1
+ vst1.8 {d0}, [r1]! ; store 8 pixels Y.
+ bgt %b1
+
+ vpop {q12, q13}
+ vpop {q0-q2}
+ bx lr
+ ENDP
+
+RAWToUVRow_NEON PROC
+ ; input
+ ; r0 = const uint8* src_raw
+ ; r1 = int src_stride_raw
+ ; r2 = uint8* dst_u
+ ; r3 = uint8* dst_v
+ push {r4}
+ ldr r4, [sp,#4] ; int pix
+ vpush {q0 - q7}
+ vpush {q8 - q14}
+ vpush {q15}
+
+ add r1, r0, r1 ; src_stride + src_raw
+ vmov.s16 q10, #112 / 2 ; UB / VR 0.875 coefficient
+ vmov.s16 q11, #74 / 2 ; UG -0.5781 coefficient
+ vmov.s16 q12, #38 / 2 ; UR -0.2969 coefficient
+ vmov.s16 q13, #18 / 2 ; VB -0.1406 coefficient
+ vmov.s16 q14, #94 / 2 ; VG -0.7344 coefficient
+ vmov.u16 q15, #0x8080 ; 128.5
+
+1
+ MEMACCESS 0
+ vld3.8 {d0, d2, d4}, [r0]! ; load 8 RAW pixels.
+ MEMACCESS 0
+ vld3.8 {d1, d3, d5}, [r0]! ; load next 8 RAW pixels.
+ vpaddl.u8 q2, q2 ; B 16 bytes -> 8 shorts.
+ vpaddl.u8 q1, q1 ; G 16 bytes -> 8 shorts.
+ vpaddl.u8 q0, q0 ; R 16 bytes -> 8 shorts.
+ MEMACCESS 1
+ vld3.8 {d8, d10, d12}, [r1]! ; load 8 more RAW pixels.
+ MEMACCESS 1
+ vld3.8 {d9, d11, d13}, [r1]! ; load last 8 RAW pixels.
+ vpadal.u8 q2, q6 ; B 16 bytes -> 8 shorts.
+ vpadal.u8 q1, q5 ; G 16 bytes -> 8 shorts.
+ vpadal.u8 q0, q4 ; R 16 bytes -> 8 shorts.
+
+ vrshr.u16 q0, q0, #1 ; 2x average
+ vrshr.u16 q1, q1, #1
+ vrshr.u16 q2, q2, #1
+
+ subs r4, r4, #16 ; 32 processed per loop.
+ RGBTOUV q2, q1, q0
+ MEMACCESS 2
+ vst1.8 {d0}, [r2]! ; store 8 pixels U.
+ MEMACCESS 3
+ vst1.8 {d1}, [r3]! ; store 8 pixels V.
+ bgt %b1
+
+ vpop {q15}
+ vpop {q8 - q14}
+ vpop {q0 - q7}
+ pop {r4}
+ bx lr
+ ENDP
+
+RAWToYRow_NEON PROC
+ ; input
+ ; r0 = const uint8* src_raw
+ ; r1 = uint8* dst_y
+ ; r2 = int pix
+ vpush {d0 - d7}
+ vpush {q8}
+
+ vmov.u8 d4, #33 ; R * 0.2578 coefficient
+ vmov.u8 d5, #65 ; G * 0.5078 coefficient
+ vmov.u8 d6, #13 ; B * 0.1016 coefficient
+ vmov.u8 d7, #16 ; Add 16 constant
+
+1
+ MEMACCESS 0
+ vld3.8 {d0, d1, d2}, [r0]! ; load 8 pixels of RAW.
+ subs r2, r2, #8 ; 8 processed per loop.
+ vmull.u8 q8, d0, d4 ; B
+ vmlal.u8 q8, d1, d5 ; G
+ vmlal.u8 q8, d2, d6 ; R
+ vqrshrun.s16 d0, q8, #7 ; 16 bit to 8 bit Y
+ vqadd.u8 d0, d7
+ MEMACCESS 1
+ vst1.8 {d0}, [r1]! ; store 8 pixels Y.
+ bgt %b1
+
+ vpop {q8}
+ vpop {d0-d7}
+ bx lr
+ ENDP
+
+
+RGB24ToUVRow_NEON PROC
+ ; input
+ ; r0 = const uint8* src_rgb24
+ ; r1 = int src_stride_rgb24
+ ; r2 = uint8* dst_u
+ ; r3 = uint8* dst_v
+ push {r4}
+ ldr r4, [sp,#4] ; int pix
+ vpush {q0 - q7}
+ vpush {q8 - q14}
+ vpush {q15}
+
+ add r1, r0, r1 ; src_stride + src_rgb24
+ vmov.s16 q10, #112 / 2 ; UB / VR 0.875 coefficient
+ vmov.s16 q11, #74 / 2 ; UG -0.5781 coefficient
+ vmov.s16 q12, #38 / 2 ; UR -0.2969 coefficient
+ vmov.s16 q13, #18 / 2 ; VB -0.1406 coefficient
+ vmov.s16 q14, #94 / 2 ; VG -0.7344 coefficient
+ vmov.u16 q15, #0x8080 ; 128.5
+
+1
+ MEMACCESS 0
+ vld3.8 {d0, d2, d4}, [r0]! ; load 8 RGB24 pixels.
+ MEMACCESS 0
+ vld3.8 {d1, d3, d5}, [r0]! ; load next 8 RGB24 pixels.
+ vpaddl.u8 q0, q0 ; B 16 bytes -> 8 shorts.
+ vpaddl.u8 q1, q1 ; G 16 bytes -> 8 shorts.
+ vpaddl.u8 q2, q2 ; R 16 bytes -> 8 shorts.
+ MEMACCESS 1
+ vld3.8 {d8, d10, d12}, [r1]! ; load 8 more RGB24 pixels.
+ MEMACCESS 1
+ vld3.8 {d9, d11, d13}, [r1]! ; load last 8 RGB24 pixels.
+ vpadal.u8 q0, q4 ; B 16 bytes -> 8 shorts.
+ vpadal.u8 q1, q5 ; G 16 bytes -> 8 shorts.
+ vpadal.u8 q2, q6 ; R 16 bytes -> 8 shorts.
+
+ vrshr.u16 q0, q0, #1 ; 2x average
+ vrshr.u16 q1, q1, #1
+ vrshr.u16 q2, q2, #1
+
+ subs r4, r4, #16 ; 32 processed per loop.
+ RGBTOUV q0, q1, q2
+ MEMACCESS 2
+ vst1.8 {d0}, [r2]! ; store 8 pixels U.
+ MEMACCESS 3
+ vst1.8 {d1}, [r3]! ; store 8 pixels V.
+ bgt %b1
+
+ vpop {q15}
+ vpop {q8 - q14}
+ vpop {q0 - q7}
+ pop {r4}
+ bx lr
+ ENDP
+
+UYVYToUVRow_NEON PROC
+ ; input
+ ; r0 = const uint8* src_uyvy
+ ; r1 = int stride_uyvy
+ ; r2 = uint8* dst_u
+ ; r3 = uint8* dst_v
+ push {r4}
+ ldr r4, [sp,#4] ; int pix
+ vpush {q0 - q7}
+ vpush {q8 - q14}
+ vpush {q15}
+
+ add r1, r0, r1 ; stride + src_uyvy
+1
+ MEMACCESS 0
+ vld4.8 {d0, d1, d2, d3}, [r0]! ; load 16 pixels of UYVY.
+ subs r4, r4, #16 ; 16 pixels = 8 UVs.
+ MEMACCESS 1
+ vld4.8 {d4, d5, d6, d7}, [r1]! ; load next row UYVY.
+ vrhadd.u8 d0, d0, d4 ; average rows of U
+ vrhadd.u8 d2, d2, d6 ; average rows of V
+ MEMACCESS 2
+ vst1.8 {d0}, [r2]! ; store 8 U.
+ MEMACCESS 3
+ vst1.8 {d2}, [r3]! ; store 8 V.
+ bgt %b1
+
+ vpop {q15}
+ vpop {q8 - q14}
+ vpop {q0 - q7}
+ pop {r4}
+ bx lr
+ ENDP
+
+UYVYToYRow_NEON PROC
+ ; input
+ ; r0 = const uint8* src_uyvy
+ ; r1 = uint8* dst_y
+ ; r2 = int pix
+ vpush {q0, q1}
+
+1
+ MEMACCESS 0
+ vld2.8 {q0, q1}, [r0]! ; load 16 pixels of UYVY.
+ subs r2, r2, #16 ; 16 processed per loop.
+ MEMACCESS 1
+ vst1.8 {q1}, [r1]! ; store 16 pixels of Y.
+ bgt %b1
+
+ vpop {q0, q1}
+ bx lr
+ ENDP
+
+YUY2ToUVRow_NEON PROC
+ ; input
+ ; r0 = const uint8* src_yuy2
+ ; r1 = int stride_yuy2
+ ; r2 = uint8* dst_u
+ ; r3 = uint8* dst_v
+ push {r4}
+ ldr r4, [sp,#4] ; int pix
+ vpush {d0 - d7}
+
+ add r1, r0, r1 ; stride + src_yuy2
+
+1
+ MEMACCESS 0
+ vld4.8 {d0, d1, d2, d3}, [r0]! ; load 16 pixels of YUY2.
+ subs r4, r4, #16 ; 16 pixels = 8 UVs.
+ MEMACCESS 1
+ vld4.8 {d4, d5, d6, d7}, [r1]! ; load next row YUY2.
+ vrhadd.u8 d1, d1, d5 ; average rows of U
+ vrhadd.u8 d3, d3, d7 ; average rows of V
+ MEMACCESS 2
+ vst1.8 {d1}, [r2]! ; store 8 U.
+ MEMACCESS 3
+ vst1.8 {d3}, [r3]! ; store 8 V.
+ bgt %b1
+
+ vpop {d0 - d7}
+ pop {r4}
+ bx lr
+ ENDP
+
+YUY2ToYRow_NEON PROC
+ ; input
+ ; r0 = const uint8* src_yuy2
+ ; r1 = uint8* dst_y
+ ; r2 = int pix
+ vpush {q0, q1}
+
+1
+ MEMACCESS 0
+ vld2.8 {q0, q1}, [r0]! ; load 16 pixels of YUY2.
+ subs r2, r2, #16 ; 16 processed per loop.
+ MEMACCESS 1
+ vst1.8 {q0}, [r1]! ; store 16 pixels of Y.
+ bgt %b1
+
+ vpop {q0, q1}
+ bx lr
+ ENDP
+
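+; dither4 packs four per-pixel dither bytes; vdup + vqadd applies byte (j mod 4)
+; of the pattern to pixel j of each channel before the ARGBTORGB565 pack.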
+ARGBToRGB565DitherRow_NEON PROC
+ ; input
+ ; r0 = const uint8* src_argb
+ ; r1 = uint8* dst_rgb
+ ; r2 = const uint32 dither4
+ ; r3 = int width
+ vpush {q0, q1}
+ vpush {q8-q11}
+
+ vdup.32 d2, r2 ; dither4
+1
+ MEMACCESS 1
+ vld4.8 {d20, d21, d22, d23}, [r0]! ; load 8 pixels of ARGB.
+ subs r3, r3, #8 ; 8 processed per loop.
+ vqadd.u8 d20, d20, d2
+ vqadd.u8 d21, d21, d2
+ vqadd.u8 d22, d22, d2
+ ARGBTORGB565
+ MEMACCESS 0
+ vst1.8 {q0}, [r1]! ; store 8 pixels RGB565.
+ bgt %b1
+
+ vpop {q8-q11}
+ vpop {q0, q1}
+ bx lr
+ ENDP
+
+; Add 2 rows of ARGB pixels together, 8 pixels at a time.
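+; Per byte: dst = min(255, src_argb0 + src_argb1), since vqadd.u8 saturates.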
+ARGBAddRow_NEON PROC
+ ; input
+ ; r0 = const uint8* src_argb0
+ ; r1 = uint8* src_argb1
+ ; r2 = uint8* dst_argb
+ ; r3 = int width
+ vpush {q0 - q3}
+ ; 8 pixel loop.
+1
+ MEMACCESS 0
+ vld4.8 {d0, d1, d2, d3}, [r0]! ; load 8 ARGB pixels.
+ MEMACCESS 1
+ vld4.8 {d4, d5, d6, d7}, [r1]! ; load 8 more ARGB pixels.
+ subs r3, r3, #8 ; 8 processed per loop.
+ vqadd.u8 q0, q0, q2 ; add B, G
+ vqadd.u8 q1, q1, q3 ; add R, A
+ MEMACCESS 2
+ vst4.8 {d0, d1, d2, d3}, [r2]! ; store 8 ARGB pixels.
+ bgt %b1
+
+ vpop {q0 - q3}
+ bx lr
+ ENDP
+
+; Subtract 2 rows of ARGB pixels, 8 pixels at a time.
+ARGBSubtractRow_NEON PROC
+ ; input
+ ; r0 = const uint8* src_argb0
+ ; r1 = uint8* src_argb1
+ ; r2 = uint8* dst_argb
+ ; r3 = int width
+ vpush {q0 - q3}
+ ; 8 pixel loop.
+
+1
+ MEMACCESS 0
+ vld4.8 {d0, d1, d2, d3}, [r0]! ; load 8 ARGB pixels.
+ MEMACCESS 1
+ vld4.8 {d4, d5, d6, d7}, [r1]! ; load 8 more ARGB pixels.
+ subs r3, r3, #8 ; 8 processed per loop.
+ vqsub.u8 q0, q0, q2 ; subtract B, G
+ vqsub.u8 q1, q1, q3 ; subtract R, A
+ MEMACCESS 2
+ vst4.8 {d0, d1, d2, d3}, [r2]! ; store 8 ARGB pixels.
+ bgt %b1
+
+ vpop {q0 - q3}
+ bx lr
+ ENDP
+
+; Adds Sobel X and Sobel Y and stores Sobel into ARGB.
+; A = 255
+; R = Sobel
+; G = Sobel
+; B = Sobel
+SobelRow_NEON PROC
+ ; input
+ ; r0 = const uint8* src_sobelx
+ ; r1 = const uint8* src_sobely
+ ; r2 = uint8* dst_argb
+ ; r3 = int width
+ vpush {q0 - q1}
+ vmov.u8 d3, #255 ; alpha
+ ; 8 pixel loop.
+
+1
+ MEMACCESS 0
+ vld1.8 {d0}, [r0]! ; load 8 sobelx.
+ MEMACCESS 1
+ vld1.8 {d1}, [r1]! ; load 8 sobely.
+ subs r3, r3, #8 ; 8 processed per loop.
+ vqadd.u8 d0, d0, d1 ; add
+ vmov.u8 d1, d0
+ vmov.u8 d2, d0
+ MEMACCESS 2
+ vst4.8 {d0, d1, d2, d3}, [r2]! ; store 8 ARGB pixels.
+ bgt %b1
+
+ vpop {q0 - q1}
+ bx lr
+ ENDP
+
+; Adds Sobel X and Sobel Y and stores Sobel into plane.
+SobelToPlaneRow_NEON PROC
+ ; input
+ ; r0 = const uint8* src_sobelx
+ ; r1 = const uint8* src_sobely
+ ; r2 = uint8* dst_y
+ ; r3 = int width
+ vpush {q0 - q1}
+ ; 16 pixel loop.
+
+1
+ MEMACCESS 0
+ vld1.8 {q0}, [r0]! ; load 16 sobelx.
+ MEMACCESS 1
+ vld1.8 {q1}, [r1]! ; load 16 sobely.
+ subs r3, r3, #16 ; 16 processed per loop.
+ vqadd.u8 q0, q0, q1 ; add
+ MEMACCESS 2
+ vst1.8 {q0}, [r2]! ; store 16 pixels.
+ bgt %b1
+
+ vpop {q0 - q1}
+ bx lr
+ ENDP
+
+; Attenuate 8 pixels at a time.
+ARGBAttenuateRow_NEON PROC
+ ; input
+ ; r0 = const uint8* src_argb
+ ; r1 = uint8* dst_argb
+ ; r2 = int width
+ vpush {q0 - q1}
+ vpush {q10 - q12}
+
+ ; Attenuate 8 pixels.
+1
+ MEMACCESS 0
+ vld4.8 {d0, d1, d2, d3}, [r0]! ; load 8 pixels of ARGB.
+ subs r2, r2, #8 ; 8 processed per loop.
+ vmull.u8 q10, d0, d3 ; b * a
+ vmull.u8 q11, d1, d3 ; g * a
+ vmull.u8 q12, d2, d3 ; r * a
+ vqrshrn.u16 d0, q10, #8 ; b >>= 8
+ vqrshrn.u16 d1, q11, #8 ; g >>= 8
+ vqrshrn.u16 d2, q12, #8 ; r >>= 8
+ MEMACCESS 1
+ vst4.8 {d0, d1, d2, d3}, [r1]! ; store 8 pixels of ARGB.
+ bgt %b1
+
+ vpop {q10 - q12}
+ vpop {q0 - q1}
+ bx lr
+ ENDP
+
+; Quantize 8 ARGB pixels (32 bytes).
+; dst = (dst * scale >> 16) * interval_size + interval_offset;
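+; scale is pre-shifted right by one below because vqdmulh doubles its product
+; before taking the high 16 bits, so (x * (scale >> 1) * 2) >> 16 matches the
+; (x * scale) >> 16 in the formula above.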
+ARGBQuantizeRow_NEON PROC
+ ; input
+ ; r0 = uint8* dst_argb
+ ; r1 = int scale
+ ; r2 = int interval_size
+ ; r3 = int interval_offset
+ push {r2 - r4}
+ ldr r4, [sp,#12] ; int width
+ vpush {q0 - q3}
+ vpush {q8 - q10}
+
+ vdup.u16 q8, r1
+ vshr.u16 q8, q8, #1 ; scale >>= 1
+ vdup.u16 q9, r2 ; interval multiply.
+ vdup.u16 q10, r3 ; interval add
+
+ ; 8 pixel loop.
+1
+ MEMACCESS 0
+ vld4.8 {d0, d2, d4, d6}, [r0] ; load 8 pixels of ARGB.
+ subs r4, r4, #8 ; 8 processed per loop.
+ vmovl.u8 q0, d0 ; b (0 .. 255)
+ vmovl.u8 q1, d2
+ vmovl.u8 q2, d4
+ vqdmulh.s16 q0, q0, q8 ; b * scale
+ vqdmulh.s16 q1, q1, q8 ; g
+ vqdmulh.s16 q2, q2, q8 ; r
+ vmul.u16 q0, q0, q9 ; b * interval_size
+ vmul.u16 q1, q1, q9 ; g
+ vmul.u16 q2, q2, q9 ; r
+ vadd.u16 q0, q0, q10 ; b + interval_offset
+ vadd.u16 q1, q1, q10 ; g
+ vadd.u16 q2, q2, q10 ; r
+ vqmovn.u16 d0, q0
+ vqmovn.u16 d2, q1
+ vqmovn.u16 d4, q2
+ MEMACCESS 0
+ vst4.8 {d0, d2, d4, d6}, [r0]! ; store 8 pixels of ARGB.
+ bgt %b1
+
+ vpop {q8 - q10}
+ vpop {q0 - q3}
+ pop {r2 - r4}
+ bx lr
+ ENDP
+
+; Shade 8 pixels at a time by specified value.
+; NOTE vqrdmulh.s16 q10, q10, d0[0] must use a scalar register from 0 to 8.
+; Rounding in vqrdmulh does +1 to high if high bit of low s16 is set.
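+; The packed 'value' is expanded so each 16-bit lane holds one channel's scale
+; in both bytes (v * 0x0101) and halved, so vqrdmulh yields roughly
+; (x * v) / 255 per channel.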
+ARGBShadeRow_NEON PROC
+ ; input
+ ; r0 = const uint8* src_argb
+ ; r1 = uint8* dst_argb
+ ; r2 = int width
+ ; r3 = int value
+ vpush {q0}
+ vpush {q10 - q13}
+
+ vdup.u32 q0, r3 ; duplicate scale value.
+ vzip.u8 d0, d1 ; d0 aarrggbb.
+ vshr.u16 q0, q0, #1 ; scale / 2.
+
+ ; 8 pixel loop.
+1
+ MEMACCESS 0
+ vld4.8 {d20, d22, d24, d26}, [r0]! ; load 8 pixels of ARGB.
+ subs r2, r2, #8 ; 8 processed per loop.
+ vmovl.u8 q10, d20 ; b (0 .. 255)
+ vmovl.u8 q11, d22
+ vmovl.u8 q12, d24
+ vmovl.u8 q13, d26
+ vqrdmulh.s16 q10, q10, d0[0] ; b * scale * 2
+ vqrdmulh.s16 q11, q11, d0[1] ; g
+ vqrdmulh.s16 q12, q12, d0[2] ; r
+ vqrdmulh.s16 q13, q13, d0[3] ; a
+ vqmovn.u16 d20, q10
+ vqmovn.u16 d22, q11
+ vqmovn.u16 d24, q12
+ vqmovn.u16 d26, q13
+ MEMACCESS 1
+ vst4.8 {d20, d22, d24, d26}, [r1]! ; store 8 pixels of ARGB.
+ bgt %b1
+
+ vpop {q10 - q13}
+ vpop {q0}
+ bx lr
+ ENDP
+
+; Convert 8 ARGB pixels (64 bytes) to 8 Gray ARGB pixels
+; Similar to ARGBToYJ but stores ARGB.
+; C code is (15 * b + 75 * g + 38 * r + 64) >> 7;
+ARGBGrayRow_NEON PROC
+ ; input
+ ; r0 = const uint8* src_argb
+ ; r1 = uint8* dst_argb
+ ; r2 = int width
+ vpush {q0- q2}
+ vpush {q12 - q13}
+
+ vmov.u8 d24, #15 ; B * 0.11400 coefficient
+ vmov.u8 d25, #75 ; G * 0.58700 coefficient
+ vmov.u8 d26, #38 ; R * 0.29900 coefficient
+
+1
+ MEMACCESS 0
+ vld4.8 {d0, d1, d2, d3}, [r0]! ; load 8 ARGB pixels.
+ subs r2, r2, #8 ; 8 processed per loop.
+ vmull.u8 q2, d0, d24 ; B
+ vmlal.u8 q2, d1, d25 ; G
+ vmlal.u8 q2, d2, d26 ; R
+ vqrshrun.s16 d0, q2, #7 ; 15 bit to 8 bit B
+ vmov d1, d0 ; G
+ vmov d2, d0 ; R
+ MEMACCESS 1
+ vst4.8 {d0, d1, d2, d3}, [r1]! ; store 8 ARGB pixels.
+ bgt %b1
+
+ vpop {q12 - q13}
+ vpop {q0 - q2}
+ bx lr
+ ENDP
+
+; Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
+; b = (r * 35 + g * 68 + b * 17) >> 7
+; g = (r * 45 + g * 88 + b * 22) >> 7
+; r = (r * 50 + g * 98 + b * 24) >> 7
+ARGBSepiaRow_NEON PROC
+ ; input
+ ; r0 = uint8* dst_argb
+ ; r1 = int width
+ vpush {q0- q3}
+ vpush {q10 - q15}
+
+ vmov.u8 d20, #17 ; BB coefficient
+ vmov.u8 d21, #68 ; BG coefficient
+ vmov.u8 d22, #35 ; BR coefficient
+ vmov.u8 d24, #22 ; GB coefficient
+ vmov.u8 d25, #88 ; GG coefficient
+ vmov.u8 d26, #45 ; GR coefficient
+ vmov.u8 d28, #24 ; RB coefficient
+ vmov.u8 d29, #98 ; RG coefficient
+ vmov.u8 d30, #50 ; RR coefficient
+
+1
+ MEMACCESS 0
+ vld4.8 {d0, d1, d2, d3}, [r0] ; load 8 ARGB pixels.
+ subs r1, r1, #8 ; 8 processed per loop.
+ vmull.u8 q2, d0, d20 ; B to Sepia B
+ vmlal.u8 q2, d1, d21 ; G
+ vmlal.u8 q2, d2, d22 ; R
+ vmull.u8 q3, d0, d24 ; B to Sepia G
+ vmlal.u8 q3, d1, d25 ; G
+ vmlal.u8 q3, d2, d26 ; R
+ vmull.u8 q8, d0, d28 ; B to Sepia R
+ vmlal.u8 q8, d1, d29 ; G
+ vmlal.u8 q8, d2, d30 ; R
+ vqshrn.u16 d0, q2, #7 ; 16 bit to 8 bit B
+ vqshrn.u16 d1, q3, #7 ; 16 bit to 8 bit G
+ vqshrn.u16 d2, q8, #7 ; 16 bit to 8 bit R
+ MEMACCESS 0
+ vst4.8 {d0, d1, d2, d3}, [r0]! ; store 8 ARGB pixels.
+ bgt %b1
+
+ vpop {q10 - q15}
+ vpop {q0 - q3}
+ bx lr
+ ENDP
+
+; Transform 8 ARGB pixels (32 bytes) with color matrix.
+; TODO(fbarchard): Was same as Sepia except matrix is provided. This function
+; needs to saturate. Consider doing a non-saturating version.
+ARGBColorMatrixRow_NEON PROC
+ ; input
+ ; r0 = const uint8* src_argb
+ ; r1 = uint8* dst_argb
+ ; r2 = const int8* matrix_argb
+ ; r3 = int width
+ vpush {q0 - q7}
+ vpush {q8 - q14}
+ vpush {q15}
+
+ MEMACCESS 3
+ vld1.8 {q2}, [r2] ; load 3 ARGB vectors.
+ vmovl.s8 q0, d4 ; B,G coefficients s16.
+ vmovl.s8 q1, d5 ; R,A coefficients s16.
+
+1
+ MEMACCESS 0
+ vld4.8 {d16, d18, d20, d22}, [r0]! ; load 8 ARGB pixels.
+ subs r3, r3, #8 ; 8 processed per loop.
+ vmovl.u8 q8, d16 ; b (0 .. 255) 16 bit
+ vmovl.u8 q9, d18 ; g
+ vmovl.u8 q10, d20 ; r
+ vmovl.u8 q11, d22 ; a
+ vmul.s16 q12, q8, d0[0] ; B = B * Matrix B
+ vmul.s16 q13, q8, d1[0] ; G = B * Matrix G
+ vmul.s16 q14, q8, d2[0] ; R = B * Matrix R
+ vmul.s16 q15, q8, d3[0] ; A = B * Matrix A
+ vmul.s16 q4, q9, d0[1] ; B += G * Matrix B
+ vmul.s16 q5, q9, d1[1] ; G += G * Matrix G
+ vmul.s16 q6, q9, d2[1] ; R += G * Matrix R
+ vmul.s16 q7, q9, d3[1] ; A += G * Matrix A
+ vqadd.s16 q12, q12, q4 ; Accumulate B
+ vqadd.s16 q13, q13, q5 ; Accumulate G
+ vqadd.s16 q14, q14, q6 ; Accumulate R
+ vqadd.s16 q15, q15, q7 ; Accumulate A
+ vmul.s16 q4, q10, d0[2] ; B += R * Matrix B
+ vmul.s16 q5, q10, d1[2] ; G += R * Matrix G
+ vmul.s16 q6, q10, d2[2] ; R += R * Matrix R
+ vmul.s16 q7, q10, d3[2] ; A += R * Matrix A
+ vqadd.s16 q12, q12, q4 ; Accumulate B
+ vqadd.s16 q13, q13, q5 ; Accumulate G
+ vqadd.s16 q14, q14, q6 ; Accumulate R
+ vqadd.s16 q15, q15, q7 ; Accumulate A
+ vmul.s16 q4, q11, d0[3] ; B += A * Matrix B
+ vmul.s16 q5, q11, d1[3] ; G += A * Matrix G
+ vmul.s16 q6, q11, d2[3] ; R += A * Matrix R
+ vmul.s16 q7, q11, d3[3] ; A += A * Matrix A
+ vqadd.s16 q12, q12, q4 ; Accumulate B
+ vqadd.s16 q13, q13, q5 ; Accumulate G
+ vqadd.s16 q14, q14, q6 ; Accumulate R
+ vqadd.s16 q15, q15, q7 ; Accumulate A
+ vqshrun.s16 d16, q12, #6 ; 16 bit to 8 bit B
+ vqshrun.s16 d18, q13, #6 ; 16 bit to 8 bit G
+ vqshrun.s16 d20, q14, #6 ; 16 bit to 8 bit R
+ vqshrun.s16 d22, q15, #6 ; 16 bit to 8 bit A
+ MEMACCESS 1
+ vst4.8 {d16, d18, d20, d22}, [r1]! ; store 8 ARGB pixels.
+ bgt %b1
+
+ vpop {q15}
+ vpop {q8 - q14}
+ vpop {q0 - q7}
+ bx lr
+ ENDP
+
+ ; dr * (256 - sa) / 256 + sr = dr - dr * sa / 256 + sr
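+ ; Per channel this works out to roughly dst = sat(s + sat(d - ((d * sa + 128) >> 8))),
+ ; with s from src_argb0, d from src_argb1, and the output alpha forced to 255.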
+ARGBBlendRow_NEON PROC
+ ; input
+ ; r0 = const uint8* src_argb0
+ ; r1 = const uint8* src_argb1
+ ; r2 = uint8* dst_argb
+ ; r3 = int width
+ vpush {q0 - q3}
+ vpush {q10 - q12}
+
+ subs r3, #8
+ blt %f89
+ ; Blend 8 pixels.
+8
+ MEMACCESS 0
+ vld4.8 {d0, d1, d2, d3}, [r0]! ; load 8 pixels of ARGB0.
+ MEMACCESS 1
+ vld4.8 {d4, d5, d6, d7}, [r1]! ; load 8 pixels of ARGB1.
+ subs r3, r3, #8 ; 8 processed per loop.
+ vmull.u8 q10, d4, d3 ; db * a
+ vmull.u8 q11, d5, d3 ; dg * a
+ vmull.u8 q12, d6, d3 ; dr * a
+ vqrshrn.u16 d20, q10, #8 ; db >>= 8
+ vqrshrn.u16 d21, q11, #8 ; dg >>= 8
+ vqrshrn.u16 d22, q12, #8 ; dr >>= 8
+ vqsub.u8 q2, q2, q10 ; dbg - dbg * a / 256
+ vqsub.u8 d6, d6, d22 ; dr - dr * a / 256
+ vqadd.u8 q0, q0, q2 ; + sbg
+ vqadd.u8 d2, d2, d6 ; + sr
+ vmov.u8 d3, #255 ; a = 255
+ MEMACCESS 2
+ vst4.8 {d0, d1, d2, d3}, [r2]! ; store 8 pixels of ARGB.
+ bge %b8
+
+89
+ adds r3, #8-1
+ blt %f99
+
+ ; Blend 1 pixel at a time.
+1
+ MEMACCESS 0
+ vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r0]! ; load 1 pixel ARGB0.
+ MEMACCESS 1
+ vld4.8 {d4[0],d5[0],d6[0],d7[0]}, [r1]! ; load 1 pixel ARGB1.
+ subs r3, r3, #1 ; 1 processed per loop.
+ vmull.u8 q10, d4, d3 ; db * a
+ vmull.u8 q11, d5, d3 ; dg * a
+ vmull.u8 q12, d6, d3 ; dr * a
+ vqrshrn.u16 d20, q10, #8 ; db >>= 8
+ vqrshrn.u16 d21, q11, #8 ; dg >>= 8
+ vqrshrn.u16 d22, q12, #8 ; dr >>= 8
+ vqsub.u8 q2, q2, q10 ; dbg - dbg * a / 256
+ vqsub.u8 d6, d6, d22 ; dr - dr * a / 256
+ vqadd.u8 q0, q0, q2 ; + sbg
+ vqadd.u8 d2, d2, d6 ; + sr
+ vmov.u8 d3, #255 ; a = 255
+ MEMACCESS 2
+ vst4.8 {d0[0],d1[0],d2[0],d3[0]}, [r2]! ; store 1 pixel.
+ bge %b1
+
+99
+
+ vpop {q10 - q12}
+ vpop {q0 - q3}
+ bx lr
+ ENDP
+
+; Bilinear filter 16x2 -> 16x1
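+; dst = (row0 * (256 - f) + row1 * f + 128) >> 8, where row1 is src_ptr + src_stride
+; and f is source_y_fraction; f == 0, 64, 128 and 192 take vrhadd shortcuts instead.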
+InterpolateRow_NEON PROC
+ ; input
+ ; r0 = uint8* dst_ptr
+ ; r1 = const uint8* src_ptr
+ ; r2 = ptrdiff_t src_stride
+ ; r3 = int dst_width
+ push {r4}
+ ldr r4, [sp,#4] ; int source_y_fraction
+ vpush {q0 - q1}
+ vpush {d4 - d5}
+ vpush {q13 - q14}
+
+ cmp r4, #0
+ beq %f100
+ add r2, r1
+ cmp r4, #64
+ beq %f75
+ cmp r4, #128
+ beq %f50
+ cmp r4, #192
+ beq %f25
+
+ vdup.8 d5, r4
+ rsb r4, #256
+ vdup.8 d4, r4
+ ; General purpose row blend.
+1
+ MEMACCESS 1
+ vld1.8 {q0}, [r1]!
+ MEMACCESS 2
+ vld1.8 {q1}, [r2]!
+ subs r3, r3, #16
+ vmull.u8 q13, d0, d4
+ vmull.u8 q14, d1, d4
+ vmlal.u8 q13, d2, d5
+ vmlal.u8 q14, d3, d5
+ vrshrn.u16 d0, q13, #8
+ vrshrn.u16 d1, q14, #8
+ MEMACCESS 0
+ vst1.8 {q0}, [r0]!
+ bgt %b1
+ b %f99
+
+ ; Blend 25 / 75.
+25
+ MEMACCESS 1
+ vld1.8 {q0}, [r1]!
+ MEMACCESS 2
+ vld1.8 {q1}, [r2]!
+ subs r3, r3, #16
+ vrhadd.u8 q0, q1
+ vrhadd.u8 q0, q1
+ MEMACCESS 0
+ vst1.8 {q0}, [r0]!
+ bgt %b25
+ b %f99
+
+ ; Blend 50 / 50.
+50
+ MEMACCESS 1
+ vld1.8 {q0}, [r1]!
+ MEMACCESS 2
+ vld1.8 {q1}, [r2]!
+ subs r3, r3, #16
+ vrhadd.u8 q0, q1
+ MEMACCESS 0
+ vst1.8 {q0}, [r0]!
+ bgt %b50
+ b %f99
+
+ ; Blend 75 / 25.
+75
+ MEMACCESS 1
+ vld1.8 {q1}, [r1]!
+ MEMACCESS 2
+ vld1.8 {q0}, [r2]!
+ subs r3, r3, #16
+ vrhadd.u8 q0, q1
+ vrhadd.u8 q0, q1
+ MEMACCESS 0
+ vst1.8 {q0}, [r0]!
+ bgt %b75
+ b %f99
+
+ ; Blend 100 / 0 - Copy row unchanged.
+100
+ MEMACCESS 1
+ vld1.8 {q0}, [r1]!
+ subs r3, r3, #16
+ MEMACCESS 0
+ vst1.8 {q0}, [r0]!
+ bgt %b100
+
+99
+
+ vpop {q13 - q14}
+ vpop {d4 - d5}
+ vpop {q0 - q1}
+ pop {r4}
+ bx lr
+ ENDP
+
+; Multiply 2 rows of ARGB pixels together, 8 pixels at a time.
+ARGBMultiplyRow_NEON PROC
+ ; input
+ ; r0 = const uint8* src_argb0
+ ; r1 = const uint8* src_argb1
+ ; r2 = uint8* dst_argb
+ ; r3 = int width
+ vpush {q0 - q3}
+
+ ; 8 pixel loop.
+1
+ MEMACCESS 0
+ vld4.8 {d0, d2, d4, d6}, [r0]! ; load 8 ARGB pixels.
+ MEMACCESS 1
+ vld4.8 {d1, d3, d5, d7}, [r1]! ; load 8 more ARGB pixels.
+ subs r3, r3, #8 ; 8 processed per loop.
+ vmull.u8 q0, d0, d1 ; multiply B
+ vmull.u8 q1, d2, d3 ; multiply G
+ vmull.u8 q2, d4, d5 ; multiply R
+ vmull.u8 q3, d6, d7 ; multiply A
+ vrshrn.u16 d0, q0, #8 ; 16 bit to 8 bit B
+ vrshrn.u16 d1, q1, #8 ; 16 bit to 8 bit G
+ vrshrn.u16 d2, q2, #8 ; 16 bit to 8 bit R
+ vrshrn.u16 d3, q3, #8 ; 16 bit to 8 bit A
+ MEMACCESS 2
+ vst4.8 {d0, d1, d2, d3}, [r2]! ; store 8 ARGB pixels.
+ bgt %b1
+
+ vpop {q0 - q3}
+ bx lr
+ ENDP
+
+; SobelX as a matrix is
+; -1 0 1
+; -2 0 2
+; -1 0 1
+SobelXRow_NEON PROC
+ ; input
+ ; r0 = const uint8* src_y0
+ ; r1 = const uint8* src_y1
+ ; r2 = const uint8* src_y2
+ ; r3 = uint8* dst_sobelx
+ push {r4 - r6}
+ ldr r4, [sp,#12] ; int width
+ vpush {q0 - q1}
+ mov r5, 2
+ mov r6, 6
+
+1
+ MEMACCESS 0
+ vld1.8 {d0}, [r0],r5 ; top
+ MEMACCESS 0
+ vld1.8 {d1}, [r0],r6
+ vsubl.u8 q0, d0, d1
+ MEMACCESS 1
+ vld1.8 {d2}, [r1],r5 ; center * 2
+ MEMACCESS 1
+ vld1.8 {d3}, [r1],r6
+ vsubl.u8 q1, d2, d3
+ vadd.s16 q0, q0, q1
+ vadd.s16 q0, q0, q1
+ MEMACCESS 2
+ vld1.8 {d2}, [r2],r5 ; bottom
+ MEMACCESS 2
+ vld1.8 {d3}, [r2],r6
+ subs r4, r4, #8 ; 8 pixels
+ vsubl.u8 q1, d2, d3
+ vadd.s16 q0, q0, q1
+ vabs.s16 q0, q0
+ vqmovn.u16 d0, q0
+ MEMACCESS 3
+ vst1.8 {d0}, [r3]! ; store 8 sobelx
+ bgt %b1
+
+ vpop {q0 - q1}
+ pop {r4 - r6}
+ bx lr
+ ENDP
+
+; SobelY as a matrix is
+; -1 -2 -1
+; 0 0 0
+; 1 2 1
+SobelYRow_NEON PROC
+ ; input
+ ; r0 = const uint8* src_y0
+ ; r1 = const uint8* src_y1
+ ; r2 = uint8* dst_sobely
+ ; r3 = int width
+ vpush {q0 - q1}
+ push {r4 - r5}
+ mov r4, 1
+ mov r5, 6
+
+1
+ MEMACCESS 0
+ vld1.8 {d0}, [r0],r4 ; left
+ MEMACCESS 1
+ vld1.8 {d1}, [r1],r4
+ vsubl.u8 q0, d0, d1
+ MEMACCESS 0
+ vld1.8 {d2}, [r0],r4 ; center * 2
+ MEMACCESS 1
+ vld1.8 {d3}, [r1],r4
+ vsubl.u8 q1, d2, d3
+ vadd.s16 q0, q0, q1
+ vadd.s16 q0, q0, q1
+ MEMACCESS 0
+ vld1.8 {d2}, [r0],r5 ; right
+ MEMACCESS 1
+ vld1.8 {d3}, [r1],r5
+ subs r3, r3, #8 ; 8 pixels
+ vsubl.u8 q1, d2, d3
+ vadd.s16 q0, q0, q1
+ vabs.s16 q0, q0
+ vqmovn.u16 d0, q0
+ MEMACCESS 2
+ vst1.8 {d0}, [r2]! ; store 8 sobely
+ bgt %b1
+
+ pop {r4 - r5}
+ vpop {q0 - q1}
+ bx lr
+ ENDP
+
+; Mixes Sobel X, Sobel Y and Sobel into ARGB.
+; A = 255
+; R = Sobel X
+; G = Sobel
+; B = Sobel Y
+SobelXYRow_NEON PROC
+ ; input
+ ; r0 = const uint8* src_sobelx
+ ; r1 = const uint8* src_sobely
+ ; r2 = uint8* dst_argb
+ ; r3 = int width
+ vpush {q0 - q1}
+
+ vmov.u8 d3, #255 ; alpha
+ ; 8 pixel loop.
+
+1
+ MEMACCESS 0
+ vld1.8 {d2}, [r0]! ; load 8 sobelx.
+ MEMACCESS 1
+ vld1.8 {d0}, [r1]! ; load 8 sobely.
+ subs r3, r3, #8 ; 8 processed per loop.
+ vqadd.u8 d1, d0, d2 ; add
+ MEMACCESS 2
+ vst4.8 {d0, d1, d2, d3}, [r2]! ; store 8 ARGB pixels.
+ bgt %b1
+
+ vpop {q0 - q1}
+ bx lr
+ ENDP
+
+
+ END
+
+
diff --git a/source/row_neon.cc b/source/row_neon.cc
index bed14e0..3450302 100644
--- a/source/row_neon.cc
+++ b/source/row_neon.cc
@@ -17,6 +17,15 @@ namespace libyuv {
extern "C" {
#endif
+ /* !!! IMPORTANT: The following methods have been ported to pure assembler in row_neon.asm
+ * because MS Visual Studio doesn't support inline assembler for ARM.
+ *
+ * ANY CHANGE TO THE IMPLEMENTATION OF THESE METHODS MUST ALSO BE MADE IN row_neon.asm.
+ *
+ * Eventually, only the pure assembler implementation should be used on all platforms
+ * to avoid code duplication.
+ */
+
// This module is for GCC Neon
#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \
!defined(__aarch64__)
diff --git a/source/scale_neon.asm b/source/scale_neon.asm
new file mode 100644
index 0000000..8ede0ec
--- /dev/null
+++ b/source/scale_neon.asm
@@ -0,0 +1,970 @@
+;
+; Copyright 2012 The LibYuv Project Authors. All rights reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+ AREA |.text|, CODE, READONLY, ALIGN=2
+
+ GET source/arm_asm_macros.in
+
+ EXPORT ScaleRowDown2_NEON
+ EXPORT ScaleRowDown2Linear_NEON
+ EXPORT ScaleRowDown2Box_NEON
+ EXPORT ScaleRowDown4_NEON
+ EXPORT ScaleRowDown4Box_NEON
+ EXPORT ScaleRowDown34_NEON
+ EXPORT ScaleRowDown34_0_Box_NEON
+ EXPORT ScaleRowDown34_1_Box_NEON
+ EXPORT ScaleRowDown38_NEON
+ EXPORT ScaleRowDown38_3_Box_NEON
+ EXPORT ScaleRowDown38_2_Box_NEON
+ EXPORT ScaleAddRows_NEON
+ EXPORT ScaleFilterCols_NEON
+ EXPORT ScaleARGBRowDown2_NEON
+ EXPORT ScaleARGBRowDown2Linear_NEON
+ EXPORT ScaleARGBRowDown2Box_NEON
+ EXPORT ScaleARGBRowDownEven_NEON
+ EXPORT ScaleARGBRowDownEvenBox_NEON
+ EXPORT ScaleARGBCols_NEON
+ EXPORT ScaleARGBFilterCols_NEON
+
+kShuf38 DCB 0, 3, 6, 8, 11, 14, 16, 19, 22, 24, 27, 30, 0, 0, 0, 0
+kShuf38_2 DCB 0, 8, 16, 2, 10, 17, 4, 12, 18, 6, 14, 19, 0, 0, 0, 0
+;vec16 kMult38_Div6 = { 65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12 }
+kMult38_Div6 DCW 0x1555, 0x1555, 0x1555, 0x1555, 0x1555, 0x1555, 0x1555, 0x1555
+;vec16 kMult38_Div9 = { 65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18 };
+kMult38_Div9 DCW 0xe38, 0xe38, 0xe38, 0xe38, 0xe38, 0xe38, 0xe38, 0xe38
+
+
+; Read 32x1 throw away even pixels, and write 16x1
+ScaleRowDown2_NEON PROC
+ ; input
+ ; r0 = uint8* src_ptr
+ ; r1 = src_stride
+ ; r2 = uint8* dst
+ ; r3 = int dst_width
+ vpush {q0, q1}
+1
+ ; load even pixels into q0, odd into q1
+ MEMACCESS 0
+ vld2.8 {q0, q1}, [r0]!
+ subs r3, r3, #16 ; 16 processed per loop
+ MEMACCESS 1
+ vst1.8 {q1}, [r2]! ; store odd pixels
+ bgt %b1
+
+ vpop {q0, q1}
+
+ bx lr
+ ENDP
+
+; Read 32x1 average down and write 16x1.
+ScaleRowDown2Linear_NEON PROC
+ ; input
+ ; r0 = uint8* src_ptr
+ ; r1 = src_stride
+ ; r2 = uint8* dst
+ ; r3 = int dst_width
+ vpush {q0, q1}
+1
+ MEMACCESS 0
+ vld1.8 {q0, q1}, [r0]! ; load pixels and post inc
+ subs r3, r3, #16 ; 16 processed per loop
+ vpaddl.u8 q0, q0 ; add adjacent
+ vpaddl.u8 q1, q1
+ vrshrn.u16 d0, q0, #1 ; downshift, round and pack
+ vrshrn.u16 d1, q1, #1
+ MEMACCESS 1
+ vst1.8 {q0}, [r2]!
+ bgt %b1
+ vpop {q0, q1}
+
+ bx lr
+ ENDP
+
+; Read 32x2 average down and write 16x1
+ScaleRowDown2Box_NEON PROC
+ ; input
+ ; r0 = uint8* src_ptr
+ ; r1 = src_stride
+ ; r2 = uint8* dst
+ ; r3 = int dst_width
+ vpush {q0, q1, q2, q3}
+ add r1, r0
+1
+ MEMACCESS 0
+ vld1.8 {q0, q1}, [r0]! ; load row 1 and post inc
+ MEMACCESS 1
+ vld1.8 {q2, q3}, [r1]! ; load row 2 and post inc
+ subs r3, r3, #16 ; 16 processed per loop
+ vpaddl.u8 q0, q0 ; row 1 add adjacent
+ vpaddl.u8 q1, q1
+ vpadal.u8 q0, q2 ; row 2 add adjacent + row1
+ vpadal.u8 q1, q3
+ vrshrn.u16 d0, q0, #2 ; downshift, round and pack
+ vrshrn.u16 d1, q1, #2
+ MEMACCESS 2
+ vst1.8 {q0}, [r2]!
+ bgt %b1
+ vpop {q0, q1, q2, q3}
+
+ bx lr
+ ENDP
+
+ScaleRowDown4_NEON PROC
+ ; input
+ ; r0 = uint8* src_ptr
+ ; r1 = src_stride
+ ; r2 = uint8* dst_ptr
+ ; r3 = int dst_width
+ vpush {q0, q1}
+
+1
+ MEMACCESS 0
+ vld4.8 {d0, d1, d2, d3}, [r0]! ; src line 0
+ subs r3, r3, #8 ; 8 processed per loop
+ MEMACCESS 1
+ vst1.8 {d2}, [r2]!
+ bgt %b1
+
+ vpop {q0, q1}
+ bx lr
+ ENDP
+
+ScaleRowDown4Box_NEON PROC
+ ; input
+ ; r0 = uint8* src_ptr
+ ; r1 = src_stride
+ ; r2 = uint8* dst_ptr
+ ; r3 = int dst_width
+ push {r4-r6}
+ vpush {q0-q3}
+ add r4, r0, r1 ; src_ptr + src_stride
+ add r5, r4, r1 ; src_ptr + src_stride * 2
+ add r6, r5, r1 ; src_ptr + src_stride * 3
+
+1
+ MEMACCESS 0
+ vld1.8 {q0}, [r0]! ; load up 16x4
+ MEMACCESS 3
+ vld1.8 {q1}, [r4]!
+ MEMACCESS 4
+ vld1.8 {q2}, [r5]!
+ MEMACCESS 5
+ vld1.8 {q3}, [r6]!
+ subs r3, r3, #4
+ vpaddl.u8 q0, q0
+ vpadal.u8 q0, q1
+ vpadal.u8 q0, q2
+ vpadal.u8 q0, q3
+ vpaddl.u16 q0, q0
+ vrshrn.u32 d0, q0, #4 ; divide by 16 w/rounding
+ vmovn.u16 d0, q0
+ MEMACCESS 1
+ vst1.32 {d0[0]}, [r2]!
+ bgt %b1
+
+ vpop {q0-q3}
+ pop {r4-r6}
+ bx lr
+ ENDP
+
+ScaleRowDown34_NEON PROC
+ ; input
+ ; r0 = uint8* src_ptr
+ ; r1 = src_stride
+ ; r2 = uint8* dst_ptr
+ ; r3 = int dst_width
+ vpush {d0-d3}
+
+1
+ MEMACCESS 0
+ vld4.8 {d0, d1, d2, d3}, [r0]! ; src line 0
+ subs r3, r3, #24
+ vmov d2, d3 ; order d0, d1, d2
+ MEMACCESS 1
+ vst3.8 {d0, d1, d2}, [r2]!
+ bgt %b1
+
+ vpop {d0-d3}
+ bx lr
+ ENDP
+
+ScaleRowDown34_0_Box_NEON PROC
+ ; input
+ ; r0 = uint8* src_ptr
+ ; r1 = src_stride
+ ; r2 = uint8* dst_ptr
+ ; r3 = int dst_width
+ vpush {q0-q3}
+ vpush {q8-q11}
+ vpush {d24}
+
+ vmov.u8 d24, #3
+ add r1, r0
+1
+ MEMACCESS 0
+ vld4.8 {d0, d1, d2, d3}, [r0]! ; src line 0
+ MEMACCESS 3
+ vld4.8 {d4, d5, d6, d7}, [r1]! ; src line 1
+ subs r3, r3, #24
+
+ ; filter src line 0 with src line 1
+ ; expand chars to shorts to allow for room
+ ; when adding lines together
+ vmovl.u8 q8, d4
+ vmovl.u8 q9, d5
+ vmovl.u8 q10, d6
+ vmovl.u8 q11, d7
+
+ ; 3 * line_0 + line_1
+ vmlal.u8 q8, d0, d24
+ vmlal.u8 q9, d1, d24
+ vmlal.u8 q10, d2, d24
+ vmlal.u8 q11, d3, d24
+
+ ; (3 * line_0 + line_1) >> 2
+ vqrshrn.u16 d0, q8, #2
+ vqrshrn.u16 d1, q9, #2
+ vqrshrn.u16 d2, q10, #2
+ vqrshrn.u16 d3, q11, #2
+
+ ; a0 = (src[0] * 3 + s[1] * 1) >> 2
+ vmovl.u8 q8, d1
+ vmlal.u8 q8, d0, d24
+ vqrshrn.u16 d0, q8, #2
+
+ ; a1 = (src[1] * 1 + s[2] * 1) >> 1
+ vrhadd.u8 d1, d1, d2
+
+ ; a2 = (src[2] * 1 + s[3] * 3) >> 2
+ vmovl.u8 q8, d2
+ vmlal.u8 q8, d3, d24
+ vqrshrn.u16 d2, q8, #2
+
+ MEMACCESS 1
+ vst3.8 {d0, d1, d2}, [r2]!
+
+ bgt %b1
+
+
+ vpop {d24}
+ vpop {q8-q11}
+ vpop {q0-q3}
+ bx lr
+ ENDP
+
+ScaleRowDown34_1_Box_NEON PROC
+ ; input
+ ; r0 = uint8* src_ptr
+ ; r1 = src_stride
+ ; r2 = uint8* dst_ptr
+ ; r3 = int dst_width
+ vpush {q0-q3}
+ vpush {d24}
+ vmov.u8 d24, #3
+ add r1, r0
+1
+ MEMACCESS 0
+ vld4.8 {d0, d1, d2, d3}, [r0]! ; src line 0
+ MEMACCESS 3
+ vld4.8 {d4, d5, d6, d7}, [r1]! ; src line 1
+ subs r3, r3, #24
+ ; average src line 0 with src line 1
+ vrhadd.u8 q0, q0, q2
+ vrhadd.u8 q1, q1, q3
+
+ ; a0 = (src[0] * 3 + s[1] * 1) >> 2
+ vmovl.u8 q3, d1
+ vmlal.u8 q3, d0, d24
+ vqrshrn.u16 d0, q3, #2
+
+ ; a1 = (src[1] * 1 + s[2] * 1) >> 1
+ vrhadd.u8 d1, d1, d2
+
+ ; a2 = (src[2] * 1 + s[3] * 3) >> 2
+ vmovl.u8 q3, d2
+ vmlal.u8 q3, d3, d24
+ vqrshrn.u16 d2, q3, #2
+
+ MEMACCESS 1
+ vst3.8 {d0, d1, d2}, [r2]!
+ bgt %b1
+
+ vpop {d24}
+ vpop {q0-q3}
+ bx lr
+ ENDP
+
+ScaleRowDown38_NEON PROC
+ ; input
+ ; r0 = uint8* src_ptr
+ ; r1 = src_stride
+ ; r2 = uint8* dst_ptr
+ ; r3 = int dst_width
+ vpush {d0-d5}
+ push {r4}
+
+ adr r4, kShuf38
+
+ vld1.8 {q3}, [r4]
+1
+ MEMACCESS 0
+ vld1.8 {d0, d1, d2, d3}, [r0]!
+ subs r3, r3, #12
+ vtbl.u8 d4, {d0, d1, d2, d3}, d6
+ vtbl.u8 d5, {d0, d1, d2, d3}, d7
+ MEMACCESS 1
+ vst1.8 {d4}, [r2]!
+ MEMACCESS 1
+ vst1.32 {d5[0]}, [r2]!
+ bgt %b1
+
+ pop {r4}
+ vpop {d0-d5}
+ bx lr
+ ENDP
+
+ScaleRowDown38_3_Box_NEON PROC
+ ; input
+ ; r0 = uint8* src_ptr
+ ; r1 = src_stride
+ ; r2 = uint8* dst_ptr
+ ; r3 = int dst_width
+ vpush {q0-q3}
+ vpush {q8, q9}
+ vpush {q13-q15}
+ push {r4-r7}
+ add r4, r0, r1
+ add r4, r4, r1 ; src_ptr + src_stride * 2
+ adr r5, kMult38_Div6
+ adr r6, kShuf38_2
+ adr r7, kMult38_Div9
+
+ MEMACCESS 5
+ vld1.16 {q13}, [r5]
+ MEMACCESS 6
+ vld1.8 {q14}, [r6]
+ MEMACCESS 7
+ vld1.8 {q15}, [r7]
+ add r1, r0
+1
+ ; d0 = 00 40 01 41 02 42 03 43
+ ; d1 = 10 50 11 51 12 52 13 53
+ ; d2 = 20 60 21 61 22 62 23 63
+ ; d3 = 30 70 31 71 32 72 33 73
+ MEMACCESS 0
+ vld4.8 {d0, d1, d2, d3}, [r0]!
+ MEMACCESS 3
+ vld4.8 {d4, d5, d6, d7}, [r1]!
+ MEMACCESS 4
+ vld4.8 {d16, d17, d18, d19}, [r4]!
+ subs r3, r3, #12
+
+ ; Shuffle the input data around to align the data
+ ; so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
+ ; d0 = 00 10 01 11 02 12 03 13
+ ; d1 = 40 50 41 51 42 52 43 53
+ vtrn.u8 d0, d1
+ vtrn.u8 d4, d5
+ vtrn.u8 d16, d17
+
+ ; d2 = 20 30 21 31 22 32 23 33
+ ; d3 = 60 70 61 71 62 72 63 73
+ vtrn.u8 d2, d3
+ vtrn.u8 d6, d7
+ vtrn.u8 d18, d19
+
+ ; d0 = 00+10 01+11 02+12 03+13
+ ; d2 = 40+50 41+51 42+52 43+53
+ vpaddl.u8 q0, q0
+ vpaddl.u8 q2, q2
+ vpaddl.u8 q8, q8
+
+ ; d3 = 60+70 61+71 62+72 63+73
+ vpaddl.u8 d3, d3
+ vpaddl.u8 d7, d7
+ vpaddl.u8 d19, d19
+
+ ; combine source lines
+ vadd.u16 q0, q2
+ vadd.u16 q0, q8
+ vadd.u16 d4, d3, d7
+ vadd.u16 d4, d19
+
+ ; dst_ptr[3] = (s[6 + st * 0] + s[7 + st * 0]
+ ; + s[6 + st * 1] + s[7 + st * 1]
+ ; + s[6 + st * 2] + s[7 + st * 2]) / 6
+ vqrdmulh.s16 q2, q2, q13
+ vmovn.u16 d4, q2
+
+ ; Shuffle 2,3 reg around so that 2 can be added to the
+ ; 0,1 reg and 3 can be added to the 4,5 reg. This
+ ; requires expanding from u8 to u16 as the 0,1 and 4,5
+ ; registers are already expanded. Then do transposes
+ ; to get aligned.
+ ; q2 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
+ vmovl.u8 q1, d2
+ vmovl.u8 q3, d6
+ vmovl.u8 q9, d18
+
+ ; combine source lines
+ vadd.u16 q1, q3
+ vadd.u16 q1, q9
+
+ ; d4 = xx 20 xx 30 xx 22 xx 32
+ ; d5 = xx 21 xx 31 xx 23 xx 33
+ vtrn.u32 d2, d3
+
+ ; d4 = xx 20 xx 21 xx 22 xx 23
+ ; d5 = xx 30 xx 31 xx 32 xx 33
+ vtrn.u16 d2, d3
+
+ ; 0+1+2, 3+4+5
+ vadd.u16 q0, q1
+
+ ; Need to divide, but can't downshift as the value
+ ; isn't a power of 2. So multiply by 65536 / n
+ ; and take the upper 16 bits.
+ vqrdmulh.s16 q0, q0, q15
+
+ ; Align for table lookup, vtbl requires registers to
+ ; be adjacent
+ vmov.u8 d2, d4
+
+ vtbl.u8 d3, {d0, d1, d2}, d28
+ vtbl.u8 d4, {d0, d1, d2}, d29
+
+ MEMACCESS 1
+ vst1.8 {d3}, [r2]!
+ MEMACCESS 1
+ vst1.32 {d4[0]}, [r2]!
+ bgt %b1
+
+ pop {r4-r7}
+ vpop {q13-q15}
+ vpop {q8, q9}
+ vpop {q0-q3}
+ bx lr
+ ENDP
+
+ScaleRowDown38_2_Box_NEON PROC
+ ; input
+ ; r0 = uint8* src_ptr
+ ; r1 = src_stride
+ ; r2 = uint8* dst_ptr
+ ; r3 = int dst_width
+ vpush {q0-q3}
+ vpush {q13-q14}
+ push {r4, r5}
+ adr r4, kMult38_Div6
+ adr r5, kShuf38_2
+
+ MEMACCESS 4
+ vld1.16 {q13}, [r4]
+ MEMACCESS 5
+ vld1.8 {q14}, [r5]
+ add r1, r0
+1
+ ; d0 = 00 40 01 41 02 42 03 43
+ ; d1 = 10 50 11 51 12 52 13 53
+ ; d2 = 20 60 21 61 22 62 23 63
+ ; d3 = 30 70 31 71 32 72 33 73
+ MEMACCESS 0
+ vld4.8 {d0, d1, d2, d3}, [r0]!
+ MEMACCESS 3
+ vld4.8 {d4, d5, d6, d7}, [r1]!
+ subs r3, r3, #12
+
+ ; Shuffle the input data around to align the data
+ ; so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
+ ; d0 = 00 10 01 11 02 12 03 13
+ ; d1 = 40 50 41 51 42 52 43 53
+ vtrn.u8 d0, d1
+ vtrn.u8 d4, d5
+
+ ; d2 = 20 30 21 31 22 32 23 33
+ ; d3 = 60 70 61 71 62 72 63 73
+ vtrn.u8 d2, d3
+ vtrn.u8 d6, d7
+
+ ; d0 = 00+10 01+11 02+12 03+13
+ ; d2 = 40+50 41+51 42+52 43+53
+ vpaddl.u8 q0, q0
+ vpaddl.u8 q2, q2
+
+ ; d3 = 60+70 61+71 62+72 63+73
+ vpaddl.u8 d3, d3
+ vpaddl.u8 d7, d7
+
+ ; combine source lines
+ vadd.u16 q0, q2
+ vadd.u16 d4, d3, d7
+
+ ; dst_ptr[3] = (s[6] + s[7] + s[6+st] + s[7+st]) / 4
+ vqrshrn.u16 d4, q2, #2
+
+ ; Shuffle 2,3 reg around so that 2 can be added to the
+ ; 0,1 reg and 3 can be added to the 4,5 reg. This
+ ; requires expanding from u8 to u16 as the 0,1 and 4,5
+ ; registers are already expanded. Then do transposes
+ ; to get aligned.
+ ; q2 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
+ vmovl.u8 q1, d2
+ vmovl.u8 q3, d6
+
+ ; combine source lines
+ vadd.u16 q1, q3
+
+ ; d4 = xx 20 xx 30 xx 22 xx 32
+ ; d5 = xx 21 xx 31 xx 23 xx 33
+ vtrn.u32 d2, d3
+
+ ; d4 = xx 20 xx 21 xx 22 xx 23
+ ; d5 = xx 30 xx 31 xx 32 xx 33
+ vtrn.u16 d2, d3
+
+ ; 0+1+2, 3+4+5
+ vadd.u16 q0, q1
+
+ ; Need to divide, but can't downshift as the value
+ ; isn't a power of 2. So multiply by 65536 / n
+ ; and take the upper 16 bits.
+ vqrdmulh.s16 q0, q0, q13
+
+ ; Align for table lookup, vtbl requires registers to
+ ; be adjacent
+ vmov.u8 d2, d4
+
+ vtbl.u8 d3, {d0, d1, d2}, d28
+ vtbl.u8 d4, {d0, d1, d2}, d29
+
+ MEMACCESS 1
+ vst1.8 {d3}, [r2]!
+ MEMACCESS 1
+ vst1.32 {d4[0]}, [r2]!
+ bgt %b1
+
+ pop {r4, r5}
+ vpop {q13-q14}
+ vpop {q0-q3}
+ bx lr
+ ENDP
+
+ScaleAddRows_NEON PROC
+ ; input
+ ; r0 = uint8* src_ptr
+ ; r1 = src_stride
+ ; r2 = uint16* dst_ptr
+ ; r3 = int dst_width
+ push {r4, r5, r12}
+ ldr r4, [sp, #12] ; int src_height
+ mov r5, 0
+ vpush {q0-q3}
+
+1
+ mov r5, r0
+ mov r12, r4
+ veor q2, q2, q2
+ veor q3, q3, q3
+2
+ ; load 16 pixels into q0
+ MEMACCESS 0
+ vld1.8 {q0}, [r5], r1
+ vaddw.u8 q3, q3, d1
+ vaddw.u8 q2, q2, d0
+ subs r12, r12, #1
+ bgt %b2
+ MEMACCESS 2
+ vst1.16 {q2, q3}, [r2]! ; store pixels
+ add r0, r0, #16
+ subs r3, r3, #16 ; 16 processed per loop
+ bgt %b1
+
+ vpop {q0-q3}
+ pop {r4, r5, r12}
+ bx lr
+ ENDP
+
+; TODO(Yang Zhang): Investigate less load instructions for
+; the x/dx stepping
+ MACRO
+ LOAD2_DATA8_LANE $n
+ lsr r5, r3, #16
+ add r6, r1, r5
+ add r3, r3, r4
+ MEMACCESS 6
+ vld2.8 {d6[$n], d7[$n]}, [r6]
+ MEND
+
+dx_offset DCD 0, 1, 2, 3
+
+; The NEON version mimics this formula:
+; #define BLENDER(a, b, f) (uint8)((int)(a) +
+; ((int)(f) * ((int)(b) - (int)(a)) >> 16))
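+; The scalar x in r3 steps by dx to form each load address, while q1/q2 hold the
+; same eight x values (x + i * dx, i = 0..7) in 16.16 fixed point to supply the
+; per-lane fractions f; both advance by 8 * dx per iteration.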
+
+ScaleFilterCols_NEON PROC
+ ; input
+ ; r0 = uint8* dst_ptr
+ ; r1 = uint8* src_ptr
+ ; r2 = int dst_width
+ ; r3 = int x
+
+ push {r4-r6}
+
+ ldr r4, [sp, #12] ; int dx
+ adr r5, dx_offset
+ mov r6, r1
+
+ vpush {q0-q3}
+ vpush {q8-q13}
+
+ vdup.32 q0, r3 ; x
+ vdup.32 q1, r4 ; dx
+ vld1.32 {q2}, [r5] ; 0 1 2 3
+ vshl.i32 q3, q1, #2 ; 4 * dx
+ vmul.s32 q1, q1, q2
+ ; x , x + 1 * dx, x + 2 * dx, x + 3 * dx
+ vadd.s32 q1, q1, q0
+ ; x + 4 * dx, x + 5 * dx, x + 6 * dx, x + 7 * dx
+ vadd.s32 q2, q1, q3
+ vshl.i32 q0, q3, #1 ; 8 * dx
+1
+ LOAD2_DATA8_LANE 0
+ LOAD2_DATA8_LANE 1
+ LOAD2_DATA8_LANE 2
+ LOAD2_DATA8_LANE 3
+ LOAD2_DATA8_LANE 4
+ LOAD2_DATA8_LANE 5
+ LOAD2_DATA8_LANE 6
+ LOAD2_DATA8_LANE 7
+ vmov q10, q1
+ vmov q11, q2
+ vuzp.16 q10, q11
+ vmovl.u8 q8, d6
+ vmovl.u8 q9, d7
+ vsubl.s16 q11, d18, d16
+ vsubl.s16 q12, d19, d17
+ vmovl.u16 q13, d20
+ vmovl.u16 q10, d21
+ vmul.s32 q11, q11, q13
+ vmul.s32 q12, q12, q10
+ vrshrn.s32 d18, q11, #16
+ vrshrn.s32 d19, q12, #16
+ vadd.s16 q8, q8, q9
+ vmovn.s16 d6, q8
+
+ MEMACCESS 0
+ vst1.8 {d6}, [r0]! ; store pixels
+ vadd.s32 q1, q1, q0
+ vadd.s32 q2, q2, q0
+ subs r2, r2, #8 ; 8 processed per loop
+ bgt %b1
+
+ vpop {q8-q13}
+ vpop {q0-q3}
+ pop {r4-r6}
+ bx lr
+ ENDP
+
+ScaleARGBRowDown2_NEON PROC
+ ; input
+ ; r0 = uint8* src_ptr
+ ; r1 = ptrdiff_t src_stride
+ ; r2 = uint8* dst
+ ; r3 = int dst_width
+ vpush {q0 - q3}
+1
+ ; load even pixels into q0, odd into q1
+ MEMACCESS 0
+ vld2.32 {q0, q1}, [r0]!
+ MEMACCESS 0
+ vld2.32 {q2, q3}, [r0]!
+ subs r3, r3, #8 ; 8 processed per loop
+ MEMACCESS 1
+ vst1.8 {q1}, [r2]! ; store odd pixels
+ MEMACCESS 1
+ vst1.8 {q3}, [r2]!
+ bgt %b1
+ vpop {q0 - q3}
+ bx lr
+ ENDP
+
+
+
+ScaleARGBRowDown2Linear_NEON PROC
+ ; input
+ ; r0 = uint8* src_argb
+ ; r1 = ptrdiff_t src_stride
+ ; r2 = uint8* dst_argb
+ ; r3 = int dst_width
+ vpush {q0 - q3}
+1
+ MEMACCESS 0
+ vld4.8 {d0, d2, d4, d6}, [r0]! ; load 8 ARGB pixels.
+ MEMACCESS 0
+ vld4.8 {d1, d3, d5, d7}, [r0]! ; load next 8 ARGB pixels.
+ subs r3, r3, #8 ; 8 processed per loop
+ vpaddl.u8 q0, q0 ; B 16 bytes -> 8 shorts.
+ vpaddl.u8 q1, q1 ; G 16 bytes -> 8 shorts.
+ vpaddl.u8 q2, q2 ; R 16 bytes -> 8 shorts.
+ vpaddl.u8 q3, q3 ; A 16 bytes -> 8 shorts.
+ vrshrn.u16 d0, q0, #1 ; downshift, round and pack
+ vrshrn.u16 d1, q1, #1
+ vrshrn.u16 d2, q2, #1
+ vrshrn.u16 d3, q3, #1
+ MEMACCESS 1
+ vst4.8 {d0, d1, d2, d3}, [r2]!
+ bgt %b1
+
+
+ vpop {q0 - q3}
+ bx lr
+ ENDP
+
+ScaleARGBRowDown2Box_NEON PROC
+ ; input
+ ; r0 = uint8* src_ptr
+ ; r1 = ptrdiff_t src_stride
+ ; r2 = uint8* dst
+ ; r3 = int dst_width
+ vpush {q0 - q3}
+ vpush {q8 - q11}
+ ; change the stride to row 2 pointer
+ add r1, r1, r0
+
+1
+ MEMACCESS 0
+ vld4.8 {d0, d2, d4, d6}, [r0]! ; load 8 argb pixels.
+ MEMACCESS 0
+ vld4.8 {d1, d3, d5, d7}, [r0]! ; load next 8 argb pixels.
+ subs r3, r3, #8 ; 8 processed per loop.
+ vpaddl.u8 q0, q0 ; b 16 bytes -> 8 shorts.
+ vpaddl.u8 q1, q1 ; g 16 bytes -> 8 shorts.
+ vpaddl.u8 q2, q2 ; r 16 bytes -> 8 shorts.
+ vpaddl.u8 q3, q3 ; a 16 bytes -> 8 shorts.
+ MEMACCESS 1
+ vld4.8 {d16, d18, d20, d22}, [r1]! ; load 8 more argb pixels.
+ MEMACCESS 1
+ vld4.8 {d17, d19, d21, d23}, [r1]! ; load last 8 argb pixels.
+ vpadal.u8 q0, q8 ; b 16 bytes -> 8 shorts.
+ vpadal.u8 q1, q9 ; g 16 bytes -> 8 shorts.
+ vpadal.u8 q2, q10 ; r 16 bytes -> 8 shorts.
+ vpadal.u8 q3, q11 ; a 16 bytes -> 8 shorts.
+ vrshrn.u16 d0, q0, #2 ; downshift, round and pack
+ vrshrn.u16 d1, q1, #2
+ vrshrn.u16 d2, q2, #2
+ vrshrn.u16 d3, q3, #2
+ MEMACCESS 2
+ vst4.8 {d0, d1, d2, d3}, [r2]!
+ bgt %b1
+
+ vpop {q8 - q11}
+ vpop {q0 - q3}
+ bx lr
+ ENDP
+
+ScaleARGBRowDownEven_NEON PROC
+ ; input
+ ; r0 = uint8* src_argb
+ ; r1 = ptrdiff_t src_stride
+ ; r2 = int src_stepx
+ ; r3 = uint8* dst_argb
+ push {r4, r12}
+ ldr r4, [sp, #8] ;int dst_width
+ vpush {q0}
+
+ mov r12, r2, lsl #2
+1
+ MEMACCESS 0
+ vld1.32 {d0[0]}, [r0], r12
+ MEMACCESS 0
+ vld1.32 {d0[1]}, [r0], r12
+ MEMACCESS 0
+ vld1.32 {d1[0]}, [r0], r12
+ MEMACCESS 0
+ vld1.32 {d1[1]}, [r0], r12
+ subs r4, r4, #4 ; 4 pixels per loop.
+ MEMACCESS 1
+ vst1.8 {q0}, [r3]!
+ bgt %b1
+
+ vpop {q0}
+ pop {r4, r12}
+ bx lr
+ ENDP
+
+ScaleARGBRowDownEvenBox_NEON PROC
+ ; input
+ ; r0 = uint8* src_argb
+ ; r1 = ptrdiff_t src_stride
+ ; r2 = int src_stepx
+ ; r3 = uint8* dst_argb
+ push {r4, r12}
+ ldr r4, [sp, #8] ;int dst_width
+ vpush {q0 - q3}
+
+ mov r12, r2, lsl #2
+ add r1, r1, r0
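+ ; r1 now points at the second source row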
+1
+ MEMACCESS 0
+ vld1.8 {d0}, [r0], r12 ; Read 4 2x2 blocks -> 2x1
+ MEMACCESS 1
+ vld1.8 {d1}, [r1], r12
+ MEMACCESS 0
+ vld1.8 {d2}, [r0], r12
+ MEMACCESS 1
+ vld1.8 {d3}, [r1], r12
+ MEMACCESS 0
+ vld1.8 {d4}, [r0], r12
+ MEMACCESS 1
+ vld1.8 {d5}, [r1], r12
+ MEMACCESS 0
+ vld1.8 {d6}, [r0], r12
+ MEMACCESS 1
+ vld1.8 {d7}, [r1], r12
+ vaddl.u8 q0, d0, d1
+ vaddl.u8 q1, d2, d3
+ vaddl.u8 q2, d4, d5
+ vaddl.u8 q3, d6, d7
+ vswp.8 d1, d2 ; ab_cd -> ac_bd
+ vswp.8 d5, d6 ; ef_gh -> eg_fh
+ vadd.u16 q0, q0, q1 ; (a+b)_(c+d)
+ vadd.u16 q2, q2, q3 ; (e+f)_(g+h)
+ vrshrn.u16 d0, q0, #2 ; first 2 pixels.
+ vrshrn.u16 d1, q2, #2 ; next 2 pixels.
+ subs r4, r4, #4 ; 4 pixels per loop.
+ MEMACCESS 2
+ vst1.8 {q0}, [r3]!
+ bgt %b1
+
+ vpop {q0 - q3}
+ pop {r4, r12}
+ bx lr
+ ENDP
+
+ ; TODO(Yang Zhang): Investigate less load instructions for
+ ; the x/dx stepping
+ MACRO
+ LOAD1_DATA32_LANE $dn, $n
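+ ; r5 = x >> 16 (integer pixel index), r6 = src_argb + r5 * 4, x += dx,
+ ; then one 32-bit ARGB pixel is loaded into the requested d-register lane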
+ lsr r5, r3, #16
+ add r6, r1, r5, lsl #2
+ add r3, r3, r4
+ MEMACCESS 6
+ vld1.32 {$dn[$n]}, [r6]
+ MEND
+
+ScaleARGBCols_NEON PROC
+ ; input
+ ; r0 = uint8* dst_argb
+ ; r1 = const uint8* src_argb
+ ; r2 = int dst_width
+ ; r3 = int x
+ push {r4 - r6}
+ ldr r4, [sp,#12] ; int dx
+ mov r6, r1
+ vpush {q0, q1}
+
+1
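+ ; gather 8 ARGB pixels at x, x + dx, ..., x + 7 * dx (16.16 fixed point)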
+ LOAD1_DATA32_LANE d0, 0
+ LOAD1_DATA32_LANE d0, 1
+ LOAD1_DATA32_LANE d1, 0
+ LOAD1_DATA32_LANE d1, 1
+ LOAD1_DATA32_LANE d2, 0
+ LOAD1_DATA32_LANE d2, 1
+ LOAD1_DATA32_LANE d3, 0
+ LOAD1_DATA32_LANE d3, 1
+
+ MEMACCESS 0
+ vst1.32 {q0, q1}, [r0]! ; store pixels
+ subs r2, r2, #8 ; 8 processed per loop
+ bgt %b1
+
+ vpop {q0, q1}
+ pop {r4 - r6}
+ bx lr
+ ENDP
+
+ ; TODO(Yang Zhang): Investigate less load instructions for
+ ; the x/dx stepping
+ MACRO
+ LOAD2_DATA32_LANE $dn1, $dn2, $n
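+ ; same indexing as LOAD1_DATA32_LANE, but loads the pixel at x >> 16 and
+ ; its right neighbour (for filtering) into the two destination lanes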
+ lsr r5, r3, #16
+ add r6, r1, r5, lsl #2
+ add r3, r3, r4
+ MEMACCESS 6
+ vld2.32 {$dn1[$n], $dn2[$n]}, [r6]
+ MEND
+
+ScaleARGBFilterCols_NEON PROC
+ ; input
+ ; r0 = uint8* dst_argb
+ ; r1 = const uint8* src_argb
+ ; r2 = int dst_width
+ ; r3 = int x
+
+ push {r4 - r6}
+ ldr r4, [sp,#12] ;int dx
+ adr r5, dx_offset
+ mov r6, r1
+ vpush {q0 - q3}
+ vpush {q8 - q15}
+
+ vdup.32 q0, r3 ; x
+ vdup.32 q1, r4 ; dx
+ vld1.32 {q2}, [r5] ; 0 1 2 3
+ vshl.i32 q9, q1, #2 ; 4 * dx
+ vmul.s32 q1, q1, q2
+ vmov.i8 q3, #0x7f ; 0x7F
+ vmov.i16 q15, #0x7f ; 0x7F
+ ; x , x + 1 * dx, x + 2 * dx, x + 3 * dx
+ vadd.s32 q8, q1, q0
+1
+ ; d0, d1: a
+ ; d2, d3: b
+ LOAD2_DATA32_LANE d0, d2, 0
+ LOAD2_DATA32_LANE d0, d2, 1
+ LOAD2_DATA32_LANE d1, d3, 0
+ LOAD2_DATA32_LANE d1, d3, 1
+ vshrn.i32 d22, q8, #9
+ vand.16 d22, d22, d30
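+ ; d22 = 7-bit blend fraction f per pixel (top 7 bits of the x fraction)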
+ vdup.8 d24, d22[0]
+ vdup.8 d25, d22[2]
+ vdup.8 d26, d22[4]
+ vdup.8 d27, d22[6]
+ vext.8 d4, d24, d25, #4
+ vext.8 d5, d26, d27, #4 ; f
+ veor.8 q10, q2, q3 ; 0x7f ^ f
+ vmull.u8 q11, d0, d20
+ vmull.u8 q12, d1, d21
+ vmull.u8 q13, d2, d4
+ vmull.u8 q14, d3, d5
+ vadd.i16 q11, q11, q13
+ vadd.i16 q12, q12, q14
+ vshrn.i16 d0, q11, #7
+ vshrn.i16 d1, q12, #7
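+ ; q0 = (a * (0x7f ^ f) + b * f) >> 7, the filtered ARGB pixels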
+
+ MEMACCESS 0
+ vst1.32 {d0, d1}, [r0]! ; store pixels
+ vadd.s32 q8, q8, q9
+ subs r2, r2, #4 ; 4 processed per loop
+ bgt %b1
+
+ vpop {q8 - q15}
+ vpop {q0 - q3}
+ pop {r4 - r6}
+ bx lr
+ ENDP
+
+ END
+
diff --git a/source/scale_neon.cc b/source/scale_neon.cc
index 9b4dce3..f98c939 100644
--- a/source/scale_neon.cc
+++ b/source/scale_neon.cc
@@ -15,6 +15,16 @@ namespace libyuv {
extern "C" {
#endif
+ /* !!! IMPORTANT: The following methods have been ported to pure assembly in scale_neon.asm,
+ * because MS Visual Studio does not support inline assembly for ARM.
+ *
+ * ANY CHANGE TO THESE METHOD IMPLEMENTATIONS MUST ALSO BE MADE IN scale_neon.asm.
+ *
+ * Eventually, only the pure assembly implementation should be used on all platforms
+ * to avoid code duplication.
+ */
+
+
// This module is for GCC Neon.
#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \
!defined(__aarch64__)
diff --git a/unit_test/unit_test.cc b/unit_test/unit_test.cc
index 7f8bcf8..9dd7ae4 100644
--- a/unit_test/unit_test.cc
+++ b/unit_test/unit_test.cc
@@ -388,7 +388,7 @@ LibYUVBaseTest::LibYUVBaseTest()
1280.0);
}
-int main(int argc, char** argv) {
+int main(int argc, char* argv[]) {
::testing::InitGoogleTest(&argc, argv);
// AllowCommandLineParsing allows us to ignore flags passed on to us by
// Chromium build bots without having to explicitly disable them.
diff --git a/util/compare.cc b/util/compare.cc
index ef0beef..5066b61 100644
--- a/util/compare.cc
+++ b/util/compare.cc
@@ -17,7 +17,7 @@
#include "libyuv/compare.h"
#include "libyuv/version.h"
-int main(int argc, char** argv) {
+int main(int argc, char* argv[]) {
if (argc < 1) {
printf("libyuv compare v%d\n", LIBYUV_VERSION);
printf("compare file1.yuv file2.yuv\n");
@@ -62,4 +62,5 @@ int main(int argc, char** argv) {
fclose(fin2);
}
fclose(fin1);
+ return 0;
}
diff --git a/util/convert.cc b/util/convert.cc
index acaf43a..3cb0b51 100644
--- a/util/convert.cc
+++ b/util/convert.cc
@@ -78,7 +78,7 @@ void PrintHelp(const char* program) {
exit(0);
}
-void ParseOptions(int argc, const char* argv[]) {
+void ParseOptions(int argc, char* argv[]) {
if (argc <= 1)
PrintHelp(argv[0]);
for (int c = 1; c < argc; ++c) {
@@ -189,7 +189,7 @@ static int TileARGBScale(const uint8* src_argb,
return 0;
}
-int main(int argc, const char* argv[]) {
+int main(int argc, char* argv[]) {
ParseOptions(argc, argv);
// Open original file (first file argument)
diff --git a/util/psnr_main.cc b/util/psnr_main.cc
index 01e8777..2d412a3 100644
--- a/util/psnr_main.cc
+++ b/util/psnr_main.cc
@@ -149,7 +149,7 @@ void PrintHelp(const char* program) {
exit(0);
}
-void ParseOptions(int argc, const char* argv[]) {
+void ParseOptions(int argc, char* argv[]) {
if (argc <= 1)
PrintHelp(argv[0]);
for (int c = 1; c < argc; ++c) {
@@ -315,7 +315,7 @@ bool UpdateMetrics(uint8* ch_org,
return ismin;
}
-int main(int argc, const char* argv[]) {
+int main(int argc, char* argv[]) {
ParseOptions(argc, argv);
if (!do_psnr && !do_ssim) {
do_psnr = true;