yuv422_2_rgb8888_neon yuv420_2_rgb8888_neon
# This is the build file; it compiles the .S sources directly into a shared library.
# https://github.com/jasonchuang/CameraStreamer/blob/master/jni/yuv2rgb/Android.mk
LOCAL_PATH := $(call my-dir)
#the yuv2rgb library
include $(CLEAR_VARS)
LOCAL_ALLOW_UNDEFINED_SYMBOLS=false
LOCAL_MODULE := yuv2rgb
LOCAL_CFLAGS := -I$(AVPLAYER_PATH)"/jni/yuv2rgb/include" -D__STDC_CONSTANT_MACROS
LOCAL_CFLAGS += $(CC_OPTIMIZE_FLAG)
LOCAL_SRC_FILES := src/yuv2rgb16tab.c src/yuv420rgb8888.S src/yuv420rgb565.S
#LOCAL_SRC_FILES := src/yuv2rgb16tab.c src/yuv420rgb8888.s src/yuv420rgb565.s src/yuv422rgb565.s src/yuv2rgb555.s src/yuv2rgbX.s src/yuv420rgb888.s src/yuv422rgb565.s src/yuv422rgb888.s src/yuv422rgb8888.s src/yuv444rgb565.s src/yuv444rgb888.s src/yuv444rgb8888.s
ifeq ($(TARGET_ARCH_ABI),x86)
LOCAL_SRC_FILES := src/yuv2rgb16tab.c src/yuv420rgb8888c.c src/yuv420rgb565c.c
endif
ifeq ($(TARGET_ARCH_ABI),mips)
LOCAL_SRC_FILES := src/yuv2rgb16tab.c src/yuv420rgb8888c.c src/yuv420rgb565c.c
endif
LOCAL_SHARED_LIBRARIES :=
LOCAL_STATIC_LIBRARIES :=
LOCAL_LDLIBS := -ldl -llog
include $(BUILD_SHARED_LIBRARY)
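/*########## Runtime loading sketch ##########
The module above links with -ldl, so one option is to resolve the NEON entry
point at runtime instead of linking against the library at build time. This is
only a minimal sketch, assuming the NDK's default lib<LOCAL_MODULE>.so naming
(libyuv2rgb.so here); the loading strategy and error handling are illustrative,
not taken from the original project. */
#include <dlfcn.h>
#include <stdint.h>
#include <stdio.h>

typedef void (*yuv420_2_rgb8888_fn)(uint8_t *dst, const uint8_t *y,
                                    const uint8_t *u, const uint8_t *v,
                                    int width, int height,
                                    int y_pitch, int uv_pitch, int rgb_pitch);

static yuv420_2_rgb8888_fn load_yuv420_converter(void)
{
    /* dlopen by soname; on Android the loader searches the app's native
       library directory. */
    void *handle = dlopen("libyuv2rgb.so", RTLD_NOW);
    if (handle == NULL) {
        fprintf(stderr, "dlopen failed: %s\n", dlerror());
        return NULL;
    }
    /* Resolve the NEON entry point declared in yuv2rgb.neon.h. */
    return (yuv420_2_rgb8888_fn)dlsym(handle, "yuv420_2_rgb8888_neon");
}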
/*########## This is the header file ##########
https://github.com/jasonchuang/CameraStreamer/blob/master/jni/yuv2rgb/include/yuv2rgb.neon.h
*/
/*
* Copyright (C) 2005-2013 Team XBMC
* http://xbmc.org
*
* This Program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2, or (at your option)
* any later version.
*
* This Program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with XBMC; see the file COPYING. If not, see
* <http://www.gnu.org/licenses/>.
*
*/
#include <stdint.h>

#ifdef __cplusplus
extern "C" {
#endif
#if defined(__ARM_NEON__)
void yuv420_2_rgb8888_neon
(
uint8_t *dst_ptr,
const uint8_t *y_ptr,
const uint8_t *u_ptr,
const uint8_t *v_ptr,
int width,
int height,
int y_pitch,
int uv_pitch,
int rgb_pitch
);
void yuv422_2_rgb8888_neon
(
uint8_t *dst_ptr,
const uint8_t *y_ptr,
const uint8_t *u_ptr,
const uint8_t *v_ptr,
int width,
int height,
int y_pitch,
int uv_pitch,
int rgb_pitch
);
#endif
#ifdef __cplusplus
}
#endif
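/*########## Calling sketch ##########
A minimal example of driving yuv420_2_rgb8888_neon from C. It assumes a
NEON-enabled ARM build and planar YUV 4:2:0 input with tightly packed rows
(pitch == width); the width must be at least 16 and the height even, as the
assembly below requires. The function name, the heap allocation and the
include path are illustrative, not taken from the original project. */
#include <stdint.h>
#include <stdlib.h>
#include "yuv2rgb.neon.h"   /* the header above; the include path is an assumption */

/* Convert one planar YUV 4:2:0 frame into a freshly allocated 32-bit
   (8888) output buffer, 4 bytes per pixel. */
static uint8_t *convert_yuv420_to_rgb8888(const uint8_t *y, const uint8_t *u,
                                          const uint8_t *v, int width, int height)
{
    uint8_t *rgb = (uint8_t *)malloc((size_t)width * (size_t)height * 4);
    if (rgb == NULL)
        return NULL;
    yuv420_2_rgb8888_neon(rgb, y, u, v,
                          width, height,
                          width,       /* y_pitch:   bytes per luma row   */
                          width / 2,   /* uv_pitch:  bytes per chroma row */
                          width * 4);  /* rgb_pitch: bytes per output row */
    return rgb;
}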
/*############# Below is the implementation file, written in ARM assembly #############*/
/*
https://github.com/jasonchuang/CameraStreamer/blob/master/jni/yuv2rgb/src/yuv2rgb.neon.S
*/
// Copyright (c) 2011 ARM Limited. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
//
//
// http://code.google.com/p/chromium/issues/detail?id=71403
//
//
//
#ifdef __ARM_NEON__
/* Initial ARM Neon implementation of core YUV2RGB functions. */
.text
.align 4
#ifndef __APPLE__
.global yuv420_2_rgb8888_neon
.type yuv420_2_rgb8888_neon, %function
#else
.globl _yuv420_2_rgb8888_neon
#endif
#ifndef __APPLE__
.global yuv422_2_rgb8888_neon
.type yuv422_2_rgb8888_neon, %function
#else
.globl _yuv422_2_rgb8888_neon
#endif
/* Constants */
#define coef_y d0
#define coef_v_r d1
#define coef_u_g d2
#define coef_v_g d3
#define coef_u_b d4
/* D5 is spare */
#define bias_r q3
#define bias_r_lo d6
#define bias_r_hi d7
#define bias_g q4
#define bias_g_lo d8
#define bias_g_hi d9
#define bias_b q5
#define bias_b_lo d10
#define bias_b_hi d11
/* Input data */
#define y_even d24
#define y_odd d26
#define u d16 /*overlaps with q8 - b_delta, but safe */
#define v d17 /*overlaps with q8 - b_delta, but safe */
/* Chrominance signal for whole 16x2 block */
#define r_delta q6
#define g_delta q7
#define b_delta q8
/* Current group of 8 pixels */
#define red q9
#define grn q10
#define blu q11
#define y_scale q15
/* output area, in the right order for interleaved output with VST4 */
#define blu8_e d24 /* overlaps with y_even, but safe */
#define red8_e d25
#define blu8_o d26 /* overlaps with y_odd, but safe */
#define red8_o d27
#define grn8_e d28
#define alp8_e d29
#define grn8_o d30 /* overlaps with q15 - y_scale, but safe */
#define alp8_o d31 /* overlaps with q15 - y_scale, but safe */
/* ARM registers */
#define rgb_t_ptr r0
#define y_t_ptr r1
#define u_ptr r2
#define v_ptr r3
#define width r4
#define height r5
#define y_pitch r6
#define uv_pitch r7
#define rgb_pitch r8
#define count r9
#define aligned_count sl
#define rgb_b_ptr fp
#define y_b_ptr ip
/* Constants */
/* 8-bit constants can be loaded into vectors using VMOV */
#define C_Y_SCALE 74 /* Y scale , 74 */
#define C_V_RED 102 /* v -> red coefficient, 102 */
#define C_U_GREEN 25 /* u -> green , -25 */
#define C_V_GREEN 52 /* v -> green , -52 */
#define C_U_BLUE 129 /* u -> blue, +129 */
/* Coefficients */
.align 4
coefficients:
#coeff_bias_r:
.short -14240 /* bias_r = 74 * (-16) + (102 * -128) */
/* -1,184 + -13,056 */
#coeff_bias_g:
.short 8672 /* bias_g = 74 * (-16) - 25 * (-128) - ( 52 * -128) */
/* -1,184 - -3200 - -6,656 */
#coeff_bias_b:
.short -17696 /* bias_b = 74 * (-16) + 129 * (-128) */
/* -1,184 + -16,512 */
#coeff_pad:
.short 0
#ifndef __APPLE__
yuv420_2_rgb8888_neon:
#else
_yuv420_2_rgb8888_neon:
#endif
/* r0 = dst_ptr */
/* r1 = y_ptr */
/* r2 = u_ptr */
/* r3 = v_ptr */
/* <> = width */
/* <> = height */
/* <> = y_pitch */
/* <> = uv_pitch */
/* <> = rgb_pitch */
#ifndef __APPLE__
.fnstart
#endif
push {r4-r12, lr} /* 10 words */
vpush {q4-q7} /* 4Q -> 16 words */
ldr width, [sp, #26*4]
ldr height, [sp, #27*4]
ldr y_pitch, [sp, #28*4]
ldr uv_pitch, [sp, #29*4]
ldr rgb_pitch, [sp, #30*4]
adr lr, coefficients
/* We can't cope with a width less than 16. Check for that. */
cmp width, #16
vpoplt {q4-q7}
poplt {r4-r12, pc}
/* Load up vectors containing the bias values. */
vld1.s16 {bias_r_lo[], bias_r_hi[]}, [lr]!
vld1.s16 {bias_g_lo[], bias_g_hi[]}, [lr]!
vld1.s16 {bias_b_lo[], bias_b_hi[]}, [lr]!
/* Build coefficient vectors containing the same value in each element. */
vmov.u8 coef_y, #C_Y_SCALE
vmov.u8 coef_v_r, #C_V_RED
vmov.u8 coef_u_g, #C_U_GREEN
vmov.u8 coef_v_g, #C_V_GREEN
vmov.u8 coef_u_b, #C_U_BLUE
loop_v_420:
add y_b_ptr, y_t_ptr, y_pitch
add rgb_b_ptr, rgb_t_ptr, rgb_pitch
mov aligned_count, width
/* If width is not an integer multiple of 16, run the
first pass through the loop with the correct number
of pixels to correct the size for the remaining loops. */
ands count, width, #15
/* If we're already aligned (i.e. count is now 0), set count
to 16 to run the first loop as normal. */
moveq count, #16
loop_h_420:
/*****************************/
/* COMMON CODE FOR BOTH ROWS */
/*****************************/
/* Load u and v. */
vld1.u8 v, [v_ptr]
add v_ptr, count, ASR #1
vld1.u8 u, [u_ptr]
add u_ptr, count, ASR #1
/* Calculate contribution from chrominance signals. */
vmull.u8 r_delta, v, coef_v_r
vmull.u8 g_delta, u, coef_u_g
vmlal.u8 g_delta, v, coef_v_g
vmull.u8 b_delta, u, coef_u_b
/* add bias. */
vadd.s16 r_delta, r_delta, bias_r
vsub.s16 g_delta, bias_g, g_delta
vadd.s16 b_delta, b_delta, bias_b
/* Attempt to preload the next set of u and v input data, for
better performance. */
pld [v_ptr]
pld [u_ptr]
/***********/
/* TOP ROW */
/***********/
/* Top row: Load 16 pixels of y, even and odd. */
vld2.u8 {y_even, y_odd}, [y_t_ptr], count
/* Top row, even: combine luminance and chrominance. */
vmull.u8 y_scale, y_even, coef_y
vqadd.s16 red, y_scale, r_delta
vqadd.s16 grn, y_scale, g_delta
vqadd.s16 blu, y_scale, b_delta
/* Top row, even: set up alpha data. */
vmov.u8 alp8_e, #0xFF
/* Top row, even: clamp, rescale and clip colour components to 8 bits. */
vqrshrun.s16 red8_e, red, #6
vqrshrun.s16 grn8_e, grn, #6
vqrshrun.s16 blu8_e, blu, #6
/* Top row: attempt to preload the next set of Y data, for
better performance. */
pld [y_t_ptr]
/* Top row, even: interleave the colour and alpha components
ready for storage. */
vzip.u8 red8_e, alp8_e
vzip.u8 blu8_e, grn8_e
/* Top row, odd: combine luminance and chrominance. */
vmull.u8 y_scale, y_odd, coef_y
vqadd.s16 red, y_scale, r_delta
vqadd.s16 grn, y_scale, g_delta
vqadd.s16 blu, y_scale, b_delta
/* Top row, odd: set up alpha data. */
vmov.u8 alp8_o, #0xFF
/* Top row, odd: clamp, rescale and clip colour components to 8 bits. */
vqrshrun.s16 red8_o, red, #6
vqrshrun.s16 blu8_o, blu, #6
vqrshrun.s16 grn8_o, grn, #6
/* Top row, odd: interleave the colour and alpha components
ready for storage. */
vzip.u8 red8_o, alp8_o
vzip.u8 blu8_o, grn8_o
/* Top row: Store 16 pixels of ARGB32, interleaving even and
odd. */
vst4.u16 {blu8_e, red8_e, blu8_o, red8_o}, [rgb_t_ptr]
add rgb_t_ptr, count, LSL #1
vst4.u16 {grn8_e, alp8_e, grn8_o, alp8_o}, [rgb_t_ptr]
add rgb_t_ptr, count, LSL #1
/**************/
/* BOTTOM ROW */
/**************/
/* Bottom row: Load 16 pixels of y, even and odd. */
vld2.u8 {y_even, y_odd}, [y_b_ptr], count
/* Bottom row, even: combine luminance and chrominance. */
vmull.u8 y_scale, y_even, coef_y
vqadd.s16 red, y_scale, r_delta
vqadd.s16 grn, y_scale, g_delta
vqadd.s16 blu, y_scale, b_delta
/* Bottom row, even: set up alpha data. */
vmov.u8 alp8_e, #0xFF
/* Bottom row, even: clamp, rescale and clip colour components to 8 bits. */
vqrshrun.s16 red8_e, red, #6
vqrshrun.s16 blu8_e, blu, #6
vqrshrun.s16 grn8_e, grn, #6
/* Bottom row: attempt to preload the next set of Y data, for
better performance. */
pld [y_b_ptr]
/* Bottom row, even: interleave the colour and alpha components
ready for storage. */
vzip.u8 red8_e, alp8_e
vzip.u8 blu8_e, grn8_e
/* Bottom row, odd: combine luminance and chrominance. */
vmull.u8 y_scale, y_odd, coef_y
vqadd.s16 red, y_scale, r_delta
vqadd.s16 grn, y_scale, g_delta
vqadd.s16 blu, y_scale, b_delta
/* Bottom row, odd: set up alpha data. */
vmov.u8 alp8_o, #0xFF
/* Bottom row, odd: clamp, rescale and clip colour components to 8 bits. */
vqrshrun.s16 red8_o, red, #6
vqrshrun.s16 blu8_o, blu, #6
vqrshrun.s16 grn8_o, grn, #6
/* Bottom row, odd: Interleave the colour and alpha components
ready for storage. */
vzip.u8 red8_o, alp8_o
vzip.u8 blu8_o, grn8_o
/* Have we reached the end of the row yet? */
subs aligned_count, aligned_count, count
/* Bottom row: Store 16 pixels of ARGB32, interleaving even and
odd. */
vst4.u16 {blu8_e, red8_e, blu8_o, red8_o}, [rgb_b_ptr]
add rgb_b_ptr, count, LSL #1
vst4.u16 {grn8_e, alp8_e, grn8_o, alp8_o}, [rgb_b_ptr]
add rgb_b_ptr, count, LSL #1
/* On the second (and subsequent) passes through this code,
we'll always be working on 16 pixels at once. */
mov count, #16
bgt loop_h_420
/* Update pointers for new row of data. */
sub rgb_t_ptr, width, LSL #2
sub y_t_ptr, width
sub u_ptr, width, ASR #1
sub v_ptr, width, ASR #1
add rgb_t_ptr, rgb_pitch, LSL #1
add y_t_ptr, y_pitch, LSL #1
add u_ptr, uv_pitch
add v_ptr, uv_pitch
/* Have we reached the bottom row yet? */
subs height, height, #2
bgt loop_v_420
vpop {q4-q7}
pop {r4-r12, pc}
#ifndef __APPLE__
.fnend
#endif
/* Much the same as the above code, but simplified to work on a single
row at a time. Each U and V value only covers 2 adjacent pixels on
one row, not a 2x2 matrix */
#define rgb_ptr rgb_t_ptr
#define y_ptr y_t_ptr
#ifndef __APPLE__
yuv422_2_rgb8888_neon:
#else
_yuv422_2_rgb8888_neon:
#endif
/* r0 = dst_ptr */
/* r1 = y_ptr */
/* r2 = u_ptr */
/* r3 = v_ptr */
/* <> = width */
/* <> = height */
/* <> = y_pitch */
/* <> = uv_pitch */
/* <> = rgb_pitch */
#ifndef __APPLE__
.fnstart
#endif
push {r4-r12, lr} /* 10 words */
vpush {q4-q7} /* 4Q -> 16 words */
ldr width, [sp, #26*4]
ldr height, [sp, #27*4]
ldr y_pitch, [sp, #28*4]
ldr uv_pitch, [sp, #29*4]
ldr rgb_pitch, [sp, #30*4]
adr lr, coefficients
/* We can't cope with a width less than 16. Check for that. */
cmp width, #16
vpoplt {q4-q7}
poplt {r4-r12, pc}
/* Load up vectors containing the bias values. */
vld1.s16 {bias_r_lo[], bias_r_hi[]}, [lr]!
vld1.s16 {bias_g_lo[], bias_g_hi[]}, [lr]!
vld1.s16 {bias_b_lo[], bias_b_hi[]}, [lr]!
/* Build coefficient vectors containing the same value in each element. */
vmov.u8 coef_y, #C_Y_SCALE
vmov.u8 coef_v_r, #C_V_RED
vmov.u8 coef_u_g, #C_U_GREEN
vmov.u8 coef_v_g, #C_V_GREEN
vmov.u8 coef_u_b, #C_U_BLUE
loop_v_422:
mov aligned_count, width
/* If width is not an integer multiple of 16, run the
first pass through the loop with the correct number
of pixels to correct the size for the remaining loops. */
ands count, width, #15
/* If we're already aligned (i.e. count is now 0), set count
to 16 to run the first loop as normal. */
moveq count, #16
loop_h_422:
/* Load u and v. */
vld1.u8 v, [v_ptr]
add v_ptr, count, ASR #1
vld1.u8 u, [u_ptr]
add u_ptr, count, ASR #1
/* Calculate contribution from chrominance signals. */
vmull.u8 r_delta, v, coef_v_r
vmull.u8 g_delta, u, coef_u_g
vmlal.u8 g_delta, v, coef_v_g
vmull.u8 b_delta, u, coef_u_b
/* Attempt to preload the next set of u and v input data, for
better performance. */
pld [v_ptr]
pld [u_ptr]
/* Load 16 pixels of y, even and odd. */
vld2.u8 {y_even, y_odd}, [y_ptr], count
/* Add bias. */
vadd.s16 r_delta, r_delta, bias_r
vsub.s16 g_delta, bias_g, g_delta
vadd.s16 b_delta, b_delta, bias_b
/* Even: combine luminance and chrominance. */
vmull.u8 y_scale, y_even, coef_y
vqadd.s16 red, y_scale, r_delta
vqadd.s16 grn, y_scale, g_delta
vqadd.s16 blu, y_scale, b_delta
/* Even: set up alpha data. */
vmov.u8 alp8_e, #0xFF
/* Attempt to preload the next set of Y data, for better
performance. */
pld [y_ptr]
/* Even: clamp, rescale and clip colour components to 8 bits. */
vqrshrun.s16 red8_e, red, #6
vqrshrun.s16 grn8_e, grn, #6
vqrshrun.s16 blu8_e, blu, #6
/* Even: Interleave the colour and alpha components
ready for storage. */
vzip.u8 red8_e, alp8_e
vzip.u8 blu8_e, grn8_e
/* Odd: combine luminance and chrominance. */
vmull.u8 y_scale, y_odd, coef_y
vqadd.s16 red, y_scale, r_delta
vqadd.s16 grn, y_scale, g_delta
vqadd.s16 blu, y_scale, b_delta
/* Odd: set up alpha data. */
vmov.u8 alp8_o, #0xFF
/* Odd: clamp, rescale and clip colour components to 8 bits. */
vqrshrun.s16 red8_o, red, #6
vqrshrun.s16 blu8_o, blu, #6
vqrshrun.s16 grn8_o, grn, #6
/* Odd: Interleave the colour and alpha components
ready for storage. */
vzip.u8 red8_o, alp8_o
vzip.u8 blu8_o, grn8_o
/* Have we reached the end of the row yet? */
subs aligned_count, aligned_count, count
/* Store 16 pixels of ARGB32, interleaving even and odd. */
vst4.u16 {blu8_e, red8_e, blu8_o, red8_o}, [rgb_ptr]
add rgb_ptr, count, LSL #1
vst4.u16 {grn8_e, alp8_e, grn8_o, alp8_o}, [rgb_ptr]
add rgb_ptr, count, LSL #1
/* On the second (and subsequent) passes through this code,
we'll always be working on 16 pixels at once. */
mov count, #16
bgt loop_h_422
/* Update pointers for new row of data. */
sub rgb_ptr, width, LSL #2
sub y_ptr, width
sub u_ptr, width, ASR #1
sub v_ptr, width, ASR #1
add rgb_ptr, rgb_pitch
add y_ptr, y_pitch
add u_ptr, uv_pitch
add v_ptr, uv_pitch
/* Have we reached the bottom yet? */
subs height, height, #1
bgt loop_v_422
vpop {q4-q7}
pop {r4-r12, pc}
#ifndef __APPLE__
.fnend
#endif
#endif /* __ARM_NEON__ */
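/*########## Scalar reference sketch ##########
For readers checking the NEON arithmetic above, this is a plain-C version of
the same fixed-point conversion for a single pixel: the 74/102/25/52/129
coefficients, the precomputed biases (-14240, 8672, -17696) and the rounding
right shift by 6 with unsigned saturation all mirror the assembly. The function
names are illustrative; the original project ships its own C fallbacks
(yuv420rgb8888c.c), which this sketch does not reproduce. */
#include <stdint.h>

/* Saturate to 0..255, like the unsigned narrowing in vqrshrun. */
static uint8_t clamp_u8(int x)
{
    return (uint8_t)(x < 0 ? 0 : (x > 255 ? 255 : x));
}

static void yuv_to_rgb_pixel(uint8_t y, uint8_t u, uint8_t v,
                             uint8_t *r, uint8_t *g, uint8_t *b)
{
    int ys = 74 * y;                    /* C_Y_SCALE * Y                         */
    int rd = 102 * v - 14240;           /* C_V_RED * V   + bias_r = 74*-16 + 102*-128 */
    int gd = 8672 - 25 * u - 52 * v;    /* bias_g - C_U_GREEN*U - C_V_GREEN*V    */
    int bd = 129 * u - 17696;           /* C_U_BLUE * U  + bias_b = 74*-16 + 129*-128 */

    /* vqrshrun.s16 #6: add the rounding constant, shift right by 6, saturate. */
    *r = clamp_u8((ys + rd + 32) >> 6);
    *g = clamp_u8((ys + gd + 32) >> 6);
    *b = clamp_u8((ys + bd + 32) >> 6);
}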