Last active
October 18, 2015 15:26
-
-
Save commshare/c5f3e922ff74adfe4cf7 to your computer and use it in GitHub Desktop.
yuv422_2_rgb8888_neon yuv420_2_rgb8888_neon
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/** This is the build file; it compiles the .S files directly into a shared library.
https://github.com/jasonchuang/CameraStreamer/blob/master/jni/yuv2rgb/Android.mk
*/
# Android NDK module description for the yuv2rgb shared library.
LOCAL_PATH := $(call my-dir)

include $(CLEAR_VARS)

LOCAL_MODULE                  := yuv2rgb
LOCAL_ALLOW_UNDEFINED_SYMBOLS := false

# Header search path, constant macros and the project-wide optimisation flags.
LOCAL_CFLAGS := -I$(AVPLAYER_PATH)"/jni/yuv2rgb/include" -D__STDC_CONSTANT_MACROS $(CC_OPTIMIZE_FLAG)

# x86 and mips build the plain C converters (*c.c); every other ABI builds
# the .S assembly sources.
ifneq ($(filter x86 mips,$(TARGET_ARCH_ABI)),)
LOCAL_SRC_FILES := src/yuv2rgb16tab.c src/yuv420rgb8888c.c src/yuv420rgb565c.c
else
LOCAL_SRC_FILES := src/yuv2rgb16tab.c src/yuv420rgb8888.S src/yuv420rgb565.S
#LOCAL_SRC_FILES := src/yuv2rgb16tab.c src/yuv420rgb8888.s src/yuv420rgb565.s src/yuv422rgb565.s src/yuv2rgb555.s src/yuv2rgbX.s src/yuv420rgb888.s src/yuv422rgb565.s src/yuv422rgb888.s src/yuv422rgb8888.s src/yuv444rgb565.s src/yuv444rgb888.s src/yuv444rgb8888.s
endif

# No library dependencies beyond the system libraries named in LOCAL_LDLIBS.
LOCAL_SHARED_LIBRARIES :=
LOCAL_STATIC_LIBRARIES :=
LOCAL_LDLIBS           := -ldl -llog

include $(BUILD_SHARED_LIBRARY)
/*########## This is the header file #################
https://github.com/jasonchuang/CameraStreamer/blob/master/jni/yuv2rgb/include/yuv2rgb.neon.h
*/
/* | |
* Copyright (C) 2005-2013 Team XBMC | |
* http://xbmc.org | |
* | |
* This Program is free software; you can redistribute it and/or modify | |
* it under the terms of the GNU General Public License as published by | |
* the Free Software Foundation; either version 2, or (at your option) | |
* any later version. | |
* | |
* This Program is distributed in the hope that it will be useful, | |
* but WITHOUT ANY WARRANTY; without even the implied warranty of | |
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
* GNU General Public License for more details. | |
* | |
* You should have received a copy of the GNU General Public License | |
* along with XBMC; see the file COPYING. If not, see | |
* <http://www.gnu.org/licenses/>. | |
* | |
*/ | |
#ifndef YUV2RGB_NEON_H
#define YUV2RGB_NEON_H

#include <stdint.h> /* uint8_t, used by the prototypes below */

#ifdef __cplusplus
extern "C" {
#endif

#if defined(__ARM_NEON__)

/**
 * Convert planar YUV 4:2:0 to 32-bit pixels (NEON assembly implementation).
 *
 * One row of U/V samples is shared by two rows of Y, so the routine
 * converts two output rows per vertical step.
 *
 * @param dst_ptr   destination buffer; each pixel is written as the four
 *                  bytes B, G, R, 0xFF
 * @param y_ptr     luma plane
 * @param u_ptr     U chroma plane
 * @param v_ptr     V chroma plane
 * @param width     pixels per row; the routine returns without converting
 *                  anything when width is less than 16
 * @param height    number of rows
 * @param y_pitch   byte distance between consecutive luma rows
 * @param uv_pitch  byte distance between consecutive chroma rows
 * @param rgb_pitch byte distance between consecutive destination rows
 */
void yuv420_2_rgb8888_neon
(
    uint8_t *dst_ptr,
    const uint8_t *y_ptr,
    const uint8_t *u_ptr,
    const uint8_t *v_ptr,
    int width,
    int height,
    int y_pitch,
    int uv_pitch,
    int rgb_pitch
);

/**
 * Convert planar YUV 4:2:2 to 32-bit pixels (NEON assembly implementation).
 *
 * Same parameters and output layout as yuv420_2_rgb8888_neon(), but every
 * luma row has its own row of U/V samples, so one row is converted per
 * vertical step.
 */
void yuv422_2_rgb8888_neon
(
    uint8_t *dst_ptr,
    const uint8_t *y_ptr,
    const uint8_t *u_ptr,
    const uint8_t *v_ptr,
    int width,
    int height,
    int y_pitch,
    int uv_pitch,
    int rgb_pitch
);

#endif /* __ARM_NEON__ */

#ifdef __cplusplus
}
#endif

#endif /* YUV2RGB_NEON_H */
/*############# Below is the implementation file, written in assembly #########*/
/*
https://github.com/jasonchuang/CameraStreamer/blob/master/jni/yuv2rgb/src/yuv2rgb.neon.S
*/
// Copyright (c) 2011 ARM Limited. All rights reserved. | |
// Use of this source code is governed by a BSD-style license that can be | |
// found in the LICENSE file. | |
// | |
// | |
// http://code.google.com/p/chromium/issues/detail?id=71403 | |
// | |
// | |
// | |
/* Everything up to the matching #endif is only assembled for NEON targets. */
#ifdef __ARM_NEON__

/* ARM NEON implementation of the core YUV -> RGB conversion routines. */

    .text
    .align 4

/* Export both entry points.  Apple builds use underscore-prefixed symbol
   names; all other builds additionally mark the symbols as functions. */
#ifdef __APPLE__
    .globl  _yuv420_2_rgb8888_neon
    .globl  _yuv422_2_rgb8888_neon
#else
    .global yuv420_2_rgb8888_neon
    .type   yuv420_2_rgb8888_neon, %function
    .global yuv422_2_rgb8888_neon
    .type   yuv422_2_rgb8888_neon, %function
#endif
/* ------------------------------------------------------------------ */
/* Symbolic names for the NEON and ARM registers used by both routines */
/* ------------------------------------------------------------------ */

/* Multipliers: one 8-bit coefficient replicated across a D register. */
#define coef_y          d0
#define coef_v_r        d1
#define coef_u_g        d2
#define coef_v_g        d3
#define coef_u_b        d4
/* d5 is unused. */

/* Per-channel bias: one 16-bit value replicated across a Q register. */
#define bias_r          q3
#define bias_r_lo       d6
#define bias_r_hi       d7
#define bias_g          q4
#define bias_g_lo       d8
#define bias_g_hi       d9
#define bias_b          q5
#define bias_b_lo       d10
#define bias_b_hi       d11

/* Source samples. */
#define y_even          d24
#define y_odd           d26
/* u and v are the two halves of q8 (b_delta).  b_delta is the last chroma
   term to be computed, so both inputs have been consumed by then. */
#define u               d16
#define v               d17

/* Chroma contribution, shared by every pixel that uses the same U/V pair. */
#define r_delta         q6
#define g_delta         q7
#define b_delta         q8

/* 16-bit intermediates for the 8 pixels currently being converted. */
#define red             q9
#define grn             q10
#define blu             q11
#define y_scale         q15

/* 8-bit results, allocated so that two VST4 stores of consecutive D
   registers (d24-d27, then d28-d31) emit interleaved pixels. */
#define blu8_e          d24     /* reuses y_even once it has been scaled */
#define red8_e          d25
#define blu8_o          d26     /* reuses y_odd once it has been scaled */
#define red8_o          d27
#define grn8_e          d28
#define alp8_e          d29
#define grn8_o          d30     /* low half of q15 (y_scale), written after its last use */
#define alp8_o          d31     /* high half of q15 (y_scale), written after its last use */

/* Core registers.  r0-r3 carry the first four arguments on entry; the
   remaining values are loaded from the stack by each routine. */
#define rgb_t_ptr       r0      /* destination, top (or only) row */
#define y_t_ptr         r1      /* luma, top (or only) row */
#define u_ptr           r2
#define v_ptr           r3
#define width           r4
#define height          r5
#define y_pitch         r6
#define uv_pitch        r7
#define rgb_pitch       r8
#define count           r9      /* pixels to advance in the current pass */
#define aligned_count   sl      /* pixels still to convert in this row */
#define rgb_b_ptr       fp      /* destination, bottom row (4:2:0 only) */
#define y_b_ptr         ip      /* luma, bottom row (4:2:0 only) */
/* Conversion factors.  The results are shifted right by 6 bits when they
   are narrowed, so each factor is the real coefficient multiplied by 64.
   They fit in 8 bits and are loaded into vectors with VMOV. */
#define C_Y_SCALE       74      /* y -> all channels                          */
#define C_V_RED         102     /* v -> red                                   */
#define C_U_GREEN       25      /* u -> green; subtracted, i.e. effectively -25 */
#define C_V_GREEN       52      /* v -> green; subtracted, i.e. effectively -52 */
#define C_U_BLUE        129     /* u -> blue                                  */

/* 16-bit bias terms.  They fold the -16 luma offset and the -128 chroma
   offsets into a single addend per channel.  Each routine reads the three
   values in this order through a post-incremented pointer. */
    .align 4
coefficients:
    /* bias_r = 74 * (-16) + 102 * (-128) = -1184 - 13056 */
    .short  -14240
    /* bias_g = 74 * (-16) - 25 * (-128) - 52 * (-128) = -1184 + 3200 + 6656 */
    .short  8672
    /* bias_b = 74 * (-16) + 129 * (-128) = -1184 - 16512 */
    .short  -17696
    /* padding */
    .short  0
/*
 * yuv420_2_rgb8888_neon - convert planar YUV 4:2:0 to 32-bit pixels.
 *
 *   void yuv420_2_rgb8888_neon(uint8_t *dst_ptr, const uint8_t *y_ptr,
 *                              const uint8_t *u_ptr, const uint8_t *v_ptr,
 *                              int width, int height, int y_pitch,
 *                              int uv_pitch, int rgb_pitch);
 *
 * In:     r0 = dst_ptr, r1 = y_ptr, r2 = u_ptr, r3 = v_ptr
 *         stack: width, height, y_pitch, uv_pitch, rgb_pitch
 * Out:    every pixel is stored as the bytes B, G, R, 0xFF.
 * Saved:  r4-r12, lr and q4-q7 are pushed on entry and restored on exit.
 *
 * Returns immediately when width < 16 or height < 1.
 *
 * Rows are converted in pairs that share one row of U/V samples.  The
 * vertical loop always converts two rows, so with an odd height the last
 * iteration also reads and writes the row following the image.
 *
 * Each horizontal pass loads and stores 16 pixels but advances the
 * pointers by `count` pixels only.  The first pass uses
 * count = width % 16 (or 16 when that is zero); all later passes use 16
 * and overlap the tail of the first pass, so the row ends exactly at
 * `width`.
 */
#ifndef __APPLE__
yuv420_2_rgb8888_neon:
#else
_yuv420_2_rgb8888_neon:
#endif
#ifndef __APPLE__
    .fnstart
#endif
    push            {r4-r12, lr}                /* 10 words */
    vpush           {q4-q7}                     /* 4 Q registers = 16 words */

    /* The stack arguments start above the 26 words saved above. */
    ldr             width,     [sp, #26*4]
    ldr             height,    [sp, #27*4]
    ldr             y_pitch,   [sp, #28*4]
    ldr             uv_pitch,  [sp, #29*4]
    ldr             rgb_pitch, [sp, #30*4]
    adr             lr, coefficients

    /* A pass always handles 16 pixels, so narrower images cannot be
       converted.  The height is only compared when the width is
       acceptable; either failure leaves the LT condition set. */
    cmp             width, #16
    cmpge           height, #1
    vpoplt          {q4-q7}
    poplt           {r4-r12, pc}

    /* Replicate each 16-bit bias value across a whole Q register. */
    vld1.s16        {bias_r_lo[], bias_r_hi[]}, [lr]!
    vld1.s16        {bias_g_lo[], bias_g_hi[]}, [lr]!
    vld1.s16        {bias_b_lo[], bias_b_hi[]}, [lr]!

    /* Replicate each 8-bit coefficient across a D register. */
    vmov.u8         coef_y,   #C_Y_SCALE
    vmov.u8         coef_v_r, #C_V_RED
    vmov.u8         coef_u_g, #C_U_GREEN
    vmov.u8         coef_v_g, #C_V_GREEN
    vmov.u8         coef_u_b, #C_U_BLUE

loop_v_420:
    /* The bottom row of the pair lies one pitch below the top row. */
    add             y_b_ptr,   y_t_ptr,   y_pitch
    add             rgb_b_ptr, rgb_t_ptr, rgb_pitch
    mov             aligned_count, width

    /* First pass: advance by width % 16 pixels, or by a full 16 when the
       width is already a multiple of 16. */
    ands            count, width, #15
    moveq           count, #16

loop_h_420:
    /*****************************/
    /* COMMON CODE FOR BOTH ROWS */
    /*****************************/

    /* Load 8 U and 8 V samples; each one covers two adjacent pixels. */
    vld1.u8         v, [v_ptr]
    add             v_ptr, v_ptr, count, ASR #1
    vld1.u8         u, [u_ptr]
    add             u_ptr, u_ptr, count, ASR #1

    /* Chroma contribution per channel.  b_delta overwrites the u/v
       registers, so it has to be computed last. */
    vmull.u8        r_delta, v, coef_v_r
    vmull.u8        g_delta, u, coef_u_g
    vmlal.u8        g_delta, v, coef_v_g
    vmull.u8        b_delta, u, coef_u_b

    /* Fold in the bias.  The green chroma terms are subtracted. */
    vadd.s16        r_delta, r_delta, bias_r
    vsub.s16        g_delta, bias_g, g_delta
    vadd.s16        b_delta, b_delta, bias_b

    /* Hint the next U/V samples into the cache. */
    pld             [v_ptr]
    pld             [u_ptr]

    /***********/
    /* TOP ROW */
    /***********/

    /* Load 16 luma samples split into even and odd pixels, so that lane
       i of both vectors pairs with chroma sample i. */
    vld2.u8         {y_even, y_odd}, [y_t_ptr], count

    /* Even pixels: scale the luma and add the chroma terms. */
    vmull.u8        y_scale, y_even, coef_y
    vqadd.s16       red, y_scale, r_delta
    vqadd.s16       grn, y_scale, g_delta
    vqadd.s16       blu, y_scale, b_delta
    vmov.u8         alp8_e, #0xFF
    /* Round, drop the 6 fraction bits and saturate to 0..255. */
    vqrshrun.s16    red8_e, red, #6
    vqrshrun.s16    grn8_e, grn, #6
    vqrshrun.s16    blu8_e, blu, #6

    /* Hint the next luma samples into the cache. */
    pld             [y_t_ptr]

    /* Pair the bytes up as (R,A) and (B,G) half-pixels. */
    vzip.u8         red8_e, alp8_e
    vzip.u8         blu8_e, grn8_e

    /* Odd pixels: same steps. */
    vmull.u8        y_scale, y_odd, coef_y
    vqadd.s16       red, y_scale, r_delta
    vqadd.s16       grn, y_scale, g_delta
    vqadd.s16       blu, y_scale, b_delta
    vmov.u8         alp8_o, #0xFF
    vqrshrun.s16    red8_o, red, #6
    vqrshrun.s16    blu8_o, blu, #6
    vqrshrun.s16    grn8_o, grn, #6
    vzip.u8         red8_o, alp8_o
    vzip.u8         blu8_o, grn8_o

    /* Store 16 pixels, interleaving even and odd.  The first store
       covers pixels 0-7 and the second pixels 8-15, so the second one
       must start exactly 32 bytes after the first regardless of `count`.
       Afterwards the pointer is moved to 4 * count bytes past where this
       pass started. */
    vst4.u16        {blu8_e, red8_e, blu8_o, red8_o}, [rgb_t_ptr]!
    vst4.u16        {grn8_e, alp8_e, grn8_o, alp8_o}, [rgb_t_ptr]
    add             rgb_t_ptr, rgb_t_ptr, count, LSL #2
    sub             rgb_t_ptr, rgb_t_ptr, #32

    /**************/
    /* BOTTOM ROW */
    /**************/

    /* Same conversion with the second luma row and the same chroma. */
    vld2.u8         {y_even, y_odd}, [y_b_ptr], count

    /* Even pixels. */
    vmull.u8        y_scale, y_even, coef_y
    vqadd.s16       red, y_scale, r_delta
    vqadd.s16       grn, y_scale, g_delta
    vqadd.s16       blu, y_scale, b_delta
    vmov.u8         alp8_e, #0xFF
    vqrshrun.s16    red8_e, red, #6
    vqrshrun.s16    blu8_e, blu, #6
    vqrshrun.s16    grn8_e, grn, #6

    pld             [y_b_ptr]

    vzip.u8         red8_e, alp8_e
    vzip.u8         blu8_e, grn8_e

    /* Odd pixels. */
    vmull.u8        y_scale, y_odd, coef_y
    vqadd.s16       red, y_scale, r_delta
    vqadd.s16       grn, y_scale, g_delta
    vqadd.s16       blu, y_scale, b_delta
    vmov.u8         alp8_o, #0xFF
    vqrshrun.s16    red8_o, red, #6
    vqrshrun.s16    blu8_o, blu, #6
    vqrshrun.s16    grn8_o, grn, #6
    vzip.u8         red8_o, alp8_o
    vzip.u8         blu8_o, grn8_o

    /* Count down the pixels left in this row.  The flags set here are
       consumed by the bgt below; nothing in between modifies them. */
    subs            aligned_count, aligned_count, count

    /* Store the bottom row; same addressing as for the top row. */
    vst4.u16        {blu8_e, red8_e, blu8_o, red8_o}, [rgb_b_ptr]!
    vst4.u16        {grn8_e, alp8_e, grn8_o, alp8_o}, [rgb_b_ptr]
    add             rgb_b_ptr, rgb_b_ptr, count, LSL #2
    sub             rgb_b_ptr, rgb_b_ptr, #32

    /* Every pass after the first advances by a full 16 pixels. */
    mov             count, #16
    bgt             loop_h_420

    /* Rewind to the start of the row pair, then step down two luma and
       destination rows and one chroma row. */
    sub             rgb_t_ptr, rgb_t_ptr, width, LSL #2
    sub             y_t_ptr,   y_t_ptr,   width
    sub             u_ptr,     u_ptr,     width, ASR #1
    sub             v_ptr,     v_ptr,     width, ASR #1
    add             rgb_t_ptr, rgb_t_ptr, rgb_pitch, LSL #1
    add             y_t_ptr,   y_t_ptr,   y_pitch, LSL #1
    add             u_ptr,     u_ptr,     uv_pitch
    add             v_ptr,     v_ptr,     uv_pitch

    /* Two rows done; continue while any remain. */
    subs            height, height, #2
    bgt             loop_v_420

    vpop            {q4-q7}
    pop             {r4-r12, pc}
#ifndef __APPLE__
    .fnend
#endif
/*
 * yuv422_2_rgb8888_neon - convert planar YUV 4:2:2 to 32-bit pixels.
 *
 *   void yuv422_2_rgb8888_neon(uint8_t *dst_ptr, const uint8_t *y_ptr,
 *                              const uint8_t *u_ptr, const uint8_t *v_ptr,
 *                              int width, int height, int y_pitch,
 *                              int uv_pitch, int rgb_pitch);
 *
 * In:     r0 = dst_ptr, r1 = y_ptr, r2 = u_ptr, r3 = v_ptr
 *         stack: width, height, y_pitch, uv_pitch, rgb_pitch
 * Out:    every pixel is stored as the bytes B, G, R, 0xFF.
 * Saved:  r4-r12, lr and q4-q7 are pushed on entry and restored on exit.
 *
 * Returns immediately when width < 16 or height < 1.
 *
 * Much the same as the 4:2:0 routine, but simplified to work on a single
 * row at a time: each U and V value covers 2 adjacent pixels on one row,
 * not a 2x2 block.
 *
 * Each horizontal pass loads and stores 16 pixels but advances the
 * pointers by `count` pixels only.  The first pass uses
 * count = width % 16 (or 16 when that is zero); all later passes use 16
 * and overlap the tail of the first pass, so the row ends exactly at
 * `width`.
 */
#define rgb_ptr         rgb_t_ptr
#define y_ptr           y_t_ptr
#ifndef __APPLE__
yuv422_2_rgb8888_neon:
#else
_yuv422_2_rgb8888_neon:
#endif
#ifndef __APPLE__
    .fnstart
#endif
    push            {r4-r12, lr}                /* 10 words */
    vpush           {q4-q7}                     /* 4 Q registers = 16 words */

    /* The stack arguments start above the 26 words saved above. */
    ldr             width,     [sp, #26*4]
    ldr             height,    [sp, #27*4]
    ldr             y_pitch,   [sp, #28*4]
    ldr             uv_pitch,  [sp, #29*4]
    ldr             rgb_pitch, [sp, #30*4]
    adr             lr, coefficients

    /* A pass always handles 16 pixels, so narrower images cannot be
       converted.  The height is only compared when the width is
       acceptable; either failure leaves the LT condition set. */
    cmp             width, #16
    cmpge           height, #1
    vpoplt          {q4-q7}
    poplt           {r4-r12, pc}

    /* Replicate each 16-bit bias value across a whole Q register. */
    vld1.s16        {bias_r_lo[], bias_r_hi[]}, [lr]!
    vld1.s16        {bias_g_lo[], bias_g_hi[]}, [lr]!
    vld1.s16        {bias_b_lo[], bias_b_hi[]}, [lr]!

    /* Replicate each 8-bit coefficient across a D register. */
    vmov.u8         coef_y,   #C_Y_SCALE
    vmov.u8         coef_v_r, #C_V_RED
    vmov.u8         coef_u_g, #C_U_GREEN
    vmov.u8         coef_v_g, #C_V_GREEN
    vmov.u8         coef_u_b, #C_U_BLUE

loop_v_422:
    mov             aligned_count, width

    /* First pass: advance by width % 16 pixels, or by a full 16 when the
       width is already a multiple of 16. */
    ands            count, width, #15
    moveq           count, #16

loop_h_422:
    /* Load 8 U and 8 V samples; each one covers two adjacent pixels. */
    vld1.u8         v, [v_ptr]
    add             v_ptr, v_ptr, count, ASR #1
    vld1.u8         u, [u_ptr]
    add             u_ptr, u_ptr, count, ASR #1

    /* Chroma contribution per channel.  b_delta overwrites the u/v
       registers, so it has to be computed last. */
    vmull.u8        r_delta, v, coef_v_r
    vmull.u8        g_delta, u, coef_u_g
    vmlal.u8        g_delta, v, coef_v_g
    vmull.u8        b_delta, u, coef_u_b

    /* Hint the next U/V samples into the cache. */
    pld             [v_ptr]
    pld             [u_ptr]

    /* Load 16 luma samples split into even and odd pixels, so that lane
       i of both vectors pairs with chroma sample i. */
    vld2.u8         {y_even, y_odd}, [y_ptr], count

    /* Fold in the bias.  The green chroma terms are subtracted. */
    vadd.s16        r_delta, r_delta, bias_r
    vsub.s16        g_delta, bias_g, g_delta
    vadd.s16        b_delta, b_delta, bias_b

    /* Even pixels: scale the luma and add the chroma terms. */
    vmull.u8        y_scale, y_even, coef_y
    vqadd.s16       red, y_scale, r_delta
    vqadd.s16       grn, y_scale, g_delta
    vqadd.s16       blu, y_scale, b_delta
    vmov.u8         alp8_e, #0xFF

    /* Hint the next luma samples into the cache. */
    pld             [y_ptr]

    /* Round, drop the 6 fraction bits and saturate to 0..255. */
    vqrshrun.s16    red8_e, red, #6
    vqrshrun.s16    grn8_e, grn, #6
    vqrshrun.s16    blu8_e, blu, #6

    /* Pair the bytes up as (R,A) and (B,G) half-pixels. */
    vzip.u8         red8_e, alp8_e
    vzip.u8         blu8_e, grn8_e

    /* Odd pixels: same steps. */
    vmull.u8        y_scale, y_odd, coef_y
    vqadd.s16       red, y_scale, r_delta
    vqadd.s16       grn, y_scale, g_delta
    vqadd.s16       blu, y_scale, b_delta
    vmov.u8         alp8_o, #0xFF
    vqrshrun.s16    red8_o, red, #6
    vqrshrun.s16    blu8_o, blu, #6
    vqrshrun.s16    grn8_o, grn, #6
    vzip.u8         red8_o, alp8_o
    vzip.u8         blu8_o, grn8_o

    /* Count down the pixels left in this row.  The flags set here are
       consumed by the bgt below; nothing in between modifies them. */
    subs            aligned_count, aligned_count, count

    /* Store 16 pixels, interleaving even and odd.  The first store
       covers pixels 0-7 and the second pixels 8-15, so the second one
       must start exactly 32 bytes after the first regardless of `count`.
       Afterwards the pointer is moved to 4 * count bytes past where this
       pass started. */
    vst4.u16        {blu8_e, red8_e, blu8_o, red8_o}, [rgb_ptr]!
    vst4.u16        {grn8_e, alp8_e, grn8_o, alp8_o}, [rgb_ptr]
    add             rgb_ptr, rgb_ptr, count, LSL #2
    sub             rgb_ptr, rgb_ptr, #32

    /* Every pass after the first advances by a full 16 pixels. */
    mov             count, #16
    bgt             loop_h_422

    /* Rewind to the start of the row, then step every plane down by one
       row. */
    sub             rgb_ptr, rgb_ptr, width, LSL #2
    sub             y_ptr,   y_ptr,   width
    sub             u_ptr,   u_ptr,   width, ASR #1
    sub             v_ptr,   v_ptr,   width, ASR #1
    add             rgb_ptr, rgb_ptr, rgb_pitch
    add             y_ptr,   y_ptr,   y_pitch
    add             u_ptr,   u_ptr,   uv_pitch
    add             v_ptr,   v_ptr,   uv_pitch

    /* One row done; continue while any remain. */
    subs            height, height, #1
    bgt             loop_v_422

    vpop            {q4-q7}
    pop             {r4-r12, pc}
#ifndef __APPLE__
    .fnend
#endif
#endif /* __ARM_NEON__ */
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment