Skip to content

Instantly share code, notes, and snippets.

@AllanChen
Created March 16, 2020 12:19
Show Gist options
  • Save AllanChen/c0cb96bf46d5959e0d2d274ed484cc83 to your computer and use it in GitHub Desktop.
Save AllanChen/c0cb96bf46d5959e0d2d274ed484cc83 to your computer and use it in GitHub Desktop.
yuv test
//
// ViewCtrl_CPUEasyImageProcessing.m
// demo_ios
//
// Created by yyuser on 2018/10/19.
//
#import "ViewCtrl_CPUEasyImageProcessing.h"
#import "vn_core.h"
#import "vn_yuvconverter.h"
#import "VN_Tensor.h"
// Private class extension: state used to display converted camera frames.
@interface ViewCtrl_CPUEasyImageProcessing ()
// Image view whose image is replaced on the main queue by -videoCaptureCallback:.
@property (nonatomic, strong) UIImageView *imgView;
// Raw C++ pointer (not managed by ARC): created in -videoCaptureCallback:, deleted in -dealloc.
@property (nonatomic, assign) Venus::VenusYUVToolKit *yuvConverter;
@end
@implementation ViewCtrl_CPUEasyImageProcessing
- (void)dealloc {
    // The converter is a plain C++ object that ARC does not manage, so it is released by hand.
    if (_yuvConverter != nullptr) {
        delete _yuvConverter;
    }
}
- (void)loadView {
    [super loadView];

    // Accessing the property triggers the lazy getter, which builds the image view on first use.
    UIView *container = self.view;
    [container addSubview:self.imgView];
}
- (void)viewDidLoad {
    // Start without a converter; -videoCaptureCallback: builds one when it finds none.
    self->_yuvConverter = nullptr;

    [super viewDidLoad];
}
/// Lazily creates the image view that shows converted frames.
///
/// The getter's return type now matches the declared property type (UIImageView *);
/// the original returned UIView *, which conflicts with the synthesized accessor.
/// The view starts with the bundled placeholder "UIResources/icon.png".
- (UIImageView *)imgView {
    if (!_imgView) {
        NSString *resourceDir = [[[NSBundle mainBundle] bundlePath] stringByAppendingPathComponent:@"UIResources"];
        NSString *iconPath = [resourceDir stringByAppendingPathComponent:@"icon.png"];
        _imgView = [[UIImageView alloc] initWithImage:[UIImage imageWithContentsOfFile:iconPath]];

        // Layout: 1/9 margin left, 2/16 margin top, 7/9 x 12/16 of the logical screen,
        // centred inside the actual screen.
        [_imgView setFrame:CGRectMake(SCREEN_WIDTH * 1 / 9.0 + (ACTUAL_SCREEN_WIDTH - SCREEN_WIDTH) / 2,
                                      SCREEN_HEIGHT * 2 / 16.0 + (ACTUAL_SCREEN_HEIGHT - SCREEN_HEIGHT) / 2,
                                      SCREEN_WIDTH * 7.0 / 9.0,
                                      SCREEN_HEIGHT * 12.0 / 16.0)];

        _imgView.layer.cornerRadius = 8;
        _imgView.layer.masksToBounds = YES;
        _imgView.contentMode = UIViewContentModeScaleAspectFit;
    }
    return _imgView;
}
/// Builds a UIImage from tightly packed 4-byte-per-pixel data.
///
/// @param imH     Image height in pixels.
/// @param imW     Image width in pixels; each source row is imW * 4 bytes.
/// @param data_u8 Source pixels. If NULL, nothing is copied and the context content is returned as is.
/// @return The image taken from the current UIGraphics context (its width is imW rounded up to a
///         multiple of 4), or nil when the dimensions are not positive.
///
/// NOTE(review): the bytes are copied straight into the bitmap context's backing store, so this
/// assumes the context created by UIGraphicsBeginImageContext is a 32-bit BGRA-compatible bitmap
/// — confirm on the target OS versions.
-(UIImage *)getUIImage_With_Height:(int)imH
                             Width:(int)imW
                        BGRADataU8:(u_char *)data_u8 {
    if (imH <= 0 || imW <= 0) {
        return nil;
    }

    const int imW_align = (imW + 3) / 4 * 4;
    const size_t srcStride = (size_t)imW * 4;

    UIGraphicsBeginImageContext(CGSizeMake(imW_align, imH));
    CGContextRef ctx = UIGraphicsGetCurrentContext();
    u_char *dstRow = (ctx != NULL) ? (u_char *)CGBitmapContextGetData(ctx) : NULL;
    const u_char *srcRow = data_u8;

    if (dstRow != NULL && srcRow != NULL) {
        // Ask the context for its real row pitch instead of assuming imW_align * 4:
        // Core Graphics may pad rows, and a wrong pitch shears the image or overruns the buffer.
        const size_t dstStride = CGBitmapContextGetBytesPerRow(ctx);
        const size_t rowBytes = MIN(srcStride, dstStride);
        const size_t rowCount = MIN((size_t)imH, CGBitmapContextGetHeight(ctx));
        for (size_t row = 0; row < rowCount; row++) {
            memcpy(dstRow, srcRow, rowBytes);
            dstRow += dstStride;
            srcRow += srcStride;
        }
    }

    UIImage *img = UIGraphicsGetImageFromCurrentImageContext();
    UIGraphicsEndImageContext();
    return img;
}
/// Converts one captured frame to packed BGRA and shows it in the image view.
///
/// Non-planar buffers are treated as BGRA8888 and displayed as they are; planar buffers are
/// treated as bi-planar NV12 (Y plane + interleaved UV plane) and converted with convertToRGBA().
///
/// Fixes relative to the previous version:
///  - frame copies are allocated with calloc() so the matching free() is valid (was new[] + free());
///  - the BGRA copy honours the buffer's bytes-per-row padding;
///  - the conversion uses the real frame size instead of a hard-coded 720x1280;
///  - the converter output (packed B,G,R,A bytes) is displayed directly instead of being re-read
///    as planar float data;
///  - NULL buffers / failed allocations abort the frame instead of crashing.
///
/// @param pixelBuffer Frame delivered by the capture pipeline. Ignored when NULL.
- (void)videoCaptureCallback:(CVPixelBufferRef)pixelBuffer
{
    if (pixelBuffer == NULL) {
        return;
    }

    VN_Image input;
    input.ori_fmt = VN_ORIENT_FMT_DEFAULT;
    input.data = NULL;

    // ---- 1. Copy the frame into a tightly packed buffer while the pixel buffer is locked. ----
    CVPixelBufferLockBaseAddress(pixelBuffer, 0);
    const int iHeight = (int)CVPixelBufferGetHeight(pixelBuffer);
    const int iWidth = (int)CVPixelBufferGetWidth(pixelBuffer);
    const BOOL isPlanar = (CVPixelBufferGetPlaneCount(pixelBuffer) != 0);

    if (iWidth > 0 && iHeight > 0) {
        if (!isPlanar) {
            // Packed input, assumed to be BGRA8888 (4 bytes per pixel).
            const size_t srcStride = CVPixelBufferGetBytesPerRow(pixelBuffer);
            const size_t rowBytes = (size_t)iWidth * 4;
            const uint8_t *src = (const uint8_t *)CVPixelBufferGetBaseAddress(pixelBuffer);
            unsigned char *ptr_indata = NULL;
            if (src != NULL) {
                ptr_indata = (unsigned char *)calloc(rowBytes * (size_t)iHeight, sizeof(unsigned char));
            }
            if (ptr_indata != NULL) {
                // Rows may be padded, so copy row by row rather than in one memcpy.
                const size_t copyBytes = MIN(rowBytes, srcStride);
                for (int r = 0; r < iHeight; r++) {
                    memcpy(ptr_indata + (size_t)r * rowBytes, src + (size_t)r * srcStride, copyBytes);
                }
            }
            input.width = iWidth;
            input.height = iHeight;
            input.channels = 4;
            input.pix_fmt = VN_PIX_FMT_BGRA8888;
            input.data = ptr_indata;
        }
        else {
            // Planar input, assumed to be NV12: plane 0 = Y, plane 1 = interleaved UV.
            const size_t yStride = CVPixelBufferGetBytesPerRowOfPlane(pixelBuffer, 0);
            const size_t uvStride = CVPixelBufferGetBytesPerRowOfPlane(pixelBuffer, 1);
            const uint8_t *srcY = (const uint8_t *)CVPixelBufferGetBaseAddressOfPlane(pixelBuffer, 0);
            const uint8_t *srcUV = (const uint8_t *)CVPixelBufferGetBaseAddressOfPlane(pixelBuffer, 1);
            const size_t lumaBytes = (size_t)iWidth * (size_t)iHeight;
            // Round the chroma row count up so convertToRGBA() never reads past the buffer
            // when the height is odd.
            const size_t chromaRows = ((size_t)iHeight + 1) / 2;
            unsigned char *ptr_indata = NULL;
            if (srcY != NULL && srcUV != NULL) {
                ptr_indata = (unsigned char *)calloc(lumaBytes + (size_t)iWidth * chromaRows,
                                                     sizeof(unsigned char));
            }
            if (ptr_indata != NULL) {
                const size_t yCopy = MIN((size_t)iWidth, yStride);
                for (int r = 0; r < iHeight; r++) {
                    memcpy(ptr_indata + (size_t)r * iWidth, srcY + (size_t)r * yStride, yCopy);
                }

                const size_t uvCopy = MIN((size_t)iWidth, uvStride);
                const size_t uvRows = MIN(chromaRows, CVPixelBufferGetHeightOfPlane(pixelBuffer, 1));
                unsigned char *dstUV = ptr_indata + lumaBytes;
                for (size_t r = 0; r < uvRows; r++) {
                    memcpy(dstUV + r * (size_t)iWidth, srcUV + r * uvStride, uvCopy);
                }
            }
            input.width = iWidth;
            input.height = iHeight;
            input.channels = 0;
            input.pix_fmt = VN_PIX_FMT_YUV420F;
            input.data = ptr_indata;
        }
    }
    CVPixelBufferUnlockBaseAddress(pixelBuffer, 0);

    if (input.data == NULL) {
        // Empty frame, unreadable planes or allocation failure: nothing to show.
        return;
    }

    // ---- 2. Describe the output. convertToRGBA() does not scale, so it matches the input size. ----
    VN_Image outCvt;
    outCvt.channels = 4;
    outCvt.width = input.width;
    outCvt.height = input.height;
    outCvt.pix_fmt = VN_PIX_FMT_RGB888;
    outCvt.ori_fmt = VN_ORIENT_FMT_DEFAULT;
    outCvt.data = NULL;
    if (isPlanar) {
        // 4 bytes per pixel: convertToRGBA() writes packed B,G,R,A bytes.
        outCvt.data = calloc((size_t)outCvt.width * (size_t)outCvt.height * 4, sizeof(u_char));
        if (outCvt.data == NULL) {
            free(input.data);
            return;
        }
    }

    // ---- 3. (Re)create the Venus converter whenever the frame geometry changes. ----
    // NOTE(review): the converter is configured here but the active conversion path below is
    // convertToRGBA(); it is kept so the Venus path can be re-enabled for comparison.
    if ((!_yuvConverter) ||
        (_yuvConverter->_height_Y != input.height || _yuvConverter->_width_Y != input.width) ||
        (_yuvConverter->_height_RGB != outCvt.height || _yuvConverter->_width_RGB != outCvt.width)
        ) {
        if (_yuvConverter) {
            delete _yuvConverter;
            _yuvConverter = nullptr;
        }
        _yuvConverter = new Venus::VenusYUVToolKit(
                                                   input.width,
                                                   input.height,
                                                   input.width / 2,
                                                   input.height / 2,
                                                   outCvt.width,
                                                   outCvt.height
                                                   );
        _yuvConverter->setCvtMat(
                                 Venus::ConvertMatrixFromYUV(
                                                             1.0f, 0.0f, 1.57481f,
                                                             1.0f, -0.18732f, -0.46813f,
                                                             1.0f, 1.8556f, 0.0f,
                                                             -201.57568f,
                                                             83.897598f,
                                                             -237.5168f
                                                             )
                                 );
    }

    // ---- 4. Convert (NV12 only) and time it. ----
    // Alternatives that were tried here: _yuvConverter->convert(input, outCvt),
    // Venus::VenusYUVToolKit::Convert(input, outCvt, <same matrix>) and
    // nv12_to_rgb_fast_asm_ios(). They do not produce packed BGRA, so the display step
    // below would have to change if one of them is re-enabled.
    u_char *ptr_u8 = NULL;
    double tic = CACurrentMediaTime();
    if (isPlanar) {
        convertToRGBA((unsigned char *)input.data, input.width, input.height, (int *)outCvt.data);
        ptr_u8 = (u_char *)outCvt.data;
    } else {
        // The frame is already BGRA; running it through the NV12 converter would scramble it.
        ptr_u8 = (u_char *)input.data;
    }
    double toc = CACurrentMediaTime();
    LOGV("yuv Convert cost %f ms", 1000 * (toc - tic));

    // ---- 5. Wrap the BGRA bytes in a UIImage and publish it on the main queue. ----
    UIImage *img = [self getUIImage_With_Height:outCvt.height Width:outCvt.width BGRADataU8:ptr_u8];

    free(input.data);
    free(outCvt.data);

    dispatch_async(dispatch_get_main_queue(), ^{
        self->_imgView.image = img;
    });
}
// Clamps an intermediate colour value to the 0..255 range of one output byte.
static inline unsigned char nv12_saturate_u8(int value)
{
    if (value < 0) {
        return 0;
    }
    if (value > 255) {
        return 255;
    }
    return static_cast<unsigned char>(value);
}

// Converts an NV12 image (Y plane of w*h bytes followed by an interleaved U,V plane)
// to packed 3-byte-per-pixel RGB.
//
//   yuv420sp  source buffer: w*h luma bytes, then one row of w interleaved U,V bytes per
//             two image rows
//   w, h      image size in pixels; rows are processed in pairs and pixels in pairs, so both
//             are expected to be even
//   rgb       destination buffer, w*h*3 bytes, written as R,G,B per pixel
//
// Fixed-point formula (6 fractional bits), shared by the NEON and scalar paths:
//   R = ((Y << 6) + 90 * (V-128)) >> 6
//   G = ((Y << 6) - 46 * (V-128) - 22 * (U-128)) >> 6
//   B = ((Y << 6) + 113 * (U-128)) >> 6
//
// With NEON, 8 pixels of two rows are converted per assembly iteration; whatever is left in the
// row goes through the scalar loop. The scalar loop now saturates to 0..255 exactly like the
// NEON narrowing instructions (sqshrun / vqshrun) do; it previously truncated, so out-of-range
// values wrapped around.
void nv12_to_rgb_fast_asm_ios(const unsigned char* yuv420sp, int w, int h, unsigned char* rgb){
    const unsigned char* yptr = yuv420sp;
    // Running pointer into the UV plane; each pair of image rows consumes w bytes of it.
    const unsigned char* uvptr = yuv420sp + w * h;
#if __ARM_NEON
    int8x8_t _v128 = vdup_n_s8(128);
    int8x8_t _v90 = vdup_n_s8(90);
    int8x8_t _v46 = vdup_n_s8(46);
    int8x8_t _v22 = vdup_n_s8(22);
    int8x8_t _v113 = vdup_n_s8(113);
#endif
    for (int y=0; y<h; y+=2)
    {
        // Two luma rows share one chroma row.
        const unsigned char* yptr0 = yptr;
        const unsigned char* yptr1 = yptr + w;
        unsigned char* rgb0 = rgb;
        unsigned char* rgb1 = rgb + w*3;
#if __ARM_NEON
        int nn = w >> 3;              // number of 8-pixel groups handled by the assembly loop
        int remain = w - (nn << 3);   // pixels left for the scalar loop
#else
        int remain = w;
#endif // __ARM_NEON
#if __ARM_NEON
#if __aarch64__
        // Intrinsics version of the loop below. Tested; no problems so far.
        // for (; nn>0; nn--)
        // {
        // int16x8_t _yy0 = vreinterpretq_s16_u16(vshll_n_u8(vld1_u8(yptr0), 6));
        // int16x8_t _yy1 = vreinterpretq_s16_u16(vshll_n_u8(vld1_u8(yptr1), 6));
        //
        // int8x8_t _uuvv = vsub_s8(vreinterpret_s8_u8(vld1_u8(uvptr)), _v128); //uv - 128
        // int8x8x2_t _uuuuvvvv = vtrn_s8(_uuvv, _uuvv);
        // int8x8_t _uu = _uuuuvvvv.val[0];
        // int8x8_t _vv = _uuuuvvvv.val[1];
        //
        // int16x8_t _r0 = vmlal_s8(_yy0, _vv, _v90);
        // int16x8_t _g0 = vmlsl_s8(_yy0, _vv, _v46);
        // _g0 = vmlsl_s8(_g0, _uu, _v22);
        // int16x8_t _b0 = vmlal_s8(_yy0, _uu, _v113);
        //
        // int16x8_t _r1 = vmlal_s8(_yy1, _vv, _v90);
        // int16x8_t _g1 = vmlsl_s8(_yy1, _vv, _v46);
        // _g1 = vmlsl_s8(_g1, _uu, _v22);
        // int16x8_t _b1 = vmlal_s8(_yy1, _uu, _v113);
        //
        // uint8x8x3_t _rgb0;
        // _rgb0.val[0] = vqshrun_n_s16(_r0, 6);
        // _rgb0.val[1] = vqshrun_n_s16(_g0, 6);
        // _rgb0.val[2] = vqshrun_n_s16(_b0, 6);
        //
        // uint8x8x3_t _rgb1;
        // _rgb1.val[0] = vqshrun_n_s16(_r1, 6);
        // _rgb1.val[1] = vqshrun_n_s16(_g1, 6);
        // _rgb1.val[2] = vqshrun_n_s16(_b1, 6);
        //
        // vst3_u8(rgb0, _rgb0);
        // vst3_u8(rgb1, _rgb1);
        //
        // yptr0 += 8;
        // yptr1 += 8;
        // uvptr += 8;
        // rgb0 += 24;
        // rgb1 += 24;
        // }
        if (nn > 0)
        {
            asm volatile(
                         "0: \n"
                         "ld1 {v2.8b}, [%3], #8 \n" //uv
                         "sub v2.8b, v2.8b, %12.8b \n" //uv - 128
                         "ld1 {v0.8b}, [%1], #8 \n" //yptr----r0
                         "ld1 {v1.8b}, [%2], #8 \n" //yptr----r1
                         "ushll v4.8h, v0.8b, #6 \n" //r0---y<<6
                         "orr v3.8b, v2.8b, v2.8b \n" //copy of uv
                         "ushll v5.8h, v1.8b, #6 \n" //r1---y<<6
                         "orr v9.16b, v4.16b, v4.16b \n" //copy of r0---y<<6
                         "trn1 v14.8b, v2.8b, v3.8b \n" //v14 = u
                         "trn2 v13.8b, v2.8b, v3.8b \n" //v13 = v
                         "orr v11.16b, v5.16b, v5.16b \n" //copy of r1---y<<6
                         "smlsl v9.8h, v13.8b, %14.8b \n" // r0---- (y << 6) - v * 46
                         "orr v8.16b, v4.16b, v4.16b \n" //copy of r0---y<<6
                         "smlsl v11.8h, v13.8b, %14.8b \n" // r1---- (y << 6) - v * 46
                         "orr v10.16b, v5.16b, v5.16b \n" //copy of r1---y<<6
                         "smlal v8.8h, v13.8b, %13.8b \n" //r0--- r = (y<<6) + v * 90
                         "smlal v4.8h, v14.8b, %16.8b \n" //r0--- b = (y<<6) + u * 113
                         "smlal v10.8h, v13.8b, %13.8b \n" //r1--- r = (y<<6) + v * 90
                         "smlsl v9.8h, v14.8b, %15.8b \n" //r0--- g = (y << 6) - v * 46 - u * 22
                         "smlal v5.8h, v14.8b, %16.8b \n" //r1--- b = (y<<6) + u * 113
                         "smlsl v11.8h, v14.8b, %15.8b \n" //r1--- g = (y << 6) - v * 46 - u * 22
                         "sqshrun v15.8b, v8.8h, #6 \n" //r0--- r
                         "sqshrun v17.8b, v4.8h, #6 \n" //r0--- b
                         "sqshrun v18.8b, v10.8h, #6 \n" //r1--- r
                         "sqshrun v16.8b, v9.8h, #6 \n" //r0--- g
                         "sqshrun v20.8b, v5.8h, #6 \n" //r1--- b
                         "sqshrun v19.8b, v11.8h, #6 \n" //r1--- g
                         "subs %w0, %w0, #1 \n"
                         "st3 {v15.8b, v16.8b, v17.8b}, [%4], #24 \n"
                         "st3 {v18.8b, v19.8b, v20.8b}, [%5], #24 \n"
                         "bne 0b \n"
                         : "=r"(nn), // %0
                         "=r"(yptr0), // %1
                         "=r"(yptr1), // %2
                         "=r"(uvptr), // %3
                         "=r"(rgb0), // %4
                         "=r"(rgb1) // %5
                         : "0"(nn),
                         "1"(yptr0),
                         "2"(yptr1),
                         "3"(uvptr),
                         "4"(rgb0),
                         "5"(rgb1),
                         "w"(_v128), // %12
                         "w"(_v90), // %13
                         "w"(_v46), // %14
                         "w"(_v22), // %15
                         "w"(_v113) // %16
                         : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v8", "v9", "v10", "v11", "v12", "v13",
                         "v14", "v15", "v16", "v17", "v18", "v19", "v20"
                         );
        }
#else
        if (nn > 0)
        {
            asm volatile(
                         "0: \n"
                         "pld [%3, #128] \n"
                         "vld1.u8 {d2}, [%3]! \n" //uv
                         "vsub.s8 d2, d2, %12 \n" //uv - 128
                         "pld [%1, #128] \n"
                         "vld1.u8 {d0}, [%1]! \n" //yptr----r0
                         "pld [%2, #128] \n"
                         "vld1.u8 {d1}, [%2]! \n" //yptr----r1
                         "vshll.u8 q2, d0, #6 \n" //r0---y
                         "vorr d3, d2, d2 \n"
                         "vshll.u8 q3, d1, #6 \n" //r1---y
                         "vorr q9, q2, q2 \n" //r0---y
                         "vtrn.s8 d2, d3 \n" //d2 = u, d3 = v
                         "vorr q11, q3, q3 \n" //r1---y
                         "vmlsl.s8 q9, d3, %14 \n" //r0---- y - v * 46
                         "vorr q8, q2, q2 \n" //r0---y
                         "vmlsl.s8 q11, d3, %14 \n" //r1---- y - v * 46
                         "vorr q10, q3, q3 \n" //r1---y
                         "vmlal.s8 q8, d3, %13 \n" //r0----r = y + v * 90
                         "vmlal.s8 q2, d2, %16 \n" //r0----b = y + u * 113
                         "vmlal.s8 q10, d3, %13 \n" //r1----r = y + v * 90
                         "vmlsl.s8 q9, d2, %15 \n" //r0----g = (y - v * 46) - u * 22
                         "vmlal.s8 q3, d2, %16 \n" //r1----b = y + u * 113
                         "vmlsl.s8 q11, d2, %15 \n" //r1----g = (y - v * 46) - u * 22
                         "vqshrun.s16 d24, q8, #6 \n" // r0---r
                         "vqshrun.s16 d26, q2, #6 \n" // r0---b
                         "vqshrun.s16 d4, q10, #6 \n" // r1---r
                         "vqshrun.s16 d25, q9, #6 \n" // r0---g
                         "vqshrun.s16 d6, q3, #6 \n" // r1---b
                         "vqshrun.s16 d5, q11, #6 \n" // r1---g
                         "subs %0, #1 \n"
                         "vst3.u8 {d24-d26}, [%4]! \n"
                         //"vsub.s8 d2, d2, %12 \n"
                         "vst3.u8 {d4-d6}, [%5]! \n"
                         "bne 0b \n"
                         : "=r"(nn), // %0
                         "=r"(yptr0), // %1
                         "=r"(yptr1), // %2
                         "=r"(uvptr), // %3
                         "=r"(rgb0), // %4
                         "=r"(rgb1) // %5
                         : "0"(nn),
                         "1"(yptr0),
                         "2"(yptr1),
                         "3"(uvptr),
                         "4"(rgb0),
                         "5"(rgb1),
                         "w"(_v128), // %12
                         "w"(_v90), // %13
                         "w"(_v46), // %14
                         "w"(_v22), // %15
                         "w"(_v113) // %16
                         : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "q12", "d26"
                         );
        }
#endif // __aarch64__
#endif // __ARM_NEON
        // Scalar path: handles a 2x2 pixel quad (one U,V pair) per iteration.
        for (; remain>0; remain = remain - 2)
        {
            // Derivation of the integer coefficients:
            // R = Y + (1.370705 * (V-128))
            // G = Y - (0.698001 * (V-128)) - (0.337633 * (U-128))
            // B = Y + (1.732446 * (U-128))
            // scaled by 64 and rounded up:
            // R = ((Y << 6) + 90 * (V-128)) >> 6
            // G = ((Y << 6) - 46 * (V-128) - 22 * (U-128)) >> 6
            // B = ((Y << 6) + 113 * (U-128)) >> 6
            int u = uvptr[0] - 128;
            int v = uvptr[1] - 128;
            int ruv = 90 * v;
            int guv = -46 * v + -22 * u;
            int buv = 113 * u;

            int y00 = yptr0[0] << 6;
            rgb0[0] = nv12_saturate_u8((y00 + ruv) >> 6);
            rgb0[1] = nv12_saturate_u8((y00 + guv) >> 6);
            rgb0[2] = nv12_saturate_u8((y00 + buv) >> 6);

            int y01 = yptr0[1] << 6;
            rgb0[3] = nv12_saturate_u8((y01 + ruv) >> 6);
            rgb0[4] = nv12_saturate_u8((y01 + guv) >> 6);
            rgb0[5] = nv12_saturate_u8((y01 + buv) >> 6);

            int y10 = yptr1[0] << 6;
            rgb1[0] = nv12_saturate_u8((y10 + ruv) >> 6);
            rgb1[1] = nv12_saturate_u8((y10 + guv) >> 6);
            rgb1[2] = nv12_saturate_u8((y10 + buv) >> 6);

            int y11 = yptr1[1] << 6;
            rgb1[3] = nv12_saturate_u8((y11 + ruv) >> 6);
            rgb1[4] = nv12_saturate_u8((y11 + guv) >> 6);
            rgb1[5] = nv12_saturate_u8((y11 + buv) >> 6);

            yptr0 += 2;
            yptr1 += 2;
            uvptr += 2;
            rgb0 += 6;
            rgb1 += 6;
        }
        // Advance to the next pair of rows.
        yptr += 2*w;
        rgb += 2*3*w;
    }
}
// Clamps an intermediate colour value to the 0..255 range of one output byte.
static inline unsigned char rgba_saturate_u8(int value)
{
    return (unsigned char)(value < 0 ? 0 : (value > 255 ? 255 : value));
}

// Converts one pixel (luma byte plus the raw U and V bytes it shares) and stores it as B,G,R,A.
//   R = Y + 179/128 * (V-128)
//   G = Y - (43 * (U-128) + 91 * (V-128)) / 128
//   B = Y + 227/128 * (U-128)
static inline void rgba_store_pixel(unsigned char* dst, int luma, int rawU, int rawV)
{
    const int u = rawU - 128;
    const int v = rawV - 128;
    dst[0] = rgba_saturate_u8(luma + ((227 * u) >> 7));
    dst[1] = rgba_saturate_u8(luma - ((43 * u + 91 * v) >> 7));
    dst[2] = rgba_saturate_u8(luma + ((179 * v) >> 7));
    dst[3] = 0xFF;
}

// Converts an NV12 image (Y plane of w*h bytes followed by interleaved U,V rows of w bytes,
// one per two image rows) to packed 4-byte pixels stored as B,G,R,A with A = 0xFF.
//
//   yuv   source buffer
//   w, h  image size in pixels
//   rgba  destination, one int (4 bytes) per pixel, w*h ints
//
// When HAS_NEON is defined, 16 pixels per iteration are converted by ARMv7 NEON assembly and the
// remainder of each row goes through the scalar code.
//
// Fix: the scalar green channel subtracted the U term but *added* the V term
// (43*u - 91*v inside the subtraction). Both chroma terms must be subtracted, which is also what
// the NEON path computes.
void convertToRGBA(unsigned char* yuv, int w, int h, int* rgba)
{
    for (int i=0; i<h; ++i)
    {
        unsigned char* dst = (unsigned char*)(rgba + w*i);
        unsigned char* y = yuv + w*i;
        // Rows 2k and 2k+1 share chroma row k.
        unsigned char* uv = yuv + w*h + w*(i/2);
        int count = w;
#ifdef HAS_NEON
        /* Process 16 pixels per iteration. */
        int c = count/16;
        asm volatile(
                     "movs r4, %[c]\t\n"
                     "beq 2f\t\n"
                     "vmov.u8 d7, #255\t\n"//Alpha
                     "vmov.u8 d3, #255\t\n"//Alpha
                     "vmov.s16 q11, #90\t\n"
                     "vmov.s16 q12, #128\t\n"
                     "vmov.s16 q13, #21\t\n"
                     "vmov.s16 q14, #46\t\n"
                     "vmov.s16 q15, #113\t\n"
                     "1:\t\n"
                     /* Y1 and Y2 are the Y components of two interleaved pixel groups (even / odd
                      * pixels); each lane lines up one-to-one with a U,V pair. */
                     "vld2.8 {d8, d9}, [%[y]]!\t\n"//Y1, Y2
                     /* De-interleave the U and V values. */
                     "vld2.8 {d0, d1}, [%[uv]]!\t\n"//u, v
                     "vmovl.u8 q5, d0\t\n"
                     "vmovl.u8 q6, d1\t\n"
                     "vsub.i16 q5,q5, q12\t\n"//U
                     "vsub.i16 q6,q6, q12\t\n"//V
                     //First RGBA
                     "vshll.u8 q7, d8, #6\t\n"
                     "vshll.u8 q8, d8, #6\t\n"
                     "vshll.u8 q9, d8, #6\t\n"
                     "vmla.i16 q7, q6, q11\t\n"
                     "vmls.i16 q8, q5, q13\t\n"
                     "vmls.i16 q8, q6, q14\t\n"
                     "vmla.i16 q9, q5, q15\t\n"
                     "vshr.s16 q7, q7, #6\t\n"
                     "vshr.s16 q8, q8, #6\t\n"
                     "vshr.s16 q9, q9, #6\t\n"
                     "vmov.s16 q10, #0\t\n"
                     "vmax.s16 q7, q7, q10\t\n"
                     "vmax.s16 q8, q8, q10\t\n"
                     "vmax.s16 q9, q9, q10\t\n"
                     "vmov.u16 q10, #255\t\n"
                     "vmin.u16 q7, q7, q10\t\n"
                     "vmin.u16 q8, q8, q10\t\n"
                     "vmin.u16 q9, q9, q10\t\n"
                     "vmovn.s16 d2, q7\t\n"
                     "vmovn.s16 d1, q8\t\n"
                     "vmovn.s16 d0, q9\t\n"
                     //Second RGBA
                     "vshll.u8 q7, d9, #6\t\n"
                     "vshll.u8 q8, d9, #6\t\n"
                     "vshll.u8 q9, d9, #6\t\n"
                     "vmla.i16 q7, q6, q11\t\n"
                     "vmls.i16 q8, q5, q13\t\n"
                     "vmls.i16 q8, q6, q14\t\n"
                     "vmla.i16 q9, q5, q15\t\n"
                     "vshr.s16 q7, q7, #6\t\n"
                     "vshr.s16 q8, q8, #6\t\n"
                     "vshr.s16 q9, q9, #6\t\n"
                     "vmov.s16 q10, #0\t\n"
                     "vmax.s16 q7, q7, q10\t\n"
                     "vmax.s16 q8, q8, q10\t\n"
                     "vmax.s16 q9, q9, q10\t\n"
                     "vmov.u16 q10, #255\t\n"
                     "vmin.u16 q7, q7, q10\t\n"
                     "vmin.u16 q8, q8, q10\t\n"
                     "vmin.u16 q9, q9, q10\t\n"
                     "vmovn.s16 d6, q7\t\n"
                     "vmovn.s16 d5, q8\t\n"
                     "vmovn.s16 d4, q9\t\n"
                     /* At this point the two groups of colour components are interleaved,
                      * for example:
                      * d0 : (g0 g2 g4 g6 g8 g10 g12 g14)
                      * d4 : (g1 g3 g5 g7 g9 g11 g13 g15)
                      * They have to be woven back into pixel order before storing:
                      * d0 :(g0 g1 g2 g3 g4 g5 g6 g7)
                      * d4 :(g8 g9 g10 g11 g12 g13 g14 g15)
                      */
                     "vtrn.8 d2,d6\t\n"
                     "vtrn.16 d2,d6\t\n"
                     "vtrn.32 d2,d6\t\n"
                     "vtrn.8 d1,d5\t\n"
                     "vtrn.16 d1,d5\t\n"
                     "vtrn.32 d1,d5\t\n"
                     "vtrn.8 d0,d4\t\n"
                     "vtrn.16 d0,d4\t\n"
                     "vtrn.32 d0,d4\t\n"
                     "vst4.8 {d0-d3}, [%[dst]]!\t\n"
                     "vst4.8 {d4-d7}, [%[dst]]!\t\n"
                     "subs r4, r4, #1\t\n"
                     "bne 1b\t\n"
                     "2:\t\n"
                     : [dst] "+r" (dst), [y] "+r" (y), [uv] "+r" (uv), [c] "+r" (c)
                     :
                     : "r4", "cc","memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29","d30","d31"
                     );
        count%=16;
#endif
        /* Leftover pixels: two at a time, because each U,V pair covers two pixels. */
        while (count > 1)
        {
            const unsigned char _u = uv[0];
            const unsigned char _v = uv[1];
            rgba_store_pixel(dst, y[0], _u, _v);
            rgba_store_pixel(dst + 4, y[1], _u, _v);
            y += 2;
            uv += 2;
            dst += 8;
            count -= 2;
        }
        /* A single trailing pixel when the remaining count is odd. */
        if (count > 0)
        {
            rgba_store_pixel(dst, y[0], uv[0], uv[1]);
        }
    }
}
@end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment