AllanChen/gist:c0cb96bf46d5959e0d2d274ed484cc83

## gistfile1.txt
//
//  ViewCtrl_CPUEasyImageProcessing.m
//  demo_ios
//
//  Created by yyuser on 2018/10/19.
//

#import "ViewCtrl_CPUEasyImageProcessing.h"
#import "vn_core.h"
#import "vn_yuvconverter.h"
#import "VN_Tensor.h"

@interface ViewCtrl_CPUEasyImageProcessing ()
/// Image view that displays the processed frame; created on first access by its getter.
@property (strong, nonatomic) UIImageView *imgView;
/// Raw C++ converter pointer. It is `assign` because ARC does not manage C++ objects;
/// the controller deletes it by hand in -dealloc.
@property (assign, nonatomic) Venus::VenusYUVToolKit *yuvConverter;
@end

@implementation ViewCtrl_CPUEasyImageProcessing

- (void)dealloc {
    // The converter is a plain C++ object that ARC does not track, so it is destroyed by hand.
    // `delete` on a null pointer is a no-op, so no guard is needed if it was never created.
    Venus::VenusYUVToolKit *converter = _yuvConverter;
    _yuvConverter = nullptr;
    delete converter;
}

- (void)loadView {
    // Let the superclass build the root view first, then attach the lazily created image view.
    [super loadView];
    UIView *rootView = self.view;
    UIView *preview = self.imgView;
    [rootView addSubview:preview];
}

- (void)viewDidLoad {
    [super viewDidLoad];
    // _yuvConverter is deliberately NOT reset here. Instance variables are zero-filled when
    // the object is allocated, so the pointer already starts out null. Overwriting it at this
    // point could only orphan a converter that -videoCaptureCallback: had already created
    // (a leak), because -dealloc would then never see the old pointer.
}

/// Lazily creates the image view on first access and returns the cached instance afterwards.
///
/// The return type now matches the `UIImageView *` property declaration (the getter used to be
/// typed `UIView *`, which disagrees with the property it implements). The view starts out
/// showing `UIResources/icon.png` from the main bundle.
///
/// @return The shared image view; never recreated once built.
- (UIImageView *)imgView {
    if (!_imgView) {
        NSString *resourceDir = [[[NSBundle mainBundle] bundlePath] stringByAppendingPathComponent:@"UIResources"];
        NSString *iconPath = [resourceDir stringByAppendingPathComponent:@"icon.png"];
        _imgView = [[UIImageView alloc] initWithImage:[UIImage imageWithContentsOfFile:iconPath]];

        // Frame is expressed in the project's SCREEN_* / ACTUAL_SCREEN_* macros (defined
        // elsewhere): 1/9 of the width and 2/16 of the height as insets, 7/9 x 12/16 as size,
        // shifted by half of the difference between the "actual" and nominal screen extents.
        _imgView.frame = CGRectMake(SCREEN_WIDTH * 1 / 9.0 + (ACTUAL_SCREEN_WIDTH - SCREEN_WIDTH) / 2,
                                    SCREEN_HEIGHT * 2 / 16.0 + (ACTUAL_SCREEN_HEIGHT - SCREEN_HEIGHT) / 2,
                                    SCREEN_WIDTH * 7.0 / 9.0,
                                    SCREEN_HEIGHT * 12.0 / 16.0);
        _imgView.layer.cornerRadius = 8;
        _imgView.layer.masksToBounds = YES;
        _imgView.contentMode = UIViewContentModeScaleAspectFit;
    }
    return _imgView;
}

/// Builds a UIImage by copying tightly packed 4-byte pixels into a UIKit bitmap context.
///
/// The canvas is `imW` rounded up to a multiple of 4 pixels wide and `imH` pixels high; any
/// padding columns on the right are left as the context created them.
///
/// @param imH     Number of source rows.
/// @param imW     Number of source pixels per row (source stride is exactly `imW * 4` bytes).
/// @param data_u8 Source pixels; when NULL nothing is copied and the untouched canvas is returned.
/// @return The image captured from the context, or nil when the dimensions are not positive.
-(UIImage *)getUIImage_With_Height:(int)imH
                             Width:(int)imW
                        BGRADataU8:(u_char *)data_u8 {
    if (imW <= 0 || imH <= 0) {
        return nil;
    }

    const int imW_align = (imW + 3) / 4 * 4;
    const size_t srcRowBytes = (size_t)imW * 4;

    UIGraphicsBeginImageContext(CGSizeMake(imW_align, imH));
    CGContextRef c = UIGraphicsGetCurrentContext();
    if (c != NULL && data_u8 != NULL) {
        u_char *dstRow = (u_char *)CGBitmapContextGetData(c);
        // Ask the context for its real stride instead of assuming `imW_align * 4`:
        // Core Graphics is free to pad rows, and a wrong stride skews every row after the first.
        const size_t dstRowBytes = CGBitmapContextGetBytesPerRow(c);
        const size_t dstRows = CGBitmapContextGetHeight(c);
        const size_t rows = ((size_t)imH < dstRows) ? (size_t)imH : dstRows;

        // Only copy when a source row fits in a destination row, so the memcpy stays in bounds.
        if (dstRow != NULL && dstRowBytes >= srcRowBytes) {
            const u_char *srcRow = data_u8;
            for (size_t row = 0; row < rows; row++) {
                memcpy(dstRow, srcRow, srcRowBytes);
                dstRow += dstRowBytes;
                srcRow += srcRowBytes;
            }
        }
    }
    UIImage *img = UIGraphicsGetImageFromCurrentImageContext();
    UIGraphicsEndImageContext();
    return img;
}


/// Copies `rowCount` rows of `rowBytes` bytes each from a source whose rows are `srcStride`
/// bytes apart into a tightly packed destination (rows `rowBytes` apart).
static void VNCopyPackedRows(unsigned char *dst,
                             const unsigned char *src,
                             size_t rowBytes,
                             size_t srcStride,
                             int rowCount) {
    for (int r = 0; r < rowCount; r++) {
        memcpy(dst, src, rowBytes);
        dst += rowBytes;
        src += srcStride;
    }
}

/**
 * Converts one captured frame to BGRA and shows it in the image view.
 *
 * A buffer with no planes is treated as packed BGRA and displayed as-is; any other buffer is
 * treated as bi-planar YUV 4:2:0 (Y plane + interleaved chroma plane) and run through
 * convertToRGBA().
 *
 * Fixes over the previous version:
 *  - staging buffers are allocated with malloc/calloc so the free() calls match (they used to
 *    come from new[]);
 *  - the BGRA copy honours the buffer's bytes-per-row instead of assuming packed rows;
 *  - the conversion uses the frame's own dimensions instead of a hard-coded 720x1280, so other
 *    frame sizes no longer read or write out of bounds;
 *  - the packed BGRA bytes written by convertToRGBA() are handed straight to the image builder
 *    instead of being re-read as planar floats, which produced garbage pixels;
 *  - the pixel buffer is locked read-only, since it is only read.
 *
 * @param pixelBuffer Frame delivered by the capture pipeline; NULL is ignored.
 */
- (void)videoCaptureCallback:(CVPixelBufferRef)pixelBuffer
{
    if (pixelBuffer == NULL) {
        return;
    }

    VN_Image input;
    input.ori_fmt = VN_ORIENT_FMT_DEFAULT;
    input.data = NULL;

    const BOOL isPackedBGRA = (CVPixelBufferGetPlaneCount(pixelBuffer) == 0);

    CVPixelBufferLockBaseAddress(pixelBuffer, kCVPixelBufferLock_ReadOnly);
    const int iWidth = (int)CVPixelBufferGetWidth(pixelBuffer);
    const int iHeight = (int)CVPixelBufferGetHeight(pixelBuffer);
    const BOOL hasPixels = (iWidth > 0 && iHeight > 0);
    input.width = iWidth;
    input.height = iHeight;

    if (isPackedBGRA) {
        input.channels = 4;
        input.pix_fmt = VN_PIX_FMT_BGRA8888;

        const size_t packedRowBytes = (size_t)iWidth * 4;
        const size_t srcStride = CVPixelBufferGetBytesPerRow(pixelBuffer);
        const unsigned char *src = (const unsigned char *)CVPixelBufferGetBaseAddress(pixelBuffer);
        if (hasPixels && src != NULL && srcStride >= packedRowBytes) {
            unsigned char *packed = (unsigned char *)malloc(packedRowBytes * (size_t)iHeight);
            if (packed != NULL) {
                // Row-by-row copy drops any per-row padding the buffer carries.
                VNCopyPackedRows(packed, src, packedRowBytes, srcStride, iHeight);
            }
            input.data = packed;
        }
    }
    else {
        input.channels = 0;
        input.pix_fmt = VN_PIX_FMT_YUV420F;

        const size_t rowBytes = (size_t)iWidth;
        const size_t lumaStride = CVPixelBufferGetBytesPerRowOfPlane(pixelBuffer, 0);
        const size_t chromaStride = CVPixelBufferGetBytesPerRowOfPlane(pixelBuffer, 1);
        const unsigned char *lumaSrc = (const unsigned char *)CVPixelBufferGetBaseAddressOfPlane(pixelBuffer, 0);
        const unsigned char *chromaSrc = (const unsigned char *)CVPixelBufferGetBaseAddressOfPlane(pixelBuffer, 1);
        if (hasPixels && lumaSrc != NULL && chromaSrc != NULL &&
            lumaStride >= rowBytes && chromaStride >= rowBytes) {
            const size_t lumaBytes = rowBytes * (size_t)iHeight;
            // Reserve ceil(height / 2) chroma rows: convertToRGBA() reads chroma row i / 2 for
            // every luma row i, so an odd height touches one row more than height / 2.
            const size_t chromaBytes = rowBytes * (size_t)((iHeight + 1) / 2);
            unsigned char *packed = (unsigned char *)calloc(lumaBytes + chromaBytes, 1);
            if (packed != NULL) {
                VNCopyPackedRows(packed, lumaSrc, rowBytes, lumaStride, iHeight);
                VNCopyPackedRows(packed + lumaBytes, chromaSrc, rowBytes, chromaStride, iHeight / 2);
            }
            input.data = packed;
        }
    }
    CVPixelBufferUnlockBaseAddress(pixelBuffer, kCVPixelBufferLock_ReadOnly);

    if (input.data == NULL) {
        // Empty frame, unexpected layout or allocation failure: nothing to show.
        return;
    }

    // The output has the same geometry as the input because convertToRGBA() does not scale.
    VN_Image outCvt; {
        outCvt.channels = 4;
        outCvt.width = input.width;
        outCvt.height = input.height;
        outCvt.pix_fmt = VN_PIX_FMT_BGRA8888;
        outCvt.ori_fmt = VN_ORIENT_FMT_DEFAULT;
        outCvt.data = NULL;
    }

    // NOTE(review): the toolkit converter is still kept in sync with the frame geometry, as
    // before, but the active path below does not call it. Confirm whether it is still needed.
    if ((!_yuvConverter) ||
        (_yuvConverter->_height_Y != input.height || _yuvConverter->_width_Y != input.width) ||
        (_yuvConverter->_height_RGB != outCvt.height || _yuvConverter->_width_RGB != outCvt.width)
        ) {

        if (_yuvConverter) {
            delete _yuvConverter;
            _yuvConverter = nullptr;
        }

        _yuvConverter = new Venus::VenusYUVToolKit(
                                                         input.width,
                                                         input.height,
                                                         input.width / 2,
                                                         input.height / 2,
                                                         outCvt.width,
                                                         outCvt.height
                                                         );
        _yuvConverter->setCvtMat(
                                 Venus::ConvertMatrixFromYUV(
                                                                  1.0f, 0.0f,      1.57481f,
                                                                  1.0f, -0.18732f, -0.46813f,
                                                                  1.0f, 1.8556f,   0.0f,
                                                                  -201.57568f,
                                                                  83.897598f,
                                                                  -237.5168f
                                                                  )
                                 );
    }

    u_char *bgra = NULL;
    double tic = CACurrentMediaTime();
    if (isPackedBGRA) {
        // Already in the layout the image builder expects; no conversion required.
        bgra = (u_char *)input.data;
    }
    else {
        bgra = (u_char *)malloc((size_t)iWidth * (size_t)iHeight * 4);
        if (bgra != NULL) {
            convertToRGBA((unsigned char *)input.data, iWidth, iHeight, (int *)bgra);
        }
    }
    outCvt.data = bgra;
    double toc = CACurrentMediaTime();
    LOGV("yuv Convert cost %f ms", 1000 * (toc - tic));

    UIImage *img = nil;
    if (bgra != NULL) {
        img = [self getUIImage_With_Height:outCvt.height Width:outCvt.width BGRADataU8:bgra];
    }

    // In the BGRA case `bgra` aliases `input.data`, so release it only once.
    if (bgra != input.data) {
        free(bgra);
    }
    free(input.data);

    if (img != nil) {
        // UIKit views may only be touched on the main thread.
        dispatch_async(dispatch_get_main_queue(), ^{
            self->_imgView.image = img;
        });
    }
}

/// Clamps a fixed-point conversion result to the 0...255 range of an 8-bit channel.
static inline unsigned char nv12_saturate_u8(int value) {
    return static_cast<unsigned char>(value < 0 ? 0 : (value > 255 ? 255 : value));
}

/// Writes one R,G,B pixel from a luma value pre-shifted by 6 bits and the three chroma terms.
static inline void nv12_store_rgb(unsigned char* dst, int yy, int ruv, int guv, int buv) {
    dst[0] = nv12_saturate_u8((yy + ruv) >> 6);
    dst[1] = nv12_saturate_u8((yy + guv) >> 6);
    dst[2] = nv12_saturate_u8((yy + buv) >> 6);
}

/**
 * Converts a semi-planar YUV 4:2:0 frame to packed 3-byte R,G,B pixels.
 *
 * Input layout: `w * h` luma bytes followed by interleaved chroma pairs (U first, then V),
 * one pair per 2x2 block of pixels. Output layout: `w * 3` bytes per row, R then G then B.
 *
 * Two rows and two columns are processed per step, so `w` and `h` are expected to be even;
 * odd values make the loops touch one row/column past the image.
 *
 * Fixed-point formula (6 fractional bits):
 *   R = ((Y << 6) + 90 * (V-128)) >> 6
 *   G = ((Y << 6) - 46 * (V-128) - 22 * (U-128)) >> 6
 *   B = ((Y << 6) + 113 * (U-128)) >> 6
 *
 * The NEON paths saturate the result (sqshrun / vqshrun). The scalar path used to truncate
 * with a plain cast, so out-of-range values wrapped around; it now clamps as well, which makes
 * the remainder columns and non-NEON builds agree with the vector code.
 *
 * @param yuv420sp Source frame (Y plane followed by interleaved UV plane).
 * @param w        Frame width in pixels.
 * @param h        Frame height in pixels.
 * @param rgb      Destination buffer of at least `w * h * 3` bytes.
 */
void nv12_to_rgb_fast_asm_ios(const unsigned char* yuv420sp, int w, int h, unsigned char* rgb){

    const unsigned char* yptr = yuv420sp;
    const unsigned char* uvptr = yuv420sp + w * h;
#if __ARM_NEON
    int8x8_t _v128 = vdup_n_s8(128);
    int8x8_t _v90 = vdup_n_s8(90);
    int8x8_t _v46 = vdup_n_s8(46);
    int8x8_t _v22 = vdup_n_s8(22);
    int8x8_t _v113 = vdup_n_s8(113);
#endif

    for (int y=0; y<h; y+=2)
    {
        // Two luma rows share one chroma row.
        const unsigned char* yptr0 = yptr;
        const unsigned char* yptr1 = yptr + w;
        unsigned char* rgb0 = rgb;
        unsigned char* rgb1 = rgb + w*3;

#if __ARM_NEON
        int nn = w >> 3;                // number of 8-pixel groups handled by the vector loop
        int remain = w - (nn << 3);     // leftover columns handled by the scalar loop
#else
        int remain = w;
#endif // __ARM_NEON

#if __ARM_NEON
#if __aarch64__

        // Intrinsics version of the asm loop below, kept for reference
        // (original note: tested, no known problems so far).
        //            for (; nn>0; nn--)
        //            {
        //                int16x8_t _yy0 = vreinterpretq_s16_u16(vshll_n_u8(vld1_u8(yptr0), 6));
        //                int16x8_t _yy1 = vreinterpretq_s16_u16(vshll_n_u8(vld1_u8(yptr1), 6));
        //
        //                int8x8_t _uuvv = vsub_s8(vreinterpret_s8_u8(vld1_u8(uvptr)), _v128); //uv - 128
        //                int8x8x2_t _uuuuvvvv = vtrn_s8(_uuvv, _uuvv);
        //                int8x8_t _uu = _uuuuvvvv.val[0];
        //                int8x8_t _vv = _uuuuvvvv.val[1];
        //
        //                int16x8_t _r0 = vmlal_s8(_yy0, _vv, _v90);
        //                int16x8_t _g0 = vmlsl_s8(_yy0, _vv, _v46);
        //                _g0 = vmlsl_s8(_g0, _uu, _v22);
        //                int16x8_t _b0 = vmlal_s8(_yy0, _uu, _v113);
        //
        //                int16x8_t _r1 = vmlal_s8(_yy1, _vv, _v90);
        //                int16x8_t _g1 = vmlsl_s8(_yy1, _vv, _v46);
        //                _g1 = vmlsl_s8(_g1, _uu, _v22);
        //                int16x8_t _b1 = vmlal_s8(_yy1, _uu, _v113);
        //
        //                uint8x8x3_t _rgb0;
        //                _rgb0.val[0] = vqshrun_n_s16(_r0, 6);
        //                _rgb0.val[1] = vqshrun_n_s16(_g0, 6);
        //                _rgb0.val[2] = vqshrun_n_s16(_b0, 6);
        //
        //                uint8x8x3_t _rgb1;
        //                _rgb1.val[0] = vqshrun_n_s16(_r1, 6);
        //                _rgb1.val[1] = vqshrun_n_s16(_g1, 6);
        //                _rgb1.val[2] = vqshrun_n_s16(_b1, 6);
        //
        //                vst3_u8(rgb0, _rgb0);
        //                vst3_u8(rgb1, _rgb1);
        //
        //                yptr0 += 8;
        //                yptr1 += 8;
        //                uvptr += 8;
        //                rgb0 += 24;
        //                rgb1 += 24;
        //            }
        if (nn > 0)
        {
            asm volatile(
                         "0:                                 \n"
                         "ld1        {v2.8b}, [%3], #8       \n"     // uv
                         "sub        v2.8b, v2.8b, %12.8b    \n"     // uv - 128
                         "ld1        {v0.8b}, [%1], #8       \n"     // yptr, row 0
                         "ld1        {v1.8b}, [%2], #8       \n"     // yptr, row 1
                         "ushll      v4.8h, v0.8b, #6        \n"     // row 0: y << 6
                         "orr        v3.8b, v2.8b, v2.8b     \n"     // copy of uv
                         "ushll      v5.8h, v1.8b, #6        \n"     // row 1: y << 6
                         "orr        v9.16b, v4.16b, v4.16b  \n"     // copy of row 0 y << 6
                         "trn1       v14.8b, v2.8b, v3.8b    \n"     // v14 = u
                         "trn2       v13.8b, v2.8b, v3.8b    \n"     // v13 = v
                         "orr        v11.16b, v5.16b, v5.16b \n"     // copy of row 1 y << 6
                         "smlsl      v9.8h, v13.8b, %14.8b   \n"     // row 0: (y << 6) - v * 46
                         "orr        v8.16b, v4.16b, v4.16b  \n"     // copy of row 0 y << 6
                         "smlsl      v11.8h, v13.8b, %14.8b  \n"     // row 1: (y << 6) - v * 46
                         "orr        v10.16b, v5.16b, v5.16b \n"     // copy of row 1 y << 6
                         "smlal      v8.8h, v13.8b, %13.8b   \n"     // row 0: r = (y << 6) + v * 90
                         "smlal      v4.8h, v14.8b, %16.8b   \n"     // row 0: b = (y << 6) + u * 113
                         "smlal      v10.8h, v13.8b, %13.8b  \n"     // row 1: r = (y << 6) + v * 90
                         "smlsl      v9.8h, v14.8b, %15.8b   \n"     // row 0: g = (y << 6) - v * 46 - u * 22
                         "smlal      v5.8h, v14.8b, %16.8b   \n"     // row 1: b = (y << 6) + u * 113
                         "smlsl      v11.8h, v14.8b, %15.8b  \n"     // row 1: g = (y << 6) - v * 46 - u * 22
                         "sqshrun    v15.8b, v8.8h, #6       \n"     // row 0: r
                         "sqshrun    v17.8b, v4.8h, #6       \n"     // row 0: b
                         "sqshrun    v18.8b, v10.8h, #6      \n"     // row 1: r
                         "sqshrun    v16.8b, v9.8h, #6       \n"     // row 0: g
                         "sqshrun    v20.8b, v5.8h, #6       \n"     // row 1: b
                         "sqshrun    v19.8b, v11.8h, #6      \n"     // row 1: g
                         "subs       %w0, %w0, #1            \n"
                         "st3        {v15.8b, v16.8b, v17.8b}, [%4], #24   \n"
                         "st3        {v18.8b, v19.8b, v20.8b}, [%5], #24   \n"
                         "bne        0b                      \n"
                         : "=r"(nn),     // %0
                         "=r"(yptr0),  // %1
                         "=r"(yptr1),  // %2
                         "=r"(uvptr),  // %3
                         "=r"(rgb0),   // %4
                         "=r"(rgb1)    // %5
                         : "0"(nn),
                         "1"(yptr0),
                         "2"(yptr1),
                         "3"(uvptr),
                         "4"(rgb0),
                         "5"(rgb1),
                         "w"(_v128),   // %12
                         "w"(_v90),    // %13
                         "w"(_v46),    // %14
                         "w"(_v22),    // %15
                         "w"(_v113)    // %16
                         : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v8", "v9", "v10", "v11", "v12", "v13",
                         "v14", "v15", "v16", "v17", "v18", "v19", "v20"
                         );
        }
#else
        if (nn > 0)
        {
            asm volatile(
                         "0:                             \n"
                         "pld        [%3, #128]          \n"
                         "vld1.u8    {d2}, [%3]!         \n"         // uv
                         "vsub.s8    d2, d2, %12         \n"         // uv - 128
                         "pld        [%1, #128]          \n"
                         "vld1.u8    {d0}, [%1]!         \n"         // yptr, row 0
                         "pld        [%2, #128]          \n"
                         "vld1.u8    {d1}, [%2]!         \n"         // yptr, row 1
                         "vshll.u8   q2, d0, #6          \n"         // row 0: y << 6
                         "vorr       d3, d2, d2          \n"
                         "vshll.u8   q3, d1, #6          \n"         // row 1: y << 6
                         "vorr       q9, q2, q2          \n"         // row 0: y
                         "vtrn.s8    d2, d3              \n"         // d2 = u, d3 = v
                         "vorr       q11, q3, q3         \n"         // row 1: y
                         "vmlsl.s8   q9, d3, %14         \n"         // row 0: y - v * 46
                         "vorr       q8, q2, q2          \n"         // row 0: y
                         "vmlsl.s8   q11, d3, %14        \n"         // row 1: y - v * 46
                         "vorr       q10, q3, q3         \n"         // row 1: y
                         "vmlal.s8   q8, d3, %13         \n"         // row 0: r = y + v * 90
                         "vmlal.s8   q2, d2, %16         \n"         // row 0: b = y + u * 113
                         "vmlal.s8   q10, d3, %13        \n"         // row 1: r = y + v * 90
                         "vmlsl.s8   q9, d2, %15         \n"         // row 0: g = (y - v * 46) - u * 22
                         "vmlal.s8   q3, d2, %16         \n"         // row 1: b = y + u * 113
                         "vmlsl.s8   q11, d2, %15        \n"         // row 1: g = (y - v * 46) - u * 22
                         "vqshrun.s16 d24, q8, #6        \n"         // row 0: r
                         "vqshrun.s16 d26, q2, #6        \n"         // row 0: b
                         "vqshrun.s16 d4, q10, #6        \n"         // row 1: r
                         "vqshrun.s16 d25, q9, #6        \n"         // row 0: g
                         "vqshrun.s16 d6, q3, #6         \n"         // row 1: b
                         "vqshrun.s16 d5, q11, #6        \n"         // row 1: g
                         "subs       %0, #1              \n"
                         "vst3.u8    {d24-d26}, [%4]!    \n"
                         //"vsub.s8    d2, d2, %12         \n"
                         "vst3.u8    {d4-d6}, [%5]!      \n"
                         "bne        0b                  \n"
                         : "=r"(nn),     // %0
                         "=r"(yptr0),  // %1
                         "=r"(yptr1),  // %2
                         "=r"(uvptr),  // %3
                         "=r"(rgb0),   // %4
                         "=r"(rgb1)    // %5
                         : "0"(nn),
                         "1"(yptr0),
                         "2"(yptr1),
                         "3"(uvptr),
                         "4"(rgb0),
                         "5"(rgb1),
                         "w"(_v128),   // %12
                         "w"(_v90),    // %13
                         "w"(_v46),    // %14
                         "w"(_v22),    // %15
                         "w"(_v113)    // %16
                         : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "q12", "d26"
                         );
        }
#endif // __aarch64__
#endif // __ARM_NEON
        //            remain = w;
        for (; remain>0; remain = remain - 2)
        {
            // Derivation of the integer coefficients:
            //
            // R = 1.164 * yy + 1.596 * vv
            // G = 1.164 * yy - 0.813 * vv - 0.391 * uu
            // B = 1.164 * yy              + 2.018 * uu
            //
            // R = Y + (1.370705 * (V-128))
            // G = Y - (0.698001 * (V-128)) - (0.337633 * (U-128))
            // B = Y + (1.732446 * (U-128))
            //
            // R = ((Y << 6) + 87.72512 * (V-128)) >> 6
            // G = ((Y << 6) - 44.672064 * (V-128) - 21.608512 * (U-128)) >> 6
            // B = ((Y << 6) + 110.876544 * (U-128)) >> 6
            //
            // R = ((Y << 6) + 90 * (V-128)) >> 6
            // G = ((Y << 6) - 46 * (V-128) - 22 * (U-128)) >> 6
            // B = ((Y << 6) + 113 * (U-128)) >> 6

            const int u = uvptr[0] - 128;
            const int v = uvptr[1] - 128;

            // One chroma pair is shared by the 2x2 pixel block below.
            const int ruv = 90 * v;
            const int guv = -46 * v + -22 * u;
            const int buv = 113 * u;

            nv12_store_rgb(rgb0,     yptr0[0] << 6, ruv, guv, buv);
            nv12_store_rgb(rgb0 + 3, yptr0[1] << 6, ruv, guv, buv);
            nv12_store_rgb(rgb1,     yptr1[0] << 6, ruv, guv, buv);
            nv12_store_rgb(rgb1 + 3, yptr1[1] << 6, ruv, guv, buv);

            yptr0 += 2;
            yptr1 += 2;
            uvptr += 2;
            rgb0 += 6;
            rgb1 += 6;
        }

        // uvptr has already advanced by one chroma row; step luma and output by two rows.
        yptr += 2*w;
        rgb += 2*3*w;
    }
}
/// Clamps an intermediate colour value to the 0...255 range of an 8-bit channel.
static inline unsigned char yuv2bgra_clamp(int value) {
    return (unsigned char)(value < 0 ? 0 : (value > 255 ? 255 : value));
}

/// Converts one pixel (luma plus chroma already biased by -128) and stores it as B, G, R, 0xFF.
static inline void yuv2bgra_store(unsigned char* dst, int luma, int u, int v) {
    const int r = luma + ((179 * v) >> 7);
    // Both chroma terms are subtracted from green, exactly like the NEON path (two vmls).
    // The previous scalar code computed 43*u - 91*v, i.e. it added the V term.
    const int g = luma - ((43 * u + 91 * v) >> 7);
    const int b = luma + ((227 * u) >> 7);
    dst[0] = yuv2bgra_clamp(b);
    dst[1] = yuv2bgra_clamp(g);
    dst[2] = yuv2bgra_clamp(r);
    dst[3] = 0xFF;
}

/**
 * Converts a semi-planar YUV 4:2:0 frame to 4-byte pixels stored as B, G, R, 0xFF.
 *
 * Input layout: `w * h` luma bytes followed by interleaved chroma pairs (U first, then V);
 * luma row `i` uses the chroma row at `yuv + w*h + w*(i/2)`. Output row `i` starts at
 * `rgba + w*i`, one `int` (4 bytes) per pixel.
 *
 * @param yuv  Source frame (Y plane followed by interleaved UV plane).
 * @param w    Frame width in pixels.
 * @param h    Frame height in pixels.
 * @param rgba Destination buffer of at least `w * h` ints.
 */
void convertToRGBA(unsigned char* yuv, int w, int h, int* rgba)
{
    for (int i=0; i<h; ++i)
    {
        unsigned char* dst = (unsigned char*)(rgba + w*i);
        unsigned char* y = yuv + w*i;
        unsigned char* uv = yuv + w*h + w*(i/2);
        int count = w;
#ifdef HAS_NEON
        // NOTE(review): the asm below uses 32-bit ARM register names and syntax (r4, d/q
        // registers, vld2.8); presumably HAS_NEON is only defined for such builds - confirm.
        /* Process 16 pixels per iteration. */
        int c = count/16;
        asm volatile(
                     "movs r4, %[c]\t\n"
                     "beq 2f\t\n"
                     "vmov.u8 d7, #255\t\n"//Alpha
                     "vmov.u8 d3, #255\t\n"//Alpha
                     "vmov.s16 q11, #90\t\n"
                     "vmov.s16 q12, #128\t\n"
                     "vmov.s16 q13, #21\t\n"
                     "vmov.s16 q14, #46\t\n"
                     "vmov.s16 q15, #113\t\n"
                     "1:\t\n"
                     /* Y1 and Y2 are the luma values of two interleaved pixel groups (even and
                        odd columns), so each lines up one-to-one with the U/V values. */
                     "vld2.8 {d8, d9}, [%[y]]!\t\n"//Y1, Y2
                     /* De-interleaving load of the U and V values. */
                     "vld2.8 {d0, d1}, [%[uv]]!\t\n"//u, v
                     "vmovl.u8  q5, d0\t\n"
                     "vmovl.u8  q6, d1\t\n"
                     "vsub.i16 q5,q5, q12\t\n"//U
                     "vsub.i16 q6,q6, q12\t\n"//V
                     //First RGBA
                     "vshll.u8 q7, d8, #6\t\n"
                     "vshll.u8 q8, d8, #6\t\n"
                     "vshll.u8 q9, d8, #6\t\n"
                     "vmla.i16 q7, q6, q11\t\n"
                     "vmls.i16 q8, q5, q13\t\n"
                     "vmls.i16 q8, q6, q14\t\n"
                     "vmla.i16 q9, q5, q15\t\n"

                     "vshr.s16 q7, q7, #6\t\n"
                     "vshr.s16 q8, q8, #6\t\n"
                     "vshr.s16 q9, q9, #6\t\n"
                     "vmov.s16 q10, #0\t\n"
                     "vmax.s16 q7, q7, q10\t\n"
                     "vmax.s16 q8, q8, q10\t\n"
                     "vmax.s16 q9, q9, q10\t\n"
                     "vmov.u16 q10, #255\t\n"
                     "vmin.u16 q7, q7, q10\t\n"
                     "vmin.u16 q8, q8, q10\t\n"
                     "vmin.u16 q9, q9, q10\t\n"
                     "vmovn.s16 d2, q7\t\n"
                     "vmovn.s16 d1, q8\t\n"
                     "vmovn.s16 d0, q9\t\n"

                     //Second RGBA
                     "vshll.u8 q7, d9, #6\t\n"
                     "vshll.u8 q8, d9, #6\t\n"
                     "vshll.u8 q9, d9, #6\t\n"
                     "vmla.i16 q7, q6, q11\t\n"
                     "vmls.i16 q8, q5, q13\t\n"
                     "vmls.i16 q8, q6, q14\t\n"
                     "vmla.i16 q9, q5, q15\t\n"
                     "vshr.s16 q7, q7, #6\t\n"
                     "vshr.s16 q8, q8, #6\t\n"
                     "vshr.s16 q9, q9, #6\t\n"
                     "vmov.s16 q10, #0\t\n"
                     "vmax.s16 q7, q7, q10\t\n"
                     "vmax.s16 q8, q8, q10\t\n"
                     "vmax.s16 q9, q9, q10\t\n"
                     "vmov.u16 q10, #255\t\n"
                     "vmin.u16 q7, q7, q10\t\n"
                     "vmin.u16 q8, q8, q10\t\n"
                     "vmin.u16 q9, q9, q10\t\n"
                     "vmovn.s16 d6, q7\t\n"
                     "vmovn.s16 d5, q8\t\n"
                     "vmovn.s16 d4, q9\t\n"
        /* The two sets of channel values obtained so far are split by column parity,
         * for example (writing c for one channel):
         * d0 : (c0 c2 c4 c6 c8 c10 c12 c14)
         * d4 : (c1 c3 c5 c7 c9 c11 c13 c15)
         * They have to be interleaved into pixel order before storing:
         * d0 : (c0 c1 c2 c3 c4 c5 c6 c7)
         * d4 : (c8 c9 c10 c11 c12 c13 c14 c15)
         */

                     "vtrn.8 d2,d6\t\n"
                     "vtrn.16 d2,d6\t\n"
                     "vtrn.32 d2,d6\t\n"

                     "vtrn.8 d1,d5\t\n"
                     "vtrn.16 d1,d5\t\n"
                     "vtrn.32 d1,d5\t\n"

                     "vtrn.8 d0,d4\t\n"
                     "vtrn.16 d0,d4\t\n"
                     "vtrn.32 d0,d4\t\n"

                     "vst4.8 {d0-d3}, [%[dst]]!\t\n"
                     "vst4.8 {d4-d7}, [%[dst]]!\t\n"

                     "subs r4, r4, #1\t\n"
                     "bne 1b\t\n"
                     "2:\t\n"
                     : [dst] "+r" (dst), [y] "+r" (y), [uv] "+r" (uv), [c] "+r" (c)
                     :
                     : "r4", "cc","memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29","d30","d31"
                     );
        count%=16;
#endif

        /* Scalar path: the whole row without NEON, otherwise the columns left over above. */
        while (count > 1)
        {
            // Two horizontally adjacent pixels share one chroma pair.
            const int u = uv[0] - 128;
            const int v = uv[1] - 128;
            yuv2bgra_store(dst, y[0], u, v);
            yuv2bgra_store(dst + 4, y[1], u, v);

            y += 2;
            uv += 2;
            dst += 8;
            count -= 2;
        }
        if (count > 0)
        {
            // Odd trailing column: a single pixel with its own chroma pair.
            yuv2bgra_store(dst, y[0], uv[0] - 128, uv[1] - 128);
        }
    }
}

@end
	//
	// ViewCtrl_CPUEasyImageProcessing.m
	// demo_ios
	//
	// Created by yyuser on 2018/10/19.
	//

	#import "ViewCtrl_CPUEasyImageProcessing.h"
	#import "vn_core.h"
	#import "vn_yuvconverter.h"
	#import "VN_Tensor.h"

	// Private class extension: state used only inside this implementation file.
	@interface ViewCtrl_CPUEasyImageProcessing ()
	// Image view that shows the most recently converted frame; created lazily by its getter.
	@property (nonatomic, strong) UIImageView *imgView;
	// Raw pointer to the C++ converter. It is allocated with `new` and released with
	// `delete` in this file, hence `assign` rather than an object ownership attribute.
	@property (nonatomic, assign) Venus::VenusYUVToolKit *yuvConverter;
	@end

	@implementation ViewCtrl_CPUEasyImageProcessing

	- (void)dealloc {
	    // The converter is a manually managed C++ object, so it has to be destroyed
	    // by hand. Deleting a null pointer is a no-op, so no guard is required.
	    Venus::VenusYUVToolKit *converter = _yuvConverter;
	    delete converter;
	}

	- (void)loadView {
	    // Let the superclass create the root view first, then attach the
	    // lazily created image view to it.
	    [super loadView];
	    UIView *previewView = self.imgView;
	    UIView *rootView = self.view;
	    [rootView addSubview:previewView];
	}

	- (void)viewDidLoad {
	    [super viewDidLoad];
	    // _yuvConverter is deliberately NOT reset here. Instance variables are
	    // zero-initialised when the object is allocated, and overwriting the pointer
	    // with nullptr would leak a converter that videoCaptureCallback: created
	    // before the view was loaded. The converter is built lazily in that callback
	    // and destroyed in dealloc.
	}

	/// Lazily creates the image view used to display converted frames.
	/// The return type matches the `imgView` property declaration (UIImageView *);
	/// the previous `UIView *` return type disagreed with the property.
	- (UIImageView *)imgView {
	    if (!_imgView) {
	        // Placeholder icon loaded from the "UIResources" folder inside the app bundle.
	        NSString *resourceDir = [[[NSBundle mainBundle] bundlePath] stringByAppendingPathComponent:@"UIResources"];
	        NSString *iconPath = [resourceDir stringByAppendingPathComponent:@"icon.png"];
	        _imgView = [[UIImageView alloc] initWithImage:[UIImage imageWithContentsOfFile:iconPath]];

	        // Frame: 7/9 of the logical screen width and 12/16 of its height, offset by
	        // 1/9 and 2/16 respectively and centred inside the actual screen.
	        [_imgView setFrame:CGRectMake(SCREEN_WIDTH * 1 / 9.0 + (ACTUAL_SCREEN_WIDTH - SCREEN_WIDTH) / 2,
	                                      SCREEN_HEIGHT * 2 / 16.0 + (ACTUAL_SCREEN_HEIGHT - SCREEN_HEIGHT) / 2,
	                                      SCREEN_WIDTH * 7.0 / 9.0,
	                                      SCREEN_HEIGHT * 12.0 / 16.0)];
	        _imgView.layer.cornerRadius = 8;
	        _imgView.layer.masksToBounds = YES;
	        _imgView.contentMode = UIViewContentModeScaleAspectFit;
	    }
	    return _imgView;
	}

	/// Builds a UIImage from a tightly packed 4-bytes-per-pixel buffer.
	///
	/// @param imH     Image height in pixels.
	/// @param imW     Image width in pixels; source rows are exactly imW * 4 bytes.
	/// @param data_u8 Source pixels, copied row by row into the bitmap context as-is.
	/// @return The image snapshot of the context. The context is (imW rounded up to a
	///         multiple of 4) pixels wide, so the image may be up to 3 pixels wider
	///         than imW. Returns whatever UIKit yields (possibly nil) when no bitmap
	///         context could be created.
	-(UIImage *)getUIImage_With_Height:(int)imH
	                             Width:(int)imW
	                        BGRADataU8:(u_char *)data_u8 {
	    int imW_align = (imW + 3) / 4 * 4;
	    UIGraphicsBeginImageContext(CGSizeMake(imW_align, imH));
	    CGContextRef c = UIGraphicsGetCurrentContext();
	    u_char *data_write = (c != NULL) ? (u_char *)CGBitmapContextGetData(c) : NULL;
	    const u_char *data_read = data_u8;

	    if (data_write != NULL && data_read != NULL && imW > 0 && imH > 0) {
	        // Ask Core Graphics for the real destination stride: bitmap rows may be
	        // padded beyond imW_align * 4 bytes, so assuming that value skews the image.
	        size_t dst_stride = CGBitmapContextGetBytesPerRow(c);
	        size_t dst_rows = CGBitmapContextGetHeight(c);
	        size_t src_stride = (size_t)imW * 4;
	        size_t copy_bytes = (src_stride < dst_stride) ? src_stride : dst_stride;
	        size_t rows = ((size_t)imH < dst_rows) ? (size_t)imH : dst_rows;

	        for (size_t row = 0; row < rows; row++) {
	            memcpy(data_write, data_read, copy_bytes);
	            data_write += dst_stride;
	            data_read += src_stride;
	        }
	    }

	    UIImage *img = UIGraphicsGetImageFromCurrentImageContext();
	    UIGraphicsEndImageContext();
	    return img;
	}


	/// Copies one captured frame out of the pixel buffer, converts it and displays the
	/// result in the image view.
	///
	/// Non-planar buffers are treated as BGRA8888; planar buffers are treated as
	/// bi-planar 4:2:0 (Y plane followed by an interleaved UV plane). Both are copied
	/// into a tightly packed heap buffer so the pixel buffer can be unlocked early.
	///
	/// @param pixelBuffer The frame delivered by the capture pipeline.
	- (void)videoCaptureCallback:(CVPixelBufferRef)pixelBuffer
	{
	    VN_Image input;
	    input.ori_fmt = VN_ORIENT_FMT_DEFAULT;

	    CVPixelBufferLockBaseAddress(pixelBuffer, 0);
	    const int iHeight = (int)CVPixelBufferGetHeight(pixelBuffer);
	    const int iWidth = (int)CVPixelBufferGetWidth(pixelBuffer);
	    // Allocated with malloc (not new[]) because it is released with free() below.
	    unsigned char *ptr_indata = NULL;

	    if (CVPixelBufferGetPlaneCount(pixelBuffer) == 0) {
	        const uint8_t *src = (const uint8_t *)CVPixelBufferGetBaseAddress(pixelBuffer);
	        const size_t srcStride = CVPixelBufferGetBytesPerRow(pixelBuffer);
	        const size_t rowBytes = (size_t)iWidth * 4;
	        if (src != NULL) {
	            ptr_indata = (unsigned char *)malloc(rowBytes * (size_t)iHeight);
	        }
	        if (ptr_indata != NULL) {
	            // Copy row by row: the source rows may be padded (srcStride >= rowBytes).
	            unsigned char *dst = ptr_indata;
	            for (int r = 0; r < iHeight; r++) {
	                memcpy(dst, src, rowBytes);
	                dst += rowBytes;
	                src += srcStride;
	            }
	        }
	        input.channels = 4;
	        input.pix_fmt = VN_PIX_FMT_BGRA8888;
	    }
	    else {
	        const size_t bytePerRowPlane0 = CVPixelBufferGetBytesPerRowOfPlane(pixelBuffer, 0);
	        const size_t bytePerRowPlane1 = CVPixelBufferGetBytesPerRowOfPlane(pixelBuffer, 1);
	        const unsigned char *ptr_pixdata_temp0 = (const unsigned char *)CVPixelBufferGetBaseAddressOfPlane(pixelBuffer, 0);
	        const unsigned char *ptr_pixdata_temp1 = (const unsigned char *)CVPixelBufferGetBaseAddressOfPlane(pixelBuffer, 1);
	        if (ptr_pixdata_temp0 != NULL && ptr_pixdata_temp1 != NULL) {
	            ptr_indata = (unsigned char *)malloc((size_t)iWidth * iHeight + (size_t)iWidth * iHeight / 2);
	        }
	        if (ptr_indata != NULL) {
	            // Y plane: iHeight rows of iWidth bytes.
	            unsigned char *ptr_indata_temp = ptr_indata;
	            for (int r = 0; r < iHeight; r++) {
	                memcpy(ptr_indata_temp, ptr_pixdata_temp0, iWidth);
	                ptr_indata_temp += iWidth;
	                ptr_pixdata_temp0 += bytePerRowPlane0;
	            }
	            // Interleaved UV plane: iHeight / 2 rows of iWidth bytes, stored right after Y.
	            ptr_indata_temp = ptr_indata + (size_t)iWidth * iHeight;
	            for (int r = 0; r < iHeight / 2; r++) {
	                memcpy(ptr_indata_temp, ptr_pixdata_temp1, iWidth);
	                ptr_indata_temp += iWidth;
	                ptr_pixdata_temp1 += bytePerRowPlane1;
	            }
	        }
	        input.channels = 0;
	        input.pix_fmt = VN_PIX_FMT_YUV420F;
	    }
	    CVPixelBufferUnlockBaseAddress(pixelBuffer, 0);

	    if (ptr_indata == NULL) {
	        // No readable pixel data or out of memory: drop this frame.
	        return;
	    }
	    input.width = iWidth;
	    input.height = iHeight;
	    input.data = ptr_indata;

	    VN_Image outCvt; {
	        outCvt.channels = 4;
	        outCvt.width = 720;
	        outCvt.height = 1280;
	        outCvt.pix_fmt = VN_PIX_FMT_RGB888;
	        outCvt.ori_fmt = VN_ORIENT_FMT_DEFAULT;
	        outCvt.data = calloc(outCvt.width * outCvt.height * outCvt.channels, sizeof(float));
	    }
	    if (outCvt.data == NULL) {
	        free(input.data);
	        return;
	    }

	    // (Re)create the converter whenever the input or output geometry changes.
	    if ((!_yuvConverter) ||
	        (_yuvConverter->_height_Y != input.height || _yuvConverter->_width_Y != input.width) ||
	        (_yuvConverter->_height_RGB != outCvt.height || _yuvConverter->_width_RGB != outCvt.width)
	        ) {

	        if (_yuvConverter) {
	            delete _yuvConverter;
	            _yuvConverter = nullptr;
	        }

	        _yuvConverter = new Venus::VenusYUVToolKit(
	                                                   input.width,
	                                                   input.height,
	                                                   input.width / 2,
	                                                   input.height / 2,
	                                                   outCvt.width,
	                                                   outCvt.height
	                                                   );
	        _yuvConverter->setCvtMat(
	                                 Venus::ConvertMatrixFromYUV(
	                                                             1.0f, 0.0f, 1.57481f,
	                                                             1.0f, -0.18732f, -0.46813f,
	                                                             1.0f, 1.8556f, 0.0f,
	                                                             -201.57568f,
	                                                             83.897598f,
	                                                             -237.5168f
	                                                             )
	                                 );
	    }

	    double tic = CACurrentMediaTime();
	    //// _yuvConverter->convert(input, outCvt);
	    // Venus::VenusYUVToolKit::Convert(input, outCvt, Venus::ConvertMatrixFromYUV(
	    // 1.0f, 0.0f, 1.57481f,
	    // 1.0f, -0.18732f, -0.46813f,
	    // 1.0f, 1.8556f, 0.0f,
	    // -201.57568f,
	    // 83.897598f,
	    // -237.5168f
	    // ));

	    // nv12_to_rgb_fast_asm_ios((unsigned char *)input.data, 720, 1280, (unsigned char *)outCvt.data);
	    if (input.width == outCvt.width && input.height == outCvt.height) {
	        // convertToRGBA reads width * height * 3 / 2 bytes from the input, so it is
	        // only run when the captured frame really has the 720x1280 output geometry;
	        // otherwise it could read past the end of the copied frame.
	        convertToRGBA((unsigned char *)input.data, outCvt.width, outCvt.height, (int *)outCvt.data);
	    } else {
	        LOGV("yuv Convert skipped: input %dx%d does not match output %dx%d",
	             (int)input.width, (int)input.height, (int)outCvt.width, (int)outCvt.height);
	    }

	    double toc = CACurrentMediaTime();
	    LOGV("yuv Convert cost %f ms", 1000 * (toc - tic));

	    const size_t displayBytes = (size_t)outCvt.width * outCvt.height * 4 * sizeof(u_char);
	    u_char *ptr_u8 = (u_char *)malloc(displayBytes);
	    if (ptr_u8 == NULL) {
	        free(input.data);
	        free(outCvt.data);
	        return;
	    }
	    // Pre-fill with 0xff so the fourth (alpha) byte of every pixel stays opaque.
	    memset(ptr_u8, 0xff, displayBytes);

	    // NOTE(review): this loop reads outCvt.data as planar float (channel-major),
	    // which looks like the layout of the disabled _yuvConverter->convert() path.
	    // convertToRGBA above writes interleaved 8-bit pixels instead, so the displayed
	    // image is probably not meaningful while that call is active - confirm intent.
	    float *ptr_in_f32 = (float *)outCvt.data;
	    u_char *ptr_out_u8 = ptr_u8;
	    for (int y = 0; y < outCvt.height; y++) {
	        for (int x = 0; x < outCvt.width; x++) {
	            for (int c = 0; c < 3; c++) {
	                *ptr_out_u8++ = static_cast<u_char>(ptr_in_f32[c * outCvt.height * outCvt.width +
	                                                               y * outCvt.width +
	                                                               x]);
	            }
	            ptr_out_u8++; // skip the alpha byte (already 0xff)
	        }
	    }
	    UIImage *img = [self getUIImage_With_Height:outCvt.height Width:outCvt.width BGRADataU8:ptr_u8];
	    free(ptr_u8);

	    // UIKit views must be updated on the main thread.
	    dispatch_async(dispatch_get_main_queue(), ^{
	        self.imgView.image = img;
	    });
	    free(input.data);
	    free(outCvt.data);
	}

	// Clamps v to [0, 255]; mirrors the saturating narrow (sqshrun / vqshrun) of the NEON path.
	static inline unsigned char nv12_saturate_u8(int v) {
	    return static_cast<unsigned char>(v < 0 ? 0 : (v > 255 ? 255 : v));
	}

	/**
	 * Converts a bi-planar 4:2:0 image (Y plane followed by interleaved chroma pairs)
	 * to packed 3-bytes-per-pixel RGB using 6-bit fixed-point coefficients.
	 *
	 * Two image rows are processed per iteration because they share one chroma row.
	 * With NEON, 8 pixels per row are handled per asm iteration; the remaining pixels
	 * (or all pixels without NEON) go through the scalar loop, two at a time.
	 * The scalar loop saturates its results so it agrees with the NEON path.
	 *
	 * @param yuv420sp  w*h luma bytes followed by w*h/2 interleaved chroma bytes.
	 * @param w         Image width; the code assumes an even value.
	 * @param h         Image height; the code assumes an even value.
	 * @param rgb       Output buffer of at least w*h*3 bytes.
	 */
	void nv12_to_rgb_fast_asm_ios(const unsigned char* yuv420sp, int w, int h, unsigned char* rgb){

	    const unsigned char* yptr = yuv420sp;
	    const unsigned char* uvptr = yuv420sp + w * h;
	#if __ARM_NEON
	    // 128 wraps to -128 in a signed byte; the 8-bit subtraction below is modular,
	    // so the result is the same bit pattern as (value - 128).
	    int8x8_t _v128 = vdup_n_s8(128);
	    int8x8_t _v90 = vdup_n_s8(90);
	    int8x8_t _v46 = vdup_n_s8(46);
	    int8x8_t _v22 = vdup_n_s8(22);
	    int8x8_t _v113 = vdup_n_s8(113);
	#endif

	    for (int y=0; y<h; y+=2)
	    {
	        const unsigned char* yptr0 = yptr;
	        const unsigned char* yptr1 = yptr + w;
	        unsigned char* rgb0 = rgb;
	        unsigned char* rgb1 = rgb + w*3;

	#if __ARM_NEON
	        int nn = w >> 3;
	        int remain = w - (nn << 3);
	#else
	        int remain = w;
	#endif // __ARM_NEON

	#if __ARM_NEON
	#if __aarch64__

	        // Intrinsics version of the asm below; tested, no problems so far.
	        // for (; nn>0; nn--)
	        // {
	        // int16x8_t _yy0 = vreinterpretq_s16_u16(vshll_n_u8(vld1_u8(yptr0), 6));
	        // int16x8_t _yy1 = vreinterpretq_s16_u16(vshll_n_u8(vld1_u8(yptr1), 6));
	        //
	        // int8x8_t _uuvv = vsub_s8(vreinterpret_s8_u8(vld1_u8(uvptr)), _v128); //uv - 128
	        // int8x8x2_t _uuuuvvvv = vtrn_s8(_uuvv, _uuvv);
	        // int8x8_t _uu = _uuuuvvvv.val[0];
	        // int8x8_t _vv = _uuuuvvvv.val[1];
	        //
	        // int16x8_t _r0 = vmlal_s8(_yy0, _vv, _v90);
	        // int16x8_t _g0 = vmlsl_s8(_yy0, _vv, _v46);
	        // _g0 = vmlsl_s8(_g0, _uu, _v22);
	        // int16x8_t _b0 = vmlal_s8(_yy0, _uu, _v113);
	        //
	        // int16x8_t _r1 = vmlal_s8(_yy1, _vv, _v90);
	        // int16x8_t _g1 = vmlsl_s8(_yy1, _vv, _v46);
	        // _g1 = vmlsl_s8(_g1, _uu, _v22);
	        // int16x8_t _b1 = vmlal_s8(_yy1, _uu, _v113);
	        //
	        // uint8x8x3_t _rgb0;
	        // _rgb0.val[0] = vqshrun_n_s16(_r0, 6);
	        // _rgb0.val[1] = vqshrun_n_s16(_g0, 6);
	        // _rgb0.val[2] = vqshrun_n_s16(_b0, 6);
	        //
	        // uint8x8x3_t _rgb1;
	        // _rgb1.val[0] = vqshrun_n_s16(_r1, 6);
	        // _rgb1.val[1] = vqshrun_n_s16(_g1, 6);
	        // _rgb1.val[2] = vqshrun_n_s16(_b1, 6);
	        //
	        // vst3_u8(rgb0, _rgb0);
	        // vst3_u8(rgb1, _rgb1);
	        //
	        // yptr0 += 8;
	        // yptr1 += 8;
	        // uvptr += 8;
	        // rgb0 += 24;
	        // rgb1 += 24;
	        // }
	        if (nn > 0)
	        {
	            asm volatile(
	                "0: \n"
	                "ld1 {v2.8b}, [%3], #8 \n" //uv
	                "sub v2.8b, v2.8b, %12.8b \n" //uv - 128
	                "ld1 {v0.8b}, [%1], #8 \n" //yptr----r0
	                "ld1 {v1.8b}, [%2], #8 \n" //yptr----r1
	                "ushll v4.8h, v0.8b, #6 \n" //r0---y<<6
	                "orr v3.8b, v2.8b, v2.8b \n" //copy of vu
	                "ushll v5.8h, v1.8b, #6 \n" //r1---y<<6
	                "orr v9.16b, v4.16b, v4.16b \n" //copy of r0---y<<6
	                "trn1 v14.8b, v2.8b, v3.8b \n" //v14 = u
	                "trn2 v13.8b, v2.8b, v3.8b \n" //v13 = v
	                "orr v11.16b, v5.16b, v5.16b \n" //copy of r1---y<<6
	                "smlsl v9.8h, v13.8b, %14.8b \n" // r0---- (y << 6) - v * 46
	                "orr v8.16b, v4.16b, v4.16b \n" //copy of r0---y<<6
	                "smlsl v11.8h, v13.8b, %14.8b \n" // r1---- (y << 6) - v * 46
	                "orr v10.16b, v5.16b, v5.16b \n" //copy of r1---y<<6
	                "smlal v8.8h, v13.8b, %13.8b \n" //r0--- r = (y<<6) + v * 90
	                "smlal v4.8h, v14.8b, %16.8b \n" //r0--- b = (y<<6) + u * 113
	                "smlal v10.8h, v13.8b, %13.8b \n" //r1--- r = (y<<6) + v * 90
	                "smlsl v9.8h, v14.8b, %15.8b \n" //r0--- g = (y << 6) - v * 46 - u * 22
	                "smlal v5.8h, v14.8b, %16.8b \n" //r1--- b = (y<<6) + u * 113
	                "smlsl v11.8h, v14.8b, %15.8b \n" //r1--- g = (y << 6) - v * 46 - u * 22
	                "sqshrun v15.8b, v8.8h, #6 \n" //r0--- r
	                "sqshrun v17.8b, v4.8h, #6 \n" //r0--- b
	                "sqshrun v18.8b, v10.8h, #6 \n" //r1--- r
	                "sqshrun v16.8b, v9.8h, #6 \n" //r0--- g
	                "sqshrun v20.8b, v5.8h, #6 \n" //r1--- b
	                "sqshrun v19.8b, v11.8h, #6 \n" //r1--- g
	                "subs %w0, %w0, #1 \n"
	                "st3 {v15.8b, v16.8b, v17.8b}, [%4], #24 \n"
	                "st3 {v18.8b, v19.8b, v20.8b}, [%5], #24 \n"
	                "bne 0b \n"
	                : "=r"(nn), // %0
	                "=r"(yptr0), // %1
	                "=r"(yptr1), // %2
	                "=r"(uvptr), // %3
	                "=r"(rgb0), // %4
	                "=r"(rgb1) // %5
	                : "0"(nn),
	                "1"(yptr0),
	                "2"(yptr1),
	                "3"(uvptr),
	                "4"(rgb0),
	                "5"(rgb1),
	                "w"(_v128), // %12
	                "w"(_v90), // %13
	                "w"(_v46), // %14
	                "w"(_v22), // %15
	                "w"(_v113) // %16
	                : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v8", "v9", "v10", "v11", "v12", "v13",
	                "v14", "v15", "v16", "v17", "v18", "v19", "v20"
	            );
	        }
	#else
	        if (nn > 0)
	        {
	            asm volatile(
	                "0: \n"
	                "pld [%3, #128] \n"
	                "vld1.u8 {d2}, [%3]! \n" //uv
	                "vsub.s8 d2, d2, %12 \n" //uv - 128
	                "pld [%1, #128] \n"
	                "vld1.u8 {d0}, [%1]! \n" //yptr----r0
	                "pld [%2, #128] \n"
	                "vld1.u8 {d1}, [%2]! \n" //yptr----r1
	                "vshll.u8 q2, d0, #6 \n" //r0---y
	                "vorr d3, d2, d2 \n"
	                "vshll.u8 q3, d1, #6 \n" //r1---y
	                "vorr q9, q2, q2 \n" //r0---y
	                "vtrn.s8 d2, d3 \n" //d2 = u, d3 = v
	                "vorr q11, q3, q3 \n" //r1---y
	                "vmlsl.s8 q9, d3, %14 \n" //r0---- y - v * 46
	                "vorr q8, q2, q2 \n" //r0---y
	                "vmlsl.s8 q11, d3, %14 \n" //r1---- y - v * 46
	                "vorr q10, q3, q3 \n" //r1---y
	                "vmlal.s8 q8, d3, %13 \n" //r0----r = y + v * 90
	                "vmlal.s8 q2, d2, %16 \n" //r0----b = y + u * 113
	                "vmlal.s8 q10, d3, %13 \n" //r1----r = y + v * 90
	                "vmlsl.s8 q9, d2, %15 \n" //r0----g = (y - v * 46) - u * 22
	                "vmlal.s8 q3, d2, %16 \n" //r1----b = y + u * 113
	                "vmlsl.s8 q11, d2, %15 \n" //r1----g = (y - v * 46) - u * 22
	                "vqshrun.s16 d24, q8, #6 \n" // r0---r
	                "vqshrun.s16 d26, q2, #6 \n" // r0---b
	                "vqshrun.s16 d4, q10, #6 \n" // r1---r
	                "vqshrun.s16 d25, q9, #6 \n" // r0---g
	                "vqshrun.s16 d6, q3, #6 \n" // r1---b
	                "vqshrun.s16 d5, q11, #6 \n" // r1---g
	                "subs %0, #1 \n"
	                "vst3.u8 {d24-d26}, [%4]! \n"
	                //"vsub.s8 d2, d2, %12 \n"
	                "vst3.u8 {d4-d6}, [%5]! \n"
	                "bne 0b \n"
	                : "=r"(nn), // %0
	                "=r"(yptr0), // %1
	                "=r"(yptr1), // %2
	                "=r"(uvptr), // %3
	                "=r"(rgb0), // %4
	                "=r"(rgb1) // %5
	                : "0"(nn),
	                "1"(yptr0),
	                "2"(yptr1),
	                "3"(uvptr),
	                "4"(rgb0),
	                "5"(rgb1),
	                "w"(_v128), // %12
	                "w"(_v90), // %13
	                "w"(_v46), // %14
	                "w"(_v22), // %15
	                "w"(_v113) // %16
	                : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "q12", "d26"
	            );
	        }
	#endif // __aarch64__
	#endif // __ARM_NEON
	        // remain = w;
	        for (; remain>0; remain = remain - 2)
	        {
	            // R = 1.164 * yy + 1.596 * vv
	            // G = 1.164 * yy - 0.813 * vv - 0.391 * uu
	            // B = 1.164 * yy + 2.018 * uu

	            // R = Y + (1.370705 * (V-128))
	            // G = Y - (0.698001 * (V-128)) - (0.337633 * (U-128))
	            // B = Y + (1.732446 * (U-128))

	            // R = ((Y << 6) + 87.72512 * (V-128)) >> 6
	            // G = ((Y << 6) - 44.672064 * (V-128) - 21.608512 * (U-128)) >> 6
	            // B = ((Y << 6) + 110.876544 * (U-128)) >> 6

	            // R = ((Y << 6) + 90 * (V-128)) >> 6
	            // G = ((Y << 6) - 46 * (V-128) - 22 * (U-128)) >> 6
	            // B = ((Y << 6) + 113 * (U-128)) >> 6

	            // R = (yy + 90 * vv) >> 6
	            // G = (yy - 46 * vv - 22 * uu) >> 6
	            // B = (yy + 113 * uu) >> 6

	            int u = uvptr[0] - 128;
	            int v = uvptr[1] - 128;

	            // Chroma contributions are shared by the 2x2 pixel quad.
	            int ruv = 90 * v;
	            int guv = -46 * v + -22 * u;
	            int buv = 113 * u;

	            // Results can fall outside [0, 255] for saturated colours, so clamp
	            // instead of truncating to the low byte.
	            int y00 = yptr0[0] << 6;
	            rgb0[0] = nv12_saturate_u8((y00 + ruv) >> 6);
	            rgb0[1] = nv12_saturate_u8((y00 + guv) >> 6);
	            rgb0[2] = nv12_saturate_u8((y00 + buv) >> 6);

	            int y01 = yptr0[1] << 6;
	            rgb0[3] = nv12_saturate_u8((y01 + ruv) >> 6);
	            rgb0[4] = nv12_saturate_u8((y01 + guv) >> 6);
	            rgb0[5] = nv12_saturate_u8((y01 + buv) >> 6);

	            int y10 = yptr1[0] << 6;
	            rgb1[0] = nv12_saturate_u8((y10 + ruv) >> 6);
	            rgb1[1] = nv12_saturate_u8((y10 + guv) >> 6);
	            rgb1[2] = nv12_saturate_u8((y10 + buv) >> 6);

	            int y11 = yptr1[1] << 6;
	            rgb1[3] = nv12_saturate_u8((y11 + ruv) >> 6);
	            rgb1[4] = nv12_saturate_u8((y11 + guv) >> 6);
	            rgb1[5] = nv12_saturate_u8((y11 + buv) >> 6);

	            yptr0 += 2;
	            yptr1 += 2;
	            uvptr += 2;
	            rgb0 += 6;
	            rgb1 += 6;
	        }

	        // Advance two luma rows and two 3-byte-per-pixel output rows.
	        yptr += 2*w;
	        rgb += 2*3*w;
	    }
	}
	// Converts one luma sample plus its chroma pair to 4 output bytes in the order
	// B, G, R, 0xFF using 7-bit fixed-point coefficients, clamping each channel to
	// [0, 255]. Green subtracts BOTH chroma terms (G = Y - 0.336*U' - 0.711*V').
	static inline void convertToRGBA_store_pixel(unsigned char luma, unsigned char cb, unsigned char cr,
	                                             unsigned char *out) {
	    const int u = cb - 128;
	    const int v = cr - 128;
	    int r = luma + ((179 * v) >> 7);
	    int g = luma - ((43 * u + 91 * v) >> 7);
	    int b = luma + ((227 * u) >> 7);
	    r = r < 0 ? 0 : (r > 255 ? 255 : r);
	    g = g < 0 ? 0 : (g > 255 ? 255 : g);
	    b = b < 0 ? 0 : (b > 255 ? 255 : b);
	    out[0] = (unsigned char)b;
	    out[1] = (unsigned char)g;
	    out[2] = (unsigned char)r;
	    out[3] = 0xFF;
	}

	/**
	 * Converts a bi-planar 4:2:0 image (w*h luma bytes followed by interleaved chroma
	 * pairs, one chroma row per two luma rows) into 4-bytes-per-pixel output.
	 *
	 * Each output pixel is written as B, G, R, 0xFF. When HAS_NEON is defined, 16
	 * pixels per iteration are converted by the 32-bit ARM NEON asm block and only the
	 * leftover pixels of each row use the scalar code; otherwise every pixel goes
	 * through the scalar code.
	 *
	 * The scalar green channel previously computed Y - ((43*U' - 91*V') >> 7), which
	 * ADDED the V term; it now subtracts both terms like the NEON path does.
	 *
	 * @param yuv   Source image, w*h + w*(h/2) bytes.
	 * @param w     Image width in pixels.
	 * @param h     Image height in pixels.
	 * @param rgba  Destination, one 32-bit element per pixel (w*h elements).
	 */
	void convertToRGBA(unsigned char* yuv, int w, int h, int* rgba)
	{
	    for (int i=0; i<h; ++i)
	    {
	        unsigned char* dst = (unsigned char*)(rgba + w*i);
	        unsigned char* y = yuv + w*i;
	        unsigned char* uv = yuv + w*h + w*(i/2);
	        int count = w;
	#ifdef HAS_NEON
	        /* Process 16 pixels at a time */
	        int c = count/16;
	        asm volatile(
	                     "movs r4, %[c]\t\n"
	                     "beq 2f\t\n"
	                     "vmov.u8 d7, #255\t\n"//Alpha
	                     "vmov.u8 d3, #255\t\n"//Alpha
	                     "vmov.s16 q11, #90\t\n"
	                     "vmov.s16 q12, #128\t\n"
	                     "vmov.s16 q13, #21\t\n"
	                     "vmov.s16 q14, #46\t\n"
	                     "vmov.s16 q15, #113\t\n"
	                     "1:\t\n"
	                     /* Y1 and Y2 are two de-interleaved groups of luma samples that line up one-to-one with the UV values */
	                     "vld2.8 {d8, d9}, [%[y]]!\t\n"//Y1, Y2
	                     /* De-interleaving load of the UV values */
	                     "vld2.8 {d0, d1}, [%[uv]]!\t\n"//u, v
	                     "vmovl.u8 q5, d0\t\n"
	                     "vmovl.u8 q6, d1\t\n"
	                     "vsub.i16 q5,q5, q12\t\n"//U
	                     "vsub.i16 q6,q6, q12\t\n"//V
	                     //First RGBA
	                     "vshll.u8 q7, d8, #6\t\n"
	                     "vshll.u8 q8, d8, #6\t\n"
	                     "vshll.u8 q9, d8, #6\t\n"
	                     "vmla.i16 q7, q6, q11\t\n"
	                     "vmls.i16 q8, q5, q13\t\n"
	                     "vmls.i16 q8, q6, q14\t\n"
	                     "vmla.i16 q9, q5, q15\t\n"

	                     "vshr.s16 q7, q7, #6\t\n"
	                     "vshr.s16 q8, q8, #6\t\n"
	                     "vshr.s16 q9, q9, #6\t\n"
	                     "vmov.s16 q10, #0\t\n"
	                     "vmax.s16 q7, q7, q10\t\n"
	                     "vmax.s16 q8, q8, q10\t\n"
	                     "vmax.s16 q9, q9, q10\t\n"
	                     "vmov.u16 q10, #255\t\n"
	                     "vmin.u16 q7, q7, q10\t\n"
	                     "vmin.u16 q8, q8, q10\t\n"
	                     "vmin.u16 q9, q9, q10\t\n"
	                     "vmovn.s16 d2, q7\t\n"
	                     "vmovn.s16 d1, q8\t\n"
	                     "vmovn.s16 d0, q9\t\n"

	                     //Second RGBA
	                     "vshll.u8 q7, d9, #6\t\n"
	                     "vshll.u8 q8, d9, #6\t\n"
	                     "vshll.u8 q9, d9, #6\t\n"
	                     "vmla.i16 q7, q6, q11\t\n"
	                     "vmls.i16 q8, q5, q13\t\n"
	                     "vmls.i16 q8, q6, q14\t\n"
	                     "vmla.i16 q9, q5, q15\t\n"
	                     "vshr.s16 q7, q7, #6\t\n"
	                     "vshr.s16 q8, q8, #6\t\n"
	                     "vshr.s16 q9, q9, #6\t\n"
	                     "vmov.s16 q10, #0\t\n"
	                     "vmax.s16 q7, q7, q10\t\n"
	                     "vmax.s16 q8, q8, q10\t\n"
	                     "vmax.s16 q9, q9, q10\t\n"
	                     "vmov.u16 q10, #255\t\n"
	                     "vmin.u16 q7, q7, q10\t\n"
	                     "vmin.u16 q8, q8, q10\t\n"
	                     "vmin.u16 q9, q9, q10\t\n"
	                     "vmovn.s16 d6, q7\t\n"
	                     "vmovn.s16 d5, q8\t\n"
	                     "vmovn.s16 d4, q9\t\n"
	                     /* The two groups of RGB components obtained so far are still de-interleaved,
	                      * for example:
	                      * d0 : (g0 g2 g4 g6 g8 g10 g12 g14)
	                      * d4 : (g1 g3 g5 g7 g9 g11 g13 g15)
	                      * They have to be interleaved into the following before storing:
	                      * d0 : (g0 g1 g2 g3 g4 g5 g6 g7)
	                      * d4 : (g8 g9 g10 g11 g12 g13 g14 g15)
	                      */

	                     "vtrn.8 d2,d6\t\n"
	                     "vtrn.16 d2,d6\t\n"
	                     "vtrn.32 d2,d6\t\n"

	                     "vtrn.8 d1,d5\t\n"
	                     "vtrn.16 d1,d5\t\n"
	                     "vtrn.32 d1,d5\t\n"

	                     "vtrn.8 d0,d4\t\n"
	                     "vtrn.16 d0,d4\t\n"
	                     "vtrn.32 d0,d4\t\n"

	                     "vst4.8 {d0-d3}, [%[dst]]!\t\n"
	                     "vst4.8 {d4-d7}, [%[dst]]!\t\n"

	                     "subs r4, r4, #1\t\n"
	                     "bne 1b\t\n"
	                     "2:\t\n"
	                     : [dst] "+r" (dst), [y] "+r" (y), [uv] "+r" (uv), [c] "+r" (c)
	                     :
	                     : "r4", "cc","memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29","d30","d31"
	                     );
	        count%=16;
	#endif

	        /* Handle the leftover pixels */
	        while (count > 1)
	        {
	            // Two horizontally adjacent pixels share one chroma pair.
	            const unsigned char _u = uv[0];
	            const unsigned char _v = uv[1];
	            convertToRGBA_store_pixel(y[0], _u, _v, dst);
	            convertToRGBA_store_pixel(y[1], _u, _v, dst + 4);
	            y += 2;
	            uv += 2;
	            dst += 8;
	            count -= 2;
	        }
	        if (count > 0)
	        {
	            // Odd width: the final pixel uses the next chroma pair on its own.
	            convertToRGBA_store_pixel(y[0], uv[0], uv[1], dst);
	        }
	    }
	}

	@end