stb_image SSE IDCT sneak preview
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| // dot product constant: even elems=x, odd elems=y | |
| #define dpconst(x,y) _mm_setr_epi16((x),(y),(x),(y),(x),(y),(x),(y)) | |
| // out(0) = c0[even]*x + c0[odd]*y (c0, x, y 16-bit, out 32-bit) | |
| // out(1) = c1[even]*x + c1[odd]*y | |
| #define dct_rot(out0,out1, x,y,c0,c1) \ | |
| __m128i c0##lo = _mm_unpacklo_epi16((x),(y)); \ | |
| __m128i c0##hi = _mm_unpackhi_epi16((x),(y)); \ | |
| __m128i out0##_l = _mm_madd_epi16(c0##lo, c0); \ | |
| __m128i out0##_h = _mm_madd_epi16(c0##hi, c0); \ | |
| __m128i out1##_l = _mm_madd_epi16(c0##lo, c1); \ | |
| __m128i out1##_h = _mm_madd_epi16(c0##hi, c1) | |
| // out = in << 12 (in 16-bit, out 32-bit) | |
| #define dct_widen(out, in) \ | |
| __m128i out##_l = _mm_srai_epi32(_mm_unpacklo_epi16(_mm_setzero_si128(), (in)), 4); \ | |
| __m128i out##_h = _mm_srai_epi32(_mm_unpackhi_epi16(_mm_setzero_si128(), (in)), 4) | |
| // wide add | |
| #define dct_wadd(out, a, b) \ | |
| __m128i out##_l = _mm_add_epi32(a##_l, b##_l); \ | |
| __m128i out##_h = _mm_add_epi32(a##_h, b##_h) | |
| // wide sub | |
| #define dct_wsub(out, a, b) \ | |
| __m128i out##_l = _mm_sub_epi32(a##_l, b##_l); \ | |
| __m128i out##_h = _mm_sub_epi32(a##_h, b##_h) | |
| // butterfly a/b, add bias, then shift by "s" and pack | |
| #define dct_bfly32o(out0, out1, a,b,bias,s) \ | |
| { \ | |
| __m128i abiased_l = _mm_add_epi32(a##_l, bias); \ | |
| __m128i abiased_h = _mm_add_epi32(a##_h, bias); \ | |
| dct_wadd(sum, abiased, b); \ | |
| dct_wsub(dif, abiased, b); \ | |
| out0 = _mm_packs_epi32(_mm_srai_epi32(sum_l, s), _mm_srai_epi32(sum_h, s)); \ | |
| out1 = _mm_packs_epi32(_mm_srai_epi32(dif_l, s), _mm_srai_epi32(dif_h, s)); \ | |
| } | |
| // 8-bit interleave step (for transposes) | |
| #define dct_interleave8(out0,out1, in0,in1) \ | |
| out0 = _mm_unpacklo_epi8(in0, in1); \ | |
| out1 = _mm_unpackhi_epi8(in0, in1) | |
| // 16-bit interleave step (for transposes) | |
| #define dct_interleave16(out0,out1, in0,in1) \ | |
| out0 = _mm_unpacklo_epi16(in0, in1); \ | |
| out1 = _mm_unpackhi_epi16(in0, in1) | |
| #define dct_pass(row,bias,shift) \ | |
| { \ | |
| /* even part */ \ | |
| dct_rot(t2e,t3e, row[2],row[6], rot0_0,rot0_1); \ | |
| __m128i sum04 = _mm_add_epi16(row[0], row[4]); \ | |
| __m128i dif04 = _mm_sub_epi16(row[0], row[4]); \ | |
| dct_widen(t0e, sum04); \ | |
| dct_widen(t1e, dif04); \ | |
| dct_wadd(x0, t0e, t3e); \ | |
| dct_wsub(x3, t0e, t3e); \ | |
| dct_wadd(x1, t1e, t2e); \ | |
| dct_wsub(x2, t1e, t2e); \ | |
| /* odd part */ \ | |
| __m128i row7 = row[7]; \ | |
| __m128i row5 = row[5]; \ | |
| __m128i row3 = row[3]; \ | |
| __m128i row1 = row[1]; \ | |
| dct_rot(y0o,y2o, row7,row3, rot2_0,rot2_1); \ | |
| dct_rot(y1o,y3o, row5,row1, rot3_0,rot3_1); \ | |
| __m128i sum17 = _mm_add_epi16(row1, row7); \ | |
| __m128i sum35 = _mm_add_epi16(row3, row5); \ | |
| dct_rot(y4o,y5o, sum17,sum35, rot1_0,rot1_1); \ | |
| dct_wadd(x4, y0o, y4o); \ | |
| dct_wadd(x5, y1o, y5o); \ | |
| dct_wadd(x6, y2o, y5o); \ | |
| dct_wadd(x7, y3o, y4o); \ | |
| dct_bfly32o(row[0],row[7], x0,x7,bias,shift); \ | |
| dct_bfly32o(row[1],row[6], x1,x6,bias,shift); \ | |
| dct_bfly32o(row[2],row[5], x2,x5,bias,shift); \ | |
| dct_bfly32o(row[3],row[4], x3,x4,bias,shift); \ | |
| } | |
| static void my_IDCT(stbi_uc *out, int out_stride, short data[64], unsigned short *dequantize) | |
| { | |
| // This is constructed to match the IJG slow IDCT exactly. | |
| // ("dequantize" ignored for now since caller always passes all-1s.) | |
| __m128i rot0_0 = dpconst(stbi__f2f(0.5411961f), stbi__f2f(0.5411961f) + stbi__f2f(-1.847759065f)); | |
| __m128i rot0_1 = dpconst(stbi__f2f(0.5411961f) + stbi__f2f( 0.765366865f), stbi__f2f(0.5411961f)); | |
| __m128i rot1_0 = dpconst(stbi__f2f(1.175875602f) + stbi__f2f(-0.899976223f), stbi__f2f(1.175875602f)); | |
| __m128i rot1_1 = dpconst(stbi__f2f(1.175875602f), stbi__f2f(1.175875602f) + stbi__f2f(-2.562915447f)); | |
| __m128i rot2_0 = dpconst(stbi__f2f(-1.961570560f) + stbi__f2f( 0.298631336f), stbi__f2f(-1.961570560f)); | |
| __m128i rot2_1 = dpconst(stbi__f2f(-1.961570560f), stbi__f2f(-1.961570560f) + stbi__f2f( 3.072711026f)); | |
| __m128i rot3_0 = dpconst(stbi__f2f(-0.390180644f) + stbi__f2f( 2.053119869f), stbi__f2f(-0.390180644f)); | |
| __m128i rot3_1 = dpconst(stbi__f2f(-0.390180644f), stbi__f2f(-0.390180644f) + stbi__f2f( 1.501321110f)); | |
| // Rounding biases in column/row passes. | |
| // See stbi__idct_block for explanation. | |
| __m128i bias_0 = _mm_set1_epi32(512); | |
| __m128i bias_1 = _mm_set1_epi32(65536 + (128<<17)); | |
| __m128i row[8]; | |
| int i; | |
| // load | |
| for (i = 0; i < 8; i++) | |
| row[i] = _mm_load_si128((const __m128i *) (data + i*8)); | |
| // column pass | |
| dct_pass(row, bias_0, 10); | |
| { | |
| // 16bit 8x8 transpose pass 1 | |
| dct_interleave16(__m128i a0,__m128i a4, row[0],row[4]); | |
| dct_interleave16(__m128i a1,__m128i a5, row[1],row[5]); | |
| dct_interleave16(__m128i a2,__m128i a6, row[2],row[6]); | |
| dct_interleave16(__m128i a3,__m128i a7, row[3],row[7]); | |
| // transpose pass 2 | |
| dct_interleave16(__m128i b0,__m128i b2, a0,a2); | |
| dct_interleave16(__m128i b1,__m128i b3, a1,a3); | |
| dct_interleave16(__m128i b4,__m128i b6, a4,a6); | |
| dct_interleave16(__m128i b5,__m128i b7, a5,a7); | |
| // transpose pass 3 | |
| dct_interleave16(row[0],row[1], b0,b1); | |
| dct_interleave16(row[2],row[3], b2,b3); | |
| dct_interleave16(row[4],row[5], b4,b5); | |
| dct_interleave16(row[6],row[7], b6,b7); | |
| } | |
| // row pass | |
| dct_pass(row, bias_1, 17); | |
| { | |
| // pack | |
| __m128i a0 = _mm_packus_epi16(row[0], row[1]); // a0a1a2a3...a7b0b1b2b3...b7 | |
| __m128i a1 = _mm_packus_epi16(row[2], row[3]); | |
| __m128i a2 = _mm_packus_epi16(row[4], row[5]); | |
| __m128i a3 = _mm_packus_epi16(row[6], row[7]); | |
| // 8bit 8x8 transpose pass 1 | |
| dct_interleave8(__m128i b0,__m128i b2, a0,a2); // a0e0a1e1... | |
| dct_interleave8(__m128i b1,__m128i b3, a1,a3); // c0g0c1g1... | |
| // transpose pass 2 | |
| dct_interleave8(__m128i c0,__m128i c1, b0,b1); // a0c0e0g0... | |
| dct_interleave8(__m128i c2,__m128i c3, b2,b3); // b0d0f0h0... | |
| // transpose pass 3 | |
| dct_interleave8(__m128i d0,__m128i d2, c0,c2); // a0b0c0d0... | |
| dct_interleave8(__m128i d1,__m128i d3, c1,c3); // a4b4c4d4... | |
| // store | |
| _mm_storel_epi64((__m128i *) out, d0); out += out_stride; | |
| _mm_storel_epi64((__m128i *) out, _mm_shuffle_epi32(d0, 0x4e)); out += out_stride; | |
| _mm_storel_epi64((__m128i *) out, d2); out += out_stride; | |
| _mm_storel_epi64((__m128i *) out, _mm_shuffle_epi32(d2, 0x4e)); out += out_stride; | |
| _mm_storel_epi64((__m128i *) out, d1); out += out_stride; | |
| _mm_storel_epi64((__m128i *) out, _mm_shuffle_epi32(d1, 0x4e)); out += out_stride; | |
| _mm_storel_epi64((__m128i *) out, d3); out += out_stride; | |
| _mm_storel_epi64((__m128i *) out, _mm_shuffle_epi32(d3, 0x4e)); | |
| } | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment