Skip to content

Instantly share code, notes, and snippets.

@rygorous
Created December 16, 2014 16:25
Embed
What would you like to do?
stb_image SSE IDCT sneak preview
// dot product constant: even elems=x, odd elems=y
// Builds a vector of four (x,y) pairs so that _mm_madd_epi16 against an
// interleaved (a,b) input computes a*x + b*y in each 32-bit lane.
#define dpconst(x,y) _mm_setr_epi16((x),(y),(x),(y),(x),(y),(x),(y))
// out(0) = c0[even]*x + c0[odd]*y (c0, x, y 16-bit, out 32-bit)
// out(1) = c1[even]*x + c1[odd]*y
// "Rotation" step: interleave x/y once, then two pmaddwd dot products per
// output against the constant pairs in c0/c1. Declares four __m128i locals
// named out0##_l/_h and out1##_l/_h (low/high 4 lanes, widened to 32-bit);
// the interleave temporaries borrow c0's name (c0##lo/c0##hi) so each
// expansion gets unique identifiers.
#define dct_rot(out0,out1, x,y,c0,c1) \
__m128i c0##lo = _mm_unpacklo_epi16((x),(y)); \
__m128i c0##hi = _mm_unpackhi_epi16((x),(y)); \
__m128i out0##_l = _mm_madd_epi16(c0##lo, c0); \
__m128i out0##_h = _mm_madd_epi16(c0##hi, c0); \
__m128i out1##_l = _mm_madd_epi16(c0##lo, c1); \
__m128i out1##_h = _mm_madd_epi16(c0##hi, c1)
// out = in << 12 (in 16-bit, out 32-bit)
// Computed as (in << 16) >> 4: unpacking with zeros in the LOW half of each
// pair places "in" in the high 16 bits of every 32-bit lane (i.e. in<<16),
// and the arithmetic right shift by 4 gives a sign-correct in<<12.
// Declares out##_l/out##_h (low/high 4 lanes).
#define dct_widen(out, in) \
__m128i out##_l = _mm_srai_epi32(_mm_unpacklo_epi16(_mm_setzero_si128(), (in)), 4); \
__m128i out##_h = _mm_srai_epi32(_mm_unpackhi_epi16(_mm_setzero_si128(), (in)), 4)
// wide add
// out##_l/_h = a + b on the 32-bit low/high halves produced by
// dct_rot/dct_widen.
#define dct_wadd(out, a, b) \
__m128i out##_l = _mm_add_epi32(a##_l, b##_l); \
__m128i out##_h = _mm_add_epi32(a##_h, b##_h)
// wide sub
// out##_l/_h = a - b on the 32-bit low/high halves produced by
// dct_rot/dct_widen.
#define dct_wsub(out, a, b) \
__m128i out##_l = _mm_sub_epi32(a##_l, b##_l); \
__m128i out##_h = _mm_sub_epi32(a##_h, b##_h)
// butterfly a/b, add bias, then shift by "s" and pack
// out0 = sat16((a + bias + b) >> s), out1 = sat16((a + bias - b) >> s),
// computed per 32-bit lane on the _l/_h halves, then repacked to eight
// 16-bit values with signed saturation (_mm_packs_epi32). The braces keep
// the temporaries (abiased_*, sum_*, dif_* from dct_wadd/dct_wsub) local
// to each expansion.
#define dct_bfly32o(out0, out1, a,b,bias,s) \
{ \
__m128i abiased_l = _mm_add_epi32(a##_l, bias); \
__m128i abiased_h = _mm_add_epi32(a##_h, bias); \
dct_wadd(sum, abiased, b); \
dct_wsub(dif, abiased, b); \
out0 = _mm_packs_epi32(_mm_srai_epi32(sum_l, s), _mm_srai_epi32(sum_h, s)); \
out1 = _mm_packs_epi32(_mm_srai_epi32(dif_l, s), _mm_srai_epi32(dif_h, s)); \
}
// 8-bit interleave step (for transposes)
// out0 = interleaved low bytes of in0/in1, out1 = interleaved high bytes.
#define dct_interleave8(out0,out1, in0,in1) \
out0 = _mm_unpacklo_epi8(in0, in1); \
out1 = _mm_unpackhi_epi8(in0, in1)
// 16-bit interleave step (for transposes)
// out0 = interleaved low words of in0/in1, out1 = interleaved high words.
#define dct_interleave16(out0,out1, in0,in1) \
out0 = _mm_unpacklo_epi16(in0, in1); \
out1 = _mm_unpackhi_epi16(in0, in1)
#define dct_pass(row,bias,shift) \
{ \
/* even part */ \
dct_rot(t2e,t3e, row[2],row[6], rot0_0,rot0_1); \
__m128i sum04 = _mm_add_epi16(row[0], row[4]); \
__m128i dif04 = _mm_sub_epi16(row[0], row[4]); \
dct_widen(t0e, sum04); \
dct_widen(t1e, dif04); \
dct_wadd(x0, t0e, t3e); \
dct_wsub(x3, t0e, t3e); \
dct_wadd(x1, t1e, t2e); \
dct_wsub(x2, t1e, t2e); \
/* odd part */ \
__m128i row7 = row[7]; \
__m128i row5 = row[5]; \
__m128i row3 = row[3]; \
__m128i row1 = row[1]; \
dct_rot(y0o,y2o, row7,row3, rot2_0,rot2_1); \
dct_rot(y1o,y3o, row5,row1, rot3_0,rot3_1); \
__m128i sum17 = _mm_add_epi16(row1, row7); \
__m128i sum35 = _mm_add_epi16(row3, row5); \
dct_rot(y4o,y5o, sum17,sum35, rot1_0,rot1_1); \
dct_wadd(x4, y0o, y4o); \
dct_wadd(x5, y1o, y5o); \
dct_wadd(x6, y2o, y5o); \
dct_wadd(x7, y3o, y4o); \
dct_bfly32o(row[0],row[7], x0,x7,bias,shift); \
dct_bfly32o(row[1],row[6], x1,x6,bias,shift); \
dct_bfly32o(row[2],row[5], x2,x5,bias,shift); \
dct_bfly32o(row[3],row[4], x3,x4,bias,shift); \
}
// 2-D 8x8 inverse DCT in SSE2: column pass, 16-bit transpose, row pass,
// then pack to unsigned 8-bit and transpose back while storing. Reads 64
// 16-bit coefficients from "data" and writes an 8x8 block of samples to
// "out", one row every "out_stride" bytes.
// NOTE(review): _mm_load_si128 requires "data" to be 16-byte aligned --
// confirm the caller guarantees this.
static void my_IDCT(stbi_uc *out, int out_stride, short data[64], unsigned short *dequantize)
{
// This is constructed to match the IJG slow IDCT exactly.
// ("dequantize" ignored for now since caller always passes all-1s.)
// Rotation constants: each dpconst pairs the even/odd multipliers that
// dct_rot dots against an interleaved input. Sums like c0+c1 fold the
// shared-term trick of the IJG factorization into single pmaddwd constants.
__m128i rot0_0 = dpconst(stbi__f2f(0.5411961f), stbi__f2f(0.5411961f) + stbi__f2f(-1.847759065f));
__m128i rot0_1 = dpconst(stbi__f2f(0.5411961f) + stbi__f2f( 0.765366865f), stbi__f2f(0.5411961f));
__m128i rot1_0 = dpconst(stbi__f2f(1.175875602f) + stbi__f2f(-0.899976223f), stbi__f2f(1.175875602f));
__m128i rot1_1 = dpconst(stbi__f2f(1.175875602f), stbi__f2f(1.175875602f) + stbi__f2f(-2.562915447f));
__m128i rot2_0 = dpconst(stbi__f2f(-1.961570560f) + stbi__f2f( 0.298631336f), stbi__f2f(-1.961570560f));
__m128i rot2_1 = dpconst(stbi__f2f(-1.961570560f), stbi__f2f(-1.961570560f) + stbi__f2f( 3.072711026f));
__m128i rot3_0 = dpconst(stbi__f2f(-0.390180644f) + stbi__f2f( 2.053119869f), stbi__f2f(-0.390180644f));
__m128i rot3_1 = dpconst(stbi__f2f(-0.390180644f), stbi__f2f(-0.390180644f) + stbi__f2f( 1.501321110f));
// Rounding biases in column/row passes.
// See stbi__idct_block for explanation.
// bias_0 = 1<<9: rounds the >>10 at the end of the column pass.
// bias_1 = 1<<16 (rounds the >>17) plus 128<<17, which adds the +128
// level shift after the >>17 so the unsigned pack below saturates correctly.
__m128i bias_0 = _mm_set1_epi32(512);
__m128i bias_1 = _mm_set1_epi32(65536 + (128<<17));
__m128i row[8];
int i;
// load
for (i = 0; i < 8; i++)
row[i] = _mm_load_si128((const __m128i *) (data + i*8));
// column pass
dct_pass(row, bias_0, 10);
{
// 16bit 8x8 transpose pass 1
// Three rounds of 16-bit interleaves implement a full 8x8 word transpose.
dct_interleave16(__m128i a0,__m128i a4, row[0],row[4]);
dct_interleave16(__m128i a1,__m128i a5, row[1],row[5]);
dct_interleave16(__m128i a2,__m128i a6, row[2],row[6]);
dct_interleave16(__m128i a3,__m128i a7, row[3],row[7]);
// transpose pass 2
dct_interleave16(__m128i b0,__m128i b2, a0,a2);
dct_interleave16(__m128i b1,__m128i b3, a1,a3);
dct_interleave16(__m128i b4,__m128i b6, a4,a6);
dct_interleave16(__m128i b5,__m128i b7, a5,a7);
// transpose pass 3
dct_interleave16(row[0],row[1], b0,b1);
dct_interleave16(row[2],row[3], b2,b3);
dct_interleave16(row[4],row[5], b4,b5);
dct_interleave16(row[6],row[7], b6,b7);
}
// row pass
dct_pass(row, bias_1, 17);
{
// pack
// _mm_packus_epi16 clamps each 16-bit value to [0,255]; the +128 level
// shift is already folded into bias_1 above.
__m128i a0 = _mm_packus_epi16(row[0], row[1]); // a0a1a2a3...a7b0b1b2b3...b7
__m128i a1 = _mm_packus_epi16(row[2], row[3]);
__m128i a2 = _mm_packus_epi16(row[4], row[5]);
__m128i a3 = _mm_packus_epi16(row[6], row[7]);
// 8bit 8x8 transpose pass 1
dct_interleave8(__m128i b0,__m128i b2, a0,a2); // a0e0a1e1...
dct_interleave8(__m128i b1,__m128i b3, a1,a3); // c0g0c1g1...
// transpose pass 2
dct_interleave8(__m128i c0,__m128i c1, b0,b1); // a0c0e0g0...
dct_interleave8(__m128i c2,__m128i c3, b2,b3); // b0d0f0h0...
// transpose pass 3
dct_interleave8(__m128i d0,__m128i d2, c0,c2); // a0b0c0d0...
dct_interleave8(__m128i d1,__m128i d3, c1,c3); // a4b4c4d4...
// store
// After the transpose, d0 holds output rows 0/1 (low/high 64 bits),
// d2 rows 2/3, d1 rows 4/5, d3 rows 6/7 -- hence the d0,d2,d1,d3 store
// order. _mm_shuffle_epi32(x, 0x4e) swaps the two 64-bit halves to
// reach the odd row of each pair.
_mm_storel_epi64((__m128i *) out, d0); out += out_stride;
_mm_storel_epi64((__m128i *) out, _mm_shuffle_epi32(d0, 0x4e)); out += out_stride;
_mm_storel_epi64((__m128i *) out, d2); out += out_stride;
_mm_storel_epi64((__m128i *) out, _mm_shuffle_epi32(d2, 0x4e)); out += out_stride;
_mm_storel_epi64((__m128i *) out, d1); out += out_stride;
_mm_storel_epi64((__m128i *) out, _mm_shuffle_epi32(d1, 0x4e)); out += out_stride;
_mm_storel_epi64((__m128i *) out, d3); out += out_stride;
// last row: no stride advance needed
_mm_storel_epi64((__m128i *) out, _mm_shuffle_epi32(d3, 0x4e));
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment