Skip to content

Instantly share code, notes, and snippets.

@sujankh
Created March 1, 2020 02:46

Revisions

  1. sujankh created this gist Mar 1, 2020.
    250 changes: 250 additions & 0 deletions skhadka.patch
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,250 @@
    From 2bec09b4eb46dd823ec93f699eaa2b3acc73ad30 Mon Sep 17 00:00:00 2001
    From: Sujan Khadka <skhadka@uber.com>
    Date: Mon, 29 Apr 2019 00:39:26 -0400
    Subject: [PATCH] Lookup tables GET_BIT instead of conditionals

    ---
    MultiSource/Benchmarks/7zip/C/LzmaDec.c | 108 ++++++++++++++++++++----
    1 file changed, 91 insertions(+), 17 deletions(-)

    diff --git a/MultiSource/Benchmarks/7zip/C/LzmaDec.c b/MultiSource/Benchmarks/7zip/C/LzmaDec.c
    index 2036761b..3f808383 100644
    --- a/MultiSource/Benchmarks/7zip/C/LzmaDec.c
    +++ b/MultiSource/Benchmarks/7zip/C/LzmaDec.c
    @@ -22,7 +22,38 @@
    #define GET_BIT2(p, i, A0, A1) IF_BIT_0(p) \
    { UPDATE_0(p); i = (i + i); A0; } else \
    { UPDATE_1(p); i = (i + i) + 1; A1; }
    -#define GET_BIT(p, i) GET_BIT2(p, i, ; , ;)
    +
    +/*
    +This macro is a re-write of GET_BIT2 but does not use conditional if(code<bound)
    +This condition is found to cause lots of branch mis-predictions
    +This condition is replaced into a table-lookup.
    +upd_var is the variable to be set to 0th index of upd_vals_table if code>=bound
    +upd_var is the variable to be set to 1th index of upd_vals_table if code<bound
    +*/
    +#define GET_BIT2_AVOID_BRANCH(p, i, upd_var, upd_vals_table) ttt = *(p); \
    + if (range < kTopValue) { range <<= 8; code = (code << 8) | (*buf++); } \
    + bound = (range >> kNumBitModelTotalBits) * ttt; \
    + { \
    + UInt32 code_table[2] = { code - bound, code}; \
    + UInt32 range_table[2] = { range - bound, bound}; \
    + unsigned prob_table[2] = { (ttt - (ttt >> kNumMoveBits)) , \
    + (ttt + ((kBitModelTotal - ttt) >> kNumMoveBits)) \
    + }; \
    + unsigned i_table[2] = { (i + i) + 1, (i + i) }; \
    + int cond = (code < bound); \
    + range = range_table[cond]; \
    + code = code_table[cond]; \
    + *(p) = (CLzmaProb) (prob_table[cond]); \
    + i = i_table[cond]; \
    + upd_var = upd_vals_table[cond]; \
    + }
    +
    +char DUMMY;
    +char DUMMY_ARRAY[2] = {'0', '1'};
    +
    +// GET_BIT defaults to calling AVOID_BRANCH given that almost everything that calls it is inside lots of loops
    +// No variable needs to be updated, so set it to some dummy array (although this might add up one more instruction)
    +#define GET_BIT(p, i) GET_BIT2_AVOID_BRANCH(p, i, DUMMY, DUMMY_ARRAY)

    #define TREE_GET_BIT(probs, i) { GET_BIT((probs + i), i); }
    #define TREE_DECODE(probs, limit, i) \
    @@ -44,6 +75,28 @@
    i -= 0x40; }
    #endif

    +/*
    +This macro is a re-write of GET_BIT2_CHECK but does not use conditional if(code<bound)
    +This condition is found to cause lots of branch mis-predictions
    +This condition is replaced into a table-lookup.
    +upd_var is the variable to be set to 0th index of upd_vals_table if code>=bound
    +upd_var is the variable to be set to 1th index of upd_vals_table if code<bound
    +*/
    +
    +#define GET_BIT2_CHECK_AVOID_BRANCH(p, i, upd_var, upd_vals_table) ttt = *(p); \
    + if (range < kTopValue) { if (buf >= bufLimit) return DUMMY_ERROR; range <<= 8; code = (code << 8) | (*buf++); } \
    + bound = (range >> kNumBitModelTotalBits) * ttt; \
    + UInt32 code_table[2] = { code - bound, code}; \
    + UInt32 range_table[2] = { range - bound, bound}; \
    + unsigned symbol_table[2] = { (i + i) + 1, (i + i) }; \
    + int cond = (code < bound); \
    + range = range_table[cond]; \
    + code = code_table[cond]; \
    + i = symbol_table[cond]; \
    + upd_var = upd_vals_table[cond];
    +
    +#define GET_BIT_CHECK(p, i) GET_BIT2_CHECK_AVOID_BRANCH(p, i, DUMMY, DUMMY_ARRAY)
    +
    #define NORMALIZE_CHECK if (range < kTopValue) { if (buf >= bufLimit) return DUMMY_ERROR; range <<= 8; code = (code << 8) | (*buf++); }

    #define IF_BIT_0_CHECK(p) ttt = *(p); NORMALIZE_CHECK; bound = (range >> kNumBitModelTotalBits) * ttt; if (code < bound)
    @@ -52,7 +105,7 @@
    #define GET_BIT2_CHECK(p, i, A0, A1) IF_BIT_0_CHECK(p) \
    { UPDATE_0_CHECK; i = (i + i); A0; } else \
    { UPDATE_1_CHECK; i = (i + i) + 1; A1; }
    -#define GET_BIT_CHECK(p, i) GET_BIT2_CHECK(p, i, ; , ;)
    +
    #define TREE_DECODE_CHECK(probs, limit, i) \
    { i = 1; do { GET_BIT_CHECK(probs + i, i) } while (i < limit); i -= limit; }

    @@ -128,6 +181,7 @@ Out:
    = kMatchSpecLenStart + 2 : State Init Marker
    */

    +__attribute__((noinline))
    static int MY_FAST_CALL LzmaDec_DecodeReal(CLzmaDec *p, SizeT limit, const Byte *bufLimit)
    {
    CLzmaProb *probs = p->probs;
    @@ -141,7 +195,7 @@ static int MY_FAST_CALL LzmaDec_DecodeReal(CLzmaDec *p, SizeT limit, const Byte
    Byte *dic = p->dic;
    SizeT dicBufSize = p->dicBufSize;
    SizeT dicPos = p->dicPos;
    -
    +
    UInt32 processedPos = p->processedPos;
    UInt32 checkDicSize = p->checkDicSize;
    unsigned len = 0;
    @@ -179,6 +233,7 @@ static int MY_FAST_CALL LzmaDec_DecodeReal(CLzmaDec *p, SizeT limit, const Byte
    unsigned offs = 0x100;
    state -= (state < 10) ? 3 : 6;
    symbol = 1;
    + unsigned offs_arr[2];
    do
    {
    unsigned bit;
    @@ -186,7 +241,9 @@ static int MY_FAST_CALL LzmaDec_DecodeReal(CLzmaDec *p, SizeT limit, const Byte
    matchByte <<= 1;
    bit = (matchByte & offs);
    probLit = prob + offs + bit + symbol;
    - GET_BIT2(probLit, symbol, offs &= ~bit, offs &= bit)
    + offs_arr[0] = (offs & bit); // False
    + offs_arr[1] = (offs & ~bit); // True
    + GET_BIT2_AVOID_BRANCH(probLit, symbol, offs, offs_arr)
    }
    while (symbol < 0x100);
    }
    @@ -309,10 +366,14 @@ static int MY_FAST_CALL LzmaDec_DecodeReal(CLzmaDec *p, SizeT limit, const Byte
    {
    UInt32 mask = 1;
    unsigned i = 1;
    + UInt32 distance_arr[2];
    do
    {
    - GET_BIT2(prob + i, i, ; , distance |= mask);
    - mask <<= 1;
    + distance_arr[0] = (distance | mask); //False
    + distance_arr[1] = distance; //True = no update
    +
    + GET_BIT2_AVOID_BRANCH(prob + i, i, distance , distance_arr);
    + mask <<= 1;
    }
    while (--numDirectBits != 0);
    }
    @@ -324,7 +385,7 @@ static int MY_FAST_CALL LzmaDec_DecodeReal(CLzmaDec *p, SizeT limit, const Byte
    {
    NORMALIZE
    range >>= 1;
    -
    +
    {
    UInt32 t;
    code -= range;
    @@ -344,12 +405,20 @@ static int MY_FAST_CALL LzmaDec_DecodeReal(CLzmaDec *p, SizeT limit, const Byte
    while (--numDirectBits != 0);
    prob = probs + Align;
    distance <<= kNumAlignBits;
    + UInt32 distance_arr[2];
    {
    unsigned i = 1;
    - GET_BIT2(prob + i, i, ; , distance |= 1);
    - GET_BIT2(prob + i, i, ; , distance |= 2);
    - GET_BIT2(prob + i, i, ; , distance |= 4);
    - GET_BIT2(prob + i, i, ; , distance |= 8);
    + distance_arr[0] = (distance | 1); distance_arr[1] = distance;
    + GET_BIT2_AVOID_BRANCH(prob + i, i, distance , distance_arr);
    +
    + distance_arr[0] = (distance | 2); distance_arr[1] = distance;
    + GET_BIT2_AVOID_BRANCH(prob + i, i, distance , distance_arr);
    +
    + distance_arr[0] = (distance | 4); distance_arr[1] = distance;
    + GET_BIT2_AVOID_BRANCH(prob + i, i, distance , distance_arr);
    +
    + distance_arr[0] = (distance | 8); distance_arr[1] = distance;
    + GET_BIT2_AVOID_BRANCH(prob + i, i, distance , distance_arr);
    }
    if (distance == (UInt32)0xFFFFFFFF)
    {
    @@ -451,6 +520,7 @@ static void MY_FAST_CALL LzmaDec_WriteRem(CLzmaDec *p, SizeT limit)
    }
    }

    +__attribute__((noinline))
    static int MY_FAST_CALL LzmaDec_DecodeReal2(CLzmaDec *p, SizeT limit, const Byte *bufLimit)
    {
    do
    @@ -523,6 +593,7 @@ static ELzmaDummy LzmaDec_TryDummy(const CLzmaDec *p, const Byte *buf, SizeT inS
    ((p->dicPos < p->reps[0]) ? p->dicBufSize : 0)];
    unsigned offs = 0x100;
    unsigned symbol = 1;
    + unsigned offs_selection[2];
    do
    {
    unsigned bit;
    @@ -530,7 +601,10 @@ static ELzmaDummy LzmaDec_TryDummy(const CLzmaDec *p, const Byte *buf, SizeT inS
    matchByte <<= 1;
    bit = (matchByte & offs);
    probLit = prob + offs + bit + symbol;
    - GET_BIT2_CHECK(probLit, symbol, offs &= ~bit, offs &= bit)
    +
    + offs_selection[0] = (offs & bit); //False
    + offs_selection[1] = (offs & ~bit); //True
    + GET_BIT2_CHECK_AVOID_BRANCH(probLit, symbol, offs, offs_selection)
    }
    while (symbol < 0x100);
    }
    @@ -722,7 +796,7 @@ SRes LzmaDec_DecodeToDic(CLzmaDec *p, SizeT dicLimit, const Byte *src, SizeT *sr
    SizeT inSize = *srcLen;
    (*srcLen) = 0;
    LzmaDec_WriteRem(p, dicLimit);
    -
    +
    *status = LZMA_STATUS_NOT_SPECIFIED;

    while (p->remainLen != kMatchSpecLenStart)
    @@ -768,7 +842,7 @@ SRes LzmaDec_DecodeToDic(CLzmaDec *p, SizeT dicLimit, const Byte *src, SizeT *sr

    if (p->needInitState)
    LzmaDec_InitStateReal(p);
    -
    +
    if (p->tempBufSize == 0)
    {
    SizeT processed;
    @@ -899,12 +973,12 @@ SRes LzmaProps_Decode(CLzmaProps *p, const Byte *data, unsigned size)
    {
    UInt32 dicSize;
    Byte d;
    -
    +
    if (size < LZMA_PROPS_SIZE)
    return SZ_ERROR_UNSUPPORTED;
    else
    dicSize = data[1] | ((UInt32)data[2] << 8) | ((UInt32)data[3] << 16) | ((UInt32)data[4] << 24);
    -
    +
    if (dicSize < LZMA_DIC_MIN)
    dicSize = LZMA_DIC_MIN;
    p->dicSize = dicSize;
    @@ -986,7 +1060,7 @@ SRes LzmaDecode(Byte *dest, SizeT *destLen, const Byte *src, SizeT *srcLen,
    p.dicBufSize = outSize;

    LzmaDec_Init(&p);
    -
    +
    *srcLen = inSize;
    res = LzmaDec_DecodeToDic(&p, outSize, src, srcLen, finishMode, status);

    --
    2.17.1