Created
March 1, 2020 02:46
Revisions
-
sujankh created this gist
Mar 1, 2020 .There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -0,0 +1,250 @@ From 2bec09b4eb46dd823ec93f699eaa2b3acc73ad30 Mon Sep 17 00:00:00 2001 From: Sujan Khadka <skhadka@uber.com> Date: Mon, 29 Apr 2019 00:39:26 -0400 Subject: [PATCH] Lookup tables GET_BIT instead of conditionals --- MultiSource/Benchmarks/7zip/C/LzmaDec.c | 108 ++++++++++++++++++++---- 1 file changed, 91 insertions(+), 17 deletions(-) diff --git a/MultiSource/Benchmarks/7zip/C/LzmaDec.c b/MultiSource/Benchmarks/7zip/C/LzmaDec.c index 2036761b..3f808383 100644 --- a/MultiSource/Benchmarks/7zip/C/LzmaDec.c +++ b/MultiSource/Benchmarks/7zip/C/LzmaDec.c @@ -22,7 +22,38 @@ #define GET_BIT2(p, i, A0, A1) IF_BIT_0(p) \ { UPDATE_0(p); i = (i + i); A0; } else \ { UPDATE_1(p); i = (i + i) + 1; A1; } -#define GET_BIT(p, i) GET_BIT2(p, i, ; , ;) + +/* +This macro is a re-write of GET_BIT2 but does not use conditional if(code<bound) +This condition is found to cause lots of branch mis-predictions +This condition is replaced into a table-lookup. +upd_var is the variable to be set to 0th index of upd_vals_table if code>=bound +upd_var is the variable to be set to 1th index of upd_vals_table if code<bound +*/ +#define GET_BIT2_AVOID_BRANCH(p, i, upd_var, upd_vals_table) ttt = *(p); \ + if (range < kTopValue) { range <<= 8; code = (code << 8) | (*buf++); } \ + bound = (range >> kNumBitModelTotalBits) * ttt; \ + { \ + UInt32 code_table[2] = { code - bound, code}; \ + UInt32 range_table[2] = { range - bound, bound}; \ + unsigned prob_table[2] = { (ttt - (ttt >> kNumMoveBits)) , \ + (ttt + ((kBitModelTotal - ttt) >> kNumMoveBits)) \ + }; \ + unsigned i_table[2] = { (i + i) + 1, (i + i) }; \ + int cond = (code < bound); \ + range = range_table[cond]; \ + code = code_table[cond]; \ + *(p) = (CLzmaProb) (prob_table[cond]); \ + i = i_table[cond]; \ + upd_var = upd_vals_table[cond]; \ + } + +char DUMMY; +char DUMMY_ARRAY[2] = {'0', '1'}; + +// GET_BIT defaults to calling AVOID_BRANCH given that almost everything that calls it is inside lots of loops +// No variable needs to be updated, so set it to some dummy array (although this might add up one more instruction) +#define GET_BIT(p, i) GET_BIT2_AVOID_BRANCH(p, i, DUMMY, DUMMY_ARRAY) #define TREE_GET_BIT(probs, i) { GET_BIT((probs + i), i); } #define TREE_DECODE(probs, limit, i) \ @@ -44,6 +75,28 @@ i -= 0x40; } #endif +/* +This macro is a re-write of GET_BIT2_CHECK but does not use conditional if(code<bound) +This condition is found to cause lots of branch mis-predictions +This condition is replaced into a table-lookup. +upd_var is the variable to be set to 0th index of upd_vals_table if code>=bound +upd_var is the variable to be set to 1th index of upd_vals_table if code<bound +*/ + +#define GET_BIT2_CHECK_AVOID_BRANCH(p, i, upd_var, upd_vals_table) ttt = *(p); \ + if (range < kTopValue) { if (buf >= bufLimit) return DUMMY_ERROR; range <<= 8; code = (code << 8) | (*buf++); } \ + bound = (range >> kNumBitModelTotalBits) * ttt; \ + UInt32 code_table[2] = { code - bound, code}; \ + UInt32 range_table[2] = { range - bound, bound}; \ + unsigned symbol_table[2] = { (i + i) + 1, (i + i) }; \ + int cond = (code < bound); \ + range = range_table[cond]; \ + code = code_table[cond]; \ + i = symbol_table[cond]; \ + upd_var = upd_vals_table[cond]; + +#define GET_BIT_CHECK(p, i) GET_BIT2_CHECK_AVOID_BRANCH(p, i, DUMMY, DUMMY_ARRAY) + #define NORMALIZE_CHECK if (range < kTopValue) { if (buf >= bufLimit) return DUMMY_ERROR; range <<= 8; code = (code << 8) | (*buf++); } #define IF_BIT_0_CHECK(p) ttt = *(p); NORMALIZE_CHECK; bound = (range >> kNumBitModelTotalBits) * ttt; if (code < bound) @@ -52,7 +105,7 @@ #define GET_BIT2_CHECK(p, i, A0, A1) IF_BIT_0_CHECK(p) \ { UPDATE_0_CHECK; i = (i + i); A0; } else \ { UPDATE_1_CHECK; i = (i + i) + 1; A1; } -#define GET_BIT_CHECK(p, i) GET_BIT2_CHECK(p, i, ; , ;) + #define TREE_DECODE_CHECK(probs, limit, i) \ { i = 1; do { GET_BIT_CHECK(probs + i, i) } while (i < limit); i -= limit; } @@ -128,6 +181,7 @@ Out: = kMatchSpecLenStart + 2 : State Init Marker */ +__attribute__((noinline)) static int MY_FAST_CALL LzmaDec_DecodeReal(CLzmaDec *p, SizeT limit, const Byte *bufLimit) { CLzmaProb *probs = p->probs; @@ -141,7 +195,7 @@ static int MY_FAST_CALL LzmaDec_DecodeReal(CLzmaDec *p, SizeT limit, const Byte Byte *dic = p->dic; SizeT dicBufSize = p->dicBufSize; SizeT dicPos = p->dicPos; - + UInt32 processedPos = p->processedPos; UInt32 checkDicSize = p->checkDicSize; unsigned len = 0; @@ -179,6 +233,7 @@ static int MY_FAST_CALL LzmaDec_DecodeReal(CLzmaDec *p, SizeT limit, const Byte unsigned offs = 0x100; state -= (state < 10) ? 3 : 6; symbol = 1; + unsigned offs_arr[2]; do { unsigned bit; @@ -186,7 +241,9 @@ static int MY_FAST_CALL LzmaDec_DecodeReal(CLzmaDec *p, SizeT limit, const Byte matchByte <<= 1; bit = (matchByte & offs); probLit = prob + offs + bit + symbol; - GET_BIT2(probLit, symbol, offs &= ~bit, offs &= bit) + offs_arr[0] = (offs & bit); // False + offs_arr[1] = (offs & ~bit); // True + GET_BIT2_AVOID_BRANCH(probLit, symbol, offs, offs_arr) } while (symbol < 0x100); } @@ -309,10 +366,14 @@ static int MY_FAST_CALL LzmaDec_DecodeReal(CLzmaDec *p, SizeT limit, const Byte { UInt32 mask = 1; unsigned i = 1; + UInt32 distance_arr[2]; do { - GET_BIT2(prob + i, i, ; , distance |= mask); - mask <<= 1; + distance_arr[0] = (distance | mask); //False + distance_arr[1] = distance; //True = no update + + GET_BIT2_AVOID_BRANCH(prob + i, i, distance , distance_arr); + mask <<= 1; } while (--numDirectBits != 0); } @@ -324,7 +385,7 @@ static int MY_FAST_CALL LzmaDec_DecodeReal(CLzmaDec *p, SizeT limit, const Byte { NORMALIZE range >>= 1; - + { UInt32 t; code -= range; @@ -344,12 +405,20 @@ static int MY_FAST_CALL LzmaDec_DecodeReal(CLzmaDec *p, SizeT limit, const Byte while (--numDirectBits != 0); prob = probs + Align; distance <<= kNumAlignBits; + UInt32 distance_arr[2]; { unsigned i = 1; - GET_BIT2(prob + i, i, ; , distance |= 1); - GET_BIT2(prob + i, i, ; , distance |= 2); - GET_BIT2(prob + i, i, ; , distance |= 4); - GET_BIT2(prob + i, i, ; , distance |= 8); + distance_arr[0] = (distance | 1); distance_arr[1] = distance; + GET_BIT2_AVOID_BRANCH(prob + i, i, distance , distance_arr); + + distance_arr[0] = (distance | 2); distance_arr[1] = distance; + GET_BIT2_AVOID_BRANCH(prob + i, i, distance , distance_arr); + + distance_arr[0] = (distance | 4); distance_arr[1] = distance; + GET_BIT2_AVOID_BRANCH(prob + i, i, distance , distance_arr); + + distance_arr[0] = (distance | 8); distance_arr[1] = distance; + GET_BIT2_AVOID_BRANCH(prob + i, i, distance , distance_arr); } if (distance == (UInt32)0xFFFFFFFF) { @@ -451,6 +520,7 @@ static void MY_FAST_CALL LzmaDec_WriteRem(CLzmaDec *p, SizeT limit) } } +__attribute__((noinline)) static int MY_FAST_CALL LzmaDec_DecodeReal2(CLzmaDec *p, SizeT limit, const Byte *bufLimit) { do @@ -523,6 +593,7 @@ static ELzmaDummy LzmaDec_TryDummy(const CLzmaDec *p, const Byte *buf, SizeT inS ((p->dicPos < p->reps[0]) ? p->dicBufSize : 0)]; unsigned offs = 0x100; unsigned symbol = 1; + unsigned offs_selection[2]; do { unsigned bit; @@ -530,7 +601,10 @@ static ELzmaDummy LzmaDec_TryDummy(const CLzmaDec *p, const Byte *buf, SizeT inS matchByte <<= 1; bit = (matchByte & offs); probLit = prob + offs + bit + symbol; - GET_BIT2_CHECK(probLit, symbol, offs &= ~bit, offs &= bit) + + offs_selection[0] = (offs & bit); //False + offs_selection[1] = (offs & ~bit); //True + GET_BIT2_CHECK_AVOID_BRANCH(probLit, symbol, offs, offs_selection) } while (symbol < 0x100); } @@ -722,7 +796,7 @@ SRes LzmaDec_DecodeToDic(CLzmaDec *p, SizeT dicLimit, const Byte *src, SizeT *sr SizeT inSize = *srcLen; (*srcLen) = 0; LzmaDec_WriteRem(p, dicLimit); - + *status = LZMA_STATUS_NOT_SPECIFIED; while (p->remainLen != kMatchSpecLenStart) @@ -768,7 +842,7 @@ SRes LzmaDec_DecodeToDic(CLzmaDec *p, SizeT dicLimit, const Byte *src, SizeT *sr if (p->needInitState) LzmaDec_InitStateReal(p); - + if (p->tempBufSize == 0) { SizeT processed; @@ -899,12 +973,12 @@ SRes LzmaProps_Decode(CLzmaProps *p, const Byte *data, unsigned size) { UInt32 dicSize; Byte d; - + if (size < LZMA_PROPS_SIZE) return SZ_ERROR_UNSUPPORTED; else dicSize = data[1] | ((UInt32)data[2] << 8) | ((UInt32)data[3] << 16) | ((UInt32)data[4] << 24); - + if (dicSize < LZMA_DIC_MIN) dicSize = LZMA_DIC_MIN; p->dicSize = dicSize; @@ -986,7 +1060,7 @@ SRes LzmaDecode(Byte *dest, SizeT *destLen, const Byte *src, SizeT *srcLen, p.dicBufSize = outSize; LzmaDec_Init(&p); - + *srcLen = inSize; res = LzmaDec_DecodeToDic(&p, outSize, src, srcLen, finishMode, status); -- 2.17.1