Created
March 1, 2020 02:46
-
-
Save sujankh/9d004055077b333840637caea270ed17 to your computer and use it in GitHub Desktop.
My patch for LLVM Test Suite's 7zip benchmark branch mis-prediction experiment
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
From 2bec09b4eb46dd823ec93f699eaa2b3acc73ad30 Mon Sep 17 00:00:00 2001 | |
From: Sujan Khadka <skhadka@uber.com> | |
Date: Mon, 29 Apr 2019 00:39:26 -0400 | |
Subject: [PATCH] Lookup tables GET_BIT instead of conditionals | |
--- | |
MultiSource/Benchmarks/7zip/C/LzmaDec.c | 108 ++++++++++++++++++++---- | |
1 file changed, 91 insertions(+), 17 deletions(-) | |
diff --git a/MultiSource/Benchmarks/7zip/C/LzmaDec.c b/MultiSource/Benchmarks/7zip/C/LzmaDec.c | |
index 2036761b..3f808383 100644 | |
--- a/MultiSource/Benchmarks/7zip/C/LzmaDec.c | |
+++ b/MultiSource/Benchmarks/7zip/C/LzmaDec.c | |
@@ -22,7 +22,38 @@ | |
#define GET_BIT2(p, i, A0, A1) IF_BIT_0(p) \ | |
{ UPDATE_0(p); i = (i + i); A0; } else \ | |
{ UPDATE_1(p); i = (i + i) + 1; A1; } | |
-#define GET_BIT(p, i) GET_BIT2(p, i, ; , ;) | |
+ | |
+/* | |
+This macro is a re-write of GET_BIT2 but does not use conditional if(code<bound) | |
+This condition is found to cause lots of branch mis-predictions | |
+This condition is replaced into a table-lookup. | |
+upd_var is the variable to be set to 0th index of upd_vals_table if code>=bound | |
+upd_var is the variable to be set to 1th index of upd_vals_table if code<bound | |
+*/ | |
+#define GET_BIT2_AVOID_BRANCH(p, i, upd_var, upd_vals_table) ttt = *(p); \ | |
+ if (range < kTopValue) { range <<= 8; code = (code << 8) | (*buf++); } \ | |
+ bound = (range >> kNumBitModelTotalBits) * ttt; \ | |
+ { \ | |
+ UInt32 code_table[2] = { code - bound, code}; \ | |
+ UInt32 range_table[2] = { range - bound, bound}; \ | |
+ unsigned prob_table[2] = { (ttt - (ttt >> kNumMoveBits)) , \ | |
+ (ttt + ((kBitModelTotal - ttt) >> kNumMoveBits)) \ | |
+ }; \ | |
+ unsigned i_table[2] = { (i + i) + 1, (i + i) }; \ | |
+ int cond = (code < bound); \ | |
+ range = range_table[cond]; \ | |
+ code = code_table[cond]; \ | |
+ *(p) = (CLzmaProb) (prob_table[cond]); \ | |
+ i = i_table[cond]; \ | |
+ upd_var = upd_vals_table[cond]; \ | |
+ } | |
+ | |
+char DUMMY; | |
+char DUMMY_ARRAY[2] = {'0', '1'}; | |
+ | |
+// GET_BIT defaults to calling AVOID_BRANCH given that almost everything that calls it is inside lots of loops | |
+// No variable needs to be updated, so set it to some dummy array (although this might add up one more instruction) | |
+#define GET_BIT(p, i) GET_BIT2_AVOID_BRANCH(p, i, DUMMY, DUMMY_ARRAY) | |
#define TREE_GET_BIT(probs, i) { GET_BIT((probs + i), i); } | |
#define TREE_DECODE(probs, limit, i) \ | |
@@ -44,6 +75,28 @@ | |
i -= 0x40; } | |
#endif | |
+/* | |
+This macro is a re-write of GET_BIT2_CHECK but does not use conditional if(code<bound) | |
+This condition is found to cause lots of branch mis-predictions | |
+This condition is replaced into a table-lookup. | |
+upd_var is the variable to be set to 0th index of upd_vals_table if code>=bound | |
+upd_var is the variable to be set to 1th index of upd_vals_table if code<bound | |
+*/ | |
+ | |
+#define GET_BIT2_CHECK_AVOID_BRANCH(p, i, upd_var, upd_vals_table) ttt = *(p); \ | |
+ if (range < kTopValue) { if (buf >= bufLimit) return DUMMY_ERROR; range <<= 8; code = (code << 8) | (*buf++); } \ | |
+ bound = (range >> kNumBitModelTotalBits) * ttt; \ | |
+ UInt32 code_table[2] = { code - bound, code}; \ | |
+ UInt32 range_table[2] = { range - bound, bound}; \ | |
+ unsigned symbol_table[2] = { (i + i) + 1, (i + i) }; \ | |
+ int cond = (code < bound); \ | |
+ range = range_table[cond]; \ | |
+ code = code_table[cond]; \ | |
+ i = symbol_table[cond]; \ | |
+ upd_var = upd_vals_table[cond]; | |
+ | |
+#define GET_BIT_CHECK(p, i) GET_BIT2_CHECK_AVOID_BRANCH(p, i, DUMMY, DUMMY_ARRAY) | |
+ | |
#define NORMALIZE_CHECK if (range < kTopValue) { if (buf >= bufLimit) return DUMMY_ERROR; range <<= 8; code = (code << 8) | (*buf++); } | |
#define IF_BIT_0_CHECK(p) ttt = *(p); NORMALIZE_CHECK; bound = (range >> kNumBitModelTotalBits) * ttt; if (code < bound) | |
@@ -52,7 +105,7 @@ | |
#define GET_BIT2_CHECK(p, i, A0, A1) IF_BIT_0_CHECK(p) \ | |
{ UPDATE_0_CHECK; i = (i + i); A0; } else \ | |
{ UPDATE_1_CHECK; i = (i + i) + 1; A1; } | |
-#define GET_BIT_CHECK(p, i) GET_BIT2_CHECK(p, i, ; , ;) | |
+ | |
#define TREE_DECODE_CHECK(probs, limit, i) \ | |
{ i = 1; do { GET_BIT_CHECK(probs + i, i) } while (i < limit); i -= limit; } | |
@@ -128,6 +181,7 @@ Out: | |
= kMatchSpecLenStart + 2 : State Init Marker | |
*/ | |
+__attribute__((noinline)) | |
static int MY_FAST_CALL LzmaDec_DecodeReal(CLzmaDec *p, SizeT limit, const Byte *bufLimit) | |
{ | |
CLzmaProb *probs = p->probs; | |
@@ -141,7 +195,7 @@ static int MY_FAST_CALL LzmaDec_DecodeReal(CLzmaDec *p, SizeT limit, const Byte | |
Byte *dic = p->dic; | |
SizeT dicBufSize = p->dicBufSize; | |
SizeT dicPos = p->dicPos; | |
- | |
+ | |
UInt32 processedPos = p->processedPos; | |
UInt32 checkDicSize = p->checkDicSize; | |
unsigned len = 0; | |
@@ -179,6 +233,7 @@ static int MY_FAST_CALL LzmaDec_DecodeReal(CLzmaDec *p, SizeT limit, const Byte | |
unsigned offs = 0x100; | |
state -= (state < 10) ? 3 : 6; | |
symbol = 1; | |
+ unsigned offs_arr[2]; | |
do | |
{ | |
unsigned bit; | |
@@ -186,7 +241,9 @@ static int MY_FAST_CALL LzmaDec_DecodeReal(CLzmaDec *p, SizeT limit, const Byte | |
matchByte <<= 1; | |
bit = (matchByte & offs); | |
probLit = prob + offs + bit + symbol; | |
- GET_BIT2(probLit, symbol, offs &= ~bit, offs &= bit) | |
+ offs_arr[0] = (offs & bit); // False | |
+ offs_arr[1] = (offs & ~bit); // True | |
+ GET_BIT2_AVOID_BRANCH(probLit, symbol, offs, offs_arr) | |
} | |
while (symbol < 0x100); | |
} | |
@@ -309,10 +366,14 @@ static int MY_FAST_CALL LzmaDec_DecodeReal(CLzmaDec *p, SizeT limit, const Byte | |
{ | |
UInt32 mask = 1; | |
unsigned i = 1; | |
+ UInt32 distance_arr[2]; | |
do | |
{ | |
- GET_BIT2(prob + i, i, ; , distance |= mask); | |
- mask <<= 1; | |
+ distance_arr[0] = (distance | mask); //False | |
+ distance_arr[1] = distance; //True = no update | |
+ | |
+ GET_BIT2_AVOID_BRANCH(prob + i, i, distance , distance_arr); | |
+ mask <<= 1; | |
} | |
while (--numDirectBits != 0); | |
} | |
@@ -324,7 +385,7 @@ static int MY_FAST_CALL LzmaDec_DecodeReal(CLzmaDec *p, SizeT limit, const Byte | |
{ | |
NORMALIZE | |
range >>= 1; | |
- | |
+ | |
{ | |
UInt32 t; | |
code -= range; | |
@@ -344,12 +405,20 @@ static int MY_FAST_CALL LzmaDec_DecodeReal(CLzmaDec *p, SizeT limit, const Byte | |
while (--numDirectBits != 0); | |
prob = probs + Align; | |
distance <<= kNumAlignBits; | |
+ UInt32 distance_arr[2]; | |
{ | |
unsigned i = 1; | |
- GET_BIT2(prob + i, i, ; , distance |= 1); | |
- GET_BIT2(prob + i, i, ; , distance |= 2); | |
- GET_BIT2(prob + i, i, ; , distance |= 4); | |
- GET_BIT2(prob + i, i, ; , distance |= 8); | |
+ distance_arr[0] = (distance | 1); distance_arr[1] = distance; | |
+ GET_BIT2_AVOID_BRANCH(prob + i, i, distance , distance_arr); | |
+ | |
+ distance_arr[0] = (distance | 2); distance_arr[1] = distance; | |
+ GET_BIT2_AVOID_BRANCH(prob + i, i, distance , distance_arr); | |
+ | |
+ distance_arr[0] = (distance | 4); distance_arr[1] = distance; | |
+ GET_BIT2_AVOID_BRANCH(prob + i, i, distance , distance_arr); | |
+ | |
+ distance_arr[0] = (distance | 8); distance_arr[1] = distance; | |
+ GET_BIT2_AVOID_BRANCH(prob + i, i, distance , distance_arr); | |
} | |
if (distance == (UInt32)0xFFFFFFFF) | |
{ | |
@@ -451,6 +520,7 @@ static void MY_FAST_CALL LzmaDec_WriteRem(CLzmaDec *p, SizeT limit) | |
} | |
} | |
+__attribute__((noinline)) | |
static int MY_FAST_CALL LzmaDec_DecodeReal2(CLzmaDec *p, SizeT limit, const Byte *bufLimit) | |
{ | |
do | |
@@ -523,6 +593,7 @@ static ELzmaDummy LzmaDec_TryDummy(const CLzmaDec *p, const Byte *buf, SizeT inS | |
((p->dicPos < p->reps[0]) ? p->dicBufSize : 0)]; | |
unsigned offs = 0x100; | |
unsigned symbol = 1; | |
+ unsigned offs_selection[2]; | |
do | |
{ | |
unsigned bit; | |
@@ -530,7 +601,10 @@ static ELzmaDummy LzmaDec_TryDummy(const CLzmaDec *p, const Byte *buf, SizeT inS | |
matchByte <<= 1; | |
bit = (matchByte & offs); | |
probLit = prob + offs + bit + symbol; | |
- GET_BIT2_CHECK(probLit, symbol, offs &= ~bit, offs &= bit) | |
+ | |
+ offs_selection[0] = (offs & bit); //False | |
+ offs_selection[1] = (offs & ~bit); //True | |
+ GET_BIT2_CHECK_AVOID_BRANCH(probLit, symbol, offs, offs_selection) | |
} | |
while (symbol < 0x100); | |
} | |
@@ -722,7 +796,7 @@ SRes LzmaDec_DecodeToDic(CLzmaDec *p, SizeT dicLimit, const Byte *src, SizeT *sr | |
SizeT inSize = *srcLen; | |
(*srcLen) = 0; | |
LzmaDec_WriteRem(p, dicLimit); | |
- | |
+ | |
*status = LZMA_STATUS_NOT_SPECIFIED; | |
while (p->remainLen != kMatchSpecLenStart) | |
@@ -768,7 +842,7 @@ SRes LzmaDec_DecodeToDic(CLzmaDec *p, SizeT dicLimit, const Byte *src, SizeT *sr | |
if (p->needInitState) | |
LzmaDec_InitStateReal(p); | |
- | |
+ | |
if (p->tempBufSize == 0) | |
{ | |
SizeT processed; | |
@@ -899,12 +973,12 @@ SRes LzmaProps_Decode(CLzmaProps *p, const Byte *data, unsigned size) | |
{ | |
UInt32 dicSize; | |
Byte d; | |
- | |
+ | |
if (size < LZMA_PROPS_SIZE) | |
return SZ_ERROR_UNSUPPORTED; | |
else | |
dicSize = data[1] | ((UInt32)data[2] << 8) | ((UInt32)data[3] << 16) | ((UInt32)data[4] << 24); | |
- | |
+ | |
if (dicSize < LZMA_DIC_MIN) | |
dicSize = LZMA_DIC_MIN; | |
p->dicSize = dicSize; | |
@@ -986,7 +1060,7 @@ SRes LzmaDecode(Byte *dest, SizeT *destLen, const Byte *src, SizeT *srcLen, | |
p.dicBufSize = outSize; | |
LzmaDec_Init(&p); | |
- | |
+ | |
*srcLen = inSize; | |
res = LzmaDec_DecodeToDic(&p, outSize, src, srcLen, finishMode, status); | |
-- | |
2.17.1 |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment