Skip to content

Instantly share code, notes, and snippets.

@sujankh
Created March 1, 2020 02:46
Show Gist options
  • Save sujankh/9d004055077b333840637caea270ed17 to your computer and use it in GitHub Desktop.
Save sujankh/9d004055077b333840637caea270ed17 to your computer and use it in GitHub Desktop.
My patch for LLVM Test Suite's 7zip benchmark branch mis-prediction experiment
From 2bec09b4eb46dd823ec93f699eaa2b3acc73ad30 Mon Sep 17 00:00:00 2001
From: Sujan Khadka <skhadka@uber.com>
Date: Mon, 29 Apr 2019 00:39:26 -0400
Subject: [PATCH] Lookup tables GET_BIT instead of conditionals
---
MultiSource/Benchmarks/7zip/C/LzmaDec.c | 108 ++++++++++++++++++++----
1 file changed, 91 insertions(+), 17 deletions(-)
diff --git a/MultiSource/Benchmarks/7zip/C/LzmaDec.c b/MultiSource/Benchmarks/7zip/C/LzmaDec.c
index 2036761b..3f808383 100644
--- a/MultiSource/Benchmarks/7zip/C/LzmaDec.c
+++ b/MultiSource/Benchmarks/7zip/C/LzmaDec.c
@@ -22,7 +22,38 @@
#define GET_BIT2(p, i, A0, A1) IF_BIT_0(p) \
{ UPDATE_0(p); i = (i + i); A0; } else \
{ UPDATE_1(p); i = (i + i) + 1; A1; }
-#define GET_BIT(p, i) GET_BIT2(p, i, ; , ;)
+
+/*
+This macro is a re-write of GET_BIT2 but does not use conditional if(code<bound)
+This condition is found to cause lots of branch mis-predictions
+This condition is replaced into a table-lookup.
+upd_var is the variable to be set to 0th index of upd_vals_table if code>=bound
+upd_var is the variable to be set to 1th index of upd_vals_table if code<bound
+*/
+#define GET_BIT2_AVOID_BRANCH(p, i, upd_var, upd_vals_table) ttt = *(p); \
+ if (range < kTopValue) { range <<= 8; code = (code << 8) | (*buf++); } \
+ bound = (range >> kNumBitModelTotalBits) * ttt; \
+ { \
+ UInt32 code_table[2] = { code - bound, code}; \
+ UInt32 range_table[2] = { range - bound, bound}; \
+ unsigned prob_table[2] = { (ttt - (ttt >> kNumMoveBits)) , \
+ (ttt + ((kBitModelTotal - ttt) >> kNumMoveBits)) \
+ }; \
+ unsigned i_table[2] = { (i + i) + 1, (i + i) }; \
+ int cond = (code < bound); \
+ range = range_table[cond]; \
+ code = code_table[cond]; \
+ *(p) = (CLzmaProb) (prob_table[cond]); \
+ i = i_table[cond]; \
+ upd_var = upd_vals_table[cond]; \
+ }
+
+char DUMMY;
+char DUMMY_ARRAY[2] = {'0', '1'};
+
+// GET_BIT defaults to calling AVOID_BRANCH given that almost everything that calls it is inside lots of loops
+// No variable needs to be updated, so set it to some dummy array (although this might add up one more instruction)
+#define GET_BIT(p, i) GET_BIT2_AVOID_BRANCH(p, i, DUMMY, DUMMY_ARRAY)
#define TREE_GET_BIT(probs, i) { GET_BIT((probs + i), i); }
#define TREE_DECODE(probs, limit, i) \
@@ -44,6 +75,28 @@
i -= 0x40; }
#endif
+/*
+This macro is a re-write of GET_BIT2_CHECK but does not use conditional if(code<bound)
+This condition is found to cause lots of branch mis-predictions
+This condition is replaced into a table-lookup.
+upd_var is the variable to be set to 0th index of upd_vals_table if code>=bound
+upd_var is the variable to be set to 1th index of upd_vals_table if code<bound
+*/
+
+#define GET_BIT2_CHECK_AVOID_BRANCH(p, i, upd_var, upd_vals_table) ttt = *(p); \
+ if (range < kTopValue) { if (buf >= bufLimit) return DUMMY_ERROR; range <<= 8; code = (code << 8) | (*buf++); } \
+ bound = (range >> kNumBitModelTotalBits) * ttt; \
+ UInt32 code_table[2] = { code - bound, code}; \
+ UInt32 range_table[2] = { range - bound, bound}; \
+ unsigned symbol_table[2] = { (i + i) + 1, (i + i) }; \
+ int cond = (code < bound); \
+ range = range_table[cond]; \
+ code = code_table[cond]; \
+ i = symbol_table[cond]; \
+ upd_var = upd_vals_table[cond];
+
+#define GET_BIT_CHECK(p, i) GET_BIT2_CHECK_AVOID_BRANCH(p, i, DUMMY, DUMMY_ARRAY)
+
#define NORMALIZE_CHECK if (range < kTopValue) { if (buf >= bufLimit) return DUMMY_ERROR; range <<= 8; code = (code << 8) | (*buf++); }
#define IF_BIT_0_CHECK(p) ttt = *(p); NORMALIZE_CHECK; bound = (range >> kNumBitModelTotalBits) * ttt; if (code < bound)
@@ -52,7 +105,7 @@
#define GET_BIT2_CHECK(p, i, A0, A1) IF_BIT_0_CHECK(p) \
{ UPDATE_0_CHECK; i = (i + i); A0; } else \
{ UPDATE_1_CHECK; i = (i + i) + 1; A1; }
-#define GET_BIT_CHECK(p, i) GET_BIT2_CHECK(p, i, ; , ;)
+
#define TREE_DECODE_CHECK(probs, limit, i) \
{ i = 1; do { GET_BIT_CHECK(probs + i, i) } while (i < limit); i -= limit; }
@@ -128,6 +181,7 @@ Out:
= kMatchSpecLenStart + 2 : State Init Marker
*/
+__attribute__((noinline))
static int MY_FAST_CALL LzmaDec_DecodeReal(CLzmaDec *p, SizeT limit, const Byte *bufLimit)
{
CLzmaProb *probs = p->probs;
@@ -141,7 +195,7 @@ static int MY_FAST_CALL LzmaDec_DecodeReal(CLzmaDec *p, SizeT limit, const Byte
Byte *dic = p->dic;
SizeT dicBufSize = p->dicBufSize;
SizeT dicPos = p->dicPos;
-
+
UInt32 processedPos = p->processedPos;
UInt32 checkDicSize = p->checkDicSize;
unsigned len = 0;
@@ -179,6 +233,7 @@ static int MY_FAST_CALL LzmaDec_DecodeReal(CLzmaDec *p, SizeT limit, const Byte
unsigned offs = 0x100;
state -= (state < 10) ? 3 : 6;
symbol = 1;
+ unsigned offs_arr[2];
do
{
unsigned bit;
@@ -186,7 +241,9 @@ static int MY_FAST_CALL LzmaDec_DecodeReal(CLzmaDec *p, SizeT limit, const Byte
matchByte <<= 1;
bit = (matchByte & offs);
probLit = prob + offs + bit + symbol;
- GET_BIT2(probLit, symbol, offs &= ~bit, offs &= bit)
+ offs_arr[0] = (offs & bit); // False
+ offs_arr[1] = (offs & ~bit); // True
+ GET_BIT2_AVOID_BRANCH(probLit, symbol, offs, offs_arr)
}
while (symbol < 0x100);
}
@@ -309,10 +366,14 @@ static int MY_FAST_CALL LzmaDec_DecodeReal(CLzmaDec *p, SizeT limit, const Byte
{
UInt32 mask = 1;
unsigned i = 1;
+ UInt32 distance_arr[2];
do
{
- GET_BIT2(prob + i, i, ; , distance |= mask);
- mask <<= 1;
+ distance_arr[0] = (distance | mask); //False
+ distance_arr[1] = distance; //True = no update
+
+ GET_BIT2_AVOID_BRANCH(prob + i, i, distance , distance_arr);
+ mask <<= 1;
}
while (--numDirectBits != 0);
}
@@ -324,7 +385,7 @@ static int MY_FAST_CALL LzmaDec_DecodeReal(CLzmaDec *p, SizeT limit, const Byte
{
NORMALIZE
range >>= 1;
-
+
{
UInt32 t;
code -= range;
@@ -344,12 +405,20 @@ static int MY_FAST_CALL LzmaDec_DecodeReal(CLzmaDec *p, SizeT limit, const Byte
while (--numDirectBits != 0);
prob = probs + Align;
distance <<= kNumAlignBits;
+ UInt32 distance_arr[2];
{
unsigned i = 1;
- GET_BIT2(prob + i, i, ; , distance |= 1);
- GET_BIT2(prob + i, i, ; , distance |= 2);
- GET_BIT2(prob + i, i, ; , distance |= 4);
- GET_BIT2(prob + i, i, ; , distance |= 8);
+ distance_arr[0] = (distance | 1); distance_arr[1] = distance;
+ GET_BIT2_AVOID_BRANCH(prob + i, i, distance , distance_arr);
+
+ distance_arr[0] = (distance | 2); distance_arr[1] = distance;
+ GET_BIT2_AVOID_BRANCH(prob + i, i, distance , distance_arr);
+
+ distance_arr[0] = (distance | 4); distance_arr[1] = distance;
+ GET_BIT2_AVOID_BRANCH(prob + i, i, distance , distance_arr);
+
+ distance_arr[0] = (distance | 8); distance_arr[1] = distance;
+ GET_BIT2_AVOID_BRANCH(prob + i, i, distance , distance_arr);
}
if (distance == (UInt32)0xFFFFFFFF)
{
@@ -451,6 +520,7 @@ static void MY_FAST_CALL LzmaDec_WriteRem(CLzmaDec *p, SizeT limit)
}
}
+__attribute__((noinline))
static int MY_FAST_CALL LzmaDec_DecodeReal2(CLzmaDec *p, SizeT limit, const Byte *bufLimit)
{
do
@@ -523,6 +593,7 @@ static ELzmaDummy LzmaDec_TryDummy(const CLzmaDec *p, const Byte *buf, SizeT inS
((p->dicPos < p->reps[0]) ? p->dicBufSize : 0)];
unsigned offs = 0x100;
unsigned symbol = 1;
+ unsigned offs_selection[2];
do
{
unsigned bit;
@@ -530,7 +601,10 @@ static ELzmaDummy LzmaDec_TryDummy(const CLzmaDec *p, const Byte *buf, SizeT inS
matchByte <<= 1;
bit = (matchByte & offs);
probLit = prob + offs + bit + symbol;
- GET_BIT2_CHECK(probLit, symbol, offs &= ~bit, offs &= bit)
+
+ offs_selection[0] = (offs & bit); //False
+ offs_selection[1] = (offs & ~bit); //True
+ GET_BIT2_CHECK_AVOID_BRANCH(probLit, symbol, offs, offs_selection)
}
while (symbol < 0x100);
}
@@ -722,7 +796,7 @@ SRes LzmaDec_DecodeToDic(CLzmaDec *p, SizeT dicLimit, const Byte *src, SizeT *sr
SizeT inSize = *srcLen;
(*srcLen) = 0;
LzmaDec_WriteRem(p, dicLimit);
-
+
*status = LZMA_STATUS_NOT_SPECIFIED;
while (p->remainLen != kMatchSpecLenStart)
@@ -768,7 +842,7 @@ SRes LzmaDec_DecodeToDic(CLzmaDec *p, SizeT dicLimit, const Byte *src, SizeT *sr
if (p->needInitState)
LzmaDec_InitStateReal(p);
-
+
if (p->tempBufSize == 0)
{
SizeT processed;
@@ -899,12 +973,12 @@ SRes LzmaProps_Decode(CLzmaProps *p, const Byte *data, unsigned size)
{
UInt32 dicSize;
Byte d;
-
+
if (size < LZMA_PROPS_SIZE)
return SZ_ERROR_UNSUPPORTED;
else
dicSize = data[1] | ((UInt32)data[2] << 8) | ((UInt32)data[3] << 16) | ((UInt32)data[4] << 24);
-
+
if (dicSize < LZMA_DIC_MIN)
dicSize = LZMA_DIC_MIN;
p->dicSize = dicSize;
@@ -986,7 +1060,7 @@ SRes LzmaDecode(Byte *dest, SizeT *destLen, const Byte *src, SizeT *srcLen,
p.dicBufSize = outSize;
LzmaDec_Init(&p);
-
+
*srcLen = inSize;
res = LzmaDec_DecodeToDic(&p, outSize, src, srcLen, finishMode, status);
--
2.17.1
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment